### Purpose
This notebook defines `clean_yelp`, a function that combines one or two raw Yelp business dataframes, removes duplicate restaurants, and extracts features for further analysis.

In [1]:
#import necessary libraries
import pandas as pd
import numpy as np
import json
import ast

In [2]:
# Yelp price symbols mapped to the labels stored in the "Price Level" column.
_PRICE_LEVELS = {"$": "Low", "$$": "Medium", "$$$": "High", "$$$$": "Very High"}

# Columns retained in the cleaned frame (original column order is preserved).
_KEEP_COLUMNS = [
    'id', 'is_closed', 'is_claimed', 'review_count', 'rating',
    'Restaurant Category',
    'Pickup Available', 'Delivery Available', "Reservation Available",
    'Price Level', 'number of photos',
    'City', 'State',
    'Opening Time', 'Closing Time',
    'Has Messaging',
]


def _parse_literal(value):
    """Return the Python object stored in a cell, or None when the cell is missing.

    Strings are decoded with ``ast.literal_eval`` (malformed strings still raise);
    lists, tuples and dicts are taken to be already decoded and returned as-is;
    anything else (NaN, None, ...) counts as missing.
    """
    if isinstance(value, str):
        return ast.literal_eval(value)
    if isinstance(value, (list, tuple, dict)):
        return value
    return None


def _is_missing(value):
    """Return True for None and for float NaN (Python or numpy floats)."""
    return value is None or (
        isinstance(value, (float, np.floating)) and bool(np.isnan(value))
    )


def _main_category(value):
    """Return the alias of the first listed category, or NaN if there is none."""
    categories = _parse_literal(value)
    if not categories:
        return np.nan
    return categories[0]['alias']


def _transaction_flag(transactions, kind):
    """Return 1/0 for whether ``kind`` is listed, or NaN when transactions are missing."""
    if transactions is None:
        return np.nan
    return 1 if kind in transactions else 0


def _photo_count(value):
    """Return the number of listed photos, or NaN when the cell is missing."""
    photos = _parse_literal(value)
    return np.nan if photos is None else len(photos)


def _location_field(location, field):
    """Return ``location[field]``, or NaN when the location or the field is missing."""
    if location is None:
        return np.nan
    return location.get(field, np.nan)


def _first_open_period(value):
    """Return (start, end) of the first entry of the first 'open' list, or (NaN, NaN)."""
    hours = _parse_literal(value)
    if not hours:
        return np.nan, np.nan
    periods = hours[0].get('open') or []
    if not periods:
        return np.nan, np.nan
    first = periods[0]
    return first.get('start', np.nan), first.get('end', np.nan)


def clean_yelp(df1, df2=None):
    """Combine raw Yelp business frames and return one row of features per restaurant.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Raw Yelp data. Must contain the columns ``id``, ``categories``,
        ``transactions``, ``price``, ``photos``, ``location``, ``hours`` and
        ``messaging``; the nested columns hold string representations of
        Python lists/dicts (or NaN when missing).
    df2 : pandas.DataFrame or None, optional
        A second frame that is concatenated to ``df1`` before cleaning.
        Pass ``None`` (the default) to clean ``df1`` alone.

    Returns
    -------
    pandas.DataFrame
        A new frame, de-duplicated on ``id`` (first occurrence kept), limited
        to the columns in ``_KEEP_COLUMNS`` that exist. The inputs are not
        modified.

    Raises
    ------
    KeyError
        If one of the required columns is absent.
    ValueError, SyntaxError
        If a nested cell contains a string that ``ast.literal_eval`` cannot parse.

    Notes
    -----
    Every derived list has exactly one entry per row. A row with missing
    ``transactions`` gets NaN in the three availability columns rather than
    being skipped, because skipping shifted the flags of later rows upwards.
    """
    # Combine the inputs (if there are two) and keep one row per restaurant ID.
    if df2 is not None:
        final = pd.concat([df1, df2])
    else:
        final = df1
    final = final.drop_duplicates(subset="id", keep="first").reset_index(drop=True)

    # Main restaurant category: alias of the first listed category.
    final["Restaurant Category"] = [_main_category(v) for v in final['categories']]

    # Pickup / delivery / reservation availability, parsed once per row.
    transactions = [_parse_literal(v) for v in final['transactions']]
    final['Pickup Available'] = [_transaction_flag(t, "pickup") for t in transactions]
    final['Delivery Available'] = [_transaction_flag(t, "delivery") for t in transactions]
    final["Reservation Available"] = [
        _transaction_flag(t, "restaurant_reservation") for t in transactions
    ]

    # Price level from the number of dollar signs; anything else becomes NaN.
    final['Price Level'] = final['price'].map(_PRICE_LEVELS)

    # Number of photos listed for the restaurant.
    final['number of photos'] = [_photo_count(v) for v in final['photos']]

    # City and state of the restaurant.
    locations = [_parse_literal(v) for v in final['location']]
    final['City'] = [_location_field(loc, 'city') for loc in locations]
    final['State'] = [_location_field(loc, 'state') for loc in locations]

    # Opening (start) and closing (end) time of the first listed open period.
    periods = [_first_open_period(v) for v in final['hours']]
    final['Opening Time'] = [start for start, _ in periods]
    final['Closing Time'] = [end for _, end in periods]

    # Messaging is available when the cell holds any value at all. Using a real
    # missing-value check keeps an all-NaN (float64) column from being flagged 1.
    final["Has Messaging"] = [0 if _is_missing(v) else 1 for v in final['messaging']]

    # Keep only the relevant columns, in the frame's existing order.
    final = final.loc[:, final.columns.isin(_KEEP_COLUMNS)]
    return final

### Example
The example below shows how two Yelp data sets (Dallas and Enterprise) are loaded and cleaned with `clean_yelp`.

In [3]:
# Load the two raw Yelp exports (Dallas first, then Enterprise) from the
# working directory; each CSV becomes its own dataframe.
dallas, enterprise = (
    pd.read_csv(csv_name)
    for csv_name in (
        "All_Available_Dallas_Data_MoreINFO.csv",
        "All_Available_Enterprise_Data_MoreINFO.csv",
    )
)

In [4]:
# Preview the first rows of the raw Dallas data (nested columns are still
# stored as strings at this point).
dallas.head()

Unnamed: 0.1,Unnamed: 0,id,alias,name,image_url,is_claimed,is_closed,url,phone,display_phone,...,rating,location,coordinates,photos,price,hours,transactions,messaging,special_hours,error
0,0,JwqeWMQgHnW988lGRrT4gw,dingdi-myanmar-restaurant-lewisville,Dingdi Myanmar Restaurant,https://s3-media3.fl.yelpcdn.com/bphoto/dprWY0...,True,False,https://www.yelp.com/biz/dingdi-myanmar-restau...,12145140000.0,(214) 513-9323,...,4.0,"{'address1': '1165 S Stemmons Fwy', 'address2'...","{'latitude': 33.0292301219091, 'longitude': -9...",['https://s3-media3.fl.yelpcdn.com/bphoto/dprW...,$,"[{'open': [{'is_overnight': False, 'start': '0...","['delivery', 'pickup']",,,
1,1,aOovQO44RRvMl7z9NAjV0Q,taco-hut-lewisville,Taco Hut,https://s3-media1.fl.yelpcdn.com/bphoto/1nOYFY...,True,False,https://www.yelp.com/biz/taco-hut-lewisville?a...,19722210000.0,(972) 221-0698,...,4.0,"{'address1': '1235 S Hwy 121', 'address2': '',...","{'latitude': 33.0279278681, 'longitude': -96.9...",['https://s3-media1.fl.yelpcdn.com/bphoto/1nOY...,$,"[{'open': [{'is_overnight': True, 'start': '00...","['pickup', 'delivery']",,,
2,2,mHlAob1B53kynBAyPFlDWg,el-paisa-cocina-mexicana-lewisville-2,El Paisa Cocina Mexicana,https://s3-media2.fl.yelpcdn.com/bphoto/k6xasw...,False,False,https://www.yelp.com/biz/el-paisa-cocina-mexic...,14692940000.0,(469) 293-8785,...,4.5,"{'address1': '1342 TX 121', 'address2': '', 'a...","{'latitude': 33.0262195, 'longitude': -96.9924...",['https://s3-media2.fl.yelpcdn.com/bphoto/k6xa...,$,"[{'open': [{'is_overnight': False, 'start': '0...","['pickup', 'delivery']",,,
3,3,6ZmeVf0Kg9KXBGHgE0x_6Q,catrina-grill-lewisville-2,Catrina Grill,https://s3-media3.fl.yelpcdn.com/bphoto/zR7QGl...,True,False,https://www.yelp.com/biz/catrina-grill-lewisvi...,19729070000.0,(972) 906-9229,...,4.5,"{'address1': '383 Huffines Plz', 'address2': N...","{'latitude': 33.03438944177758, 'longitude': -...",['https://s3-media3.fl.yelpcdn.com/bphoto/zR7Q...,,"[{'open': [{'is_overnight': False, 'start': '0...","['pickup', 'delivery']",{'url': 'https://www.yelp.com/raq/6ZmeVf0Kg9KX...,,
4,4,7E5_euVu635zjvWiSR2IZg,martinez-grill-and-taqueria-lewisville,Martinez Grill and Taqueria,https://s3-media3.fl.yelpcdn.com/bphoto/Xo7ONl...,True,False,https://www.yelp.com/biz/martinez-grill-and-ta...,19729060000.0,(972) 906-0057,...,4.5,"{'address1': '788 South Mill St', 'address2': ...","{'latitude': 33.03815, 'longitude': -96.9923148}",['https://s3-media3.fl.yelpcdn.com/bphoto/Xo7O...,$,"[{'open': [{'is_overnight': False, 'start': '0...",['delivery'],,,


In [5]:
# Preview the first rows of the raw Enterprise data for comparison with the
# Dallas frame.
enterprise.head()

Unnamed: 0.1,Unnamed: 0,id,alias,name,image_url,is_claimed,is_closed,url,phone,display_phone,...,categories,rating,location,coordinates,photos,price,hours,transactions,messaging,special_hours
0,0,NQ0pRcBM45akp3ms2EbbMg,on-demand-sushi-las-vegas-3,On Demand Sushi,https://s3-media3.fl.yelpcdn.com/bphoto/op1vrO...,True,False,https://www.yelp.com/biz/on-demand-sushi-las-v...,17029140000.0,(702) 914-0033,...,"[{'alias': 'sushi', 'title': 'Sushi Bars'}, {'...",4.5,"{'address1': '5660 S Hualapai Way', 'address2'...","{'latitude': 36.085777, 'longitude': -115.314945}",['https://s3-media3.fl.yelpcdn.com/bphoto/op1v...,$$,"[{'open': [{'is_overnight': False, 'start': '1...","['delivery', 'pickup']",{'url': 'https://www.yelp.com/raq/NQ0pRcBM45ak...,
1,1,JrSxduS5XN6TZZ_PyDz1Cg,squallys-juice-bar-and-cafe-hualapai-las-vegas,Squally's Juice Bar & Cafe - Hualapai,https://s3-media1.fl.yelpcdn.com/bphoto/XUHZf0...,True,False,https://www.yelp.com/biz/squallys-juice-bar-an...,17026050000.0,(702) 605-2609,...,"[{'alias': 'cafes', 'title': 'Cafes'}, {'alias...",4.5,"{'address1': '5660 S Hualapai Way', 'address2'...","{'latitude': 36.085757921417084, 'longitude': ...",['https://s3-media1.fl.yelpcdn.com/bphoto/XUHZ...,,"[{'open': [{'is_overnight': False, 'start': '0...","['pickup', 'delivery']",{'url': 'https://www.yelp.com/raq/JrSxduS5XN6T...,"[{'date': '2022-11-24', 'is_closed': True, 'st..."
2,2,do8NhKHRiKpUtKb5xUL9kw,robertos-taco-shop-las-vegas-83,Roberto's Taco Shop,https://s3-media4.fl.yelpcdn.com/bphoto/bs9oLn...,False,False,https://www.yelp.com/biz/robertos-taco-shop-la...,17024620000.0,(702) 462-2728,...,"[{'alias': 'mexican', 'title': 'Mexican'}, {'a...",3.0,"{'address1': '6440 S Hualapai Way', 'address2'...","{'latitude': 36.071335255354604, 'longitude': ...",['https://s3-media4.fl.yelpcdn.com/bphoto/bs9o...,,"[{'open': [{'is_overnight': True, 'start': '00...",['delivery'],,
3,3,n2kgJSiB7Q4u7AkDvYfzlg,munch-box-las-vegas-4,Munch Box,https://s3-media1.fl.yelpcdn.com/bphoto/63ptbU...,True,False,https://www.yelp.com/biz/munch-box-las-vegas-4...,17027790000.0,(702) 778-7458,...,"[{'alias': 'breakfast_brunch', 'title': 'Break...",4.5,"{'address1': '6105 S Fort Apache Rd', 'address...","{'latitude': 36.07681, 'longitude': -115.29834}",['https://s3-media1.fl.yelpcdn.com/bphoto/63pt...,$$,"[{'open': [{'is_overnight': False, 'start': '0...","['pickup', 'delivery']",{'url': 'https://www.yelp.com/raq/n2kgJSiB7Q4u...,
4,4,w2D-bXyIaKwh3AlEb-LygQ,big-bs-texas-bbq-las-vegas-2,Big B's Texas BBQ,https://s3-media4.fl.yelpcdn.com/bphoto/tTiaP3...,True,False,https://www.yelp.com/biz/big-bs-texas-bbq-las-...,17028450000.0,(702) 844-8206,...,"[{'alias': 'bbq', 'title': 'Barbeque'}, {'alia...",4.5,"{'address1': '6115 S Fort Apache Rd', 'address...","{'latitude': 36.077004, 'longitude': -115.298394}",['https://s3-media4.fl.yelpcdn.com/bphoto/tTia...,$$,"[{'open': [{'is_overnight': False, 'start': '1...","['delivery', 'pickup']",{'url': 'https://www.yelp.com/raq/w2D-bXyIaKwh...,


In [6]:
# Build the cleaned data set: both raw frames are combined, de-duplicated by
# restaurant ID and reduced to the extracted feature columns.
clean_data = clean_yelp(df1=dallas, df2=enterprise)

In [7]:
# Preview the first rows of the cleaned feature table returned by clean_yelp.
clean_data.head()

Unnamed: 0,id,is_claimed,is_closed,review_count,rating,Restaurant Category,Pickup Available,Delivery Available,Reservation Available,Price Level,number of photos,City,State,Opening Time,Closing Time,Has Messaging
0,JwqeWMQgHnW988lGRrT4gw,True,False,32.0,4.0,burmese,1.0,1.0,0.0,Low,3.0,Lewisville,TX,830,1730,0
1,aOovQO44RRvMl7z9NAjV0Q,True,False,258.0,4.0,mexican,1.0,1.0,0.0,Low,3.0,Lewisville,TX,0,0,0
2,mHlAob1B53kynBAyPFlDWg,False,False,23.0,4.5,mexican,1.0,1.0,0.0,Low,3.0,Lewisville,TX,630,2200,0
3,6ZmeVf0Kg9KXBGHgE0x_6Q,True,False,19.0,4.5,mexican,1.0,1.0,0.0,,3.0,Lewisville,TX,800,2100,1
4,7E5_euVu635zjvWiSR2IZg,True,False,146.0,4.5,mexican,0.0,1.0,0.0,Low,3.0,Lewisville,TX,800,1600,0
