# Load from canonical restaurant data

In [644]:
import pandas as pd
import numpy as np
from os import listdir
from os.path import isfile, join

In [645]:
# List the CSV files in the working directory. Match on the file extension
# (a substring test would also pick up names such as 'x.csv.bak') and sort so
# the listing is the same on every run regardless of directory order.
files = sorted(f for f in listdir('./') if f.endswith('.csv'))
files

['CanonicalRestaurants.csv', 'CanonicalSummary.csv', 'ClosedRestaurants.csv']

In [646]:
# Load the canonical restaurant data from the working directory.
df = pd.read_csv('./CanonicalRestaurants.csv')

In [647]:
# Column names available in the raw frame.
df.columns

Index(['address', 'category', 'claimed_status', 'compound', 'date',
       'first_review', 'health_rating', 'id', 'info', 'last_review',
       'latitude', 'longitude', 'name', 'negative', 'neighborhood', 'neutral',
       'permanently_closed', 'phone', 'positive', 'price_range', 'ratings',
       'ratings_histogram', 'reviews', 'star', 'subjectivity', 'url',
       'website', 'working_hours'],
      dtype='object')

In [648]:
# (rows, columns) of the raw frame before any filtering.
df.shape

(484650, 28)

In [649]:
# Derive two 0/1 indicator columns from the raw text fields:
#   'Claimed?'   -> 1 when claimed_status is exactly 'Claimed'
#   'HasWebsite' -> 1 when the website value contains 'http'
# Values are passed through str() first, so missing entries simply yield 0.
df['Claimed?'] = [int(str(status) == 'Claimed') for status in df['claimed_status']]
df['HasWebsite'] = [int('http' in str(site)) for site in df['website']]

In [650]:
# Columns not carried forward; the two source columns of the indicator flags
# ('claimed_status', 'website') are dropped along with the rest.
droplist = [
    'working_hours', 'health_rating', 'phone', 'url', 'claimed_status',
    'website', 'address', 'longitude', 'latitude',
]
df = df.drop(droplist, axis=1)

In [651]:
# Keep only rows where both review-date fields hold a value other than the
# 'MISSING' placeholder, then show the resulting shape.
has_last_review = df['last_review'] != 'MISSING'
has_first_review = df['first_review'] != 'MISSING'
df = df.loc[has_last_review & has_first_review].copy()
df.shape

(484609, 21)

In [652]:
# Per-column dtypes and non-null counts after filtering.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 484609 entries, 0 to 484649
Data columns (total 21 columns):
category              484315 non-null object
compound              484609 non-null float64
date                  484609 non-null object
first_review          484609 non-null object
id                    484609 non-null object
info                  484609 non-null object
last_review           484609 non-null object
name                  484609 non-null object
negative              484609 non-null float64
neighborhood          482772 non-null object
neutral               484609 non-null float64
permanently_closed    484609 non-null int64
positive              484609 non-null float64
price_range           483514 non-null object
ratings               484609 non-null float64
ratings_histogram     484609 non-null object
reviews               484609 non-null float64
star                  484609 non-null float64
subjectivity          484609 non-null float64
Claimed?              48460

In [653]:
# Convert the three date columns from strings to pandas datetimes.
for date_col in ('date', 'last_review', 'first_review'):
    df[date_col] = pd.to_datetime(df[date_col])

# Remove closed restaurants whose last review is earlier than 2012/01/01

In [654]:
import datetime
# Cutoff date used when filtering closed restaurants by their last review.
# A pandas Timestamp is used (rather than datetime.date) because ordering
# comparisons between a datetime64 column and a plain date object are not
# supported by recent pandas releases.
cut_day = pd.Timestamp('2012-01-01')

In [655]:
# Flag review rows of restaurants marked permanently closed whose last review
# predates the cutoff, and keep everything else in a fresh frame.
is_closed = df['permanently_closed'] == 1
last_review_before_cut = df['last_review'] < cut_day
mask = is_closed & last_review_before_cut
cut_df = df.loc[~mask].copy()

In [656]:
# Preview the first rows of the filtered frame.
cut_df.head()

Unnamed: 0,category,compound,date,first_review,id,info,last_review,name,negative,neighborhood,...,permanently_closed,positive,price_range,ratings,ratings_histogram,reviews,star,subjectivity,Claimed?,HasWebsite
0,"Breakfast & Brunch,American (Traditional)",0.9655,2015-02-02,2011-10-08,0_2-sparrows,"[{'Takes Reservations': 'No'}, {'Delivery': 'N...",2015-02-02,2 Sparrows,0.104,Lincoln Park,...,1,0.295,$11-30,3.0,"[{5: 63}, {4: 94}, {3: 67}, {2: 78}, {1: 34}]",336.0,3.0,0.57619,1,1
1,"Breakfast & Brunch,American (Traditional)",0.0926,2014-12-04,2011-10-08,0_2-sparrows,"[{'Takes Reservations': 'No'}, {'Delivery': 'N...",2015-02-02,2 Sparrows,0.131,Lincoln Park,...,1,0.111,$11-30,3.0,"[{5: 63}, {4: 94}, {3: 67}, {2: 78}, {1: 34}]",336.0,5.0,0.4,1,1
2,"Breakfast & Brunch,American (Traditional)",-0.6532,2014-10-07,2011-10-08,0_2-sparrows,"[{'Takes Reservations': 'No'}, {'Delivery': 'N...",2015-02-02,2 Sparrows,0.132,Lincoln Park,...,1,0.064,$11-30,3.0,"[{5: 63}, {4: 94}, {3: 67}, {2: 78}, {1: 34}]",336.0,3.0,0.629167,1,1
3,"Breakfast & Brunch,American (Traditional)",0.9852,2014-09-25,2011-10-08,0_2-sparrows,"[{'Takes Reservations': 'No'}, {'Delivery': 'N...",2015-02-02,2 Sparrows,0.057,Lincoln Park,...,1,0.144,$11-30,3.0,"[{5: 63}, {4: 94}, {3: 67}, {2: 78}, {1: 34}]",336.0,4.0,0.520476,1,1
4,"Breakfast & Brunch,American (Traditional)",0.973,2014-09-23,2011-10-08,0_2-sparrows,"[{'Takes Reservations': 'No'}, {'Delivery': 'N...",2015-02-02,2 Sparrows,0.067,Lincoln Park,...,1,0.189,$11-30,3.0,"[{5: 63}, {4: 94}, {3: 67}, {2: 78}, {1: 34}]",336.0,3.0,0.597,1,1


In [657]:
# (rows, columns) remaining after removing the old closed restaurants.
cut_df.shape

(476583, 21)

# Collapse the canonical data to one restaurant per row to do the train/test split

In [658]:
# Pull out just the restaurant id and its closed flag (still one row per review).
id_closed = cut_df.loc[:, ['id', 'permanently_closed']].copy()
id_closed.shape

(476583, 2)

In [659]:
# Collapse to one row per distinct (id, permanently_closed) pair.
id_closed = id_closed.drop_duplicates()
id_closed.shape

(1152, 2)

# Baseline

In [660]:
# Label balance over the de-duplicated (id, permanently_closed) rows.
id_closed['permanently_closed'].value_counts()

0    849
1    303
Name: permanently_closed, dtype: int64

In [661]:
# Sanity check: each id should appear once; a count above 1 would mean the
# same id carries both values of permanently_closed.
id_closed['id'].value_counts()

76_the-brass-monkey                              1
735_dianas-food-and-restaurant-chicago           1
318_leos-lunchroom                               1
107_shiso                                        1
1_hugos-frog-bar-and-fish-house-chicago          1
718_bretts-kitchen-chicago                       1
1028_primehouse-chicago-2                        1
97_ditkas-restaurant-chicago                     1
294_marketplace-cafe-chicago-4                   1
1230_kikis-bistro-chicago                        1
541_west-town-tavern                             1
148_birchwood-kitchen                            1
108_caliterra                                    1
252_agami-chicago                                1
277_local-option-chicago                         1
370_rios-d-sudamerica-chicago                    1
99_pita-express-chicago                          1
61_letizias-fiore-ristorante-and-wine-shoppe     1
553_zed451                                       1
260_in-fine-spirits-lounge     

# Split data into train/test by restaurant id

In [662]:
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
# Split at the restaurant level so all reviews of one restaurant land on the
# same side. A fixed random_state makes the split (and every file derived from
# it) reproducible on re-run; stratifying on the label keeps the open/closed
# ratio the same in both partitions.
id_train, id_test, _, _ = train_test_split(
    id_closed[['id']],
    id_closed['permanently_closed'],
    stratify=id_closed['permanently_closed'],
    random_state=42,
)

In [663]:
# Plain Python lists of the restaurant ids on each side of the split.
id_train_list = id_train['id'].tolist()
id_test_list = id_test['id'].tolist()

# Functions to extract NLP data with varying block weeks and open weeks

In [664]:
def GetReviewSummary(df=None, blockweeks=52, openweeks=52, NLPsummary=['star']):
    """Summarise one restaurant's reviews, ignoring the most recent ``blockweeks``.

    The reference date is the ``date`` of the FIRST row of ``df``; reviews newer
    than ``reference - blockweeks`` are discarded ("blocked").
    NOTE(review): this presumes the rows are ordered most-recent-first -- confirm
    against the caller.

    Parameters
    ----------
    df : DataFrame
        Reviews of a single restaurant with at least ``date`` and ``star``
        columns plus every column named in ``NLPsummary``. Not modified.
    blockweeks : int
        Number of weeks before the reference date to exclude.
    openweeks : int
        Length (in weeks) of the window immediately before the blocked period
        over which the ``NLPsummary`` columns are summarised.
    NLPsummary : list of str or None
        Columns to summarise over the open window; ``None`` skips this part.

    Returns
    -------
    dict
        ``avg_reviews`` (one-element list: review count divided by the number of
        calendar years spanned), ``avg_star``, ``5_star`` .. ``1_star`` counts,
        and, per NLPsummary column ``c``: ``AvgLast_c``, ``Last_c_intrcpt`` and
        ``Last_c_coef`` (one-element lists) holding the window mean and the
        intercept/slope of a linear fit of ``c`` against days since the last
        row of the window. With fewer than two reviews in the window the
        intercept is the window mean and the slope is 0.0.
    """
    from datetime import timedelta

    Dict = {}
    df = df.copy()  # work on a copy so the caller's frame is left untouched
    df['date'] = pd.to_datetime(df['date'])
    df['year'] = df['date'].dt.year

    blocktime = df['date'].iloc[0] - timedelta(weeks=blockweeks)
    new_df = df[df['date'] <= blocktime].copy()

    years_spanned = new_df['year'].max() - new_df['year'].min() + 1
    Dict['avg_reviews'] = [float(new_df.shape[0]) / years_spanned]
    Dict['avg_star'] = new_df['star'].mean()
    for stars in (5, 4, 3, 2, 1):
        Dict['%d_star' % stars] = int((new_df['star'] == float(stars)).sum())

    if NLPsummary is not None:
        opentime = blocktime - timedelta(weeks=openweeks)
        open_df = new_df[new_df['date'] >= opentime].copy()

        can_fit = open_df.shape[0] >= 2
        if can_fit:
            # scikit-learn is only needed when a trend is actually fitted.
            from sklearn.linear_model import LinearRegression
            # Days relative to the last row of the window, computed once and
            # vectorised (rather than rebuilding the date list for every row).
            open_df['days'] = (open_df['date'] - open_df['date'].iloc[-1]).dt.days

        for item in NLPsummary:
            window_mean = open_df[item].mean()
            Dict['AvgLast_'+item] = [window_mean]
            if can_fit:
                lr = LinearRegression()
                lr.fit(open_df[['days']], open_df[item])
                Dict['Last_'+item+'_intrcpt'] = [lr.intercept_]
                Dict['Last_'+item+'_coef'] = [lr.coef_[0]]
            else:
                # Not enough points for a slope: flat line through the mean.
                Dict['Last_'+item+'_intrcpt'] = [window_mean]
                Dict['Last_'+item+'_coef'] = [0.0]
    return Dict

In [665]:
def GetEachSummary(df=None, idname=None, blockweeks=52, openweeks=52, NLPsummary=['star','compound','subjectivity']):
    """Build the one-row summary frame for the restaurant whose id is ``idname``.

    The static columns are taken from the first row of ``df`` matching the id;
    the review statistics come from ``GetReviewSummary`` applied to all of that
    restaurant's review rows. The two parts are joined on the first row's index
    label and returned as a single-row DataFrame.
    """
    info_columns = ['id', 'name', 'category', 'price_range', 'neighborhood', 'info', 'Claimed?',
                    'HasWebsite', 'first_review', 'last_review', 'permanently_closed']
    review_columns = ['date', 'star', 'compound', 'neutral', 'positive', 'negative', 'subjectivity']

    reviews_of_id = df[df['id'] == idname]

    # First matching row, turned back into a one-row frame.
    first_row = reviews_of_id.iloc[0, :].to_frame().transpose()
    static_info = first_row[info_columns]

    stats = GetReviewSummary(df=reviews_of_id[review_columns], blockweeks=blockweeks,
                             openweeks=openweeks, NLPsummary=NLPsummary)
    stats_frame = pd.DataFrame(stats, index=[static_info.index[0]])
    return static_info.join(stats_frame)

In [666]:
def GetSummarydf(df=None, idlist=None, blockweeks=52, openweeks=52, NLPsummary=['star','compound','subjectivity']):
    """Stack the per-restaurant summaries for every id in ``idlist``.

    Parameters
    ----------
    df : DataFrame
        Review-level data passed through to ``GetEachSummary``.
    idlist : list
        Restaurant ids to summarise, in output order.
    blockweeks, openweeks, NLPsummary
        Forwarded unchanged to ``GetEachSummary``.

    Returns
    -------
    DataFrame
        One row per id, in ``idlist`` order, with a fresh 0..n-1 index.
    """
    # Collect all one-row frames first and concatenate once: repeatedly growing
    # a frame with DataFrame.append copies it on every step and the method is
    # no longer available in pandas 2.x.
    rows = [
        GetEachSummary(df=df, idname=idname,
                       blockweeks=blockweeks, openweeks=openweeks, NLPsummary=NLPsummary)
        for idname in idlist
    ]
    return pd.concat(rows, ignore_index=True)

# Create dictionaries to store train/test dataframes of different block/open weeks combinations
- block weeks: 13, 26, 52 weeks
- open weeks: 26, 52, 78 weeks
- 9 combinations in total

In [667]:
# Build one summary frame per (block weeks, open weeks) combination for both
# the train ids and the test ids, keyed as 'df_block<b>_open<o>'.
blck = [13, 26, 52]
opn = [26, 52, 78]
train, test = {}, {}
for b in blck:
    for o in opn:
        key = 'df_block' + str(b) + '_open' + str(o)
        print(key)
        for target, ids in ((train, id_train_list), (test, id_test_list)):
            target[key] = GetSummarydf(
                df=cut_df,
                idlist=ids,
                blockweeks=b,
                openweeks=o,
                NLPsummary=['star', 'compound', 'subjectivity'],
            )

df_block13_open26
df_block13_open52
df_block13_open78
df_block26_open26
df_block26_open52
df_block26_open78
df_block52_open26
df_block52_open52
df_block52_open78


# Create lists of dummy variables for category, neighborhood, and price range

In [668]:
def GetLabels(df=None, column='category'):
    """Return the distinct comma-separated labels found in ``df[column]``.

    Parameters
    ----------
    df : DataFrame, optional
        Frame to scan. When omitted, ``train['df_block13_open26']`` is looked up
        at call time (not at definition time, so defining this function no
        longer requires ``train`` to exist already).
    column : str
        Column whose cells hold comma-separated labels.

    Returns
    -------
    list of str
        Stripped labels in order of first appearance. Non-string cells (e.g.
        NaN for a missing value) are skipped.
    """
    if df is None:
        df = train['df_block13_open26']
    ensumble = []
    seen = set()  # constant-time membership test; the list keeps the order
    for line in df[column]:
        if not isinstance(line, str):
            continue
        for label in line.split(','):
            label = label.strip()
            if label not in seen:
                seen.add(label)
                ensumble.append(label)
    return ensumble

In [669]:
# Distinct category labels, taken from one training frame only.
categories = GetLabels(df=train['df_block13_open26'], column='category')
categories

['Brazilian',
 'Creperies',
 'Sandwiches',
 'Breakfast & Brunch',
 'Mexican',
 'Tex-Mex',
 'Pubs',
 'American (Traditional)',
 'Seafood',
 'Modern European',
 'American (New)',
 'Mediterranean',
 'Cuban',
 'Smokehouse',
 'Tapas Bars',
 'Restaurants',
 'Pizza',
 'Italian',
 'Greek',
 'Moroccan',
 'Bars',
 'Coffee & Tea',
 'Thai',
 'Sushi Bars',
 'Diners',
 'Juice Bars & Smoothies',
 'Indian',
 'Pakistani',
 'Japanese',
 'Lounges',
 'Caterers',
 'Indonesian',
 'Beer',
 'Wine & Spirits',
 'Delis',
 'Asian Fusion',
 'Vietnamese',
 'Filipino',
 'German',
 'Burgers',
 'Cocktail Bars',
 'Hot Dogs',
 'Grocery',
 'Bakeries',
 'Chinese',
 'Noodles',
 'Gluten-Free',
 'Korean',
 'Salad',
 'Latin American',
 'French',
 'Cafes',
 'Ice Cream & Frozen Yogurt',
 'Steakhouses',
 'Beer Bar',
 'Venues & Event Spaces',
 'Fish & Chips',
 'Scottish',
 'Argentine',
 'Comfort Food',
 'Vegetarian',
 'Polish',
 'Sports Bars',
 'Wine Bars',
 'Gastropubs',
 'Jazz & Blues',
 'Irish',
 'Irish Pub',
 'Vegan',
 'Middl

In [670]:
# Distinct neighborhood labels, taken from one training frame only.
neighborhoods = GetLabels(df=train['df_block13_open26'], column='neighborhood')
neighborhoods

['Near West Side',
 'River West',
 'Noble Square',
 'West Town',
 'Near North Side',
 'Lincoln Park',
 'Irving Park',
 'Logan Square',
 'Bucktown',
 'Near Southside',
 'Avondale',
 'Andersonville',
 'Edgewater',
 'Lakeview',
 'Old Town',
 'West Rogers Park',
 'Uptown',
 'West Loop',
 'River North',
 'Humboldt Park',
 'DePaul',
 'Edison Park',
 'The Loop',
 'Ravenswood',
 'Lincoln Square',
 'Pilsen',
 'Wicker Park',
 'Greektown',
 'Chinatown',
 'Bridgeport',
 'Portage Park',
 'River East',
 'Roscoe Village',
 'North Center',
 'University Village',
 'Rogers Park',
 'Cragin',
 'Gold Coast',
 'Ukrainian Village',
 'Forest Glen',
 'Albany Park',
 'Garfield Ridge',
 'Wrigleyville',
 'South Loop',
 'Fulton Market',
 'Streeterville',
 "Printer's Row",
 'East Garfield Park',
 'Hermosa',
 'Goose Island',
 'Little Village']

In [671]:
# Distinct price-range labels, taken from one training frame only.
price_ranges = GetLabels(df=train['df_block13_open26'], column='price_range')
price_ranges

['$11-30',
 'Moderate',
 'Under $10',
 '$31-60',
 'Inexpensive',
 'Above $61',
 'Pricey']

Drop 'Restaurants', 'Pilsen', and 'Pricey' for category, neighborhood, and price range, respectively. (The cell below is currently commented out, so no level is actually dropped.)

In [634]:
#categories.remove('Restaurants')
#neighborhoods.remove('Pilsen')
#price_ranges.remove('Pricey')

# Check values for 'Attire', 'Parking', 'Alcohol', 'Noise Level', and 'Wi-Fi' in info:

In [672]:
def ColumnParser(info, text=False):
    '''Parse the string form of a list of one-entry dicts into a single dict.

    ``info`` is expected to look like ``"[{'Delivery': 'No'}, {'Has TV': 'Yes'}]"``
    (``text=True``) or ``"[{5: 63}, {4: 94}]"`` (``text=False``).

    Parameters
    ----------
    info : str
        Text to parse. A dict is returned as a shallow copy (so re-applying the
        parser to an already-parsed column is harmless); any other non-string
        value, e.g. NaN, yields an empty dict.
    text : bool
        True  -> key and value are the first two single-quoted substrings of
                 each ``{...}`` entry (quotes removed).
        False -> each entry is split at its first ``': '``; key and value are
                 returned as the raw text on either side.

    Returns
    -------
    dict
        Mapping of str to str. Entries that do not contain both a key and a
        value are skipped instead of raising.
    '''
    import re

    if isinstance(info, dict):
        return dict(info)
    # Strings of length <= 2 (e.g. '[]') carry no entries.
    if not isinstance(info, str) or len(info) <= 2:
        return {}

    Dict = {}
    for item in re.findall(r"\{(.*?)\}", info):
        if text:
            quoted = re.findall(r"\'(.*?)\'", item)
            if len(quoted) < 2:
                continue
            key, value = quoted[0], quoted[1]
        else:
            # Split once so a value that itself contains ': ' stays whole.
            key, sep, value = item.partition(': ')
            if not sep:
                continue
        Dict[key] = value
    return Dict

In [673]:
# Work on a copy of one training frame and turn its 'info' strings into dicts
# so the available attribute values can be inspected below.
# Note: this rebinds the notebook-level name `df`.
df = train['df_block13_open26'].copy()
df['info'] = df['info'].apply(lambda x: ColumnParser(x, text=True))

In [674]:
def GetValues(df=None, feature='Attire'):
    """Return the distinct values recorded for ``feature`` in ``df['info']``.

    Parameters
    ----------
    df : DataFrame, optional
        Frame whose ``info`` column holds dicts (as produced by ColumnParser).
        When omitted, ``train['df_block13_open26']`` is looked up at call time.
    feature : str
        Key to look up in each ``info`` dict.

    Returns
    -------
    list of str
        Unique values, sorted so the result is the same on every run. A value
        containing ``', '`` is treated as several values.
    """
    if df is None:
        df = train['df_block13_open26']
    ensumble = set()
    for item in df['info']:
        if feature in item:
            # split() returns the whole string when there is no ', ' in it.
            ensumble.update(item[feature].split(', '))
    return sorted(ensumble)

In [675]:
# Show which values occur for each multi-valued attribute in the parsed frame.
for f in ['Attire', 'Parking', 'Alcohol', 'Noise Level', 'Wi-Fi']:
    print(f+': ', GetValues(df=df, feature=f))

Attire:  ['Dressy', 'Formal (Jacket Required)', 'Casual']
Parking:  ['Garage', 'Street', 'Valet', 'Validated', 'Private Lot']
Alcohol:  ['No', 'Full Bar', 'Beer & Wine Only']
Noise Level:  ['Loud', 'Very Loud', 'Quiet', 'Average']
Wi-Fi:  ['No', 'Paid', 'Free']


In [676]:
# Yes/No attributes: each becomes a single 0/1 column (1 when the value is 'Yes').
binomial = ['Accepts Credit Cards', 'Good for Groups', 'Good for Kids',
            'Takes Reservations', 'Outdoor Seating', 'Take-out', 'Delivery', 'Has TV']
# Multi-valued attributes: one 0/1 column per listed value, named '<attribute>_<value>'.
# The values must match the strings found in the data exactly
# (the observed noise level is 'Quiet').
polynomial = {'Attire': ['Dressy', 'Casual'],
              'Parking': ['Valet', 'Garage', 'Street', 'Validated'],
              'Alcohol': ['Full Bar', 'No'],
              'Noise Level': ['Loud', 'Quiet', 'Average'],
              'Wi-Fi': ['No', 'Free']}

In [677]:
# Independent copies of every train/test frame, so the dummy-variable step can
# modify them without touching the originals. Dict comprehensions keep the loop
# variables local, so the notebook-level `df` and `key` are not overwritten.
train_copy = {name: frame.copy() for name, frame in train.items()}
test_copy = {name: frame.copy() for name, frame in test.items()}

# Process each train/test pair to get dummy variables

In [678]:
def _label_set(cell):
    """Return the set of stripped, comma-separated labels in ``cell`` (empty for non-strings such as NaN)."""
    if not isinstance(cell, str):
        return set()
    return {part.strip() for part in cell.split(',')}


# Add the dummy columns to every copied train/test frame, in place.
for i, df in enumerate(list(train_copy.values()) + list(test_copy.values())):
    print(i)
    for key, columns in {'category': categories, 'neighborhood': neighborhoods, 'price_range': price_ranges}.items():
        # Compare whole labels, split the same way GetLabels splits them.
        # A substring test would also set 'Bars' for 'Sushi Bars', 'Irish'
        # for 'Irish Pub', 'Rogers Park' for 'West Rogers Park', and so on.
        label_sets = df[key].apply(_label_set)
        for c in columns:
            df[c] = label_sets.apply(lambda labels, c=c: 1 if c in labels else 0)
    df['info'] = df['info'].apply(lambda x: ColumnParser(x, text=True))
    for b in binomial:
        # 1 only when the attribute is present and equal to 'Yes'.
        df[b] = df['info'].apply(lambda x: 1 if x.get(b,'0') == 'Yes' else 0)
    for k, values in polynomial.items():
        for v in values:
            df[k+'_'+v] = df['info'].apply(lambda x: 1 if x.get(k, '0') == v else 0 )
    # The source columns are now fully encoded. The drop is done in place
    # because `df` is the very object stored in train_copy / test_copy.
    df.drop(['category', 'neighborhood', 'price_range', 'info'], inplace=True,axis=1)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17


# Write each train/test dataframe as csv files in ../part_03

In [679]:
# Write one CSV per training frame to ../part_03 (row index omitted).
for key, df in train_copy.items():
    df.to_csv('../part_03/train_'+key+'.csv', index=False)

In [680]:
# Write one CSV per test frame to ../part_03 (row index omitted).
for key, df in test_copy.items():
    df.to_csv('../part_03/test_'+key+'.csv', index=False)