# Load from canonical restaurant data

In [1]:
import pandas as pd
import numpy as np
from os import listdir
from os.path import isfile, join
import re
import datetime

In [2]:
files = [f for f in listdir('./') if f.endswith(".csv")]
files

['CanonicalSummary.csv', 'ClosedRestaurants.csv']

In [4]:
df = pd.read_csv('./CanonicalRestaurants.csv')

In [5]:
df.columns

Index(['address', 'category', 'claimed_status', 'compound', 'date',
       'first_review', 'health_rating', 'id', 'info', 'last_review',
       'latitude', 'longitude', 'name', 'negative', 'neighborhood', 'neutral',
       'permanently_closed', 'phone', 'positive', 'price_range', 'ratings',
       'ratings_histogram', 'reviews', 'star', 'subjectivity', 'url',
       'website', 'working_hours'],
      dtype='object')

In [6]:
df.head().T

Unnamed: 0,0,1,2,3,4
address,"553 West Diversey Pkwy Chicago, IL 60614 b/t H...","553 West Diversey Pkwy Chicago, IL 60614 b/t H...","553 West Diversey Pkwy Chicago, IL 60614 b/t H...","553 West Diversey Pkwy Chicago, IL 60614 b/t H...","553 West Diversey Pkwy Chicago, IL 60614 b/t H..."
category,"Breakfast & Brunch,American (Traditional)","Breakfast & Brunch,American (Traditional)","Breakfast & Brunch,American (Traditional)","Breakfast & Brunch,American (Traditional)","Breakfast & Brunch,American (Traditional)"
claimed_status,Claimed,Claimed,Claimed,Claimed,Claimed
compound,0.9655,0.0926,-0.6532,0.9852,0.973
date,2015-02-02,2014-12-04,2014-10-07,2014-09-25,2014-09-23
first_review,2011-10-08,2011-10-08,2011-10-08,2011-10-08,2011-10-08
health_rating,,,,,
id,0_2-sparrows,0_2-sparrows,0_2-sparrows,0_2-sparrows,0_2-sparrows
info,"[{'Takes Reservations': 'No'}, {'Delivery': 'N...","[{'Takes Reservations': 'No'}, {'Delivery': 'N...","[{'Takes Reservations': 'No'}, {'Delivery': 'N...","[{'Takes Reservations': 'No'}, {'Delivery': 'N...","[{'Takes Reservations': 'No'}, {'Delivery': 'N..."
last_review,2015-02-02,2015-02-02,2015-02-02,2015-02-02,2015-02-02


In [7]:
df.shape

(484650, 28)

In [8]:
df['Claimed?'] = df['claimed_status'].apply(lambda x: 1 if str(x) == 'Claimed' else 0)
df['HasWebsite'] = df['website'].apply(lambda x: 1 if 'http' in str(x) else 0)

In [9]:
droplist = ['health_rating'] #'phone', 'url', 'claimed_status', 'website', 'address', 'longitude', 'latitude']
df.drop(droplist, inplace=True, axis=1)

In [10]:
df = df[(df['last_review'] != 'MISSING') & (df['first_review'] != 'MISSING')].copy()
df.shape

(484609, 29)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 484609 entries, 0 to 484649
Data columns (total 29 columns):
address               484609 non-null object
category              484315 non-null object
claimed_status        433154 non-null object
compound              484609 non-null float64
date                  484609 non-null object
first_review          484609 non-null object
id                    484609 non-null object
info                  484609 non-null object
last_review           484609 non-null object
latitude              484609 non-null float64
longitude             484609 non-null float64
name                  484609 non-null object
negative              484609 non-null float64
neighborhood          482772 non-null object
neutral               484609 non-null float64
permanently_closed    484609 non-null int64
phone                 482982 non-null object
positive              484609 non-null float64
price_range           483514 non-null object
ratings               484609 

In [12]:
df['date'] =  pd.to_datetime(df['date'])
df['last_review'] =  pd.to_datetime(df['last_review'])
df['first_review'] =  pd.to_datetime(df['first_review'])

In [13]:
df.head()

Unnamed: 0,address,category,claimed_status,compound,date,first_review,id,info,last_review,latitude,...,ratings,ratings_histogram,reviews,star,subjectivity,url,website,working_hours,Claimed?,HasWebsite
0,"553 West Diversey Pkwy Chicago, IL 60614 b/t H...","Breakfast & Brunch,American (Traditional)",Claimed,0.9655,2015-02-02,2011-10-08,0_2-sparrows,"[{'Takes Reservations': 'No'}, {'Delivery': 'N...",2015-02-02,41.932572,...,3.0,"[{5: 63}, {4: 94}, {3: 67}, {2: 78}, {1: 34}]",336.0,3.0,0.57619,https://www.yelp.com/biz/2-sparrows-chicago,http://2sparrowschicago.com,[],1,1
1,"553 West Diversey Pkwy Chicago, IL 60614 b/t H...","Breakfast & Brunch,American (Traditional)",Claimed,0.0926,2014-12-04,2011-10-08,0_2-sparrows,"[{'Takes Reservations': 'No'}, {'Delivery': 'N...",2015-02-02,41.932572,...,3.0,"[{5: 63}, {4: 94}, {3: 67}, {2: 78}, {1: 34}]",336.0,5.0,0.4,https://www.yelp.com/biz/2-sparrows-chicago,http://2sparrowschicago.com,[],1,1
2,"553 West Diversey Pkwy Chicago, IL 60614 b/t H...","Breakfast & Brunch,American (Traditional)",Claimed,-0.6532,2014-10-07,2011-10-08,0_2-sparrows,"[{'Takes Reservations': 'No'}, {'Delivery': 'N...",2015-02-02,41.932572,...,3.0,"[{5: 63}, {4: 94}, {3: 67}, {2: 78}, {1: 34}]",336.0,3.0,0.629167,https://www.yelp.com/biz/2-sparrows-chicago,http://2sparrowschicago.com,[],1,1
3,"553 West Diversey Pkwy Chicago, IL 60614 b/t H...","Breakfast & Brunch,American (Traditional)",Claimed,0.9852,2014-09-25,2011-10-08,0_2-sparrows,"[{'Takes Reservations': 'No'}, {'Delivery': 'N...",2015-02-02,41.932572,...,3.0,"[{5: 63}, {4: 94}, {3: 67}, {2: 78}, {1: 34}]",336.0,4.0,0.520476,https://www.yelp.com/biz/2-sparrows-chicago,http://2sparrowschicago.com,[],1,1
4,"553 West Diversey Pkwy Chicago, IL 60614 b/t H...","Breakfast & Brunch,American (Traditional)",Claimed,0.973,2014-09-23,2011-10-08,0_2-sparrows,"[{'Takes Reservations': 'No'}, {'Delivery': 'N...",2015-02-02,41.932572,...,3.0,"[{5: 63}, {4: 94}, {3: 67}, {2: 78}, {1: 34}]",336.0,3.0,0.597,https://www.yelp.com/biz/2-sparrows-chicago,http://2sparrowschicago.com,[],1,1


# Remove closed restaurants that have last review earlier than 2012/01/01

In [14]:
cut_day = datetime.date(2012, 1, 1)
mask = (df['permanently_closed'] == 1) & (df['last_review'] < cut_day)
cut_df = df[~mask].copy()

In [15]:
cut_df.tail()

Unnamed: 0,address,category,claimed_status,compound,date,first_review,id,info,last_review,latitude,...,ratings,ratings_histogram,reviews,star,subjectivity,url,website,working_hours,Claimed?,HasWebsite
484645,"100 W Randolph St Chicago, IL 60601 b/t Beaubi...",Mediterranean,Claimed,0.9844,2007-07-24,2005-07-29,99_pita-express-chicago,"[{'Takes Reservations': 'No'}, {'Delivery': 'N...",2017-07-10,41.885127,...,4.0,"[{5: 35}, {4: 13}, {3: 8}, {2: 10}, {1: 7}]",73.0,4.0,0.634476,https://www.yelp.com/biz/pita-express-chicago?...,http://pitaexpressinc.com/,"[{'Mon': '8:00 am - 3:00 pm'}, {'Tue': '8:00 a...",1,1
484646,"100 W Randolph St Chicago, IL 60601 b/t Beaubi...",Mediterranean,Claimed,0.59,2007-07-20,2005-07-29,99_pita-express-chicago,"[{'Takes Reservations': 'No'}, {'Delivery': 'N...",2017-07-10,41.885127,...,4.0,"[{5: 35}, {4: 13}, {3: 8}, {2: 10}, {1: 7}]",73.0,4.0,0.469444,https://www.yelp.com/biz/pita-express-chicago?...,http://pitaexpressinc.com/,"[{'Mon': '8:00 am - 3:00 pm'}, {'Tue': '8:00 a...",1,1
484647,"100 W Randolph St Chicago, IL 60601 b/t Beaubi...",Mediterranean,Claimed,0.9789,2007-05-03,2005-07-29,99_pita-express-chicago,"[{'Takes Reservations': 'No'}, {'Delivery': 'N...",2017-07-10,41.885127,...,4.0,"[{5: 35}, {4: 13}, {3: 8}, {2: 10}, {1: 7}]",73.0,5.0,0.500281,https://www.yelp.com/biz/pita-express-chicago?...,http://pitaexpressinc.com/,"[{'Mon': '8:00 am - 3:00 pm'}, {'Tue': '8:00 a...",1,1
484648,"100 W Randolph St Chicago, IL 60601 b/t Beaubi...",Mediterranean,Claimed,0.8879,2005-08-26,2005-07-29,99_pita-express-chicago,"[{'Takes Reservations': 'No'}, {'Delivery': 'N...",2017-07-10,41.885127,...,4.0,"[{5: 35}, {4: 13}, {3: 8}, {2: 10}, {1: 7}]",73.0,3.0,0.663333,https://www.yelp.com/biz/pita-express-chicago?...,http://pitaexpressinc.com/,"[{'Mon': '8:00 am - 3:00 pm'}, {'Tue': '8:00 a...",1,1
484649,"100 W Randolph St Chicago, IL 60601 b/t Beaubi...",Mediterranean,Claimed,0.7579,2005-07-29,2005-07-29,99_pita-express-chicago,"[{'Takes Reservations': 'No'}, {'Delivery': 'N...",2017-07-10,41.885127,...,4.0,"[{5: 35}, {4: 13}, {3: 8}, {2: 10}, {1: 7}]",73.0,5.0,0.49,https://www.yelp.com/biz/pita-express-chicago?...,http://pitaexpressinc.com/,"[{'Mon': '8:00 am - 3:00 pm'}, {'Tue': '8:00 a...",1,1


In [16]:
cut_df.shape

(476583, 29)

In [20]:
cut_df.iloc[-1,24]

'https://www.yelp.com/biz/pita-express-chicago?sort_by=date_asc&start=0'

# Use url to get id

In [21]:
cut_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 476583 entries, 0 to 484649
Data columns (total 29 columns):
address               476583 non-null object
category              476292 non-null object
claimed_status        428565 non-null object
compound              476583 non-null float64
date                  476583 non-null datetime64[ns]
first_review          476583 non-null datetime64[ns]
id                    476583 non-null object
info                  476583 non-null object
last_review           476583 non-null datetime64[ns]
latitude              476583 non-null float64
longitude             476583 non-null float64
name                  476583 non-null object
negative              476583 non-null float64
neighborhood          474746 non-null object
neutral               476583 non-null float64
permanently_closed    476583 non-null int64
phone                 474970 non-null object
positive              476583 non-null float64
price_range           475623 non-null object
ratin

In [22]:
cut_df['permanently_closed'].value_counts()

0    404590
1     71993
Name: permanently_closed, dtype: int64

In [30]:
cut_df['permanently_closed'].value_counts()

0    404590
1     71993
Name: permanently_closed, dtype: int64

In [31]:
cut_df = cut_df.copy()

In [35]:
def get_yelpid(row):
    if isinstance(row['url'], str):
        yelp_id = row['url'].replace('https://www.yelp.com/biz/','').split('?')[0]
    else:
        yelp_id = row['id']+'-chicago'
    return yelp_id
        
cut_df['yelp_id'] = cut_df[['id','url']].apply(lambda x: get_yelpid(x), axis=1)

In [36]:
droplist = ['id'] #'phone', 'url', 'claimed_status', 'website', 'address', 'longitude', 'latitude']
cut_df.drop(droplist, inplace=True, axis=1)

In [37]:
cut_df.tail()

Unnamed: 0,address,category,claimed_status,compound,date,first_review,info,last_review,latitude,longitude,...,ratings_histogram,reviews,star,subjectivity,url,website,working_hours,Claimed?,HasWebsite,yelp_id
484645,"100 W Randolph St Chicago, IL 60601 b/t Beaubi...",Mediterranean,Claimed,0.9844,2007-07-24,2005-07-29,"[{'Takes Reservations': 'No'}, {'Delivery': 'N...",2017-07-10,41.885127,-87.63169,...,"[{5: 35}, {4: 13}, {3: 8}, {2: 10}, {1: 7}]",73.0,4.0,0.634476,https://www.yelp.com/biz/pita-express-chicago?...,http://pitaexpressinc.com/,"[{'Mon': '8:00 am - 3:00 pm'}, {'Tue': '8:00 a...",1,1,pita-express-chicago
484646,"100 W Randolph St Chicago, IL 60601 b/t Beaubi...",Mediterranean,Claimed,0.59,2007-07-20,2005-07-29,"[{'Takes Reservations': 'No'}, {'Delivery': 'N...",2017-07-10,41.885127,-87.63169,...,"[{5: 35}, {4: 13}, {3: 8}, {2: 10}, {1: 7}]",73.0,4.0,0.469444,https://www.yelp.com/biz/pita-express-chicago?...,http://pitaexpressinc.com/,"[{'Mon': '8:00 am - 3:00 pm'}, {'Tue': '8:00 a...",1,1,pita-express-chicago
484647,"100 W Randolph St Chicago, IL 60601 b/t Beaubi...",Mediterranean,Claimed,0.9789,2007-05-03,2005-07-29,"[{'Takes Reservations': 'No'}, {'Delivery': 'N...",2017-07-10,41.885127,-87.63169,...,"[{5: 35}, {4: 13}, {3: 8}, {2: 10}, {1: 7}]",73.0,5.0,0.500281,https://www.yelp.com/biz/pita-express-chicago?...,http://pitaexpressinc.com/,"[{'Mon': '8:00 am - 3:00 pm'}, {'Tue': '8:00 a...",1,1,pita-express-chicago
484648,"100 W Randolph St Chicago, IL 60601 b/t Beaubi...",Mediterranean,Claimed,0.8879,2005-08-26,2005-07-29,"[{'Takes Reservations': 'No'}, {'Delivery': 'N...",2017-07-10,41.885127,-87.63169,...,"[{5: 35}, {4: 13}, {3: 8}, {2: 10}, {1: 7}]",73.0,3.0,0.663333,https://www.yelp.com/biz/pita-express-chicago?...,http://pitaexpressinc.com/,"[{'Mon': '8:00 am - 3:00 pm'}, {'Tue': '8:00 a...",1,1,pita-express-chicago
484649,"100 W Randolph St Chicago, IL 60601 b/t Beaubi...",Mediterranean,Claimed,0.7579,2005-07-29,2005-07-29,"[{'Takes Reservations': 'No'}, {'Delivery': 'N...",2017-07-10,41.885127,-87.63169,...,"[{5: 35}, {4: 13}, {3: 8}, {2: 10}, {1: 7}]",73.0,5.0,0.49,https://www.yelp.com/biz/pita-express-chicago?...,http://pitaexpressinc.com/,"[{'Mon': '8:00 am - 3:00 pm'}, {'Tue': '8:00 a...",1,1,pita-express-chicago


# Collapse the cononical data to one restaurant per row to do train/test split

In [38]:
id_closed = cut_df[['yelp_id', 'permanently_closed']].copy()
id_closed.shape

(476583, 2)

In [50]:
id_closed = cut_df[['yelp_id','address', 'name', 'permanently_closed']].copy()
id_closed.shape

(476583, 4)

In [51]:
id_closed.drop_duplicates(inplace=True)
id_closed.shape

(1081, 4)

In [80]:
id_closed.drop_duplicates(inplace=True)
id_closed.shape

(1072, 2)

# Baseline

In [52]:
id_closed['permanently_closed'].value_counts()

0    793
1    288
Name: permanently_closed, dtype: int64

In [81]:
id_closed['permanently_closed'].value_counts()

0    789
1    283
Name: permanently_closed, dtype: int64

In [53]:
id_closed['yelp_id'].value_counts()

embeya-chicago                              2
kitchenette-chicago                         1
137_nouveau-tavern-chicago                  1
angelina-ristorante-chicago                 1
bellas-pizza-and-restaurant-chicago         1
mr-greek-gyros-chicago                      1
susies-noon-hour-grill-chicago              1
las-tablas-chicago                          1
king-cafe-gourmet-and-go-chicago            1
el-cid-2-chicago                            1
sticky-rice-chicago                         1
riverview-tavern-chicago                    1
xoco-chicago                                1
double-happiness-chicago                    1
the-belmont-cafe-chicago                    1
33-club-chicago                             1
la-bocca-della-verita-chicago               1
jims-original-hot-dog-chicago               1
el-tipico-mexican-grill-chicago             1
the-motel-bar-chicago                       1
371_rylons-smokehouse-chicago               1
81_pizza-house-1647-chicago       

# Split data into train/test by restaurant id

In [83]:
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
id_train, id_test, _, _ = train_test_split(id_closed[['yelp_id']], id_closed['permanently_closed'])

In [84]:
id_train_list = list(id_train['yelp_id'])
id_test_list = list(id_test['yelp_id'])

# Functions to extract NLP data with varying block weeks and open weeks

In [85]:
def get_review_summary(df=None, blockweeks=52, openweeks=52, NLPsummary=['star']):
    from sklearn.linear_model import LinearRegression
    from datetime import timedelta
    Dict = {}
    df = df.copy()
    df['date'] =  pd.to_datetime(df['date'])
    df['year'] = df['date'].apply(lambda x: x.year)
    blck = timedelta(weeks=blockweeks)
    blocktime = df['date'].iloc[0] - blck
    new_df = df[df['date'] <= blocktime].copy()
    Dict['avg_reviews'] = [float(new_df.shape[0])/(new_df['year'].max() - new_df['year'].min() + 1)]    
    Dict['avg_star'] = new_df['star'].mean()
    Dict['5_star'] = new_df[new_df['star'] == 5.0].shape[0]
    Dict['4_star'] = new_df[new_df['star'] == 4.0].shape[0]
    Dict['3_star'] = new_df[new_df['star'] == 3.0].shape[0]
    Dict['2_star'] = new_df[new_df['star'] == 2.0].shape[0]
    Dict['1_star'] = new_df[new_df['star'] == 1.0].shape[0]
    if NLPsummary != None:            
        opn = timedelta(weeks=openweeks)        
        opentime = blocktime - opn
        open_df = new_df[(new_df['date'] >= opentime)].copy()
        open_df['days'] = open_df['date'].apply(lambda x: (x - list(open_df['date'])[-1]).days)
        for item in NLPsummary:
            Dict['AvgLast_'+item] = [open_df[item].mean()]
            if open_df.shape[0] >= 2:             
                lr = LinearRegression()
                lr.fit(open_df[['days']],open_df[item])
                Dict['Last_'+item+'_intrcpt'] = [lr.intercept_]
                Dict['Last_'+item+'_coef'] = [lr.coef_[0]]
            else:
                Dict['Last_'+item+'_intrcpt'] = [open_df[item].mean()]
                Dict['Last_'+item+'_coef'] = [0.0] 
    return Dict

In [86]:
def get_each_summary(df=None, idname=None, blockweeks=52, openweeks=52, NLPsummary=['star','compound','subjectivity']):
    subset = df[df['yelp_id']==idname]
    row = pd.DataFrame(subset.iloc[0,:]).transpose()
    InfoList = ['yelp_id', 'name', 'category', 'price_range', 'address', 'neighborhood', 
                'info', 'Claimed?', 'website', 'phone', 'working_hours', 'ratings', 'reviews',
                'HasWebsite', 'first_review', 'last_review', 'permanently_closed']
    Info = row[InfoList]
    ReviewList = ['date', 'star', 'compound', 'neutral', 'positive', 'negative', 'subjectivity']
    Review = subset[ReviewList]
    Summary = get_review_summary(df=Review, blockweeks=blockweeks, openweeks=openweeks, NLPsummary=NLPsummary)
    Sum_df = pd.DataFrame(Summary, index=[Info.index[0]])
    each = Info.join(Sum_df)
    return each

In [87]:
def get_summary_df(df=None, idlist=None, blockweeks=52, openweeks=52, NLPsummary=['star','compound','subjectivity']):
    Summary_df = get_each_summary(df=df, idname=idlist[0], 
                                blockweeks=blockweeks, openweeks=openweeks, NLPsummary=NLPsummary)
    for i, idname in enumerate(idlist[1:]):
        new = get_each_summary(df=df, idname=idname, 
                             blockweeks=blockweeks, openweeks=openweeks, NLPsummary=NLPsummary)
        Summary_df = Summary_df.append(new, ignore_index=True)     
    return Summary_df

# Create dictionaries to store train/test dataframes of different block/open weeks combinations
- block weeks: 13, 26, 52 weeks
- open weeks: 26, 52, 78 weeks
- 9 combinations in total

In [88]:
blck = [0, 13, 26, 52]
opn = [26, 52, 78]
train = {} 
test = {}
for b in blck:
    for o in opn:
        key = 'df_block' + str(b) + '_open' + str(o)
        print(key)
        train[key] = get_summary_df(df=cut_df, idlist=id_train_list, 
                                  blockweeks=b, openweeks=o, NLPsummary=['star','compound','subjectivity'])
        test[key] = get_summary_df(df=cut_df, idlist=id_test_list, 
                                 blockweeks=b, openweeks=o, NLPsummary=['star','compound','subjectivity'])

df_block0_open26
df_block0_open52
df_block0_open78
df_block13_open26
df_block13_open52
df_block13_open78
df_block26_open26
df_block26_open52
df_block26_open78
df_block52_open26
df_block52_open52
df_block52_open78


In [89]:
for key, train_df in train.items():
    test_df = test[key]
    print('train size:', len(train_df), 'test size', len(test[key]))
    df = train_df.append(test_df)    
    df.to_csv(key+'.csv', index=False)

train size: 804 test size 268
train size: 804 test size 268
train size: 804 test size 268
train size: 804 test size 268
train size: 804 test size 268
train size: 804 test size 268
train size: 804 test size 268
train size: 804 test size 268
train size: 804 test size 268
train size: 804 test size 268
train size: 804 test size 268
train size: 804 test size 268


In [90]:
df = pd.read_csv('df_block13_open26.csv')

In [91]:
df.head()

Unnamed: 0,yelp_id,name,category,price_range,address,neighborhood,info,Claimed?,website,phone,...,AvgLast_star,AvgLast_subjectivity,Last_compound_coef,Last_compound_intrcpt,Last_star_coef,Last_star_intrcpt,Last_subjectivity_coef,Last_subjectivity_intrcpt,avg_reviews,avg_star
0,afghan-kabob-chicago,Afghan Kabob,"Middle Eastern,Afghan",$11-30,"4040 W Montrose Ave Chicago, IL 60641 b/t Pula...",Albany Park,"[{'Takes Reservations': 'No'}, {'Delivery': 'Y...",1,http://www.afghan-kabob.com,(773) 427-5041,...,4.0,0.602101,0.018577,0.030445,0.031406,3.322532,-0.004437,0.697803,21.454545,4.394068
1,taco-and-burrito-house-chicago,Taco & Burrito House,Mexican,Under $10,"1548 W Fullerton Ave Chicago, IL 60614 b/t Bos...","Lincoln Park, DePaul","[{'Takes Reservations': 'No'}, {'Delivery': 'N...",0,,(773) 665-8389,...,4.0,0.534943,0.004739,0.090441,0.017407,2.807598,0.000517,0.499548,7.0,3.369048
2,meatball-hero-and-pasta-chicago,Meatball Hero & Pasta,"American (Traditional),Italian",Under $10,"3037 N Clark St Chicago, IL 60657 b/t Barry Av...",Lakeview,"[{'Takes Reservations': 'No'}, {'Delivery': 'Y...",1,http://www.meatballheroes.com/,(773) 348-3037,...,4.625,0.610423,0.002643,0.615613,0.005913,4.222562,-8.2e-05,0.616036,16.5,4.575758
3,sinha-elegant-cuisine-chicago,Sinha Elegant Cuisine,Brazilian,$11-30,"2018 W Adams Chicago, IL 60612 b/t Damen Ave &...",Near West Side,"[{'Takes Reservations': 'Yes'}, {'Delivery': '...",1,http://www.sinhaelegantcuisine.com,(312) 491-8200,...,4.454545,0.665821,0.00041,0.909065,-0.000515,4.493152,0.000481,0.629725,10.0,4.258333
4,bella-luna-bar-and-pizza-chicago,Bella Luna Bar & Pizza,"Italian,Pizza",$11-30,"731 N Dearborn Chicago, IL 60654 Near North Side",Near North Side,"[{'Takes Reservations': 'Yes'}, {'Delivery': '...",1,http://www.bellalunachicago.com/,(312) 751-2552,...,3.666667,0.692331,-0.002152,0.869853,-0.02133,5.010469,-0.001287,0.773386,8.230769,2.990654


In [92]:
df.columns

Index(['yelp_id', 'name', 'category', 'price_range', 'address', 'neighborhood',
       'info', 'Claimed?', 'website', 'phone', 'working_hours', 'ratings',
       'reviews', 'HasWebsite', 'first_review', 'last_review',
       'permanently_closed', '1_star', '2_star', '3_star', '4_star', '5_star',
       'AvgLast_compound', 'AvgLast_star', 'AvgLast_subjectivity',
       'Last_compound_coef', 'Last_compound_intrcpt', 'Last_star_coef',
       'Last_star_intrcpt', 'Last_subjectivity_coef',
       'Last_subjectivity_intrcpt', 'avg_reviews', 'avg_star'],
      dtype='object')

In [93]:
df['between'] = df['address'].apply(lambda x: 'b/t ' + x.split('b/t')[1].strip() if len(x.split('b/t'))==2 else None)
df['address'] = df['address'].apply(lambda x: x.split('b/t')[0].strip())

In [94]:
def column_parser(info, text=False, working_hr=False):
    '''Parse info column to a dictionary'''
    Dict = {}
    if len(info) > 2:
        List = re.findall(r"\{(.*?)\}", info)        
        for item in List:  
            if text:
                key = re.findall(r"\'(.*?)\'", item)[0]
                value = re.findall(r"\'(.*?)\'", item)[1]
                if working_hr:
                    additional = '\\n        \\n                Closed now'
                    value = value.split(additional)[0]
                    if value == 'Closed':
                        value = ['Closed']
                    else:
                        value = re.findall(r"(\d+:\d+ \w+m - \d+:\d+ \w+m)", value)                    
            else:
                key = item.split(': ')[0]
                value = item.split(': ')[1]
            Dict[key] = value
    return Dict    

In [95]:
df['working_hours'] = df['working_hours'].apply(lambda x: column_parser(x, text=True, working_hr=True))
df['info'] = df['info'].apply(lambda x: column_parser(x, text=True))

In [96]:
def hour_parser(working_hours, day='Mon', n=0):
    '''Parse info column to a dictionary'''
    import numpy as np    
    day_hours = working_hours.get(day)
    if day_hours != None and len(day_hours) >= n+1:
        return day_hours[n]
    else:
        return np.nan

In [97]:
days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
for d in days:
    df[d] = df['working_hours'].apply(lambda x: hour_parser(x, day=d, n=0))

for d in days:
    df[d+'1'] = df['working_hours'].apply(lambda x: hour_parser(x, day=d, n=1))

In [98]:
def dollar_signs(price_range):
    import numpy as np
    dictionary = {'Pricey':'$$$', 'Moderate':'$$', 'Inexpensive':'$', 
                  'Above $61':'$$$$', '$31-60':'$$$', '$11-30':'$$', 'Under $10':'$'}
    return dictionary.get(price_range, np.nan)

In [99]:
df['dollar_signs'] = df['price_range'].apply(lambda x: dollar_signs(x))

In [100]:
infos = ['Accepts Credit Cards', 'Good for Groups', 'Good for Kids', 
         'Takes Reservations', 'Outdoor Seating', 'Take-out', 'Delivery', 
         'Has TV', 'Attire', 'Parking', 'Alcohol', 'Noise Level', 'Wi-Fi']
for i in infos:
    df[i] = df['info'].apply(lambda x: x.get(i, np.nan))

In [101]:
df['category'] = df['category'].apply(lambda x: x.replace(',', ', ') if type(x) == str else x)
df['probability'] = '90%'
df['ratings_notation'] = df['ratings'].apply(lambda x: str(x).replace('.0','').replace('.5', '-half'))

In [102]:
df['address'] = df[['address', 'neighborhood']].apply(lambda row: row['address'].replace(str(row['neighborhood']),'').strip(), axis=1)
df['between'] = df[['between', 'neighborhood']].apply(lambda row: row['between'].replace(str(row['neighborhood']),'').strip() if type(row['between']) == str else row['between'], axis=1)

In [103]:
def address_parser(address, n=0):
    parsed = re.findall(r"(.*)(Chicago, IL .*)", address)
    if parsed != [] and len(parsed[0]) >= n+1:
        return parsed[0][n].strip()
    else:
        return address

In [104]:
df['address1'] = df['address'].apply(lambda x: address_parser(x, n=0))
df['address2'] = df['address'].apply(lambda x: address_parser(x, n=1))

In [105]:
df.head()

Unnamed: 0,yelp_id,name,category,price_range,address,neighborhood,info,Claimed?,website,phone,...,Has TV,Attire,Parking,Alcohol,Noise Level,Wi-Fi,probability,ratings_notation,address1,address2
0,afghan-kabob-chicago,Afghan Kabob,"Middle Eastern, Afghan",$11-30,"4040 W Montrose Ave Chicago, IL 60641",Albany Park,"{'Takes Reservations': 'No', 'Delivery': 'Yes'...",1,http://www.afghan-kabob.com,(773) 427-5041,...,Yes,Casual,Street,No,Quiet,No,90%,4-half,4040 W Montrose Ave,"Chicago, IL 60641"
1,taco-and-burrito-house-chicago,Taco & Burrito House,Mexican,Under $10,"1548 W Fullerton Ave Chicago, IL 60614","Lincoln Park, DePaul","{'Takes Reservations': 'No', 'Delivery': 'No',...",0,,(773) 665-8389,...,No,Casual,Street,No,Average,No,90%,3-half,1548 W Fullerton Ave,"Chicago, IL 60614"
2,meatball-hero-and-pasta-chicago,Meatball Hero & Pasta,"American (Traditional), Italian",Under $10,"3037 N Clark St Chicago, IL 60657",Lakeview,"{'Takes Reservations': 'No', 'Delivery': 'Yes'...",1,http://www.meatballheroes.com/,(773) 348-3037,...,No,Casual,Street,No,Average,Free,90%,4-half,3037 N Clark St,"Chicago, IL 60657"
3,sinha-elegant-cuisine-chicago,Sinha Elegant Cuisine,Brazilian,$11-30,"2018 W Adams Chicago, IL 60612",Near West Side,"{'Takes Reservations': 'Yes', 'Delivery': 'No'...",1,http://www.sinhaelegantcuisine.com,(312) 491-8200,...,No,Casual,"Street, Private Lot",No,Average,Free,90%,4-half,2018 W Adams,"Chicago, IL 60612"
4,bella-luna-bar-and-pizza-chicago,Bella Luna Bar & Pizza,"Italian, Pizza",$11-30,"731 N Dearborn Chicago, IL 60654",Near North Side,"{'Takes Reservations': 'Yes', 'Delivery': 'Yes...",1,http://www.bellalunachicago.com/,(312) 751-2552,...,Yes,Casual,Street,Full Bar,Quiet,Free,90%,3,731 N Dearborn,"Chicago, IL 60654"


In [106]:
df.to_csv('database.csv', index=False)

In [107]:
test_data = df[(df['yelp_id']=='rios-d-sudamerica-chicago') | (df['yelp_id']=='le-colonial-chicago')]

In [110]:
test_data.T

Unnamed: 0,426,537
yelp_id,le-colonial-chicago,rios-d-sudamerica-chicago
name,Le Colonial,Rio’s D’Sudamerica
category,"Vietnamese, French",Peruvian
price_range,$31-60,$11-30
address,"937 N Rush St Chicago, IL 60611","2010 W Armitage Ave Chicago, IL 60647"
neighborhood,Near North Side,Bucktown
info,"{'Takes Reservations': 'Yes', 'Delivery': 'No'...","{'Takes Reservations': 'Yes', 'Delivery': 'No'..."
Claimed?,1,0
website,http://www.lecolonialchicago.com,http://riosdesudamerica.com
phone,(312) 255-0088,(773) 276-0170


In [566]:
test_data.to_csv('test_data.csv')

In [516]:
test_data.columns

Index(['yelp_id', 'name', 'category', 'price_range', 'address', 'neighborhood',
       'info', 'Claimed?', 'website', 'phone', 'working_hours', 'ratings',
       'reviews', 'HasWebsite', 'first_review', 'last_review',
       'permanently_closed', '1_star', '2_star', '3_star', '4_star', '5_star',
       'AvgLast_compound', 'AvgLast_star', 'AvgLast_subjectivity',
       'Last_compound_coef', 'Last_compound_intrcpt', 'Last_star_coef',
       'Last_star_intrcpt', 'Last_subjectivity_coef',
       'Last_subjectivity_intrcpt', 'avg_reviews', 'avg_star', 'between',
       'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun', 'Mon1', 'Tue1', 'Wed1',
       'Thu1', 'Fri1', 'Sat1', 'Sun1', 'dollar_signs', 'Accepts Credit Cards',
       'Good for Groups', 'Good for Kids', 'Takes Reservations',
       'Outdoor Seating', 'Take-out', 'Delivery', 'Has TV', 'Attire',
       'Parking', 'Alcohol', 'Noise Level', 'Wi-Fi', 'probability',
       'ratings_notation', 'address1', 'address2'],
      dtype='object')

In [523]:
test_data.T

Unnamed: 0,73,12
yelp_id,le-colonial-chicago,rios-d-sudamerica-chicago
name,Le Colonial,Rio’s D’Sudamerica
category,"Vietnamese, French",Peruvian
price_range,$31-60,$11-30
address,"937 N Rush St Chicago, IL 60611","2010 W Armitage Ave Chicago, IL 60647"
neighborhood,Near North Side,Bucktown
info,"{'Takes Reservations': 'Yes', 'Delivery': 'No'...","{'Takes Reservations': 'Yes', 'Delivery': 'No'..."
Claimed?,1,0
website,http://www.lecolonialchicago.com,http://riosdesudamerica.com
phone,(312) 255-0088,(773) 276-0170


In [134]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2 entries, 426 to 537
Data columns (total 66 columns):
yelp_id                      2 non-null object
name                         2 non-null object
category                     2 non-null object
price_range                  2 non-null object
address                      2 non-null object
neighborhood                 2 non-null object
info                         2 non-null object
Claimed?                     2 non-null int64
website                      2 non-null object
phone                        2 non-null object
working_hours                2 non-null object
ratings                      2 non-null float64
reviews                      2 non-null float64
HasWebsite                   2 non-null int64
first_review                 2 non-null object
last_review                  2 non-null object
permanently_closed           2 non-null int64
1_star                       2 non-null int64
2_star                       2 non-null int64
3_sta

# Creat lists of dummy variables for category, neighborhood, and price range

In [111]:
def GetLabels(df=None, column='category'):
    ensumble = []
    for line in df[column]:
        if type(line) != float:
            labels = line.split(',')
            for l in labels:
                l = l.strip()
                if l not in ensumble:
                    ensumble.append(l)
    return ensumble

In [112]:
df_block13_open26 = pd.read_csv('df_block13_open26.csv')

In [113]:
df_block13_open26.columns

Index(['yelp_id', 'name', 'category', 'price_range', 'address', 'neighborhood',
       'info', 'Claimed?', 'website', 'phone', 'working_hours', 'ratings',
       'reviews', 'HasWebsite', 'first_review', 'last_review',
       'permanently_closed', '1_star', '2_star', '3_star', '4_star', '5_star',
       'AvgLast_compound', 'AvgLast_star', 'AvgLast_subjectivity',
       'Last_compound_coef', 'Last_compound_intrcpt', 'Last_star_coef',
       'Last_star_intrcpt', 'Last_subjectivity_coef',
       'Last_subjectivity_intrcpt', 'avg_reviews', 'avg_star'],
      dtype='object')

In [114]:
df_block13_open26.head()

Unnamed: 0,yelp_id,name,category,price_range,address,neighborhood,info,Claimed?,website,phone,...,AvgLast_star,AvgLast_subjectivity,Last_compound_coef,Last_compound_intrcpt,Last_star_coef,Last_star_intrcpt,Last_subjectivity_coef,Last_subjectivity_intrcpt,avg_reviews,avg_star
0,afghan-kabob-chicago,Afghan Kabob,"Middle Eastern,Afghan",$11-30,"4040 W Montrose Ave Chicago, IL 60641 b/t Pula...",Albany Park,"[{'Takes Reservations': 'No'}, {'Delivery': 'Y...",1,http://www.afghan-kabob.com,(773) 427-5041,...,4.0,0.602101,0.018577,0.030445,0.031406,3.322532,-0.004437,0.697803,21.454545,4.394068
1,taco-and-burrito-house-chicago,Taco & Burrito House,Mexican,Under $10,"1548 W Fullerton Ave Chicago, IL 60614 b/t Bos...","Lincoln Park, DePaul","[{'Takes Reservations': 'No'}, {'Delivery': 'N...",0,,(773) 665-8389,...,4.0,0.534943,0.004739,0.090441,0.017407,2.807598,0.000517,0.499548,7.0,3.369048
2,meatball-hero-and-pasta-chicago,Meatball Hero & Pasta,"American (Traditional),Italian",Under $10,"3037 N Clark St Chicago, IL 60657 b/t Barry Av...",Lakeview,"[{'Takes Reservations': 'No'}, {'Delivery': 'Y...",1,http://www.meatballheroes.com/,(773) 348-3037,...,4.625,0.610423,0.002643,0.615613,0.005913,4.222562,-8.2e-05,0.616036,16.5,4.575758
3,sinha-elegant-cuisine-chicago,Sinha Elegant Cuisine,Brazilian,$11-30,"2018 W Adams Chicago, IL 60612 b/t Damen Ave &...",Near West Side,"[{'Takes Reservations': 'Yes'}, {'Delivery': '...",1,http://www.sinhaelegantcuisine.com,(312) 491-8200,...,4.454545,0.665821,0.00041,0.909065,-0.000515,4.493152,0.000481,0.629725,10.0,4.258333
4,bella-luna-bar-and-pizza-chicago,Bella Luna Bar & Pizza,"Italian,Pizza",$11-30,"731 N Dearborn Chicago, IL 60654 Near North Side",Near North Side,"[{'Takes Reservations': 'Yes'}, {'Delivery': '...",1,http://www.bellalunachicago.com/,(312) 751-2552,...,3.666667,0.692331,-0.002152,0.869853,-0.02133,5.010469,-0.001287,0.773386,8.230769,2.990654


In [115]:
df_block13_open26_train = df_block13_open26.iloc[:804,:]
df_block13_open26_test = df_block13_open26.iloc[804:,:]
len(df_block13_open26_test)

268

In [116]:
df_block13_open26_train['permanently_closed'].value_counts()

0    588
1    216
Name: permanently_closed, dtype: int64

In [117]:
df_block13_open26_test['permanently_closed'].value_counts()

0    201
1     67
Name: permanently_closed, dtype: int64

In [118]:
categories = GetLabels(df=df_block13_open26_train, column='category')
categories

['Middle Eastern',
 'Afghan',
 'Mexican',
 'American (Traditional)',
 'Italian',
 'Brazilian',
 'Pizza',
 'Delis',
 'Sandwiches',
 'Restaurants',
 'Jazz & Blues',
 'Lounges',
 'Greek',
 'Mediterranean',
 'Southern',
 'Breakfast & Brunch',
 'American (New)',
 'Bars',
 'Cuban',
 'Burgers',
 'Seafood',
 'Steakhouses',
 'Hot Dogs',
 'Comfort Food',
 'French',
 'Basque',
 'Sushi Bars',
 'Japanese',
 'Pakistani',
 'Halal',
 'Indian',
 'Chicken Wings',
 'Latin American',
 'Music Venues',
 'Irish',
 'Pubs',
 'Barbeque',
 'Chinese',
 'Cafes',
 'Irish Pub',
 'Vegetarian',
 'Vegan',
 'Sports Bars',
 'Tapas/Small Plates',
 'Diners',
 'Wine Bars',
 'Thai',
 'Pasta Shops',
 'Coffee & Tea',
 'Asian Fusion',
 'Ice Cream & Frozen Yogurt',
 'Fondue',
 'Salad',
 'Gluten-Free',
 'Ramen',
 'Puerto Rican',
 'Desserts',
 'German',
 'Food Stands',
 'Bakeries',
 'Noodles',
 'British',
 'Creperies',
 'Grocery',
 'Caterers',
 'Persian/Iranian',
 'Filipino',
 'Bubble Tea',
 'Venues & Event Spaces',
 'Korean',
 'C

In [119]:
neighborhoods = GetLabels(df=df_block13_open26_train, column='neighborhood')
neighborhoods

['Albany Park',
 'Lincoln Park',
 'DePaul',
 'Lakeview',
 'Near West Side',
 'Near North Side',
 'University Village',
 'Wicker Park',
 'West Town',
 'Uptown',
 'West Loop',
 'Greektown',
 'The Loop',
 'West Lawn',
 'Bucktown',
 'Logan Square',
 'Ravenswood',
 'Lincoln Square',
 'North Center',
 'Old Town',
 'Hermosa',
 'River North',
 'Ukrainian Village',
 'Garfield Ridge',
 'Irving Park',
 'River East',
 'Streeterville',
 'Humboldt Park',
 'Roscoe Village',
 'Wrigleyville',
 'South Loop',
 'Noble Square',
 'Edgewater',
 'Rogers Park',
 'West Rogers Park',
 'Cragin',
 'River West',
 'Andersonville',
 "Printer's Row",
 'Gold Coast',
 'Near Southside',
 'Portage Park',
 'Avondale',
 'Chinatown',
 'Forest Glen',
 'Bridgeport',
 'Fulton Market',
 'Belmont Central',
 'Pilsen',
 'Little Village',
 'East Garfield Park',
 'Edison Park']

In [120]:
price_ranges = GetLabels(df=df_block13_open26_train, column='price_range')
price_ranges

['$11-30',
 'Under $10',
 'Moderate',
 '$31-60',
 'Above $61',
 'Inexpensive',
 'Pricey']

Drop 'Restaurant', 'Pilsen', and 'Pricey' for category, neighborhood, and price range, respectively

In [121]:
#categories.remove('Restaurants')
#neighborhoods.remove('Pilsen')
#price_ranges.remove('Pricey')

# Check values for 'Attire', 'Parking', 'Alcohol', 'Noise Level', and 'Wi-Fi' in info:

In [122]:
def ColumnParser(info, text=False):
    '''Parse info column to a dictionary'''
    import re
    Dict = {}
    if len(info) > 2:
        List = re.findall(r"\{(.*?)\}", info)        
        for item in List:  
            if text:
                key = re.findall(r"\'(.*?)\'", item)[0]
                value = re.findall(r"\'(.*?)\'", item)[1]
            else:
                key = item.split(': ')[0]
                value = item.split(': ')[1]
            Dict[key] = value
    return Dict    

In [123]:
df = df_block13_open26_train.copy()
df['info'] = df['info'].apply(lambda x: ColumnParser(x, text=True))

In [124]:
def GetValues(df=df_block13_open26_train, feature='Attire'):
    ensumble = []
    for item in list(df['info']):
        if feature in item.keys():
            if ', ' in item[feature]:
                for i in item[feature].split(', '):
                    ensumble.append(i)
            else:
                ensumble.append(item[feature])
    return list(set(ensumble))

In [125]:
for f in ['Attire', 'Parking', 'Alcohol', 'Noise Level', 'Wi-Fi']:
    print(f+': ', GetValues(df=df, feature=f))

Attire:  ['Casual', 'Dressy', 'Formal (Jacket Required)']
Parking:  ['Validated', 'Garage', 'Valet', 'Street', 'Private Lot']
Alcohol:  ['Beer & Wine Only', 'No', 'Full Bar']
Noise Level:  ['Quiet', 'Loud', 'Very Loud', 'Average']
Wi-Fi:  ['Free', 'No', 'Paid']


In [126]:
binomial = ['Accepts Credit Cards', 'Good for Groups', 'Good for Kids', 
                'Takes Reservations', 'Outdoor Seating', 'Take-out',  'Delivery', 'Has TV']
polynomial = {'Attire': ['Dressy', 'Casual'],
              'Parking': ['Valet', 'Garage', 'Street', 'Validated'], 
              'Alcohol': ['Full Bar', 'No'], 
              'Noise Level': ['Loud', 'Quite', 'Average'],
              'Wi-Fi': ['No', 'Free']}

In [128]:
train = {}
test = {}
files = [f for f in listdir('./') if '.csv' in f and 'df_' in f]
for f in files:
    df = pd.read_csv(f)
    key = f.replace('.csv','')
    train[key] = df.iloc[:804,:]
    test[key] = df.iloc[804:,:]
    print(key)

df_block0_open26
df_block0_open52
df_block0_open78
df_block13_open26
df_block13_open52
df_block13_open78
df_block26_open26
df_block26_open52
df_block26_open78
df_block52_open26
df_block52_open52
df_block52_open78


In [129]:
train_copy = {}
test_copy = {}
for key, df in train.items():
    train_copy[key] = df.copy()
for key, df in test.items():
    test_copy[key] = df.copy()    

# Process each train/test pair to get dummy variables

In [130]:
droplist = ['address', 'website', 'phone', 'working_hours', 'ratings', 'reviews' ] 
#df.drop(['category', 'neighborhood', 'price_range', 'info'], inplace=True,axis=1)

In [131]:
for i, df in enumerate(list(train_copy.values()) + list(test_copy.values())):
    print(i)
    df.drop(droplist, inplace=True, axis=1)
    for key, columns in {'category': categories, 'neighborhood': neighborhoods, 'price_range': price_ranges}.items():
        for c in columns:
            df[c] = df[key].apply(lambda x: 1 if c in str(x) else 0)
    df['info'] = df['info'].apply(lambda x: ColumnParser(x, text=True))
    for b in binomial:
        df[b] = df['info'].apply(lambda x: 1 if x.get(b,'0') == 'Yes' else 0)
    for k, values in polynomial.items():
        for v in values:
            df[k+'_'+v] = df['info'].apply(lambda x: 1 if x.get(k, '0') == v else 0 )
    df.drop(['category', 'neighborhood', 'price_range', 'info'], inplace=True, axis=1)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23


# Write each train/test dataframe as csv files in ../part_03

In [132]:
for key, df in train_copy.items():
    df.to_csv('./train_'+key+'.csv', index=False)

In [133]:
for key, df in test_copy.items():
    df.to_csv('./test_'+key+'.csv', index=False)