# Load from canonical restaurant data

In [75]:
import pandas as pd
import numpy as np
from os import listdir
from os.path import isfile, join
#from sklearn.model_selection import train_test_split, StratifiedKFold
#from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.preprocessing import StandardScaler, Imputer, FunctionTransformer, LabelBinarizer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

In [42]:
files = [f for f in listdir('./') if '.csv' in f]
files

['test_df_block13_open26.csv',
 'test_df_block13_open52.csv',
 'test_df_block13_open78.csv',
 'test_df_block26_open26.csv',
 'test_df_block26_open52.csv',
 'test_df_block26_open78.csv',
 'test_df_block52_open26.csv',
 'test_df_block52_open52.csv',
 'test_df_block52_open78.csv',
 'train_df_block13_open26.csv',
 'train_df_block13_open52.csv',
 'train_df_block13_open78.csv',
 'train_df_block26_open26.csv',
 'train_df_block26_open52.csv',
 'train_df_block26_open78.csv',
 'train_df_block52_open26.csv',
 'train_df_block52_open52.csv',
 'train_df_block52_open78.csv']

In [43]:
train_df_block13_open26 = pd.read_csv('./train_df_block13_open26.csv')
test_df_block13_open26 = pd.read_csv('./test_df_block13_open26.csv')

In [44]:
for c in (train_df_block13_open26.columns):
    print(c)

id
name
Claimed?
HasWebsite
first_review
last_review
permanently_closed
1_star
2_star
3_star
4_star
5_star
AvgLast_compound
AvgLast_star
AvgLast_subjectivity
Last_compound_coef
Last_compound_intrcpt
Last_star_coef
Last_star_intrcpt
Last_subjectivity_coef
Last_subjectivity_intrcpt
avg_reviews
avg_star
French
Barbeque
Music Venues
Pizza
Italian
Middle Eastern
Vegetarian
Falafel
Bakeries
Breakfast & Brunch
Ukrainian
American (New)
Southern
Cocktail Bars
American (Traditional)
Sandwiches
Burgers
Bars
Coffee & Tea
Pubs
Hot Dogs
Thai
Japanese
Chinese
Sushi Bars
Mexican
Cuban
Indian
Pakistani
Ethiopian
Cafes
Juice Bars & Smoothies
Desserts
Irish
Latin American
Korean
Asian Fusion
Grocery
Venues & Event Spaces
Tapas Bars
Spanish
Tapas/Small Plates
Shanghainese
Cantonese
Wine Bars
Steakhouses
Beer Bar
Austrian
Salad
Argentine
Greek
British
Gastropubs
Modern European
Fast Food
Dance Clubs
Comfort Food
Himalayan/Nepalese
Soup
Diners
Lounges
Food Stands
Jazz & Blues
Noodles
Fondue
Mediterranean
Gl

In [45]:
unwanted = ['id', 'name', 'first_review', 'last_review', 'permanently_closed']
wanted = [i for i in train_df_block13_open26.columns if i not in unwanted]

In [46]:
X_train = train_df_block13_open26[wanted]
X_test = test_df_block13_open26[wanted]
y_train = train_df_block13_open26['permanently_closed']
y_test = test_df_block13_open26['permanently_closed']

In [49]:
X_train.shape

(864, 221)

In [58]:
for c in X_train.columns:
    print(c + ': ', X_train[c].isnull().sum())

Claimed?:  0
HasWebsite:  0
1_star:  0
2_star:  0
3_star:  0
4_star:  0
5_star:  0
AvgLast_compound:  68
AvgLast_star:  68
AvgLast_subjectivity:  68
Last_compound_coef:  0
Last_compound_intrcpt:  68
Last_star_coef:  0
Last_star_intrcpt:  68
Last_subjectivity_coef:  0
Last_subjectivity_intrcpt:  68
avg_reviews:  12
avg_star:  12
French:  0
Barbeque:  0
Music Venues:  0
Pizza:  0
Italian:  0
Middle Eastern:  0
Vegetarian:  0
Falafel:  0
Bakeries:  0
Breakfast & Brunch:  0
Ukrainian:  0
American (New):  0
Southern:  0
Cocktail Bars:  0
American (Traditional):  0
Sandwiches:  0
Burgers:  0
Bars:  0
Coffee & Tea:  0
Pubs:  0
Hot Dogs:  0
Thai:  0
Japanese:  0
Chinese:  0
Sushi Bars:  0
Mexican:  0
Cuban:  0
Indian:  0
Pakistani:  0
Ethiopian:  0
Cafes:  0
Juice Bars & Smoothies:  0
Desserts:  0
Irish:  0
Latin American:  0
Korean:  0
Asian Fusion:  0
Grocery:  0
Venues & Event Spaces:  0
Tapas Bars:  0
Spanish:  0
Tapas/Small Plates:  0
Shanghainese:  0
Cantonese:  0
Wine Bars:  0
Steakhous

In [61]:
imputer = Imputer(missing_values=np.nan, strategy='mean', axis=1)
ss = StandardScaler() 
pipe = make_pipeline(imputer, ss) 

In [63]:
Xtrain = pipe.fit_transform(X_train)
Xtest = pipe.transform(X_test)

In [88]:
list_of_classifiers = [
    #('logreg', LogisticRegression()),
    ('sgd_cls', SGDClassifier()),
    #('dt', DecisionTreeClassifier()),
    ('rf', RandomForestClassifier()),
    ('gb', GradientBoostingClassifier())
]
vclf = VotingClassifier(list_of_classifiers, voting='soft')
params = {
#    'logreg__C': [0.1, 1.0, 10.0],
#    'logreg__penalty': ['l1', 'l2'],
    'sgd_cls__loss':['log'], #, 'squared_loss'],
    'sgd_cls__penalty':['elasticnet'],
    'sgd_cls__alpha':np.logspace(-1,1,5),
    'sgd_cls__l1_ratio':[i/10.0 for i in range(6)],
    #'dt__max_depth': [3, 5, 7, 9],
    'rf__max_depth': [3, 5, 7],
    'rf__n_estimators': [100, 500],
    'gb__max_depth': [3, 5, 7],
    'gb__n_estimators': [100, 500],
}
gs = GridSearchCV(vclf, params, verbose=2)

In [None]:
gs.fit(Xtrain, y_train)

Fitting 3 folds for each of 1080 candidates, totalling 3240 fits
[CV] gb__max_depth=3, gb__n_estimators=100, rf__max_depth=3, rf__n_estimators=100, sgd_cls__alpha=0.1, sgd_cls__l1_ratio=0.0, sgd_cls__loss=log, sgd_cls__penalty=elasticnet 
[CV]  gb__max_depth=3, gb__n_estimators=100, rf__max_depth=3, rf__n_estimators=100, sgd_cls__alpha=0.1, sgd_cls__l1_ratio=0.0, sgd_cls__loss=log, sgd_cls__penalty=elasticnet, total=   0.5s
[CV] gb__max_depth=3, gb__n_estimators=100, rf__max_depth=3, rf__n_estimators=100, sgd_cls__alpha=0.1, sgd_cls__l1_ratio=0.0, sgd_cls__loss=log, sgd_cls__penalty=elasticnet 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s


[CV]  gb__max_depth=3, gb__n_estimators=100, rf__max_depth=3, rf__n_estimators=100, sgd_cls__alpha=0.1, sgd_cls__l1_ratio=0.0, sgd_cls__loss=log, sgd_cls__penalty=elasticnet, total=   0.5s
[CV] gb__max_depth=3, gb__n_estimators=100, rf__max_depth=3, rf__n_estimators=100, sgd_cls__alpha=0.1, sgd_cls__l1_ratio=0.0, sgd_cls__loss=log, sgd_cls__penalty=elasticnet 
[CV]  gb__max_depth=3, gb__n_estimators=100, rf__max_depth=3, rf__n_estimators=100, sgd_cls__alpha=0.1, sgd_cls__l1_ratio=0.0, sgd_cls__loss=log, sgd_cls__penalty=elasticnet, total=   0.5s
[CV] gb__max_depth=3, gb__n_estimators=100, rf__max_depth=3, rf__n_estimators=100, sgd_cls__alpha=0.1, sgd_cls__l1_ratio=0.1, sgd_cls__loss=log, sgd_cls__penalty=elasticnet 
[CV]  gb__max_depth=3, gb__n_estimators=100, rf__max_depth=3, rf__n_estimators=100, sgd_cls__alpha=0.1, sgd_cls__l1_ratio=0.1, sgd_cls__loss=log, sgd_cls__penalty=elasticnet, total=   0.5s
[CV] gb__max_depth=3, gb__n_estimators=100, rf__max_depth=3, rf__n_estimators=100, sg

[CV]  gb__max_depth=3, gb__n_estimators=100, rf__max_depth=3, rf__n_estimators=100, sgd_cls__alpha=0.316227766017, sgd_cls__l1_ratio=0.2, sgd_cls__loss=log, sgd_cls__penalty=elasticnet, total=   0.5s
[CV] gb__max_depth=3, gb__n_estimators=100, rf__max_depth=3, rf__n_estimators=100, sgd_cls__alpha=0.316227766017, sgd_cls__l1_ratio=0.2, sgd_cls__loss=log, sgd_cls__penalty=elasticnet 
[CV]  gb__max_depth=3, gb__n_estimators=100, rf__max_depth=3, rf__n_estimators=100, sgd_cls__alpha=0.316227766017, sgd_cls__l1_ratio=0.2, sgd_cls__loss=log, sgd_cls__penalty=elasticnet, total=   0.5s
[CV] gb__max_depth=3, gb__n_estimators=100, rf__max_depth=3, rf__n_estimators=100, sgd_cls__alpha=0.316227766017, sgd_cls__l1_ratio=0.2, sgd_cls__loss=log, sgd_cls__penalty=elasticnet 
[CV]  gb__max_depth=3, gb__n_estimators=100, rf__max_depth=3, rf__n_estimators=100, sgd_cls__alpha=0.316227766017, sgd_cls__l1_ratio=0.2, sgd_cls__loss=log, sgd_cls__penalty=elasticnet, total=   0.5s
[CV] gb__max_depth=3, gb__n_es

[CV]  gb__max_depth=3, gb__n_estimators=100, rf__max_depth=3, rf__n_estimators=100, sgd_cls__alpha=1.0, sgd_cls__l1_ratio=0.3, sgd_cls__loss=log, sgd_cls__penalty=elasticnet, total=   0.5s
[CV] gb__max_depth=3, gb__n_estimators=100, rf__max_depth=3, rf__n_estimators=100, sgd_cls__alpha=1.0, sgd_cls__l1_ratio=0.3, sgd_cls__loss=log, sgd_cls__penalty=elasticnet 
[CV]  gb__max_depth=3, gb__n_estimators=100, rf__max_depth=3, rf__n_estimators=100, sgd_cls__alpha=1.0, sgd_cls__l1_ratio=0.3, sgd_cls__loss=log, sgd_cls__penalty=elasticnet, total=   0.6s
[CV] gb__max_depth=3, gb__n_estimators=100, rf__max_depth=3, rf__n_estimators=100, sgd_cls__alpha=1.0, sgd_cls__l1_ratio=0.4, sgd_cls__loss=log, sgd_cls__penalty=elasticnet 
[CV]  gb__max_depth=3, gb__n_estimators=100, rf__max_depth=3, rf__n_estimators=100, sgd_cls__alpha=1.0, sgd_cls__l1_ratio=0.4, sgd_cls__loss=log, sgd_cls__penalty=elasticnet, total=   0.5s
[CV] gb__max_depth=3, gb__n_estimators=100, rf__max_depth=3, rf__n_estimators=100, sg

[CV]  gb__max_depth=3, gb__n_estimators=100, rf__max_depth=3, rf__n_estimators=100, sgd_cls__alpha=3.16227766017, sgd_cls__l1_ratio=0.4, sgd_cls__loss=log, sgd_cls__penalty=elasticnet, total=   0.6s
[CV] gb__max_depth=3, gb__n_estimators=100, rf__max_depth=3, rf__n_estimators=100, sgd_cls__alpha=3.16227766017, sgd_cls__l1_ratio=0.5, sgd_cls__loss=log, sgd_cls__penalty=elasticnet 
[CV]  gb__max_depth=3, gb__n_estimators=100, rf__max_depth=3, rf__n_estimators=100, sgd_cls__alpha=3.16227766017, sgd_cls__l1_ratio=0.5, sgd_cls__loss=log, sgd_cls__penalty=elasticnet, total=   0.6s
[CV] gb__max_depth=3, gb__n_estimators=100, rf__max_depth=3, rf__n_estimators=100, sgd_cls__alpha=3.16227766017, sgd_cls__l1_ratio=0.5, sgd_cls__loss=log, sgd_cls__penalty=elasticnet 
[CV]  gb__max_depth=3, gb__n_estimators=100, rf__max_depth=3, rf__n_estimators=100, sgd_cls__alpha=3.16227766017, sgd_cls__l1_ratio=0.5, sgd_cls__loss=log, sgd_cls__penalty=elasticnet, total=   0.6s
[CV] gb__max_depth=3, gb__n_estimat

[CV]  gb__max_depth=3, gb__n_estimators=100, rf__max_depth=3, rf__n_estimators=500, sgd_cls__alpha=0.1, sgd_cls__l1_ratio=0.0, sgd_cls__loss=log, sgd_cls__penalty=elasticnet, total=   1.1s
[CV] gb__max_depth=3, gb__n_estimators=100, rf__max_depth=3, rf__n_estimators=500, sgd_cls__alpha=0.1, sgd_cls__l1_ratio=0.0, sgd_cls__loss=log, sgd_cls__penalty=elasticnet 
[CV]  gb__max_depth=3, gb__n_estimators=100, rf__max_depth=3, rf__n_estimators=500, sgd_cls__alpha=0.1, sgd_cls__l1_ratio=0.0, sgd_cls__loss=log, sgd_cls__penalty=elasticnet, total=   1.1s
[CV] gb__max_depth=3, gb__n_estimators=100, rf__max_depth=3, rf__n_estimators=500, sgd_cls__alpha=0.1, sgd_cls__l1_ratio=0.1, sgd_cls__loss=log, sgd_cls__penalty=elasticnet 
[CV]  gb__max_depth=3, gb__n_estimators=100, rf__max_depth=3, rf__n_estimators=500, sgd_cls__alpha=0.1, sgd_cls__l1_ratio=0.1, sgd_cls__loss=log, sgd_cls__penalty=elasticnet, total=   1.2s
[CV] gb__max_depth=3, gb__n_estimators=100, rf__max_depth=3, rf__n_estimators=500, sg

In [72]:
gs.score(Xtest, y_test)

0.91666666666666663

In [40]:
X_train['Claimed?'].value_counts()

1    715
0    151
Name: Claimed?, dtype: int64

In [33]:
columns

['Claimed?',
 'HasWebsite',
 '1_star',
 '2_star',
 '3_star',
 '4_star',
 '5_star',
 'AvgLast_compound',
 'AvgLast_star',
 'AvgLast_subjectivity',
 'Last_compound_coef',
 'Last_compound_intrcpt',
 'Last_star_coef',
 'Last_star_intrcpt',
 'Last_subjectivity_coef',
 'Last_subjectivity_intrcpt',
 'avg_reviews',
 'avg_star',
 'Sports Bars',
 'American (Traditional)',
 'Diners',
 'Breakfast & Brunch',
 'Gastropubs',
 'American (New)',
 'Jazz & Blues',
 'Bars',
 'Modern European',
 'Mexican',
 'Pizza',
 'Vietnamese',
 'French',
 'Vegan',
 'Vegetarian',
 'Japanese',
 'Sushi Bars',
 'Coffee & Tea',
 'Desserts',
 'Tapas/Small Plates',
 'Seafood',
 'Steakhouses',
 'Greek',
 'Mediterranean',
 'Barbeque',
 'Sandwiches',
 'Pubs',
 'Chicken Wings',
 'Burgers',
 'Hot Dogs',
 'Ice Cream & Frozen Yogurt',
 'Italian',
 'Korean',
 'Wine Bars',
 'Bakeries',
 'Grocery',
 'Cuban',
 'Gluten-Free',
 'Salad',
 'Ramen',
 'Moroccan',
 'Thai',
 'Pop-up Shops',
 'Spanish',
 'Tapas Bars',
 'Venues & Event Spaces',
 

In [15]:
train_df_block13_open26[train_df_block13_open26['Adult Entertainment'] == 1]

Unnamed: 0,id,name,Claimed?,HasWebsite,first_review,last_review,permanently_closed,1_star,2_star,3_star,...,Parking_Garage,Parking_Street,Parking_Validated,Alcohol_Full Bar,Alcohol_No,Noise Level_Loud,Noise Level_Quite,Noise Level_Average,Wi-Fi_No,Wi-Fi_Free
94,31_crabbby-kims,Crabbby Kim’s,0,0,2006-11-20 00:00:00,2015-01-26 00:00:00,1,1,5,10,...,0,1,0,1,0,0,0,0,0,0


In [541]:
df = pd.read_csv('CanonicalRestaurants.csv')

In [542]:
df.columns

Index(['address', 'category', 'claimed_status', 'compound', 'date',
       'first_review', 'health_rating', 'id', 'info', 'last_review',
       'latitude', 'longitude', 'name', 'negative', 'neighborhood', 'neutral',
       'permanently_closed', 'phone', 'positive', 'price_range', 'ratings',
       'ratings_histogram', 'reviews', 'star', 'subjectivity', 'url',
       'website', 'working_hours'],
      dtype='object')

In [543]:
df.shape

(484879, 28)

In [544]:
df['Claimed?'] = df['claimed_status'].apply(lambda x: 1 if str(x) == 'Claimed' else 0)
df['HasWebsite'] = df['website'].apply(lambda x: 1 if 'http' in str(x) else 0)

In [546]:
droplist = ['working_hours', 'health_rating', 'phone', 'url', 'claimed_status', 
            'website', 'address', 'longitude', 'latitude']
df.drop(droplist, inplace=True, axis=1)

In [547]:
df = df[(df['last_review'] != 'MISSING') & (df['first_review'] != 'MISSING')].copy()
df.shape

(484838, 21)

In [548]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 484838 entries, 0 to 484878
Data columns (total 21 columns):
category              484544 non-null object
compound              484838 non-null float64
date                  484838 non-null object
first_review          484838 non-null object
id                    484838 non-null object
info                  484838 non-null object
last_review           484838 non-null object
name                  484838 non-null object
negative              484838 non-null float64
neighborhood          483001 non-null object
neutral               484838 non-null float64
permanently_closed    484838 non-null int64
positive              484838 non-null float64
price_range           483735 non-null object
ratings               484838 non-null float64
ratings_histogram     484838 non-null object
reviews               484838 non-null float64
star                  484838 non-null float64
subjectivity          484838 non-null float64
Claimed?              48483

In [549]:
df['date'] =  pd.to_datetime(df['date'])
df['last_review'] =  pd.to_datetime(df['last_review'])
df['first_review'] =  pd.to_datetime(df['first_review'])

# Remove closed restaurants that have last review earlier than 2012/01/01

In [550]:
import datetime
cut_day = datetime.date(2012, 1, 1)

In [551]:
mask = (df['permanently_closed'] == 1) & (df['last_review'] < cut_day)
cut_df = df[~mask].copy()

In [552]:
cut_df.head()

Unnamed: 0,category,compound,date,first_review,id,info,last_review,name,negative,neighborhood,...,permanently_closed,positive,price_range,ratings,ratings_histogram,reviews,star,subjectivity,Claimed?,HasWebsite
0,"Breakfast & Brunch,American (Traditional)",0.9655,2015-02-02,2011-10-08,0_2-sparrows,"[{'Takes Reservations': 'No'}, {'Delivery': 'N...",2015-02-02,2 Sparrows,0.104,Lincoln Park,...,1,0.295,$11-30,3.0,"[{5: 63}, {4: 94}, {3: 67}, {2: 78}, {1: 34}]",336.0,3.0,0.57619,1,1
1,"Breakfast & Brunch,American (Traditional)",0.0926,2014-12-04,2011-10-08,0_2-sparrows,"[{'Takes Reservations': 'No'}, {'Delivery': 'N...",2015-02-02,2 Sparrows,0.131,Lincoln Park,...,1,0.111,$11-30,3.0,"[{5: 63}, {4: 94}, {3: 67}, {2: 78}, {1: 34}]",336.0,5.0,0.4,1,1
2,"Breakfast & Brunch,American (Traditional)",-0.6532,2014-10-07,2011-10-08,0_2-sparrows,"[{'Takes Reservations': 'No'}, {'Delivery': 'N...",2015-02-02,2 Sparrows,0.132,Lincoln Park,...,1,0.064,$11-30,3.0,"[{5: 63}, {4: 94}, {3: 67}, {2: 78}, {1: 34}]",336.0,3.0,0.629167,1,1
3,"Breakfast & Brunch,American (Traditional)",0.9852,2014-09-25,2011-10-08,0_2-sparrows,"[{'Takes Reservations': 'No'}, {'Delivery': 'N...",2015-02-02,2 Sparrows,0.057,Lincoln Park,...,1,0.144,$11-30,3.0,"[{5: 63}, {4: 94}, {3: 67}, {2: 78}, {1: 34}]",336.0,4.0,0.520476,1,1
4,"Breakfast & Brunch,American (Traditional)",0.973,2014-09-23,2011-10-08,0_2-sparrows,"[{'Takes Reservations': 'No'}, {'Delivery': 'N...",2015-02-02,2 Sparrows,0.067,Lincoln Park,...,1,0.189,$11-30,3.0,"[{5: 63}, {4: 94}, {3: 67}, {2: 78}, {1: 34}]",336.0,3.0,0.597,1,1


In [553]:
cut_df.shape

(476812, 21)

# Collapse the cononical data to one restaurant per row to do train/test split

In [554]:
id_closed = cut_df[['id', 'permanently_closed']].copy()
id_closed.shape

(476812, 2)

In [555]:
id_closed.drop_duplicates(inplace=True)
id_closed.shape

(1155, 2)

# Baseline

In [556]:
id_closed['permanently_closed'].value_counts()

0    852
1    303
Name: permanently_closed, dtype: int64

In [557]:
id_closed['id'].value_counts()

76_the-brass-monkey                              1
701_billy-goat-inn-chicago                       1
61_letizias-fiore-ristorante-and-wine-shoppe     1
318_leos-lunchroom                               1
107_shiso                                        1
1_hugos-frog-bar-and-fish-house-chicago          1
718_bretts-kitchen-chicago                       1
1028_primehouse-chicago-2                        1
97_ditkas-restaurant-chicago                     1
294_marketplace-cafe-chicago-4                   1
1230_kikis-bistro-chicago                        1
541_west-town-tavern                             1
148_birchwood-kitchen                            1
108_caliterra                                    1
252_agami-chicago                                1
277_local-option-chicago                         1
370_rios-d-sudamerica-chicago                    1
553_zed451                                       1
260_in-fine-spirits-lounge                       1
287_wow-bao-chicago-8          

# Split data into train/test by restaurant id

In [558]:
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
id_train, id_test, _, _ = train_test_split(id_closed[['id']], id_closed['permanently_closed'])

In [559]:
id_train_list = list(id_train['id'])
id_test_list = list(id_test['id'])

# Functions to extract NLP data with varying block weeks and open weeks

In [560]:
def GetReviewSummary(df=None, blockweeks=52, openweeks=52, NLPsummary=['star']):
    from sklearn.linear_model import LinearRegression
    from datetime import timedelta
    Dict = {}
    df = df.copy()
    df['date'] =  pd.to_datetime(df['date'])
    df['year'] = df['date'].apply(lambda x: x.year)
    blck = timedelta(weeks=blockweeks)
    blocktime = df['date'].iloc[0] - blck
    new_df = df[df['date'] <= blocktime].copy()
    Dict['avg_reviews'] = [float(new_df.shape[0])/(new_df['year'].max() - new_df['year'].min() + 1)]    
    Dict['avg_star'] = new_df['star'].mean()
    Dict['5_star'] = new_df[new_df['star'] == 5.0].shape[0]
    Dict['4_star'] = new_df[new_df['star'] == 4.0].shape[0]
    Dict['3_star'] = new_df[new_df['star'] == 3.0].shape[0]
    Dict['2_star'] = new_df[new_df['star'] == 2.0].shape[0]
    Dict['1_star'] = new_df[new_df['star'] == 1.0].shape[0]
    if NLPsummary != None:            
        opn = timedelta(weeks=openweeks)        
        opentime = blocktime - opn
        open_df = new_df[(new_df['date'] >= opentime)].copy()
        open_df['days'] = open_df['date'].apply(lambda x: (x - list(open_df['date'])[-1]).days)
        for item in NLPsummary:
            Dict['AvgLast_'+item] = [open_df[item].mean()]
            if open_df.shape[0] >= 2:             
                lr = LinearRegression()
                lr.fit(open_df[['days']],open_df[item])
                Dict['Last_'+item+'_intrcpt'] = [lr.intercept_]
                Dict['Last_'+item+'_coef'] = [lr.coef_[0]]
            else:
                Dict['Last_'+item+'_intrcpt'] = [open_df[item].mean()]
                Dict['Last_'+item+'_coef'] = [0.0] 
    return Dict

In [561]:
def GetEachSummary(df=None, idname=None, blockweeks=52, openweeks=52, NLPsummary=['star','compound','subjectivity']):
    subset = df[df['id']==idname]
    row = pd.DataFrame(subset.iloc[0,:]).transpose()
    InfoList = ['id', 'name', 'category', 'price_range', 'neighborhood', 'info', 'Claimed?', 
                'HasWebsite', 'first_review', 'last_review', 'permanently_closed']
    Info = row[InfoList]
    ReviewList = ['date', 'star', 'compound', 'neutral', 'positive', 'negative', 'subjectivity']
    Review = subset[ReviewList]
    Summary = GetReviewSummary(df=Review, blockweeks=blockweeks, openweeks=openweeks, NLPsummary=NLPsummary)
    Sum_df = pd.DataFrame(Summary, index=[Info.index[0]])
    each = Info.join(Sum_df)
    return each

In [562]:
def GetSummarydf(df=None, idlist=None, blockweeks=52, openweeks=52, NLPsummary=['star','compound','subjectivity']):
    Summary_df = GetEachSummary(df=df, idname=idlist[0], 
                                blockweeks=blockweeks, openweeks=openweeks, NLPsummary=NLPsummary)
    for i, idname in enumerate(idlist[1:]):
        new = GetEachSummary(df=df, idname=idname, 
                             blockweeks=blockweeks, openweeks=openweeks, NLPsummary=NLPsummary)
        Summary_df = Summary_df.append(new, ignore_index=True)     
    return Summary_df

# Create dictionaries to store train/test dataframes of different block/open weeks combinations
- block weeks: 13, 26, 52 weeks
- open weeks: 26, 52, 78 weeks
- 9 combinations in total

In [563]:
blck = [13, 26, 52]
opn = [26, 52, 78]
train = {} 
test = {}
for b in blck:
    for o in opn:
        key = 'df_block' + str(b) + '_open' + str(o)
        print(key)
        train[key] = GetSummarydf(df=cut_df, idlist=id_train_list, 
                                  blockweeks=b, openweeks=o, NLPsummary=['star','compound','subjectivity'])
        test[key] = GetSummarydf(df=cut_df, idlist=id_test_list, 
                                 blockweeks=b, openweeks=o, NLPsummary=['star','compound','subjectivity'])

df_block13_open26
df_block13_open52
df_block13_open78
df_block26_open26
df_block26_open52
df_block26_open78
df_block52_open26
df_block52_open52
df_block52_open78


# Creat lists of dummy variables for category, neighborhood, and price range

In [565]:
def GetLabels(df=train['df_block13_open26'], column='category'):
    ensumble = []
    for line in df[column]:
        if type(line) != float:
            labels = line.split(',')
            for l in labels:
                l = l.strip()
                if l not in ensumble:
                    ensumble.append(l)
    return ensumble

In [566]:
categories = GetLabels(df=train['df_block13_open26'], column='category')
categories

['Sports Bars',
 'American (Traditional)',
 'Diners',
 'Breakfast & Brunch',
 'Gastropubs',
 'American (New)',
 'Jazz & Blues',
 'Bars',
 'Modern European',
 'Mexican',
 'Pizza',
 'Vietnamese',
 'French',
 'Vegan',
 'Vegetarian',
 'Japanese',
 'Sushi Bars',
 'Coffee & Tea',
 'Desserts',
 'Tapas/Small Plates',
 'Seafood',
 'Steakhouses',
 'Greek',
 'Mediterranean',
 'Barbeque',
 'Sandwiches',
 'Pubs',
 'Chicken Wings',
 'Restaurants',
 'Burgers',
 'Hot Dogs',
 'Ice Cream & Frozen Yogurt',
 'Italian',
 'Korean',
 'Wine Bars',
 'Bakeries',
 'Grocery',
 'Cuban',
 'Gluten-Free',
 'Salad',
 'Ramen',
 'Moroccan',
 'Thai',
 'Pop-up Shops',
 'Spanish',
 'Tapas Bars',
 'Venues & Event Spaces',
 'Lounges',
 'Comfort Food',
 'Cocktail Bars',
 'Fast Food',
 'Food Stands',
 'Argentine',
 'Community Service/Non-Profit',
 'Caterers',
 'British',
 'Chinese',
 'Adult Entertainment',
 'Beer Bar',
 'Tex-Mex',
 'Breweries',
 'Dance Clubs',
 'Caribbean',
 'Indian',
 'Latin American',
 'Irish',
 'Irish Pub',

In [567]:
neighborhoods = GetLabels(df=train['df_block13_open26'], column='neighborhood')
neighborhoods

['Lakeview',
 'River North',
 'Near North Side',
 'Bucktown',
 'Irving Park',
 'Andersonville',
 'Edgewater',
 'West Town',
 'Wicker Park',
 'Ravenswood',
 'Uptown',
 'Lincoln Park',
 'DePaul',
 'Near West Side',
 'West Loop',
 'The Loop',
 'West Rogers Park',
 'Chinatown',
 'Avondale',
 'North Center',
 'Wrigleyville',
 'Pilsen',
 'Greektown',
 "Printer's Row",
 'South Loop',
 'Gold Coast',
 'University Village',
 'Roscoe Village',
 'Cragin',
 'Bridgeport',
 'Streeterville',
 'River East',
 'East Garfield Park',
 'Fulton Market',
 'Humboldt Park',
 'Noble Square',
 'Lincoln Square',
 'Near Southside',
 'Logan Square',
 'Old Town',
 'Ukrainian Village',
 'Portage Park',
 'Hermosa',
 'Belmont Central',
 'Forest Glen',
 'River West',
 'Albany Park',
 'Rogers Park',
 'Edison Park',
 'Little Village',
 'Garfield Ridge',
 'West Lawn',
 'Goose Island',
 'North Park']

In [568]:
price_ranges = GetLabels(df=train['df_block13_open26'], column='price_range')
price_ranges

['$11-30',
 'Under $10',
 '$31-60',
 'Moderate',
 'Above $61',
 'Inexpensive',
 'Pricey']

Drop 'Restaurant', 'Pilsen', and 'Pricey' for category, neighborhood, and price range, respectively

In [569]:
categories.remove('Restaurants')
neighborhoods.remove('Pilsen')
price_ranges.remove('Pricey')

# Check values for 'Attire', 'Parking', 'Alcohol', 'Noise Level', and 'Wi-Fi' in info:

In [571]:
def ColumnParser(info, text=False):
    '''Parse info column to a dictionary'''
    import re
    Dict = {}
    if len(info) > 2:
        List = re.findall(r"\{(.*?)\}", info)        
        for item in List:  
            if text:
                key = re.findall(r"\'(.*?)\'", item)[0]
                value = re.findall(r"\'(.*?)\'", item)[1]
            else:
                key = item.split(': ')[0]
                value = item.split(': ')[1]
            Dict[key] = value
    return Dict    

In [572]:
df = train['df_block13_open26'].copy()
df['info'] = df['info'].apply(lambda x: ColumnParser(x, text=True))

In [581]:
def GetValues(df=train['df_block13_open26'], feature='Attire'):
    ensumble = []
    for item in list(df['info']):
        if feature in item.keys():
            if ', ' in item[feature]:
                for i in item[feature].split(', '):
                    ensumble.append(i)
            else:
                ensumble.append(item[feature])
    return list(set(ensumble))

In [582]:
for f in ['Attire', 'Parking', 'Alcohol', 'Noise Level', 'Wi-Fi']:
    print(f+': ', GetValues(df=df, feature=f))

Attire:  ['Dressy', 'Formal (Jacket Required)', 'Casual']
Parking:  ['Garage', 'Street', 'Valet', 'Validated', 'Private Lot']
Alcohol:  ['Full Bar', 'No', 'Beer & Wine Only']
Noise Level:  ['Quiet', 'Very Loud', 'Loud', 'Average']
Wi-Fi:  ['No', 'Paid', 'Free']


In [583]:
binomial = ['Accepts Credit Cards', 'Good for Groups', 'Good for Kids', 
                'Takes Reservations', 'Outdoor Seating', 'Take-out',  'Delivery', 'Has TV']
polynomial = {'Attire': ['Dressy', 'Casual'],
              'Parking': ['Valet', 'Garage', 'Street', 'Validated'], 
              'Alcohol': ['Full Bar', 'No'], 
              'Noise Level': ['Loud', 'Quite', 'Average'],
              'Wi-Fi': ['No', 'Free']}

In [584]:
train_copy = {}
test_copy = {}
for key, df in train.items():
    train_copy[key] = df.copy()
for key, df in test.items():
    test_copy[key] = df.copy()    

In [527]:
train_copy['df_block13_open26']['info'][0]

"[{'Takes Reservations': 'Yes'}, {'Delivery': 'No'}, {'Take-out': 'No'}, {'Accepts Credit Cards': 'Yes'}, {'Accepts Android Pay': 'No'}, {'Parking': 'Valet'}, {'Bike Parking': 'Yes'}, {'Good for Kids': 'No'}, {'Good for Groups': 'Yes'}, {'Attire': 'Dressy'}, {'Ambience': 'Classy'}, {'Noise Level': 'Average'}, {'Alcohol': 'Full Bar'}, {'Outdoor Seating': 'Yes'}, {'Wi-Fi': 'Free'}, {'Has TV': 'No'}, {'Caters': 'No'}]"

# Process each train/test pair to get dummy variables

In [585]:
for i, df in enumerate(list(train_copy.values()) + list(test_copy.values())):
    print(i)
    for key, columns in {'category': categories, 'neighborhood': neighborhoods, 'price_range': price_ranges}.items():
        for c in columns:
            df[c] = df[key].apply(lambda x: 1 if c in str(x) else 0)
    df['info'] = df['info'].apply(lambda x: ColumnParser(x, text=True))
    for b in binomial:
        df[b] = df['info'].apply(lambda x: 1 if x.get(b,'0') == 'Yes' else 0)
    for k, values in polynomial.items():
        for v in values:
            df[k+'_'+v] = df['info'].apply(lambda x: 1 if x.get(k, '0') == v else 0 )
    df.drop(['category', 'neighborhood', 'price_range', 'info'], inplace=True,axis=1)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17


# Write each train/test dataframe as csv files in ../part_03

In [586]:
for key, df in train_copy.items():
    df.to_csv('../part_03/train_'+key+'.csv', index=False)

In [587]:
for key, df in test_copy.items():
    df.to_csv('../part_03/test_'+key+'.csv', index=False)

In [None]:
for c in categories:
    #print(c)
    df[c] = df['category'].apply(lambda x: 1 if c in str(x) else 0)

In [190]:
ClosedRestaurants.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 304 entries, 333 to 226
Data columns (total 21 columns):
address               304 non-null object
category              302 non-null object
claimed_status        246 non-null object
first_review          304 non-null object
health_rating         0 non-null float64
info                  304 non-null object
last_review           304 non-null object
latitude              304 non-null float64
longitude             304 non-null float64
name                  304 non-null object
neighborhood          302 non-null object
permanently_closed    304 non-null int64
phone                 296 non-null object
price_range           296 non-null object
ratings               304 non-null float64
ratings_histogram     304 non-null object
reviews               304 non-null float64
url                   250 non-null object
website               238 non-null object
working_hours         304 non-null object
Claimed?              304 non-null int64
dtypes: fl

In [187]:
ClosedRestaurants['claimed_status'].value_counts()

Claimed    246
Name: claimed_status, dtype: int64

In [498]:
df['price_range'].value_counts()

$11-30            242
$31-60            127
Under $10          34
Above $61          17
Moderate           14
Inexpensive         7
Pricey              5
Ultra High-End      2
Name: price_range, dtype: int64

In [462]:
def ColumnParser(info, text=False):
    Dict = {}
    if len(info) > 2:
        List = re.findall(r"\{(.*?)\}", info)        
        for item in List:  
            if text:
                key = re.findall(r"\'(.*?)\'", item)[0]
                value = re.findall(r"\'(.*?)\'", item)[1]
            else:
                key = item.split(': ')[0]
                value = item.split(': ')[1]
            Dict[key] = value
    return Dict    

In [465]:
df['info'] = df['info'].apply(lambda x: ColumnParser(x, text=True))

In [466]:
df['ratings_histogram'] = df['ratings_histogram'].apply(lambda x: ColumnParser(x, text=False))

In [467]:
df['working_hours'] = df['working_hours'].apply(lambda x: ColumnParser(x, text=True))

In [468]:
df

Unnamed: 0,address,category,claimed_status,health_rating,info,latitude,longitude,name,neighborhood,phone,price_range,ratings,ratings_histogram,reviews,url,website,working_hours
0,"553 West Diversey Pkwy Chicago, IL 60614 b/t H...","Breakfast & Brunch,American (Traditional)",Claimed,,"{'Takes Reservations': 'No', 'Delivery': 'No',...",41.932572,-87.643253,2 Sparrows,Lincoln Park,(773) 234-2320,$11-30,3.0,"{'5': '63', '4': '94', '3': '67', '2': '78', '...",336.0,https://www.yelp.com/biz/2-sparrows-chicago,http://2sparrowschicago.com,{}
1,"140 E Walton St Chicago, IL 60611 b/t Michigan...","Lounges,American (Traditional)",Claimed,,"{'Takes Reservations': 'No', 'Delivery': 'No',...",41.900147,-87.623480,Coq D’or Restaurant & Lounge,"Near North Side, Streeterville",(312) 787-2200,$11-30,3.5,"{'5': '39', '4': '41', '3': '24', '2': '16', '...",131.0,https://www.yelp.com/biz/coq-d-or-restaurant-a...,,"{'Mon': '11:00 am - 1:00 am', 'Tue': '11:00 am..."
2,"2111 W Armitage Ave Chicago, IL 60647 b/t Leav...","Cuban,Latin American",Claimed,,"{'Takes Reservations': 'Yes', 'Delivery': 'No'...",41.917521,-87.680450,Cafe Laguardia,Bucktown,(773) 862-5996,$11-30,3.5,"{'5': '95', '4': '187', '3': '81', '2': '43', ...",433.0,https://www.yelp.com/biz/cafe-laguardia-chicago,http://www.cafelaguardia.com,{}
3,"871 N Rush St Chicago, IL 60686 Near North Side",Italian,,,{},41.898765,-87.626551,Luciano’s on Rush,Near North Side,(312) 266-1414,,3.5,"{'5': '1', '4': '3', '3': '0', '2': '0', '1': ...",5.0,https://www.yelp.com/biz/cafe-luciano-chicago,,{}
4,"1846 N Milwaukee Ave Chicago, IL 60647 b/t Blo...",French,Claimed,,"{'Takes Reservations': 'Yes', 'Delivery': 'No'...",41.914953,-87.684838,Cafe Matou,Bucktown,(773) 384-8911,$31-60,4.0,"{'5': '30', '4': '44', '3': '15', '2': '8', '1...",99.0,https://www.yelp.com/biz/cafe-matou-chicago,http://www.cafematou.com,{}
5,"4749 N Spaulding Ave Chicago, IL 60625 b/t Lel...",Coffee & Tea,Claimed,,"{'Take-out': 'Yes', 'Accepts Credit Cards': 'Y...",41.968092,-87.710801,Cafe Chien,Albany Park,(312) 493-2658,Inexpensive,4.5,"{'5': '30', '4': '12', '3': '3', '2': '1', '1'...",46.0,https://www.yelp.com/biz/cafe-chien-chicago,http://www.cafechien.com,{}
6,"4729 N Lincoln Ave Chicago, IL 60625 b/t Lelan...","Bakeries,Breakfast & Brunch,Cafes",Claimed,,"{'Takes Reservations': 'No', 'Delivery': 'No',...",41.967456,-87.687520,Cafe Selmarie,"Lincoln Square, Ravenswood",(773) 989-5595,$11-30,4.0,"{'5': '236', '4': '272', '3': '72', '2': '35',...",635.0,https://www.yelp.com/biz/cafe-selmarie-chicago,http://www.cafeselmarie.com,"{'Mon': 'Closed', 'Tue': '8:00 am - 9:00 pm', ..."
7,"225 W Wacker Dr Chicago, IL 60606 b/t Post Pl ...","Italian,Caterers,Cafes",Claimed,,"{'Takes Reservations': 'No', 'Delivery': 'Yes'...",41.886508,-87.634999,Caffè Baci,The Loop,(312) 251-0135,$11-30,3.5,"{'5': '11', '4': '14', '3': '10', '2': '7', '1...",45.0,https://www.yelp.com/biz/caffe-baci-chicago,http://www.caffebaci.com,"{'Mon': '6:30 am - 9:00 pm', 'Tue': '6:30 am -..."
8,"2005 W Division Street Chicago, IL 60622 West ...",Mexican,,,"{'Takes Reservations': 'Yes', 'Take-out': 'Yes...",41.903050,-87.677654,Adobo Grill,"West Town, Ukrainian Village",(773) 252-9990,$11-30,3.5,"{'5': '0', '4': '7', '3': '7', '2': '1', '1': ...",16.0,https://www.yelp.com/biz/adobo-grill-chicago,http://www.adobogrill.com,{}
9,"1132 W Grand Ave Chicago, IL 60622 b/t May St ...",Restaurants,,,{},41.891282,-87.656057,Cannella’s On Grand,"Near West Side, Noble Square, West Town",(312) 433-9400,,5.0,"{'5': '1', '4': '0', '3': '0', '2': '0', '1': ...",1.0,https://www.yelp.com/biz/cannellas-on-grand-ch...,,{}


In [470]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 546 entries, 0 to 545
Data columns (total 17 columns):
address              536 non-null object
category             519 non-null object
claimed_status       335 non-null object
health_rating        0 non-null float64
info                 546 non-null object
latitude             536 non-null float64
longitude            536 non-null float64
name                 536 non-null object
neighborhood         532 non-null object
phone                525 non-null object
price_range          448 non-null object
ratings              546 non-null float64
ratings_histogram    546 non-null object
reviews              493 non-null float64
url                  546 non-null object
website              373 non-null object
working_hours        546 non-null object
dtypes: float64(5), object(12)
memory usage: 72.6+ KB


In [463]:
ColumnParser(df['working_hours'][1],text=True)

{'Fri': '11:00 am - 2:00 am',
 'Mon': '11:00 am - 1:00 am',
 'Sat': '11:00 am - 2:00 am',
 'Sun': '11:00 am - 1:00 am\\n        \\n                Open now',
 'Thu': '11:00 am - 1:00 am',
 'Tue': '11:00 am - 1:00 am',
 'Wed': '11:00 am - 1:00 am'}

In [464]:
ColumnParser(df['ratings_histogram'][0], text=False)

{'1': '34', '2': '78', '3': '67', '4': '94', '5': '63'}

In [388]:
info_dict = {}
for item in df['new_info']:
    for key in list(item.keys()):
        if key not in info_dict.keys():
            info_dict[key] = 1            
        else:
            info_dict[key] += 1

In [404]:
info_df = pd.DataFrame(pd.Series(info_dict), columns=['counts'])

In [405]:
info_df.columns

Index(['counts'], dtype='object')

In [407]:
info_df.shape

(37, 1)

In [412]:
ordered_info = info_df.sort_values(by='counts', ascending=False)
ordered_info

Unnamed: 0,counts
Accepts Credit Cards,449
Good for Groups,427
Good for Kids,418
Attire,417
Takes Reservations,415
Outdoor Seating,412
Take-out,408
Parking,406
Alcohol,395
Delivery,394


In [418]:
wanted_info = ordered_info.index[:13]
wanted_info

Index(['Accepts Credit Cards', 'Good for Groups', 'Good for Kids', 'Attire',
       'Takes Reservations', 'Outdoor Seating', 'Take-out', 'Parking',
       'Alcohol', 'Delivery', 'Noise Level', 'Has TV', 'Wi-Fi'],
      dtype='object')

In [423]:
testtext = df['ratings_histogram'][0]
testtext

'[{5: 63}, {4: 94}, {3: 67}, {2: 78}, {1: 34}]'

In [431]:
import re
re.findall(r"<(.*?)>", '<a> b <c>')

['a', 'c']

In [433]:
re.findall(r"\{(.*?)\}", testtext)

['5: 63', '4: 94', '3: 67', '2: 78', '1: 34']

In [435]:
df['info'][0]

"[{'Takes Reservations': 'No'}, {'Delivery': 'No'}, {'Take-out': 'Yes'}, {'Accepts Credit Cards': 'Yes'}, {'Parking': 'Street'}, {'Bike Parking': 'Yes'}, {'Good for Kids': 'Yes'}, {'Good for Groups': 'Yes'}, {'Attire': 'Casual'}, {'Ambience': 'Trendy'}, {'Noise Level': 'Average'}, {'Alcohol': 'Full Bar'}, {'Outdoor Seating': 'No'}, {'Wi-Fi': 'Free'}, {'Has TV': 'No'}, {'Caters': 'No'}]"

In [443]:
info_list = re.findall(r"\{(.*?)\}", df['info'][0])
info_list

["'Takes Reservations': 'No'",
 "'Delivery': 'No'",
 "'Take-out': 'Yes'",
 "'Accepts Credit Cards': 'Yes'",
 "'Parking': 'Street'",
 "'Bike Parking': 'Yes'",
 "'Good for Kids': 'Yes'",
 "'Good for Groups': 'Yes'",
 "'Attire': 'Casual'",
 "'Ambience': 'Trendy'",
 "'Noise Level': 'Average'",
 "'Alcohol': 'Full Bar'",
 "'Outdoor Seating': 'No'",
 "'Wi-Fi': 'Free'",
 "'Has TV': 'No'",
 "'Caters': 'No'"]

In [445]:
re.findall(r"\'(.*?)\'", "'Takes Reservations': 'No, Yes'")
# for item in info_list:
#     key = re.findall(r"\{(.*?)\}", df['info'][0])

['Takes Reservations', 'No, Yes']

In [397]:
info_df.sort_values(by=)

<bound method Series.sort_values of Accepts Android Pay          19
Accepts Apple Pay            98
Accepts Bitcoin              46
Accepts Credit Cards        449
Ages Allowed                  1
Alcohol                     395
Ambience                    291
Attire                      417
Best Nights                  13
Bike Parking                238
By Appointment Only           5
Caters                      283
Coat Check                   15
Delivery                    394
Dogs Allowed                 19
Drive-Thru                    6
Gender Neutral Restrooms     10
Good For                     81
Good For Dancing             21
Good for Groups             427
Good for Kids               418
Good for Working              6
Happy Hour                   18
Has ATM                       1
Has Pool Table               36
Has TV                      325
Music                        12
Noise Level                 326
Offers Military Discount      1
Outdoor Seating             412
Park

In [145]:
keys = list(df['info'][0].keys())
keys

['Takes Reservations',
 'Delivery',
 'Take-out',
 'Accepts Credit Cards',
 'Parking',
 'Bike Parking',
 'Good for Kids',
 'Good for Groups',
 'Attire',
 'Ambience',
 'Noise Level',
 'Alcohol',
 'Outdoor Seating',
 'Wi-Fi',
 'Has TV',
 'Caters']

In [155]:
info_dict = {}
for item in df['info']:
    print(item)
    for key in list(item.keys()):
        if key not in info_dict.keys():
            info_dict[key] = 1            
            print('not in !')
        else:
            info_dict[key] = info_dcit[key] + 1
            print('in!')

{'Takes Reservations': 'No', 'Delivery': 'No', 'Take-out': 'Yes', 'Accepts Credit Cards': 'Yes', 'Parking': 'Street', 'Bike Parking': 'Yes', 'Good for Kids': 'Yes', 'Good for Groups': 'Yes', 'Attire': 'Casual', 'Ambience': 'Trendy', 'Noise Level': 'Average', 'Alcohol': 'Full Bar', 'Outdoor Seating': 'No', 'Wi-Fi': 'Free', 'Has TV': 'No', 'Caters': 'No'}
not in !
not in !
not in !
not in !
not in !
not in !
not in !
not in !
not in !
not in !
not in !
not in !
not in !
not in !
not in !
not in !
{'Takes Reservations': 'No', 'Delivery': 'No', 'Take-out': 'Yes', 'Accepts Credit Cards': 'Yes', 'Parking': 'Street', 'Bike Parking': 'Yes', 'Good for Kids': 'Yes', 'Good for Groups': 'Yes', 'Attire': 'Casual', 'Ambience': 'Trendy', 'Noise Level': 'Average', 'Alcohol': 'Full Bar', 'Outdoor Seating': 'No', 'Wi-Fi': 'Free', 'Has TV': 'No', 'Caters': 'No'}


KeyError: 'Takes Reservations'

In [154]:
info_dict

{'Accepts Credit Cards': 1,
 'Alcohol': 1,
 'Ambience': 1,
 'Attire': 1,
 'Bike Parking': 1,
 'Caters': 1,
 'Delivery': 1,
 'Good for Groups': 1,
 'Good for Kids': 1,
 'Has TV': 1,
 'Noise Level': 1,
 'Outdoor Seating': 1,
 'Parking': 1,
 'Take-out': 1,
 'Takes Reservations': 1,
 'Wi-Fi': 1}

In [136]:
df['info'][0]

{'Accepts Credit Cards': 'Yes',
 'Alcohol': 'Full Bar',
 'Ambience': 'Trendy',
 'Attire': 'Casual',
 'Bike Parking': 'Yes',
 'Caters': 'No',
 'Delivery': 'No',
 'Good for Groups': 'Yes',
 'Good for Kids': 'Yes',
 'Has TV': 'No',
 'Noise Level': 'Average',
 'Outdoor Seating': 'No',
 'Parking': 'Street',
 'Take-out': 'Yes',
 'Takes Reservations': 'No',
 'Wi-Fi': 'Free'}

In [131]:
info0 = df['info'][0]

In [132]:
InfoParser(info0)

{'Accepts Credit Cards': 'Yes',
 'Alcohol': 'Full Bar',
 'Ambience': 'Trendy',
 'Attire': 'Casual',
 'Bike Parking': 'Yes',
 'Caters': 'No',
 'Delivery': 'No',
 'Good for Groups': 'Yes',
 'Good for Kids': 'Yes',
 'Has TV': 'No',
 'Noise Level': 'Average',
 'Outdoor Seating': 'No',
 'Parking': 'Street',
 'Take-out': 'Yes',
 'Takes Reservations': 'No',
 'Wi-Fi': 'Free'}

In [115]:
info0

"[{'Takes Reservations': 'No'}, {'Delivery': 'No'}, {'Take-out': 'Yes'}, {'Accepts Credit Cards': 'Yes'}, {'Parking': 'Street'}, {'Bike Parking': 'Yes'}, {'Good for Kids': 'Yes'}, {'Good for Groups': 'Yes'}, {'Attire': 'Casual'}, {'Ambience': 'Trendy'}, {'Noise Level': 'Average'}, {'Alcohol': 'Full Bar'}, {'Outdoor Seating': 'No'}, {'Wi-Fi': 'Free'}, {'Has TV': 'No'}, {'Caters': 'No'}]"

In [123]:
Dict = {}
for item in info0[1:-1].split(', '):
    key = item[1:-1].split(': ')[0][1:-1]
    value = item[1:-1].split(': ')[1][1:-1]
    Dict[key] = value
    print(key,value)
Dict

Takes Reservations No
Delivery No
Take-out Yes
Accepts Credit Cards Yes
Parking Street
Bike Parking Yes
Good for Kids Yes
Good for Groups Yes
Attire Casual
Ambience Trendy
Noise Level Average
Alcohol Full Bar
Outdoor Seating No
Wi-Fi Free
Has TV No
Caters No


{'Accepts Credit Cards': 'Yes',
 'Alcohol': 'Full Bar',
 'Ambience': 'Trendy',
 'Attire': 'Casual',
 'Bike Parking': 'Yes',
 'Caters': 'No',
 'Delivery': 'No',
 'Good for Groups': 'Yes',
 'Good for Kids': 'Yes',
 'Has TV': 'No',
 'Noise Level': 'Average',
 'Outdoor Seating': 'No',
 'Parking': 'Street',
 'Take-out': 'Yes',
 'Takes Reservations': 'No',
 'Wi-Fi': 'Free'}

In [112]:
df['info'][0][1:-1].split(', ')[0][1:-1].split(': ')[0][1:-1]

'Takes Reservations'

In [101]:
for info in df['info']:
    print(info)

[{'Takes Reservations': 'No'}, {'Delivery': 'No'}, {'Take-out': 'Yes'}, {'Accepts Credit Cards': 'Yes'}, {'Parking': 'Street'}, {'Bike Parking': 'Yes'}, {'Good for Kids': 'Yes'}, {'Good for Groups': 'Yes'}, {'Attire': 'Casual'}, {'Ambience': 'Trendy'}, {'Noise Level': 'Average'}, {'Alcohol': 'Full Bar'}, {'Outdoor Seating': 'No'}, {'Wi-Fi': 'Free'}, {'Has TV': 'No'}, {'Caters': 'No'}]
[{'Takes Reservations': 'No'}, {'Delivery': 'No'}, {'Take-out': 'Yes'}, {'Accepts Credit Cards': 'Yes'}, {'Accepts Apple Pay': 'No'}, {'Accepts Android Pay': 'No'}, {'Accepts Bitcoin': 'No'}, {'Good For': 'Late Night'}, {'Parking': 'Valet'}, {'Bike Parking': 'No'}, {'Good for Kids': 'No'}, {'Good for Groups': 'Yes'}, {'Attire': 'Dressy'}, {'Ambience': 'Romantic, Classy, Upscale'}, {'Noise Level': 'Average'}, {'Alcohol': 'Full Bar'}, {'Outdoor Seating': 'No'}, {'Wi-Fi': 'Free'}, {'Has TV': 'Yes'}, {'Caters': 'No'}]
[{'Takes Reservations': 'Yes'}, {'Delivery': 'No'}, {'Take-out': 'Yes'}, {'Accepts Credit Ca

In [26]:
a = '24_ann-sathers_review.csv'

In [29]:
a[:-11]

'24_ann-sathers'

In [319]:
df1 = pd.DataFrame({'a':[1,2,1], 'b':[3,4,4]})

In [320]:
df1

Unnamed: 0,a,b
0,1,3
1,2,4
2,1,4


In [330]:
a = df1[df1['a'] == 1].shape[0]

In [331]:
a

2

In [662]:
df2 = pd.DataFrame({'a':[5,6], 'b':[7,8], 'c':[9,10]})

In [657]:
df1 = df1.append(df2, ignore_index=True)

In [663]:
df1['c'] = [9,10]

In [664]:
df1

Unnamed: 0,a,b,c
0,1,3,9
1,2,4,10


In [256]:
info = {'a':[1], 'b':[2]}

In [257]:
df_info = pd.DataFrame(info)

In [267]:
df_info

Unnamed: 0,a,b
0,1,2
1,1,2
2,1,2
3,1,2


In [261]:
df_review = pd.DataFrame({'star':[0,1,2], 'time':[7,8,9], 'review':['good','bad','ok']})

In [262]:
df_review

Unnamed: 0,review,star,time
0,good,0,7
1,bad,1,8
2,ok,2,9


In [278]:
df_info = pd.DataFrame(info)
df_info_copy = df_info.copy()
print(df_info)
print()
for i in range(df_review.shape[0]-1):
    print(i)
    df_info = df_info.append(df_info_copy, ignore_index=True)
    print(df_info)
    print()

   a  b
0  1  2

0
   a  b
0  1  2
1  1  2

1
   a  b
0  1  2
1  1  2
2  1  2



In [280]:
df_info.join(df_review)

Unnamed: 0,a,b,review,star,time
0,1,2,good,0,7
1,1,2,bad,1,8
2,1,2,ok,2,9


In [196]:
index = ['Firefox', 'Chrome', 'Safari', 'IE10', 'Konqueror']
test = pd.DataFrame({'http_status': [200,200,404,404,301],'response_time': [0.04, 0.02, 0.07, 0.08, 1.0]},index=index)

In [197]:
test

Unnamed: 0,http_status,response_time
Firefox,200,0.04
Chrome,200,0.02
Safari,404,0.07
IE10,404,0.08
Konqueror,301,1.0


In [199]:
new_index= ['Safari', 'Iceweasel', 'Comodo Dragon', 'IE10','Chrome']
test.reindex(new_index)

Unnamed: 0,http_status,response_time
Safari,404.0,0.07
Iceweasel,,
Comodo Dragon,,
IE10,404.0,0.08
Chrome,200.0,0.02
