# Load from canonical restaurant data

In [26]:
import pandas as pd
import numpy as np
from os import listdir
from os.path import isfile, join

In [27]:
# Collect the CSV files in the working directory.
# Matching on the suffix (not a substring) skips names such as 'x.csv.bak',
# isfile() skips directories, and sorting makes the listing order stable
# (os.listdir returns entries in arbitrary order).
files = sorted(
    f for f in listdir('./')
    if isfile(join('./', f)) and f.endswith('.csv')
)
files

['CanonicalRestaurants.csv', 'CanonicalSummary.csv', 'ClosedRestaurants.csv']

In [36]:
# Load the canonical restaurant records from the working directory.
df = pd.read_csv(join('./', 'CanonicalRestaurants.csv'))

In [37]:
# Inspect the column names available in the freshly loaded table.
df.columns

Index(['address', 'category', 'claimed_status', 'compound', 'date',
       'first_review', 'health_rating', 'id', 'info', 'last_review',
       'latitude', 'longitude', 'name', 'negative', 'neighborhood', 'neutral',
       'permanently_closed', 'phone', 'positive', 'price_range', 'ratings',
       'ratings_histogram', 'reviews', 'star', 'subjectivity', 'url',
       'website', 'working_hours'],
      dtype='object')

In [38]:
# Size of the loaded table as (rows, columns).
df.shape

(484650, 28)

In [39]:
# Binary indicator features, computed column-wise rather than by calling a
# Python lambda once per row. Values are cast to str first, as the original
# did with str(x); missing entries satisfy neither test and therefore map to 0.
#   Claimed?   -> 1 when claimed_status is exactly 'Claimed'
#   HasWebsite -> 1 when the website field contains the literal text 'http'
df['Claimed?'] = (df['claimed_status'].astype(str) == 'Claimed').astype('int64')
df['HasWebsite'] = (
    df['website'].astype(str).str.contains('http', regex=False).astype('int64')
)

In [40]:
# Columns removed from the working frame. 'claimed_status' and 'website' have
# already been reduced to the 'Claimed?' and 'HasWebsite' indicator columns.
# The drop mutates df in place, so re-running this cell against the same frame
# raises KeyError because the labels are already gone.
droplist = [
    'working_hours', 'health_rating', 'phone', 'url', 'claimed_status',
    'website', 'address', 'longitude', 'latitude', 'reviews',
    'date', 'star', 'subjectivity', 'negative', 'neutral', 'positive', 'compound',
]
df.drop(columns=droplist, inplace=True)

In [41]:
# Discard rows that carry the 'MISSING' placeholder in either review-date
# column, then report the remaining (rows, columns).
has_both_dates = df[['last_review', 'first_review']].ne('MISSING').all(axis=1)
df = df.loc[has_both_dates].copy()
df.shape

(484609, 13)

In [42]:
# Check dtypes and non-null counts of the remaining columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 484609 entries, 0 to 484649
Data columns (total 13 columns):
category              484315 non-null object
first_review          484609 non-null object
id                    484609 non-null object
info                  484609 non-null object
last_review           484609 non-null object
name                  484609 non-null object
neighborhood          482772 non-null object
permanently_closed    484609 non-null int64
price_range           483514 non-null object
ratings               484609 non-null float64
ratings_histogram     484609 non-null object
Claimed?              484609 non-null int64
HasWebsite            484609 non-null int64
dtypes: float64(1), int64(3), object(9)
memory usage: 51.8+ MB


In [43]:
# Parse the review-date strings into datetime64 values so they can be compared
# against a cutoff. ('date' is not parsed: that column was dropped earlier.)
for date_col in ('last_review', 'first_review'):
    df[date_col] = pd.to_datetime(df[date_col])

# Remove closed restaurants whose last review is earlier than 2012-01-01

In [44]:
import datetime

# Cutoff used to filter closed restaurants by their last review date.
# Stored as a pandas Timestamp rather than a bare datetime.date: ordering
# comparisons between a datetime64 column and a datetime.date were deprecated
# and raise TypeError on recent pandas, whereas a Timestamp compares cleanly.
# A Timestamp is still a datetime.date subclass, so date-based uses keep working.
cut_day = pd.Timestamp(datetime.date(2012, 1, 1))

In [45]:
# Flag rows that are both marked permanently closed and last reviewed before
# the cutoff, then keep everything that is NOT flagged.
is_closed = df['permanently_closed'] == 1
reviewed_before_cut = df['last_review'] < cut_day
mask = is_closed & reviewed_before_cut
cut_df = df.loc[~mask].copy()

In [46]:
# Preview the first rows. They repeat the same restaurant, i.e. the table
# still holds duplicate rows at this point.
cut_df.head()

Unnamed: 0,category,first_review,id,info,last_review,name,neighborhood,permanently_closed,price_range,ratings,ratings_histogram,Claimed?,HasWebsite
0,"Breakfast & Brunch,American (Traditional)",2011-10-08,0_2-sparrows,"[{'Takes Reservations': 'No'}, {'Delivery': 'N...",2015-02-02,2 Sparrows,Lincoln Park,1,$11-30,3.0,"[{5: 63}, {4: 94}, {3: 67}, {2: 78}, {1: 34}]",1,1
1,"Breakfast & Brunch,American (Traditional)",2011-10-08,0_2-sparrows,"[{'Takes Reservations': 'No'}, {'Delivery': 'N...",2015-02-02,2 Sparrows,Lincoln Park,1,$11-30,3.0,"[{5: 63}, {4: 94}, {3: 67}, {2: 78}, {1: 34}]",1,1
2,"Breakfast & Brunch,American (Traditional)",2011-10-08,0_2-sparrows,"[{'Takes Reservations': 'No'}, {'Delivery': 'N...",2015-02-02,2 Sparrows,Lincoln Park,1,$11-30,3.0,"[{5: 63}, {4: 94}, {3: 67}, {2: 78}, {1: 34}]",1,1
3,"Breakfast & Brunch,American (Traditional)",2011-10-08,0_2-sparrows,"[{'Takes Reservations': 'No'}, {'Delivery': 'N...",2015-02-02,2 Sparrows,Lincoln Park,1,$11-30,3.0,"[{5: 63}, {4: 94}, {3: 67}, {2: 78}, {1: 34}]",1,1
4,"Breakfast & Brunch,American (Traditional)",2011-10-08,0_2-sparrows,"[{'Takes Reservations': 'No'}, {'Delivery': 'N...",2015-02-02,2 Sparrows,Lincoln Park,1,$11-30,3.0,"[{5: 63}, {4: 94}, {3: 67}, {2: 78}, {1: 34}]",1,1


In [47]:
# (rows, columns) remaining after removing the stale closed restaurants.
cut_df.shape

(476583, 13)

# Collapse the canonical data to one restaurant per row

In [48]:
# Rows that agree on every column collapse to their first occurrence;
# the original index labels of the surviving rows are kept.
cut_df = cut_df.drop_duplicates().copy()
cut_df.shape

(1152, 13)

# Baseline

In [49]:
# Count open (0) vs. permanently closed (1) restaurants after de-duplication.
cut_df['permanently_closed'].value_counts()

0    849
1    303
Name: permanently_closed, dtype: int64

In [50]:
# Re-check dtypes and non-null counts on the de-duplicated table.
cut_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1152 entries, 0 to 484577
Data columns (total 13 columns):
category              1150 non-null object
first_review          1152 non-null datetime64[ns]
id                    1152 non-null object
info                  1152 non-null object
last_review           1152 non-null datetime64[ns]
name                  1152 non-null object
neighborhood          1141 non-null object
permanently_closed    1152 non-null int64
price_range           1127 non-null object
ratings               1152 non-null float64
ratings_histogram     1152 non-null object
Claimed?              1152 non-null int64
HasWebsite            1152 non-null int64
dtypes: datetime64[ns](2), float64(1), int64(3), object(7)
memory usage: 126.0+ KB


In [51]:
# Split the de-duplicated table into independent copies by closure flag:
# flag value 1 -> closed, flag value 0 -> opened.
closed_flag = cut_df['permanently_closed']
closed = cut_df.loc[closed_flag.eq(1)].copy()
opened = cut_df.loc[closed_flag.eq(0)].copy()

In [52]:
# Price-range distribution among the closed restaurants.
# NOTE(review): the counts mix dollar-range labels ('$11-30', 'Under $10') with
# word labels ('Moderate', 'Inexpensive', 'Pricey'); presumably these describe
# the same scale and should be normalised before comparison — confirm.
closed['price_range'].value_counts()

$11-30         167
$31-60          76
Under $10       23
Above $61       12
Moderate        11
Inexpensive      4
Pricey           2
Name: price_range, dtype: int64