# Jonathan Halverson
# Keeping it Fresh: Predict Restaurant Inspections
## Part 8: Model based on categories and neighborhoods

Here we formulate a simple time-independent model based on the business categories and neighborhoods.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Render matplotlib figures directly in the notebook output area.
%matplotlib inline
# Apply the 'halverson' style sheet to all figures.
# NOTE(review): this does not look like a style bundled with matplotlib, so it
# presumably has to be installed in the local style library -- confirm.
plt.style.use('halverson')

### Load the training data:

In [2]:
# Read the inspection labels with the 'date' column parsed as datetimes, then
# expose that column under the name 'inspect_date'.
df_vio = (
    pd.read_csv('data/training_labels.txt', parse_dates=['date'])
    .rename(columns={'date': 'inspect_date'})
)
df_vio.head()

Unnamed: 0,id,inspect_date,restaurant_id,*,**,***
0,589,2010-02-02,KAoKWjog,3,0,1
1,28589,2009-12-10,p038M4om,2,0,0
2,31170,2008-07-16,B1oXymOV,4,0,0
3,2600,2015-01-30,m0oWJl3G,1,0,3
4,1016,2012-03-19,rJoQwlEV,0,0,0


### Clean the training data by removing duplicate inspections

In [3]:
from helper_methods import drop_duplicate_inspections

# Order the inspections by restaurant and then by date, and pass the ordered
# frame to the project helper with threshold=60 (the threshold's meaning is
# defined by the helper).
df_vio = drop_duplicate_inspections(
    df_vio.sort_values(['restaurant_id', 'inspect_date']),
    threshold=60
)
df_vio.head()

Unnamed: 0,id,inspect_date,restaurant_id,*,**,***
1801,28144,2007-09-21,0ZED0WED,3,1,0
551,24765,2008-03-26,0ZED0WED,3,1,0
5460,25193,2008-10-08,0ZED0WED,6,2,4
3641,12775,2009-03-03,0ZED0WED,3,0,0
18452,25850,2009-07-23,0ZED0WED,1,0,2


### Load the data to relate restaurant id to business id

In [4]:
trans = pd.read_csv('data/restaurant_ids_to_yelp_ids.csv')
# Keep only the rows whose second Yelp id slot is empty, then discard the
# extra id columns so two columns remain.
has_no_second_id = trans['yelp_id_1'].isnull()
trans = trans.loc[has_no_second_id].drop(['yelp_id_1', 'yelp_id_2', 'yelp_id_3'], axis=1)
# Positional rename of the two remaining columns.
trans.columns = ['restaurant_id', 'business_id']
trans.head()

Unnamed: 0,restaurant_id,business_id
0,Y1Em4GOw,5Kdf1DGbRScRk6Cx3jaX8w
1,KAoKP6Og,Urw6NASrebP6tyFdjwjkwQ
2,WeEe7eoa,xlOE7jqbW1Q_PrvLBVlegQ
3,V430mqoB,ktYpqtygWIJ2RjVPGTxNaA
4,ekE4Qz32,n8CsQy7Iy1IMhP85hPVKPA


In [5]:
# Inner join on restaurant_id: attach the business_id to each inspection record.
df_trans = trans.merge(df_vio, how='inner', on='restaurant_id')
df_trans.head()

Unnamed: 0,restaurant_id,business_id,id,inspect_date,*,**,***
0,Y1Em4GOw,5Kdf1DGbRScRk6Cx3jaX8w,17,2015-02-06,1,0,0
1,KAoKP6Og,Urw6NASrebP6tyFdjwjkwQ,9337,2007-06-27,0,0,0
2,KAoKP6Og,Urw6NASrebP6tyFdjwjkwQ,5052,2008-03-25,8,0,0
3,KAoKP6Og,Urw6NASrebP6tyFdjwjkwQ,12585,2009-04-23,5,0,2
4,KAoKP6Og,Urw6NASrebP6tyFdjwjkwQ,3583,2010-03-25,4,0,0


### Load the business data

In [6]:
from helper_methods import read_json

# Load the Yelp business records through the project helper.
df_biz = read_json('data/yelp_academic_dataset_business.json')
# Transposed preview: the first two businesses, one column per business.
df_biz.head(2).T

Unnamed: 0,0,1
attributes,"{u'Price Range': 1, u'Accepts Credit Cards': T...","{u'Take-out': True, u'Price Range': 1, u'Outdo..."
business_id,Jp9svt7sRT4zwdbzQ8KQmw,CgdK8DiyX9Y4kTKEPi_qgA
categories,"[Bakeries, Food, Event Planning & Services, Ca...","[Delis, Restaurants, Event Planning & Services..."
city,Boston,Boston
full_address,"75 Federal Street\nFinancial District\nBoston,...","8 City Hall Ave\nBeacon Hill\nBoston, MA 02108"
hours,{},"{u'Tuesday': {u'close': u'15:00', u'open': u'0..."
latitude,42.3551,42.3575
longitude,-71.0565,-71.0691
name,Rebecca's Cafe,Delicato Cafe & Caterer
neighborhoods,[Financial District],[Beacon Hill]


### Add the neighborhoods as features

In [7]:
# Concatenate the per-business neighborhood lists and keep the distinct labels.
neighborhoods = list(set(df_biz['neighborhoods'].sum()))
# One 0/1 indicator column per neighborhood label.
for neighborhood in neighborhoods:
    df_biz[neighborhood] = df_biz['neighborhoods'].map(lambda labels: int(neighborhood in labels))
# Businesses with an empty neighborhood list are flagged under 'Other'.
df_biz['Other'] = df_biz['neighborhoods'].map(lambda labels: int(labels == []))
neighborhoods.append('Other')
df_biz[['neighborhoods'] + neighborhoods].head(5).T

Unnamed: 0,0,1,2,3,4
neighborhoods,[Financial District],[Beacon Hill],[Downtown],"[Beacon Hill, Downtown]",[Downtown]
Roslindale Village,0,0,0,0,0
Fenway,0,0,0,0,0
Financial District,1,0,0,0,0
East Boston,0,0,0,0,0
Hyde Park,0,0,0,0,0
West Roxbury,0,0,0,0,0
Fields Corner,0,0,0,0,0
Mission Hill,0,0,0,0,0
Allston/Brighton,0,0,0,0,0


In [8]:
# Number of businesses per neighborhood indicator, in alphabetical order.
# 'Other' was added so that businesses with an empty neighborhood list are
# still assigned to a column.
df_biz[neighborhoods].sum().sort_index()

Allston/Brighton       152
Back Bay               216
Beacon Hill             52
Central Square           1
Charlestown             31
Chinatown               75
Dorchester              94
Downtown               123
Dudley Square           11
East Boston             88
Egleston Square          2
Fenway                  52
Fields Corner            5
Financial District      93
Hyde Park                8
Jamaica Plain           71
Leather District        15
Mattapan                 7
Mission Hill            13
North End              103
Other                  417
Roslindale              40
Roslindale Village      19
South Boston           149
South End               96
Uphams Corner            5
Waterfront              85
West Roxbury            40
West Roxbury Center     21
dtype: int64

In [9]:
# Distribution of how many neighborhood indicators are set per business;
# the minimum is 1 because 'Other' covers businesses with no listed neighborhood.
df_biz.loc[:, neighborhoods].sum(axis=1).value_counts()

1    1793
2     120
3      17
dtype: int64

### Add the categories as features

In [10]:
# Concatenate the per-business category lists and keep the distinct labels.
categories = list(set(df_biz['categories'].sum()))
# One 0/1 indicator column per category label.
for category in categories:
    df_biz[category] = df_biz['categories'].map(lambda labels: int(category in labels))
df_biz[['categories'] + categories].head(3).T

Unnamed: 0,0,1,2
categories,"[Bakeries, Food, Event Planning & Services, Ca...","[Delis, Restaurants, Event Planning & Services...","[Delis, Restaurants]"
Tapas/Small Plates,0,0,0
Fondue,0,0,0
Buffets,0,0,0
Gluten-Free,0,0,0
Arts & Entertainment,0,0,0
Sandwiches,0,0,0
Austrian,0,0,0
Creperies,0,0,0
Sushi Bars,0,0,0


In [11]:
# Distribution of how many category indicators are set per business.
df_biz.loc[:, categories].sum(axis=1).value_counts().sort_index()

1     36
2    830
3    476
4    325
5    188
6     63
7     12
dtype: int64

### Add average violations as a feature

In [12]:
# Mean of the '*' violation count over all inspections of each business,
# stored under the column name 'mean-*' and indexed by business_id.
avg_violations = (
    df_trans.groupby('business_id')[['*']]
    .mean()
    .rename(columns={'*': 'mean-*'})
)
avg_violations.head(3)

Unnamed: 0_level_0,mean-*
business_id,Unnamed: 1_level_1
-2DW8dE1S0eu48ZnfJeAGQ,7.333333
-2jQG9-gNH3TshkF-hhR0Q,5.4
-5ieYfS-fjUyK93b6Vbv-Q,4.066667


In [13]:
# Attach each business's mean '*' violation count to the business table;
# avg_violations is keyed by its index, df_biz by its business_id column.
# NOTE(review): avg_violations is derived from df_trans, i.e. from all
# inspections before the train/test split performed later in this notebook,
# so this feature also reflects the outcomes of inspections that end up in
# the test set.
df_biz = pd.merge(avg_violations, df_biz, left_index=True, right_on='business_id', how='inner')
# Divide by the column maximum so the largest value becomes 1. The maximum is
# computed once and applied as a vectorized division, rather than being
# recomputed inside a per-row lambda.
df_biz['mean-*'] = df_biz['mean-*'] / df_biz['mean-*'].max()

### Add crime density as a feature

In [14]:
# Read the crime density table, supplying the three column names explicitly.
cd = pd.read_csv('crime_density.csv', names=['crime_density', 'business_id', 'stars'])
# Inner join: keep only businesses present in both tables.
df_biz = cd.merge(df_biz, how='inner', on='business_id')

### Finally, join the business data with the violation data

In [15]:
# One row per inspection, carrying the features of the inspected business.
df_cmb = df_trans.merge(df_biz, how='inner', on='business_id')
df_cmb.head(2).T

Unnamed: 0,0,1
restaurant_id,Y1Em4GOw,KAoKP6Og
business_id,5Kdf1DGbRScRk6Cx3jaX8w,Urw6NASrebP6tyFdjwjkwQ
id,17,9337
inspect_date,2015-02-06 00:00:00,2007-06-27 00:00:00
*,1,0
**,0,0
***,0,0
crime_density,327,17
stars_x,3.5,2.5
mean-*,0.0526316,0.178947


### Create a train-test split

In [16]:
# Seed the global NumPy RNG so the random row assignment is repeatable.
np.random.seed(0)
# Draw one uniform number per row; rows with a draw below 0.8 go to training.
msk = np.random.rand(len(df_cmb)) < 0.8
df_train, df_test = df_cmb[msk], df_cmb[~msk]

Load modules from scikit-learn and define a helper function to compute scores:

In [17]:
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error as mse

def get_scores(train, test, columns):
    """Fit a grid-searched Lasso on `train` and report train/test MSE.

    Parameters
    ----------
    train : DataFrame
        Rows used for the cross-validated search and for fitting; must
        contain the feature `columns` and the target column '*'.
    test : DataFrame
        Held-out rows used only for scoring; same columns as `train`.
    columns : list of str
        Names of the feature columns.

    Returns
    -------
    str
        A message of the form 'MSE (train) = x.x, MSE (test) = y.y'.
    """
    X_train = train[columns].values
    y_true_train = train['*'].values

    # 15 log-spaced regularization strengths from 1e-3 to 1e3, selected by
    # 10-fold cross-validation on the training rows.
    param_grid = {'alpha':np.logspace(-3, 3, num=15, base=10)}
    gs = GridSearchCV(Lasso(), param_grid, scoring='mean_squared_error', cv=10)
    gs = gs.fit(X_train, y_true_train)

    y_pred_train = gs.predict(X_train)
    y_true_test = test['*'].values
    # Predict on the `test` argument rather than the notebook-level `df_test`,
    # so the function scores whatever held-out frame the caller passes in.
    y_pred_test = gs.predict(test[columns].values)

    mse_train = mse(y_true_train, y_pred_train)
    mse_test = mse(y_true_test, y_pred_test)

    return 'MSE (train) = %.1f, MSE (test) = %.1f' % (mse_train, mse_test)

### Predictive Models

In [18]:
# Model using only the neighborhood indicator columns.
print(get_scores(df_train, df_test, neighborhoods))

MSE (train) = 17.2, MSE (test) = 17.0


In [19]:
# Model using only the category indicator columns.
print(get_scores(df_train, df_test, categories))

MSE (train) = 16.8, MSE (test) = 16.8


In [20]:
# Model using the neighborhood and category indicator columns together.
print(get_scores(df_train, df_test, neighborhoods + categories))

MSE (train) = 16.4, MSE (test) = 16.4


In [21]:
# Model using only the normalized mean '*' violation count per business.
print(get_scores(df_train, df_test, ['mean-*']))

MSE (train) = 13.4, MSE (test) = 13.3


In [22]:
# Model using only the crime density feature.
print(get_scores(df_train, df_test, ['crime_density']))

MSE (train) = 17.8, MSE (test) = 17.5


In [23]:
# Model combining the mean violation feature with crime density.
print(get_scores(df_train, df_test, ['mean-*', 'crime_density']))

MSE (train) = 13.4, MSE (test) = 13.3
