In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline

In [2]:
inspect_df = pd.read_csv('health_inspect.csv')

In [3]:
inspect_df.columns

Index(['CAMIS', 'DBA', 'BORO', 'BUILDING', 'STREET', 'ZIPCODE', 'PHONE',
       'CUISINE DESCRIPTION', 'INSPECTION DATE', 'ACTION', 'VIOLATION CODE',
       'VIOLATION DESCRIPTION', 'CRITICAL FLAG', 'SCORE', 'GRADE',
       'GRADE DATE', 'RECORD DATE', 'INSPECTION TYPE'],
      dtype='object')

In [4]:
inspect_df.head()

Unnamed: 0,CAMIS,DBA,BORO,BUILDING,STREET,ZIPCODE,PHONE,CUISINE DESCRIPTION,INSPECTION DATE,ACTION,VIOLATION CODE,VIOLATION DESCRIPTION,CRITICAL FLAG,SCORE,GRADE,GRADE DATE,RECORD DATE,INSPECTION TYPE
0,41640556,COVE LOUNGE,MANHATTAN,325,LENOX AVENUE,10027.0,2126653455,American,08/10/2016,Violations were cited in the following area(s).,18F,Permit not conspicuously displayed.,Not Critical,,,,04/10/2017,Administrative Miscellaneous / Initial Inspection
1,41503575,YUMMY THAI,MANHATTAN,4959,BROADWAY,10034.0,9175290811,Thai,10/24/2016,Violations were cited in the following area(s).,04N,Filth flies or food/refuse/sewage-associated (...,Critical,42.0,,,04/10/2017,Cycle Inspection / Initial Inspection
2,41705626,ZUCKER'S BAGEL,MANHATTAN,370,LEXINGTON AVENUE,10017.0,2126611080,Delicatessen,01/08/2016,Violations were cited in the following area(s).,04N,Filth flies or food/refuse/sewage-associated (...,Critical,10.0,A,01/08/2016,04/10/2017,Cycle Inspection / Re-inspection
3,41709346,DELISH BY AMERIVENTS,QUEENS,4701,111 STREET,11368.0,2122451080,American,07/22/2015,Violations were cited in the following area(s).,04J,Appropriately scaled metal stem-type thermomet...,Critical,17.0,,,04/10/2017,Cycle Inspection / Initial Inspection
4,41467162,"DUNKIN' DONUTS, BASKIN ROBBINS",QUEENS,5401,108th STREET,11368.0,7182718061,American,11/12/2014,Violations were cited in the following area(s).,10F,Non-food contact surface improperly constructe...,Not Critical,11.0,A,11/12/2014,04/10/2017,Cycle Inspection / Initial Inspection


#### Convert dates to datetime rather than string to order by date

In [5]:
inspect_df['INSPECTION DATE'] = pd.to_datetime(inspect_df['INSPECTION DATE'])

In [6]:
inspect_df.sort(columns=['CAMIS','INSPECTION DATE'],ascending=[True,False],inplace=True)

  if __name__ == '__main__':


#### Drop inspections unrelated to food quality

In [7]:
inspect_df = inspect_df[inspect_df['INSPECTION TYPE'].isnull() == False]
inspect_df = inspect_df[~inspect_df['INSPECTION TYPE'].str.contains('Calorie')]
inspect_df = inspect_df[~inspect_df['INSPECTION TYPE'].str.contains('Smoke')]
inspect_df = inspect_df[~inspect_df['INSPECTION TYPE'].str.contains('Trans Fat')]
inspect_df = inspect_df[~inspect_df['INSPECTION TYPE'].str.contains('Administrative')]
inspect_df = inspect_df[~inspect_df['INSPECTION TYPE'].str.contains('Non-operational')]

#### Extract aggregate information on business by CAMIS number

In [8]:
df2 = inspect_df.groupby(['CAMIS','INSPECTION DATE']).count()['DBA']
ids = [(x[0],x[1]) for x in df2.index]
df3 = pd.DataFrame(ids)
df3 = df3.rename(index=str, columns={0: 'CAMIS',1:'Date'})
df3.sort(columns=['CAMIS','Date'],ascending=[True,False],inplace=True)

first_inspect = df3.groupby('CAMIS').nth(-1)
first_inspect = first_inspect.rename(index=str, columns={'Date':'earliest_inspection'})
first_inspect.index = first_inspect.index.map(str)
first_inspect.head()
sec_last_inspect = df3.groupby('CAMIS').nth(1)
sec_last_inspect = sec_last_inspect.rename(index=str, columns={'Date':'second_latest_inspection'})
sec_last_inspect.index = sec_last_inspect.index.map(str)
last_inspect = df3.groupby('CAMIS').nth(0)
last_inspect = last_inspect.rename(index=str, columns={'Date':'latest_inspection'})
last_inspect.index = last_inspect.index.map(str)
last_inspect.head()



Unnamed: 0,latest_inspection
30075445,2016-02-18
30112340,2016-10-27
30191841,2016-05-31
40356018,2016-05-16
40356151,2016-05-14


In [9]:
zips = pd.DataFrame(inspect_df.groupby('CAMIS').first()['ZIPCODE'])
boros = pd.DataFrame(inspect_df.groupby('CAMIS').first()['BORO'])
cuisine = pd.DataFrame(inspect_df.groupby('CAMIS').first()['CUISINE DESCRIPTION'])
build = pd.DataFrame(inspect_df.groupby('CAMIS').first()['BUILDING'])
street = pd.DataFrame(inspect_df.groupby('CAMIS').first()['STREET'])
address = pd.DataFrame(build.BUILDING.str.cat(street.STREET, sep=' '))
num_inspects = pd.DataFrame(inspect_df.groupby('CAMIS')['INSPECTION DATE'].nunique())

zips.index = zips.index.map(str)
zips = zips.rename(index=str, columns={'ZIPCODE': 'zipcode'})
boros.index = boros.index.map(str)
boros = boros.rename(index=str, columns={'BORO': 'boro'})
cuisine.index = cuisine.index.map(str)
cuisine = cuisine.rename(index=str, columns={'CUISINE DESCRIPTION': 'cuisine'})
address.index = address.index.map(str)
address = address.rename(index=str, columns={'BUILDING': 'address'})
num_inspects = num_inspects.rename(index=str, columns={'INSPECTION DATE': 'num_inspections'})

#### Group by Critical & Non-Critical violations, extract counts of most recent inspection

In [10]:
not_crit_df = inspect_df[inspect_df['CRITICAL FLAG']== 'Not Critical']
crit_df = inspect_df[inspect_df['CRITICAL FLAG']== 'Critical']

df = crit_df.groupby(['CAMIS','INSPECTION DATE']).count()['DBA']
ids = [x[0] for x in df.index]
last_inspect_crit = pd.DataFrame(df.groupby(ids).last())
last_inspect_crit = last_inspect_crit.rename(index=str, columns={'DBA': 'crit_violations_recent_inspect'})

df = not_crit_df.groupby(['CAMIS','INSPECTION DATE']).count()['DBA']
ids = [x[0] for x in df.index]
last_inspect_non_crit = pd.DataFrame(df.groupby(ids).last())
last_inspect_non_crit = last_inspect_non_crit.rename(index=str, columns={'DBA': 'non_crit_violations_recent_inspect'})

#### Extract overall total number of violations in each category

In [11]:
non_crit_num = pd.DataFrame(not_crit_df.groupby('CAMIS').count()['CRITICAL FLAG'])
non_crit_num = non_crit_num.rename(index=str, columns={'CRITICAL FLAG': 'non_crit_violations'})
crit_num = pd.DataFrame(crit_df.groupby('CAMIS').count()['CRITICAL FLAG'])
crit_num = crit_num.rename(index=str, columns={'CRITICAL FLAG': 'crit_violations'})

#### Combine dataframes for each piece of aggregate information

In [12]:
violations_df = pd.merge(crit_num,non_crit_num, how='outer', left_index=True, right_index = True)
violations_df.index = violations_df.index.map(str)
violations_df = violations_df.fillna(value=0)
violations_df = violations_df.merge(first_inspect, how='inner', left_index=True, right_index = True)
violations_df = violations_df.merge(last_inspect, how='inner', left_index=True, right_index = True)
violations_df = violations_df.merge(sec_last_inspect, how='inner', left_index=True, right_index = True)
violations_df = violations_df.merge(zips, how='inner', left_index=True, right_index = True)
violations_df['zipcode'] = violations_df['zipcode'].map(int).map(str)
violations_df = violations_df.merge(boros, how='inner', left_index=True, right_index = True)
violations_df = violations_df.merge(cuisine, how='inner', left_index=True, right_index = True)
violations_df = violations_df.merge(address, how='inner', left_index=True, right_index = True)
violations_df = violations_df.merge(num_inspects, how='inner', left_index=True, right_index = True)
violations_df = violations_df.merge(last_inspect_crit, how='inner', left_index=True, right_index = True)
violations_df = violations_df.merge(last_inspect_non_crit, how='inner', left_index=True, right_index = True)
violations_df.head()

Unnamed: 0,crit_violations,non_crit_violations,earliest_inspection,latest_inspection,second_latest_inspection,zipcode,boro,cuisine,address,num_inspections,crit_violations_recent_inspect,non_crit_violations_recent_inspect
30075445,8.0,4.0,2013-08-14,2016-02-18,2015-02-09,10462,BRONX,Bakery,1007 MORRIS PARK AVE,5,1,1
30112340,14.0,10.0,2014-06-05,2016-10-27,2016-10-03,11225,BROOKLYN,Hamburgers,469 FLATBUSH AVENUE,9,1,1
30191841,4.0,7.0,2013-07-22,2016-05-31,2015-09-21,10019,MANHATTAN,Irish,351 WEST 57 STREET,5,1,1
40356018,1.0,7.0,2013-06-05,2016-05-16,2015-06-05,11224,BROOKLYN,American,2780 STILLWELL AVENUE,4,1,1
40356151,13.0,5.0,2014-04-11,2016-05-14,2015-05-29,11369,QUEENS,American,8825 ASTORIA BOULEVARD,7,1,1


#### Ignore businesses that only have 1 inspection (no prior history)

In [13]:
violations_df = violations_df[violations_df.num_inspections > 1]

#### Reduce data leakage by reversing each business to the state just prior to its most recent inspection; 
#### The results of the most recent inspection (2+ critical violations) will form the target variable

In [14]:
violations_df['crit_violations_train'] = violations_df['crit_violations'] - violations_df['crit_violations_recent_inspect']
violations_df['non_crit_violations_train'] = violations_df['non_crit_violations'] - violations_df['non_crit_violations_recent_inspect']
violations_df['num_inspections_train'] = violations_df['num_inspections'] - 1
violations_df['average_crit_v_train'] = violations_df['crit_violations_train'] / violations_df['num_inspections_train']
violations_df['average_non_crit_v_train'] = violations_df['non_crit_violations_train'] / violations_df['num_inspections_train']
violations_df['time_since_last_inspection'] = (violations_df['latest_inspection'] - violations_df['second_latest_inspection'])
violations_df['time_since_last_inspection'] = (violations_df['time_since_last_inspection'] / np.timedelta64(1, 'D')).astype(int)
violations_df['time_since_first_inspection'] = (violations_df['latest_inspection'] - violations_df['earliest_inspection'])
violations_df['time_since_first_inspection'] = (violations_df['time_since_first_inspection'] / np.timedelta64(1, 'D')).astype(int)
violations_df['crit_v_2plus'] = (violations_df['crit_violations_recent_inspect'] > 1)*1
violations_df.head()

Unnamed: 0,crit_violations,non_crit_violations,earliest_inspection,latest_inspection,second_latest_inspection,zipcode,boro,cuisine,address,num_inspections,crit_violations_recent_inspect,non_crit_violations_recent_inspect,crit_violations_train,non_crit_violations_train,num_inspections_train,average_crit_v_train,average_non_crit_v_train,time_since_last_inspection,time_since_first_inspection,crit_v_2plus
30075445,8.0,4.0,2013-08-14,2016-02-18,2015-02-09,10462,BRONX,Bakery,1007 MORRIS PARK AVE,5,1,1,7.0,3.0,4,1.75,0.75,374,918,0
30112340,14.0,10.0,2014-06-05,2016-10-27,2016-10-03,11225,BROOKLYN,Hamburgers,469 FLATBUSH AVENUE,9,1,1,13.0,9.0,8,1.625,1.125,24,875,0
30191841,4.0,7.0,2013-07-22,2016-05-31,2015-09-21,10019,MANHATTAN,Irish,351 WEST 57 STREET,5,1,1,3.0,6.0,4,0.75,1.5,253,1044,0
40356018,1.0,7.0,2013-06-05,2016-05-16,2015-06-05,11224,BROOKLYN,American,2780 STILLWELL AVENUE,4,1,1,0.0,6.0,3,0.0,2.0,346,1076,0
40356151,13.0,5.0,2014-04-11,2016-05-14,2015-05-29,11369,QUEENS,American,8825 ASTORIA BOULEVARD,7,1,1,12.0,4.0,6,2.0,0.666667,351,764,0


In [25]:
violations_df2 = violations_df.drop(['address','crit_violations','earliest_inspection','latest_inspection','second_latest_inspection','non_crit_violations','num_inspections','crit_violations_recent_inspect','non_crit_violations_recent_inspect'],axis=1)
violations_df2.head()

Unnamed: 0,zipcode,boro,cuisine,crit_violations_train,non_crit_violations_train,num_inspections_train,average_crit_v_train,average_non_crit_v_train,time_since_last_inspection,time_since_first_inspection,crit_v_2plus
30075445,10462,BRONX,Bakery,7.0,3.0,4,1.75,0.75,374,918,0
30112340,11225,BROOKLYN,Hamburgers,13.0,9.0,8,1.625,1.125,24,875,0
30191841,10019,MANHATTAN,Irish,3.0,6.0,4,0.75,1.5,253,1044,0
40356018,11224,BROOKLYN,American,0.0,6.0,3,0.0,2.0,346,1076,0
40356151,11369,QUEENS,American,12.0,4.0,6,2.0,0.666667,351,764,0


In [26]:
violations_df3 = pd.get_dummies(violations_df2)

In [27]:
X = violations_df3.drop(['crit_v_2plus'],axis=1)
y = violations_df3['crit_v_2plus']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Calculate metrics of predicting 0 class for all

In [28]:
pred_zero = np.zeros(len(y_test))
print('accuracy = ' + str(sklearn.metrics.accuracy_score(y_test, pred_zero)))
print('recall = ' + str(sklearn.metrics.recall_score(y_test, pred_zero)))
print('precision = ' + str(sklearn.metrics.precision_score(y_test, pred_zero)))
print('f1 = ' + str(sklearn.metrics.f1_score(y_test, pred_zero)))

accuracy = 0.657040519364
recall = 0.0
precision = 0.0
f1 = 0.0


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


### Logistic Regression

w/ L2-regularization

In [29]:
lr = linear_model.LogisticRegression(penalty = 'l2',C=1e20)
lr.fit(X_train, y_train)
pred_lr = lr.predict(X_test)
print('accuracy = ' + str(sklearn.metrics.accuracy_score(y_test, pred_lr)))
print('recall = ' + str(sklearn.metrics.recall_score(y_test, pred_lr)))
print('precision = ' + str(sklearn.metrics.precision_score(y_test, pred_lr)))
print('f1 = ' + str(sklearn.metrics.f1_score(y_test, pred_lr)))

accuracy = 0.654801880457
recall = 0.153394255875
precision = 0.489583333333
f1 = 0.233598409543


w/ L1-regularization

In [30]:
lr = linear_model.LogisticRegression(penalty = 'l1',C=1e20)
lr.fit(X_train, y_train)
pred_lr = lr.predict(X_test)
print('accuracy = ' + str(sklearn.metrics.accuracy_score(y_test, pred_lr)))
print('recall = ' + str(sklearn.metrics.recall_score(y_test, pred_lr)))
print('precision = ' + str(sklearn.metrics.precision_score(y_test, pred_lr)))
print('f1 = ' + str(sklearn.metrics.f1_score(y_test, pred_lr)))

accuracy = 0.653010969331
recall = 0.181462140992
precision = 0.484320557491
f1 = 0.264007597341


### Naive Bayes

#### w/ fit prior:

In [31]:
nb = BernoulliNB(alpha=1,fit_prior=True)
nb.fit(X_train, y_train)
pred_nb = nb.predict(X_test)
print('accuracy = ' + str(sklearn.metrics.accuracy_score(y_test, pred_nb)))
print('recall = ' + str(sklearn.metrics.recall_score(y_test, pred_nb)))
print('precision = ' + str(sklearn.metrics.precision_score(y_test, pred_nb)))
print('f1 = ' + str(sklearn.metrics.f1_score(y_test, pred_nb)))

accuracy = 0.649653010969
recall = 0.0646214099217
precision = 0.428571428571
f1 = 0.112308564946


#### w/ uniform prior

In [32]:
nb = BernoulliNB(alpha=1,fit_prior=False)
nb.fit(X_train, y_train)
pred_nb = nb.predict(X_test)
print('accuracy = ' + str(sklearn.metrics.accuracy_score(y_test, pred_nb)))
print('recall = ' + str(sklearn.metrics.recall_score(y_test, pred_nb)))
print('precision = ' + str(sklearn.metrics.precision_score(y_test, pred_nb)))
print('f1 = ' + str(sklearn.metrics.f1_score(y_test, pred_nb)))

accuracy = 0.538840385046
recall = 0.548955613577
precision = 0.380542986425
f1 = 0.449492250134


### Random Forest

#### 10 trees

In [33]:
rf = RandomForestClassifier(n_estimators=10)
rf.fit(X_train, y_train)
pred_rf = rf.predict(X_test)
print('accuracy = ' + str(sklearn.metrics.accuracy_score(y_test, pred_rf)))
print('recall = ' + str(sklearn.metrics.recall_score(y_test, pred_rf)))
print('precision = ' + str(sklearn.metrics.precision_score(y_test, pred_rf)))
print('f1 = ' + str(sklearn.metrics.f1_score(y_test, pred_rf)))

accuracy = 0.644951869263
recall = 0.218015665796
precision = 0.462603878116
f1 = 0.29636202307


#### 20 trees

In [34]:
rf = RandomForestClassifier(n_estimators=20)
rf.fit(X_train, y_train)
pred_rf = rf.predict(X_test)
print('accuracy = ' + str(sklearn.metrics.accuracy_score(y_test, pred_rf)))
print('recall = ' + str(sklearn.metrics.recall_score(y_test, pred_rf)))
print('precision = ' + str(sklearn.metrics.precision_score(y_test, pred_rf)))
print('f1 = ' + str(sklearn.metrics.f1_score(y_test, pred_rf)))

accuracy = 0.647190508171
recall = 0.211488250653
precision = 0.468208092486
f1 = 0.291366906475
