In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline

In [133]:
inspect_df = pd.read_csv('health_inspect_old.csv')

In [4]:
inspect_df.columns

Index(['CAMIS', 'DBA', 'BORO', 'BUILDING', 'STREET', 'ZIPCODE', 'PHONE',
       'CUISINE DESCRIPTION', 'INSPECTION DATE', 'ACTION', 'VIOLATION CODE',
       'VIOLATION DESCRIPTION', 'CRITICAL FLAG', 'SCORE', 'GRADE',
       'GRADE DATE', 'RECORD DATE', 'INSPECTION TYPE'],
      dtype='object')

In [71]:
inspect_df.head()

Unnamed: 0,CAMIS,DBA,BORO,BUILDING,STREET,ZIPCODE,PHONE,CUISINE DESCRIPTION,INSPECTION DATE,ACTION,VIOLATION CODE,VIOLATION DESCRIPTION,CRITICAL FLAG,SCORE,GRADE,GRADE DATE,RECORD DATE,INSPECTION TYPE
72853,30075445,MORRIS PARK BAKE SHOP,BRONX,1007,MORRIS PARK AVE,10462.0,7188924968,Bakery,2016-02-18,Violations were cited in the following area(s).,08A,Facility not vermin proof. Harborage or condit...,Not Critical,10.0,A,02/18/2016,05/02/2017,Cycle Inspection / Initial Inspection
298956,30075445,MORRIS PARK BAKE SHOP,BRONX,1007,MORRIS PARK AVE,10462.0,7188924968,Bakery,2016-02-18,Violations were cited in the following area(s).,04L,Evidence of mice or live mice present in facil...,Critical,10.0,A,02/18/2016,05/02/2017,Cycle Inspection / Initial Inspection
267536,30075445,MORRIS PARK BAKE SHOP,BRONX,1007,MORRIS PARK AVE,10462.0,7188924968,Bakery,2015-02-09,Violations were cited in the following area(s).,06C,Food not protected from potential source of co...,Critical,6.0,A,02/09/2015,05/02/2017,Cycle Inspection / Initial Inspection
69419,30075445,MORRIS PARK BAKE SHOP,BRONX,1007,MORRIS PARK AVE,10462.0,7188924968,Bakery,2014-03-03,Violations were cited in the following area(s).,10F,Non-food contact surface improperly constructe...,Not Critical,2.0,A,03/03/2014,05/02/2017,Cycle Inspection / Initial Inspection
125736,30075445,MORRIS PARK BAKE SHOP,BRONX,1007,MORRIS PARK AVE,10462.0,7188924968,Bakery,2013-09-11,Violations were cited in the following area(s).,04L,Evidence of mice or live mice present in facil...,Critical,6.0,A,09/11/2013,05/02/2017,Cycle Inspection / Re-inspection


#### Convert dates to datetime rather than string to order by date

In [101]:
inspect_df['INSPECTION DATE'] = pd.to_datetime(inspect_df['INSPECTION DATE'])

In [102]:
inspect_df.sort(columns=['CAMIS','INSPECTION DATE'],ascending=[True,False],inplace=True)

  if __name__ == '__main__':


#### Drop inspections unrelated to food quality

In [103]:
inspect_df = inspect_df[inspect_df['INSPECTION TYPE'].isnull() == False]
inspect_df = inspect_df[~inspect_df['INSPECTION TYPE'].str.contains('Calorie')]
inspect_df = inspect_df[~inspect_df['INSPECTION TYPE'].str.contains('Smoke')]
inspect_df = inspect_df[~inspect_df['INSPECTION TYPE'].str.contains('Trans Fat')]
inspect_df = inspect_df[~inspect_df['INSPECTION TYPE'].str.contains('Administrative')]
inspect_df = inspect_df[~inspect_df['INSPECTION TYPE'].str.contains('Non-operational')]

#### Extract aggregate information on business by CAMIS number

In [104]:
df2 = inspect_df.groupby(['CAMIS','INSPECTION DATE']).count()['DBA']
ids = [(x[0],x[1]) for x in df2.index]
df3 = pd.DataFrame(ids)
df3 = df3.rename(index=str, columns={0: 'CAMIS',1:'Date'})
df3.sort(columns=['CAMIS','Date'],ascending=[True,False],inplace=True)

first_inspect = df3.groupby('CAMIS').nth(-1)
first_inspect = first_inspect.rename(index=str, columns={'Date':'earliest_inspection'})
first_inspect.index = first_inspect.index.map(str)
first_inspect.head()
sec_last_inspect = df3.groupby('CAMIS').nth(1)
sec_last_inspect = sec_last_inspect.rename(index=str, columns={'Date':'second_latest_inspection'})
sec_last_inspect.index = sec_last_inspect.index.map(str)
last_inspect = df3.groupby('CAMIS').nth(0)
last_inspect = last_inspect.rename(index=str, columns={'Date':'latest_inspection'})
last_inspect.index = last_inspect.index.map(str)
last_inspect.head()



Unnamed: 0,latest_inspection
30075445,2016-02-18
30112340,2016-10-27
30191841,2016-05-31
40356018,2016-05-16
40356151,2016-05-14


In [105]:
zips = pd.DataFrame(inspect_df.groupby('CAMIS').first()['ZIPCODE'])
boros = pd.DataFrame(inspect_df.groupby('CAMIS').first()['BORO'])
cuisine = pd.DataFrame(inspect_df.groupby('CAMIS').first()['CUISINE DESCRIPTION'])
build = pd.DataFrame(inspect_df.groupby('CAMIS').first()['BUILDING'])
street = pd.DataFrame(inspect_df.groupby('CAMIS').first()['STREET'])
address = pd.DataFrame(build.BUILDING.str.cat(street.STREET, sep=' '))
num_inspects = pd.DataFrame(inspect_df.groupby('CAMIS')['INSPECTION DATE'].nunique())

zips.index = zips.index.map(str)
zips = zips.rename(index=str, columns={'ZIPCODE': 'zipcode'})
boros.index = boros.index.map(str)
boros = boros.rename(index=str, columns={'BORO': 'boro'})
cuisine.index = cuisine.index.map(str)
cuisine = cuisine.rename(index=str, columns={'CUISINE DESCRIPTION': 'cuisine'})
address.index = address.index.map(str)
address = address.rename(index=str, columns={'BUILDING': 'address'})
num_inspects = num_inspects.rename(index=str, columns={'INSPECTION DATE': 'num_inspections'})

In [130]:
#get all unique camis
camis = np.unique(inspect_df.CAMIS.values)
sec_last_crits = []
prev_2p_crit_insp = []

for i in range(len(camis)):
    #find all inspection dates for a CAMIS and all inspection dates that had critical violations
    inspect_df_camis = inspect_df[inspect_df.CAMIS == camis[i]]
    insp_violation_count = inspect_df_camis.groupby('INSPECTION DATE').count()
    insp_violation_count_crit = inspect_df_camis[inspect_df_camis['CRITICAL FLAG'] == 'Critical'].groupby('INSPECTION DATE').count()
    
    if inspect_df_camis.groupby('INSPECTION DATE').count().shape[0] > 1:
        #get date of the second-to-last inspection
        sec_last_insp = inspect_df_camis.groupby('INSPECTION DATE').count().index[-2]
        #If there were critical violations, tally them up, otherwise set tally to 0
        try:
            sec_last_violations = inspect_df_camis[inspect_df_camis['CRITICAL FLAG'] == 'Critical'].groupby('INSPECTION DATE').count().ix[sec_last_insp][0]
        except KeyError:
            sec_last_violations = 0
        #Line total violation tallies up with critical violation tallies to indentify which inspections had 
        #zero critical violations, then tally up number of prior 2+ critical violation inspections
        crit_df = pd.DataFrame([insp_violation_count['ACTION'],insp_violation_count_crit['CRITICAL FLAG']]).T
        crit_df = crit_df.fillna(0)
        crit_df = crit_df.iloc[0:-1]
        prev_2p_crit_insp.append((crit_df['CRITICAL FLAG'] > 1).sum())
    else:
        sec_last_violations = 0
        prev_2p_crit_insp.append(0)
    sec_last_crits.append(sec_last_violations)

In [131]:
prior_violations_df = pd.DataFrame({'crit_vs_sec_last_insp': sec_last_crits,'prior_2p_crit_insps': prev_2p_crit_insp}, index=camis)
prior_violations_df.index = prior_violations_df.index.map(str)
prior_violations_df.to_csv('prior_violations.csv')

Unnamed: 0,crit_vs_sec_last_insp,prior_2p_crit_insps
40389266,3,4
40391383,1,3
40673117,1,3
40907587,0,1
40959339,2,2
41337287,1,0
41417994,1,2
41495616,1,0
41702352,1,0
50000244,1,6


#### Group by Critical & Non-Critical violations, extract counts of most recent inspection

In [109]:
not_crit_df = inspect_df[inspect_df['CRITICAL FLAG']== 'Not Critical']
crit_df = inspect_df[inspect_df['CRITICAL FLAG']== 'Critical']

df = crit_df.groupby(['CAMIS','INSPECTION DATE']).count()['DBA']
ids = [x[0] for x in df.index]
last_inspect_crit = pd.DataFrame(df.groupby(ids).last())
last_inspect_crit = last_inspect_crit.rename(index=str, columns={'DBA': 'crit_violations_recent_inspect'})

df = not_crit_df.groupby(['CAMIS','INSPECTION DATE']).count()['DBA']
ids = [x[0] for x in df.index]
last_inspect_non_crit = pd.DataFrame(df.groupby(ids).last())
last_inspect_non_crit = last_inspect_non_crit.rename(index=str, columns={'DBA': 'non_crit_violations_recent_inspect'})

#### Extract overall total number of violations in each category

In [110]:
non_crit_num = pd.DataFrame(not_crit_df.groupby('CAMIS').count()['CRITICAL FLAG'])
non_crit_num = non_crit_num.rename(index=str, columns={'CRITICAL FLAG': 'non_crit_violations'})
crit_num = pd.DataFrame(crit_df.groupby('CAMIS').count()['CRITICAL FLAG'])
crit_num = crit_num.rename(index=str, columns={'CRITICAL FLAG': 'crit_violations'})

#### Combine dataframes for each piece of aggregate information

In [139]:
violations_df = pd.merge(crit_num,non_crit_num, how='outer', left_index=True, right_index = True)
violations_df.index = violations_df.index.map(str)
violations_df = violations_df.fillna(value=0)
violations_df = violations_df.merge(first_inspect, how='inner', left_index=True, right_index = True)
violations_df = violations_df.merge(last_inspect, how='inner', left_index=True, right_index = True)
violations_df = violations_df.merge(sec_last_inspect, how='inner', left_index=True, right_index = True)
violations_df = violations_df.merge(zips, how='inner', left_index=True, right_index = True)
violations_df['zipcode'] = violations_df['zipcode'].map(int).map(str)
violations_df = violations_df.merge(boros, how='inner', left_index=True, right_index = True)
violations_df = violations_df.merge(cuisine, how='inner', left_index=True, right_index = True)
violations_df = violations_df.merge(address, how='inner', left_index=True, right_index = True)
violations_df = violations_df.merge(num_inspects, how='inner', left_index=True, right_index = True)
violations_df = violations_df.merge(last_inspect_crit, how='inner', left_index=True, right_index = True)
violations_df = violations_df.merge(last_inspect_non_crit, how='inner', left_index=True, right_index = True)
violations_df = violations_df.merge(a, how = 'left', left_index=True, right_index=True)

#violations_df = violations_df.merge(prior_violations_df, how = 'left', left_index=True, right_index=True)
violations_df.head()

Unnamed: 0,crit_violations,non_crit_violations,earliest_inspection,latest_inspection,second_latest_inspection,zipcode,boro,cuisine,address,num_inspections,crit_violations_recent_inspect,non_crit_violations_recent_inspect,crit_vs_sec_last_insp,prior_2p_crit_insps
30075445,8.0,4.0,2013-08-14,2016-02-18,2015-02-09,10462,BRONX,Bakery,1007 MORRIS PARK AVE,5,1,1,,
30112340,14.0,10.0,2014-06-05,2016-10-27,2016-10-03,11225,BROOKLYN,Hamburgers,469 FLATBUSH AVENUE,9,1,1,,
30191841,4.0,7.0,2013-07-22,2016-05-31,2015-09-21,10019,MANHATTAN,Irish,351 WEST 57 STREET,5,1,1,,
40356018,1.0,7.0,2013-06-05,2016-05-16,2015-06-05,11224,BROOKLYN,American,2780 STILLWELL AVENUE,4,1,1,,
40356151,13.0,5.0,2014-04-11,2016-05-14,2015-05-29,11369,QUEENS,American,8825 ASTORIA BOULEVARD,7,1,1,,


#### Ignore businesses that only have 1 inspection (no prior history)

In [140]:
violations_df = violations_df[violations_df.num_inspections > 1]

#### Reduce data leakage by reversing each business to the state just prior to its most recent inspection; 
#### The results of the most recent inspection (2+ critical violations) will form the target variable

In [113]:
violations_df['crit_violations_train'] = violations_df['crit_violations'] - violations_df['crit_violations_recent_inspect']
violations_df['non_crit_violations_train'] = violations_df['non_crit_violations'] - violations_df['non_crit_violations_recent_inspect']
violations_df['num_inspections_train'] = violations_df['num_inspections'] - 1
violations_df['average_crit_v_train'] = violations_df['crit_violations_train'] / violations_df['num_inspections_train']
violations_df['average_non_crit_v_train'] = violations_df['non_crit_violations_train'] / violations_df['num_inspections_train']
violations_df['time_since_last_inspection'] = (violations_df['latest_inspection'] - violations_df['second_latest_inspection'])
violations_df['time_since_last_inspection'] = (violations_df['time_since_last_inspection'] / np.timedelta64(1, 'D')).astype(int)
violations_df['time_since_first_inspection'] = (violations_df['latest_inspection'] - violations_df['earliest_inspection'])
violations_df['time_since_first_inspection'] = (violations_df['time_since_first_inspection'] / np.timedelta64(1, 'D')).astype(int)
violations_df['crit_v_2plus'] = (violations_df['crit_violations_recent_inspect'] > 1)*1
violations_df.head()

Unnamed: 0,crit_violations,non_crit_violations,earliest_inspection,latest_inspection,second_latest_inspection,zipcode,boro,cuisine,address,num_inspections,...,crit_vs_sec_last_insp,prior_2p_crit_insps,crit_violations_train,non_crit_violations_train,num_inspections_train,average_crit_v_train,average_non_crit_v_train,time_since_last_inspection,time_since_first_inspection,crit_v_2plus
30075445,8.0,4.0,2013-08-14,2016-02-18,2015-02-09,10462,BRONX,Bakery,1007 MORRIS PARK AVE,5,...,1,2,7.0,3.0,4,1.75,0.75,374,918,0
30112340,14.0,10.0,2014-06-05,2016-10-27,2016-10-03,11225,BROOKLYN,Hamburgers,469 FLATBUSH AVENUE,9,...,3,4,13.0,9.0,8,1.625,1.125,24,875,0
30191841,4.0,7.0,2013-07-22,2016-05-31,2015-09-21,10019,MANHATTAN,Irish,351 WEST 57 STREET,5,...,1,0,3.0,6.0,4,0.75,1.5,253,1044,0
40356018,1.0,7.0,2013-06-05,2016-05-16,2015-06-05,11224,BROOKLYN,American,2780 STILLWELL AVENUE,4,...,0,0,0.0,6.0,3,0.0,2.0,346,1076,0
40356151,13.0,5.0,2014-04-11,2016-05-14,2015-05-29,11369,QUEENS,American,8825 ASTORIA BOULEVARD,7,...,1,4,12.0,4.0,6,2.0,0.666667,351,764,0


In [114]:
violations_df2 = violations_df.drop(['address','crit_violations','earliest_inspection','latest_inspection','second_latest_inspection','non_crit_violations','num_inspections','crit_violations_recent_inspect','non_crit_violations_recent_inspect'],axis=1)
violations_df2.head()

Unnamed: 0,zipcode,boro,cuisine,crit_vs_sec_last_insp,prior_2p_crit_insps,crit_violations_train,non_crit_violations_train,num_inspections_train,average_crit_v_train,average_non_crit_v_train,time_since_last_inspection,time_since_first_inspection,crit_v_2plus
30075445,10462,BRONX,Bakery,1,2,7.0,3.0,4,1.75,0.75,374,918,0
30112340,11225,BROOKLYN,Hamburgers,3,4,13.0,9.0,8,1.625,1.125,24,875,0
30191841,10019,MANHATTAN,Irish,1,0,3.0,6.0,4,0.75,1.5,253,1044,0
40356018,11224,BROOKLYN,American,0,0,0.0,6.0,3,0.0,2.0,346,1076,0
40356151,11369,QUEENS,American,1,4,12.0,4.0,6,2.0,0.666667,351,764,0


In [115]:
violations_df3 = pd.get_dummies(violations_df2)

In [116]:
X = violations_df3.drop(['crit_v_2plus'],axis=1)
y = violations_df3['crit_v_2plus']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Calculate metrics of predicting 0 class for all

In [117]:
pred_zero = np.zeros(len(y_test))
print('accuracy = ' + str(sklearn.metrics.accuracy_score(y_test, pred_zero)))
print('recall = ' + str(sklearn.metrics.recall_score(y_test, pred_zero)))
print('precision = ' + str(sklearn.metrics.precision_score(y_test, pred_zero)))
print('f1 = ' + str(sklearn.metrics.f1_score(y_test, pred_zero)))

accuracy = 0.657040519364
recall = 0.0
precision = 0.0
f1 = 0.0


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


### Logistic Regression

w/ L2-regularization

In [118]:
lr = linear_model.LogisticRegression(penalty = 'l2',C=1e20)
lr.fit(X_train, y_train)
pred_lr = lr.predict(X_test)
print('accuracy = ' + str(sklearn.metrics.accuracy_score(y_test, pred_lr)))
print('recall = ' + str(sklearn.metrics.recall_score(y_test, pred_lr)))
print('precision = ' + str(sklearn.metrics.precision_score(y_test, pred_lr)))
print('f1 = ' + str(sklearn.metrics.f1_score(y_test, pred_lr)))

accuracy = 0.666218938885
recall = 0.222584856397
precision = 0.531981279251
f1 = 0.313851817763


w/ L1-regularization

In [119]:
lr = linear_model.LogisticRegression(penalty = 'l1',C=1e20)
lr.fit(X_train, y_train)
pred_lr = lr.predict(X_test)
print('accuracy = ' + str(sklearn.metrics.accuracy_score(y_test, pred_lr)))
print('recall = ' + str(sklearn.metrics.recall_score(y_test, pred_lr)))
print('precision = ' + str(sklearn.metrics.precision_score(y_test, pred_lr)))
print('f1 = ' + str(sklearn.metrics.f1_score(y_test, pred_lr)))

accuracy = 0.663308708305
recall = 0.238250652742
precision = 0.519943019943
f1 = 0.326768128917


### Naive Bayes

#### w/ fit prior:

In [120]:
nb = BernoulliNB(alpha=1,fit_prior=True)
nb.fit(X_train, y_train)
pred_nb = nb.predict(X_test)
print('accuracy = ' + str(sklearn.metrics.accuracy_score(y_test, pred_nb)))
print('recall = ' + str(sklearn.metrics.recall_score(y_test, pred_nb)))
print('precision = ' + str(sklearn.metrics.precision_score(y_test, pred_nb)))
print('f1 = ' + str(sklearn.metrics.f1_score(y_test, pred_nb)))

accuracy = 0.641593910902
recall = 0.160574412533
precision = 0.438502673797
f1 = 0.235069278548


#### w/ uniform prior

In [121]:
nb = BernoulliNB(alpha=1,fit_prior=False)
nb.fit(X_train, y_train)
pred_nb = nb.predict(X_test)
print('accuracy = ' + str(sklearn.metrics.accuracy_score(y_test, pred_nb)))
print('recall = ' + str(sklearn.metrics.recall_score(y_test, pred_nb)))
print('precision = ' + str(sklearn.metrics.precision_score(y_test, pred_nb)))
print('f1 = ' + str(sklearn.metrics.f1_score(y_test, pred_nb)))

accuracy = 0.551152899037
recall = 0.686031331593
precision = 0.408155339806
f1 = 0.511809106404


### Random Forest

#### 10 trees

In [122]:
rf = RandomForestClassifier(n_estimators=10)
rf.fit(X_train, y_train)
pred_rf = rf.predict(X_test)
print('accuracy = ' + str(sklearn.metrics.accuracy_score(y_test, pred_rf)))
print('recall = ' + str(sklearn.metrics.recall_score(y_test, pred_rf)))
print('precision = ' + str(sklearn.metrics.precision_score(y_test, pred_rf)))
print('f1 = ' + str(sklearn.metrics.f1_score(y_test, pred_rf)))

accuracy = 0.652339377658
recall = 0.227806788512
precision = 0.485396383866
f1 = 0.31008440693


#### 20 trees

In [123]:
rf = RandomForestClassifier(n_estimators=20)
rf.fit(X_train, y_train)
pred_rf = rf.predict(X_test)
print('accuracy = ' + str(sklearn.metrics.accuracy_score(y_test, pred_rf)))
print('recall = ' + str(sklearn.metrics.recall_score(y_test, pred_rf)))
print('precision = ' + str(sklearn.metrics.precision_score(y_test, pred_rf)))
print('f1 = ' + str(sklearn.metrics.f1_score(y_test, pred_rf)))

accuracy = 0.671143944482
recall = 0.27088772846
precision = 0.541069100391
f1 = 0.361026533275
