In [1]:
from __future__ import division
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier

from IPython.display import display
%matplotlib inline
pd.options.display.max_rows = 10

## Load the data
### The data has already been preprocessed and augmented with features from other datasets

In [2]:
# Read the prepared feature table; the first CSV column is used as the index.
violations_df = pd.read_csv('health_inspect_allFeatures.csv', index_col=0)

# Keep only rows with complaints_null == 0, i.e. ignore the 23 restaurants
# we couldn't match with 311 data.
has_311_match = violations_df['complaints_null'] == 0
violations_df = violations_df.loc[has_311_match]

display(violations_df)

Unnamed: 0,crit_violations,non_crit_violations,earliest_inspection,latest_inspection,second_latest_inspection,zipcode,boro,cuisine,address,num_inspections,...,liquor_license,sidewalk_license,food_poisoning_complaints,food_establishment_complaints,electric_complaints,safety_complaints,rodent_complaints,dirty_conditions_complaints,missed_collection_complaints,complaints_null
30075445,8,4,8/14/2013,2/18/2016,2/9/2015,10462,BRONX,Bakery,1007 MORRIS PARK AVE,5,...,L,,2.0,5.0,3.0,1.0,24.0,24.0,20.0,0
30112340,14,10,6/5/2014,10/27/2016,10/3/2016,11225,BROOKLYN,Hamburgers,469 FLATBUSH AVENUE,9,...,OP,,5.0,7.0,23.0,3.0,27.0,18.0,4.0,0
30191841,4,7,7/22/2013,5/31/2016,9/21/2015,10019,MANHATTAN,Irish,351 WEST 57 STREET,5,...,AX,Unenclosed,16.0,13.0,172.0,72.0,32.0,33.0,26.0,0
40356018,1,7,6/5/2013,5/16/2016,6/5/2015,11224,BROOKLYN,American,2780 STILLWELL AVENUE,4,...,CT,,0.0,9.0,24.0,4.0,13.0,8.0,3.0,0
40356151,13,5,4/11/2014,5/14/2016,5/29/2015,11369,QUEENS,American,8825 ASTORIA BOULEVARD,7,...,,,0.0,3.0,6.0,0.0,8.0,12.0,20.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50060427,10,4,3/7/2017,4/6/2017,3/20/2017,11217,BROOKLYN,Bagels/Pretzels,73 LAFAYETTE AVE,3,...,OP,Enclosed,3.0,14.0,14.0,2.0,83.0,31.0,13.0,0
50060496,7,4,2/27/2017,3/31/2017,2/27/2017,11232,BROOKLYN,Delicatessen,4102 3RD AVE,2,...,,,0.0,1.0,9.0,4.0,4.0,32.0,3.0,0
50060695,7,4,4/4/2017,4/10/2017,4/4/2017,11432,QUEENS,Pizza,16417 JAMAICA AVE,2,...,,,0.0,0.0,0.0,0.0,1.0,4.0,3.0,0
50060807,2,2,3/9/2017,3/20/2017,3/9/2017,10467,BRONX,Chinese,2861 WHITE PLAINS RD,2,...,AX,,3.0,5.0,55.0,14.0,62.0,24.0,9.0,0


## Drop columns that we know we don't want during training

In [3]:
# Columns removed before training (identifiers/free text, raw inspection
# dates, and violation counts that are not the *_train variants).
# NOTE(review): the per-column rationale is not recorded in this notebook -- confirm.
columns_to_drop = [
    'address',
    'crit_violations',
    'earliest_inspection',
    'latest_inspection',
    'second_latest_inspection',
    'non_crit_violations',
    'num_inspections',
    'crit_violations_recent_inspect',
    'non_crit_violations_recent_inspect',
    'complaints_null',
    'names',
    'search_info',
    'address2',
]
violations_df2 = violations_df.drop(columns_to_drop, axis=1)

display(violations_df2)

Unnamed: 0,zipcode,boro,cuisine,crit_violations_train,non_crit_violations_train,num_inspections_train,average_crit_v_train,average_non_crit_v_train,time_since_last_inspection,time_since_first_inspection,...,rating_null_zip,liquor_license,sidewalk_license,food_poisoning_complaints,food_establishment_complaints,electric_complaints,safety_complaints,rodent_complaints,dirty_conditions_complaints,missed_collection_complaints
30075445,10462,BRONX,Bakery,7,3,4,1.750,0.750000,374,918,...,0,L,,2.0,5.0,3.0,1.0,24.0,24.0,20.0
30112340,11225,BROOKLYN,Hamburgers,13,9,8,1.625,1.125000,24,875,...,0,OP,,5.0,7.0,23.0,3.0,27.0,18.0,4.0
30191841,10019,MANHATTAN,Irish,3,6,4,0.750,1.500000,253,1044,...,0,AX,Unenclosed,16.0,13.0,172.0,72.0,32.0,33.0,26.0
40356018,11224,BROOKLYN,American,0,6,3,0.000,2.000000,346,1076,...,0,CT,,0.0,9.0,24.0,4.0,13.0,8.0,3.0
40356151,11369,QUEENS,American,12,4,6,2.000,0.666667,351,764,...,0,,,0.0,3.0,6.0,0.0,8.0,12.0,20.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50060427,11217,BROOKLYN,Bagels/Pretzels,8,3,2,4.000,1.500000,17,30,...,0,OP,Enclosed,3.0,14.0,14.0,2.0,83.0,31.0,13.0
50060496,11232,BROOKLYN,Delicatessen,4,3,1,4.000,3.000000,32,32,...,0,,,0.0,1.0,9.0,4.0,4.0,32.0,3.0
50060695,11432,QUEENS,Pizza,6,3,1,6.000,3.000000,6,6,...,0,,,0.0,0.0,0.0,0.0,1.0,4.0,3.0
50060807,10467,BRONX,Chinese,1,0,1,1.000,0.000000,11,11,...,0,AX,,3.0,5.0,55.0,14.0,62.0,24.0,9.0


## What features do we have currently?

In [4]:
# Show every remaining column name as a plain Python list.
print(violations_df2.columns.tolist())

['zipcode', 'boro', 'cuisine', 'crit_violations_train', 'non_crit_violations_train', 'num_inspections_train', 'average_crit_v_train', 'average_non_crit_v_train', 'time_since_last_inspection', 'time_since_first_inspection', 'crit_v_2plus', 'latitude', 'longitude', '3day_temp', '3day_humidity', 'rating', 'rating_null', 'rating_null_zip', 'liquor_license', 'sidewalk_license', 'food_poisoning_complaints', 'food_establishment_complaints', 'electric_complaints', 'safety_complaints', 'rodent_complaints', 'dirty_conditions_complaints', 'missed_collection_complaints']


## Encode categorical features

In [5]:
# Columns treated as categorical and expanded into indicator columns.
categorical = [
    'zipcode', 'boro', 'cuisine',
    'rating_null', 'rating_null_zip',
    'liquor_license', 'sidewalk_license',
]

violations_df3 = pd.get_dummies(
    violations_df2,
    columns=categorical,
    dummy_na=True,    # also emit a <column>_nan indicator for missing values
    drop_first=True,  # omit the first level of each categorical column
)

display(violations_df3)

Unnamed: 0,crit_violations_train,non_crit_violations_train,num_inspections_train,average_crit_v_train,average_non_crit_v_train,time_since_last_inspection,time_since_first_inspection,crit_v_2plus,latitude,longitude,...,liquor_license_W,liquor_license_WC,liquor_license_WW,liquor_license_ZL,liquor_license_nan,sidewalk_license_Regular Unenclosed/Small Unenclosed,sidewalk_license_Small Unenclosed,sidewalk_license_Unenclosed,sidewalk_license_Unenclosed/Small Unenclosed,sidewalk_license_nan
30075445,7,3,4,1.750,0.750000,374,918,0,40.848537,-73.856123,...,0,0,0,0,0,0,0,0,0,1
30112340,13,9,8,1.625,1.125000,24,875,0,40.662930,-73.961726,...,0,0,0,0,0,0,0,0,0,1
30191841,3,6,4,0.750,1.500000,253,1044,0,40.767821,-73.984981,...,0,0,0,0,0,0,0,1,0,0
40356018,0,6,3,0.000,2.000000,346,1076,0,40.579526,-73.982426,...,0,0,0,0,0,0,0,0,0,1
40356151,12,4,6,2.000,0.666667,351,764,0,40.772354,-73.931502,...,0,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50060427,8,3,2,4.000,1.500000,17,30,1,40.687281,-73.975386,...,0,0,0,0,0,0,0,0,0,0
50060496,4,3,1,4.000,3.000000,32,32,1,40.652797,-74.009697,...,0,0,0,0,1,0,0,0,0,1
50060695,6,3,1,6.000,3.000000,6,6,0,40.723242,-73.728041,...,0,0,0,0,1,0,0,0,0,1
50060807,1,0,1,1.000,0.000000,11,11,0,40.867355,-73.867400,...,0,0,0,0,0,0,0,0,0,1


## Split into train/test and separate target variable

In [6]:
# 'crit_v_2plus' is the prediction target; every other column is a feature.
target_column = 'crit_v_2plus'
y = violations_df3[target_column]
X = violations_df3.drop([target_column], axis=1)

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2)

display(X_train.head(), X_test.head())

Unnamed: 0,crit_violations_train,non_crit_violations_train,num_inspections_train,average_crit_v_train,average_non_crit_v_train,time_since_last_inspection,time_since_first_inspection,latitude,longitude,3day_temp,...,liquor_license_W,liquor_license_WC,liquor_license_WW,liquor_license_ZL,liquor_license_nan,sidewalk_license_Regular Unenclosed/Small Unenclosed,sidewalk_license_Small Unenclosed,sidewalk_license_Unenclosed,sidewalk_license_Unenclosed/Small Unenclosed,sidewalk_license_nan
41370373,16,7,8,2.0,0.875,40,1070,40.58031,-74.104356,38.666667,...,0,0,0,0,1,0,0,0,0,1
40762853,30,22,13,2.307692,1.692308,32,1064,40.714972,-73.997093,46.0,...,0,0,0,0,0,0,0,0,0,1
41678386,19,12,8,2.375,1.5,33,942,40.863343,-73.896855,44.666667,...,0,0,0,0,1,0,0,0,0,1
40394054,26,15,11,2.363636,1.363636,61,965,40.625015,-73.961835,37.0,...,0,0,0,0,0,0,0,0,0,1
50046451,3,1,1,3.0,1.0,25,25,40.842481,-73.867819,83.0,...,0,0,0,0,0,0,0,0,0,1


Unnamed: 0,crit_violations_train,non_crit_violations_train,num_inspections_train,average_crit_v_train,average_non_crit_v_train,time_since_last_inspection,time_since_first_inspection,latitude,longitude,3day_temp,...,liquor_license_W,liquor_license_WC,liquor_license_WW,liquor_license_ZL,liquor_license_nan,sidewalk_license_Regular Unenclosed/Small Unenclosed,sidewalk_license_Small Unenclosed,sidewalk_license_Unenclosed,sidewalk_license_Unenclosed/Small Unenclosed,sidewalk_license_nan
41367409,15,10,9,1.666667,1.111111,200,1070,40.868333,-73.919511,83.0,...,0,0,0,0,1,0,0,0,0,1
50004838,15,5,9,1.666667,0.555556,219,1057,40.833207,-73.945171,49.333333,...,0,0,0,0,0,0,0,0,0,1
41305419,7,9,7,1.0,1.285714,199,890,40.705583,-73.739711,38.333333,...,0,0,0,0,0,0,0,0,0,1
41540508,0,3,2,0.0,1.5,378,721,40.895065,-73.881452,29.666667,...,0,0,0,0,1,0,0,0,0,1
50005098,2,5,3,0.666667,1.666667,405,1089,40.781185,-73.979755,43.666667,...,0,0,0,0,0,0,0,1,0,0


## Normalize numerical features to [0,1]

In [7]:
def feature_normalization(train, test): # From first Machine Learning homework assignment
    """Min-max rescale features using statistics of the training set only.

    Each feature (column) of ``train`` is mapped to the interval [0, 1] via
    ``(x - min) / (max - min)``, and the same per-column transformation is
    applied to ``test``. Test values outside the training range therefore
    fall outside [0, 1].

    A feature that is constant in the training set has ``max - min == 0``;
    for such a feature the divisor is replaced by 1 so it is mapped to 0
    in ``train`` instead of producing NaN/inf from a division by zero.

    Args:
        train - training set, 2D numpy array or pandas DataFrame of size
                (num_instances, num_features)
        test  - test set with the same features as ``train``
    Returns:
        train_normalized - training set after normalization
        test_normalized  - test set after normalization

    """
    col_min = train.min(axis=0)
    col_max = train.max(axis=0)
    span = col_max - col_min
    # (span == 0) is 1 exactly where the feature is constant and 0 elsewhere,
    # so adding it turns zero divisors into 1 and leaves all others untouched.
    span = span + (span == 0)
    train_normalized = (train - col_min) / span
    test_normalized = (test - col_min) / span
    return train_normalized, test_normalized

# Columns that hold numeric (non-indicator) features and get rescaled.
numeric = [
    'crit_violations_train', 'non_crit_violations_train', 'num_inspections_train',
    'average_crit_v_train', 'average_non_crit_v_train',
    'time_since_last_inspection', 'time_since_first_inspection',
    '3day_temp', '3day_humidity', 'rating',
    'food_poisoning_complaints', 'food_establishment_complaints',
    'electric_complaints', 'safety_complaints', 'rodent_complaints',
    'dirty_conditions_complaints', 'missed_collection_complaints',
    'latitude', 'longitude',
]

# Scaling statistics come from the training rows only and are reused for test.
train_normalized, test_normalized = feature_normalization(X_train.loc[:, numeric],
                                                          X_test.loc[:, numeric])

# Work on deep copies so the unscaled X_train / X_test stay available.
X_train_norm = X_train.copy(deep=True)
X_test_norm = X_test.copy(deep=True)

# Replace the columns outright instead of writing through .loc[:, numeric]:
# several of these columns hold integers, and setting float values into them
# in place triggers pandas' incompatible-dtype deprecation on recent versions.
X_train_norm[numeric] = train_normalized
X_test_norm[numeric] = test_normalized

display(X_train_norm.head())
display(X_test_norm.head())

Unnamed: 0,crit_violations_train,non_crit_violations_train,num_inspections_train,average_crit_v_train,average_non_crit_v_train,time_since_last_inspection,time_since_first_inspection,latitude,longitude,3day_temp,...,liquor_license_W,liquor_license_WC,liquor_license_WW,liquor_license_ZL,liquor_license_nan,sidewalk_license_Regular Unenclosed/Small Unenclosed,sidewalk_license_Small Unenclosed,sidewalk_license_Unenclosed,sidewalk_license_Unenclosed/Small Unenclosed,sidewalk_license_nan
41370373,0.271186,0.233333,0.388889,0.285714,0.145833,0.036449,0.977127,0.736494,0.021334,0.26699,...,0,0,0,0,1,0,0,0,0,1
40762853,0.508475,0.733333,0.666667,0.32967,0.282051,0.028972,0.971638,0.737824,0.021774,0.373786,...,0,0,0,0,0,0,0,0,0,1
41678386,0.322034,0.4,0.388889,0.339286,0.25,0.029907,0.860018,0.73929,0.022184,0.354369,...,0,0,0,0,1,0,0,0,0,1
40394054,0.440678,0.5,0.555556,0.337662,0.227273,0.056075,0.881061,0.736936,0.021918,0.242718,...,0,0,0,0,0,0,0,0,0,1
50046451,0.050847,0.033333,0.0,0.428571,0.166667,0.02243,0.021043,0.739083,0.022303,0.912621,...,0,0,0,0,0,0,0,0,0,1


Unnamed: 0,crit_violations_train,non_crit_violations_train,num_inspections_train,average_crit_v_train,average_non_crit_v_train,time_since_last_inspection,time_since_first_inspection,latitude,longitude,3day_temp,...,liquor_license_W,liquor_license_WC,liquor_license_WW,liquor_license_ZL,liquor_license_nan,sidewalk_license_Regular Unenclosed/Small Unenclosed,sidewalk_license_Small Unenclosed,sidewalk_license_Unenclosed,sidewalk_license_Unenclosed/Small Unenclosed,sidewalk_license_nan
41367409,0.254237,0.333333,0.444444,0.238095,0.185185,0.185981,0.977127,0.739339,0.022091,0.912621,...,0,0,0,0,1,0,0,0,0,1
50004838,0.254237,0.166667,0.444444,0.238095,0.092593,0.203738,0.965233,0.738992,0.021986,0.42233,...,0,0,0,0,0,0,0,0,0,1
41305419,0.118644,0.3,0.333333,0.142857,0.214286,0.185047,0.812443,0.737731,0.022828,0.262136,...,0,0,0,0,0,0,0,0,0,1
41540508,0.0,0.1,0.055556,0.0,0.25,0.352336,0.657823,0.739603,0.022247,0.135922,...,0,0,0,0,1,0,0,0,0,1
50005098,0.033898,0.166667,0.111111,0.095238,0.277778,0.37757,0.994511,0.738478,0.021845,0.339806,...,0,0,0,0,0,0,0,1,0,0


## Baseline: metrics when predicting class 0 for every restaurant

In [8]:
# Trivial baseline: predict class 0 for every test row, then score it with
# the same four metrics used for the models below.
pred_zero = np.zeros(len(y_test))

baseline_scorers = [
    ('accuracy', sklearn.metrics.accuracy_score),
    ('recall', sklearn.metrics.recall_score),
    ('precision', sklearn.metrics.precision_score),
    ('f1', sklearn.metrics.f1_score),
]
for metric_name, scorer in baseline_scorers:
    print(metric_name + ' = ' + str(scorer(y_test, pred_zero)))

accuracy = 0.661138502913
recall = 0.0
precision = 0.0
f1 = 0.0


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


## Logistic Regression
### w/ L2 penalty (C=1e20, so the regularization is effectively switched off)

In [9]:
# C is the inverse regularization strength, so C=1e20 makes the L2 penalty
# negligible. The solver is pinned to liblinear (the default in older
# scikit-learn releases) so the fit does not silently change with the
# library's version-dependent default and stays comparable with the L1 run.
lr = linear_model.LogisticRegression(penalty='l2', C=1e20, solver='liblinear')
# NOTE(review): this fits on the unscaled X_train; X_train_norm / X_test_norm
# from the normalization cell are not used here -- confirm that is intended.
lr.fit(X_train, y_train)
pred_lr = lr.predict(X_test)

print('accuracy = ' + str(sklearn.metrics.accuracy_score(y_test, pred_lr)))
print('recall = ' + str(sklearn.metrics.recall_score(y_test, pred_lr)))
print('precision = ' + str(sklearn.metrics.precision_score(y_test, pred_lr)))
print('f1 = ' + str(sklearn.metrics.f1_score(y_test, pred_lr)))

accuracy = 0.656656207978
recall = 0.175264550265
precision = 0.481818181818
f1 = 0.257032007759


### w/ L1 penalty (C=1e20, so the regularization is effectively switched off)

In [10]:
# The L1 penalty needs a solver that supports it. liblinear does, and it was
# the default in older scikit-learn releases; newer releases default to lbfgs,
# which rejects penalty='l1', so the solver is named explicitly.
# C is the inverse regularization strength: C=1e20 makes the penalty negligible.
lr = linear_model.LogisticRegression(penalty='l1', C=1e20, solver='liblinear')
# NOTE(review): this fits on the unscaled X_train; X_train_norm / X_test_norm
# from the normalization cell are not used here -- confirm that is intended.
lr.fit(X_train, y_train)
pred_lr = lr.predict(X_test)

print('accuracy = ' + str(sklearn.metrics.accuracy_score(y_test, pred_lr)))
print('recall = ' + str(sklearn.metrics.recall_score(y_test, pred_lr)))
print('precision = ' + str(sklearn.metrics.precision_score(y_test, pred_lr)))
print('f1 = ' + str(sklearn.metrics.f1_score(y_test, pred_lr)))

accuracy = 0.655983863738
recall = 0.183201058201
precision = 0.48006932409
f1 = 0.265198659646


## Naive Bayes
### w/ fit prior

In [11]:
# Bernoulli naive Bayes with smoothing alpha=1; fit_prior=True asks the model
# to learn the class priors from the training labels.
# NOTE(review): BernoulliNB thresholds its inputs into binary features, which
# also applies to the count/continuous columns here -- confirm that is intended.
nb = BernoulliNB(fit_prior=True, alpha=1)
nb.fit(X_train, y_train)
pred_nb = nb.predict(X_test)

nb_scorers = (
    ('accuracy', sklearn.metrics.accuracy_score),
    ('recall', sklearn.metrics.recall_score),
    ('precision', sklearn.metrics.precision_score),
    ('f1', sklearn.metrics.f1_score),
)
for metric_name, scorer in nb_scorers:
    print(metric_name + ' = ' + str(scorer(y_test, pred_nb)))

accuracy = 0.635813536531
recall = 0.167989417989
precision = 0.409017713366
f1 = 0.238162212846


### w/ uniform prior

In [12]:
# Same Bernoulli naive Bayes model, but fit_prior=False replaces the learned
# class priors with a uniform prior over the classes.
nb = BernoulliNB(fit_prior=False, alpha=1)
nb.fit(X_train, y_train)
pred_nb = nb.predict(X_test)

nb_scorers = (
    ('accuracy', sklearn.metrics.accuracy_score),
    ('recall', sklearn.metrics.recall_score),
    ('precision', sklearn.metrics.precision_score),
    ('f1', sklearn.metrics.f1_score),
)
for metric_name, scorer in nb_scorers:
    print(metric_name + ' = ' + str(scorer(y_test, pred_nb)))

accuracy = 0.534289556253
recall = 0.582671957672
precision = 0.378436426117
f1 = 0.458854166667


## Random Forest
### 10 trees

In [13]:
# Random forest with 10 trees. The forest is randomized (bootstrap samples and
# feature sub-sampling), so a fixed random_state is required for the reported
# metrics to be reproducible; 42 matches the seed used for the train/test split.
rf = RandomForestClassifier(n_estimators=10, random_state=42)
rf.fit(X_train, y_train)
pred_rf = rf.predict(X_test)

print('accuracy = ' + str(sklearn.metrics.accuracy_score(y_test, pred_rf)))
print('recall = ' + str(sklearn.metrics.recall_score(y_test, pred_rf)))
print('precision = ' + str(sklearn.metrics.precision_score(y_test, pred_rf)))
print('f1 = ' + str(sklearn.metrics.f1_score(y_test, pred_rf)))

accuracy = 0.643657552667
recall = 0.185185185185
precision = 0.438871473354
f1 = 0.260465116279


### 20 trees

In [14]:
# Random forest with 20 trees. The forest is randomized (bootstrap samples and
# feature sub-sampling), so a fixed random_state is required for the reported
# metrics to be reproducible; 42 matches the seed used for the train/test split.
rf = RandomForestClassifier(n_estimators=20, random_state=42)
rf.fit(X_train, y_train)
pred_rf = rf.predict(X_test)

print('accuracy = ' + str(sklearn.metrics.accuracy_score(y_test, pred_rf)))
print('recall = ' + str(sklearn.metrics.recall_score(y_test, pred_rf)))
print('precision = ' + str(sklearn.metrics.precision_score(y_test, pred_rf)))
print('f1 = ' + str(sklearn.metrics.f1_score(y_test, pred_rf)))

accuracy = 0.659345584939
recall = 0.171296296296
precision = 0.492395437262
f1 = 0.254170755643
