In [1]:
from __future__ import division
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier

from IPython.display import display
%matplotlib inline
pd.options.display.max_rows = 10

## Load the data
### Has had some preprocessing already, plus new features added from other datasets

In [21]:
#violations_df = pd.read_csv('health_inspect_allFeatures.csv', index_col=0) 
violations_df = pd.read_csv('violations_all.csv', index_col=0)

violations_df[~np.isnan(violations_df['general_complaints'])] # Ignore the 23 restaurants we couldn't match with 311 data
len(violations_df)

22330

## Drop columns that we know we don't want during training

In [22]:
violations_df2 = violations_df.drop(['address', 'crit_violations', 'earliest_inspection',
                                     'latest_inspection', 'second_latest_inspection',
                                     'non_crit_violations', 'num_inspections',
                                     'crit_violations_recent_inspect', 'non_crit_violations_recent_inspect',
                                     'names', 'search_info'], axis=1)
display(violations_df2)

Unnamed: 0,zipcode,boro,cuisine,crit_violations_train,non_crit_violations_train,num_inspections_train,average_crit_v_train,average_non_crit_v_train,time_since_last_inspection,time_since_first_inspection,...,general_complaints,missed_collection_complaints,police_matter_complaints,latitude,longitude,liquor_license,sidewalk_license,rating,rating_null,rating_null_zip
30075445,10462,BRONX,Bakery,7.0,3.0,4,1.750,0.750000,374,918,...,6.0,20.0,3.0,40.848537,-73.856123,L,,4.800000,0,0
30112340,11225,BROOKLYN,Hamburgers,13.0,9.0,8,1.625,1.125000,24,875,...,15.0,4.0,0.0,40.662930,-73.961726,OP,,3.400000,0,0
30191841,10019,MANHATTAN,Irish,3.0,6.0,4,0.750,1.500000,253,1044,...,104.0,26.0,11.0,40.767821,-73.984981,AX,Unenclosed,3.500000,0,0
40356018,11224,BROOKLYN,American,0.0,6.0,3,0.000,2.000000,346,1076,...,9.0,3.0,3.0,40.579526,-73.982426,CT,,4.800000,0,0
40356151,11369,QUEENS,American,12.0,4.0,6,2.000,0.666667,351,764,...,4.0,20.0,2.0,40.772354,-73.931502,,,3.800000,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50060427,11217,BROOKLYN,Bagels/Pretzels,8.0,3.0,2,4.000,1.500000,17,30,...,13.0,13.0,4.0,40.687281,-73.975386,OP,Enclosed,4.100000,0,0
50060496,11232,BROOKLYN,Delicatessen,4.0,3.0,1,4.000,3.000000,32,32,...,8.0,3.0,2.0,40.652797,-74.009697,,,4.800000,0,0
50060695,11432,QUEENS,Pizza,6.0,3.0,1,6.000,3.000000,6,6,...,0.0,3.0,0.0,40.723242,-73.728041,,,3.792806,1,0
50060807,10467,BRONX,Chinese,1.0,0.0,1,1.000,0.000000,11,11,...,28.0,9.0,14.0,40.867355,-73.867400,AX,,3.700000,0,0


## What features do we have currently?

In [23]:
print(list(violations_df2.columns.values))

['zipcode', 'boro', 'cuisine', 'crit_violations_train', 'non_crit_violations_train', 'num_inspections_train', 'average_crit_v_train', 'average_non_crit_v_train', 'time_since_last_inspection', 'time_since_first_inspection', 'crit_v_2plus', '3-day temp', '3-day humidity', 'food_poisoning_complaints', 'food_establishment_complaints', 'electric_complaints', 'safety_complaints', 'rodent_complaints', 'dirty_conditions_complaints', 'general_complaints', 'missed_collection_complaints', 'police_matter_complaints', 'latitude', 'longitude', 'liquor_license', 'sidewalk_license', 'rating', 'rating_null', 'rating_null_zip']


## Encode categorical features

In [24]:
categorical = ['zipcode', 'boro', 'cuisine', 'rating_null', 'rating_null_zip', 'liquor_license', 'sidewalk_license']
violations_df3 = pd.get_dummies(violations_df2, columns=categorical, dummy_na=True, drop_first=True)
display(violations_df3)

Unnamed: 0,crit_violations_train,non_crit_violations_train,num_inspections_train,average_crit_v_train,average_non_crit_v_train,time_since_last_inspection,time_since_first_inspection,crit_v_2plus,3-day temp,3-day humidity,...,liquor_license_W,liquor_license_WC,liquor_license_WW,liquor_license_ZL,liquor_license_nan,sidewalk_license_Regular Unenclosed/Small Unenclosed,sidewalk_license_Small Unenclosed,sidewalk_license_Unenclosed,sidewalk_license_Unenclosed/Small Unenclosed,sidewalk_license_nan
30075445,7.0,3.0,4,1.750,0.750000,374,918,0,39.333333,60.666667,...,0,0,0,0,0,0,0,0,0,1
30112340,13.0,9.0,8,1.625,1.125000,24,875,0,47.000000,58.000000,...,0,0,0,0,0,0,0,0,0,1
30191841,3.0,6.0,4,0.750,1.500000,253,1044,0,77.333333,68.666667,...,0,0,0,0,0,0,0,1,0,0
40356018,0.0,6.0,3,0.000,2.000000,346,1076,0,58.333333,56.666667,...,0,0,0,0,0,0,0,0,0,1
40356151,12.0,4.0,6,2.000,0.666667,351,764,0,63.666667,67.666667,...,0,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50060427,8.0,3.0,2,4.000,1.500000,17,30,1,49.333333,88.666667,...,0,0,0,0,0,0,0,0,0,0
50060496,4.0,3.0,1,4.000,3.000000,32,32,1,46.000000,67.333333,...,0,0,0,0,1,0,0,0,0,1
50060695,6.0,3.0,1,6.000,3.000000,6,6,0,51.333333,55.000000,...,0,0,0,0,1,0,0,0,0,1
50060807,1.0,0.0,1,1.000,0.000000,11,11,0,42.000000,47.333333,...,0,0,0,0,0,0,0,0,0,1


## Split into train/test and separate target variable

In [25]:
X = violations_df3.drop(['crit_v_2plus'],axis=1)
y = violations_df3['crit_v_2plus']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
display(X_train.head())
display(X_test.head())

Unnamed: 0,crit_violations_train,non_crit_violations_train,num_inspections_train,average_crit_v_train,average_non_crit_v_train,time_since_last_inspection,time_since_first_inspection,3-day temp,3-day humidity,food_poisoning_complaints,...,liquor_license_W,liquor_license_WC,liquor_license_WW,liquor_license_ZL,liquor_license_nan,sidewalk_license_Regular Unenclosed/Small Unenclosed,sidewalk_license_Small Unenclosed,sidewalk_license_Unenclosed,sidewalk_license_Unenclosed/Small Unenclosed,sidewalk_license_nan
41351765,9.0,2.0,5,1.8,0.4,48,972,47.333333,63.0,13.0,...,0,0,0,0,0,0,0,1,0,0
40544107,10.0,3.0,7,1.428571,0.428571,69,766,49.0,40.0,15.0,...,0,0,0,0,0,0,0,0,0,1
41320199,5.0,5.0,4,1.25,1.25,57,735,40.666667,64.666667,4.0,...,0,0,0,0,1,0,0,0,0,1
41026182,0.0,4.0,2,0.0,2.0,391,764,42.333333,63.0,22.0,...,0,0,0,0,0,0,0,0,0,1
41519679,22.0,16.0,11,2.0,1.454545,26,1034,38.666667,54.0,0.0,...,0,0,0,0,1,0,0,0,0,1


Unnamed: 0,crit_violations_train,non_crit_violations_train,num_inspections_train,average_crit_v_train,average_non_crit_v_train,time_since_last_inspection,time_since_first_inspection,3-day temp,3-day humidity,food_poisoning_complaints,...,liquor_license_W,liquor_license_WC,liquor_license_WW,liquor_license_ZL,liquor_license_nan,sidewalk_license_Regular Unenclosed/Small Unenclosed,sidewalk_license_Small Unenclosed,sidewalk_license_Unenclosed,sidewalk_license_Unenclosed/Small Unenclosed,sidewalk_license_nan
41182288,10.0,5.0,6,1.666667,0.833333,365,1023,63.0,38.666667,3.0,...,0,0,0,0,1,0,0,0,0,1
41722468,16.0,6.0,6,2.666667,1.0,12,1089,55.666667,49.0,4.0,...,0,0,0,0,0,0,0,0,0,1
50034179,1.0,1.0,1,1.0,1.0,420,420,36.333333,52.666667,5.0,...,0,0,0,0,0,0,0,0,0,1
41218543,20.0,9.0,10,2.0,0.9,169,1049,47.666667,52.0,0.0,...,0,0,0,0,0,0,0,0,0,1
50011932,6.0,1.0,3,2.0,0.333333,390,602,49.333333,55.0,19.0,...,0,0,0,0,0,0,0,0,0,1


## Normalize numerical features to [0,1]

In [26]:
def feature_normalization(train, test): # From first Machine Learning homework assignment
    """Rescale the data so that each feature in the training set is in
    the interval [0,1], and apply the same transformations to the test
    set, using the statistics computed on the training set.

    Args:
        train - training set, a 2D numpy array of size (num_instances, num_features)
        test  - test set, a 2D numpy array of size (num_instances, num_features)
    Returns:
        train_normalized - training set after normalization
        test_normalized  - test set after normalization

    """
    #m = np.min(train,axis=0)
    m = train.min(axis=0)
    #M = np.max(train,axis=0)
    M = train.max(axis=0)
    train_normalized = (train - m)/(M-m)
    test_normalized = (test - m)/(M-m)
    return train_normalized, test_normalized

numeric = ['crit_violations_train', 'non_crit_violations_train', 'num_inspections_train',
           'average_crit_v_train', 'average_non_crit_v_train', 'time_since_last_inspection', 'time_since_first_inspection',
          '3-day temp', '3-day humidity', 'rating', 'food_poisoning_complaints', 'food_establishment_complaints',
           'electric_complaints', 'safety_complaints', 'rodent_complaints', 'dirty_conditions_complaints',
           'missed_collection_complaints', 'latitude', 'longitude']

train_normalized, test_normalized = feature_normalization(X_train.loc[:,numeric], X_test.loc[:,numeric])

X_train_norm = X_train.copy(deep=True)
X_test_norm = X_test.copy(deep=True)

X_train_norm.loc[:,numeric] = train_normalized
X_test_norm.loc[:,numeric] = test_normalized

display(X_train_norm.head())
display(X_test_norm.head())

Unnamed: 0,crit_violations_train,non_crit_violations_train,num_inspections_train,average_crit_v_train,average_non_crit_v_train,time_since_last_inspection,time_since_first_inspection,3-day temp,3-day humidity,food_poisoning_complaints,...,liquor_license_W,liquor_license_WC,liquor_license_WW,liquor_license_ZL,liquor_license_nan,sidewalk_license_Regular Unenclosed/Small Unenclosed,sidewalk_license_Small Unenclosed,sidewalk_license_Unenclosed,sidewalk_license_Unenclosed/Small Unenclosed,sidewalk_license_nan
41351765,0.152542,0.064516,0.222222,0.3,0.066667,0.043925,0.887363,0.393204,0.549223,0.342105,...,0,0,0,0,0,0,0,1,0,0
40544107,0.169492,0.096774,0.333333,0.238095,0.071429,0.063551,0.698718,0.417476,0.19171,0.394737,...,0,0,0,0,0,0,0,0,0,1
41320199,0.084746,0.16129,0.166667,0.208333,0.208333,0.052336,0.67033,0.296117,0.57513,0.105263,...,0,0,0,0,1,0,0,0,0,1
41026182,0.0,0.129032,0.055556,0.0,0.333333,0.364486,0.696886,0.320388,0.549223,0.578947,...,0,0,0,0,0,0,0,0,0,1
41519679,0.372881,0.516129,0.555556,0.333333,0.242424,0.023364,0.944139,0.26699,0.409326,0.0,...,0,0,0,0,1,0,0,0,0,1


Unnamed: 0,crit_violations_train,non_crit_violations_train,num_inspections_train,average_crit_v_train,average_non_crit_v_train,time_since_last_inspection,time_since_first_inspection,3-day temp,3-day humidity,food_poisoning_complaints,...,liquor_license_W,liquor_license_WC,liquor_license_WW,liquor_license_ZL,liquor_license_nan,sidewalk_license_Regular Unenclosed/Small Unenclosed,sidewalk_license_Small Unenclosed,sidewalk_license_Unenclosed,sidewalk_license_Unenclosed/Small Unenclosed,sidewalk_license_nan
41182288,0.169492,0.16129,0.277778,0.277778,0.138889,0.340187,0.934066,0.621359,0.170984,0.078947,...,0,0,0,0,1,0,0,0,0,1
41722468,0.271186,0.193548,0.277778,0.444444,0.166667,0.01028,0.994505,0.514563,0.331606,0.105263,...,0,0,0,0,0,0,0,0,0,1
50034179,0.016949,0.032258,0.0,0.166667,0.166667,0.391589,0.381868,0.23301,0.388601,0.131579,...,0,0,0,0,0,0,0,0,0,1
41218543,0.338983,0.290323,0.5,0.333333,0.15,0.157009,0.957875,0.398058,0.378238,0.0,...,0,0,0,0,0,0,0,0,0,1
50011932,0.101695,0.032258,0.111111,0.333333,0.055556,0.363551,0.548535,0.42233,0.42487,0.5,...,0,0,0,0,0,0,0,0,0,1


## Calculate metrics of predicting 0 class for all

In [36]:
pred_zero = np.zeros(len(y_test))
print('accuracy = ' + str(sklearn.metrics.accuracy_score(y_test, pred_zero)))
print('recall = ' + str(sklearn.metrics.recall_score(y_test, pred_zero)))
print('precision = ' + str(sklearn.metrics.precision_score(y_test, pred_zero)))
print('f1 = ' + str(sklearn.metrics.f1_score(y_test, pred_zero)))

accuracy = 0.649798477385
recall = 0.0
precision = 0.0
f1 = 0.0


## Logistic Regression
### w/ L2-regularization

In [37]:
lr = linear_model.LogisticRegression(penalty = 'l2',C=1e20)
lr.fit(X_train, y_train)
pred_lr = lr.predict(X_test)
print('accuracy = ' + str(sklearn.metrics.accuracy_score(y_test, pred_lr)))
print('recall = ' + str(sklearn.metrics.recall_score(y_test, pred_lr)))
print('precision = ' + str(sklearn.metrics.precision_score(y_test, pred_lr)))
print('f1 = ' + str(sklearn.metrics.f1_score(y_test, pred_lr)))

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

### w/ L1-regularization

In [10]:
lr = linear_model.LogisticRegression(penalty = 'l1',C=1e20)
lr.fit(X_train, y_train)
pred_lr = lr.predict(X_test)
print('accuracy = ' + str(sklearn.metrics.accuracy_score(y_test, pred_lr)))
print('recall = ' + str(sklearn.metrics.recall_score(y_test, pred_lr)))
print('precision = ' + str(sklearn.metrics.precision_score(y_test, pred_lr)))
print('f1 = ' + str(sklearn.metrics.f1_score(y_test, pred_lr)))

accuracy = 0.655983863738
recall = 0.183201058201
precision = 0.48006932409
f1 = 0.265198659646


## Naive Bayes
### w/ fit prior

In [11]:
nb = BernoulliNB(alpha=1,fit_prior=True)
nb.fit(X_train, y_train)
pred_nb = nb.predict(X_test)
print('accuracy = ' + str(sklearn.metrics.accuracy_score(y_test, pred_nb)))
print('recall = ' + str(sklearn.metrics.recall_score(y_test, pred_nb)))
print('precision = ' + str(sklearn.metrics.precision_score(y_test, pred_nb)))
print('f1 = ' + str(sklearn.metrics.f1_score(y_test, pred_nb)))

accuracy = 0.635813536531
recall = 0.167989417989
precision = 0.409017713366
f1 = 0.238162212846


### w/ uniform prior

In [12]:
nb = BernoulliNB(alpha=1,fit_prior=False)
nb.fit(X_train, y_train)
pred_nb = nb.predict(X_test)
print('accuracy = ' + str(sklearn.metrics.accuracy_score(y_test, pred_nb)))
print('recall = ' + str(sklearn.metrics.recall_score(y_test, pred_nb)))
print('precision = ' + str(sklearn.metrics.precision_score(y_test, pred_nb)))
print('f1 = ' + str(sklearn.metrics.f1_score(y_test, pred_nb)))

accuracy = 0.534289556253
recall = 0.582671957672
precision = 0.378436426117
f1 = 0.458854166667


## Random Forest
### 10 trees

In [13]:
rf = RandomForestClassifier(n_estimators=10)
rf.fit(X_train, y_train)
pred_rf = rf.predict(X_test)
print('accuracy = ' + str(sklearn.metrics.accuracy_score(y_test, pred_rf)))
print('recall = ' + str(sklearn.metrics.recall_score(y_test, pred_rf)))
print('precision = ' + str(sklearn.metrics.precision_score(y_test, pred_rf)))
print('f1 = ' + str(sklearn.metrics.f1_score(y_test, pred_rf)))

accuracy = 0.643657552667
recall = 0.185185185185
precision = 0.438871473354
f1 = 0.260465116279


### 20 trees

In [14]:
rf = RandomForestClassifier(n_estimators=20)
rf.fit(X_train, y_train)
pred_rf = rf.predict(X_test)
print('accuracy = ' + str(sklearn.metrics.accuracy_score(y_test, pred_rf)))
print('recall = ' + str(sklearn.metrics.recall_score(y_test, pred_rf)))
print('precision = ' + str(sklearn.metrics.precision_score(y_test, pred_rf)))
print('f1 = ' + str(sklearn.metrics.f1_score(y_test, pred_rf)))

accuracy = 0.659345584939
recall = 0.171296296296
precision = 0.492395437262
f1 = 0.254170755643


### SVM 

In [29]:
sv = svm.SVC()
sv.fit(X_train, y_train)
pred_sv = sv.predict(X_test)
print('accuracy = ' + str(sklearn.metrics.accuracy_score(y_test, pred_sv)))
print('recall = ' + str(sklearn.metrics.recall_score(y_test, pred_sv)))
print('precision = ' + str(sklearn.metrics.precision_score(y_test, pred_sv)))
print('f1 = ' + str(sklearn.metrics.f1_score(y_test, pred_svb)))

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').