In [1]:
from __future__ import division
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier

from IPython.display import display
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Cap DataFrame output at 10 rows so displayed frames show a truncated preview.
pd.options.display.max_rows = 10

## Load the data
### Has had some preprocessing already, plus new features added from other datasets

In [6]:
# Read the feature table (first CSV column becomes the index) and keep only
# the rows flagged complaints_null == 0, i.e. ignore the 23 restaurants we
# couldn't match with 311 data.
violations_df = (
    pd.read_csv('health_inspect_allFeatures.csv', index_col=0)
    .loc[lambda frame: frame['complaints_null'] == 0]
)

display(violations_df)

Unnamed: 0,crit_violations,non_crit_violations,earliest_inspection,latest_inspection,second_latest_inspection,zipcode,boro,cuisine,address,num_inspections,...,liquor_license,sidewalk_license,food_poisoning_complaints,food_establishment_complaints,electric_complaints,safety_complaints,rodent_complaints,dirty_conditions_complaints,missed_collection_complaints,complaints_null
30075445,8,4,8/14/2013,2/18/2016,2/9/2015,10462,BRONX,Bakery,1007 MORRIS PARK AVE,5,...,L,,2.0,5.0,3.0,1.0,24.0,24.0,20.0,0
30112340,14,10,6/5/2014,10/27/2016,10/3/2016,11225,BROOKLYN,Hamburgers,469 FLATBUSH AVENUE,9,...,OP,,5.0,7.0,23.0,3.0,27.0,18.0,4.0,0
30191841,4,7,7/22/2013,5/31/2016,9/21/2015,10019,MANHATTAN,Irish,351 WEST 57 STREET,5,...,AX,Unenclosed,16.0,13.0,172.0,72.0,32.0,33.0,26.0,0
40356018,1,7,6/5/2013,5/16/2016,6/5/2015,11224,BROOKLYN,American,2780 STILLWELL AVENUE,4,...,CT,,0.0,9.0,24.0,4.0,13.0,8.0,3.0,0
40356151,13,5,4/11/2014,5/14/2016,5/29/2015,11369,QUEENS,American,8825 ASTORIA BOULEVARD,7,...,,,0.0,3.0,6.0,0.0,8.0,12.0,20.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50060427,10,4,3/7/2017,4/6/2017,3/20/2017,11217,BROOKLYN,Bagels/Pretzels,73 LAFAYETTE AVE,3,...,OP,Enclosed,3.0,14.0,14.0,2.0,83.0,31.0,13.0,0
50060496,7,4,2/27/2017,3/31/2017,2/27/2017,11232,BROOKLYN,Delicatessen,4102 3RD AVE,2,...,,,0.0,1.0,9.0,4.0,4.0,32.0,3.0,0
50060695,7,4,4/4/2017,4/10/2017,4/4/2017,11432,QUEENS,Pizza,16417 JAMAICA AVE,2,...,,,0.0,0.0,0.0,0.0,1.0,4.0,3.0,0
50060807,2,2,3/9/2017,3/20/2017,3/9/2017,10467,BRONX,Chinese,2861 WHITE PLAINS RD,2,...,AX,,3.0,5.0,55.0,14.0,62.0,24.0,9.0,0


## Drop columns that we know we don't want during training

In [10]:
# Build the training frame by removing columns we don't want during training.
# NOTE(review): the un-suffixed count/date columns presumably include the most
# recent inspection (the "*_train" variants are kept instead) -- confirm.
violations_df2 = violations_df.drop(
    labels=[
        'address',
        'crit_violations',
        'earliest_inspection',
        'latest_inspection',
        'second_latest_inspection',
        'non_crit_violations',
        'num_inspections',
        'crit_violations_recent_inspect',
        'non_crit_violations_recent_inspect',
        'complaints_null',  # only one value is left after filtering on it at load time
    ],
    axis=1,
)
display(violations_df2)

Unnamed: 0,zipcode,boro,cuisine,crit_violations_train,non_crit_violations_train,num_inspections_train,average_crit_v_train,average_non_crit_v_train,time_since_last_inspection,time_since_first_inspection,...,rating_null_zip,liquor_license,sidewalk_license,food_poisoning_complaints,food_establishment_complaints,electric_complaints,safety_complaints,rodent_complaints,dirty_conditions_complaints,missed_collection_complaints
30075445,10462,BRONX,Bakery,7,3,4,1.750,0.750000,374,918,...,0,L,,2.0,5.0,3.0,1.0,24.0,24.0,20.0
30112340,11225,BROOKLYN,Hamburgers,13,9,8,1.625,1.125000,24,875,...,0,OP,,5.0,7.0,23.0,3.0,27.0,18.0,4.0
30191841,10019,MANHATTAN,Irish,3,6,4,0.750,1.500000,253,1044,...,0,AX,Unenclosed,16.0,13.0,172.0,72.0,32.0,33.0,26.0
40356018,11224,BROOKLYN,American,0,6,3,0.000,2.000000,346,1076,...,0,CT,,0.0,9.0,24.0,4.0,13.0,8.0,3.0
40356151,11369,QUEENS,American,12,4,6,2.000,0.666667,351,764,...,0,,,0.0,3.0,6.0,0.0,8.0,12.0,20.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50060427,11217,BROOKLYN,Bagels/Pretzels,8,3,2,4.000,1.500000,17,30,...,0,OP,Enclosed,3.0,14.0,14.0,2.0,83.0,31.0,13.0
50060496,11232,BROOKLYN,Delicatessen,4,3,1,4.000,3.000000,32,32,...,0,,,0.0,1.0,9.0,4.0,4.0,32.0,3.0
50060695,11432,QUEENS,Pizza,6,3,1,6.000,3.000000,6,6,...,0,,,0.0,0.0,0.0,0.0,1.0,4.0,3.0
50060807,10467,BRONX,Chinese,1,0,1,1.000,0.000000,11,11,...,0,AX,,3.0,5.0,55.0,14.0,62.0,24.0,9.0


## What features do we have currently?

In [11]:
# List the column names of violations_df2 as a plain array, to see which
# features are currently available.
print(violations_df2.columns.values)

['zipcode' 'boro' 'cuisine' 'crit_violations_train'
 'non_crit_violations_train' 'num_inspections_train' 'average_crit_v_train'
 'average_non_crit_v_train' 'time_since_last_inspection'
 'time_since_first_inspection' 'crit_v_2plus' 'address2' 'latitude'
 'longitude' '3day_temp' '3day_humidity' 'names' 'search_info' 'rating'
 'rating_null' 'rating_null_zip' 'liquor_license' 'sidewalk_license'
 'food_poisoning_complaints' 'food_establishment_complaints'
 'electric_complaints' 'safety_complaints' 'rodent_complaints'
 'dirty_conditions_complaints' 'missed_collection_complaints']


In [None]:
## Normalize numerical features # Work in progress

def feature_normalization(train, test): # From first homework assignment
    """Rescale the data so that each feature in the training set is in
    the interval [0,1], and apply the same transformations to the test
    set, using the statistics computed on the training set.

    Features that are constant in the training set (max == min) have no
    range to rescale by. Instead of dividing by zero (which would produce
    NaN/inf), such features are only shifted by the training minimum, so
    they become 0 throughout the training set.

    Test values are not clipped: a test value outside the training
    [min, max] range maps outside [0,1].

    Args:
        train - training set, a 2D numpy array of size (num_instances, num_features)
        test  - test set, a 2D numpy array of size (num_instances, num_features)
    Returns:
        train_normalized - training set after normalization
        test_normalized  - test set after normalization

    """
    # Per-feature statistics come from the training set only.
    train_min = np.min(train, axis=0)
    train_max = np.max(train, axis=0)
    feature_range = train_max - train_min

    # Guard against zero-range (constant) features: use 1.0 as the divisor
    # there so the result stays finite. The float literal also keeps the
    # division floating-point for integer inputs.
    safe_range = np.where(feature_range == 0, 1.0, feature_range)

    train_normalized = (train - train_min) / safe_range
    test_normalized = (test - train_min) / safe_range
    return train_normalized, test_normalized
