# Builds off of ML_Project_Jon but incorporates 311 data


In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline

In [11]:
# Cap DataFrame previews at 10 rows. The fully-qualified option name is used
# because the short pattern 'max_rows' can match more than one pandas option
# (e.g. 'styler.render.max_rows' in recent releases), which makes set_option
# raise an OptionError instead of applying the setting.
pd.set_option('display.max_rows', 10)

## Load health inspection data with 311 complaints

In [14]:
# Read the inspection table from disk; the first CSV column becomes the row index.
violations_df = pd.read_csv(
    'vdf_with_complaints.csv',  # file is in GitHub
    index_col=0,
)

### Remove columns we don't want in our training set

In [29]:
# Columns that are excluded from the training features.
columns_to_drop = [
    'address',
    'crit_violations',
    'earliest_inspection',
    'latest_inspection',
    'second_latest_inspection',
    'non_crit_violations',
    'num_inspections',
    'crit_violations_recent_inspect',
    'non_crit_violations_recent_inspect',
    'address2',
    'latitude',
    'longitude',
]
violations_df2 = violations_df.drop(columns_to_drop, axis=1)

# Preview the first rows of the reduced frame.
violations_df2.head()

Unnamed: 0,zipcode,boro,cuisine,crit_violations_train,non_crit_violations_train,num_inspections_train,average_crit_v_train,average_non_crit_v_train,time_since_last_inspection,time_since_first_inspection,crit_v_2plus,food_poisoning_complaints,food_establishment_complaints,electric_complaints,safety_complaints,rodent_complaints,dirty_conditions_complaints,missed_collection_complaints
40849391,10003,MANHATTAN,Japanese,10.0,7.0,5,2.0,1.4,500,953,0,17,47,2,1,24.0,33.0,31.0
41098999,10469,BRONX,Hamburgers,5.0,6.0,4,1.25,1.5,385,946,0,3,3,10,1,8.0,16.0,7.0
41140992,11369,QUEENS,Sandwiches,2.0,5.0,4,0.5,1.25,398,916,0,0,1,0,0,0.0,0.0,0.0
41317857,10017,MANHATTAN,Café/Coffee/Tea,5.0,6.0,5,1.0,1.2,385,740,0,26,60,3,2,12.0,13.0,9.0
41382811,10469,BRONX,Caribbean,9.0,9.0,5,1.8,1.8,19,1008,0,0,3,10,0,6.0,31.0,13.0


In [30]:
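# Disabled mean-normalization attempt. Presumably it produced the ValueError
# recorded below because violations_df2 still holds string columns at this point.
# NOTE(review): the denominator subtracts the min of a single column
# ('food_poisoning_complaints') rather than each column's own min -- confirm intent.
#violations_df2 = (violations_df2 - violations_df2.mean()) / (violations_df2.max() - violations_df2['food_poisoning_complaints'].min()) 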


ValueError: could not convert string to float: American

### Convert categorical variables to dummy variables

In [31]:
# One-hot encode the categorical (string) columns; numeric columns pass through as-is.
# NOTE(review): zipcode looks numeric in the preview above, so it presumably stays a
# single numeric feature rather than being expanded into dummies -- confirm intent.
violations_df3 = pd.get_dummies(data=violations_df2)

In [40]:

def feature_normalization(train, test):
    """Min-max scale features using statistics from the training set only.

    Each training feature is mapped onto the interval [0,1]. The test set is
    shifted and scaled with the training minimum and range, so its values can
    fall outside [0,1].

    A feature that is constant in the training set has a range of zero.
    Dividing by that range would fill the column with NaN/inf, so such
    features are divided by 1 instead: they become 0 in the training set and
    keep their raw offset from the training value in the test set.

    Args:
        train - training set, a 2D numpy array of size (num_instances, num_features)
        test  - test set, a 2D numpy array of size (num_instances, num_features)
    Returns:
        train_normalized - training set after normalization
        test_normalized  - test set after normalization

    """
    col_min = np.min(train, axis=0)
    col_range = np.max(train, axis=0) - col_min
    # Replace zero ranges (constant features) with 1 to avoid 0/0 and x/0.
    safe_range = np.where(col_range == 0, 1, col_range)
    train_normalized = (train - col_min) / safe_range
    test_normalized = (test - col_min) / safe_range
    return train_normalized, test_normalized

### Separate target variable

In [41]:
# Pull the label column out of the encoded frame; every other column is a feature.
target_column = 'crit_v_2plus'
y = violations_df3[target_column]
X = violations_df3.drop([target_column], axis=1)

### Split into training and test sets

In [51]:
# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Min-max scaling is currently switched off:
#X_train, X_test=feature_normalization(X_train, X_test)
X_train


41538262    0
40959910    1
50000280    1
40402095    0
50001610    1
           ..
41701184    0
50036254    0
41383581    1
40513031    0
50042048    1
Name: crit_v_2plus, dtype: int64

#### Baseline: metrics when predicting class 0 for every test instance

In [19]:
# Baseline predictor: assign class 0 to every test row, then report the same
# four metrics that are printed for the trained models.
pred_zero = np.zeros(y_test.shape[0])
for prefix, score_fn in (
    ('accuracy = ', sklearn.metrics.accuracy_score),
    ('recall = ', sklearn.metrics.recall_score),
    ('precision = ', sklearn.metrics.precision_score),
    ('f1 = ', sklearn.metrics.f1_score),
):
    print(prefix + str(score_fn(y_test, pred_zero)))

accuracy = 0.664948453608
recall = 0.0
precision = 0.0
f1 = 0.0


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


### Logistic Regression

#### w/ L2-regularization

In [44]:
# Logistic regression with an L2 penalty, scored on the held-out test set.
# NOTE(review): C is the inverse regularization strength, so C=1e20 makes the
# penalty negligible -- this is effectively an unregularized fit.
# The solver is pinned to 'liblinear' so the fit does not depend on which
# default solver the installed scikit-learn release happens to use.
lr = linear_model.LogisticRegression(penalty='l2', C=1e20, solver='liblinear')
lr.fit(X_train, y_train)
pred_lr = lr.predict(X_test)
for prefix, score_fn in (
    ('accuracy = ', sklearn.metrics.accuracy_score),
    ('recall = ', sklearn.metrics.recall_score),
    ('precision = ', sklearn.metrics.precision_score),
    ('f1 = ', sklearn.metrics.f1_score),
):
    print(prefix + str(score_fn(y_test, pred_lr)))

accuracy = 0.670103092784
recall = 0.164548494983
precision = 0.524520255864
f1 = 0.250509164969


#### w/ L1-regularization

In [45]:
# Logistic regression with an L1 penalty, scored on the held-out test set.
# The solver is set explicitly because not every solver supports the L1
# penalty: releases whose default solver is 'lbfgs' reject penalty='l1',
# while 'liblinear' accepts it.
# NOTE(review): C is the inverse regularization strength, so C=1e20 makes the
# penalty negligible -- this is effectively an unregularized fit.
lr = linear_model.LogisticRegression(penalty='l1', C=1e20, solver='liblinear')
lr.fit(X_train, y_train)
pred_lr = lr.predict(X_test)
for prefix, score_fn in (
    ('accuracy = ', sklearn.metrics.accuracy_score),
    ('recall = ', sklearn.metrics.recall_score),
    ('precision = ', sklearn.metrics.precision_score),
    ('f1 = ', sklearn.metrics.f1_score),
):
    print(prefix + str(score_fn(y_test, pred_lr)))

accuracy = 0.67032720753
recall = 0.164548494983
precision = 0.525641025641
f1 = 0.250636780438


### Naive Bayes

#### w/ fit prior

In [46]:
# Bernoulli naive Bayes with class priors estimated from the training labels.
nb = BernoulliNB(fit_prior=True, alpha=1).fit(X_train, y_train)
pred_nb = nb.predict(X_test)
for prefix, score_fn in (
    ('accuracy = ', sklearn.metrics.accuracy_score),
    ('recall = ', sklearn.metrics.recall_score),
    ('precision = ', sklearn.metrics.precision_score),
    ('f1 = ', sklearn.metrics.f1_score),
):
    print(prefix + str(score_fn(y_test, pred_nb)))

accuracy = 0.656432093232
recall = 0.0615384615385
precision = 0.414414414414
f1 = 0.107163657542


#### w/ uniform prior

In [47]:
# Bernoulli naive Bayes with a uniform class prior (priors are not fitted).
nb = BernoulliNB(fit_prior=False, alpha=1).fit(X_train, y_train)
pred_nb = nb.predict(X_test)
for prefix, score_fn in (
    ('accuracy = ', sklearn.metrics.accuracy_score),
    ('recall = ', sklearn.metrics.recall_score),
    ('precision = ', sklearn.metrics.precision_score),
    ('f1 = ', sklearn.metrics.f1_score),
):
    print(prefix + str(score_fn(y_test, pred_nb)))

accuracy = 0.533393097266
recall = 0.640133779264
precision = 0.382646941224
f1 = 0.478978978979


### Random Forest

#### 10 trees

In [48]:
# Random forest with 10 trees, scored on the held-out test set.
# random_state is fixed so repeated runs build the same forest and print the
# same metrics; without it every execution of this cell gives different numbers.
rf = RandomForestClassifier(n_estimators=10, random_state=42)
rf.fit(X_train, y_train)
pred_rf = rf.predict(X_test)
for prefix, score_fn in (
    ('accuracy = ', sklearn.metrics.accuracy_score),
    ('recall = ', sklearn.metrics.recall_score),
    ('precision = ', sklearn.metrics.precision_score),
    ('f1 = ', sklearn.metrics.f1_score),
):
    print(prefix + str(score_fn(y_test, pred_rf)))

accuracy = 0.650156880323
recall = 0.204682274247
precision = 0.451327433628
f1 = 0.281638288081


#### 20 trees

In [49]:
# Random forest with 20 trees, scored on the held-out test set.
# random_state is fixed so repeated runs build the same forest and print the
# same metrics; without it every execution of this cell gives different numbers.
rf = RandomForestClassifier(n_estimators=20, random_state=42)
rf.fit(X_train, y_train)
pred_rf = rf.predict(X_test)
for prefix, score_fn in (
    ('accuracy = ', sklearn.metrics.accuracy_score),
    ('recall = ', sklearn.metrics.recall_score),
    ('precision = ', sklearn.metrics.precision_score),
    ('f1 = ', sklearn.metrics.f1_score),
):
    print(prefix + str(score_fn(y_test, pred_rf)))

accuracy = 0.65307037203
recall = 0.197993311037
precision = 0.458914728682
f1 = 0.276635514019
