# Builds off of ML_Project_Jon but incorporates weather data
## See also scrape_weather.ipynb

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline

In [2]:
# Limit DataFrame previews to 10 rows. The fully-qualified key is required:
# the abbreviated 'max_rows' pattern matches more than one option on newer
# pandas releases (e.g. the Styler render options) and raises OptionError.
pd.set_option('display.max_rows', 10)

## Load health inspection and weather data

In [3]:
# Both CSVs live in the GitHub repo alongside this notebook.
inspections_csv = 'health_inspect_cleaned.csv'
weather_csv = 'weather_data.csv'

# The first CSV column of the inspection file becomes the row index.
violations_df = pd.read_csv(inspections_csv, index_col=0)
weather_df = pd.read_csv(weather_csv)

### Add temperature and humidity columns to health inspection data

In [4]:
# Copy the weather readings onto the inspection frame by row position
# (the values are detached from weather_df's index before assignment).
# NOTE(review): this presumes weather_data.csv rows are in the same order as
# health_inspect_cleaned.csv rows -- confirm against scrape_weather.ipynb.
for weather_col in ('3-day temp', '3-day humidity'):
    violations_df[weather_col] = weather_df[weather_col].tolist()

violations_df

Unnamed: 0,crit_violations,non_crit_violations,earliest_inspection,latest_inspection,second_latest_inspection,zipcode,boro,cuisine,address,num_inspections,...,crit_violations_train,non_crit_violations_train,num_inspections_train,average_crit_v_train,average_non_crit_v_train,time_since_last_inspection,time_since_first_inspection,crit_v_2plus,3-day temp,3-day humidity
30075445,8.0,4.0,2013-08-14,2016-02-18,2015-02-09,10462,BRONX,Bakery,1007 MORRIS PARK AVE,5,...,7.0,3.0,4,1.750,0.750000,374,918,0,39.333333,60.666667
30112340,14.0,10.0,2014-06-05,2016-10-27,2016-10-03,11225,BROOKLYN,Hamburgers,469 FLATBUSH AVENUE,9,...,13.0,9.0,8,1.625,1.125000,24,875,0,61.666667,
30191841,4.0,7.0,2013-07-22,2016-05-31,2015-09-21,10019,MANHATTAN,Irish,351 WEST 57 STREET,5,...,3.0,6.0,4,0.750,1.500000,253,1044,0,77.333333,68.666667
40356018,1.0,7.0,2013-06-05,2016-05-16,2015-06-05,11224,BROOKLYN,American,2780 STILLWELL AVENUE,4,...,0.0,6.0,3,0.000,2.000000,346,1076,0,58.333333,56.666667
40356151,13.0,5.0,2014-04-11,2016-05-14,2015-05-29,11369,QUEENS,American,8825 ASTORIA BOULEVARD,7,...,12.0,4.0,6,2.000,0.666667,351,764,0,63.666667,67.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50060427,10.0,4.0,2017-03-07,2017-04-06,2017-03-20,11217,BROOKLYN,Bagels/Pretzels,73 LAFAYETTE AVE,3,...,8.0,3.0,2,4.000,1.500000,17,30,1,49.333333,88.666667
50060496,7.0,4.0,2017-02-27,2017-03-31,2017-02-27,11232,BROOKLYN,Delicatessen,4102 3RD AVE,2,...,4.0,3.0,1,4.000,3.000000,32,32,1,46.000000,67.333333
50060695,7.0,4.0,2017-04-04,2017-04-10,2017-04-04,11432,QUEENS,Pizza,16417 JAMAICA AVE,2,...,6.0,3.0,1,6.000,3.000000,6,6,0,51.333333,55.000000
50060807,2.0,2.0,2017-03-09,2017-03-20,2017-03-09,10467,BRONX,Chinese,2861 WHITE PLAINS RD,2,...,1.0,0.0,1,1.000,0.000000,11,11,0,42.000000,47.333333


### Check for any rows we failed to scrape properly, i.e. rows with NaN for temperature or humidity

In [5]:
# Index labels of the weather rows whose 3-day temperature is missing.
# A vectorized null mask replaces the element-wise .apply(np.isnan): it gives
# the same result for float data and does not fail on non-float values.
# (.isnull() is used rather than .isna() so older pandas releases still work.)
temp_is_nan = weather_df.index[weather_df['3-day temp'].isnull()].values
len(temp_is_nan)

1517

In [6]:
# Index labels of the weather rows whose 3-day humidity is missing.
# A vectorized null mask replaces the element-wise .apply(np.isnan): it gives
# the same result for float data and does not fail on non-float values.
# (.isnull() is used rather than .isna() so older pandas releases still work.)
humid_is_nan = weather_df.index[weather_df['3-day humidity'].isnull()].values
len(humid_is_nan)

1806

### FOR NOW we'll only include rows with valid temperature and humidity information
#### Later, we'll deal with those missing values more intelligently

In [7]:
# Row positions that have BOTH a temperature and a humidity reading.
# The missing-row labels are merged into one set so each membership test is
# O(1); testing `i not in <ndarray>` scans the whole array for every row.
# NOTE(review): temp_is_nan / humid_is_nan hold weather_df index labels and
# are used here as row positions, which presumes weather_df has the default
# 0..n-1 index -- confirm.
rows_missing_weather = set(temp_is_nan.tolist()).union(humid_is_nan.tolist())
keep = [i for i in range(len(violations_df)) if i not in rows_missing_weather]

# Number of rows that will be discarded.
len(violations_df) - len(keep)

1854

In [8]:
# Select, by integer position, only the rows listed in `keep`
# (those with both weather readings present).
violations_df1 = violations_df.iloc[keep]

### Remove columns we don't want in our training set

In [9]:
# Columns excluded from the training set.
columns_to_drop = [
    'address',
    'crit_violations',
    'earliest_inspection',
    'latest_inspection',
    'second_latest_inspection',
    'non_crit_violations',
    'num_inspections',
    'crit_violations_recent_inspect',
    'non_crit_violations_recent_inspect',
]
violations_df2 = violations_df1.drop(columns_to_drop, axis=1)
violations_df2.head()

Unnamed: 0,zipcode,boro,cuisine,crit_violations_train,non_crit_violations_train,num_inspections_train,average_crit_v_train,average_non_crit_v_train,time_since_last_inspection,time_since_first_inspection,crit_v_2plus,3-day temp,3-day humidity
30075445,10462,BRONX,Bakery,7.0,3.0,4,1.75,0.75,374,918,0,39.333333,60.666667
30191841,10019,MANHATTAN,Irish,3.0,6.0,4,0.75,1.5,253,1044,0,77.333333,68.666667
40356018,11224,BROOKLYN,American,0.0,6.0,3,0.0,2.0,346,1076,0,58.333333,56.666667
40356151,11369,QUEENS,American,12.0,4.0,6,2.0,0.666667,351,764,0,63.666667,67.666667
40356483,11234,BROOKLYN,Delicatessen,8.0,13.0,6,1.333333,2.166667,348,1016,0,63.333333,34.0


### Convert categorical variables to dummy variables

In [10]:
# Expand the string-valued columns into indicator (dummy) columns; numeric
# columns are passed through unchanged.
# NOTE(review): zipcode looks numeric in the preview above, so it presumably
# stays a single numeric feature rather than being one-hot encoded -- confirm
# that is intended.
violations_df3 = pd.get_dummies(violations_df2)

### Separate target variable

In [11]:
# The label column is 'crit_v_2plus'; every other column is a feature.
target_col = 'crit_v_2plus'
y = violations_df3[target_col]
X = violations_df3.drop(target_col, axis=1)

### Split into training and test sets

In [12]:
# Hold out 20% of the rows as the test set; random_state is fixed so the
# same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Baseline: metrics when predicting class 0 for every test row

In [13]:
# Baseline predictor: class 0 for every test row.
# With no predicted positives, scikit-learn reports precision (and f1) as 0.0
# and emits a warning about the undefined metric.
pred_zero = np.zeros(y_test.shape[0])
for metric_name, score_fn in (
        ('accuracy', metrics.accuracy_score),
        ('recall', metrics.recall_score),
        ('precision', metrics.precision_score),
        ('f1', metrics.f1_score)):
    print(metric_name + ' = ' + str(score_fn(y_test, pred_zero)))

accuracy = 0.6572265625
recall = 0.0
precision = 0.0
f1 = 0.0


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


### Logistic Regression

#### w/ L2 penalty (C=1e20, so regularization is effectively switched off)

In [14]:
# Logistic regression with an L2 penalty.
# C is the inverse regularization strength, so C=1e20 makes the penalty
# negligible (this is effectively an unregularized fit).
# The solver is pinned to 'liblinear' so results do not depend on the
# installed scikit-learn version: the default solver changed to 'lbfgs'
# in scikit-learn 0.22.
lr = linear_model.LogisticRegression(penalty='l2', C=1e20, solver='liblinear')
lr.fit(X_train, y_train)
pred_lr = lr.predict(X_test)

# Report the test-set metrics for the positive class.
for metric_name, score_fn in (
        ('accuracy', metrics.accuracy_score),
        ('recall', metrics.recall_score),
        ('precision', metrics.precision_score),
        ('f1', metrics.f1_score)):
    print(metric_name + ' = ' + str(score_fn(y_test, pred_lr)))

accuracy = 0.657470703125
recall = 0.106837606838
precision = 0.501672240803
f1 = 0.176159718144


#### w/ L1 penalty (C=1e20, so regularization is effectively switched off)

In [15]:
# Logistic regression with an L1 penalty.
# C is the inverse regularization strength, so C=1e20 makes the penalty
# negligible (this is effectively an unregularized fit).
# An L1-capable solver must be named explicitly: since scikit-learn 0.22 the
# default solver is 'lbfgs', which rejects penalty='l1' with a ValueError.
# 'liblinear' supports L1 and was the default in older releases.
lr = linear_model.LogisticRegression(penalty='l1', C=1e20, solver='liblinear')
lr.fit(X_train, y_train)
pred_lr = lr.predict(X_test)

# Report the test-set metrics for the positive class.
for metric_name, score_fn in (
        ('accuracy', metrics.accuracy_score),
        ('recall', metrics.recall_score),
        ('precision', metrics.precision_score),
        ('f1', metrics.f1_score)):
    print(metric_name + ' = ' + str(score_fn(y_test, pred_lr)))

accuracy = 0.653564453125
recall = 0.160256410256
precision = 0.483870967742
f1 = 0.24077046549


### Naive Bayes

#### w/ fit prior

In [16]:
# Bernoulli naive Bayes with class priors estimated from the training labels.
# NOTE(review): BernoulliNB thresholds every feature at its default
# binarize=0.0, so the continuous columns presumably collapse to 0/1 --
# confirm this is acceptable for the numeric features.
nb = BernoulliNB(alpha=1, fit_prior=True).fit(X_train, y_train)
pred_nb = nb.predict(X_test)

# Report the test-set metrics for the positive class.
for metric_name, score_fn in (
        ('accuracy', metrics.accuracy_score),
        ('recall', metrics.recall_score),
        ('precision', metrics.precision_score),
        ('f1', metrics.f1_score)):
    print(metric_name + ' = ' + str(score_fn(y_test, pred_nb)))

accuracy = 0.656982421875
recall = 0.037037037037
precision = 0.495238095238
f1 = 0.0689198144467


#### w/ uniform prior

In [17]:
# Bernoulli naive Bayes with a uniform class prior (fit_prior=False).
# NOTE(review): BernoulliNB thresholds every feature at its default
# binarize=0.0, so the continuous columns presumably collapse to 0/1 --
# confirm this is acceptable for the numeric features.
nb = BernoulliNB(alpha=1, fit_prior=False).fit(X_train, y_train)
pred_nb = nb.predict(X_test)

# Report the test-set metrics for the positive class.
for metric_name, score_fn in (
        ('accuracy', metrics.accuracy_score),
        ('recall', metrics.recall_score),
        ('precision', metrics.precision_score),
        ('f1', metrics.f1_score)):
    print(metric_name + ' = ' + str(score_fn(y_test, pred_nb)))

accuracy = 0.54833984375
recall = 0.526353276353
precision = 0.384095634096
f1 = 0.444110576923


### Random Forest

#### 10 trees

In [18]:
# Random forest with 10 trees.
# random_state is fixed (same seed as the train/test split) so the bootstrap
# samples and feature subsets -- and therefore the scores below -- are
# reproducible under Restart & Run All.
rf = RandomForestClassifier(n_estimators=10, random_state=42)
rf.fit(X_train, y_train)
pred_rf = rf.predict(X_test)

# Report the test-set metrics for the positive class.
for metric_name, score_fn in (
        ('accuracy', metrics.accuracy_score),
        ('recall', metrics.recall_score),
        ('precision', metrics.precision_score),
        ('f1', metrics.f1_score)):
    print(metric_name + ' = ' + str(score_fn(y_test, pred_rf)))

accuracy = 0.65869140625
recall = 0.237891737892
precision = 0.504531722054
f1 = 0.323330106486


#### 20 trees

In [19]:
# Random forest with 20 trees.
# random_state is fixed (same seed as the train/test split) so the bootstrap
# samples and feature subsets -- and therefore the scores below -- are
# reproducible under Restart & Run All.
rf = RandomForestClassifier(n_estimators=20, random_state=42)
rf.fit(X_train, y_train)
pred_rf = rf.predict(X_test)

# Report the test-set metrics for the positive class.
for metric_name, score_fn in (
        ('accuracy', metrics.accuracy_score),
        ('recall', metrics.recall_score),
        ('precision', metrics.precision_score),
        ('f1', metrics.f1_score)):
    print(metric_name + ' = ' + str(score_fn(y_test, pred_rf)))

accuracy = 0.654541015625
recall = 0.226495726496
precision = 0.491499227202
f1 = 0.310092637738
