## Please see the README in this directory for running instructions.

In [2]:
import numpy as np
import pandas as pd
import io
import matplotlib.pyplot as plt
import sklearn
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from IPython.display import display
%matplotlib inline

In [3]:
# Limit DataFrame displays to 10 rows. The fully-qualified option key is used
# because the bare 'max_rows' pattern can match several registered options in
# newer pandas releases, which makes set_option raise OptionError.
pd.set_option('display.max_rows', 10)

## Load health inspection and weather data

In [4]:
# Inspection records: the first CSV column becomes the DataFrame index.
violations_df = pd.read_csv(
    '../health_inspect_cleaned.csv',
    index_col=0,
)
# Weather readings: no index column is requested, so pandas assigns the
# default integer index.
weather_df = pd.read_csv(
    'weather_data.csv',
    index_col=None,
)

display(weather_df)

Unnamed: 0,3-day humidity,3-day temp
0,60.666667,39.333333
1,,61.666667
2,68.666667,77.333333
3,56.666667,58.333333
4,67.666667,63.666667
...,...,...
22325,88.666667,49.333333
22326,67.333333,46.000000
22327,55.000000,51.333333
22328,47.333333,42.000000


### Add temperature and humidity columns to health inspection data

In [4]:
# Copy the weather readings onto the inspection frame by row position
# (plain lists are assigned, so no index alignment takes place).
# NOTE(review): assumes weather_df rows are in the same order as violations_df rows -- confirm.
for weather_col in ('3-day temp', '3-day humidity'):
    violations_df[weather_col] = weather_df[weather_col].tolist()

display(violations_df)

Unnamed: 0,crit_violations,non_crit_violations,earliest_inspection,latest_inspection,second_latest_inspection,zipcode,boro,cuisine,address,num_inspections,...,crit_violations_train,non_crit_violations_train,num_inspections_train,average_crit_v_train,average_non_crit_v_train,time_since_last_inspection,time_since_first_inspection,crit_v_2plus,3-day temp,3-day humidity
30075445,8.0,4.0,2013-08-14,2016-02-18,2015-02-09,10462,BRONX,Bakery,1007 MORRIS PARK AVE,5,...,7.0,3.0,4,1.750,0.750000,374,918,0,39.333333,60.666667
30112340,14.0,10.0,2014-06-05,2016-10-27,2016-10-03,11225,BROOKLYN,Hamburgers,469 FLATBUSH AVENUE,9,...,13.0,9.0,8,1.625,1.125000,24,875,0,61.666667,
30191841,4.0,7.0,2013-07-22,2016-05-31,2015-09-21,10019,MANHATTAN,Irish,351 WEST 57 STREET,5,...,3.0,6.0,4,0.750,1.500000,253,1044,0,77.333333,68.666667
40356018,1.0,7.0,2013-06-05,2016-05-16,2015-06-05,11224,BROOKLYN,American,2780 STILLWELL AVENUE,4,...,0.0,6.0,3,0.000,2.000000,346,1076,0,58.333333,56.666667
40356151,13.0,5.0,2014-04-11,2016-05-14,2015-05-29,11369,QUEENS,American,8825 ASTORIA BOULEVARD,7,...,12.0,4.0,6,2.000,0.666667,351,764,0,63.666667,67.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50060427,10.0,4.0,2017-03-07,2017-04-06,2017-03-20,11217,BROOKLYN,Bagels/Pretzels,73 LAFAYETTE AVE,3,...,8.0,3.0,2,4.000,1.500000,17,30,1,49.333333,88.666667
50060496,7.0,4.0,2017-02-27,2017-03-31,2017-02-27,11232,BROOKLYN,Delicatessen,4102 3RD AVE,2,...,4.0,3.0,1,4.000,3.000000,32,32,1,46.000000,67.333333
50060695,7.0,4.0,2017-04-04,2017-04-10,2017-04-04,11432,QUEENS,Pizza,16417 JAMAICA AVE,2,...,6.0,3.0,1,6.000,3.000000,6,6,0,51.333333,55.000000
50060807,2.0,2.0,2017-03-09,2017-03-20,2017-03-09,10467,BRONX,Chinese,2861 WHITE PLAINS RD,2,...,1.0,0.0,1,1.000,0.000000,11,11,0,42.000000,47.333333


### Check for any rows we failed to scrape, i.e. rows with NaN for temperature or humidity

In [5]:
# Index labels of weather_df rows whose 3-day temperature is NaN.
temp_nan_mask = np.isnan(weather_df['3-day temp'])
temp_is_nan = weather_df.index[temp_nan_mask].values
n_missing_temp = len(temp_is_nan)
print('There are {} rows missing average temperature'.format(n_missing_temp))

There are 1517 rows missing average temperature


In [6]:
# Index labels of weather_df rows whose 3-day humidity is NaN.
humid_nan_mask = np.isnan(weather_df['3-day humidity'])
humid_is_nan = weather_df.index[humid_nan_mask].values
n_missing_humid = len(humid_is_nan)
print('There are {} rows missing average humidity'.format(n_missing_humid))

There are 1806 rows missing average humidity


In [7]:
# Split the row positions of violations_df into those with both weather values
# present (`keep`) and those missing temperature and/or humidity (`missing`).
# np.isin performs the membership test in one vectorized pass instead of
# scanning the NaN-index arrays once per row.
row_positions = np.arange(len(violations_df))
is_missing = np.isin(row_positions, temp_is_nan) | np.isin(row_positions, humid_is_nan)
# .tolist() yields plain Python ints in ascending order, as the original lists did.
keep = row_positions[~is_missing].tolist()
missing = row_positions[is_missing].tolist()
# # Uncomment the next two lines if you wish to overwrite the missing_weather_data.csv file
# with io.open('missing_weather_data.csv', 'wb') as f:
#     f.write(str(missing)) # Save indices to file so that we can rescrape/troubleshoot if needed
print('There are {} rows missing average temperature and/or humidity'.format(len(missing)))

There are 1854 rows missing average temperature and/or humidity


## Stop here if you have not yet scraped Central Park weather data (`weather_CentralPark.csv`)!

In [5]:
# Central Park weather readings; the first CSV column is used as the index.
central_park_csv = 'weather_CentralPark.csv'
CentralPark_df = pd.read_csv(central_park_csv, index_col=0)

### Use Central Park temp/humidity averages in place of missing values

In [9]:
# Overwrite temperature and humidity on every row listed in `missing` with the
# Central Park readings. Column positions are looked up by name rather than
# hard-coded, so a change in the input file's column layout cannot redirect
# the write into unrelated columns.
# NOTE(review): assumes CentralPark_df rows are in the same order as `missing` -- confirm.
temp_col = violations_df.columns.get_loc('3-day temp')
humid_col = violations_df.columns.get_loc('3-day humidity')

# Fail early with a readable message if the replacement data does not line up.
if len(CentralPark_df) != len(missing):
    raise ValueError(
        'CentralPark_df has {} rows but {} rows are missing weather data'.format(
            len(CentralPark_df), len(missing)))

violations_df.iloc[missing, temp_col] = list(CentralPark_df['3-day temp'])
violations_df.iloc[missing, humid_col] = list(CentralPark_df['3-day humidity'])

display(violations_df)

Unnamed: 0,crit_violations,non_crit_violations,earliest_inspection,latest_inspection,second_latest_inspection,zipcode,boro,cuisine,address,num_inspections,...,crit_violations_train,non_crit_violations_train,num_inspections_train,average_crit_v_train,average_non_crit_v_train,time_since_last_inspection,time_since_first_inspection,crit_v_2plus,3-day temp,3-day humidity
30075445,8.0,4.0,2013-08-14,2016-02-18,2015-02-09,10462,BRONX,Bakery,1007 MORRIS PARK AVE,5,...,7.0,3.0,4,1.750,0.750000,374,918,0,39.333333,60.666667
30112340,14.0,10.0,2014-06-05,2016-10-27,2016-10-03,11225,BROOKLYN,Hamburgers,469 FLATBUSH AVENUE,9,...,13.0,9.0,8,1.625,1.125000,24,875,0,47.000000,58.000000
30191841,4.0,7.0,2013-07-22,2016-05-31,2015-09-21,10019,MANHATTAN,Irish,351 WEST 57 STREET,5,...,3.0,6.0,4,0.750,1.500000,253,1044,0,77.333333,68.666667
40356018,1.0,7.0,2013-06-05,2016-05-16,2015-06-05,11224,BROOKLYN,American,2780 STILLWELL AVENUE,4,...,0.0,6.0,3,0.000,2.000000,346,1076,0,58.333333,56.666667
40356151,13.0,5.0,2014-04-11,2016-05-14,2015-05-29,11369,QUEENS,American,8825 ASTORIA BOULEVARD,7,...,12.0,4.0,6,2.000,0.666667,351,764,0,63.666667,67.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50060427,10.0,4.0,2017-03-07,2017-04-06,2017-03-20,11217,BROOKLYN,Bagels/Pretzels,73 LAFAYETTE AVE,3,...,8.0,3.0,2,4.000,1.500000,17,30,1,49.333333,88.666667
50060496,7.0,4.0,2017-02-27,2017-03-31,2017-02-27,11232,BROOKLYN,Delicatessen,4102 3RD AVE,2,...,4.0,3.0,1,4.000,3.000000,32,32,1,46.000000,67.333333
50060695,7.0,4.0,2017-04-04,2017-04-10,2017-04-04,11432,QUEENS,Pizza,16417 JAMAICA AVE,2,...,6.0,3.0,1,6.000,3.000000,6,6,0,51.333333,55.000000
50060807,2.0,2.0,2017-03-09,2017-03-20,2017-03-09,10467,BRONX,Chinese,2861 WHITE PLAINS RD,2,...,1.0,0.0,1,1.000,0.000000,11,11,0,42.000000,47.333333


In [10]:
# Write the inspection data, now including the weather columns, to a new CSV.
output_csv = 'health_inspect_cleaned_weather_added.csv'
violations_df.to_csv(output_csv)