## Caution: Scraping takes a long time

In [1]:
import numpy as np
import pandas as pd
import io
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import sklearn
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier

## Load preprocessed data

In [2]:
# Read the preprocessed inspection table, using the first CSV column as the row index
df = pd.read_csv(
    filepath_or_buffer='health_inspect_cleaned.csv',
    index_col=0,
)

# Preview the first five rows
df.head(n=5)

Unnamed: 0,crit_violations,non_crit_violations,earliest_inspection,latest_inspection,second_latest_inspection,zipcode,boro,cuisine,address,num_inspections,crit_violations_recent_inspect,non_crit_violations_recent_inspect,crit_violations_train,non_crit_violations_train,num_inspections_train,average_crit_v_train,average_non_crit_v_train,time_since_last_inspection,time_since_first_inspection,crit_v_2plus
30075445,8.0,4.0,2013-08-14,2016-02-18,2015-02-09,10462,BRONX,Bakery,1007 MORRIS PARK AVE,5,1,1,7.0,3.0,4,1.75,0.75,374,918,0
30112340,14.0,10.0,2014-06-05,2016-10-27,2016-10-03,11225,BROOKLYN,Hamburgers,469 FLATBUSH AVENUE,9,1,1,13.0,9.0,8,1.625,1.125,24,875,0
30191841,4.0,7.0,2013-07-22,2016-05-31,2015-09-21,10019,MANHATTAN,Irish,351 WEST 57 STREET,5,1,1,3.0,6.0,4,0.75,1.5,253,1044,0
40356018,1.0,7.0,2013-06-05,2016-05-16,2015-06-05,11224,BROOKLYN,American,2780 STILLWELL AVENUE,4,1,1,0.0,6.0,3,0.0,2.0,346,1076,0
40356151,13.0,5.0,2014-04-11,2016-05-14,2015-05-29,11369,QUEENS,American,8825 ASTORIA BOULEVARD,7,1,1,12.0,4.0,6,2.0,0.666667,351,764,0


## Define custom function to scrape Weather Underground for temperature/humidity data

In [None]:
# Weather already fetched in this kernel session, keyed by calendar day
# (a midnight pd.Timestamp) -> (mean temperature, average humidity).
# It lives at module level so that it survives between calls; a dict created
# inside the function would be thrown away after every row.
_WEATHER_CACHE = {}

# Base URL for the Weather Underground historical archives (station KNYC)
_WUNDERGROUND_URL = 'https://www.wunderground.com/history/airport/KNYC/{}/{}/{}/DailyHistory.html' # year, month, day

# Pulls the number out of a rendered tag such as <span class="wx-value">41</span>.
# The optional leading '-' lets below-zero readings through.
_TAG_VALUE_REGEX = re.compile(r'<.+>(-?[0-9.,]+)<\/.+>')


def _tag_to_float(tag):
    """Return the numeric text inside `tag` as a float, or NaN if there is none."""
    match = _TAG_VALUE_REGEX.search(str(tag))  # str(None) == 'None' -> no match
    if not match:
        return np.nan
    try:
        # Drop thousands separators, which float() cannot parse
        return float(match.group(1).replace(',', ''))
    except ValueError:
        # e.g. the captured text was just '.' or '-'
        return np.nan


def _fetch_day_weather(date, timeout):
    """Download one day's history page and return (mean temperature, average humidity)."""
    url = _WUNDERGROUND_URL.format(date.year, date.month, date.day)
    html = requests.get(url, timeout=timeout).text
    soup = BeautifulSoup(html, 'lxml')

    # Each value sits in the element that follows its label
    temperature_tag = soup.find('span', string='Mean Temperature')
    if temperature_tag:
        temperature_tag = temperature_tag.find_next(class_='wx-value')

    humidity_tag = soup.find('span', string='Average Humidity')
    if humidity_tag:
        humidity_tag = humidity_tag.find_next('td')

    return _tag_to_float(temperature_tag), _tag_to_float(humidity_tag)


def _mean_ignoring_nan(values):
    """Mean of the non-NaN entries of `values`, or NaN when every entry is NaN."""
    valid = [v for v in values if not np.isnan(v)]
    # Checking for emptiness first avoids the RuntimeWarning np.nanmean would emit
    return np.mean(valid) if valid else np.nan


def scrape_CentralPark(inspection_date, cache=None, timeout=30):
    """Return the 3-day average temperature and humidity ending on a date.

    Scrapes the Weather Underground daily-history page of station KNYC for
    `inspection_date` and the two days before it, then averages whatever
    values could be parsed.

    Parameters
    ----------
    inspection_date : anything accepted by ``pd.to_datetime``
        Last day of the 3-day window. Any time-of-day component is ignored.
    cache : dict, optional
        Maps a midnight ``pd.Timestamp`` to a ``(temperature, humidity)``
        tuple. Days found here are not downloaded again. Defaults to a
        module-level dict shared by all calls, so overlapping windows across
        rows reuse earlier downloads.
    timeout : float or None, optional
        Seconds passed to ``requests.get`` so that a stalled connection
        raises instead of hanging the run.

    Returns
    -------
    (avg_temperature, avg_humidity) : tuple of float
        Each is NaN when no value could be parsed for any of the three days,
        or when `inspection_date` is missing.

    Raises
    ------
    requests.exceptions.RequestException
        If a download fails or times out.
    """
    if cache is None:
        cache = _WEATHER_CACHE

    parsed = pd.to_datetime(inspection_date)
    if pd.isnull(parsed):
        # A missing date has no page to scrape
        return np.nan, np.nan
    end_day = parsed.normalize()

    # Initialize lists to store 3-day temperature and humidity
    temperature = [np.nan]*3
    humidity = [np.nan]*3

    # Collect temperature and humidity for inspection date and up to 2 days prior
    for i in range(3):
        date = end_day - pd.Timedelta(days=i)

        if date in cache:
            # Don't scrape if we've already scraped this date
            temperature[i], humidity[i] = cache[date]
            continue

        temperature[i], humidity[i] = _fetch_day_weather(date, timeout)

        # Remember the day only if something was parsed, so that a page which
        # yielded nothing is retried on a later call instead of being cached
        # as NaN for the rest of the session.
        if not (np.isnan(temperature[i]) and np.isnan(humidity[i])):
            cache[date] = (temperature[i], humidity[i])

    return _mean_ignoring_nan(temperature), _mean_ignoring_nan(humidity)

## Iterate over rows in dataframe to add 3-day average temperature and humidity

In [None]:
# Load positional indices of rows that are missing data from file.
# The file is expected to hold one bracketed, comma-separated list of integers.
# It must be opened in text mode: in binary mode f.read() returns bytes on
# Python 3 and bytes.strip('[]') raises TypeError.
with io.open('missing_weather_data.csv', 'r') as f:
    # Strip surrounding whitespace first so a trailing newline cannot shield the ']'
    raw_positions = f.read().strip().strip('[]')
rows_to_scrape = [int(r) for r in raw_positions.split(',') if r.strip()]

to_scrape = df.iloc[rows_to_scrape]

nrows = len(to_scrape)
temp_3day = [np.nan]*nrows
humidity_3day = [np.nan]*nrows
# Row labels of `df` for every row, including rows whose scrape fails, so each
# saved line can always be matched back to its row in `df`.
inds = [int(i) for i in to_scrape.index]

for count, (index, row) in enumerate(to_scrape.iterrows(), start=1):
    # Keep track of progress
    print('Scraping for row {} of {} ({}% complete)'.format(count, nrows, int(((100*count)/nrows))))

    # Best effort: a failed row keeps its NaN placeholders and the loop moves on.
    # Catch Exception rather than everything so Ctrl-C still stops the run,
    # and report the failure instead of hiding it.
    try:
        temp_3day[count-1], humidity_3day[count-1] = scrape_CentralPark(row['latest_inspection'])
    except Exception as err:
        print('  could not scrape row {} (index {}): {!r}'.format(count, index, err))

    # Save intermediate results every 100 rows, whether or not this row succeeded
    if count % 100 == 0:
        weather_df = pd.DataFrame({'3-day temp': temp_3day, '3-day humidity': humidity_3day}, index=inds)
        weather_df.to_csv('weather_CentralPark_partial.csv')

# Final results carry the same row labels as the partial checkpoints.
# NOTE(review): the final file previously had a plain 0..n-1 index; confirm no
# downstream reader depends on that.
weather_df = pd.DataFrame({'3-day temp': temp_3day, '3-day humidity': humidity_3day}, index=inds)
weather_df.to_csv('weather_CentralPark.csv')

# Last expression of the cell, so the preview is actually displayed
weather_df.head()

Scraping for row 1 of 1854 (0% complete)
Scraping for row 2 of 1854 (0% complete)
Scraping for row 3 of 1854 (0% complete)
Scraping for row 4 of 1854 (0% complete)
Scraping for row 5 of 1854 (0% complete)
Scraping for row 6 of 1854 (0% complete)
Scraping for row 7 of 1854 (0% complete)
Scraping for row 8 of 1854 (0% complete)
Scraping for row 9 of 1854 (0% complete)
Scraping for row 10 of 1854 (0% complete)
Scraping for row 11 of 1854 (0% complete)
Scraping for row 12 of 1854 (0% complete)
Scraping for row 13 of 1854 (0% complete)
Scraping for row 14 of 1854 (0% complete)
Scraping for row 15 of 1854 (0% complete)
Scraping for row 16 of 1854 (0% complete)
Scraping for row 17 of 1854 (0% complete)
Scraping for row 18 of 1854 (0% complete)
Scraping for row 19 of 1854 (1% complete)
Scraping for row 20 of 1854 (1% complete)
Scraping for row 21 of 1854 (1% complete)
Scraping for row 22 of 1854 (1% complete)
Scraping for row 23 of 1854 (1% complete)
Scraping for row 24 of 1854 (1% complete)
S