## Caution: Scraping takes a long time

In [None]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import sklearn
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from scrape_weather import scrape_weather

## Load preprocessed data

In [None]:
# Read the cleaned inspection records from disk; the first CSV column
# becomes the DataFrame index.
df = pd.read_csv(
    'health_inspect_cleaned.csv',
    index_col=0,
)

# Preview the first few rows as a quick sanity check on the load
df.head()

## Define custom function to scrape Weather Underground for temperature/humidity data

In [None]:
# Captures the number sitting between an opening and a closing tag,
# e.g. "<span>-3.5</span>". The optional leading minus keeps sub-zero
# readings from being silently dropped.
_TAG_VALUE_REGEX = re.compile(r'<.+>(-?[0-9.,]+)<\/.+>')


def _extract_number(tag, pattern=_TAG_VALUE_REGEX):
    """Return the first numeric value found in the markup of `tag`, or NaN."""
    match = pattern.search(str(tag))
    if not match:
        return np.nan
    try:
        # Commas are treated as thousands separators; float() rejects them
        return float(match.group(1).replace(',', ''))
    except ValueError:
        # Matched text such as "." or "1.2.3" is not a number: treat as missing
        return np.nan


def _mean_ignoring_nan(values):
    """Average the non-NaN entries of `values`; NaN when there are none."""
    valid = [v for v in values if not np.isnan(v)]
    # Explicit empty check avoids the RuntimeWarning np.nanmean emits on all-NaN input
    return np.mean(valid) if valid else np.nan


def scrape_weather(zipcode, inspection_date, timeout=30):
    """Scrape the 3-day average temperature and humidity for a zipcode.

    Fetches the Weather Underground "DailyHistory" page for the inspection
    date and for each of the two preceding days, reads the value following
    the 'Mean Temperature' and 'Average Humidity' labels on each page, and
    averages whatever values were found.

    Parameters
    ----------
    zipcode
        Zipcode inserted into the history URL.
    inspection_date
        Anything `pd.to_datetime` can parse into a single date.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up, so that a
        stalled connection cannot hang a long scraping run.

    Returns
    -------
    (avg_temperature, avg_humidity)
        Means over the days for which a value could be extracted. Either
        entry is NaN when no value was found on any of the three days, and
        both are NaN when `inspection_date` is missing (None / NaN / NaT).

    Notes
    -----
    Errors raised by `requests.get` (including timeouts) and by
    `pd.to_datetime` on unparseable input are not caught here; they
    propagate to the caller.
    """
    # Base URL for the Weather Underground historical archives
    baseurl = 'https://www.wunderground.com/history/zipcode/{}/{}/{}/{}/DailyHistory.html' # zipcode, year, month, day

    # A missing date cannot be turned into a URL; skip the network entirely
    base_date = pd.to_datetime(inspection_date)
    if base_date is None or base_date is pd.NaT:
        return np.nan, np.nan

    # 3-day temperature and humidity; entries stay NaN when a value is not found
    temperature = [np.nan]*3
    humidity = [np.nan]*3

    # Scrape the inspection date and up to 2 days prior
    for i in range(3):
        date = base_date - pd.Timedelta(days=i)

        # Open URL and turn into BeautifulSoup
        url = baseurl.format(zipcode, date.year, date.month, date.day)
        html = requests.get(url, timeout=timeout).text
        soup = BeautifulSoup(html, 'lxml')

        # Locate the label, then step to the element that holds its value
        temperature_tag = soup.find('span', string='Mean Temperature')
        if temperature_tag:
            temperature_tag = temperature_tag.find_next(class_='wx-value')

        humidity_tag = soup.find('span', string='Average Humidity')
        if humidity_tag:
            humidity_tag = humidity_tag.find_next('td')

        # A missing tag stringifies to 'None', which yields NaN below
        temperature[i] = _extract_number(temperature_tag)
        humidity[i] = _extract_number(humidity_tag)

    avg_temperature = _mean_ignoring_nan(temperature)
    avg_humidity = _mean_ignoring_nan(humidity)

    return avg_temperature, avg_humidity

## Iterate over rows in dataframe to add 3-day average temperature and humidity

In [None]:
# Change `start` to pick up where we left off if scraping didn't finish
start = 2600
rows_to_scrape = range(start, len(df))
# Positional slice: selects exactly the rows from `start` onward, even if
# the index contains duplicate labels
to_scrape = df.iloc[start:]

# Pre-sized result lists; entries stay NaN for rows not (yet) scraped
nrows = len(to_scrape)
temp_3day = [np.nan]*nrows
humidity_3day = [np.nan]*nrows
inds = [np.nan]*nrows

count = 0
for index, row in to_scrape.iterrows():
    # Keep track of progress
    count += 1
    print('Scraping for row {} of {} ({}% complete)'.format(count, nrows, int(100*(count/nrows))))

    # Record the row label up front so a failed scrape still maps back to its row
    inds[count-1] = int(index)

    # Best effort: a row that can't be scraped keeps NaN values. Catching
    # Exception (not a bare except) lets Ctrl-C still interrupt the run.
    try:
        temp_3day[count-1], humidity_3day[count-1] = scrape_weather(row['zipcode'], row['latest_inspection'])
    except Exception as err:
        print('  Could not scrape row {}: {!r}'.format(index, err))

    if count % 100 == 0: # Save intermediate results every 100 rows
        # NOTE(review): checkpoints go two directories up while the final file
        # is written to the working directory -- confirm this is intended
        weather_df = pd.DataFrame({'3-day temp': temp_3day, '3-day humidity': humidity_3day}, index=inds)
        weather_df.to_csv('../../weather_data.csv')

# Final save keeps the same row-label index as the checkpoints so the
# weather columns can be joined back onto df
weather_df = pd.DataFrame({'3-day temp': temp_3day, '3-day humidity': humidity_3day}, index=inds)
weather_df.head()
weather_df.to_csv('weather_data.csv')