In [13]:
# removes annoying deprecation warnings 
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

import pandas as pd
from google.cloud import bigquery
from bq_helper import BigQueryHelper #third party library to translate google query data to dataframe
import matplotlib.pyplot as plt
import os, sys

sys.path.insert(0, './../utils/')

# custom files 
import random_forest_regressor as rfr
import utilities as util
import validation as cv
from sklearn.model_selection import RepeatedKFold

from sklearn.model_selection import train_test_split
from sklearn import cross_validation

%matplotlib inline

# Point the Google client libraries at a service-account key.
# The first line of ../../key.txt is used as the credentials path
# (add your own key here).
with open('../../key.txt') as f:
    content = f.readlines()

# readlines() keeps the trailing newline of each line; strip it so the value
# stored in the environment is exactly the path on disk.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = content[0].strip()

In [14]:
# Average of `arithmetic_mean` from the EPA CO daily summary table, restricted to
# San Francisco, California, grouped by a string `date` key.
# NOTE(review): the key concatenates year, month and day WITHOUT zero padding, so
# distinct days can collapse into one key (e.g. Jan 11 and Nov 1 both become
# "2008111"), and ORDER BY sorts the key as text, not chronologically. All queries
# in this notebook build the key the same way and the frames are joined on it, so
# any change of format has to be applied to every query (and checked against the
# util helpers) at the same time.
CO_QUERY = """
        SELECT
            avg(arithmetic_mean) as amCO,
            CONCAT(cast(EXTRACT(YEAR FROM CO_daily.date_local) as string),
            cast(EXTRACT(MONTH FROM CO_daily.date_local) as string), 
            cast(EXTRACT(DAY FROM CO_daily.date_local) as string)) as date
        FROM
          `bigquery-public-data.epa_historical_air_quality.co_daily_summary` as CO_daily
        WHERE state_name ="California" AND city_name="San Francisco"
        GROUP BY date
        ORDER BY date DESC
        """

# Helper bound to the EPA air-quality dataset; also used by the PM2.5 and SO2 cells.
bq_assistant = BigQueryHelper("bigquery-public-data", "epa_historical_air_quality")
# Run the query and load the result into a DataFrame (columns: amCO, date).
df_CO = bq_assistant.query_to_pandas(CO_QUERY)

In [15]:
# Average of `arithmetic_mean` from the EPA PM2.5 (FRM) daily summary table,
# restricted to San Francisco, California, grouped by a string `date` key.
# NOTE(review): the key concatenates year, month and day without zero padding, so
# distinct days can share a key (e.g. "2008111" is both Jan 11 and Nov 1). Keep the
# format identical to the other queries in this notebook, since the frames are
# joined on this column.
PPM_QUERY = """
        SELECT
            avg(arithmetic_mean) as amPPM25,
            CONCAT(cast(EXTRACT(YEAR FROM PPM25_daily.date_local) as string),
            cast(EXTRACT(MONTH FROM PPM25_daily.date_local) as string), 
            cast(EXTRACT(DAY FROM PPM25_daily.date_local) as string)) as date
        FROM
          `bigquery-public-data.epa_historical_air_quality.pm25_frm_daily_summary` as PPM25_daily
        WHERE state_name ="California" AND city_name="San Francisco"
        GROUP BY date
        ORDER BY date DESC
        """

# Uses the `bq_assistant` helper created in the CO cell; that cell must run first.
df_PPM = bq_assistant.query_to_pandas(PPM_QUERY)

In [16]:
# Average of `arithmetic_mean` from the EPA SO2 daily summary table, restricted to
# San Francisco, California, grouped by a string `date` key.
# NOTE(review): the key concatenates year, month and day without zero padding, so
# distinct days can share a key (e.g. "2008111" is both Jan 11 and Nov 1). Keep the
# format identical to the other queries in this notebook, since the frames are
# joined on this column.
SO2_QUERY = """
        SELECT
            avg(arithmetic_mean) as amSO2,
            CONCAT(cast(EXTRACT(YEAR FROM SO2_daily.date_local) as string),
            cast(EXTRACT(MONTH FROM SO2_daily.date_local) as string), 
            cast(EXTRACT(DAY FROM SO2_daily.date_local) as string)) as date
        FROM
          `bigquery-public-data.epa_historical_air_quality.so2_daily_summary` as SO2_daily
        WHERE state_name ="California" AND city_name="San Francisco"
        GROUP BY date
        ORDER BY date DESC
        """
# Uses the `bq_assistant` helper created in the CO cell; that cell must run first.
df_SO2 = bq_assistant.query_to_pandas(SO2_QUERY)

In [17]:
# Inner-join the three pollutant frames on their shared `date` key, so only days
# present in all three remain.
df_total = df_CO.merge(df_PPM, on='date').merge(df_SO2, on='date')

In [18]:
# Preview the first five rows of the combined pollutant frame.
df_total.head(n=5)

Unnamed: 0,amCO,date,amPPM25,amSO2
0,0.302174,200899,5.8,0.111039
1,0.487318,200893,15.9,2.183766
2,0.440489,2008927,8.2,1.705195
3,0.308786,2008921,6.7,0.044156
4,0.352989,2008915,11.5,0.392308


In [21]:
# Number of distinct SFPD incidents per day, excluding the "NON-CRIMINAL" and
# "RECOVERED VEHICLE" categories, grouped by a string `date` key.
# NOTE(review): the key concatenates year, month and day without zero padding, so
# distinct days can share a key (e.g. "2008111" is both Jan 11 and Nov 1). The
# format is left as is because the pollutant frames are joined on the same
# un-padded key; change all queries together if this is fixed.
SF_CRIME_QUERY = """
        SELECT
          COUNT( DISTINCT unique_key) as count,
          CONCAT(cast(EXTRACT(YEAR FROM SFCrimeData.timestamp) AS string), 
          cast(EXTRACT(MONTH FROM SFCrimeData.timestamp) AS string), 
          cast(EXTRACT(DAY FROM SFCrimeData.timestamp) AS string)) AS date
        FROM
          `bigquery-public-data.san_francisco_sfpd_incidents.sfpd_incidents` AS SFCrimeData
        WHERE category != "NON-CRIMINAL" AND category != "RECOVERED VEHICLE"
        GROUP BY date
        ORDER BY date DESC
        """
# The second argument is the dataset name only. The table (`sfpd_incidents`) is
# addressed inside the SQL, so it must not be appended to the dataset here.
bq_assistant_SF_crime = BigQueryHelper("bigquery-public-data", "san_francisco_sfpd_incidents")
# Run the query and load the result into a DataFrame (columns: count, date).
df_SF_crime = bq_assistant_SF_crime.query_to_pandas(SF_CRIME_QUERY)

In [22]:
# Preview the first five rows of the daily crime counts.
df_SF_crime.head(n=5)

Unnamed: 0,count,date
0,190,201859
1,192,201858
2,167,201857
3,191,201856
4,223,201855


In [23]:
# Load the San Francisco census file: tab-separated, no header row.
df_SF_census = pd.read_csv(
    '../../data/censuspopulationsf.tsv',
    sep='\t',
    header=None,
)
# Name the two columns: calendar year and population.
df_SF_census.columns = ['year', 'pop']
# Show up to the first 20 rows.
df_SF_census.head(20)

Unnamed: 0,year,pop
0,2003,757638
1,2004,750133
2,2005,748846
3,2006,751431
4,2007,758348
5,2008,767067
6,2009,774347
7,2010,805770
8,2011,816294
9,2012,830406


In [24]:
# make column for counts per capita
# NOTE(review): the return value is discarded, so per_capita() presumably adds the
# `per_capita` column to df_SF_crime in place (the column appears in the merged
# frame below) — confirm in the utilities module. If so, re-running this cell
# operates on an already-modified frame.
util.per_capita(df_SF_crime, df_SF_census)
# merge the pollutant readings (CO, PM2.5, SO2) with the crime data
df_merged = util.merge_data(df_total, df_SF_crime)
# Preview the merged frame.
df_merged.head()

Unnamed: 0,amCO,date,amPPM25,amSO2,count,per_capita
3259,0.302174,200899,5.8,0.111039,275,0.000359
3266,0.487318,200893,15.9,2.183766,286,0.000373
3269,0.440489,2008927,8.2,1.705195,338,0.000441
3275,0.308786,2008921,6.7,0.044156,261,0.00034
3282,0.352989,2008915,11.5,0.392308,267,0.000348


In [25]:
# find optimum regressor
# Features: the `date` key plus the three pollutant means; target: `per_capita`.
# `.values` replaces `DataFrame.as_matrix()`, which is deprecated and was removed in
# pandas 1.0; it yields the same NumPy array and matches how the target is passed.
regr = rfr.find_regressor(
    df_merged[['date', 'amCO', 'amSO2', 'amPPM25']].values,
    df_merged['per_capita'].values,
)
# Display the selected estimator and its hyper-parameters.
regr

Fitting 3 folds for each of 10 candidates, totalling 30 fits


  


[CV] n_estimators=250, min_samples_split=8, max_depth=50, bootstrap=True 
[CV] n_estimators=250, min_samples_split=8, max_depth=50, bootstrap=True 
[CV] n_estimators=250, min_samples_split=8, max_depth=50, bootstrap=True 
[CV] n_estimators=275, min_samples_split=4, max_depth=80, bootstrap=True 
[CV]  n_estimators=250, min_samples_split=8, max_depth=50, bootstrap=True, total=   0.3s
[CV] n_estimators=275, min_samples_split=4, max_depth=80, bootstrap=True 
[CV]  n_estimators=250, min_samples_split=8, max_depth=50, bootstrap=True, total=   0.4s
[CV] n_estimators=275, min_samples_split=4, max_depth=80, bootstrap=True 
[CV]  n_estimators=275, min_samples_split=4, max_depth=80, bootstrap=True, total=   0.4s
[CV]  n_estimators=250, min_samples_split=8, max_depth=50, bootstrap=True, total=   0.4s
[CV] n_estimators=250, min_samples_split=2, max_depth=10, bootstrap=False 
[CV] n_estimators=250, min_samples_split=2, max_depth=10, bootstrap=False 
[CV]  n_estimators=250, min_samples_split=2, max_d

[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    2.8s finished


RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=10,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [70]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df_merged[['date', 'amCO', 'amSO2', 'amPPM25']],
    df_merged['per_capita'].values,
    test_size=0.33,
    random_state=42,
)
# Predict the held-out rows with the regressor selected above.
y_pred = rfr.fit_and_predict(regr, X_train, X_test, y_train, y_test)

  regr.fit(X_train.as_matrix(), y_train)
  return regr.predict(X_test.as_matrix())


In [71]:
# Score the held-out predictions with cv.MSE (presumably mean squared error); the
# value is shown as the cell output.
cv.MSE(y_test, y_pred)
# leave one out cross validation (disabled)
# NOTE(review): stale — it selects a column 'am' that df_merged does not have and
# passes arguments to cv.Cross_Validation in a different order than the live call
# in the following cell. Update or delete before re-enabling.
# loo = cross_validation.LeaveOneOut(len(df_merged['per_capita'].values))
# loo_score = cv.Cross_Validation(loo, regr, df_merged[['date','am']].as_matrix(), df_merged['per_capita'].values)

1.4768410446891544e-08

In [72]:
# Time-series-split cross validation with 10 folds on the same features and target.
tss_score = cv.Cross_Validation(
    df_merged[['date', 'amCO', 'amSO2', 'amPPM25']],
    df_merged['per_capita'],
    regr,
    10,
)
print('10-fold cross validation using time series split (additive): {} '.format(tss_score))

10-fold cross validation using time series split (additive): 1.5187209598765883e-08 


In [69]:
# Pairwise correlations between the numeric columns of the merged frame.
# `date` is a string key, so restrict to numeric columns explicitly: pandas >= 2.0
# no longer drops non-numeric columns from corr() automatically.
df_merged.select_dtypes(include=['number']).corr()

Unnamed: 0,amCO,amPPM25,amSO2,count,per_capita
amCO,1.0,0.519903,0.638521,0.158086,0.160318
amPPM25,0.519903,1.0,0.612923,0.203015,0.203981
amSO2,0.638521,0.612923,1.0,0.169749,0.172524
count,0.158086,0.203015,0.169749,1.0,0.999673
per_capita,0.160318,0.203981,0.172524,0.999673,1.0
