In [179]:
import pandas as pd
from google.cloud import bigquery
from bq_helper import BigQueryHelper #third party library to translate google query data to dataframe
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

import plotly
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.tools as tls
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as fig_fact
import os
plotly.tools.set_config_file(world_readable=True, sharing='public')

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, TheilSenRegressor

%matplotlib inline

with open('key.txt') as f:
    content = f.readlines()

#add your own key here 
os.environ['GOOGLE_APPLICATION_CREDENTIALS']= content[0]

In [180]:
EPA_QUERY = """
        SELECT
            avg(arithmetic_mean) as am,
            CONCAT(cast(EXTRACT(YEAR FROM CO_daily.date_local) as string),
            cast(EXTRACT(MONTH FROM CO_daily.date_local) as string)) as year_month
        FROM
          `bigquery-public-data.epa_historical_air_quality.co_daily_summary` as CO_daily
        WHERE state_name ="California" AND city_name="San Francisco"
        GROUP BY year_month
        ORDER BY year_month DESC
        """

bq_assistant_CO = BigQueryHelper("bigquery-public-data", "epa_historical_air_quality")
df_CO = bq_assistant.query_to_pandas(EPA_QUERY)

In [181]:
df_CO.head()

Unnamed: 0,am,year_month
0,0.479866,20179
1,0.51823,20178
2,0.428956,20177
3,0.313639,20176
4,0.284872,20175


In [147]:
SF_CRIME_QUERY = """
        SELECT
          COUNT(*) as count,
          CONCAT(CAST(EXTRACT(YEAR
              FROM
                SFCrimeData.timestamp) AS string), CAST(EXTRACT(MONTH
              FROM
                SFCrimeData.timestamp) AS string)) AS year_month
        FROM
          `bigquery-public-data.san_francisco_sfpd_incidents.sfpd_incidents` AS SFCrimeData
        GROUP BY
          year_month
        ORDER BY
          year_month DESC
        """
bq_assistant_SF_crime = BigQueryHelper("bigquery-public-data", "san_francisco_sfpd_incidents.sfpd_incidents")
df_SF_crime = bq_assistant_SF_crime.query_to_pandas(SF_CRIME_QUERY)

In [148]:
df_SF_crime.head(n=10)

Unnamed: 0,count,year_month
0,3644,20185
1,10306,20184
2,10740,20183
3,9947,20182
4,12031,20181
5,12684,20179
6,12872,20178
7,13171,20177
8,12605,20176
9,13267,20175


In [149]:
df_SF_census = pd.read_csv('censuspopulationsf.tsv', sep='\t', header=None)
df_SF_census.columns = ['year', 'pop']
df_SF_census.head(n=20)

Unnamed: 0,year,pop
0,2003,757638
1,2004,750133
2,2005,748846
3,2006,751431
4,2007,758348
5,2008,767067
6,2009,774347
7,2010,805770
8,2011,816294
9,2012,830406


In [167]:
for i in range(len(df_SF_census)):
    df_SF_crime.loc[list((df_SF_crime[df_SF_crime['year_month'].str.contains(str(df_SF_census.loc[i,'year']))]).index.values), 'per_capita'] = df_SF_crime[df_SF_crime['year_month'].str.contains(str(df_SF_census.loc[i,'year']))]['count']/df_SF_census.loc[i,'pop']

In [169]:
df_SF_crime.head()

Unnamed: 0,count,year_month,per_capita
0,3644,20185,0.00412
1,10306,20184,0.011654
2,10740,20183,0.012144
3,9947,20182,0.011248
4,12031,20181,0.013604


In [170]:
df_merged = pd.merge(df_CO, df_SF_crime, on=['year_month'], left_index=True)
df_merged.head()

Unnamed: 0,am,year_month,count,per_capita
5,0.479866,20179,12684,0.014343
6,0.51823,20178,12872,0.014555
7,0.428956,20177,13171,0.014893
8,0.313639,20176,12605,0.014253
9,0.284872,20175,13267,0.015002


In [171]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

param_dist = {"n_estimators": [100, 125, 150, 175, 200, 225, 250, 275, 300],
              "max_depth": [10, 20, 30, 40, 50, 60, 70, 80, 90, 100,None],
              "min_samples_split":  [2, 4, 6, 8, 10],
              "bootstrap": [True, False]}
search_regr=RandomForestRegressor(n_estimators= 100, random_state=42)
rf_random = RandomizedSearchCV(estimator = search_regr, param_distributions = param_dist, n_iter = 10, cv = 3, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(df_merged['am'].as_matrix().reshape(-1,1), df_merged['per_capita'].values)
rf_random.best_estimator_

Fitting 3 folds for each of 10 candidates, totalling 30 fits



Method .as_matrix will be removed in a future version. Use .values instead.



[CV] n_estimators=250, min_samples_split=8, max_depth=50, bootstrap=True 
[CV] n_estimators=275, min_samples_split=4, max_depth=80, bootstrap=True 
[CV] n_estimators=250, min_samples_split=8, max_depth=50, bootstrap=True 
[CV] n_estimators=250, min_samples_split=8, max_depth=50, bootstrap=True 
[CV]  n_estimators=250, min_samples_split=8, max_depth=50, bootstrap=True, total=   0.4s
[CV] n_estimators=275, min_samples_split=4, max_depth=80, bootstrap=True 
[CV]  n_estimators=275, min_samples_split=4, max_depth=80, bootstrap=True, total=   0.4s
[CV] n_estimators=275, min_samples_split=4, max_depth=80, bootstrap=True 
[CV]  n_estimators=250, min_samples_split=8, max_depth=50, bootstrap=True, total=   0.4s
[CV]  n_estimators=250, min_samples_split=8, max_depth=50, bootstrap=True, total=   0.4s
[CV] n_estimators=250, min_samples_split=2, max_depth=10, bootstrap=False 
[CV] n_estimators=250, min_samples_split=2, max_depth=10, bootstrap=False 
[CV]  n_estimators=250, min_samples_split=2, max_d

[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    3.1s finished


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=70,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=10,
           min_weight_fraction_leaf=0.0, n_estimators=175, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [172]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_merged['am'], df_merged['per_capita'].values, test_size=0.33, random_state=42)

rf_random.best_estimator_.fit(X_train.as_matrix().reshape(-1,1), y_train)


Method .as_matrix will be removed in a future version. Use .values instead.



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=70,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=10,
           min_weight_fraction_leaf=0.0, n_estimators=175, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [175]:
y_pred = rf_random.best_estimator_.predict(X_test.as_matrix().reshape(-1,1))


Method .as_matrix will be removed in a future version. Use .values instead.



In [176]:
from sklearn import cross_validation
loo = cross_validation.LeaveOneOut(len(df_merged['per_capita'].values))
scores = cross_validation.cross_val_score(rf_random.best_estimator_, df_merged['am'].as_matrix().reshape(-1,1), df_merged['per_capita'].values, scoring='mean_squared_error', cv=loo,)


Method .as_matrix will be removed in a future version. Use .values instead.


Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.


Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.


Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.


Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.


Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.


Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.


Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.


Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 


Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.


Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.


Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.


Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.


Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.


Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.


Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.


Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.


Scoring method mean_squared_error was renamed t


Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.


Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.


Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.


Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.


Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.


Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.


Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.


Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.


Scoring method mean_squared_error was renamed t


Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.


Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.


Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.


Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.


Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.


Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.


Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.


Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.


Scoring method mean_squared_error was renamed t


Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.


Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.


Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.


Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.


Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.


Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.


Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.


Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.


Scoring method mean_squared_error was renamed t

In [177]:
scores.mean()

-1.1431462573817735e-06