In [1]:
import numpy as np
import pandas as pd
# matplotlib for plotting
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')
from google.cloud import bigquery
from bq_helper import BigQueryHelper
%load_ext google.cloud.bigquery
import os
# For visualization
from bokeh.plotting import figure, output_file, show
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.palettes import Spectral6, brewer
from bokeh.transform import factor_cmap


# Point the Google Cloud client libraries at the service-account key file.
# setdefault keeps a credential path that is already exported in the
# environment, so the notebook also runs where the key is stored elsewhere;
# the file name below is only the fallback.
os.environ.setdefault("GOOGLE_APPLICATION_CREDENTIALS", "My Project-bbdce7b1712b.json")


In [2]:
# Helper bound to the public EPA historical air quality dataset; it is reused
# by every query cell below.
bq_assistant = BigQueryHelper("bigquery-public-data", "epa_historical_air_quality")

## We are only looking at SF Data
# Pull every column of the daily CO summary rows for San Francisco, California.
query="""
SELECT * FROM `bigquery-public-data.epa_historical_air_quality.co_daily_summary`
where state_name ="California" AND city_name="San Francisco"
"""

# Run the query and keep the result as a DataFrame for the inspection cells that follow.
df = bq_assistant.query_to_pandas(query)
##df=pd.read_csv('carbon-monoxide-results-20181007-145932.csv')

In [3]:
df.sample_duration.unique()

array(['1 HOUR', '8-HR RUN AVG END HOUR'], dtype=object)

Check the number of missing values in each column

In [4]:
df.isna().sum()

state_code                 0
county_code                0
site_num                   0
parameter_code             0
poc                        0
latitude                   0
longitude                  0
datum                      0
parameter_name             0
sample_duration            0
pollutant_standard         0
date_local                 0
units_of_measure           0
event_type                 0
observation_count          0
observation_percent        0
arithmetic_mean            0
first_max_value            0
first_max_hour             0
aqi                    16396
method_code            16398
method_name                0
local_site_name            0
address                    0
state_name                 0
county_name                0
city_name                  0
cbsa_name                  0
date_of_last_change        0
dtype: int64

# Data Preprocessing

Replace missing values in aqi and method_code with the column mean (don't uncomment this)

In [5]:
# from sklearn.preprocessing import Imputer

# imp=Imputer(missing_values='NaN',strategy='mean')

# ## replace missing values in aqi and method code with mean
# df["aqi"]=imp.fit_transform(df[["aqi"]]).ravel()
# df["method_code"]=imp.fit_transform(df[["method_code"]]).ravel()


## Data Visualization (for the years 1990-2017)

We will measure average aqi for different gases (CO,O3,NO2,SO2)

# Bar Graph for the average AQI over the years for the 4 gases

In [6]:
# For visualization
from bokeh.plotting import figure, output_file, show
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.palettes import Spectral6, brewer
from bokeh.transform import factor_cmap


Avg Air Quality Index for CO over the years in San Francisco

In [7]:
# Yearly mean of the daily CO AQI values for San Francisco, rounded to two
# decimals and ordered by year. The result has the columns `year` and `avg_aqi`.
QUERY = """
    SELECT
        EXTRACT(YEAR FROM date_local) as year,
        round(avg(aqi),2) as avg_aqi
    FROM
      `bigquery-public-data.epa_historical_air_quality.co_daily_summary`
    WHERE
       state_name ="California" AND city_name="San Francisco"
    GROUP BY year
    ORDER BY year ASC
        """
df_co = bq_assistant.query_to_pandas(QUERY)

In [8]:
df_co.year = df_co.year.astype(str)

In [9]:
## Reference https://bokeh.pydata.org/en/latest/docs/user_guide/categorical.html
## Bar graph of the yearly average CO AQI, written to a standalone HTML file
output_file('average_aqi_CO_over_the_years.html')

source = ColumnDataSource(df_co)
# The year column was cast to str in the previous cell, so the values can be
# used directly as the categorical factors of the x axis.
years = source.data['year'].tolist()
p = figure(x_range=years, plot_width=1200, plot_height=800)

# No factor_cmap here: the mapper built by the earlier version was never passed
# to vbar, and Spectral6 only has six colours for a longer list of years.
p.vbar(x='year', top='avg_aqi', source=source, width=0.90)

p.title.text ='Average AQI of Carbon monoxide in different years'
p.xaxis.axis_label = 'Years'
p.yaxis.axis_label = "Average AQI of Carbon monoxide"

show(p)




Avg Air Quality Index for O3 over the years in San Francisco

In [10]:
# Yearly mean of the daily ozone AQI values for San Francisco, rounded to two
# decimals and ordered by year. The result has the columns `year` and `avg_aqi`.
QUERY = """
    SELECT
        EXTRACT(YEAR FROM date_local) as year,
        round(avg(aqi),2) as avg_aqi
    FROM
      `bigquery-public-data.epa_historical_air_quality.o3_daily_summary`
    WHERE
       state_name ="California" AND city_name="San Francisco"
    GROUP BY year
    ORDER BY year ASC
        """
df_o3 = bq_assistant.query_to_pandas(QUERY)



In [11]:
df_o3.year = df_o3.year.astype(str)

In [12]:
## Reference https://bokeh.pydata.org/en/latest/docs/user_guide/categorical.html
## Bar graph of the yearly average ozone AQI, written to a standalone HTML file
output_file('average_aqi_O3_over_the_years.html')

source = ColumnDataSource(df_o3)
# The year column was cast to str in the previous cell, so the values can be
# used directly as the categorical factors of the x axis.
years = source.data['year'].tolist()
p = figure(x_range=years, plot_width=1200, plot_height=800)

# No factor_cmap here: the mapper built by the earlier version was never passed
# to vbar, and Spectral6 only has six colours for a longer list of years.
p.vbar(x='year', top='avg_aqi', source=source, width=0.90)

p.title.text ='Average AQI of Ozone in different years'
p.xaxis.axis_label = 'Years'
p.yaxis.axis_label = "Average AQI of Ozone"

show(p)




Avg Air Quality Index for NO2 over the years in San Francisco

In [13]:
# Yearly mean of the daily NO2 AQI values for San Francisco, rounded to two
# decimals and ordered by year. The result has the columns `year` and `avg_aqi`.
QUERY = """
    SELECT
        EXTRACT(YEAR FROM date_local) as year,
        round(avg(aqi),2) as avg_aqi
    FROM
      `bigquery-public-data.epa_historical_air_quality.no2_daily_summary`
    WHERE
       state_name ="California" AND city_name="San Francisco"
    GROUP BY year
    ORDER BY year ASC
        """
df_no2 = bq_assistant.query_to_pandas(QUERY)



In [14]:
df_no2.year = df_no2.year.astype(str)

In [15]:
## Reference https://bokeh.pydata.org/en/latest/docs/user_guide/categorical.html
## Bar graph of the yearly average NO2 AQI, written to a standalone HTML file
output_file('average_aqi_no2_over_the_years.html')

source = ColumnDataSource(df_no2)
# The year column was cast to str in the previous cell, so the values can be
# used directly as the categorical factors of the x axis.
years = source.data['year'].tolist()
p = figure(x_range=years, plot_width=1200, plot_height=800)

# No factor_cmap here: the mapper built by the earlier version was never passed
# to vbar, and Spectral6 only has six colours for a longer list of years.
p.vbar(x='year', top='avg_aqi', source=source, width=0.90)

p.title.text ='Average AQI of Nitrogen dioxide  in different years'
p.xaxis.axis_label = 'Years'
p.yaxis.axis_label = "Average AQI of Nitrogen dioxide "

show(p)




In [16]:
## Avg Air Quality Index for SO2 over the years in San Francisco

# Yearly mean of the daily SO2 AQI values, rounded to two decimals, ordered by year.
QUERY = """
    SELECT
        EXTRACT(YEAR FROM date_local) as year,
        round(avg(aqi),2) as avg_aqi
    FROM
      `bigquery-public-data.epa_historical_air_quality.so2_daily_summary`
    WHERE
       state_name ="California" AND city_name="San Francisco"
    GROUP BY year
    ORDER BY year ASC
        """
df_so2 = bq_assistant.query_to_pandas(QUERY)

# Years become strings so they can serve as categorical factors on the x axis
# (and as the merge key shared with the other pollutant frames).
df_so2.year = df_so2.year.astype(str)

## Reference https://bokeh.pydata.org/en/latest/docs/user_guide/categorical.html
## Bar graph
output_file('average_aqi_so2_over_the_years.html')

source = ColumnDataSource(df_so2)
years = source.data['year'].tolist()
p = figure(x_range=years, plot_width=1200, plot_height=800)

# No factor_cmap here: the mapper built by the earlier version was never passed
# to vbar, and Spectral6 only has six colours for a longer list of years.
p.vbar(x='year', top='avg_aqi', source=source, width=0.90)

p.title.text ='Average AQI of Sulphur dioxide  in different years'
p.xaxis.axis_label = 'Years'
p.yaxis.axis_label = "Average AQI of Sulphur dioxide "

show(p)




Avg Air Quality Index for Particulate Matter over the years in San Francisco

In [17]:
## Avg Air Quality Index for Particulate Matter over the years in San Francisco

# Yearly mean of the daily PM2.5 (FRM) AQI values, rounded to two decimals, ordered by year.
QUERY = """
    SELECT
        EXTRACT(YEAR FROM date_local) as year,
        round(avg(aqi),2) as avg_aqi
    FROM
      `bigquery-public-data.epa_historical_air_quality.pm25_frm_daily_summary`
    WHERE
       state_name ="California" AND city_name="San Francisco"
    GROUP BY year
    ORDER BY year ASC
        """
df_pm25 = bq_assistant.query_to_pandas(QUERY)

# Years become strings so they can serve as categorical factors on the x axis
# (and as the merge key shared with the other pollutant frames).
df_pm25.year = df_pm25.year.astype(str)

## Reference https://bokeh.pydata.org/en/latest/docs/user_guide/categorical.html
## Bar graph
output_file('average_aqi_pm25_over_the_years.html')

source = ColumnDataSource(df_pm25)
years = source.data['year'].tolist()
p = figure(x_range=years, plot_width=1200, plot_height=800)

# No factor_cmap here: the mapper built by the earlier version was never passed
# to vbar, and Spectral6 only has six colours for a longer list of years.
p.vbar(x='year', top='avg_aqi', source=source, width=0.90)

p.title.text ='Average AQI of Particulate Matter in different years'
p.xaxis.axis_label = 'Years'
p.yaxis.axis_label = "Average AQI of Particulate Matter"

show(p)




In [18]:
## RENAME THE COLUMNS OF THE 5 DFS 
## https://stackoverflow.com/questions/11346283/renaming-columns-in-pandas
# Give every frame a pollutant-specific value column so the frames can be
# merged on `year` without their `avg_aqi` columns colliding.
for frame, suffix in (
    (df_co, 'CO'),
    (df_no2, 'NO2'),
    (df_o3, 'O3'),
    (df_so2, 'SO2'),
    (df_pm25, 'PM25'),
):
    frame.rename(columns={'avg_aqi': 'avg_aqi_' + suffix}, inplace=True)

# Compare the time series graphs of all 5 pollutants

In [19]:
## Join the CO, NO2 and O3 frames into one table keyed by year
from functools import reduce


def _merge_on_year(left, right):
    """Inner-join two yearly frames on their shared `year` column."""
    return pd.merge(left, right, on='year')


## Reference: https://stackoverflow.com/questions/23668427/pandas-three-way-joining-multiple-dataframes-on-columns
frames=[df_co,df_no2,df_o3]
# Fold the list left to right: (CO join NO2) join O3.
df_final = reduce(_merge_on_year, frames)


In [20]:
df_final

Unnamed: 0,year,avg_aqi_CO,avg_aqi_NO2,avg_aqi_O3
0,1990,26.14,33.2,16.03
1,1991,25.52,36.59,16.88
2,1992,23.96,32.33,17.71
3,1993,21.4,34.21,17.42
4,1994,17.24,32.91,17.68
5,1995,16.33,32.3,22.51
6,1996,15.55,32.71,22.31
7,1997,13.96,30.38,20.47
8,1998,13.38,29.95,21.25
9,1999,13.87,32.11,22.16


In [21]:
# One colour and one legend label per value column of df_final, in column order.
colors = ['red', 'blue', 'green']
labels=['CARBON MONOXIDE','NITROGEN DIOXIDE','OZONE']

# Every column of the merged frame except the join key is a series to plot.
cols=df_final.columns.tolist()
del cols[cols.index('year')]

In [22]:
output_file('COMPARISON_AQI.html')

##https://www.geeksforgeeks.org/python-iterate-multiple-lists-simultaneously/
l = figure(title="COMPARISON OF AQI OF DIFFERENT GASES FROM 1990 TO 2017", logo=None,width=1000, height=500)


def _year_values(frame):
    """Return the `year` column of `frame` as a list of ints.

    The figure above is created without a categorical x_range, so its x axis is
    numeric; the year strings produced by the earlier astype(str) calls have to
    be converted back to numbers to be drawn on it.
    """
    return frame['year'].astype(int).tolist()


# CO, NO2 and O3 are read from the merged frame, one line per value column.
for color,label,col in zip(colors, labels, cols):
    source = ColumnDataSource(data=dict(x=_year_values(df_final), y=df_final[col].tolist()))
    l.line(x='x',y='y',source=source, legend=label, color=color,line_width=3)

# SO2 and PM2.5 are drawn from their own frames (they are not part of df_final).
source = ColumnDataSource(data=dict(x=_year_values(df_so2), y=df_so2['avg_aqi_SO2'].tolist()))
l.line(x='x',y='y',source=source, legend='SULPHUR DIOXIDE', color="magenta",line_width=5)

source = ColumnDataSource(data=dict(x=_year_values(df_pm25), y=df_pm25['avg_aqi_PM25'].tolist()))
l.line(x='x',y='y',source=source, legend='PARTICULATE MATTER', color="indigo",line_width=5)

l.xaxis.axis_label = 'YEAR'
l.yaxis.axis_label = "AVERAGE AQI"
# This replaces the title passed to figure() above.
l.title.text ='Comparison of AQI of CO,O3,NO2,SO2 AND PM'

l.legend.location = "top_right"
# Clicking a legend entry toggles the visibility of its line.
l.legend.click_policy="hide"

show(l)

DON'T GO BEYOND THIS FOR NOW

In [23]:
# Left joins keep every year of df_final; years without an SO2 or PM2.5
# reading end up as NaN in the corresponding column.
result = df_final.merge(df_so2, how='left', on='year')
df_merged = result.merge(df_pm25, how='left', on='year')
df_merged

Unnamed: 0,year,avg_aqi_CO,avg_aqi_NO2,avg_aqi_O3,avg_aqi_SO2,avg_aqi_PM25
0,1990,26.14,33.2,16.03,6.43,
1,1991,25.52,36.59,16.88,7.33,
2,1992,23.96,32.33,17.71,7.45,
3,1993,21.4,34.21,17.42,6.32,
4,1994,17.24,32.91,17.68,3.52,
5,1995,16.33,32.3,22.51,5.59,
6,1996,15.55,32.71,22.31,4.88,
7,1997,13.96,30.38,20.47,4.81,
8,1998,13.38,29.95,21.25,4.08,
9,1999,13.87,32.11,22.16,6.04,55.62


Impute missing values in so2 and pm25

In [24]:
# `Imputer` was removed from sklearn.preprocessing in newer scikit-learn
# releases; `SimpleImputer` is its replacement. Prefer the new class and fall
# back to the old one so the cell runs on either version.
try:
    from sklearn.impute import SimpleImputer
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
except ImportError:
    from sklearn.preprocessing import Imputer
    imp = Imputer(missing_values='NaN', strategy='mean')

## Replace the missing yearly SO2 and PM2.5 averages with the mean of each column.
## `imp` is reused by later cells for other columns.
df_merged["avg_aqi_SO2"]=imp.fit_transform(df_merged[["avg_aqi_SO2"]]).ravel()
df_merged["avg_aqi_PM25"]=imp.fit_transform(df_merged[["avg_aqi_PM25"]]).ravel()


SF INCIDENTS DATABASE INCIDENTS PER YEAR

In [25]:
query_sf=""" SELECT EXTRACT(YEAR FROM timestamp) as year,count(distinct unique_key) as no_of_incidents
FROM `bigquery-public-data.san_francisco.sfpd_incidents`
group by year
order by year asc 
"""

df_sf_incidents = bq_assistant.query_to_pandas(query_sf)


In [26]:
# The pollutant frames key on a string year, so cast the incident years to
# match before joining.
df_sf_incidents['year'] = df_sf_incidents['year'].astype(str)
# Left join: every pollutant year is kept, with NaN where no incidents were counted.
df_combined = df_merged.merge(df_sf_incidents, how='left', on='year')

In [27]:
df_combined

Unnamed: 0,year,avg_aqi_CO,avg_aqi_NO2,avg_aqi_O3,avg_aqi_SO2,avg_aqi_PM25,no_of_incidents
0,1990,26.14,33.2,16.03,6.43,40.171579,
1,1991,25.52,36.59,16.88,7.33,40.171579,
2,1992,23.96,32.33,17.71,7.45,40.171579,
3,1993,21.4,34.21,17.42,6.32,40.171579,
4,1994,17.24,32.91,17.68,3.52,40.171579,
5,1995,16.33,32.3,22.51,5.59,40.171579,
6,1996,15.55,32.71,22.31,4.88,40.171579,
7,1997,13.96,30.38,20.47,4.81,40.171579,
8,1998,13.38,29.95,21.25,4.08,40.171579,
9,1999,13.87,32.11,22.16,6.04,55.62,


In [28]:
df_combined["no_of_incidents"]=imp.fit_transform(df_combined[["no_of_incidents"]]).ravel()

In [29]:
df_combined.dtypes

year                object
avg_aqi_CO         float64
avg_aqi_NO2        float64
avg_aqi_O3         float64
avg_aqi_SO2        float64
avg_aqi_PM25       float64
no_of_incidents    float64
dtype: object

Applying Regression

In [None]:

## Serialize df to a pickle object
## Skip every line above
import pickle

# Persist the combined frame so the modelling cells can start from the file
# instead of re-running the BigQuery cells. The context manager closes the
# file even if pickle.dump raises.
with open('df_merged_frame.pickle','wb') as picle_out:
    pickle.dump(df_combined,picle_out)

In [None]:
## Deserialize the pickle object to get back the data frame
import pickle

# The handle was previously left open; the context manager closes it after the load.
# NOTE: pickle.load can execute arbitrary code - only load files written by this notebook.
with open('df_merged_frame.pickle','rb') as picle_in:
    df=pickle.load(picle_in)

In [None]:
## Load the pickled population frame (it provides the `year` and `Population`
## columns used by the cells below).
# NOTE(review): this file is not produced anywhere in this notebook - confirm its
# origin, since pickle.load can execute arbitrary code from an untrusted file.
with open('population/dataframe.pickle','rb') as picle_in:
    df_pop_per_yr=pickle.load(picle_in)

In [None]:
## Turn the year into text and drop a trailing ".0" (e.g. "2010.0" -> "2010")
## so it matches the string year key of the pollutant frame.
# Raw string avoids the invalid "\." escape; the "$" anchor restricts the
# replacement to the end of the value.
df_pop_per_yr['year'] = df_pop_per_yr['year'].astype(str).replace(r'\.0$', '', regex=True)

In [None]:
df_comb = pd.merge(df, df_pop_per_yr, how='inner', on='year')

In [None]:
df_comb['Incidents_per_population']=df_comb['no_of_incidents']/df_comb['Population']

In [None]:
df_pop_per_yr.columns

In [None]:
df_comb.drop(['no_of_incidents','Population'], axis=1,inplace=True)

In [None]:
df=df_comb

In [None]:
# Correlate the numeric columns only. `year` is stored as text, and how
# DataFrame.corr treats non-numeric columns differs between pandas versions.
df.select_dtypes(include='number').corr()

In [None]:
# ## Optional Standard Scaling
# from sklearn.preprocessing import StandardScaler
# df[['avg_aqi_CO', 'avg_aqi_NO2','avg_aqi_O3','avg_aqi_SO2','avg_aqi_PM25']] = StandardScaler().fit_transform(df[['avg_aqi_CO', 'avg_aqi_NO2','avg_aqi_O3','avg_aqi_SO2','avg_aqi_PM25']])

In [None]:
## Optional: min-max scale each pollutant AQI column
from sklearn.preprocessing import MinMaxScaler

aqi_columns = ['avg_aqi_CO', 'avg_aqi_NO2', 'avg_aqi_O3', 'avg_aqi_SO2', 'avg_aqi_PM25']
# Fit and apply the scaler in one step, then write the scaled values back in place.
aqi_scaled = MinMaxScaler().fit_transform(df[aqi_columns])
df[aqi_columns] = aqi_scaled

In [None]:
df

In [None]:
df_extra=pd.read_excel('population/SFCRIMERATE.xlsx')

Apply Imputation

In [None]:
# Columns of the spreadsheet whose gaps are filled with the column mean
# (same `imp` imputer object that was configured earlier in the notebook).
extra_columns = [
    'Median age', 'Average household size', 'Average family size',
    'Median value owner occupied unit', 'Median household income',
    'Median family income', 'Per capita income', 'Crime Index',
]
df_extra[extra_columns] = imp.fit_transform(df_extra[extra_columns])

In [None]:
df_extra.Year=df_extra.Year.astype(str)

Apply Scaling

In [None]:
# Min-max scale every numeric spreadsheet column; note that the target column
# 'Crime Index' is scaled together with the features.
scaled_columns = [
    'Median age', 'Average household size', 'Average family size',
    'Median value owner occupied unit', 'Median household income',
    'Median family income', 'Per capita income', 'Crime Index',
]
df_extra[scaled_columns] = MinMaxScaler().fit_transform(df_extra[scaled_columns])

In [None]:
df_extra[['Median age', 'Average household size', 'Average family size',
       'Median value owner occupied unit', 'Median household income',
       'Median family income', 'Per capita income', 'Crime Index']].round(2)

In [None]:
df[['avg_aqi_CO', 'avg_aqi_NO2', 'avg_aqi_O3', 'avg_aqi_SO2',
       'avg_aqi_PM25','Incidents_per_population']].round(2)

In [None]:
dataframe_final=pd.merge(df, df_extra, how='inner', left_on='year',right_on='Year')

In [None]:
dataframe_final.drop(['year'],axis=1,inplace=True)

In [None]:
dataframe_final.columns

In [None]:
dataframe_final

In [None]:
# Correlate the numeric columns only. `Year` is stored as text, and how
# DataFrame.corr treats non-numeric columns differs between pandas versions.
dataframe_final.select_dtypes(include='number').corr()

In [None]:
cols=['avg_aqi_CO', 'avg_aqi_NO2', 'avg_aqi_O3',
       'avg_aqi_PM25', 'Median age',
       'Average household size', 'Average family size',
       'Median value owner occupied unit', 'Median household income',
       'Median family income', 'Per capita income']

In [None]:
X=dataframe_final[cols]

In [None]:
Y=dataframe_final['Crime Index']

In [None]:
X.shape

In [None]:
Y.shape

In [None]:
# Rank every numeric column by its correlation with the target, strongest positive first
print("Find most important features relative to target")
# Restrict to numeric columns: `Year` is text, and DataFrame.corr handles
# non-numeric columns differently across pandas versions.
corr = dataframe_final.select_dtypes(include='number').corr()
corr.sort_values(['Crime Index'], ascending = False, inplace = True)
print(corr['Crime Index'])

In [None]:
import statsmodels.api as sm


# Ordinary least squares of 'Crime Index' (Y) on the selected feature columns (X).
# Note the difference in argument order: statsmodels takes the target first,
# unlike the scikit-learn fit(X, Y) calls used later in this notebook.
# NOTE(review): X is passed without an added constant column, so this fit
# presumably has no intercept term - confirm that this is intended.
model = sm.OLS(Y, X).fit()
predictions = model.predict(X) # in-sample predictions on the same rows the model was fitted on

# Print out the statistics
model.summary()

In [None]:
from sklearn import linear_model

# Fit a linear regression on all rows (no train/test split yet).
lm = linear_model.LinearRegression()
# `model` is rebound here, replacing the statsmodels result from the previous cell.
model = lm.fit(X,Y)

# In-sample predictions: the model is scored on the data it was trained on.
predictions = lm.predict(X)
# NOTE(review): for a regressor, score() is the R^2 of the fit rather than a
# classification accuracy, so the printed label is loose - confirm the wording.
print('Accuracy of model=',lm.score(X,Y))


In [None]:
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt


# Hold out 30% of the rows for testing. The fixed random_state makes the split,
# and therefore every RMSE / R2 value reported below, repeatable across kernel
# restarts; without it each run compares the models on a different split.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
print (X_train.shape, y_train.shape)
print (X_test.shape, y_test.shape)


# Linear Models

## 1. Linear Regression

In [None]:
# fit a model
lm = linear_model.LinearRegression()
model = lm.fit(X_train, y_train)
predictions = lm.predict(X_test)

from sklearn import metrics
                               
print(np.sqrt(metrics.mean_squared_error(y_test, predictions)))


In [None]:
mods=['Linear_Regression']
vals=[]
## Vals contains rmse for test set
vals.append(np.sqrt(metrics.mean_squared_error(y_test, predictions))) 

In [None]:
## The line / model
plt.scatter(y_test, predictions)
plt.xlabel('True Values')
plt.ylabel('Predictions')

In [None]:
## R2 score Evaluation
y_train_pred=model.predict(X_train)
y_test_pred=model.predict(X_test)

from sklearn.metrics import r2_score
from sklearn import metrics

print("r2 train: %.3f, test : %.3f" %(r2_score(y_train,y_train_pred),r2_score(y_test,y_test_pred) ))

print('Rmse VAlue is:')
print(np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)))

In [None]:
X_train.columns

# Feature Importance in Linear Regression

In [None]:
# Coefficient of every feature, ordered from most negative to most positive.
coefs = pd.Series(model.coef_, index = X_train.columns)
sorted_coefs = coefs.sort_values()
# Show only the 10 smallest and 10 largest when there are more than 20 features.
# With 20 or fewer, head(10) and tail(10) overlap and the same feature would be
# drawn twice, so plot every coefficient once instead.
if len(sorted_coefs) > 20:
    imp_coefs = pd.concat([sorted_coefs.head(10), sorted_coefs.tail(10)])
else:
    imp_coefs = sorted_coefs
imp_coefs.plot(kind = "barh")
plt.title("Coefficients in the Linear Regression Model")
plt.show()

## 2. Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score

# Fit a regression tree (depth capped at 20) on the training split
regressor = DecisionTreeRegressor(max_depth=20)
regressor.fit(X_train, y_train)

# Predictions for both splits; y_pred and y_test_pred both hold the test-set predictions
y_train_pred = regressor.predict(X_train)
y_test_pred = regressor.predict(X_test)
y_pred = regressor.predict(X_test)

# RMSE on the held-out rows, followed by R2 on both splits
tree_rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
print(tree_rmse)
print("r2 train: %.3f, test : %.3f" %(r2_score(y_train,y_train_pred),r2_score(y_test,y_test_pred) ))

# Record the test RMSE for the model comparison chart
mods.append('Decision_Tree_Regressor')
vals.append(tree_rmse)

In [None]:
# Feature importance of every column according to the fitted tree, smallest first.
coefs = pd.Series(regressor.feature_importances_, index = X_train.columns)
sorted_coefs = coefs.sort_values()
# Show only the 10 smallest and 10 largest when there are more than 20 features.
# With 20 or fewer, head(10) and tail(10) overlap and the same feature would be
# drawn twice, so plot every value once instead.
if len(sorted_coefs) > 20:
    imp_coefs = pd.concat([sorted_coefs.head(10), sorted_coefs.tail(10)])
else:
    imp_coefs = sorted_coefs
imp_coefs.plot(kind = "barh")
plt.title("Coefficients in the Decision Tree Regression Model")
plt.show()

## 3. Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# criterion is left at its default: the former explicit 'mse' value is the
# squared-error default and is no longer accepted under that name by recent
# scikit-learn releases, so omitting it gives the same model on old and new versions.
forest=RandomForestRegressor(n_estimators=1000,random_state=1,n_jobs=-1)
forest.fit(X_train,y_train)

# make predictions on the testing set
y_pred = forest.predict(X_test)

# compute the RMSE of our predictions
forest_rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
print(forest_rmse)

## R2 score Evaluation on both splits
y_train_pred=forest.predict(X_train)
y_test_pred=forest.predict(X_test)

print("r2 train: %.3f, test : %.3f" %(r2_score(y_train,y_train_pred),r2_score(y_test,y_test_pred) ))

# Record the test RMSE for the model comparison chart
mods.append('Random Forest Regressor')
vals.append(forest_rmse)


In [None]:
# Feature importance of every column according to the fitted forest, smallest first.
coefs = pd.Series(forest.feature_importances_, index = X_train.columns)
sorted_coefs = coefs.sort_values()
# Show only the 10 smallest and 10 largest when there are more than 20 features.
# With 20 or fewer, head(10) and tail(10) overlap and the same feature would be
# drawn twice, so plot every value once instead.
if len(sorted_coefs) > 20:
    imp_coefs = pd.concat([sorted_coefs.head(10), sorted_coefs.tail(10)])
else:
    imp_coefs = sorted_coefs
imp_coefs.plot(kind = "barh")
plt.title("Coefficients in the Random Forest Regression Model")
plt.show()

# Regularized Linear Models

In [None]:
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV,Lasso
from sklearn.model_selection import cross_val_score

def _rmse_cv(model, features, target, cv=5):
    """Return the per-fold RMSE of `model` cross-validated on (features, target)."""
    neg_mse = cross_val_score(model, features, target,
                              scoring="neg_mean_squared_error", cv=cv)
    # The scorer reports the negated MSE, so flip the sign before taking the root.
    return np.sqrt(-neg_mse)


def rmse_cv_train(model, cv=5):
    """Cross-validated RMSE of `model` on the training split.

    Parameters
    ----------
    model : estimator passed on to cross_val_score.
    cv : number of folds (or any value cross_val_score accepts); defaults to 5.

    Returns
    -------
    Array with one RMSE value per fold. Reads the module-level X_train / y_train.
    """
    return _rmse_cv(model, X_train, y_train, cv)


def rmse_cv_test(model, cv=5):
    """Cross-validated RMSE of `model` on the held-out split.

    Same contract as rmse_cv_train, but reads the module-level X_test / y_test.
    Note that the model is re-fitted on folds of the test split itself; it is
    not the error of a model trained on the training split.
    """
    return _rmse_cv(model, X_test, y_test, cv)

## 1. Ridge Regression

In [None]:
model_ridge = Ridge()
alphas = [0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 75]
cv_ridge = [rmse_cv_test(Ridge(alpha = alpha)).mean() 
            for alpha in alphas]

cv_ridge = pd.Series(cv_ridge, index = alphas)
cv_ridge.plot(title = "Validation - Just Do It")
plt.xlabel("alpha")
plt.ylabel("rmse")

In [None]:
cv_ridge.min()

In [None]:
mods.append('Ridge Regressor')
vals.append(cv_ridge.min())

In [None]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

# Let RidgeCV pick the regularisation strength from the alpha grid, then fit on the training split.
model_ridge=RidgeCV(alphas=alphas)
model_ridge.fit(X_train, y_train)
# Not the last expression of the cell, so this value is computed but not displayed.
rmse_cv_test(model_ridge).mean()

coef = pd.Series(model_ridge.coef_, index = X_train.columns)
sorted_coef = coef.sort_values()
# Show only the 10 smallest and 10 largest when there are more than 20 features.
# With 20 or fewer, head(10) and tail(10) overlap and the same feature would be
# drawn twice, so plot every coefficient once instead.
if len(sorted_coef) > 20:
    imp_coef = pd.concat([sorted_coef.head(10), sorted_coef.tail(10)])
else:
    imp_coef = sorted_coef

matplotlib.rcParams['figure.figsize'] = (8.0, 10.0)
imp_coef.plot(kind = "barh")
plt.title("Coefficients in the Ridge Model")


# 2. Lasso Regression

In [None]:
model_lasso = LassoCV()
alphas = [1, 0.1, 0.001, 0.0005]
cv_lasso = [rmse_cv_test(Lasso(alpha = alpha)).mean() 
            for alpha in alphas]

cv_lasso = pd.Series(cv_lasso, index = alphas)
cv_lasso.plot(title = "Validation - Just Do It")
plt.xlabel("alpha")
plt.ylabel("rmse")

In [None]:
# cv_lasso

In [None]:
# model_lasso = Lasso(alpha=cv_lasso.idxmin)
# model_lasso.fit(X_train, y_train)
# # rmse_cv_test(model_lasso).mean()

In [None]:
mods.append('Lasso Regressor')
# Record the same quantity as the Ridge entry: the lowest mean cross-validated
# RMSE on the test split over the alpha grid (cv_lasso, computed above). The
# previous value was the smallest single-fold RMSE on the training split,
# which is not comparable with the other bars of the comparison chart.
vals.append(cv_lasso.min())

In [None]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

# Fit an actual Lasso model: this cell used RidgeCV (copied from the Ridge
# section), so the chart titled "Lasso" showed ridge coefficients.
model_lasso=LassoCV(alphas=alphas)
model_lasso.fit(X_train, y_train)
# Not the last expression of the cell, so this value is computed but not displayed.
rmse_cv_test(model_lasso).mean()

coef = pd.Series(model_lasso.coef_, index = X_train.columns)
sorted_coef = coef.sort_values()
# Show only the 10 smallest and 10 largest when there are more than 20 features.
# With 20 or fewer, head(10) and tail(10) overlap and the same feature would be
# drawn twice, so plot every coefficient once instead.
if len(sorted_coef) > 20:
    imp_coef = pd.concat([sorted_coef.head(10), sorted_coef.tail(10)])
else:
    imp_coef = sorted_coef

matplotlib.rcParams['figure.figsize'] = (8.0, 10.0)
imp_coef.plot(kind = "barh")
plt.title("Coefficients in the Lasso Model")


In [None]:

# Give this chart its own HTML file. Without a new output_file() call, show()
# reuses whatever target an earlier cell configured (the last one set in this
# notebook is COMPARISON_AQI.html) and overwrites that chart.
output_file('comparison_of_models.html')

# One row per model; despite its name, 'Accuracy_Value' holds the RMSE values collected in `vals`.
ser_df=pd.DataFrame({'Model': mods, 'Accuracy_Value': vals})

source = ColumnDataSource(ser_df)

# Model names are the categorical factors of the x axis and of the colour mapping.
p = figure(x_range=mods, plot_width=900, plot_height=500)
color_map = factor_cmap(field_name='Model', palette=Spectral6, factors=mods)
p.vbar(x='Model', top='Accuracy_Value', source=source, width=0.70, color=color_map)

p.title.text ='Comparison of Models'
p.xaxis.axis_label = 'Model Types'
p.yaxis.axis_label = "RMSE value of different models On Test Set"

show(p)

Ignore Everything below

In [None]:
# # Adding an xgboost model:¶










# # import xgboost as xgb

# # dtrain = xgb.DMatrix(X, label = Y)
# # dtest = xgb.DMatrix(X_test)

# # params = {"max_depth":2, "eta":0.1}
# # model = xgb.cv(params, dtrain,  num_boost_round=500, early_stopping_rounds=100)
# # model.loc[30:,["test-rmse-mean", "train-rmse-mean"]].plot()


# # model_xgb = xgb.XGBRegressor(n_estimators=360, max_depth=2, learning_rate=0.1) #the params were tuned using xgb.cv
# # model_xgb.fit(X, Y)





# ## Reference https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard

# from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
# from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
# from sklearn.kernel_ridge import KernelRidge
# from sklearn.pipeline import make_pipeline
# from sklearn.preprocessing import RobustScaler
# from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
# from sklearn.model_selection import KFold, cross_val_score, train_test_split
# from sklearn.metrics import mean_squared_error
# import xgboost as xgb

# #Validation function
# n_folds = 5

# def rmsle_cv(model):
#     kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(X.values)
#     rmse= np.sqrt(-cross_val_score(model, X.values, Y, scoring="neg_mean_squared_error", cv = kf))
#     return(rmse)

# # LASSO Regression

# lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))

# # Elastic Net Regression

# ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))


# # Kernel Ridge Regression

# KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)


# # Gradient Boosting Regression

# GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
#                                    max_depth=4, max_features='sqrt',
#                                    min_samples_leaf=15, min_samples_split=10, 
#                                    loss='huber', random_state =5)

# # XGBoost

# model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
#                              learning_rate=0.05, max_depth=3, 
#                              min_child_weight=1.7817, n_estimators=2200,
#                              reg_alpha=0.4640, reg_lambda=0.8571,
#                              subsample=0.5213, silent=1,
#                              random_state =7, nthread = -1)

# # Model Evaluation Scores

# score = rmsle_cv(lasso)
# print("\nLasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

# score = rmsle_cv(ENet)
# print("ElasticNet score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

# score = rmsle_cv(KRR)
# print("Kernel Ridge score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

# score = rmsle_cv(GBoost)
# print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

# score = rmsle_cv(model_xgb)
# print("Xgboost score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

# Useless Stuff after this

# columns = df.columns
# percent_missing = df.isnull().sum() * 100 / len(df)
# missing_value_df = pd.DataFrame({'column_name': columns,
#                                  'percent_missing': percent_missing})

# missing_value_df.sort_values('percent_missing', inplace=True)


# # Prints R2 and RMSE scores
# def get_score(prediction, lables):    
#     print('R2: {}'.format(r2_score(prediction, lables)))
#     print('RMSE: {}'.format(np.sqrt(mean_squared_error(prediction, lables))))

# # Shows scores for train and validation sets    
# def train_test(estimator, x_trn, x_tst, y_trn, y_tst):
#     prediction_train = estimator.predict(x_trn)
#     # Printing estimator
#     print(estimator)
#     # Printing train scores
#     get_score(prediction_train, y_trn)
#     prediction_test = estimator.predict(x_tst)
#     # Printing test scores
#     print("Test")
#     get_score(prediction_test, y_tst)

# # ### Splitting
# # x_train, x_test, y_train, y_test = train_test_split(train_features, train_labels, test_size=0.1, random_state=200)
# # x_train_st, x_test_st, y_train_st, y_test_st = train_test_split(train_features_st, train_labels, test_size=0.1, random_state=200)

# ENSTest = linear_model.ElasticNetCV(alphas=[0.0001, 0.0005, 0.001, 0.01, 0.1, 1, 10], l1_ratio=[.01, .1, .5, .9, .99], max_iter=5000).fit(X_train, y_train)
# train_test(ENSTest,X_train, X_test, y_train, y_test)

# # Average R2 score and standart deviation of 5-fold cross-validation
# scores = cross_val_score(ENSTest, X, Y, cv=5)
# print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# from sklearn import ensemble
# GBest = ensemble.GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=3, max_features='sqrt',
#                                                min_samples_leaf=15, min_samples_split=10, loss='huber').fit(X_train, y_train)
# train_test(GBest, X_train, X_test, y_train, y_test)

# # Average R2 score and standart deviation of 5-fold cross-validation
# scores = cross_val_score(GBest,  X, Y, cv=5)
# print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))