# Exploratory Data Analysis: Correlation Matrix

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

import boto3
import awswrangler
# set name of S3 bucket
s3_bucket = 'traffic-data-bucket'

## 1. Create Boto3 session
Start by creating a boto3 session so that we can connect to the S3 bucket.

In [2]:
from aws_secrets import aws_access_key_id, aws_secret_access_key, aws_session_token

my_session = boto3.Session(
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    aws_session_token = aws_session_token

)

In [3]:
df = awswrangler.s3.read_parquet(path = f's3://{s3_bucket}/model_data/model_data_post_transformation.parquet', boto3_session=my_session, use_threads=True)

In [4]:
df.shape

(1950100, 40)

In [5]:
df.head()

Unnamed: 0,hex_id,collision_date,collision_year,collision_month,collision_dayofweek,collision_hour,accident_count,ttv_split,node_street_count,node_stop,...,prev2_yr_coll_neighbor2,noaa_wind_speed,noaa_precipitation,noaa_temperature_average,noaa_temperature_max,noaa_temperature_min,drv_collision_hour_sin,drv_collision_hour_cos,drv_holiday_flag,drv_edge_lanes_max_imputed_flag
0,8829124825fffff,2014-11-14,2014,11,4,12,1,Test,0.0,0.0,...,0.0,4.92,0.0,64,69.0,61.0,-0.136167,-0.990686,0,1
1,8829124825fffff,2018-09-19,2018,9,2,10,1,Test,0.0,0.0,...,1.0,5.59,0.0,67,74.0,61.0,0.398401,-0.917211,0,1
2,882912482dfffff,2016-08-21,2016,8,6,0,1,Test,3.0,0.0,...,1.0,8.5,0.0,67,74.0,63.0,0.0,1.0,0,1
3,882912482dfffff,2017-09-24,2017,9,6,14,1,Train,3.0,0.0,...,1.0,7.16,0.0,67,80.0,58.0,-0.631088,-0.775711,0,1
4,882912482dfffff,2018-04-23,2018,4,0,13,1,Train,3.0,0.0,...,1.0,5.82,0.0,61,68.0,54.0,-0.398401,-0.917211,0,1


In [6]:
df.columns

Index(['hex_id', 'collision_date', 'collision_year', 'collision_month',
       'collision_dayofweek', 'collision_hour', 'accident_count', 'ttv_split',
       'node_street_count', 'node_stop', 'node_traffic_signals',
       'la_data_city_name', 'edge_speed_kph_max', 'edge_speek_kph_min',
       'edge_lanes_max', 'edge_motorway_flag', 'edge_motorway_link_flag',
       'edge_living_street_flag', 'edge_bridge_flag', 'edge_oneway_flag',
       'edge_tunnel_flag', 'amenities_bar_cnt', 'amenities_school_cnt',
       'amenities_restaurant_cnt', 'amenities_college_cnt',
       'prev1_yr_coll_cnt', 'prev2_yr_coll_cnt', 'prev1_yr_coll_neighbor1',
       'prev1_yr_coll_neighbor2', 'prev2_yr_coll_neighbor1',
       'prev2_yr_coll_neighbor2', 'noaa_wind_speed', 'noaa_precipitation',
       'noaa_temperature_average', 'noaa_temperature_max',
       'noaa_temperature_min', 'drv_collision_hour_sin',
       'drv_collision_hour_cos', 'drv_holiday_flag',
       'drv_edge_lanes_max_imputed_flag'],
      dt

In [7]:
df['accident_count'].unique()

<IntegerArray>
[1, 0]
Length: 2, dtype: Int64

In [8]:
df[df['accident_count'] == 0].head()

Unnamed: 0,hex_id,collision_date,collision_year,collision_month,collision_dayofweek,collision_hour,accident_count,ttv_split,node_street_count,node_stop,...,prev2_yr_coll_neighbor2,noaa_wind_speed,noaa_precipitation,noaa_temperature_average,noaa_temperature_max,noaa_temperature_min,drv_collision_hour_sin,drv_collision_hour_cos,drv_holiday_flag,drv_edge_lanes_max_imputed_flag
388712,8829a1d623fffff,2014-07-12,2014,7,5,2,0,Train,4.0,0.0,...,0.0,7.61,0.0,70,75.0,66.0,0.519584,0.854419,0,0
388713,8829a56d29fffff,2014-05-29,2014,5,3,22,0,Validate,4.0,2.0,...,0.0,8.5,0.0,68,74.0,64.0,-0.2697968,0.962917,0,0
388714,8829a1d307fffff,2014-03-28,2014,3,4,4,0,Test,4.0,0.0,...,0.0,7.61,0.0,60,67.0,53.0,0.8878852,0.460065,0,0
388715,8829a1d11bfffff,2014-06-05,2014,6,3,23,0,Train,4.0,0.0,...,0.0,8.28,0.0,65,71.0,61.0,-2.449294e-16,1.0,0,0
388716,8829a1d6c5fffff,2014-09-24,2014,9,2,4,0,Train,4.0,6.0,...,0.0,6.71,0.0,69,78.0,64.0,0.8878852,0.460065,0,0


### Correlation Heatmap Using $R^{2}$
Here we attempt to uncover which features are correlated with the target variable `accident_count` by looking at details of a property and its corresponding reviews. `Pandas.corr()` returns the Pearson correlation coefficient but this value is being squared in order to return the coefficient of determination or in other words, $R^{2}$.  This will allow us to better evaluate predictions.

In [None]:
# Set up the matplotlib figure
fig, ax = plt.subplots(figsize=(20,14))

# Compute the correlation matrix
# square the pearson correlation coefficients returned by .corr()
corr = df.corr()**2

# Generate a mask for the upper triangle
mask = np.triu(np.ones_like(corr, dtype=bool))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(20, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax = ax)
plt.show()

In [None]:
# from sklearn.metrics import matthews_corrcoef

In [None]:
# Set up the matplotlib figure
fig, ax = plt.subplots(figsize=(20,14))

# Compute the correlation matrix
# square the pearson correlation coefficients returned by .corr()
corr = df.corr().abs()

# Generate a mask for the upper triangle
mask = np.triu(np.ones_like(corr, dtype=bool))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(20, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax = ax)
plt.show()

In [None]:
from sklearn.feature_selection import SelectKBest, chi2

In [None]:
X = df.drop('accident_count', axis=1).select_dtypes(include=np.number)
y = df['accident_count']

In [None]:
X.dtypes

In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# perform data normalization
X_scaled = scaler.fit_transform(X)

In [None]:
X_scaled.shape

In [None]:
X_scaled.dtype

In [None]:
y = y.astype(int)

In [None]:
feature_selector = SelectKBest(chi2, k = "all")
fit = feature_selector.fit(X_scaled,y)

p_values = pd.DataFrame(fit.pvalues_)
scores = pd.DataFrame(fit.scores_)
input_variable_names = pd.DataFrame(X.columns)
summary_stats = pd.concat([input_variable_names, p_values, scores], axis = 1)
summary_stats.columns = ["input_variable", "p_value", "chi2_score"]
summary_stats.sort_values(by = "p_value", inplace = True)

p_value_threshold = 0.05
score_threshold = 5

selected_variables = summary_stats.loc[(summary_stats["chi2_score"] >= score_threshold) &
                                       (summary_stats["p_value"] <= p_value_threshold)]
selected_variables = selected_variables["input_variable"].tolist()
X_new = X[selected_variables]

In [None]:
X_new

In [9]:
from sklearn.feature_selection import RFECV 
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler

In [22]:
X = df.drop('accident_count', axis=1).select_dtypes(include=np.number)
y = df['accident_count'].astype(int)

X = X.iloc[:200000,:]
y = y.iloc[:200000]

In [23]:
scaler = MinMaxScaler()

# perform data normalization
X_scaled = scaler.fit_transform(X)

In [24]:
classifier = SVC()
feature_selector = RFECV(classifier)

fit = feature_selector.fit(X_scaled,y)

optimal_feature_count = feature_selector.n_features_
print(f"Optimal numer of features: {optimal_feature_count}")

X_new = X.loc[:, feature_selector.get_support()]

plt.plot(range(1, len(fit.grid_scores_) + 1), fit.grid_scores_, marker = "o")
plt.ylabel("Model Score")
plt.xlabel("Number of Features")
plt.title(f"Feature Selection using RFE \n Optimal number of features is {optimal_feature_count} (at score of {round(max(fit.grid_scores_), 4)})")
plt.tight_layout() 
plt.show()

ValueError: The number of classes has to be greater than one; got 1 class

In [None]:
matthews_corrcoef(Y, X)

### Ordinary Least Squares Regression
Let's try to predict the price of a listing using ordinary least squares regression. We will select only a handful features and we will use robust standard errors.  Robust standard errors is a technique to obtain unbiased standard errors of OLS coefficients under heteroscedasticity.

In [None]:
reg = smf.ols(formula = 'price ~ cleaning_fee + host_is_superhost + bedrooms + bathrooms + number_of_reviews + review_scores_value ', data = listings).fit()
ols_robust = reg.get_robustcov_results()
ols_robust.summary()