# Exploratory Data Analysis: Correlation Matrix

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import altair as alt
import seaborn as sns
import pandas as pd
import numpy as np

import boto3
import awswrangler
# Name of the S3 bucket that holds the project data; it is interpolated into
# the s3:// path when the parquet file is read.
s3_bucket = 'traffic-data-bucket'

## 1. Create Boto3 session
Start by creating a boto3 session so that we can connect to the S3 bucket.

In [None]:
from aws_secrets import aws_access_key_id, aws_secret_access_key, aws_session_token

# Build the boto3 session from the credentials imported from the local
# aws_secrets module; this session is handed to awswrangler for S3 access.
my_session = boto3.Session(
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    aws_session_token=aws_session_token,
)

In [None]:
# Read the post-transformation model dataset from the S3 bucket, using the
# boto3 session created above for the connection.
df = awswrangler.s3.read_parquet(
    path=f's3://{s3_bucket}/model_data/model_data_post_transformation.parquet',
    boto3_session=my_session,
    use_threads=True,
)

In [None]:
# Show the dimensions (rows, columns) of the loaded dataset.
df.shape

In [None]:
# Preview the first rows of the loaded dataset.
df.head()

In [None]:
# List the column names available for the correlation analysis.
df.columns

## 2. Correlation Matrix
A correlation matrix will be created to visualize the correlation coefficients between variables and discover if there is any multicollinearity among the variables.

In [None]:
# Pairwise Pearson correlations between the columns of df, taken in absolute
# value so the heatmap shows the strength of each relationship, not its sign.
# NOTE(review): on pandas >= 2.0, DataFrame.corr() raises if df contains
# non-numeric columns (numeric_only defaults to False) -- confirm df is
# entirely numeric.
corr = df.corr().abs()

# Boolean mask covering the upper triangle (diagonal included): the matrix is
# symmetric, so those cells would only repeat the lower triangle.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Colormap used to shade the heatmap cells.
cmap = sns.diverging_palette(20, 10, as_cmap=True)

# Figure and axes that the heatmap is drawn onto.
fig, ax = plt.subplots(figsize=(20, 14))

# Render the masked matrix with square cells and a shrunken colour bar.
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    vmax=1,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    ax=ax,
)

ax.set_title(
    'Correlation Matrix',
    fontdict={
        'fontsize': 24,
        'fontweight': 'bold',
        'horizontalalignment': 'center',
    },
)
plt.show()

## 3. Correlation Matrix with Altair

In [None]:
# First attempt: hand the wide-form correlation matrix straight to Altair.
# No encodings are specified here, so no columns are mapped to x, y or colour;
# the cells below reshape the matrix to long form and add the encodings.
alt.Chart(corr).mark_rect()

In [None]:
# Reshape the correlation matrix from wide to long form: one row per pair of
# variables, with the pair in columns 'x' and 'y' and its coefficient in
# 'correlation'. This is the layout the Altair encodings below refer to.
df_corr = (
    corr.stack()
    .reset_index()
    .rename(columns={'level_0': 'x', 'level_1': 'y', 0: 'correlation'})
)

In [None]:
# Inspect the long-form correlation table before plotting it.
df_corr

In [None]:
# Heatmap of the long-form correlation table: one rectangle per (x, y) pair,
# coloured by its correlation value, with axis and legend titles suppressed
# and a centred chart title.
(
    alt.Chart(df_corr)
    .mark_rect()
    .encode(
        x=alt.X('x', title=None),
        y=alt.Y('y', title=None),
        color=alt.Color('correlation', title=None),
    )
    .properties(title={'text': ['Correlation Matrix'], 'subtitle': ['']})
    .configure_title(fontSize=20, anchor='middle')
)