In [1]:
import numpy as np
import pandas as pd 
from scipy import stats
# Lift the row cap on displayed pandas objects so long tables print in full.
pd.options.display.max_rows = None

import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2]:
# Read train.csv into a DataFrame, then show its (rows, columns) size.
train_csv = "../data/raw/train.csv"
df = pd.read_csv(train_csv)
df.shape

(1460, 81)

In [4]:
# Transpose the first five rows so that every variable is listed on its own line.
df.head(n=5).transpose()

Unnamed: 0,0,1,2,3,4
Id,1,2,3,4,5
MSSubClass,60,20,60,70,60
MSZoning,RL,RL,RL,RL,RL
LotFrontage,65,80,68,60,84
LotArea,8450,9600,11250,9550,14260
Street,Pave,Pave,Pave,Pave,Pave
Alley,,,,,
LotShape,Reg,Reg,IR1,IR1,IR1
LandContour,Lvl,Lvl,Lvl,Lvl,Lvl
Utilities,AllPub,AllPub,AllPub,AllPub,AllPub


In [5]:
# Three-panel overview of the SalePrice distribution: a histogram (row 1, col 1),
# a violin plot spanning both rows of column 2, and a normal QQ plot (row 2, col 1).
fig = make_subplots(
    rows=2, cols=2,
    specs=[[{}, {"rowspan": 2}],
           [{}, None]],
    horizontal_spacing=0.125,
    vertical_spacing=0.125,
    subplot_titles=("Histogram", "Violin Plot", "QQ Plot"),
)

# Quantiles for the QQ plot, against the standard normal distribution.
# NOTE: this call used to pass `sparams=(1)`. `(1)` is the integer 1, not a tuple,
# and scipy takes it as a distribution parameter (for `norm`, a location of 1),
# which shifted every theoretical quantile by one. It is omitted on purpose.
qq = stats.probplot(df["SalePrice"], dist="norm")
(theoretical_q, ordered_prices), (slope, intercept, _r) = qq

# lowest and highest theoretical quantiles: the two end points of the fitted line
x = np.array([theoretical_q[0], theoretical_q[-1]])

# plotting charts
fig.add_traces(
    [
        go.Histogram(
            x=df["SalePrice"],
            hoverinfo="x",
            showlegend=False,
        ),
        go.Violin(
            y=df["SalePrice"],
            box_visible=True,
            line_color="black",
            meanline_visible=True,
            fillcolor="lightseagreen",
            opacity=0.8,
            showlegend=False,
        ),
        # ordered sample values against their theoretical quantiles
        go.Scatter(
            x=theoretical_q,
            y=ordered_prices,
            mode="markers",
            showlegend=False,
        ),
        # least-squares line fitted by probplot through the QQ points
        go.Scatter(
            x=x,
            y=intercept + slope * x,
            showlegend=False,
            mode="lines",
        ),
    ],
    # both QQ traces (points and fitted line) share the bottom-left cell
    rows=[1, 1, 2, 2],
    cols=[1, 2, 1, 1],
)

# Update xaxis properties
fig.update_xaxes(title_text="Quantiles", row=2, col=1)

# Update yaxis properties
fig.update_yaxes(title_text="Counts", row=1, col=1)
fig.update_yaxes(title_text="SalePrice", row=1, col=2)
fig.update_yaxes(title_text="SalePrice", row=2, col=1)

fig.update_layout(
    title=dict(
        text="Sale Price Distribution",
        font=dict(family="Arial", size=20),
    ),
    showlegend=False,
    width=800,
    height=500,
)
fig.show()

In [6]:
# Rank the numeric features by their Pearson correlation with SalePrice.
# Only numeric columns are used: since pandas 2.0 `DataFrame.corr()` no longer
# drops text columns silently and raises on them instead.
corr = (
    df.select_dtypes(include="number")
    .corr()["SalePrice"]
    .drop("SalePrice")  # remove the target's correlation with itself by label, not by position
    .sort_values(ascending=False)
)
corr_feat = corr.index
corr_vals = corr.values

# traffic light setting for fill colour: green from 0.5 upwards, red for
# everything below (which includes strong negative correlations)
fill = ["lime" if value >= 0.50 else "red" for value in corr_vals]

# visual table of features correlation to SalePrice
fig = go.Figure(
    data=[
        go.Table(
            header=dict(
                values=["Features", "Correlation"],
                fill_color="lightgrey",
                font=dict(family="Arial", size=14, color="black"),
                align="center",
            ),
            cells=dict(
                values=[corr_feat, corr_vals],
                fill_color=[fill],
                height=22,
                font=dict(family="Arial", size=12),
                align="left",
            ),
        )
    ]
)

fig.update_layout(
    title=dict(
        text="Pearson's Correlation of features to SalePrice",
        font=dict(family="Arial", size=20),
    ),
    showlegend=False,
    width=900,
    height=650,
)
fig.show()

In [7]:
# Plot SalePrice against four features: box plots in the left column and
# scatter plots in the right column of a 2x2 grid.
fig = make_subplots(
    rows=2, cols=2,
    horizontal_spacing=0.165,
    vertical_spacing=0.165,
    column_widths=[10, 10],
    row_heights=[10, 10],
)

# (feature, trace constructor, extra trace options, row, col)
# Each axis title is taken from the same entry that places the trace, so a label
# can no longer end up on a different panel than its data (the titles for
# GarageCars and GrLivArea used to be swapped).
panels = [
    ("OverallQual", go.Box, {}, 1, 1),
    ("GarageCars", go.Box, {}, 2, 1),
    ("GrLivArea", go.Scatter, {"mode": "markers"}, 1, 2),
    ("GarageArea", go.Scatter, {"mode": "markers"}, 2, 2),
]
for feature, make_trace, options, row, col in panels:
    fig.add_trace(
        make_trace(x=df[feature], y=df["SalePrice"], showlegend=False, **options),
        row=row, col=col,
    )
    fig.update_xaxes(title_text=feature, row=row, col=col)
    fig.update_yaxes(title_text="SalePrice", row=row, col=col)

# fixed axis ranges for the GarageArea panel only
fig.update_xaxes(range=[0, 1500], row=2, col=2)
fig.update_yaxes(range=[0, 800000], row=2, col=2)

fig.update_layout(
    title=dict(
        text="4 Most correlated features with SalePrice",
        font=dict(family="Arial", size=20),
    ),
    showlegend=False,
    width=900,
    height=600,
)

fig.show()

In [8]:
# Correlation of the numeric features to each other.
# The matrix is computed once and reused for both axes and the cell values.
# Text columns are excluded up front because `DataFrame.corr()` raises on them
# since pandas 2.0.
corr_matrix = df.select_dtypes(include="number").corr()
feature_names = corr_matrix.columns.tolist()

fig = go.Figure(
    data=[
        go.Heatmap(
            x=feature_names,
            y=feature_names,
            z=corr_matrix.values,
            xgap=1.25, ygap=1.25,
            colorbar_thickness=20,
            colorbar_ticklen=3,
        )
    ]
)

fig.update_layout(
    title=dict(
        text="Feature Correlation Heatmap",
        font=dict(family="Arial", size=20),
    ),
    showlegend=False,
    width=800,
    height=550,
)

fig.show()

In [9]:
# Total number and percentage of missing values per column of df.
# The per-column counts are computed once, restricted to the columns that have
# at least one missing value, and sorted from most to least missing. The counts
# were previously sorted by label while the percentages were sorted by value,
# so the resulting table was not ordered by missing count.
missing_counts = df.isnull().sum()
total_missing = missing_counts[missing_counts != 0].sort_values(ascending=False)
# Percentages are derived from the already-filtered counts, so both series always
# cover the same columns in the same order.
percent_missing = (total_missing / len(df) * 100).round(2)

# df of missing variables
missing_df = pd.concat([total_missing, percent_missing], axis=1, keys=["total", "percent"])

features = missing_df.index
total = missing_df.total
percent = missing_df.percent

# traffic light setting for fill colour: red from 75 % missing upwards,
# green up to 25 %, yellow in between
fill = ["red" if pct >= 75.0 else "lime" if pct <= 25.0 else "yellow" for pct in percent]

# visual table of missing variables
fig = go.Figure(
    data=[
        go.Table(
            header=dict(
                values=["Features", "Total Missing Values", "Percentage"],
                fill_color="lightgrey",
                font=dict(family="Arial", size=14, color="black"),
                align="center",
            ),
            cells=dict(
                values=[features, total, percent],
                fill_color=[fill],
                height=22,
                font=dict(family="Arial", size=12),
                align="left",
            ),
        )
    ]
)

fig.update_layout(
    title=dict(
        text="Features with missing values",
        font=dict(family="Arial", size=20),
    ),
    showlegend=False,
    width=900,
    height=650,
)

fig.show()