In [34]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from dython.nominal import associations
import numpy as np
import plotly.express as px

import warnings
# Suppress every FutureWarning so that cell output stays readable.
# NOTE(review): this is a blanket filter, so FutureWarnings raised by any library are hidden too.
warnings.filterwarnings("ignore", category=FutureWarning)

# Load the data set; the path is relative to the notebook's working directory.
df = pd.read_csv('train.csv')

Filtering numeric and categorical features

In [35]:
# Name of the prediction target column.
target_col = "SalePrice"

# Only the final expression of a cell is rendered, so the column listing has to
# come last; placed before the assignment its value was silently discarded.
df.columns

In [36]:
# Inspect the OverallQual column on its own.
df["OverallQual"]

0       7
1       6
2       7
3       7
4       8
       ..
1455    6
1456    6
1457    7
1458    5
1459    5
Name: OverallQual, Length: 1460, dtype: int64

In [37]:
# Numeric candidates: every non-object column except the row identifier.
# The target is deliberately still part of this list at this point.
num_cols = list(df.drop(columns=["Id"]).select_dtypes(exclude="object").columns)
# Categorical candidates: every object-typed column (identifier and target removed first).
cat_cols = list(df.drop(columns=["Id", "SalePrice"]).select_dtypes(include="object").columns)

Get the association matrix of the entire dataframe. (This was a nice idea, but it is superseded by the code below: including categorical columns in the correlation matrix introduces bias when removing highly correlated columns, so only numerical columns are used when building the correlation matrix for that step.)

In [38]:

# Pairwise association matrix over every column of the frame.
ass = associations(df, plot=False)
correlations = ass['corr']

# Blank out the cells strictly above the diagonal so that each pair is kept once.
# The diagonal itself (self-association) is NOT masked here because k=1 skips it.
df_corr = correlations.mask(np.triu(np.ones(correlations.shape, dtype=bool), k=1))

View highest correlations

In [39]:
# Flatten the masked matrix into a Series keyed by (row label, column label).
stacked_corr = df_corr.stack()

# Largest values first; any remaining missing entries are discarded afterwards.
highest_corr = stacked_corr.sort_values(ascending=False).dropna()

# Discard entries equal to exactly 1, i.e. a column paired with itself on the diagonal.
highest_corr_filtered = highest_corr.loc[highest_corr.ne(1)]

highest_corr_filtered.head(20)



GarageFinish  GarageYrBlt     0.999025
GarageYrBlt   GarageType      0.998872
GarageQual    GarageYrBlt     0.998786
GarageCond    GarageYrBlt     0.998732
PoolQC        PoolArea        0.990775
MiscVal       MiscFeature     0.935741
2ndFlrSF      HouseStyle      0.916506
FireplaceQu   Fireplaces      0.901204
YearBuilt     Neighborhood    0.883812
GarageArea    GarageCars      0.882475
BldgType      MSSubClass      0.863889
BsmtFinSF2    BsmtFinType2    0.838641
TotRmsAbvGrd  GrLivArea       0.825489
1stFlrSF      TotalBsmtSF     0.819530
SalePrice     OverallQual     0.790982
KitchenAbvGr  BldgType        0.778076
Foundation    YearBuilt       0.761337
Exterior2nd   Exterior1st     0.758884
BsmtQual      YearBuilt       0.750716
SalePrice     Neighborhood    0.738630
dtype: float64

In [40]:
# Display the full (unmasked) association matrix for inspection.
correlations

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,1.000000,0.011156,0.053673,-0.019761,-0.033226,0.008916,0.002885,0.062937,0.024296,0.013324,...,0.057044,0.069280,0.024927,0.063398,-0.006242,0.021172,0.000712,0.081004,0.042422,-0.021917
MSSubClass,0.011156,1.000000,0.349270,-0.215023,-0.139781,0.024969,0.188233,0.123571,0.037980,0.022844,...,0.008283,0.024812,0.119103,0.074159,-0.007683,-0.013585,-0.021407,0.111426,0.060590,-0.084284
MSZoning,0.053673,0.349270,1.000000,0.134045,0.205819,0.249384,0.388841,0.152191,0.102102,0.000000,...,0.035591,0.000000,0.025450,0.000000,0.022909,0.064259,0.042875,0.150903,0.136065,0.327963
LotFrontage,-0.019761,-0.215023,0.134045,1.000000,0.100739,0.025107,0.064913,0.190091,0.124251,0.043535,...,0.114106,0.152119,0.019669,0.070336,-0.059606,0.018942,-0.012094,0.190363,0.187632,0.209624
LotArea,-0.033226,-0.139781,0.205819,0.100739,1.000000,0.197131,0.084230,0.371568,0.375869,0.010123,...,0.077672,0.101577,0.051850,0.112026,0.038068,0.001205,-0.014261,0.031340,0.040402,0.263843
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MoSold,0.021172,-0.013585,0.064259,0.018942,0.001205,0.003690,0.022350,0.033669,0.084457,0.051552,...,-0.033737,0.040255,0.050725,0.026421,-0.006495,1.000000,-0.145721,0.123407,0.096080,0.046432
YrSold,0.000712,-0.021407,0.042875,-0.012094,-0.014261,0.025043,0.022192,0.045683,0.026413,0.023353,...,-0.059689,0.064768,0.069390,0.087355,0.004906,-0.145721,1.000000,0.156007,0.156572,-0.028923
SaleType,0.081004,0.111426,0.150903,0.190363,0.031340,0.110886,0.035549,0.000000,0.030262,0.130824,...,0.017377,0.000000,0.056467,0.000000,0.032522,0.123407,0.156007,1.000000,0.470819,0.370523
SaleCondition,0.042422,0.060590,0.136065,0.187632,0.040402,0.099458,0.064614,0.001923,0.107176,0.076153,...,0.145597,0.111456,0.071734,0.000000,0.037372,0.096080,0.156572,0.470819,1.000000,0.368100


Correlation of every column with SalePrice

In [41]:
# Association of every column with the target, taken from the mixed-type matrix.
# NOTE(review): this series is not used again in the visible part of the notebook.
saleprice_corr = correlations["SalePrice"]

Here are the correct correlations, computed between numerical columns only

In [42]:
# Correlation matrix restricted to the numeric columns (pandas' default method).
corr_matrix = df.loc[:, num_cols].corr()

# Blank out the cells strictly above the diagonal so that each pair is kept once.
# The diagonal itself (self-correlation) is NOT masked here because k=1 skips it.
df_corr = corr_matrix.mask(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))

Get highly correlated feature pairs

In [43]:
# Flatten the masked matrix into a Series keyed by (row label, column label).
stacked_corr = df_corr.stack()

# Largest values first; any remaining missing entries are discarded afterwards.
highest_corr = stacked_corr.sort_values(ascending=False).dropna()

# Discard entries equal to exactly 1, i.e. a column paired with itself on the diagonal.
highest_corr_filtered = highest_corr.loc[highest_corr.ne(1)]

highest_corr_filtered.head(5)


GarageArea    GarageCars     0.882475
GarageYrBlt   YearBuilt      0.825667
TotRmsAbvGrd  GrLivArea      0.825489
1stFlrSF      TotalBsmtSF    0.819530
SalePrice     OverallQual    0.790982
dtype: float64

In [44]:
# Plot the one-sided (lower-triangle) correlation matrix.
# Correlations lie in [-1, 1]: pin the colour range to that interval and use a
# diverging scale so that zero sits at the neutral midpoint instead of letting
# the range auto-scale to whatever values happen to be present.
fig = px.imshow(
    df_corr,
    zmin=-1,
    zmax=1,
    color_continuous_scale="RdBu_r",
    title="Pairwise correlation of numeric features (lower triangle)",
)
fig.show()

In [45]:
def lowestTargetCorr(val: tuple, corr_matrix: pd.DataFrame, target_column_name: str) -> str:
    """Return the member of a feature pair that is LESS correlated with the target.

    Args:
        val: Pair of feature labels ``(first, second)``.
        corr_matrix: Correlation matrix labelled by feature name. Either a full
            symmetric matrix or a triangular one (other half NaN) is accepted.
        target_column_name: Label of the target in ``corr_matrix``.

    Returns:
        The label whose absolute correlation with the target is smaller. On a
        tie the second label is returned; if neither correlation can be read
        (both NaN) the first label is returned.
    """

    def _abs_target_corr(feature):
        # Primary lookup: row = feature, column = target.
        value = corr_matrix.loc[feature, target_column_name]
        # In a triangular matrix that cell may be masked; the same correlation
        # is then stored in the mirrored cell (row = target, column = feature).
        if (
            pd.isna(value)
            and target_column_name in corr_matrix.index
            and feature in corr_matrix.columns
        ):
            value = corr_matrix.loc[target_column_name, feature]
        return abs(value)

    first, second = val[0], val[1]
    if _abs_target_corr(first) >= _abs_target_corr(second):
        return second
    return first

Here we find the pairs of columns that are highly correlated with each other and, from each pair, drop the column that has the lower correlation with the target

In [46]:

# Feature pairs whose correlation is strong in either direction: a large negative
# correlation makes two columns just as redundant as a large positive one.
high_corr_names = highest_corr_filtered[highest_corr_filtered.abs() > 0.8].index.tolist()

print(high_corr_names)

# For each pair, pick the member that is less correlated with the target.
# The lookup must use the FULL matrix: df_corr has its upper triangle masked,
# so its target column is NaN for every feature row and the comparison inside
# lowestTargetCorr would never actually compare anything.
removeable_cols = [lowestTargetCorr(pair, corr_matrix, target_col) for pair in high_corr_names]

# The target itself must not remain among the numeric features either.
removeable_cols.append(target_col)

print(removeable_cols)

[('GarageArea', 'GarageCars'), ('GarageYrBlt', 'YearBuilt'), ('TotRmsAbvGrd', 'GrLivArea'), ('1stFlrSF', 'TotalBsmtSF')]
['GarageArea', 'GarageYrBlt', 'TotRmsAbvGrd', '1stFlrSF', 'SalePrice']


In [47]:
# Keep only the numeric columns that were not marked for removal
# (the removal list also contains the target column).
num_cols = list(filter(lambda name: name not in removeable_cols, num_cols))

# Final feature list: remaining numeric columns followed by the categorical ones.
all_features = [*num_cols, *cat_cols]

Building modelling pipeline


75