In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

# the below extension properly formats a cell after it is run
%load_ext nb_black

# Set the maximum number of rows to 200
pd.set_option("display.max_rows", 200)


# Set the maximum number of columns to 200

The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

In [4]:
housing = pd.read_csv("data/housing_cleaned.csv")

<IPython.core.display.Javascript object>

In [8]:
# Select only numeric columns from the DataFrame
numeric_cols = housing.select_dtypes(include=np.number).columns

# Compute the correlation matrix
corr_matrix = housing[numeric_cols].corr()

# Extract only highly correlated columns
threshold = 0.7
highly_correlated_cols = corr_matrix[abs(corr_matrix) > threshold].stack().index.tolist()
highly_correlated_cols = list(set([col[0] for col in highly_correlated_cols] + [col[1] for col in highly_correlated_cols]))

# Print highly correlated columns
print(highly_correlated_cols)


['LotFrontage', 'SalePrice', 'OverallQual', 'OpenPorchSF', 'BsmtUnfSF', 'TotalBath', 'PID', 'TotalPorchSF', 'YearBuilt', 'GarageArea', 'WoodDeckSF', 'TotRmsAbvGrd', 'BsmtFinSF2', 'Age', 'BsmtHalfBath', 'OverallCond', 'GarageCars', 'BsmtFullBath', 'GrLivArea', 'BsmtFinSF1', 'EnclosedPorch', 'YrSold', '1stFlrSF', 'Fireplaces', 'MoSold', 'FullBath', 'TotalBsmtSF', 'ScreenPorch', 'RemodAge', 'MasVnrArea', 'HalfBath', 'GarageYrBlt', 'MSSubClass', 'LotArea', 'BedroomAbvGr', '2ndFlrSF', 'TotalSF', 'YearRemodAdd', 'Remodeled', 'KitchenAbvGr']


<IPython.core.display.Javascript object>

In [9]:
corr = housing.corr()
corr = corr[(corr > 0.7) | (corr < -0.7)]
high_corr_vars = corr[corr.sum() > 1].index.tolist()
print(high_corr_vars)


['GrLivArea', 'SalePrice', 'OverallQual', 'TotalBsmtSF', '1stFlrSF', 'FullBath', 'TotRmsAbvGrd', 'GarageCars', 'GarageArea', 'TotalSF', 'TotalBath']


<IPython.core.display.Javascript object>

In [10]:
# Select the features of interest
selected_features = ['LotFrontage', 'SalePrice', 'OverallQual', 'OpenPorchSF', 'BsmtUnfSF', 'TotalBath', 'PID', 'TotalPorchSF', 'YearBuilt', 'GarageArea', 'WoodDeckSF', 'TotRmsAbvGrd', 'BsmtFinSF2', 'Age', 'BsmtHalfBath', 'OverallCond', 'GarageCars', 'BsmtFullBath', 'GrLivArea', 'BsmtFinSF1', 'EnclosedPorch', 'YrSold', '1stFlrSF', 'Fireplaces', 'MoSold', 'FullBath', 'TotalBsmtSF', 'ScreenPorch', 'RemodAge', 'MasVnrArea', 'HalfBath', 'GarageYrBlt', 'MSSubClass', 'LotArea', 'BedroomAbvGr', '2ndFlrSF', 'TotalSF', 'YearRemodAdd', 'Remodeled', 'KitchenAbvGr']

# Generate a correlation matrix for the selected features
corr_matrix = housing[selected_features].corr()

# Filter the correlation matrix to show only the correlations with the selected features
corr_with_selected = corr_matrix.loc[:, selected_features]

# Display the highly correlated features for each selected feature
for feature in selected_features:
    print("Highly correlated with", feature)
    print(corr_with_selected[feature][(corr_with_selected[feature] > 0.7) | (corr_with_selected[feature] < -0.7)].sort_values(ascending=False))


Highly correlated with LotFrontage
LotFrontage    1.0
Name: LotFrontage, dtype: float64
Highly correlated with SalePrice
SalePrice      1.000000
TotalSF        0.819377
OverallQual    0.790467
GrLivArea      0.719735
Name: SalePrice, dtype: float64
Highly correlated with OverallQual
OverallQual    1.000000
SalePrice      0.790467
Name: OverallQual, dtype: float64
Highly correlated with OpenPorchSF
OpenPorchSF    1.0
Name: OpenPorchSF, dtype: float64
Highly correlated with BsmtUnfSF
BsmtUnfSF    1.0
Name: BsmtUnfSF, dtype: float64
Highly correlated with TotalBath
TotalBath    1.000000
FullBath     0.715885
Name: TotalBath, dtype: float64
Highly correlated with PID
PID    1.0
Name: PID, dtype: float64
Highly correlated with TotalPorchSF
TotalPorchSF    1.0
Name: TotalPorchSF, dtype: float64
Highly correlated with YearBuilt
YearBuilt    1.000000
Age         -0.999018
Name: YearBuilt, dtype: float64
Highly correlated with GarageArea
GarageArea    1.00000
GarageCars    0.89121
Name: GarageA

<IPython.core.display.Javascript object>

In [14]:
for col in ['LotFrontage', 'OverallQual', 'OpenPorchSF', 'BsmtUnfSF', 'TotalBath', 'PID', 'TotalPorchSF', 'YearBuilt', 'GarageArea', 'WoodDeckSF', 'TotRmsAbvGrd', 'BsmtFinSF2', 'Age', 'BsmtHalfBath', 'OverallCond', 'GarageCars', 'BsmtFullBath', 'GrLivArea', 'BsmtFinSF1', 'EnclosedPorch', 'YrSold', '1stFlrSF', 'Fireplaces', 'MoSold', 'FullBath', 'TotalBsmtSF', 'ScreenPorch', 'RemodAge', 'MasVnrArea', 'HalfBath', 'GarageYrBlt', 'MSSubClass', 'LotArea', 'BedroomAbvGr', '2ndFlrSF', 'TotalSF', 'YearRemodAdd', 'Remodeled', 'KitchenAbvGr']:
    if col not in ['SalePrice']:
        corr = housing.corr()[col]
        corr = corr[(corr > 0.7) | (corr < -0.7)]
        corr = corr.drop([col, 'SalePrice'], errors='ignore')
        if not corr.empty:
            print(f"Highly correlated with {col}")
            print(corr)
            print("\n")


Highly correlated with TotalBath
FullBath    0.715885
Name: TotalBath, dtype: float64


Highly correlated with YearBuilt
Age   -0.999018
Name: YearBuilt, dtype: float64


Highly correlated with GarageArea
GarageCars    0.89121
Name: GarageArea, dtype: float64


Highly correlated with TotRmsAbvGrd
GrLivArea    0.806588
Name: TotRmsAbvGrd, dtype: float64


Highly correlated with Age
YearBuilt   -0.999018
Name: Age, dtype: float64


Highly correlated with GarageCars
GarageArea    0.89121
Name: GarageCars, dtype: float64


Highly correlated with GrLivArea
TotRmsAbvGrd    0.806588
TotalSF         0.866333
Name: GrLivArea, dtype: float64


Highly correlated with 1stFlrSF
TotalBsmtSF    0.788911
TotalSF        0.777873
Name: 1stFlrSF, dtype: float64


Highly correlated with FullBath
TotalBath    0.715885
Name: FullBath, dtype: float64


Highly correlated with TotalBsmtSF
1stFlrSF    0.788911
TotalSF     0.810867
Name: TotalBsmtSF, dtype: float64


Highly correlated with RemodAge
YearRemodAdd 

<IPython.core.display.Javascript object>