In [12]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

# the below extension properly formats a cell after it is run
%load_ext nb_black

# Let pandas display up to 200 rows and 200 columns before truncating output
DISPLAY_LIMIT = 200
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, DISPLAY_LIMIT)


The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

In [13]:
# Read the cleaned housing table and the locations table from the data folder
housing_csv = "../data/housing_cleaned.csv"
locations_csv = "../data/locations.csv"

housing = pd.read_csv(housing_csv)
locations = pd.read_csv(locations_csv)

<IPython.core.display.Javascript object>

In [14]:
# Drop the rows whose Neighborhood is "Landmrk" (per the original note, this
# neighborhood contains only a single observation)
landmrk_rows = housing.index[housing["Neighborhood"] == "Landmrk"]
housing.drop(index=landmrk_rows, inplace=True)

<IPython.core.display.Javascript object>

In [15]:
# Attach the Neighborhood_st column from `locations` to each housing row, matched on PID.
# NOTE(review): a left join repeats housing rows if a PID occurs more than once
# in `locations` — confirm PID is unique there.
neighborhood_lookup = locations[["PID", "Neighborhood_st"]]
housing = pd.merge(housing, neighborhood_lookup, how="left", on="PID")

<IPython.core.display.Javascript object>

In [16]:
# Ordinal encodings: each dictionary maps a category label to an integer rank.
# Where a "feature absent" label exists (No_Garage, No_Bsmt, ...), it is coded 0.

# Garage quality / condition ratings.
# GQual duplicates GQualCond and is not referenced by the encoding cell below;
# it is kept so any other cell that uses the name keeps working.
GQual = {"No_Garage": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
GQualCond = {"No_Garage": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}

# Garage interior finish
GFin = {"No_Garage": 0, "Unf": 1, "RFn": 2, "Fin": 3}

# Basement exposure, ranked No < Mn (minimum) < Av (average) < Gd (good).
# "Gd" must outrank "Av"; these two codes were previously swapped, which broke
# the ordinal ordering of the encoded column.
BExposure = {"No_Bsmt": 0, "No": 1, "Mn": 2, "Av": 3, "Gd": 4}

# Basement finished-area rating
BFinType = {"No_Bsmt": 0, "Unf": 1, "LwQ": 2, "Rec": 3, "BLQ": 4, "ALQ": 5, "GLQ": 6}

# Basement quality / condition ratings
BQualCond = {"No_Bsmt": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}

# Fence rating
FenceQual = {"No_Fence": 0, "MnWw": 1, "GdWo": 2, "MnPrv": 3, "GdPrv": 4}

# Fireplace quality rating
FireQual = {"No_Fireplace": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}

# Exterior / heating / kitchen ratings. There is no "absent" label here, so this
# scale starts at 0 for "Po" (unlike the dictionaries above, where "Po" is 1).
ExQualCond = {"Po": 0, "Fa": 1, "TA": 2, "Gd": 3, "Ex": 4}

<IPython.core.display.Javascript object>

In [17]:
# Convert the ordinal text columns to integer codes using the dictionaries
# defined in the previous cell. Each column is paired with the dictionary
# that encodes it.
ordinal_encodings = {
    "BsmtExposure": BExposure,
    "BsmtFinType1": BFinType,
    "BsmtFinType2": BFinType,
    "BsmtQual": BQualCond,
    "BsmtCond": BQualCond,
    "GarageQual": GQualCond,
    "GarageCond": GQualCond,
    "GarageFinish": GFin,
    "FireplaceQu": FireQual,
    "Fence": FenceQual,
    "ExterQual": ExQualCond,
    "ExterCond": ExQualCond,
    "HeatingQC": ExQualCond,
    "KitchenQual": ExQualCond,
}

encoded_columns = {}
for column, mapping in ordinal_encodings.items():
    encoded = housing[column].map(mapping)
    # Series.map leaves NaN wherever a value has no entry in the mapping (this
    # includes missing values and a re-run on already-encoded data). Raise a
    # KeyError that names the column and the offending values instead of
    # silently writing NaN codes.
    unmapped = housing.loc[encoded.isna(), column].unique().tolist()
    if unmapped:
        raise KeyError(f"{column}: no ordinal code for value(s) {unmapped}")
    encoded_columns[column] = encoded

# Assign only after every column has been validated, so a failure above leaves
# `housing` unchanged.
for column, encoded in encoded_columns.items():
    housing[column] = encoded

<IPython.core.display.Javascript object>

## Feature engineering (should be moved to another notebook)

In [19]:
# Build a single outdoor square-footage feature: deck area plus total porch area
deck_sf = housing["WoodDeckSF"]
porch_sf = housing["TotalPorchSF"]
housing["TotalOutdoorSF"] = deck_sf.add(porch_sf)

<IPython.core.display.Javascript object>

In [21]:
# Remove the deck/porch columns that were used to create other features
# (WoodDeckSF and TotalPorchSF feed TotalOutdoorSF; the individual porch
# columns presumably feed TotalPorchSF — confirm in the cleaning notebook)
outdoor_component_cols = [
    "WoodDeckSF",
    "OpenPorchSF",
    "EnclosedPorch",
    "ScreenPorch",
    "TotalPorchSF",
]
housing.drop(columns=outdoor_component_cols, inplace=True)

<IPython.core.display.Javascript object>

# Looking for Highly Correlated Columns #

In [23]:
# List every pair of numeric columns whose absolute correlation exceeds the
# threshold, strongest first. Each pair is printed in both directions
# (A & B and B & A) because the correlation matrix is symmetric.
threshold = 0.7

# Correlations are computed on the numeric columns only
numeric_cols = housing.select_dtypes(include=np.number).columns
corr_matrix = housing[numeric_cols].corr()

# Blank out (NaN) every cell at or below the threshold, then reshape the matrix
# into one row per (level_0, level_1) pair.
strong_only = corr_matrix.where(corr_matrix.abs() > threshold)
highly_correlated_cols = (
    strong_only.stack()
    # Drop the blanked cells explicitly: the newer pandas `stack`
    # implementation keeps NaN entries instead of discarding them.
    .dropna()
    .reset_index()
    .rename(columns={0: "correlation"})
    # A column is always perfectly correlated with itself; skip those rows
    .loc[lambda pairs: pairs["level_0"] != pairs["level_1"]]
    # Order by strength regardless of sign, using a temporary helper column
    .assign(abs_correlation=lambda pairs: pairs["correlation"].abs())
    .sort_values("abs_correlation", ascending=False)
    .drop(columns="abs_correlation")
)

# Print highly correlated columns with correlation coefficients
for col1, col2, corr in highly_correlated_cols.itertuples(index=False, name=None):
    print(f"- {col1} & {col2}: {corr}")

- Age & YearBuilt: -0.9990187953011697
- YearBuilt & Age: -0.9990187953011697
- RemodAge & YearRemodAdd: -0.9979373392786269
- YearRemodAdd & RemodAge: -0.9979373392786269
- GarageCond & GarageYrBlt: 0.9430052130771328
- GarageYrBlt & GarageCond: 0.9430052130771328
- GarageQual & GarageCond: 0.9416339103102874
- GarageCond & GarageQual: 0.9416339103102874
- GarageQual & GarageYrBlt: 0.9367589577001315
- GarageYrBlt & GarageQual: 0.9367589577001315
- GarageArea & GarageCars: 0.8912204450237325
- GarageCars & GarageArea: 0.8912204450237325
- TotalSF & GrLivArea: 0.8663484828491583
- GrLivArea & TotalSF: 0.8663484828491583
- Fireplaces & FireplaceQu: 0.858974953102109
- FireplaceQu & Fireplaces: 0.858974953102109
- SalePrice & TotalSF: 0.8193539295384036
- TotalSF & SalePrice: 0.8193539295384036
- TotalBsmtSF & TotalSF: 0.8108202085042245
- TotalSF & TotalBsmtSF: 0.8108202085042245
- GrLivArea & TotRmsAbvGrd: 0.8066150649334259
- TotRmsAbvGrd & GrLivArea: 0.8066150649334259
- BsmtFinSF2 &

<IPython.core.display.Javascript object>

In [None]:


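# Notes on the pairs printed above: where a pair is annotated "Removing <column>",
# that column is dropped in the next cell to break the high correlation.
# - Age & YearBuilt: -0.9990187953011697             Removing YearBuilt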
# - YearBuilt & Age: -0.9990187953011697
# - RemodAge & YearRemodAdd: -0.9979373392786269     Removing YearRemodAdd
# - YearRemodAdd & RemodAge: -0.9979373392786269 
# - GarageCond & GarageYrBlt: 0.9430052130771328     Removing GarageYrBlt
# - GarageYrBlt & GarageCond: 0.9430052130771328
# - GarageQual & GarageCond: 0.9416339103102874      Removing GarageCond
# - GarageCond & GarageQual: 0.9416339103102874
# - GarageQual & GarageYrBlt: 0.9367589577001315
# - GarageYrBlt & GarageQual: 0.9367589577001315
# - GarageArea & GarageCars: 0.8912204450237325      Removing GarageArea
# - GarageCars & GarageArea: 0.8912204450237325
# - TotalSF & GrLivArea: 0.8663484828491583          Removing GrLivArea
# - GrLivArea & TotalSF: 0.8663484828491583
# - Fireplaces & FireplaceQu: 0.858974953102109      Removing FireplaceQu
# - FireplaceQu & Fireplaces: 0.858974953102109
# - SalePrice & TotalSF: 0.8193539295384036
# - TotalSF & SalePrice: 0.8193539295384036
# - TotalBsmtSF & TotalSF: 0.8108202085042245        Removing TotalBsmtSF
# - TotalSF & TotalBsmtSF: 0.8108202085042245
# - GrLivArea & TotRmsAbvGrd: 0.8066150649334259   
# - TotRmsAbvGrd & GrLivArea: 0.8066150649334259
# - BsmtFinSF2 & BsmtFinType2: 0.8044119822833958    Removing BsmtFinType2
# - BsmtFinType2 & BsmtFinSF2: 0.8044119822833958
# - OverallQual & SalePrice: 0.7905058765051461
# - SalePrice & OverallQual: 0.7905058765051461
# - 1stFlrSF & TotalBsmtSF: 0.7888234584139528     
# - TotalBsmtSF & 1stFlrSF: 0.7888234584139528
# - 1stFlrSF & TotalSF: 0.7778403581906675           Removing 1stFlrSF
# - TotalSF & 1stFlrSF: 0.7778403581906675
# - SalePrice & GrLivArea: 0.7197206293691185
# - GrLivArea & SalePrice: 0.7197206293691185
# - FullBath & TotalBath: 0.715880332033678          Removing FullBath  
# - TotalBath & FullBath: 0.715880332033678
# - BsmtFinSF1 & BsmtFinType1: 0.7145783833237291    Removing BsmtFinType1
# - BsmtFinType1 & BsmtFinSF1: 0.7145783833237291
# - ExterQual & OverallQual: 0.7136966591877554      Removing ExterQual
# - OverallQual & ExterQual: 0.7136966591877554

In [24]:
# Drop one column from each highly correlated pair noted above, plus 2ndFlrSF,
# which (per the original note) is already included in TotalSF. The deck/porch
# source columns (WoodDeckSF, OpenPorchSF, EnclosedPorch, ScreenPorch) were
# already dropped in an earlier cell.
redundant_cols = [
    "YearBuilt",
    "2ndFlrSF",
    "YearRemodAdd",
    "GarageYrBlt",
    "GarageCond",
    "GarageArea",
    "GrLivArea",
    "FireplaceQu",
    "TotalBsmtSF",
    "BsmtFinType2",
    "1stFlrSF",
    "FullBath",
    "BsmtFinType1",
    "ExterQual",
]
housing.drop(columns=redundant_cols, inplace=True)

<IPython.core.display.Javascript object>

### Making Sure They Were All Removed ###

In [26]:
# Re-run the correlation check on the reduced frame: list every pair of numeric
# columns whose absolute correlation still exceeds the threshold, strongest
# first. Each pair is printed in both directions (A & B and B & A) because the
# correlation matrix is symmetric.
# NOTE(review): this repeats the logic of the earlier correlation cell; consider
# extracting a shared helper function.
threshold = 0.7

# Correlations are computed on the numeric columns only
numeric_cols = housing.select_dtypes(include=np.number).columns
corr_matrix = housing[numeric_cols].corr()

# Blank out (NaN) every cell at or below the threshold, then reshape the matrix
# into one row per (level_0, level_1) pair.
strong_only = corr_matrix.where(corr_matrix.abs() > threshold)
highly_correlated_cols = (
    strong_only.stack()
    # Drop the blanked cells explicitly: the newer pandas `stack`
    # implementation keeps NaN entries instead of discarding them.
    .dropna()
    .reset_index()
    .rename(columns={0: "correlation"})
    # A column is always perfectly correlated with itself; skip those rows
    .loc[lambda pairs: pairs["level_0"] != pairs["level_1"]]
    # Order by strength regardless of sign, using a temporary helper column
    .assign(abs_correlation=lambda pairs: pairs["correlation"].abs())
    .sort_values("abs_correlation", ascending=False)
    .drop(columns="abs_correlation")
)

# Print highly correlated columns with correlation coefficients
for col1, col2, corr in highly_correlated_cols.itertuples(index=False, name=None):
    print(f"- {col1} & {col2}: {corr}")

- SalePrice & TotalSF: 0.8193539295384036
- TotalSF & SalePrice: 0.8193539295384036
- SalePrice & OverallQual: 0.7905058765051461
- OverallQual & SalePrice: 0.7905058765051461


<IPython.core.display.Javascript object>

In [27]:
# Write the reduced frame to disk, leaving out the row index
output_csv = "../data/housing_corr.csv"
housing.to_csv(output_csv, index=False)

<IPython.core.display.Javascript object>