In [5]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

# the below extension properly formats a cell after it is run
%load_ext nb_black

# Allow up to 200 rows and 200 columns to be shown before pandas truncates
# the rendering of a DataFrame.
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, 200)


The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

In [8]:
# Read the two input tables; both paths are relative to the notebook's
# working directory.
housing_csv = "../data/housing_cleaned.csv"
locations_csv = "../data/locations.csv"

housing = pd.read_csv(housing_csv)
locations = pd.read_csv(locations_csv)

<IPython.core.display.Javascript object>

In [9]:
# Drop the observation that is the only one in the Landmrk neighborhood
landmrk_rows = housing.index[housing["Neighborhood"] == "Landmrk"]
housing.drop(index=landmrk_rows, inplace=True)

<IPython.core.display.Javascript object>

In [10]:
# Attach the Neighborhood_st column from the locations table, matching on PID.
# A left join keeps every housing row even when no location row matches.
neighborhood_lookup = locations[["PID", "Neighborhood_st"]]
housing = pd.merge(housing, neighborhood_lookup, how="left", on="PID")

<IPython.core.display.Javascript object>

In [11]:
# Lookup dictionaries that translate ordered category labels into integer
# ranks (a higher number means a better / more finished level).  Scales that
# have a "feature absent" label (No_Garage, No_Bsmt, No_Fence, No_Fireplace)
# reserve 0 for it.

# Garage quality / condition.  GQual is not referenced by the encoding cell in
# this notebook (GQualCond is used instead); it is kept so existing references
# to the name keep working.
GQual = {"No_Garage": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
GQualCond = {"No_Garage": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}

# Garage interior finish
GFin = {"No_Garage": 0, "Unf": 1, "RFn": 2, "Fin": 3}

# Basement exposure.  In the Ames data dictionary the levels run
# No < Mn (minimum) < Av (average) < Gd (good), so "Gd" must rank above "Av".
BExposure = {"No_Bsmt": 0, "No": 1, "Mn": 2, "Av": 3, "Gd": 4}

# Basement finished-area rating
BFinType = {"No_Bsmt": 0, "Unf": 1, "LwQ": 2, "Rec": 3, "BLQ": 4, "ALQ": 5, "GLQ": 6}

# Basement quality / condition
BQualCond = {"No_Bsmt": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}

# Fence quality
FenceQual = {"No_Fence": 0, "MnWw": 1, "GdWo": 2, "MnPrv": 3, "GdPrv": 4}

# Fireplace quality
FireQual = {"No_Fireplace": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}

# Exterior / heating / kitchen quality and condition.
# NOTE: this scale has no "absent" label and is zero-based ("Po" is 0), unlike
# the scales above where "Po" is 1.
ExQualCond = {"Po": 0, "Fa": 1, "TA": 2, "Gd": 3, "Ex": 4}

<IPython.core.display.Javascript object>

In [12]:
# Replace the category labels of each ordered column with the integer ranks
# from the lookup dictionaries defined earlier.  Every entry below pairs a
# column with the dictionary used to translate it.
ordinal_encodings = {
    "BsmtExposure": BExposure,
    "BsmtFinType1": BFinType,
    "BsmtFinType2": BFinType,
    "BsmtQual": BQualCond,
    "BsmtCond": BQualCond,
    "GarageQual": GQualCond,
    "GarageCond": GQualCond,
    "GarageFinish": GFin,
    "FireplaceQu": FireQual,
    "Fence": FenceQual,
    "ExterQual": ExQualCond,
    "ExterCond": ExQualCond,
    "HeatingQC": ExQualCond,
    "KitchenQual": ExQualCond,
}

for column, encoding in ordinal_encodings.items():
    encoded = housing[column].map(encoding)

    # Series.map yields NaN for any value missing from the dictionary (this
    # includes values that are already NaN).  Fail loudly, with the column and
    # the offending labels, instead of storing a partially encoded column.
    unmapped = housing.loc[encoded.isna(), column].unique()
    if len(unmapped) > 0:
        raise KeyError(f"{column}: no ordinal code for value(s) {list(unmapped)}")

    housing[column] = encoded

<IPython.core.display.Javascript object>

# Looking for Highly Correlated Columns #

In [13]:
# Restrict the analysis to the numeric columns of the DataFrame
numeric_cols = housing.select_dtypes(include=np.number).columns

# Pairwise correlation matrix of those columns
corr_matrix = housing[numeric_cols].corr()

# Keep only the cells whose absolute correlation exceeds the threshold and
# reshape them into one row per (column, column) pair.
threshold = 0.7
highly_correlated_cols = (
    corr_matrix[corr_matrix.abs() > threshold]
    .stack()
    # Cells below the threshold are NaN after masking.  Older pandas drops them
    # implicitly inside stack(); the newer stack implementation keeps them, so
    # they are dropped explicitly to get the same result on every version.
    .dropna()
    .reset_index()
    .rename(columns={0: "correlation"})
)

# Every column correlates perfectly with itself; discard those diagonal pairs
is_self_pair = highly_correlated_cols["level_0"] == highly_correlated_cols["level_1"]
highly_correlated_cols = highly_correlated_cols[~is_self_pair]

# Order the pairs from strongest to weakest relationship, ignoring the sign
strength_order = (
    highly_correlated_cols["correlation"].abs().sort_values(ascending=False).index
)
highly_correlated_cols = highly_correlated_cols.loc[strength_order]

# Print highly correlated columns with correlation coefficients.  Each pair
# appears twice (A & B and B & A) because the correlation matrix is symmetric.
for col1, col2, corr in zip(
    highly_correlated_cols["level_0"],
    highly_correlated_cols["level_1"],
    highly_correlated_cols["correlation"],
):
    print(f"- {col1} & {col2}: {corr}")

- Age & YearBuilt: -0.9990187953011697
- YearBuilt & Age: -0.9990187953011697
- RemodAge & YearRemodAdd: -0.9979373392786269
- YearRemodAdd & RemodAge: -0.9979373392786269
- GarageCond & GarageYrBlt: 0.9430052130771328
- GarageYrBlt & GarageCond: 0.9430052130771328
- GarageQual & GarageCond: 0.9416339103102874
- GarageCond & GarageQual: 0.9416339103102874
- GarageQual & GarageYrBlt: 0.9367589577001315
- GarageYrBlt & GarageQual: 0.9367589577001315
- GarageArea & GarageCars: 0.8912204450237325
- GarageCars & GarageArea: 0.8912204450237325
- TotalSF & GrLivArea: 0.8663484828491583
- GrLivArea & TotalSF: 0.8663484828491583
- Fireplaces & FireplaceQu: 0.858974953102109
- FireplaceQu & Fireplaces: 0.858974953102109
- SalePrice & TotalSF: 0.8193539295384036
- TotalSF & SalePrice: 0.8193539295384036
- TotalBsmtSF & TotalSF: 0.8108202085042245
- TotalSF & TotalBsmtSF: 0.8108202085042245
- GrLivArea & TotRmsAbvGrd: 0.8066150649334259
- TotRmsAbvGrd & GrLivArea: 0.8066150649334259
- BsmtFinSF2 &

<IPython.core.display.Javascript object>

In [None]:


# - Age & YearBuilt: -0.9990187953011697             Removing YearBuilt
# - YearBuilt & Age: -0.9990187953011697
# - RemodAge & YearRemodAdd: -0.9979373392786269     Removing YearRemodAdd
# - YearRemodAdd & RemodAge: -0.9979373392786269 
# - GarageCond & GarageYrBlt: 0.9430052130771328     Removing GarageYrBlt
# - GarageYrBlt & GarageCond: 0.9430052130771328
# - GarageQual & GarageCond: 0.9416339103102874      Removing GarageCond
# - GarageCond & GarageQual: 0.9416339103102874
# - GarageQual & GarageYrBlt: 0.9367589577001315
# - GarageYrBlt & GarageQual: 0.9367589577001315
# - GarageArea & GarageCars: 0.8912204450237325      Removing GarageCars
# - GarageCars & GarageArea: 0.8912204450237325
# - TotalSF & GrLivArea: 0.8663484828491583          Removing GrLivArea
# - GrLivArea & TotalSF: 0.8663484828491583
# - Fireplaces & FireplaceQu: 0.858974953102109      Removing FireplaceQu
# - FireplaceQu & Fireplaces: 0.858974953102109
# - SalePrice & TotalSF: 0.8193539295384036
# - TotalSF & SalePrice: 0.8193539295384036
# - TotalBsmtSF & TotalSF: 0.8108202085042245        Removing TotalBsmtSF
# - TotalSF & TotalBsmtSF: 0.8108202085042245
# - GrLivArea & TotRmsAbvGrd: 0.8066150649334259   
# - TotRmsAbvGrd & GrLivArea: 0.8066150649334259
# - BsmtFinSF2 & BsmtFinType2: 0.8044119822833958    Removing BsmtFinType2
# - BsmtFinType2 & BsmtFinSF2: 0.8044119822833958
# - OverallQual & SalePrice: 0.7905058765051461
# - SalePrice & OverallQual: 0.7905058765051461
# - 1stFlrSF & TotalBsmtSF: 0.7888234584139528     
# - TotalBsmtSF & 1stFlrSF: 0.7888234584139528
# - 1stFlrSF & TotalSF: 0.7778403581906675           Removing 1stFlrSF
# - TotalSF & 1stFlrSF: 0.7778403581906675
# - SalePrice & GrLivArea: 0.7197206293691185
# - GrLivArea & SalePrice: 0.7197206293691185
# - FullBath & TotalBath: 0.715880332033678          Removing FullBath  
# - TotalBath & FullBath: 0.715880332033678
# - BsmtFinSF1 & BsmtFinType1: 0.7145783833237291    Removing BsmtFinType1
# - BsmtFinType1 & BsmtFinSF1: 0.7145783833237291
# - ExterQual & OverallQual: 0.7136966591877554      Removing ExterQual
# - OverallQual & ExterQual: 0.7136966591877554

In [15]:
# Columns selected for removal during the correlation review: one column from
# each highly correlated pair of predictors.
redundant_columns = [
    "YearBuilt",
    "YearRemodAdd",
    "GarageYrBlt",
    "GarageCond",
    "GarageCars",
    "GrLivArea",
    "FireplaceQu",
    "TotalBsmtSF",
    "BsmtFinType2",
    "1stFlrSF",
    "FullBath",
    "BsmtFinType1",
    "ExterQual",
]
housing.drop(columns=redundant_columns, inplace=True)

<IPython.core.display.Javascript object>

### Making Sure They Were All Removed ###

In [16]:
# Re-run the correlation scan on the reduced DataFrame to confirm which
# highly correlated pairs are still present.

# Restrict the analysis to the numeric columns of the DataFrame
numeric_cols = housing.select_dtypes(include=np.number).columns

# Pairwise correlation matrix of those columns
corr_matrix = housing[numeric_cols].corr()

# Keep only the cells whose absolute correlation exceeds the threshold and
# reshape them into one row per (column, column) pair.
threshold = 0.7
highly_correlated_cols = (
    corr_matrix[corr_matrix.abs() > threshold]
    .stack()
    # Cells below the threshold are NaN after masking.  Older pandas drops them
    # implicitly inside stack(); the newer stack implementation keeps them, so
    # they are dropped explicitly to get the same result on every version.
    .dropna()
    .reset_index()
    .rename(columns={0: "correlation"})
)

# Every column correlates perfectly with itself; discard those diagonal pairs
is_self_pair = highly_correlated_cols["level_0"] == highly_correlated_cols["level_1"]
highly_correlated_cols = highly_correlated_cols[~is_self_pair]

# Order the pairs from strongest to weakest relationship, ignoring the sign
strength_order = (
    highly_correlated_cols["correlation"].abs().sort_values(ascending=False).index
)
highly_correlated_cols = highly_correlated_cols.loc[strength_order]

# Print highly correlated columns with correlation coefficients.  Each pair
# appears twice (A & B and B & A) because the correlation matrix is symmetric.
for col1, col2, corr in zip(
    highly_correlated_cols["level_0"],
    highly_correlated_cols["level_1"],
    highly_correlated_cols["correlation"],
):
    print(f"- {col1} & {col2}: {corr}")

- SalePrice & TotalSF: 0.8193539295384036
- TotalSF & SalePrice: 0.8193539295384036
- SalePrice & OverallQual: 0.7905058765051461
- OverallQual & SalePrice: 0.7905058765051461


<IPython.core.display.Javascript object>

In [24]:
# Write the reduced DataFrame to disk; index=False leaves the row index out of
# the file.
output_csv = "../data/housing_corr.csv"
housing.to_csv(output_csv, index=False)

<IPython.core.display.Javascript object>

In [22]:
# Display the resulting DataFrame as a final visual check
housing

Unnamed: 0,PID,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,Heating,HeatingQC,CentralAir,Electrical,2ndFlrSF,BsmtFullBath,BsmtHalfBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,GarageType,GarageFinish,GarageArea,GarageQual,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,ScreenPorch,Fence,MoSold,YrSold,SaleType,SaleCondition,Age,RemodAge,TotalSF,Remodeled,TotalPorchSF,TotalBath,MSSubClass_cat,Street_type,Neighborhood_st
0,909176150,126000,30,RL,0.0,7890,Pave,No_Alley,Reg,Lvl,AllPub,Corner,Gtl,SWISU,Norm,Norm,1Fam,1Story,6,6,Gable,CompShg,Wd Sdng,Wd Sdng,,0.0,2,CBlock,3,3,1,238.0,0.0,618.0,GasA,2,Y,SBrkr,0,1.0,0.0,0,2,1,2,4,Typ,1,Detchd,1,399.0,3,Y,0,0,0,166,0,3,2010,WD,Normal,71,60,1712.0,1,166,2.0,subclass30,AVE,SWISU_HAYWARD_AVE
1,905476230,139500,120,RL,42.0,4235,Pave,No_Alley,Reg,Lvl,AllPub,Inside,Gtl,Edwards,Norm,Norm,TwnhsE,1Story,5,5,Gable,CompShg,HdBoard,HdBoard,BrkFace,149.0,2,CBlock,4,3,2,552.0,393.0,104.0,GasA,2,Y,SBrkr,0,1.0,0.0,0,2,1,3,5,Typ,0,Attchd,3,266.0,3,Y,0,105,0,0,0,2,2009,WD,Normal,25,25,2098.0,0,105,3.0,subclass120,ST,Edwards_WEST_ST
2,911128020,124900,30,C (all),60.0,6060,Pave,No_Alley,Reg,Lvl,AllPub,Inside,Gtl,IDOTRR,Norm,Norm,1Fam,1Story,5,9,Hip,CompShg,MetalSd,MetalSd,,0.0,2,BrkTil,3,3,1,737.0,0.0,100.0,GasA,4,Y,SBrkr,0,0.0,0.0,0,2,1,3,5,Typ,0,Detchd,1,216.0,3,N,154,0,42,0,0,11,2007,WD,Normal,77,0,1838.0,1,128,1.0,subclass30,ST,IDOTRR_S_2ND_ST
3,535377150,114000,70,RL,80.0,8146,Pave,No_Alley,Reg,Lvl,AllPub,Corner,Gtl,OldTown,Norm,Norm,1Fam,2Story,4,8,Gable,CompShg,MetalSd,MetalSd,,0.0,3,BrkTil,2,3,1,0.0,0.0,405.0,GasA,3,Y,SBrkr,322,0.0,0.0,0,2,1,2,6,Typ,0,Detchd,1,281.0,3,N,0,0,168,111,0,5,2009,WD,Normal,109,6,1444.0,1,279,1.0,subclass70,AVE,OldTown_DOUGLAS_AVE
4,534177230,227000,60,RL,70.0,8400,Pave,No_Alley,Reg,Lvl,AllPub,Inside,Gtl,NWAmes,Norm,Norm,1Fam,2Story,8,6,Gable,CompShg,VinylSd,VinylSd,,0.0,2,PConc,4,3,1,643.0,0.0,167.0,GasA,4,Y,SBrkr,855,1.0,0.0,1,3,1,3,6,Typ,0,Attchd,3,528.0,3,Y,0,45,0,0,0,11,2009,WD,Normal,8,8,2475.0,0,45,3.5,subclass60,AVE,NWAmes_FILLMORE_AVE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2553,903205040,121000,30,RL,0.0,8854,Pave,No_Alley,Reg,Lvl,AllPub,Inside,Gtl,BrkSide,Norm,Norm,1Fam,1.5Unf,6,6,Gable,CompShg,Wd Sdng,Wd Sdng,,0.0,2,BrkTil,3,3,1,0.0,0.0,952.0,Grav,1,N,FuseF,0,0.0,0.0,0,2,1,1,4,Typ,1,Detchd,1,192.0,2,P,0,98,0,40,0,5,2009,WD,Normal,93,59,1904.0,1,138,1.0,subclass30,AVE,BrkSide_RIDGEWOOD_AVE
2554,905402060,139600,20,RL,0.0,13680,Pave,No_Alley,IR1,Lvl,AllPub,CulDSac,Gtl,Edwards,Norm,Norm,1Fam,1Story,3,5,Hip,CompShg,BrkFace,Wd Sdng,,0.0,2,Slab,0,0,0,0.0,0.0,0.0,GasA,4,Y,FuseA,0,0.0,0.0,0,4,1,2,8,Min2,1,Attchd,1,452.0,3,Y,0,0,0,0,0,6,2009,WD,Normal,54,54,1733.0,0,0,2.0,subclass20,CIR,Edwards_MARY_CIR
2555,909275030,145000,90,RH,82.0,6270,Pave,No_Alley,Reg,HLS,AllPub,Inside,Gtl,Crawfor,Norm,Norm,Duplex,2Story,5,6,Gable,CompShg,MetalSd,MetalSd,,0.0,2,CBlock,3,3,1,284.0,0.0,717.0,GasA,2,N,FuseA,1001,0.0,0.0,0,4,2,2,8,Typ,0,2Types,1,871.0,3,Y,0,0,0,0,0,8,2007,WD,Normal,58,57,3003.0,1,0,2.0,subclass90,DR,Crawfor_SUNSET_DR
2556,907192040,217500,60,RL,0.0,8826,Pave,No_Alley,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,Gable,CompShg,VinylSd,VinylSd,BrkFace,144.0,2,PConc,4,3,1,841.0,0.0,144.0,GasA,4,Y,SBrkr,857,1.0,0.0,1,3,1,3,7,Typ,1,Attchd,3,486.0,3,Y,193,96,0,0,0,7,2007,WD,Normal,7,7,2827.0,0,96,3.5,subclass60,BLVD,CollgCr_CLEMENS_BLVD


<IPython.core.display.Javascript object>