In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import re
from sklearn.model_selection import train_test_split


# Load the nb_black extension, which reformats each code cell after it is run
%load_ext nb_black

# Let pandas display up to 200 rows and up to 200 columns before truncating
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, 200)

The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

In [19]:
# Read the cleaned housing table and the per-PID locations table from data/
housing, locations = (
    pd.read_csv(csv_path)
    for csv_path in ("data/housing_cleaned.csv", "data/locations.csv")
)

<IPython.core.display.Javascript object>

In [20]:
# Reposition SalePrice at column position 1 (the first column after PID)
sale_price = housing.pop("SalePrice")
housing.insert(1, "SalePrice", sale_price)

# Discard columns that are not carried forward, including the street quantile bins
columns_to_discard = [
    "MSSubClass",
    "Street_name",
    "PoolQC",
    "Prop_Addr",
    "Street_10_Quantiles",
    "Street_20_Quantiles",
    "Street_30_Quantiles",
    "Street_50_Quantiles",
]
housing = housing.drop(columns=columns_to_discard)

# Keep only observations with an address, i.e. Street_type is not the string "None"
# NOTE(review): pandas >= 2.0 includes "None" in read_csv's default NA strings; if
# that applies, these cells load as NaN and this filter keeps every row — confirm.
has_address = housing["Street_type"] != "None"
housing = housing[has_address]

<IPython.core.display.Javascript object>

In [21]:
# Should these rows also be removed where the sale condition is not normal?

# Normal: indicates a normal sale where no special conditions apply.
# Partial: indicates the home was not completed when last assessed (associated with new homes).
# Abnorml: indicates an abnormal sale where conditions like foreclosure, short sale or sheriff sale apply.
# Family: indicates a sale between family members.
# Alloca: indicates an allocation - two linked properties with separate deeds, typically a condo with a garage unit.
# AdjLand: indicates an adjoining land purchase.

<IPython.core.display.Javascript object>

In [22]:
# Dropping the lone observation that belongs to the Landmrk neighborhood
landmrk_index = housing.loc[housing["Neighborhood"] == "Landmrk"].index
housing.drop(index=landmrk_index, inplace=True)


<IPython.core.display.Javascript object>

In [23]:
# Attach each house's Neighborhood_st value from the locations table, matched on PID;
# a left join keeps every housing row even when no location row matches
location_lookup = locations[["PID", "Neighborhood_st"]]
housing = pd.merge(housing, location_lookup, how="left", on="PID")


<IPython.core.display.Javascript object>

In [24]:
# Ordinal encodings: each dictionary maps a category code to an integer rank,
# where a higher number means a better / more finished level and 0 is used for
# the "feature absent" code where one exists.
GQual = {"No_Garage": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
GQualCond = {"No_Garage": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
GFin = {"No_Garage": 0, "Unf": 1, "RFn": 2, "Fin": 3}
# Basement exposure runs No < Mn (minimum) < Av (average) < Gd (good), so "Gd"
# must outrank "Av"; the previous mapping had these two swapped.
BExposure = {"No_Bsmt": 0, "No": 1, "Mn": 2, "Av": 3, "Gd": 4}
BFinType = {"No_Bsmt": 0, "Unf": 1, "LwQ": 2, "Rec": 3, "BLQ": 4, "ALQ": 5, "GLQ": 6}
BQualCond = {"No_Bsmt": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
FenceQual = {"No_Fence": 0, "MnWw": 1, "GdWo": 2, "MnPrv": 3, "GdPrv": 4}
FireQual = {"No_Fireplace": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
# Exterior / heating / kitchen quality has no "absent" code, so the scale starts at 0 for "Po"
ExQualCond = {"Po": 0, "Fa": 1, "TA": 2, "Gd": 3, "Ex": 4}

<IPython.core.display.Javascript object>

In [25]:
# Replace each ordinal text column with its integer rank from the dictionaries
# defined above. Direct dictionary indexing is used on purpose: a value that is
# missing from the dictionary raises KeyError instead of silently becoming NaN.
ordinal_encodings = {
    "BsmtExposure": BExposure,
    "BsmtFinType1": BFinType,
    "BsmtFinType2": BFinType,
    "BsmtQual": BQualCond,
    "BsmtCond": BQualCond,
    "GarageQual": GQualCond,
    "GarageCond": GQualCond,
    "GarageFinish": GFin,
    "FireplaceQu": FireQual,
    "Fence": FenceQual,
    "ExterQual": ExQualCond,
    "ExterCond": ExQualCond,
    "HeatingQC": ExQualCond,
    "KitchenQual": ExQualCond,
}
for column_name, encoding in ordinal_encodings.items():
    # Bind the current dictionary as a default so each lambda keeps its own mapping
    housing[column_name] = housing[column_name].apply(
        lambda level, ranks=encoding: ranks[level]
    )

<IPython.core.display.Javascript object>

In [26]:
# Hold out 20% of the rows as a test set, stratifying on Neighborhood because the
# neighborhood is what we are interested in; the random seed is fixed at 42
neighborhood_labels = housing["Neighborhood"]
strat_train_set, strat_test_set = train_test_split(
    housing, random_state=42, stratify=neighborhood_labels, test_size=0.2
)

<IPython.core.display.Javascript object>

In [27]:
# Print each neighborhood's share of rows in the test and train sets so the two
# distributions can be compared by eye
for heading, subset in (
    ("Test set neighborhood distribution:", strat_test_set),
    ("\nTrain set neighborhood distribution:", strat_train_set),
):
    print(heading)
    print(subset["Neighborhood"].value_counts() / len(subset))


Test set neighborhood distribution:
NAmes      0.160156
CollgCr    0.091797
OldTown    0.082031
Edwards    0.064453
Somerst    0.056641
Gilbert    0.056641
Sawyer     0.054688
NWAmes     0.048828
NridgHt    0.046875
SawyerW    0.042969
BrkSide    0.041016
Mitchel    0.041016
Crawfor    0.035156
IDOTRR     0.029297
NoRidge    0.025391
Timber     0.019531
StoneBr    0.017578
SWISU      0.015625
ClearCr    0.015625
MeadowV    0.013672
BrDale     0.011719
Blmngtn    0.007812
Veenker    0.007812
NPkVill    0.007812
Blueste    0.003906
Greens     0.001953
Name: Neighborhood, dtype: float64

Train set neighborhood distribution:
NAmes      0.159335
CollgCr    0.092375
OldTown    0.081134
Edwards    0.063539
Gilbert    0.055718
Somerst    0.055718
Sawyer     0.054252
NWAmes     0.047898
NridgHt    0.047410
SawyerW    0.043988
Mitchel    0.040078
BrkSide    0.040078
Crawfor    0.036168
IDOTRR     0.028348
NoRidge    0.026393
Timber     0.020528
StoneBr    0.016618
SWISU      0.016129
ClearCr    

<IPython.core.display.Javascript object>

In [28]:
# Write both splits to CSV files under data/, leaving out the index column
for split_frame, csv_path in (
    (strat_train_set, 'data/strat_train_set.csv'),
    (strat_test_set, 'data/strat_test_set.csv'),
):
    split_frame.to_csv(csv_path, index=False)



<IPython.core.display.Javascript object>

In [140]:
def group_neighbor_streets_by_saleprice(traindf=None, testdf=None, num_quantiles=10):
    """Bucket neighborhood/street combinations by their mean training sale price.

    The mean ``SalePrice`` of every ``Neighborhood_st`` value is computed from
    the training frame only, and those means are cut into ``num_quantiles``
    quantile groups labelled ``1 .. num_quantiles`` (1 = lowest mean price).
    The resulting street -> group mapping is then written to a new
    ``StreetPriceGroup`` column in BOTH frames (the frames are modified in place).

    Because the mapping is learned from the training frame alone, any
    ``Neighborhood_st`` value that occurs in the testing frame but not in the
    training frame is left as NaN in the testing frame's ``StreetPriceGroup``.

    Parameters
    ----------
    traindf : pandas.DataFrame
        Training data; must contain ``Neighborhood_st`` and ``SalePrice``.
    testdf : pandas.DataFrame
        Testing data; must contain ``Neighborhood_st``.
    num_quantiles : int, default 10
        Number of quantile groups to create.

    Returns
    -------
    dict
        Mapping of each training ``Neighborhood_st`` value to its group number.

    Raises
    ------
    ValueError
        If ``traindf`` or ``testdf`` is not supplied.
    """
    # The defaults used to be the globals `train_df` / `test_df`, which are looked
    # up when the `def` runs and raise NameError on a fresh kernel. Requiring the
    # frames explicitly keeps the function independent of kernel state.
    if traindf is None or testdf is None:
        raise ValueError("traindf and testdf must both be provided")

    # Calculate the mean sale price for each street in the training df
    street_prices = traindf.groupby("Neighborhood_st")["SalePrice"].mean()
    # Group the streets into the specified number of quantiles based on sale price
    groups = pd.qcut(
        street_prices, q=num_quantiles, labels=list(range(1, num_quantiles + 1))
    )
    # Create a dictionary that maps each street name to its sale price group number
    street_group_dict = dict(zip(street_prices.index, groups))
    # Add a new column to the training dataframe with the street price groups
    traindf["StreetPriceGroup"] = traindf["Neighborhood_st"].map(street_group_dict)
    # Add the same column to the testing dataframe; unseen streets become NaN
    testdf["StreetPriceGroup"] = testdf["Neighborhood_st"].map(street_group_dict)
    return street_group_dict


<IPython.core.display.Javascript object>

In [142]:
# Build the street price groups with 5 quantiles (the count can be changed) and keep
# the returned street -> group dictionary in `d`; it is used afterwards to fill in
# the missing values
d = group_neighbor_streets_by_saleprice(
    strat_train_set, strat_test_set, num_quantiles=5
)

<IPython.core.display.Javascript object>

In [145]:
def fill_na(testdf=None, d=None):
    """Fill missing ``StreetPriceGroup`` values using a group from the same neighborhood.

    The neighborhood is taken to be the part of ``Neighborhood_st`` before the
    first underscore. The street -> group dictionary ``d`` is collapsed to a
    neighborhood -> group dictionary; when several streets share a
    neighborhood, the group of the last such street in ``d`` is the one kept.
    Rows of ``testdf`` whose ``StreetPriceGroup`` is NaN then receive their
    neighborhood's group. ``testdf`` is modified in place.

    Rows stay NaN when their neighborhood does not appear in ``d`` or when
    their ``Neighborhood_st`` is itself missing.

    Parameters
    ----------
    testdf : pandas.DataFrame
        Frame with ``Neighborhood_st`` and ``StreetPriceGroup`` columns.
    d : dict, optional
        Mapping of ``Neighborhood_st`` values to group numbers. Treated as
        empty when omitted.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``testdf`` is not supplied.
    """
    # The default used to be the global `test_df`, which raises NameError when the
    # `def` runs on a fresh kernel; the frame must now be passed explicitly.
    if testdf is None:
        raise ValueError("testdf must be provided")
    if d is None:
        d = {}

    # Collapse street-level keys to their neighborhood prefix (last street wins)
    neighborhood_groups = {key.split("_")[0]: group for key, group in d.items()}

    # Select the rows to fill directly from the NaN mask. Going through a list of
    # PIDs would also overwrite already-filled rows that share a PID with one.
    missing = testdf["StreetPriceGroup"].isna()
    if not missing.any():
        return

    # The .str accessor passes NaN through, so a missing Neighborhood_st no longer
    # raises AttributeError the way a plain `x.split("_")` does.
    prefixes = testdf.loc[missing, "Neighborhood_st"].str.split("_").str[0]
    testdf.loc[missing, "StreetPriceGroup"] = prefixes.map(neighborhood_groups)


<IPython.core.display.Javascript object>

In [146]:
# Fill the test set's missing StreetPriceGroup values using the dictionary built above
fill_na(testdf=strat_test_set, d=d)

<IPython.core.display.Javascript object>

In [147]:
# Per-column count of missing values remaining in the test set
strat_test_set.isnull().sum(axis=0)

PID                    0
SalePrice              0
GrLivArea              0
MSZoning               0
LotFrontage            0
LotArea                0
Street                 0
Alley                  0
LotShape               0
LandContour            0
Utilities              0
LotConfig              0
LandSlope              0
Neighborhood           0
Condition1             0
Condition2             0
BldgType               0
HouseStyle             0
OverallQual            0
OverallCond            0
YearBuilt              0
YearRemodAdd           0
RoofStyle              0
RoofMatl               0
Exterior1st            0
Exterior2nd            0
MasVnrType             0
MasVnrArea             0
ExterQual              0
ExterCond              0
Foundation             0
BsmtQual               0
BsmtCond               0
BsmtExposure           0
BsmtFinType1           0
BsmtFinSF1             0
BsmtFinType2           0
BsmtFinSF2             0
BsmtUnfSF              0
TotalBsmtSF            0


<IPython.core.display.Javascript object>