# <center> House Prices -- Advance Regression Technique </center>
----

data source: https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/data?select=test.csv

**Aim** : To build a predictive model to predict housing price



In [146]:
import inflection as infl
import math
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go


from plotly.subplots import make_subplots
from scipy.stats import linregress
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.feature_selection import mutual_info_regression, r_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, average_precision_score, f1_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, MaxAbsScaler, StandardScaler
from sklearn.tree import DecisionTreeRegressor


In [14]:
##### helper functions

def dataframe_summary(dataframe: pd.DataFrame,
                      max_length: int):
    
    pd.options.display.float_format = '{:,.4f}'.format

    ### Get list of column names
    col_names = dataframe.columns.to_list()

    ### Get unique count of each column
    col_length = [len(dataframe[val].unique()) for val in col_names]

    ### Append info for dataframe
    col_names.append("this_df")
    col_length.append(len(dataframe))

    ### Create new dataframe for summary
    summary_df = pd.DataFrame(list(zip(col_names, col_length)), columns=["column_name", "unique_count"])
    summary_df["uniqueness"] = summary_df["unique_count"]/len(dataframe) * 100

    ### Count nan values in each column
    nan_df = dataframe.isnull().sum(axis = 0).to_frame().reset_index().rename(columns={"index":"column_name", 0:"nan_count"})
    nan_df["nan_percent"] = nan_df["nan_count"]/len(dataframe)*100
    nan_df["percent_complete"] = 100 - nan_df["nan_percent"]

    ### Create new frame for data type in each column
    type_df = dataframe.dtypes.to_frame().reset_index().rename(columns={"index":"column_name", 0:"data_type"})
    type_df["data_type"] = type_df["data_type"].astype(str)

    ### Create new dataframe with unique entries for each column

    col_name = []
    unique_val = []
    
    for val  in dataframe.columns.to_list():

        col_name.append(val)

        if len(dataframe[val].unique())<=max_length:
            unique_val.append(list(dataframe[val].unique()))

        else:
            unique_val.append(f">{max_length} entries")

    unique_df = pd.DataFrame(list(zip(col_name, unique_val)),
                             columns=["column_name", "unique_entries"])
    
    

    ### Merge summary_df with nan_df, type_df
    summary_df = summary_df.merge(nan_df, on="column_name", how="outer")
    summary_df = summary_df.merge(type_df, on="column_name", how="outer")
    summary_df = summary_df.merge(unique_df, on="column_name", how="outer")


    return summary_df.sort_values(["data_type", "nan_count"])


def describe_dataframe(dataframe: pd.DataFrame):
    
    pd.options.display.float_format = '{:,.3f}'.format
    
    return dataframe.select_dtypes([int, float]).describe()


In [2]:
##### load house price data and rename column to snake case
house_df = pd.read_csv("house_price_train.csv")
house_df.columns = [infl.underscore(val) for val in house_df.columns]

house_df


Unnamed: 0,id,ms_sub_class,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,utilities,...,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type,sale_condition,sale_price
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [120]:
##### introduce new columns
house_df["age_sold"] = house_df["yr_sold"] - house_df["year_built"]

# house_df["age_sold"]

0        5
1       31
2        7
3       91
4        8
        ..
1455     8
1456    32
1457    69
1458    60
1459    43
Name: age_sold, Length: 1460, dtype: int64

----
# <center> Data Clean Up </center>
---

Determine the amount of rows with NaN values. The following features are removed since the amount missing values are >10%. 

ID was also removed since it does not contribute to the prediction.

| Feature | Description |
| --- | --- | 
| pool_qc | pool quality | 
| misc_feature | Miscellaneous features not covered in other categories | 
| alley |  Type of alley access to property | 
| fence |  Fence quality  | 
| mas_vnr_type |  Masonry veneer type -- Brick, stone, etc. | 
| fireplace_qu |  Fireplace quality | 
| lot_frontage |  Linear feet of street connected to property| 

In [12]:
col_names = [val for val in house_df.columns]
nan_count = [len(house_df.loc[house_df[val].isna(), val]) for val in house_df.columns]

nan_count_df = pd.DataFrame(list(zip(col_names, nan_count)), columns=["column", "nan_count"])
nan_count_df["nan_percent"] = nan_count_df["nan_count"]/len(house_df)*100

##### Display the column with NaN; sort according to the amount of NaN in columns
nan_count_df[nan_count_df["nan_percent"]>10].sort_values(["nan_count"], ascending=False)

Unnamed: 0,column,nan_count,nan_percent
72,pool_qc,1453,99.521
74,misc_feature,1406,96.301
6,alley,1369,93.767
73,fence,1179,80.753
25,mas_vnr_type,872,59.726
57,fireplace_qu,690,47.26
3,lot_frontage,259,17.74


In [13]:
##### remove features with more than 10% NaN row

nan_removal = nan_count_df.loc[nan_count_df["nan_percent"]>10, "column"].to_list()
nan_removal = nan_removal + ["id"]

cleaned_house_df = house_df.drop(columns=nan_removal)

cleaned_house_df.head(5)

Unnamed: 0,ms_sub_class,ms_zoning,lot_area,street,lot_shape,land_contour,utilities,lot_config,land_slope,neighborhood,...,enclosed_porch,3_ssn_porch,screen_porch,pool_area,misc_val,mo_sold,yr_sold,sale_type,sale_condition,sale_price
0,60,RL,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,20,RL,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,60,RL,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,70,RL,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,60,RL,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,...,0,0,0,0,0,12,2008,WD,Normal,250000


---
# <center> Exploratory Data Analysis </center>
---



In [19]:
house_df_summary = dataframe_summary(cleaned_house_df, 10)
house_df_summary.sort_values(["percent_complete", "data_type"], ascending=[False, True])

Unnamed: 0,column_name,unique_count,uniqueness,nan_count,nan_percent,percent_complete,data_type,unique_entries
0,ms_sub_class,15,1.0274,0.0000,0.0000,100.0000,int64,>10 entries
2,lot_area,1073,73.4932,0.0000,0.0000,100.0000,int64,>10 entries
14,overall_qual,10,0.6849,0.0000,0.0000,100.0000,int64,"[7, 6, 8, 5, 9, 4, 10, 3, 1, 2]"
15,overall_cond,9,0.6164,0.0000,0.0000,100.0000,int64,"[5, 8, 6, 7, 4, 2, 3, 9, 1]"
16,year_built,112,7.6712,0.0000,0.0000,100.0000,int64,>10 entries
...,...,...,...,...,...,...,...,...
53,garage_type,7,0.4795,81.0000,5.5479,94.4521,object,"[Attchd, Detchd, BuiltIn, CarPort, nan, Basmen..."
55,garage_finish,4,0.2740,81.0000,5.5479,94.4521,object,"[RFn, Unf, Fin, nan]"
58,garage_qual,6,0.4110,81.0000,5.5479,94.4521,object,"[TA, Fa, Gd, nan, Ex, Po]"
59,garage_cond,6,0.4110,81.0000,5.5479,94.4521,object,"[TA, Fa, nan, Gd, Po, Ex]"


## Sale Price with Respect to Neighborhood

In [158]:
house_neighborhood = house_df[["neighborhood", "sale_price", "gr_liv_area", "age_sold"]].groupby(["neighborhood"]).agg(["mean", "median", "std"]).reset_index()
house_neighborhood = house_neighborhood.sort_values(("sale_price", "mean"), ascending=False)
house_neighborhood["mean_scaled"] = house_neighborhood["sale_price"]["mean"]/house_neighborhood["gr_liv_area"]["mean"]
house_neighborhood["median_scaled"] = house_neighborhood["sale_price"]["median"]/house_neighborhood["gr_liv_area"]["median"]



In [157]:
linregress_result = linregress(x=house_neighborhood[("age_sold", "mean")], y=house_neighborhood["mean_scaled"])
linregress_result

dummy_x = [val for val in range(87)]
dummy_y = [linregress_result[0]*val + linregress_result[1] for val in dummy_x]

fig_eda_1 = make_subplots(rows=4, cols=1, 
                          subplot_titles=["Sale Price by Neighborhood", "Sale Price per Square Foot by Neighborhood",
                                          "House Age during Sale", "Sale Price per Square Foot versus House Age"],
                            vertical_spacing=0.08)

fig_eda_1.add_trace( go.Bar(y=house_neighborhood["sale_price"]["mean"], x=house_neighborhood["neighborhood"], orientation="v",
                        text=house_neighborhood["sale_price"]["mean"], texttemplate='$ %{text:,.0f}', marker_color="#811331",
                        showlegend=True, legendgroup="mean", name="mean"), 
                 row=1, col=1)

fig_eda_1.add_trace( go.Bar(y=-1*house_neighborhood["sale_price"]["median"], x=house_neighborhood["neighborhood"], orientation="v", 
                        text=house_neighborhood["sale_price"]["median"], texttemplate='$ %{text:,.0f}', 
                        marker_color="#0F52BA",
                        showlegend=True, legendgroup="median", name="median"), 
                 row=1, col=1)

fig_eda_1.add_trace( go.Bar(y=house_neighborhood["mean_scaled"], x=house_neighborhood["neighborhood"], orientation="v",
                        text=house_neighborhood["mean_scaled"], texttemplate='$ %{text:,.0f}', marker_color="#811331",
                        showlegend=False, legendgroup="mean", name="mean"), 
                 row=2, col=1)

fig_eda_1.add_trace( go.Bar(y=-1*house_neighborhood["median_scaled"], x=house_neighborhood["neighborhood"], orientation="v", 
                        text=house_neighborhood["median_scaled"], texttemplate='$ %{text:,.0f}', 
                        marker_color="#0F52BA",
                        showlegend=False, legendgroup="median", name="median"), 
                 row=2, col=1)

fig_eda_1.add_trace( go.Bar(y=house_neighborhood["age_sold"]["mean"], x=house_neighborhood["neighborhood"], orientation="v",
                        text=house_neighborhood["age_sold"]["mean"], texttemplate='%{text:,.0f} years', marker_color="#811331",
                        showlegend=False, legendgroup="mean", name="mean"), 
                 row=3, col=1)

fig_eda_1.add_trace( go.Bar(y=-1*house_neighborhood["age_sold"]["median"], x=house_neighborhood["neighborhood"], orientation="v", 
                        text=house_neighborhood["age_sold"]["median"], texttemplate='%{text:,.0f} years', 
                        marker_color="#0F52BA",
                        showlegend=False, legendgroup="median", name="median"), 
                 row=3, col=1)

fig_eda_1.add_trace( go.Scatter(x=house_neighborhood[("age_sold", "mean")], y=house_neighborhood["mean_scaled"], 
                                mode="markers", showlegend=False, marker=dict(color="#7D3C98")),
                    row=4, col=1)

fig_eda_1.add_trace( go.Scatter(x=dummy_x, y=dummy_y, mode="lines", showlegend=False, marker=dict(color="#AF7AC5")),
                    row=4, col=1)


fig_eda_1.update_yaxes(showticklabels=False)

fig_eda_1.update_xaxes(title_text="house age during sale (years)", row=4)
fig_eda_1.update_yaxes(title_text="sale price per square foot", showticklabels=True, row=4)

fig_eda_1.update_layout(height=1200, barmode="overlay", yaxis1=dict(range=[-380000, 380000]),
                        legend=dict(yanchor="top", y=1.1, xanchor="left", x=0.45, orientation="h"))

fig_eda_1.show()

## Sale Price Fluctuation

In [182]:
price_temporal = house_df[["yr_sold", "mo_sold", "gr_liv_area", "sale_price"]]
price_temporal["sale_date_str"] = ["{0}-{1}-01".format(price_temporal.loc[val, "yr_sold"], price_temporal.loc[val, "mo_sold"]) for val in range(len(price_temporal))]
price_temporal["date_sold"] = pd.to_datetime(price_temporal["sale_date_str"])
price_temporal["price_per_sf"] = price_temporal["sale_price"]/price_temporal["gr_liv_area"]


price_temporal_ave = price_temporal.groupby("date_sold").mean(numeric_only=True).reset_index()
price_temporal_med = price_temporal.groupby("date_sold").median(numeric_only=True).reset_index()


# price_temporal_ave

In [190]:
fig_eda_2 = make_subplots(rows=2, cols=1)

fig_eda_2.add_trace( go.Scatter(x=price_temporal["date_sold"], y=price_temporal["sale_price"],
                          mode="markers"),
                row=1, col=1)
fig_eda_2.add_trace( go.Scatter(x=price_temporal_ave["date_sold"], y=price_temporal_ave["sale_price"],
                          mode="lines"),
                row=1, col=1)
fig_eda_2.add_trace( go.Scatter(x=price_temporal_med["date_sold"], y=price_temporal_med["sale_price"],
                          mode="lines"),
                row=1, col=1)


fig_eda_2.add_trace( go.Scatter(x=price_temporal["date_sold"], y=price_temporal["price_per_sf"],
                          mode="markers"),
                row=2, col=1)
fig_eda_2.add_trace( go.Scatter(x=price_temporal_ave["date_sold"], y=price_temporal_ave["price_per_sf"],
                          mode="lines"),
                row=2, col=1)
fig_eda_2.add_trace( go.Scatter(x=price_temporal_med["date_sold"], y=price_temporal_med["price_per_sf"],
                          mode="lines"),
                row=2, col=1)

fig_eda_2.update_layout(height=600)
fig_eda_2.show()


## House Characteristics

In [259]:
##### Building type
house_bldg_type = house_df[["sale_price", "neighborhood", "house_style", "bldg_type"]]

### group by building type only
house_bldg_type_1 =  house_bldg_type.groupby(["bldg_type"]).count().rename(columns={"sale_price":"count"}).reset_index()
house_bldg_type_1 = house_bldg_type_1[["bldg_type", "count"]]

### group by neighborhood and building type
house_bldg_type_2 = house_bldg_type.groupby(["neighborhood", "bldg_type"]).count()
house_bldg_type_2 = house_bldg_type_2["sale_price"].rename({"sale_price":"count"}).unstack().fillna(0)

house_bldg_type_3 = house_bldg_type_2.div(house_bldg_type_2.sum(axis=1), axis=0)

house_bldg_type_4 = house_bldg_type.groupby(["neighborhood", "bldg_type"]).agg(["mean", "median"]).reset_index()


##### House style
house_style = house_df[["sale_price", "neighborhood", "house_style", "bldg_type"]]

house_style_1 = house_style.groupby(["house_style"]).count().rename(columns={"sale_price":"count"}).reset_index()
house_style_1 = house_style_1[["house_style", "count"]]

house_style_2 = house_style.groupby(["neighborhood", "house_style"]).count()
house_style_2 = house_style_2["sale_price"].rename({"sale_price":"count"}).unstack().fillna(0)

house_style_3 = house_style_2.div(house_style_2.sum(axis=1), axis=0)


In [322]:
fig_eda_3 = make_subplots(rows=2, cols=2, vertical_spacing=0.09,
                    specs=[[{"type":"domain"}, {"type":"domain"}], [{"type":"xy"}, {"type":"xy"}]],
                    subplot_titles=["Building type", "House style", 
                                    "Building type breakdown by neighborhood", "House style breakdown by neighborhood"])

fig_eda_3.add_trace( go.Pie(values=house_bldg_type_1["count"], labels=house_bldg_type_1["bldg_type"],
                      textinfo="value+percent", 
                      marker=dict(colors=px.colors.qualitative.Pastel[0:len(house_bldg_type_1["bldg_type"])]),
                      legend="legend1", legendgroup="1", legendgrouptitle=dict(text="Building type")),
              row=1, col=1)

fig_eda_3.add_trace( go.Pie(values=house_style_1["count"], labels=house_style_1["house_style"], 
                      textinfo="value+percent",
                      marker=dict(colors=px.colors.qualitative.Bold[0:len(house_bldg_type_1["bldg_type"])]),
                      legend="legend2", legendgroup="2", legendgrouptitle=dict(text="House style")),
              row=1, col=2)

fig_eda_3.add_trace(go.Heatmap(x=house_bldg_type_3.columns, y=house_bldg_type_3.index, 
                         z=house_bldg_type_3, text=house_bldg_type_2, texttemplate="%{text:,.0f}",
                         colorscale="Oranges", 
                         colorbar=dict(tickvals=[0,1], ticktext=["min", "max"],
                                       len=0.5, y=0.2)
                         ),
            row=2, col=1)

fig_eda_3.add_trace(go.Heatmap(x=house_style_3.columns, y=house_style_3.index, 
                         z=house_style_3, text=house_style_2, texttemplate="%{text:,.0f}",
                         colorscale="Oranges",
                         colorbar=dict(tickvals=[0,1], ticktext=["min", "max"], len=0.5),
                         showscale=False
                         ),
            row=2, col=2)

fig_eda_3.update_layout(height=1200, hovermode=False,
                  )
fig_eda_3.show()

----
# <center> Feature Selection </center>
----

Split data into categorical and numerical data type.

In [5]:
##### split numerical and categorical data
num_cleaned_house_df = cleaned_house_df.select_dtypes(include=np.number)
cat_cleaned_house_df = cleaned_house_df.select_dtypes(exclude=np.number)

num_cleaned_house_df.head(5)

Unnamed: 0,ms_sub_class,lot_area,overall_qual,overall_cond,year_built,year_remod_add,mas_vnr_area,bsmt_fin_sf1,bsmt_fin_sf2,bsmt_unf_sf,...,wood_deck_sf,open_porch_sf,enclosed_porch,3_ssn_porch,screen_porch,pool_area,misc_val,mo_sold,yr_sold,sale_price
0,60,8450,7,5,2003,2003,196.0,706,0,150,...,0,61,0,0,0,0,0,2,2008,208500
1,20,9600,6,8,1976,1976,0.0,978,0,284,...,298,0,0,0,0,0,0,5,2007,181500
2,60,11250,7,5,2001,2002,162.0,486,0,434,...,0,42,0,0,0,0,0,9,2008,223500
3,70,9550,7,5,1915,1970,0.0,216,0,540,...,0,35,272,0,0,0,0,2,2006,140000
4,60,14260,8,5,2000,2000,350.0,655,0,490,...,192,84,0,0,0,0,0,12,2008,250000


In [271]:
corr_matrix = num_cleaned_house_df.corr().sort_values("sale_price", ascending=False)
corr_matrix = corr_matrix.drop(columns=["overall_qual"]).drop(index=["overall_qual"])

fig_1a = px.imshow(corr_matrix)

fig_1a.update_xaxes(title_text="Feature Names")
fig_1a.update_yaxes(title_text="Feature Names")
fig_1a.update_layout(title_text="Fig_1a: Pearson Correlation Heatmap")
fig_1a.show()

Fig_1a shows the heatmap of Pearson correlation matrix for features with numerical data type. The correlation compares two features at a time, and the correlation values can range from -1 to 1. When two features are negatively correlated, an increase in one feature decreases the other feature. On the other hand, when two features are positively correlated, an increase in one feature increases the other feature. A 0 correlation value means that the two features are not correlated, hence any increase or decrease in one feature does not affect the other feature.

The right most column of the heatmap shows how sale price is correlated to other feature. These values are extracted and plotted in Fig_1b.

In [272]:
fig_1b = px.scatter(x=corr_matrix.index, y=corr_matrix["sale_price"])

fig_1b.update_xaxes(title_text="Feature Name")
fig_1b.update_yaxes(title_text="Pearson correlation value")
fig_1b.update_layout(title_text="Fig_1b: Pearson correlation value between Sale Price and other features")
fig_1b.show()

Based on Fig_1b, some of the features that can be used to create a model are `gr_liv_area`, `garage_cars`, `garage_area`, `total_bsmt_sf`, and `1st_flr_sf`. The remainder features will not be included to avoid creating a complex model and potentially overfitting the model.

Additionally, `year_built`, `yr_sold`, `year_remod_add`, and `mo_sold` features were also included for feature engineering.



---
# <center> Model Development </center>
---

In [6]:
y = house_df["sale_price"]

num_cols = ["gr_liv_area", "garage_area", "total_bsmt_sf", "1st_flr_sf", "year_built", 
            "yr_sold", "year_remod_add", "mo_sold"]
cat_cols = ["neighborhood"]

num_X = num_cleaned_house_df[num_cols]
cat_X = cleaned_house_df[cat_cols]


##### Feature engineering
num_X["house_age"] = num_X["yr_sold"] - num_X["year_built"]
num_X.loc[num_X["house_age"]==0, "house_age"] = num_X.loc[num_X["house_age"]==0, "mo_sold"]/12
num_X["scaled_remod"] = (num_X["year_remod_add"] - num_X["year_built"])/num_X["house_age"]


##### Drop unnecessary columns
num_X = num_X.drop(columns=["year_built", "yr_sold", "mo_sold"])

X = num_X

##### create standard scaler
std_scaler = StandardScaler()
std_scaler.fit(num_X)

##### create one-hot encoder
ohe_encoder = OneHotEncoder(sparse_output=False)
ohe_encoder.fit(cat_X)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=43)


##### transform values using standard scaler
scaled_num_X_train = std_scaler.transform(X_train.select_dtypes(include=np.number))
scaled_num_X_test = std_scaler.transform(X_test.select_dtypes(include=np.number))


##### recreate dataframe after scaling and encoding
scaled_X_train_df = pd.DataFrame(data=scaled_num_X_train, columns=std_scaler.feature_names_in_)
scaled_X_test_df = pd.DataFrame(data=scaled_num_X_test, columns=std_scaler.feature_names_in_)

X_train = scaled_X_train_df
X_test = scaled_X_test_df




New features were generated using feature engineering. The new features include `house_age` and `scaled_remod`. `house_age` is the age of the house in years since it was built. This value was computed as `yr_sold` - `year_built`. If the house was sold in the same year it was built, then the month the house was sold will be divided by 12 to convert the house age to year. `scaled_remod` is the time when the house was remodeled scaled to its age. This value was computed as (`year_remod_add` - `year_built`)/`house_age`.

After some experimentation, `garage_cars` feature were dropped since it did not improve the model accuracy. The categorical data were also found to not significantly improve the model accuracy.

In [7]:
##### Linear Regression model

lr_mod = LinearRegression()

lr_mod.fit(X_train, y_train)

fig_2a = px.scatter(x=y_test, y=lr_mod.predict(X_test))

fig_2a.update_xaxes(title_text="Actual Sale Price")
fig_2a.update_yaxes(title_text="Predicted Sale Price")
fig_2a.update_layout(title_text="Predicted Sale Price vs. Actual Sale Price")
fig_2a.show()


print(f"Test set score: {lr_mod.score(X_test, y_test)}")


Test set score: 0.7550668341859367


In [277]:
##### Decision Tree Regressor

min_splits = []
scores = []

for min_split in range(2,51):

    dtr_mdl = DecisionTreeRegressor(min_samples_split=min_split, ccp_alpha=1000)

    score = cross_val_score(dtr_mdl, X=X_test, y=y_test, cv=10)

    min_splits.append(min_split)
    scores.append(score.mean())

dtr_regr_df = pd.DataFrame(list(zip(min_splits, scores)), columns=["min_split", "score"]).sort_values("score", ascending=False).reset_index()


dtr_mdl = DecisionTreeRegressor(min_samples_split=dtr_regr_df.loc[0, "min_split"], ccp_alpha=1000)
dtr_mdl.fit(X_train, y_train)

fig_3a = px.scatter(x=y_test, y=dtr_mdl.predict(X_test))
fig_3a.update_xaxes(title_text="Actual Sale Price")
fig_3a.update_yaxes(title_text="Predicted Sale Price")
fig_3a.update_layout(title_text="Predicted Sale Price vs. Actual Sale Price")
fig_3a.show()


print(f"Test set score: {dtr_mdl.score(X_test, y_test)}")

Test set score: 0.7399763757956677


In [300]:
##### Random Forest Regressor

rfr_min_splits = []
rfr_max_samples = []
rfr_alphas = []
rfr_scores = []

for rfr_min_split in range(2,51):

    # for rfr_max_sample in [val/100 for val in range(50,96,5)]:
    for rfr_alpha in [10**val for val in range(6)]:

        # rfr_mdl = RandomForestRegressor(min_samples_split=min_split, max_samples=rfr_max_sample)
        rfr_mdl = RandomForestRegressor(min_samples_split=min_split, 
                                        max_samples=0.75,
                                        n_estimators=200,
                                        ccp_alpha=rfr_alpha)

        rfr_score = cross_val_score(rfr_mdl, X=scaled_num_X_train, y=y_train, cv=10)

        rfr_min_splits.append(rfr_min_split)
        # rfr_max_samples.append(rfr_max_sample)
        rfr_alphas.append(rfr_alpha)
        rfr_scores.append(rfr_score.mean())


# rfr_result_df = pd.DataFrame(list(zip(rfr_min_splits, rfr_max_samples, rfr_scores)), 
#                              columns=["min_split", "max_sample", "score"])
rfr_result_df = pd.DataFrame(list(zip(rfr_min_splits, rfr_alphas, rfr_scores)), 
                             columns=["min_split", "ccp_alpha", "score"])
rfr_result_df = rfr_result_df.sort_values("score", ascending=False).reset_index(drop=True)

rfr_result_df


Unnamed: 0,min_split,max_sample,score
0,25,10000,0.709603
1,50,100000,0.708366
2,18,100,0.708257
3,35,10000,0.708037
4,37,1,0.707800
...,...,...,...
289,12,10,0.698179
290,32,100,0.698024
291,38,10,0.697939
292,47,100000,0.697910


In [301]:
max_sample_df = rfr_result_df.groupby("max_sample").mean().reset_index()

final_rfr_mdl = RandomForestRegressor(min_samples_split=25, 
                                      max_samples=0.75,
                                      n_estimators=200,
                                      ccp_alpha=10000)
final_rfr_mdl.fit(X_train, y_train)

fig_5a = px.scatter(x=y_test, y=final_rfr_mdl.predict(X_test))

fig_5a.update_xaxes(title_text="Actual Sale Price")
fig_5a.update_yaxes(title_text="Predicted Sale Price")
fig_5a.update_layout(title_text="Predicted Sale Price vs. Actual Sale Price")
fig_5a.show()

print(f"Test set score: {final_rfr_mdl.score(X_test, y_test)}")

Test set score: 0.8068200342578721


In [282]:
##### Gradient Boost Regressor
gbr_max_depth = []
gbr_min_split = []
gbr_scores = []

for val in range(1,11):

    for val2 in range(5,38,3):

        gbr_params = {
            "n_estimators": 500,
            "max_depth": val,
            "min_samples_split": val2,
            "learning_rate": 0.2,
            "loss": "squared_error",
            "ccp_alpha": 100
        }

        gbr_reg_mdl = GradientBoostingRegressor(**gbr_params)

        gbr_score = cross_val_score(gbr_reg_mdl, X=X_train, y=y_train, cv=10)

        gbr_max_depth.append(val)
        gbr_min_split.append(val2)
        gbr_scores.append(gbr_score.mean())

    print(val)


gbr_reg_df = pd.DataFrame(list(zip(gbr_max_depth, gbr_min_split, gbr_scores)), 
                          columns=["max_depth", "min_split", "score"])
gbr_reg_df = gbr_reg_df.sort_values("score", ascending=False).reset_index(drop=True)

gbr_reg_df

1
2
3
4
5
6
7
8
9
10


Unnamed: 0,max_depth,min_split,score
0,5,11,0.737433
1,5,17,0.734476
2,4,5,0.734187
3,5,20,0.731060
4,4,35,0.729722
...,...,...,...
105,8,11,0.697120
106,8,29,0.692223
107,9,8,0.688742
108,9,5,0.683167


In [283]:
gbr_summary = gbr_reg_df.groupby("max_depth").mean().reset_index()

final_gbr_params = {
        "n_estimators": 500,
        "max_depth": 5,
        "min_samples_split": 11,
        "learning_rate": 0.2,
        "loss": "squared_error",
}

final_gbr_mdl = GradientBoostingRegressor(**final_gbr_params)
final_gbr_mdl.fit(X_train, y_train)

fig_6a = px.scatter(x=y_test, y=final_gbr_mdl.predict(X_test))

fig_6a.update_xaxes(title_text="Actual Sale Price")
fig_6a.update_yaxes(title_text="Predicted Sale Price")
fig_6a.update_layout(title_text="Predicted Sale Price vs. Actual Sale Price")
fig_6a.show()


print(f"Testing score: {final_gbr_mdl.score(X_test, y_test)}")

Testing score: 0.7369047147075481


In [261]:
n_estimators = []
ada_regr_scores = []

for n_est in range(100,2100,100):

    ada_regr_mdl = AdaBoostRegressor(estimator=DecisionTreeRegressor(
                                                    max_depth=5,
                                                    min_samples_split=7,
                                                    ccp_alpha=100,
                                                    random_state=43),
                                    loss="square",
                                    n_estimators=n_est,
                                    learning_rate=0.1,
                                    random_state=43)
    
    ada_score = cross_val_score(ada_regr_mdl, X=X_train, y=y_train, cv=10)

    n_estimators.append(n_est)
    ada_regr_scores.append(ada_score.mean())


ada_regr_df = pd.DataFrame(list(zip(n_estimators, ada_regr_scores)), 
                           columns=["estimator_count", "score"])

ada_regr_df


Unnamed: 0,estimator_count,score
0,100,0.720667
1,200,0.72216
2,300,0.720407
3,400,0.721109
4,500,0.720856
5,600,0.719175
6,700,0.718227
7,800,0.716888
8,900,0.715606
9,1000,0.714231


In [285]:
ada_regr_df = ada_regr_df.sort_values(by=["score"], ascending=False).reset_index(drop=True)

final_ada_regr_mdl = AdaBoostRegressor(estimator=DecisionTreeRegressor(max_depth=5),
                                        loss="square",
                                        n_estimators=ada_regr_df.loc[0,"estimator_count"],
                                        learning_rate=0.1,
                                        random_state=43)

final_ada_regr_mdl.fit(X_train, y_train)
final_ada_regr_mdl.score(X_test, y_test)

fig_7a = px.scatter(x=y_test, y=final_ada_regr_mdl.predict(X_test))

fig_7a.update_xaxes(title_text="Actual Sale Price")
fig_7a.update_yaxes(title_text="Predicted Sale Price")
fig_7a.update_layout(title_text="Predicted Sale Price vs. Actual Sale Price")
fig_7a.show()

print(f"Testing score: {final_ada_regr_mdl.score(X_test, y_test)}")

Testing score: 0.7145447032176635


----
# <center>Testing Set </center>
----

The following testing set was obtained as part of the House Prices -- Advance Regression Techniques competition on Kaggle. Random Forest Regression was selected to be used to predict the house prices based on the selected features. The predicted house price was then submitted to Kaggle for evaluation.

In [148]:
house_testing_set = pd.read_csv("house_price_test.csv")
house_testing_set.columns = [infl.underscore(val) for val in house_testing_set.columns]

id_col = house_testing_set["id"]

coi_testing = ["gr_liv_area", "garage_area", "total_bsmt_sf", "1st_flr_sf", "year_built", 
                "yr_sold", "year_remod_add", "mo_sold"]

house_testing_set = house_testing_set[coi_testing]

##### Feature engineering
house_testing_set["house_age"] = house_testing_set["yr_sold"] - house_testing_set["year_built"]
house_testing_set.loc[house_testing_set["house_age"]==0, "house_age"] = house_testing_set.loc[house_testing_set["house_age"]==0, "mo_sold"]/12
house_testing_set["scaled_remod"] = (house_testing_set["year_remod_add"] - house_testing_set["year_built"])/house_testing_set["house_age"]

##### Drop unnecessary columns
house_testing_set = house_testing_set.drop(columns=["year_built", "yr_sold", "mo_sold"])

##### fillna
house_testing_set = house_testing_set.fillna(0)

##### transform values
scaled_testing_set = std_scaler.transform(house_testing_set)

##### recreate dataframe after scaling and encoding
house_testing_set = pd.DataFrame(data=scaled_testing_set, columns=std_scaler.feature_names_in_)

house_testing_set

Unnamed: 0,gr_liv_area,garage_area,total_bsmt_sf,1st_flr_sf,year_remod_add,house_age,scaled_remod
0,-1.179256,1.202536,-0.400017,-0.689929,-1.156380,0.411259,-0.712248
1,-0.354966,-0.753188,0.619239,0.430511,-1.301740,0.510587,-0.712248
2,0.216136,0.042202,-0.295127,-0.607125,0.636400,-0.780683,-0.498028
3,0.168544,-0.013943,-0.299687,-0.612300,0.636400,-0.813792,-0.712248
4,-0.448246,0.154492,0.507509,0.303718,0.345679,-0.615135,-0.712248
...,...,...,...,...,...,...,...
1454,-0.806136,-2.212963,-1.166169,-1.595596,-0.720298,-0.019165,-0.712248
1455,-0.806136,-0.874836,-1.166169,-1.595596,-0.720298,-0.019165,-0.712248
1456,-0.554851,0.482006,0.379817,0.158811,0.539493,0.311930,1.467209
1457,-1.038384,-2.212963,-0.331610,-0.498445,0.345679,-0.747573,-0.712248


In [151]:
prediction = final_rfr_mdl.predict(house_testing_set)

submission = pd.DataFrame(list(zip(id_col, prediction)), columns=["Id", "SalePrice"])
submission.to_csv("submission_house.csv", index=False)

In [141]:
# house_testing_set[house_testing_set["garage_area"].isna()]
house_testing_set[house_testing_set["total_bsmt_sf"].isna()]

Unnamed: 0,gr_liv_area,garage_area,total_bsmt_sf,1st_flr_sf,year_built,yr_sold,year_remod_add,mo_sold,house_age,scaled_remod
660,896,280.0,,896,1946,2008,1950,4,62.0,0.064516


----
# <center>Regression Analysis </center>
----

In [8]:
fig_2a = px.scatter(x=y_test, y=lr_mod.predict(X_test))

fig_2a.update_xaxes(title_text="Actual Sale Price")
fig_2a.update_yaxes(title_text="Predicted Sale Price")
fig_2a.update_layout(title_text="Predicted Sale Price vs. Actual Sale Price")
fig_2a.show()


print(f"Test set score: {lr_mod.score(X_test, y_test)}")

Test set score: 0.7550668341859367


In [13]:
lr_predicted_y = lr_mod.predict(X_test)
lr_residual = y_test - lr_predicted_y
lr_z_score = (lr_residual - np.mean(lr_residual))/np.std(lr_residual)
qq_plot_y = 

fig_2b = px.scatter(x=lr_predicted_y, y=lr_residual)

fig_2b.update_xaxes(title_text="Predicted Sale Price")
fig_2b.update_yaxes(title_text="Residual")
fig_2b.update_layout(title_text="Residual vs. Predicted Sale Price")
fig_2b.show()

In [18]:
fig_2c = px.scatter(x=sorted(lr_z_score), y=sorted(lr_predicted_y))

fig_2c.update_xaxes(title_text="Theoretical Quantiles")
fig_2c.update_yaxes(title_text="Sample Quantile")
fig_2c.update_layout(title_text="Sample Quantile vs. Theoretical Quantile")
fig_2c.show()

In [22]:
fig_2c = make_subplots(specs=[[{"secondary_y": True}]])

fig_2c.add_trace(
    go.Scatter(x=sorted(lr_z_score), y=sorted(lr_predicted_y)),
    secondary_y=False,
)

fig_2c.add_trace(
    go.Scatter(x=sorted(lr_z_score), y=lr_z_score*math.tan(45)),
    secondary_y=True,
)

fig_2c.show()


In [17]:
x = [0, ]
y = [200000, ]

[-46.534937926218845,
 34870.927540014585,
 35930.30257638023,
 35959.22509011949,
 36645.58299588016,
 37603.005521948624,
 41915.91358729632,
 42408.3618906386,
 43956.85474474836,
 44829.10995277332,
 46707.73152195415,
 47131.33291451898,
 48051.98127545224,
 48878.89382565196,
 48962.85620921446,
 49526.88002524979,
 49959.27359948176,
 55376.926299166225,
 59283.067410441334,
 59378.32585497627,
 59403.302210212976,
 59740.20586822914,
 60798.68240026233,
 60889.07022779103,
 61074.79528848623,
 64285.545408395046,
 69223.06278724445,
 70102.82129583924,
 71435.93622962883,
 71882.09963738722,
 73075.4847097267,
 73137.50702072168,
 75137.72738698601,
 78509.39549914678,
 78850.4419302574,
 81159.24157102346,
 83053.18214477386,
 83295.77894833688,
 83348.21657017258,
 83580.4902242111,
 84602.271614036,
 84978.73432776314,
 87501.1966958832,
 87662.13139412753,
 88351.71921371465,
 91831.23581265775,
 93364.90440377353,
 93683.36683041278,
 94249.2743271266,
 94327.40745583308,


In [19]:
math.tan(45)

1.6197751905438615