# 0. Import Library

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import make_scorer, mean_squared_error

In [2]:
df = pd.read_csv('../data/preprocessed.csv')

# 3.Data Modeling

In [3]:
nominal_col = [
    "MSSubClass","MSZoning", "LotConfig", "Neighborhood", "HouseStyle", "RoofStyle",
    "Exterior1st", "Exterior2nd", "MasVnrType", "Foundation", "GarageType",
    "SaleType", "SaleCondition", "Street", "Condition1", "Condition2", 
    "BldgType", "RoofMatl", "Heating", "CentralAir", "Electrical", 
    "Functional"
]

ordinal_col = [
    "LotShape", "ExterQual", "ExterCond", "BsmtQual", "BsmtExposure",
    "BsmtFinType1", "HeatingQC", "KitchenQual", "FireplaceQu", "GarageFinish",
    "KitchenAbvGr", "LandContour", "Utilities", "LandSlope", "BsmtCond",
    "BsmtFinType2", "GarageQual", "GarageCond", "PavedDrive"
]

categorical_col = nominal_col + ordinal_col

numerical_col = list(df.columns[(~df.columns.isin(categorical_col))])[:-1]

In [4]:
test_df = pd.read_csv('../data/test_clean.csv')

test_df[ordinal_col] = test_df[ordinal_col].astype("category").apply(lambda x: x.cat.codes)
test_df = pd.get_dummies(test_df, columns=nominal_col, drop_first=True)

test_df[numerical_col] = StandardScaler().fit_transform(test_df[numerical_col])

In [5]:
df[ordinal_col] = df[ordinal_col].astype("category").apply(lambda x: x.cat.codes)
df = pd.get_dummies(df, columns=nominal_col, drop_first=True)

In [6]:
features = list(df.columns)

features.remove('SalePrice')
features.remove("Exterior1st_ImStucc")
features.remove("Exterior1st_Stone")
features.remove("Exterior2nd_Other")
features.remove('Exterior1st_CBlock')
features.remove('Exterior2nd_AsphShn')
features.remove('Exterior2nd_CBlock')
features.remove('Exterior2nd_Stone')
features.remove("HouseStyle_2.5Fin")
for col in ['Condition2_RRAe', 'Condition2_RRAn', 'Condition2_RRNn', 'RoofMatl_CompShg', 'RoofMatl_Membran', 'RoofMatl_Metal', 'RoofMatl_Roll', 'Heating_GasA', 'Heating_OthW', 'Electrical_Mix']:
    if col in features:
        features.remove(col)

target = 'SalePrice'

X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.2, random_state=42)

X_train[numerical_col] = StandardScaler().fit_transform(X_train[numerical_col])
X_test[numerical_col] = StandardScaler().fit_transform(X_test[numerical_col])

In [7]:
def predict(model):
    pd.DataFrame({
        "Id": test_df["Id"],
        "SalePrice": model.predict(test_df[features])
    }).to_csv('../data/submission.csv', index=False)

    print("Submission file created!")

In [8]:
scorer = make_scorer(mean_squared_error, squared=False)

In [9]:
temp = test_df.isna().sum()
temp[temp > 0]

Series([], dtype: int64)

In [10]:
X_train

Unnamed: 0,LotFrontage,LotArea,LotShape,LandContour,Utilities,LandSlope,OverallQual,OverallCond,YearBuilt,YearRemodAdd,...,CentralAir_Y,Electrical_FuseF,Electrical_FuseP,Electrical_SBrkr,Functional_Maj2,Functional_Min1,Functional_Min2,Functional_Mod,Functional_Sev,Functional_Typ
254,0.424462,-0.212896,3,3,0,0,-0.820445,0.372217,-0.568922,-1.296113,...,1,0,0,1,0,0,0,0,0,1
1066,0.153752,-0.265245,0,3,0,0,-0.088934,1.268609,0.714019,0.300405,...,1,0,0,1,0,0,0,0,0,1
638,0.351788,-0.177841,3,3,0,0,-0.820445,1.268609,-1.793320,-1.485933,...,1,0,0,0,0,0,0,0,0,1
799,0.178866,-0.324474,3,3,0,0,-0.820445,1.268609,-1.158528,-1.485933,...,1,0,0,1,0,0,0,0,0,1
380,-0.077481,-0.529035,3,3,0,0,-0.820445,0.372217,-1.488988,-1.485933,...,1,0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095,0.614486,-0.127631,0,3,0,0,-0.088934,-0.524174,1.241280,1.127047,...,1,0,0,1,0,0,0,0,0,1
1130,0.302875,-0.268313,3,3,0,0,-1.551955,-2.316957,-1.392002,-1.485933,...,1,0,0,1,0,0,1,0,0,0
1294,0.178866,-0.234096,3,3,0,0,-0.820445,1.268609,-0.632065,0.065745,...,1,0,0,1,0,0,0,0,0,1
860,0.052182,-0.283376,3,3,0,0,0.642577,2.165000,-1.626287,0.554450,...,1,0,0,1,0,0,0,0,0,1


## 3.1. Regression Tree

In [11]:
model1 = DecisionTreeRegressor()
model1.fit(X_train, y_train)

print(model1.score(X_test, y_test))

predict(model1)

0.7417627395529192
Submission file created!


In [16]:
param = {
    "max_depth": ["None", 5, 10, 15],
    "min_samples_split": [2, 5, 10],
    "criterion": ["absolute_error", "absolute_error"]
}

grid_search = GridSearchCV(estimator = model1, param_grid = param, cv = 10, scoring = scorer)

grid_search.fit(X_train, y_train)

# predict(grid_search.best_estimator_)

pd.DataFrame({
        "Id": test_df["Id"],
        "SalePrice": model1.predict(test_df[features])
    }).to_csv('../data/submission1.csv', index=False)

grid_search.best_estimator_.score(X_test, y_test)

60 fits failed out of a total of 240.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "/home/xaxiu/miniconda3/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/xaxiu/miniconda3/lib/python3.10/site-packages/sklearn/tree/_classes.py", line 1247, in fit
    super().fit(
  File "/home/xaxiu/miniconda3/lib/python3.10/site-packages/sklearn/tree/_classes.py", line 177, in fit
    self._validate_params()
  File "/home/xaxiu/miniconda3/lib/python3.10/site-packages/sklearn/base.py", line 581, in _validate_params
    validate_parameter_constraints(
  File "

0.6582487677473994

## 3.2. Random Forest

In [12]:
model2 = RandomForestRegressor()
model2.fit(X_train, y_train)

predict(model2)

model2.score(X_test, y_test)

Submission file created!


0.8947063299133987

In [17]:
param = {
    'n_estimators': [10, 50,100, 200, 300]
}

grid_search = GridSearchCV(estimator = model2, param_grid = param, cv = 10, scoring = scorer)

grid_search.fit(X_train, y_train)

pd.DataFrame({
        "Id": test_df["Id"],
        "SalePrice": model2.predict(test_df[features])
    }).to_csv('../data/submission2.csv', index=False)

grid_search.best_estimator_.score(X_test, y_test)

0.8744844053885023

## 3.3.Gradient Boosting

In [15]:
model3 = GradientBoostingRegressor()
model3.fit(X_train, y_train)

predict(model3)

print(model3.score(X_test, y_test))

Submission file created!
0.894805831142898


In [18]:
param = {
    'n_estimators': [50, 100, 150]
}

grid_search = GridSearchCV(estimator = model3, param_grid = param, cv = 10, scoring = scorer)

grid_search.fit(X_train, y_train)

pd.DataFrame({
        "Id": test_df["Id"],
        "SalePrice": model3.predict(test_df[features])
    }).to_csv('../data/submission3.csv', index=False)

grid_search.best_estimator_.score(X_test, y_test)

0.8903704156629797

## 3.4. Linear Regression

In [14]:
model4 = LinearRegression()
model4.fit(X_train, y_train)

predict(model4)

model4.score(X_test, y_test)

Submission file created!


-1.2077803967171474e+24