# 0. Import Library

In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import make_scorer, mean_squared_error

In [27]:
df = pd.read_csv('../data/preprocessed.csv')

# 3.Data Modeling

In [28]:
nominal_col = [
    "MSSubClass","MSZoning", "LotConfig", "Neighborhood", "HouseStyle", "RoofStyle",
    "Exterior1st", "Exterior2nd", "MasVnrType", "Foundation", "GarageType",
    "SaleType", "SaleCondition", "Street", "Condition1", "Condition2", 
    "BldgType", "RoofMatl", "Heating", "CentralAir", "Electrical", 
    "Functional"
]

ordinal_col = [
    "LotShape", "ExterQual", "ExterCond", "BsmtQual", "BsmtExposure",
    "BsmtFinType1", "HeatingQC", "KitchenQual", "FireplaceQu", "GarageFinish",
    "KitchenAbvGr", "LandContour", "Utilities", "LandSlope", "BsmtCond",
    "BsmtFinType2", "GarageQual", "GarageCond", "PavedDrive"
]

categorical_col = nominal_col + ordinal_col

numerical_col = list(df.columns[(~df.columns.isin(categorical_col))])[:-1]

In [29]:
test_df = pd.read_csv('../data/test_clean.csv')

test_df[ordinal_col] = test_df[ordinal_col].astype("category").apply(lambda x: x.cat.codes)
test_df = pd.get_dummies(test_df, columns=nominal_col, drop_first=True)

test_df[numerical_col] = StandardScaler().fit_transform(test_df[numerical_col])

In [30]:
df[ordinal_col] = df[ordinal_col].astype("category").apply(lambda x: x.cat.codes)
df = pd.get_dummies(df, columns=nominal_col, drop_first=True)

In [31]:
features = list(df.columns)

features.remove('SalePrice')
features.remove("Exterior1st_ImStucc")
features.remove("Exterior1st_Stone")
features.remove("Exterior2nd_Other")
features.remove('Exterior1st_CBlock')
features.remove('Exterior2nd_AsphShn')
features.remove('Exterior2nd_CBlock')
features.remove('Exterior2nd_Stone')
features.remove("HouseStyle_2.5Fin")
for col in ['Condition2_RRAe', 'Condition2_RRAn', 'Condition2_RRNn', 'RoofMatl_CompShg', 'RoofMatl_Membran', 'RoofMatl_Metal', 'RoofMatl_Roll', 'Heating_GasA', 'Heating_OthW', 'Electrical_Mix']:
    if col in features:
        features.remove(col)

target = 'SalePrice'

X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.2, random_state=42)

X_train[numerical_col] = StandardScaler().fit_transform(X_train[numerical_col])
X_test[numerical_col] = StandardScaler().fit_transform(X_test[numerical_col])

In [32]:
def predict(model):
    pd.DataFrame({
        "Id": test_df["Id"],
        "SalePrice": model.predict(test_df[features])
    }).to_csv('../data/submission.csv', index=False)

    print("Submission file created!")

In [41]:
scorer = make_scorer(mean_squared_error, squared=False)

## 3.1. Regression Tree

In [35]:
model1 = DecisionTreeRegressor()
model1.fit(X_train, y_train)

print(model1.score(X_test, y_test))

predict(model1)

0.8044792243683874
Submission file created!


In [None]:
param = {
    "max_depth": ["None", 5, 10, 15],
    "min_samples_split": [2, 5, 10],
    "criterion": ["absolute_error", "absolute_error"]
}

grid_search = GridSearchCV(estimator = model1, param_grid = param, cv = 10, scoring = scorer)

grid_search.fit(X_train, y_train)

# predict(grid_search.best_estimator_)

pd.DataFrame({
        "Id": test_df["Id"],
        "SalePrice": model1.predict(test_df[features])
    }).to_csv('../data/submission1.csv', index=False)

grid_search.best_estimator_.score(X_test, y_test)

## 3.2. Random Forest

In [94]:
model2s = [RandomForestRegressor(n_estimators = i, random_state = 25) for i in range(50, 150, 20)]

best_model = None
best_score = 0

for (i, model) in enumerate(model2s):
    model.fit(X_train, y_train)

    if best_score < model.score(X_test, y_test):
        best_score = model.score(X_test, y_test)
        best_model = i

print(best_score, best_model)

0.9039935996477334 2


In [None]:
param = {
    'n_estimators': [100, 200, 300]
}

grid_search = GridSearchCV(estimator = model2, param_grid = param, cv = 10, scoring = scorer)

grid_search.fit(X_train, y_train)

pd.DataFrame({
        "Id": test_df["Id"],
        "SalePrice": model2.predict(test_df[features])
    }).to_csv('../data/submission2.csv', index=False)

grid_search.best_estimator_.score(X_test, y_test)

0.8744844053885023

## 3.3.Gradient Boosting

In [84]:
model3s = [GradientBoostingRegressor(n_estimators = i, random_state = 1) for i in range(300, 400, 20)]

for model in model3s:
    model.fit(X_train, y_train)
    
    print(model.score(X_test, y_test))

0.9071397366945458
0.9072095797939624
0.9069416711683994
0.9067537972785258
0.9068663893520195


In [86]:
model3s[1].score(X_test, y_test)

predict(model3s[1])

Submission file created!


In [74]:
model3 = GradientBoostingRegressor(n_estimators = 500, random_state = 1)
model3.fit(X_train, y_train)

pd.DataFrame({
        "Id": test_df["Id"],
        "SalePrice": model3.predict(test_df[features])
    }).to_csv('../data/submission3.csv', index=False)

print(model3.score(X_test, y_test))

0.9063225643640488


In [None]:
param = {
    'n_estimators': [50, 100, 150]
}

grid_search = GridSearchCV(estimator = model3, param_grid = param, cv = 10, scoring = scorer)

grid_search.fit(X_train, y_train)

pd.DataFrame({
        "Id": test_df["Id"],
        "SalePrice": model3.predict(test_df[features])
    }).to_csv('../data/submission3.csv', index=False)

grid_search.best_estimator_.score(X_test, y_test)

## 3.4. Voting Regressor

In [88]:
model4 = VotingRegressor([('rf', model2), ('gb', model3s[1])])

model4.fit(X_train, y_train)

pd.DataFrame({
        "Id": test_df["Id"],
        "SalePrice": model4.predict(test_df[features])
    }).to_csv('../data/submission4.csv', index=False)

model4.score(X_test, y_test)

0.9089927999108867