# 0. Import Library

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import make_scorer, mean_squared_error

In [7]:
train_df = pd.read_csv('../data/train_clean.csv')
test_df = pd.read_csv('../data/test_clean.csv')

# 3.Data Modeling

In [11]:
nominal_col = [
    'MSSubClass', 'MSZoning', 'Street', 'Alley', 'Utilities', 
    'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
    'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
    'Exterior2nd', 'MasVnrType', 'Foundation', 'Heating',
    'CentralAir', 'Electrical', 'Functional', 'GarageType',
    'MiscFeature', 'SaleType', 'SaleCondition', 'MoSold', 'YrSold',
]

ordinal_col = [
    'LotShape', 'LandContour', 'LotConfig', 'LandSlope', 'ExterQual',
    'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
    'BsmtFinType2', 'HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageFinish',
    'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 
]

categorical_col = nominal_col + ordinal_col

numerical_col = list(train_df.columns[(~train_df.columns.isin(categorical_col))])[:-1]

In [13]:
train_df[ordinal_col] = train_df[ordinal_col].astype("category").apply(lambda x: x.cat.codes)
train_df = pd.get_dummies(train_df, columns=nominal_col, drop_first=True)

test_df[ordinal_col] = test_df[ordinal_col].astype("category").apply(lambda x: x.cat.codes)
test_df = pd.get_dummies(test_df, columns=nominal_col, drop_first=True)

test_df[numerical_col] = StandardScaler().fit_transform(test_df[numerical_col])

In [14]:
features = list(train_df.columns)

diff_col = list(set(train_df.columns) - set(test_df.columns))

for col in diff_col:
    if col in features:
        features.remove(col)

target = 'SalePrice'

X_train, X_test, y_train, y_test = train_test_split(train_df[features], train_df[target], test_size=0.2, random_state=42)

X_train[numerical_col] = StandardScaler().fit_transform(X_train[numerical_col])
X_test[numerical_col] = StandardScaler().fit_transform(X_test[numerical_col])

In [15]:
def predict(model):
    pd.DataFrame({
        "Id": test_df["Id"],
        "SalePrice": model.predict(test_df[features])
    }).to_csv('../data/submission.csv', index=False)

    print("Submission file created!")

In [16]:
scorer = make_scorer(mean_squared_error, squared=False)

## 3.1. Regression Tree

In [17]:
model1 = DecisionTreeRegressor()
model1.fit(X_train, y_train)

print(model1.score(X_test, y_test))

predict(model1)

0.8362794797336331
Submission file created!


## 3.2. Random Forest

In [18]:
model2 = RandomForestRegressor()
model2.fit(X_train, y_train)

print(model2.score(X_test, y_test))

predict(model2)

0.8921923253641667
Submission file created!


## 3.3.Gradient Boosting

In [19]:
model3 = GradientBoostingRegressor()
model3.fit(X_train, y_train)

print(model3.score(X_test, y_test))

predict(model3)

0.8938009987941419
Submission file created!


## 3.4. Voting Regressor

In [20]:
model4 = VotingRegressor([('rf', model2), ('gb', model3)])

model4.fit(X_train, y_train)

model4.score(X_test, y_test)

0.903821828995462