In [1]:
import pandas as pd
import os
import sys
import numpy as np

#Import sklearn
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler


from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor

from sklearn.linear_model import ElasticNet, Lasso, BayesianRidge, LassoLarsIC
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from catboost import CatBoostRegressor
# import lightgbm as lgb

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
import math
from sklearn.base import clone

import timeit

In [2]:
ROOT = "./datasets/house-price/"
SAVED_PROCESSED_DF = "./datasets/house-price/saved_df/"
TRAIN_PATH = os.path.join(SAVED_PROCESSED_DF, "train_df.csv")

if os.path.exists(os.path.join(ROOT, "base_model_performances")):
    os.mkdir(os.path.join(ROOT, "base_model_performances"))

In [3]:
train_df = pd.read_csv(TRAIN_PATH)

x = train_df.loc[:, train_df.columns != "SalePrice"]
y = train_df["SalePrice"]

print(x.shape)
print(y.shape)

(1460, 80)
(1460,)


In [4]:
#Metric
def rmse(y_true, y_pred):
    return math.sqrt(mean_squared_error(y_true, y_pred))


In [5]:
#Building Base Models
linear_reg = LinearRegression()

elastic_net = make_pipeline(RobustScaler(),
    ElasticNet(alpha = 0.0005, random_state=1))

lasso = make_pipeline(RobustScaler(),
    Lasso(alpha = 0.0005, random_state=3))


svr = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2))

polynomial_reg = make_pipeline(PolynomialFeatures(degree = 2), LinearRegression())

catboost = CatBoostRegressor(
    iterations=500,
    max_ctr_complexity=4,
    random_seed = 0,
    od_type = 'Iter',
    od_wait=25,
    logging_level = "Silent",
    depth=4
)

xgboost = XGBRegressor(
    max_depth = 8, 
    n_estimators=500, 
    min_child_weight = 10000, 
    colsample_bytree=0.7,
    subsample=0.7,eta=0.3, 
    seed = 0)

rf_model = RandomForestRegressor(
    n_estimators=50, 
    max_depth=7, 
    random_state = 0, 
    n_jobs = -1)

knn_model = make_pipeline(
    MinMaxScaler(), 
    KNeighborsRegressor(n_neighbors=9, leaf_size=13, n_jobs=-1))


stack_models = [
    ("Linear Regression", linear_reg),
    ("Catboost", catboost),
    ("RandomForest", rf_model),
]
stacking = StackingRegressor(stack_models, 
                             final_estimator = clone(linear_reg))


In [None]:
#Setting up KFold

models = [
    ("Linear Regression", LinearRegression()),
    ("Elastic Net", elastic_net),
    ("Lasso", lasso),
    ("SVR", svr),
    ("Catboost", catboost),
    ("XGBoost", xgboost),
    ("RandomForest", rf_model),
    ("KNN Regressor", knn_model),
    ("Stack Regressor", stacking)
]

def measure_performance(model, x, y, k_fold = 10, shuffle = True):
    kfold = KFold(n_splits = k_fold, shuffle = shuffle)
    rmse_scores = []
    r2 = []
    for train_index, valid_index in kfold.split(x):
        x_train, x_vaild = x.loc[train_index], x.loc[valid_index]
        y_train, y_valid = y.loc[train_index], y.loc[valid_index]

        model.fit(x_train, y_train)
        y_pred = model.predict(x_vaild)

        rmse_scores.append(rmse(y_valid, y_pred))
    return model, rmse_scores

model_scores = []

for name, model in models:
    start = timeit.default_timer()
    fitted_model, rmse_scores = measure_performance(model, x,y)
    end = timeit.default_timer()
    
    mean_rmse = np.mean(rmse_scores)
    std_rmse = np.std(rmse_scores)
    
    elapsed = end - start
    print("Model ", name)
    print("mean ", mean_rmse)
    print("std ", std_rmse)
    print("elapsed: ", elapsed, " seconds ")
    print("------------------------\n")
    model_scores.append({"name": name, "rmse": rmse_scores, "elapsed_time": elapsed})


Model  Linear Regression
mean  0.13873565268224744
std  0.03195972660515161
elapsed:  0.15324869300820865  seconds 
------------------------

Model  Elastic Net
mean  0.1348940223450052
std  0.029674934186491874
elapsed:  0.5003467430069577  seconds 
------------------------

Model  Lasso
mean  0.1330275310921709
std  0.03012838776375157
elapsed:  0.4508253039966803  seconds 
------------------------

Model  SVR
mean  0.17464087575885698
std  0.043730848281039805
elapsed:  0.4318600060069002  seconds 
------------------------

Model  Catboost
mean  0.1192194863446893
std  0.014669824244029375
elapsed:  8.025672100993688  seconds 
------------------------

Model  XGBoost
mean  0.398246050678286
std  0.030270119817323127
elapsed:  5.660264181991806  seconds 
------------------------

Model  RandomForest
mean  0.14483360468824255
std  0.01281024475451813
elapsed:  3.4280171599966707  seconds 
------------------------

Model  KNN Regressor
mean  0.18211623107658534
std  0.01226853213351834

In [None]:
#Put everything together and measure the based line

#Store the scores
def convert_scores_to_pd(scores):
    merged = {}
    for score in scores:
        print(score["name"])
        for key, value in score.items():
            if key == "rmse":
                if "mean" not in merged:
                    merged.setdefault("mean", [np.mean(value)])
                else:
                    merged["mean"].append(np.mean(value))
                if "std" not in merged:
                    merged.setdefault("std", [np.std(value)])
                else:
                    merged["std"].append(np.std(value))
                continue
                
            if(key not in merged):
                merged.setdefault(key, [value])
            else:
                merged[key].append(value)
    return pd.DataFrame(merged)


In [None]:
#Tunning SVR with grid search

params = {
    "C": [.0001 , .001, .01, 1, 10, 100, 1000, 10000],
    "epsilon": [.0001 , .001, .01, 1, 10, 100, 100],
}

svr_grid_search = GridSearchCV(SVR(), params, n_jobs = -1, verbose = 50)
svr_grid_search.fit(x, y)

In [None]:
#Measure best SVR
print(svr_grid_search.best_estimator_)

best_svr = svr_grid_search.best_estimator_

best_svr.get_params()

start = timeit.default_timer()
fitted_model, rmse_scores = measure_performance(best_svr, x,y)
end = timeit.default_timer()

mean_rmse = np.mean(rmse_scores)
std_rmse = np.std(rmse_scores)
name = "SVR Optimized"
elapsed = end - start
print("Model ", name)
print("mean ", mean_rmse)
print("std ", std_rmse)
print("elapsed: ", elapsed, " seconds ")
model_scores.append({"name": name, "rmse": rmse_scores, "elapsed_time": elapsed})


In [None]:
linear_reg = LinearRegression()

svr = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2))

catboost = CatBoostRegressor(
    iterations=500,
    max_ctr_complexity=4,
    random_seed = 0,
    od_type = 'Iter',
    od_wait=25,
    logging_level = "Silent",
    depth=4
)

xgboost = XGBRegressor(
    max_depth = 8, 
    n_estimators=500, 
    min_child_weight = 10000, 
    colsample_bytree=0.7,
    subsample=0.7,eta=0.3, 
    seed = 0)

rf_model = RandomForestRegressor(
    n_estimators=50, 
    max_depth=7, 
    random_state = 0, 
    n_jobs = -1)

#Tune stack models
stack_models = [
    ("Linear Regression", LinearRegression()),
    ("Catboost", catboost.copy()),
    ("RandomForest", clone(rf_model))
]

params = {
    "final_estimator": [LinearRegression(), catboost.copy(), clone(rf_model)]
}
cv_search = GridSearchCV(StackingRegressor(stack_models), params, n_jobs = -1, verbose = 50)

cv_search.fit(x,y)

In [None]:
best_stacking= cv_search.best_estimator_

print("Best stacking params: ", best_stacking.get_params())

start = timeit.default_timer()
fitted_model, rmse_scores = measure_performance(best_stacking, x,y)
end = timeit.default_timer()

mean_rmse = np.mean(rmse_scores)
std_rmse = np.std(rmse_scores)
name = "Stack Regressor Optimized"
elapsed = end - start
print("Model ", name)
print("mean ", mean_rmse)
print("std ", std_rmse)
print("elapsed: ", elapsed, " seconds ")
print("------------------------\n")
model_scores.append({"name": name, "rmse": rmse_scores, "elapsed_time": elapsed})

In [None]:
#Store the score
scores_df = convert_scores_to_pd(model_scores)

print(scores_df.sort_values(by="mean"))
    
scores_df.to_csv(os.path.join(ROOT, "base_models", "09-16-2020-base.csv"))