<a href="https://colab.research.google.com/github/jhwang1992/KaggleHousePricesPrediction/blob/master/kagglepriceprediction_part2_ScikitLearnPipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load the pickled data

In [1]:
# Make sure the pickled train/test frames that the following cells load are
# present in the working directory.
#
# The Colab upload dialog is opened only when a file is actually missing, so
# the cell can be re-run -- or executed outside Colab with the files already
# in place -- without prompting again or failing on the Colab-only import.
import os

required_files = ['df_train.pkl', 'df_test.pkl']
missing_files = [name for name in required_files if not os.path.exists(name)]

# Stays an empty mapping when nothing had to be uploaded.
uploaded = {}
if missing_files:
    try:
        from google.colab import files
    except ImportError as err:
        # Not running on Colab: there is no upload widget to fall back on.
        raise FileNotFoundError(
            f"Missing input file(s) {missing_files}; google.colab is not available, "
            "so copy them into the working directory manually."
        ) from err
    uploaded = files.upload()

Saving df_train.pkl to df_train.pkl
Saving df_test.pkl to df_test.pkl


In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import pickle

import warnings as wrn
wrn.filterwarnings('ignore', category = DeprecationWarning) 
wrn.filterwarnings('ignore', category = FutureWarning) 
wrn.filterwarnings('ignore', category = UserWarning)

from sklearn.linear_model import ElasticNet, Lasso, Ridge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
import xgboost as xgb
from xgboost import XGBRegressor
import lightgbm as lgb
from lightgbm import LGBMRegressor

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import make_scorer, mean_squared_error

In [0]:
# Deserialise the train and test frames from the local pickle files.
# NOTE(review): pickle.load can execute arbitrary code while loading -- only
# feed it files you produced yourself.
loaded_frames = {}
for pickle_path in ('df_train.pkl', 'df_test.pkl'):
    with open(pickle_path, 'rb') as handle:
        loaded_frames[pickle_path] = pickle.load(handle)

df_train = loaded_frames['df_train.pkl']
df_test = loaded_frames['df_test.pkl']

In [4]:
# Quick sanity check: report (rows, columns) of both loaded frames.
for frame_label, frame in (('df_train', df_train), ('df_test', df_test)):
    print(f'{frame_label} shape: ', frame.shape)

df_train shape:  (1450, 76)
df_test shape:  (1459, 75)


# identify numericColumns and categoricalColumns

In [5]:
# Split the feature columns of df_train into numeric and categorical groups.
#
# select_dtypes(include='number') matches every integer / float width, whereas
# comparing a dtype against the builtin `int` / `float` only matches the
# platform-default width and would misfile e.g. int32 or float32 columns as
# categorical (and therefore one-hot encode them later on).
numeric_dtype_columns = set(df_train.select_dtypes(include='number').columns)

# 'Id' and 'SalePrice' (the regression target used by the model cells) are not
# features: keep them out of both lists regardless of their dtype.
non_feature_columns = {'Id', 'SalePrice'}

numericColumns = []
categoricalColumns = []

# Iterate over df_train.columns so both lists keep the frame's column order.
for column in df_train.columns:
    if column in non_feature_columns:
        continue
    if column in numeric_dtype_columns:
        numericColumns.append(column)
    else:
        categoricalColumns.append(column)

print( len(numericColumns), 'numeric columns: ', numericColumns)
print( len(categoricalColumns), 'categorical columns: ', categoricalColumns)
print( 'ID and SalePrice are seperated')

54 numeric columns:  ['MSSubClass', 'LotFrontage', 'LotArea', 'LotShape', 'LandContour', 'Utilities', 'LandSlope', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'HeatingQC', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']
20 categorical columns:  ['MSZoning', 'Street', 'LotConfig', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation', 'BsmtCond',

# Ridge pipeline and GridSearchCV

In [6]:
# Ridge regression.
# Preprocessing and the estimator live in one Pipeline: a ColumnTransformer
# handles numeric and categorical columns separately and feeds the Ridge
# regressor, and GridSearchCV tunes the whole chain.

# Numeric branch: impute missing values, then standardise.
numeric_features = numericColumns
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill gaps with the literal 'missing', then one-hot
# encode (categories unseen during fit are ignored at transform time).
categorical_features = categoricalColumns
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

ridge_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', Ridge()),
])

# Hyper-parameters to search: numeric imputation strategy and the Ridge penalty.
ridge_param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'regressor__alpha': [1, 5, 10, 20],
}

# Scorer built from mean_squared_error; greater_is_better=False flips the sign
# so that GridSearchCV, which maximises the score, ends up minimising the MSE.
scoring_metrics = make_scorer(mean_squared_error, greater_is_better=False)

ridge_grid_search = GridSearchCV(
    estimator=ridge_pipeline,
    param_grid=ridge_param_grid,
    scoring=scoring_metrics,
    cv=5,
)
ridge_features = numericColumns + categoricalColumns
ridge_grid_search.fit(df_train[ridge_features], df_train['SalePrice'])

print(f"best score is {ridge_grid_search.best_score_}, best parameter from grid search is {ridge_grid_search.best_params_}")

best score is -0.012890171962691654, best parameter from grid search is {'preprocessor__num__imputer__strategy': 'median', 'regressor__alpha': 10}


In [7]:
# Square root of the sign-flipped best CV score (the score is a negated MSE).
np.sqrt(-ridge_grid_search.best_score_)

0.11353489315048328

# Lasso pipeline and GridSearchCV

In [8]:
# Lasso regression: same preprocessing layout as the other model cells, with a
# Lasso estimator at the end of the pipeline.

# Numeric columns -> impute, then standardise.
numeric_features = numericColumns
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical columns -> constant-fill with 'missing', then one-hot encode.
categorical_features = categoricalColumns
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

lasso_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', Lasso()),
    ]
)

# Search the numeric imputation strategy and the L1 penalty strength.
lasso_alphas = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 100]
lasso_param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'regressor__alpha': lasso_alphas,
}

# Negated mean_squared_error, so a larger score means a smaller error.
scoring_metrics = make_scorer(mean_squared_error, greater_is_better=False)

lasso_grid_search = GridSearchCV(
    estimator=lasso_pipeline,
    param_grid=lasso_param_grid,
    scoring=scoring_metrics,
    cv=5,
)
lasso_grid_search.fit(
    df_train[numericColumns + categoricalColumns],
    df_train['SalePrice'],
)

print(f"best score is {lasso_grid_search.best_score_}, best parameter from grid search is {lasso_grid_search.best_params_}")

best score is -0.012361374369539848, best parameter from grid search is {'preprocessor__num__imputer__strategy': 'median', 'regressor__alpha': 0.0005}


In [9]:
# Square root of the sign-flipped best CV score (the score is a negated MSE).
np.sqrt(-lasso_grid_search.best_score_)

0.11118171778462432

# ElasticNet pipeline and GridSearchCV

In [10]:
# ElasticNet regression: shared preprocessing layout, ElasticNet estimator.

# Numeric branch (mean imputation + standard scaling).
numeric_features = numericColumns
numeric_imputer = SimpleImputer(strategy='mean')
numeric_transformer = Pipeline(steps=[
    ('imputer', numeric_imputer),
    ('scaler', StandardScaler()),
])

# Categorical branch (constant 'missing' fill + one-hot encoding).
categorical_features = categoricalColumns
categorical_imputer = SimpleImputer(strategy='constant', fill_value='missing')
categorical_transformer = Pipeline(steps=[
    ('imputer', categorical_imputer),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

ElasticNet_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', ElasticNet()),
])

# Only the regressor is tuned here: penalty strength and the L1/L2 mixing ratio.
ElasticNet_param_grid = {
    'regressor__alpha': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 100],
    'regressor__l1_ratio': [0.5, 0.6, 0.7, 0.8, 0.9],
}

# Negated mean_squared_error, so a larger score means a smaller error.
scoring_metrics = make_scorer(mean_squared_error, greater_is_better=False)

ElasticNet_grid_search = GridSearchCV(
    estimator=ElasticNet_pipeline,
    param_grid=ElasticNet_param_grid,
    scoring=scoring_metrics,
    cv=5,
)
elasticnet_features = numericColumns + categoricalColumns
ElasticNet_grid_search.fit(df_train[elasticnet_features], df_train['SalePrice'])

print(f"best score is {ElasticNet_grid_search.best_score_}, best parameter from grid search is {ElasticNet_grid_search.best_params_}")

best score is -0.01237041440688029, best parameter from grid search is {'regressor__alpha': 0.0005, 'regressor__l1_ratio': 0.9}


In [11]:
# Square root of the sign-flipped best CV score (the score is a negated MSE).
np.sqrt(-ElasticNet_grid_search.best_score_)

0.11122236468840378

# KRR pipeline and GridSearchCV

In [12]:
# Kernel ridge regression: shared preprocessing layout, KernelRidge estimator.

# Numeric branch: impute missing values, then standardise.
numeric_features = numericColumns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

# Categorical branch: fill gaps with the literal 'missing', then one-hot encode.
categorical_features = categoricalColumns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

KRR_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('regressor', KernelRidge())])

# `degree` only affects the polynomial kernel (KernelRidge ignores it for the
# other kernels), so crossing it with kernel='linear' would fit each linear
# candidate three times for identical scores. Two sub-grids cover the same
# distinct models with 48 candidates instead of 72.
KRR_alphas = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 100]
KRR_param_grid = [
    {
        'regressor__kernel': ['linear'],
        'regressor__alpha': KRR_alphas,
    },
    {
        'regressor__kernel': ['polynomial'],
        'regressor__degree': [1, 2, 3],
        'regressor__alpha': KRR_alphas,
    },
]

# Negated mean_squared_error: GridSearchCV maximises the score, so the sign
# flip makes it minimise the error.
scoring_metrics = make_scorer(mean_squared_error, greater_is_better=False)

KRR_grid_search = GridSearchCV(KRR_pipeline, KRR_param_grid, scoring=scoring_metrics, cv=5)
KRR_grid_search.fit(df_train[numericColumns+categoricalColumns], df_train['SalePrice'])

print(f"best score is {KRR_grid_search.best_score_}, best parameter from grid search is {KRR_grid_search.best_params_}")

best score is -0.012820118676965156, best parameter from grid search is {'regressor__alpha': 0.1, 'regressor__degree': 2, 'regressor__kernel': 'polynomial'}


In [13]:
# Square root of the sign-flipped best CV score (the score is a negated MSE).
np.sqrt(-KRR_grid_search.best_score_)

0.11322596291030232

# gradientBoost pipeline and GridSearchCV

In [14]:
# Gradient boosting: shared preprocessing layout, GradientBoostingRegressor
# estimator with a fixed tree budget; learning rate and depth are tuned.

# Numeric branch: impute missing values, then standardise.
numeric_features = numericColumns
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical branch: fill gaps with the literal 'missing', then one-hot encode.
categorical_features = categoricalColumns
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

# Fixed settings of the booster; random_state pins the run-to-run randomness.
gradientBoost_regressor = GradientBoostingRegressor(
    n_estimators=3000,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=10,
    loss='huber',
    random_state=5,
)
gradientBoost_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', gradientBoost_regressor),
    ]
)

gradientBoost_param_grid = {
    'regressor__learning_rate': [0.01, 0.05, 0.1, 0.5],
    'regressor__max_depth': [3, 4],
}

# Negated mean_squared_error, so a larger score means a smaller error.
scoring_metrics = make_scorer(mean_squared_error, greater_is_better=False)

gradientBoost_grid_search = GridSearchCV(
    estimator=gradientBoost_pipeline,
    param_grid=gradientBoost_param_grid,
    scoring=scoring_metrics,
    cv=5,
)
gradientBoost_grid_search.fit(
    df_train[numericColumns + categoricalColumns],
    df_train['SalePrice'],
)

print(f"best score is {gradientBoost_grid_search.best_score_}, best parameter from grid search is {gradientBoost_grid_search.best_params_}")

best score is -0.012345541626874207, best parameter from grid search is {'regressor__learning_rate': 0.01, 'regressor__max_depth': 4}


In [15]:
# Square root of the sign-flipped best CV score (the score is a negated MSE).
np.sqrt(-gradientBoost_grid_search.best_score_)

0.11111049287476951

# XGBRegressor pipeline and GridSearchCV

In [16]:
# XGBoost: shared preprocessing layout, XGBRegressor estimator; learning rate
# and tree depth are tuned by the grid search.

# Numeric branch: impute missing values, then standardise.
numeric_features = numericColumns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

# Categorical branch: fill gaps with the literal 'missing', then one-hot encode.
categorical_features = categoricalColumns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# `silent` and `nthread` are legacy arguments of the scikit-learn wrapper:
# `verbosity=0` is the supported way to mute XGBoost's logging and `n_jobs=-1`
# the supported way to use all cores. Newer releases no longer recognise the
# old names and only warn about unused parameters.
XGB_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('regressor', XGBRegressor(colsample_bytree=0.4603, gamma=0.0468,
                                                          min_child_weight=1.7817, n_estimators=2200,
                                                          reg_alpha=0.4640, reg_lambda=0.8571,
                                                          subsample=0.5213, verbosity=0,
                                                          random_state=7, n_jobs=-1))])

XGB_param_grid = {
    'regressor__learning_rate': [0.01, 0.05, 0.1, 0.5],
    'regressor__max_depth': [3, 4]
}

# Negated mean_squared_error: GridSearchCV maximises the score, so the sign
# flip makes it minimise the error.
scoring_metrics = make_scorer(mean_squared_error, greater_is_better=False)

XGB_grid_search = GridSearchCV(XGB_pipeline, XGB_param_grid, scoring=scoring_metrics, cv=5)
XGB_grid_search.fit(df_train[numericColumns+categoricalColumns], df_train['SalePrice'])

print(f"best score is {XGB_grid_search.best_score_}, best parameter from grid search is {XGB_grid_search.best_params_}")

best score is -0.013037841644771614, best parameter from grid search is {'regressor__learning_rate': 0.05, 'regressor__max_depth': 3}


In [17]:
# Square root of the sign-flipped best CV score (the score is a negated MSE).
np.sqrt(-XGB_grid_search.best_score_)

0.11418336851210693

# LGBMRegressor pipeline and GridSearchCV

In [18]:
# LightGBM: shared preprocessing layout, LGBMRegressor estimator; learning
# rate and maximum tree depth are tuned by the grid search.

# Numeric branch: impute missing values, then standardise.
numeric_features = numericColumns
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical branch: fill gaps with the literal 'missing', then one-hot encode.
categorical_features = categoricalColumns
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

# Fixed booster settings; the two *_seed arguments pin the bagging and
# feature-subsampling randomness.
LGB_regressor = LGBMRegressor(
    objective='regression',
    n_estimators=720,
    max_bin=55,
    bagging_fraction=0.8,
    bagging_freq=5,
    feature_fraction=0.2319,
    feature_fraction_seed=9,
    bagging_seed=9,
    min_data_in_leaf=6,
    min_sum_hessian_in_leaf=11,
)
LGB_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', LGB_regressor),
    ]
)

LGB_param_grid = {
    'regressor__learning_rate': [0.01, 0.05, 0.1, 0.5],
    'regressor__max_depth': [3, 4, 5, 10],
}

# Negated mean_squared_error, so a larger score means a smaller error.
scoring_metrics = make_scorer(mean_squared_error, greater_is_better=False)

LGB_grid_search = GridSearchCV(
    estimator=LGB_pipeline,
    param_grid=LGB_param_grid,
    scoring=scoring_metrics,
    cv=5,
)
LGB_grid_search.fit(
    df_train[numericColumns + categoricalColumns],
    df_train['SalePrice'],
)

print(f"best score is {LGB_grid_search.best_score_}, best parameter from grid search is {LGB_grid_search.best_params_}")

best score is -0.012822351683941694, best parameter from grid search is {'regressor__learning_rate': 0.05, 'regressor__max_depth': 3}


In [19]:
# Square root of the sign-flipped best CV score (the score is a negated MSE).
np.sqrt(-LGB_grid_search.best_score_)

0.11323582332434244

# overview

In [20]:
# Side-by-side summary: one line per fitted grid search, in the order the
# models were trained above. Each value is the square root of the sign-flipped
# best CV score.
fitted_searches = [
    ('Ridge', ridge_grid_search),
    ('Lasso', lasso_grid_search),
    ('ElasticNet', ElasticNet_grid_search),
    ('KRR', KRR_grid_search),
    ('GradientBoosting', gradientBoost_grid_search),
    ('XGBoosting', XGB_grid_search),
    ('LGBoosting', LGB_grid_search),
]

for model_label, fitted_search in fitted_searches:
    print(f'{model_label} score is {np.sqrt(-fitted_search.best_score_)}')

Ridge score is 0.11353489315048328
Lasso score is 0.11118171778462432
ElasticNet score is 0.11122236468840378
KRR score is 0.11322596291030232
GradientBoosting score is 0.11111049287476951
XGBoosting score is 0.11418336851210693
LGBoosting score is 0.11323582332434244


# generate prediction result

In [0]:
# Build the submission file from the tuned Ridge model.
#
# ridge_grid_search was created with GridSearchCV's default refit behaviour,
# which re-trains the best pipeline on the full training set at the end of
# fit(). It can therefore predict directly; running the whole grid search a
# second time here would only repeat that work.
test_predictions = ridge_grid_search.predict(df_test[numericColumns + categoricalColumns])

# expm1 is the inverse of log1p.
# NOTE(review): presumably SalePrice was log1p-transformed before it was
# pickled -- confirm against the notebook that produced df_train.pkl.
result = np.expm1(test_predictions)

sub = pd.DataFrame({'Id': df_test['Id'], 'SalePrice': result})

# index=False keeps the file to exactly the Id and SalePrice columns; without
# it pandas also writes the row index as an unnamed first column.
sub.to_csv(r"submission.csv", index=False)

# Last expression of the cell, so the first rows are displayed for a quick check.
sub.head()