Compare Models and Ensemble Their Predictions

In [1]:
import numpy as np
import pandas as pd

import math
import re

from matplotlib import pyplot as plt
# Apply the 'ggplot' style sheet to figures drawn after this point.
plt.style.use('ggplot')

import seaborn as sns

import warnings
# NOTE: this silences every warning category for the whole session, including
# deprecation warnings raised by pandas / scikit-learn / xgboost.
warnings.filterwarnings('ignore')

# Raise pandas' display limits so wide or long frames are shown in full.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

from sklearn.model_selection import train_test_split, KFold
from sklearn.model_selection import cross_val_score


# Estimators are imported under short aliases: lr, gbr, rfr.
from sklearn.linear_model import Lasso, LinearRegression as lr
from sklearn.ensemble import GradientBoostingRegressor as gbr, RandomForestRegressor as rfr
import xgboost as xgb
# Project-local preprocessing helper (preprocess.py).
from preprocess import impute

# Project-local stacking helper (stacking.py).
from stacking import stacking_regression
from sklearn.metrics import mean_squared_error


import os
# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime"
# abort that can happen when several libraries each load their own OpenMP
# runtime — confirm it is still needed, and whether it must be set before the
# numeric libraries above are imported to take effect.
os.environ['KMP_DUPLICATE_LIB_OK']='True'

Load the dataframes and set up the models

In [2]:
# --- Load the training data and separate the target from the features --------
df_train = pd.read_csv('data/train.csv.gz', compression='gzip', header=0, sep=',', quotechar='"')
df_train_label = df_train[["Id", "SalePrice"]].set_index("Id")
df_train = df_train.drop('SalePrice', axis=1).set_index("Id")

# --- Load the test data ("Id" is still a regular column at this point) --------
df_test = pd.read_csv('data/test.csv.gz', compression='gzip', header=0, sep=',', quotechar='"')

# Columns that contain missing values in the test set but not in the training set.
test_null_columns = df_test.columns[df_test.isnull().any()]
train_null_columns = df_train.columns[df_train.isnull().any()]
test_null_only_ColIdx = test_null_columns.difference(train_null_columns)

# Test rows with a missing value in at least one of those test-only columns.
# The positions refer to the default integer index of the freshly read frame.
test_null_only_mask = df_test[test_null_only_ColIdx].isnull().any(axis=1)
test_null_only_RowIdx = df_test.index[test_null_only_mask].tolist()

problematicTestSet = df_test.loc[test_null_only_mask].set_index("Id")
fineTestSet = df_test.loc[~test_null_only_mask].set_index("Id")  # 1447 records (original author's note)
df_test = df_test.set_index("Id")

# Remember which Ids belong to which set BEFORE the frames are merged, so the
# split after imputation does not depend on the numeric ordering of the Ids or
# on the order in which df_train / df_test are rebound below.
train_ids = df_train.index
test_ids = df_test.index

# Impute / encode train and test together so both get identical columns.
df = pd.concat([df_train, df_test], axis=0, sort=True)

# impute() comes from the project-local preprocess module. According to the
# original comments, the second argument selects label encoding (False) or
# one-hot encoding (True) of the categorical columns.
df_label_count, encodedDic = impute(df, False)  # label encode categorical data
df_onehot, _ = impute(df, True)  # onehot encode categorical data

# Split the one-hot encoded frame back into its train and test parts by Id.
df_train = df_onehot[df_onehot.index.isin(train_ids)]
df_test = df_onehot[df_onehot.index.isin(test_ids)]

# The models are later fitted on df_train.values / df_train_label.values, so
# features and labels must be aligned row by row.
assert df_train.index.equals(df_train_label.index), \
    "training features and labels are not aligned on Id"

print(df_train.shape)
print(df_test.shape)

(1460, 255)
(1459, 255)


In [3]:
# split data into train and test sets
#X_train, X_test, y_train, y_test = train_test_split(df_train, df_train_label, test_size=0.2, random_state=0)

In [4]:
# Reload imported modules automatically before each cell runs, so that edits
# made to helper .py files are picked up without restarting the kernel
# (useful while debugging functions that live in another .py script).
%load_ext autoreload
%autoreload 2

In [5]:
def rmsle(y, y_pred):
    """Root mean squared error between the natural logs of `y` and `y_pred`.

    Uses `np.log` (not `np.log1p`), so both inputs must be strictly positive
    to give a finite result.
    """
    log_true = np.log(y)
    log_pred = np.log(y_pred)
    squared_log_error = mean_squared_error(log_true, log_pred)
    return np.sqrt(squared_log_error)

def rmse(y, y_pred):
    """Root mean squared error between `y` and `y_pred`."""
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

def evaluate(model, test_features, test_labels):
    """Score a fitted model on held-out data.

    Parameters
    ----------
    model : object
        Any fitted estimator exposing ``predict(test_features)``.
    test_features : array-like
        Features passed straight to ``model.predict``.
    test_labels : pandas.Series
        True target values; its index is reused for the returned frame.

    Returns
    -------
    list
        ``[rmse, rmsle, pred]`` where ``rmse`` is the root mean squared error,
        ``rmsle`` is the root mean squared error of ``log1p`` values, and
        ``pred`` is a DataFrame with columns ``residuals`` (label minus
        prediction) and ``predictions``, indexed like ``test_labels``.
    """
    # Flatten so that (n, 1) outputs line up element-wise with the labels.
    predictions = np.asarray(model.predict(test_features)).ravel()
    residuals = test_labels - predictions

    # Local names deliberately differ from the module-level rmse()/rmsle()
    # helpers so those are not shadowed inside this function.
    rmse_value = np.sqrt(np.mean(residuals ** 2))
    rmsle_value = np.sqrt(np.mean((np.log1p(predictions) - np.log1p(test_labels)) ** 2))

    # Attach the label index at construction time; the previous
    # `pred.set_index(...)` call discarded its result, so the returned frame
    # silently kept a default 0..n-1 index.
    pred = pd.DataFrame(
        {'residuals': residuals.values, 'predictions': predictions},
        index=residuals.index,
    )

    return [rmse_value, rmsle_value, pred]

In [8]:
# Base learners handed to stacking_regression().
models = [
    # Conservative random forest: many shallow trees.
    rfr(
        n_estimators=3000,
        max_depth=6,
        max_features='sqrt',
        random_state=0,
    ),

    # Conservative gradient boosting: small learning rate, shallow trees.
    gbr(
        n_estimators=3000,
        learning_rate = 0.005,
        max_depth=3,
        max_features='sqrt',
        min_samples_leaf=15,
        min_samples_split=10,
        random_state=0,
    ),

    # Linear model: Lasso with its default regularisation strength.
    Lasso(random_state=0),

    # XGBoost regressor.
    # NOTE(review): `silent` and `nthread` are legacy argument names; newer
    # xgboost releases document `verbosity` and `n_jobs` instead — confirm
    # against the installed version before changing.
    xgb.XGBRegressor(
        n_estimators=2200,
        learning_rate=0.05,
        max_depth=3,
        min_child_weight=1.7817,
        gamma=0.0468,
        subsample=0.5213,
        colsample_bytree=0.4603,
        reg_alpha=0.4640,
        reg_lambda=0.8571,
        silent=1,
        random_state =7,
        nthread = -1,
    ),
]

# Meta model handed to stacking_regression() together with the base learners.
# A plain linear regression, lr(normalize=True), was the alternative tried here.
# NOTE(review): the `normalize` argument was removed from scikit-learn's linear
# models in version 1.2, so this line requires an older scikit-learn.
meta_model = Lasso(normalize=True)

In [9]:
%%time
# Run the project-local stacking helper on the full training set and predict
# the test set. Per-fold scores are reported with rmsle (verbose=2).
# NOTE(review): presumably the target is transformed with log1p for fitting
# and predictions are mapped back with expm1 — confirm in stacking.py.
final_prediction = stacking_regression(
    models,
    meta_model,
    df_train.values,
    df_train_label.values,
    df_test.values,
    transform_target=np.log1p,
    transform_pred=np.expm1,
    metric=rmsle,
    verbose=2,
)

metric: [rmsle]

model 0: [RandomForestRegressor]
    fold 0: [0.15645687]
    fold 1: [0.17662361]
    fold 2: [0.15304140]
    ----
    MEAN:   [0.16238059]

model 1: [GradientBoostingRegressor]
    fold 0: [0.11914925]
    fold 1: [0.14023782]
    fold 2: [0.12111389]
    ----
    MEAN:   [0.12719390]

model 2: [Lasso]
    fold 0: [0.25735691]
    fold 1: [0.28118993]
    fold 2: [0.26747606]
    ----
    MEAN:   [0.26885270]

model 3: [XGBRegressor]
    fold 0: [0.11725741]
    fold 1: [0.13908155]
    fold 2: [0.12660985]
    ----
    MEAN:   [0.12796320]

CPU times: user 1min 21s, sys: 388 ms, total: 1min 22s
Wall time: 41.6 s


In [11]:
# Build the submission file: one integer SalePrice per test Id.
# The float predictions are rounded and cast explicitly. Passing
# dtype=np.int64 to the DataFrame constructor with float data is handled
# differently across pandas versions, so the cast is no longer left to pandas.
d = {'SalePrice': np.rint(final_prediction.reshape(-1)).astype(np.int64)}
sub = pd.DataFrame(data=d, index=df_test.index)
# Name the index column explicitly so the CSV header is always "Id,SalePrice",
# even if the index name was lost during preprocessing.
sub.to_csv('Stacking.csv', index_label='Id')