In [26]:
%matplotlib inline
from IPython.display import Image

import os, sys, re, datetime, gc
from pathlib import Path
from itertools import product

import numpy as np
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

import lightgbm as lgb
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from lightgbm import LGBMRegressor

# Raise pandas' display limits (up to 50 columns / 600 rows) for frames shown in the notebook.
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 600)

# Print the version of each core library so the results can be tied to an environment.
for p in (np, pd, sklearn, lgb):
    print('{} {}'.format(p.__name__, p.__version__))

numpy 1.14.2
pandas 0.23.0
sklearn 0.19.1
lightgbm 2.2.1


# Load dataset

In [2]:
# Directory holding the competition files, located under the current user's home directory.
data_dir = Path.home().joinpath('.kaggle/competitions/house-prices-advanced-regression-techniques')

In [71]:
# Read the gzip-compressed train and test CSVs (in that order) from `data_dir`.
data, test = (
    pd.read_csv(data_dir / csv_name, compression='gzip')
    for csv_name in ('train.csv.gz', 'test.csv.gz')
)

# Preprocess

In [48]:
# One-hot encode the string-typed (object) columns of the training frame.
dummies = pd.get_dummies(data.select_dtypes(include=['object']))

# Append only the indicator columns that `data` does not already contain.
# Without this guard, re-running the cell concatenates the same dummies again
# and leaves `data` (and therefore `X`) with duplicated column names.
data = pd.concat([data, dummies.loc[:, ~dummies.columns.isin(data.columns)]], axis=1)

# Features: every non-object column except the target, with missing values set to 0.
X = data.drop(['SalePrice'], axis=1).select_dtypes(exclude=['object']).fillna(0)

# Target: natural log of the sale price.
# Filling a missing price with 0 would turn it into log(0) = -inf, so fail
# loudly instead; the comparison is False for NaN as well as for values <= 0.
assert (data['SalePrice'] > 0).all(), \
    "SalePrice contains missing or non-positive values; clean them before taking the log"
y = np.log(data['SalePrice'])

In [72]:
# Build the test feature matrix with exactly the same columns as `X`.
#
# * The indicators are derived from `test` itself. Building them from the
#   training frame `data` and concatenating on the index would attach the
#   training rows' categories to unrelated test rows.
# * `pd.get_dummies` on the whole frame keeps the non-encoded columns and
#   replaces each object (or category) column with its indicator columns.
# * `reindex` aligns to the training columns: levels seen only in training are
#   added as all-zero columns (plain `test[list(X.columns)]` raises KeyError
#   for them) and levels seen only in the test set are dropped.
# * Remaining missing values are set to 0, as for the training features.
# The cell can be re-run: once `test` has no object columns left,
# `get_dummies` has nothing to encode and the result is unchanged.
test = (
    pd.get_dummies(test)
    .reindex(columns=X.columns, fill_value=0)
    .fillna(0)
)

# Split train / valid / meta-valid (hold-out)

In [11]:
# Seed passed to both train_test_split calls so the splits are reproducible.
random_state = 1021

In [53]:
# Set aside 20% of the rows (X_meta_valid / y_meta_valid) as a hold-out;
# the remaining 80% is split again for the base and meta models.
X_train_valid, X_meta_valid, y_train_valid, y_meta_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=random_state,
)

In [54]:
# Split the non-hold-out rows in half: X_train fits the base models,
# X_valid supplies the base-model predictions that train the meta model.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_valid,
    y_train_valid,
    test_size=0.5,
    random_state=random_state,
)

# Stacking

In [55]:
# --- Level 0: three different regressors, all fitted on the training fold ---
base_model_1 = LinearRegression()
base_model_2 = LGBMRegressor()
base_model_3 = KNeighborsRegressor()
base_models = (base_model_1, base_model_2, base_model_3)

for base_model in base_models:
    base_model.fit(X_train, y_train)

# Base-model predictions on the validation fold (rows the base models were not
# fitted on); these become the meta model's training features.
base_pred_1, base_pred_2, base_pred_3 = [m.predict(X_valid) for m in base_models]

# Base-model predictions on the meta-validation hold-out, used for scoring
# and as the meta model's input at evaluation time.
valid_pred_1, valid_pred_2, valid_pred_3 = [m.predict(X_meta_valid) for m in base_models]

# Hold-out error of each base model on its own (targets are log prices).
print("mean squared error of model 1: {:.4f}".format(mean_squared_error(y_meta_valid, valid_pred_1)))
print("mean squared error of model 2: {:.4f}".format(mean_squared_error(y_meta_valid, valid_pred_2)))
print("mean squared error of model 3: {:.4f}".format(mean_squared_error(y_meta_valid, valid_pred_3)))

# --- Level 1: one column per base model ---
stacked_predictions = np.column_stack([base_pred_1, base_pred_2, base_pred_3])
stacked_valid_predictions = np.column_stack([valid_pred_1, valid_pred_2, valid_pred_3])

# The meta model learns how to combine the base predictions, using the
# validation-fold targets.
meta_model = LinearRegression()
meta_model.fit(stacked_predictions, y_valid)

# Score the stacked ensemble on the same hold-out as the base models.
meta_valid_pred = meta_model.predict(stacked_valid_predictions)
print("mean squared error of meta model: {:.4f}".format(mean_squared_error(y_meta_valid, meta_valid_pred)))

mean squared error of model 1: 0.0239
mean squared error of model 2: 0.0181
mean squared error of model 3: 0.0634
mean squared error of meta model: 0.0175
