In [1]:
import pandas as pd
import numpy as np
import math as mt

from sklearn.metrics import accuracy_score, r2_score, mean_squared_error, mean_absolute_error, precision_score, recall_score
from sklearn.metrics import roc_curve, roc_auc_score

from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.cluster import KMeans
from sklearn.linear_model import Lasso, Ridge, LogisticRegression, LinearRegression, SGDRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR

from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

from sklearn.neural_network import MLPRegressor

import xgboost
import lightgbm as lgb

from scipy.stats import pearsonr

from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder, StandardScaler, PolynomialFeatures

%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
#
# Helpers
#
def write_to_submission_file(predicted_labels, sale_ids, out_file="submission.csv", target='SalePrice', index_label="Id"):
    """Save predictions as a two-column CSV submission file.

    Parameters
    ----------
    predicted_labels : array-like
        Predicted values, one per row of the submission.
    sale_ids : array-like
        Identifiers used as the index of the written table.
    out_file : str
        Path of the CSV file to create.
    target : str
        Header of the prediction column.
    index_label : str
        Header of the identifier (index) column.
    """
    submission = pd.DataFrame(data=predicted_labels, index=sale_ids, columns=[target])
    submission.to_csv(out_file, index_label=index_label)

In [3]:
# Load House train & test data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Put the labels aside as an (n_rows, 1) column.
# `Series.as_matrix()` was removed in pandas 1.0; `.values` works on every version
# and is what the MSSubClass step of this notebook already uses.
labels_orig = train.SalePrice.values.reshape(-1, 1)

# Number of training rows; needed after pre-processing to split the merged frame again
size_of_train = train.shape[0]
print(size_of_train)

# Merge datasets so train and test rows go through identical preprocessing
del train['SalePrice']

# ignore_index gives the merged frame unique row labels; without it the labels of
# the two files overlap, which makes later index-aligned operations fragile.
# Rows keep their order (train first, then test), so positional slicing is unaffected.
train = pd.concat([train, test], ignore_index=True)

# Define variables
cols = []
cols_count = 0  # incremented once per processed source column

# Handling years as categorical (one-hot MoSold / YrSold instead of numeric)
use_fe_2 = False

# Using kind of a total sum of square feet
use_fe_3 = True

# Using new feature of Clustering
use_fe_4 = False

1460


### Data Preprocessing

In [4]:
# Fixes applied throughout this cell:
#  * `.as_matrix()` (removed in pandas 1.0) -> `.values`
#  * `train.X.fillna(..., inplace=True)` -> assignment back into `train`; the chained
#    in-place form does not update the frame under pandas Copy-on-Write
#  * `OneHotEncoder(sparse=False)` -> default sparse output densified with `.toarray()`;
#    the `sparse` argument was renamed in scikit-learn 1.2 and later removed

# MSSubClass: integer codes, one-hot encoded; this creates the `features` matrix
enc = OneHotEncoder()
features = enc.fit_transform(train['MSSubClass'].values.reshape(-1, 1)).toarray()
cols_count = cols_count + 1

# MSZoning: missing values get their own "Unknown" category
# (imputing "RM" was the alternative tried)
train['MSZoning'] = train['MSZoning'].fillna("Unknown")
features = np.concatenate([features, pd.get_dummies(train['MSZoning']).values], axis=1)
cols_count = cols_count + 1

# LotFrontage: missing values are replaced by 0
train['LotFrontage'] = train['LotFrontage'].fillna(0.0)
features = np.concatenate([features, train['LotFrontage'].values.reshape(-1, 1)], axis=1)
cols_count = cols_count + 1

# LotArea
features = np.concatenate([features, train['LotArea'].values.reshape(-1, 1)], axis=1)
cols_count = cols_count + 1

# Street
features = np.concatenate([features, pd.get_dummies(train['Street']).values], axis=1)
cols_count = cols_count + 1

# Alley: missing values become an explicit "NA" category
train['Alley'] = train['Alley'].fillna("NA")
features = np.concatenate([features, pd.get_dummies(train['Alley']).values], axis=1)
cols_count = cols_count + 1

# LotShape, LandContour: one-hot encoded, in this order
for col in ['LotShape', 'LandContour']:
    features = np.concatenate([features, pd.get_dummies(train[col]).values], axis=1)
    cols_count = cols_count + 1

In [5]:
# Fixes applied in this cell: `.as_matrix()` (removed in pandas 1.0) -> `.values`, and
# the chained `fillna(..., inplace=True)` -> assignment back into `train`, which also
# works under pandas Copy-on-Write.

# Utilities: missing values get their own "Unknown" category
# (imputing "AllPub" was the alternative tried)
train['Utilities'] = train['Utilities'].fillna("Unknown")

# Categorical columns, one-hot encoded and appended in exactly this order
for col in ['Utilities', 'LotConfig', 'LandSlope', 'Neighborhood',
            'Condition1', 'Condition2', 'BldgType', 'HouseStyle']:
    features = np.concatenate([features, pd.get_dummies(train[col]).values], axis=1)
    cols_count = cols_count + 1

# OverallQual, OverallCond: kept as single numeric columns
# (a one-hot encoding was tried and dropped)
for col in ['OverallQual', 'OverallCond']:
    features = np.concatenate([features, train[col].values.reshape(-1, 1)], axis=1)
    cols_count = cols_count + 1

In [6]:
# Fixes applied in this cell: `.as_matrix()` (removed in pandas 1.0) -> `.values`, and
# chained `fillna/replace(..., inplace=True)` -> assignment back into `train`, which
# also works under pandas Copy-on-Write.

# YearBuilt / YearRemodAdd are expressed as ages relative to the year of sale
train['HouseAge'] = train['YrSold'] - train['YearBuilt']
train['AgeSinceRemod'] = train['YrSold'] - train['YearRemodAdd']
for col in ['HouseAge', 'AgeSinceRemod']:
    features = np.concatenate([features, train[col].values.reshape(-1, 1)], axis=1)
    cols_count = cols_count + 1

# Exterior1st, Exterior2nd, MasVnrType: missing values get their own "Unknown" category
# (imputing "Wd Sdng" / "None" were the alternatives tried)
for col in ['Exterior1st', 'Exterior2nd', 'MasVnrType']:
    train[col] = train[col].fillna("Unknown")

# Categorical columns, one-hot encoded and appended in exactly this order
for col in ['RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType']:
    features = np.concatenate([features, pd.get_dummies(train[col]).values], axis=1)
    cols_count = cols_count + 1

# MasVnrArea: missing values are replaced by 0
train['MasVnrArea'] = train['MasVnrArea'].fillna(0.0)
features = np.concatenate([features, train['MasVnrArea'].values.reshape(-1, 1)], axis=1)
cols_count = cols_count + 1

# ExterQual, ExterCond: quality grades mapped onto a numeric scale
# (one-hot encoding was the alternative tried).
# astype(float) guarantees a numeric column and fails loudly on an unmapped grade.
di = {"Ex": 5.0, "Gd": 4.0, "TA": 3.0, "Fa": 2.0, "Po": 1.0}
for col in ['ExterQual', 'ExterCond']:
    train[col] = train[col].replace(di).astype(float)
    features = np.concatenate([features, train[col].values.reshape(-1, 1)], axis=1)
    cols_count = cols_count + 1

In [7]:
# Fixes applied in this cell: `.as_matrix()` (removed in pandas 1.0) -> `.values`, and
# chained `fillna/replace(..., inplace=True)` -> assignment back into `train`, which
# also works under pandas Copy-on-Write.

# Foundation
features = np.concatenate([features, pd.get_dummies(train['Foundation']).values], axis=1)
cols_count = cols_count + 1

# BsmtQual, BsmtCond: missing -> "NA", then grades mapped onto a numeric scale with
# "NA" as 0 (one-hot encoding was the alternative tried).
# astype(float) guarantees a numeric column and fails loudly on an unmapped grade.
di = {"Ex": 5.0, "Gd": 4.0, "TA": 3.0, "Fa": 2.0, "Po": 1.0, "NA": 0.0}
for col in ['BsmtQual', 'BsmtCond']:
    train[col] = train[col].fillna("NA").replace(di).astype(float)
    features = np.concatenate([features, train[col].values.reshape(-1, 1)], axis=1)
    cols_count = cols_count + 1

# BsmtExposure: missing values become an explicit "NA" category
train['BsmtExposure'] = train['BsmtExposure'].fillna("NA")
features = np.concatenate([features, pd.get_dummies(train['BsmtExposure']).values], axis=1)
cols_count = cols_count + 1

# BsmtFinType1 / BsmtFinType2 share one numeric scale; each is followed by its area column
di = {"GLQ": 6.0, "ALQ": 5.0, "BLQ": 4.0, "Rec": 3.0, "LwQ": 2.0, "Unf": 1.0, "NA": 0.0}
for type_col, area_col in [('BsmtFinType1', 'BsmtFinSF1'), ('BsmtFinType2', 'BsmtFinSF2')]:
    train[type_col] = train[type_col].fillna("NA").replace(di).astype(float)
    features = np.concatenate([features, train[type_col].values.reshape(-1, 1)], axis=1)
    cols_count = cols_count + 1

    # Missing areas are replaced by 0
    train[area_col] = train[area_col].fillna(0.0)
    features = np.concatenate([features, train[area_col].values.reshape(-1, 1)], axis=1)
    cols_count = cols_count + 1

# BsmtUnfSF, TotalBsmtSF: missing areas are replaced by 0
for col in ['BsmtUnfSF', 'TotalBsmtSF']:
    train[col] = train[col].fillna(0.0)
    features = np.concatenate([features, train[col].values.reshape(-1, 1)], axis=1)
    cols_count = cols_count + 1

In [8]:
# Fixes applied in this cell: `.as_matrix()` (removed in pandas 1.0) -> `.values`, and
# chained `fillna/replace(..., inplace=True)` -> assignment back into `train`, which
# also works under pandas Copy-on-Write.

# Heating
features = np.concatenate([features, pd.get_dummies(train['Heating']).values], axis=1)
cols_count = cols_count + 1

# HeatingQC: grades mapped onto a numeric scale (one-hot encoding was the alternative tried).
# astype(float) guarantees a numeric column and fails loudly on an unmapped grade.
di = {"Ex": 5.0, "Gd": 4.0, "TA": 3.0, "Fa": 2.0, "Po": 1.0}
train['HeatingQC'] = train['HeatingQC'].replace(di).astype(float)
features = np.concatenate([features, train['HeatingQC'].values.reshape(-1, 1)], axis=1)
cols_count = cols_count + 1

# CentralAir
features = np.concatenate([features, pd.get_dummies(train['CentralAir']).values], axis=1)
cols_count = cols_count + 1

# Electrical: missing values get their own "Unknown" category
train['Electrical'] = train['Electrical'].fillna("Unknown")
features = np.concatenate([features, pd.get_dummies(train['Electrical']).values], axis=1)
cols_count = cols_count + 1

# Numeric area columns, appended unchanged in exactly this order
for col in ['1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea']:
    features = np.concatenate([features, train[col].values.reshape(-1, 1)], axis=1)
    cols_count = cols_count + 1

# BsmtFullBath, BsmtHalfBath: missing counts are replaced by 0
for col in ['BsmtFullBath', 'BsmtHalfBath']:
    train[col] = train[col].fillna(0.0)
    features = np.concatenate([features, train[col].values.reshape(-1, 1)], axis=1)
    cols_count = cols_count + 1

In [9]:
# Fixes applied in this cell: `.as_matrix()` (removed in pandas 1.0) -> `.values`, and
# chained `fillna/replace(..., inplace=True)` -> assignment back into `train`, which
# also works under pandas Copy-on-Write.

# Numeric count columns, appended unchanged in exactly this order
for col in ['FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr']:
    features = np.concatenate([features, train[col].values.reshape(-1, 1)], axis=1)
    cols_count = cols_count + 1

# KitchenQual: missing -> "Unknown", then grades mapped onto a numeric scale with
# "Unknown" as 0 (imputing "TA" and one-hot encoding were the alternatives tried).
# astype(float) guarantees a numeric column and fails loudly on an unmapped grade.
di = {"Ex": 5.0, "Gd": 4.0, "TA": 3.0, "Fa": 2.0, "Po": 1.0, "Unknown": 0.0}
train['KitchenQual'] = train['KitchenQual'].fillna("Unknown").replace(di).astype(float)
features = np.concatenate([features, train['KitchenQual'].values.reshape(-1, 1)], axis=1)
cols_count = cols_count + 1

# TotRmsAbvGrd
features = np.concatenate([features, train['TotRmsAbvGrd'].values.reshape(-1, 1)], axis=1)
cols_count = cols_count + 1

# Functional: missing values get their own "Unknown" category
# (imputing "Typ" was the alternative tried)
train['Functional'] = train['Functional'].fillna("Unknown")
features = np.concatenate([features, pd.get_dummies(train['Functional']).values], axis=1)
cols_count = cols_count + 1

# Fireplaces
features = np.concatenate([features, train['Fireplaces'].values.reshape(-1, 1)], axis=1)
cols_count = cols_count + 1

# FireplaceQu, GarageType: missing values become an explicit "NA" category, then one-hot
for col in ['FireplaceQu', 'GarageType']:
    train[col] = train[col].fillna("NA")
    features = np.concatenate([features, pd.get_dummies(train[col]).values], axis=1)
    cols_count = cols_count + 1

In [10]:
# Fixes applied in this cell: `.as_matrix()` (removed in pandas 1.0) -> `.values`, and
# chained `fillna/replace(..., inplace=True)` -> assignment back into `train`, which
# also works under pandas Copy-on-Write.

# GarageYrBlt: rows with a missing garage year take the house's YearBuilt, and the
# column then enters the model as an age relative to the year of sale
train['GarageYrBlt'] = train['GarageYrBlt'].fillna(train['YearBuilt'])
train['GarageAge'] = train['YrSold'] - train['GarageYrBlt']
features = np.concatenate([features, train['GarageAge'].values.reshape(-1, 1)], axis=1)
cols_count = cols_count + 1

# GarageFinish: missing values become an explicit "NA" category
train['GarageFinish'] = train['GarageFinish'].fillna("NA")
features = np.concatenate([features, pd.get_dummies(train['GarageFinish']).values], axis=1)
cols_count = cols_count + 1

# GarageCars, GarageArea: missing values are replaced by 0
for col in ['GarageCars', 'GarageArea']:
    train[col] = train[col].fillna(0.0)
    features = np.concatenate([features, train[col].values.reshape(-1, 1)], axis=1)
    cols_count = cols_count + 1

# GarageQual, GarageCond: missing -> "NA", then grades mapped onto a numeric scale
# with "NA" as 0 (one-hot encoding was the alternative tried).
# astype(float) guarantees a numeric column and fails loudly on an unmapped grade.
di = {"Ex": 5.0, "Gd": 4.0, "TA": 3.0, "Fa": 2.0, "Po": 1.0, "NA": 0.0}
for col in ['GarageQual', 'GarageCond']:
    train[col] = train[col].fillna("NA").replace(di).astype(float)
    features = np.concatenate([features, train[col].values.reshape(-1, 1)], axis=1)
    cols_count = cols_count + 1

# PavedDrive
features = np.concatenate([features, pd.get_dummies(train['PavedDrive']).values], axis=1)
cols_count = cols_count + 1

# Numeric area columns, appended unchanged in exactly this order
for col in ['WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch']:
    features = np.concatenate([features, train[col].values.reshape(-1, 1)], axis=1)
    cols_count = cols_count + 1

In [11]:
# Fixes applied in this cell: `.as_matrix()` (removed in pandas 1.0) -> `.values`, and
# chained `fillna/replace(..., inplace=True)` -> assignment back into `train`, which
# also works under pandas Copy-on-Write.

# Numeric area columns, appended unchanged in exactly this order
for col in ['3SsnPorch', 'ScreenPorch', 'PoolArea']:
    features = np.concatenate([features, train[col].values.reshape(-1, 1)], axis=1)
    cols_count = cols_count + 1

# PoolQC: missing -> "NA", then grades mapped onto a numeric scale with "NA" as 0
# (one-hot encoding was the alternative tried).
# astype(float) guarantees a numeric column and fails loudly on an unmapped grade.
di = {"Ex": 5.0, "Gd": 4.0, "TA": 3.0, "Fa": 2.0, "Po": 1.0, "NA": 0.0}
train['PoolQC'] = train['PoolQC'].fillna("NA").replace(di).astype(float)
features = np.concatenate([features, train['PoolQC'].values.reshape(-1, 1)], axis=1)
cols_count = cols_count + 1

# Fence, MiscFeature: missing values become an explicit "NA" category, then one-hot
for col in ['Fence', 'MiscFeature']:
    train[col] = train[col].fillna("NA")
    features = np.concatenate([features, pd.get_dummies(train[col]).values], axis=1)
    cols_count = cols_count + 1

# MiscVal
features = np.concatenate([features, train['MiscVal'].values.reshape(-1, 1)], axis=1)
cols_count = cols_count + 1

# MoSold, YrSold: one-hot encoded when use_fe_2 is set, plain numeric otherwise
for col in ['MoSold', 'YrSold']:
    if use_fe_2:
        encoded = pd.get_dummies(train[col]).values
    else:
        encoded = train[col].values.reshape(-1, 1)

    features = np.concatenate([features, encoded], axis=1)
    cols_count = cols_count + 1

# SaleType: missing values get their own "Unknown" category
# (imputing "WD" was the alternative tried)
train['SaleType'] = train['SaleType'].fillna("Unknown")
features = np.concatenate([features, pd.get_dummies(train['SaleType']).values], axis=1)
cols_count = cols_count + 1

# SaleCondition
features = np.concatenate([features, pd.get_dummies(train['SaleCondition']).values], axis=1)
cols_count = cols_count + 1

In [12]:
# LivingAreaSF and LandRatio: engineered features, only built when use_fe_3 is set.
# `.as_matrix()` was removed in pandas 1.0, so `.values` is used instead.
if use_fe_3:
    # Sum of floor, basement, garage, masonry veneer, deck and porch areas.
    # NOTE(review): EnclosedPorch is not part of this sum — confirm that is intended.
    train['LivingAreaSF'] = (
        train['1stFlrSF'] + train['2ndFlrSF'] + train['TotalBsmtSF']
        + train['GarageArea'] + train['MasVnrArea'] + train['WoodDeckSF']
        + train['OpenPorchSF'] + train['3SsnPorch'] + train['ScreenPorch']
    )

    features = np.concatenate([features, train['LivingAreaSF'].values.reshape(-1, 1)], axis=1)
    cols_count = cols_count + 1

    # Summed area as a fraction of the lot area
    train['LandRatio'] = train['LivingAreaSF'] / train['LotArea']
    features = np.concatenate([features, train['LandRatio'].values.reshape(-1, 1)], axis=1)
    cols_count = cols_count + 1

### Scaling

In [13]:
# Prepare labels (SalePrice minus MiscVal was the alternative target tried)
labels = labels_orig.astype(float)

# Divide the labels by their maximum; labels_max is kept so predictions can be
# converted back to prices later
labels_max = labels.max()
labels = labels / labels_max

# Divide every feature column holding at least one value above 25 by its own maximum.
# `features` still contains train and test rows here, so the maximum spans both.
count = 0

for jj in range(features.shape[1]):
    column = features[:, jj]

    # Columns that never exceed the threshold are left untouched
    if not (column > 25.).any():
        continue

    mx = float(column.max())
    features[:, jj] = column / mx
    count += 1

print("Total", count, "features scaled")


Total 24 features scaled


### Preparing the training and test sets

In [14]:
# Undo the earlier merge: the first size_of_train rows are the training set,
# everything after them is the test set
features_train = features[:size_of_train]
features_test = features[size_of_train:]
labels_train = labels

print("Total features processed:", cols_count)
print()
for part in (features_train, labels_train, features_test):
    print(part.shape)

Total features processed: 81

(1460, 280)
(1460, 1)
(1459, 280)


## Algorithm Selection  

In [15]:
#
# For Averaging
#
# Starts empty; each model cell below adds one column of test-set predictions,
# and the final cell blends those columns.
results = pd.DataFrame()

### Light GBM

In [17]:
%%time
def choose_LGB(X, y):
    """Grid-search a LightGBM regressor and return the best estimator found.

    The search scores candidates by negative mean absolute error using
    GridSearchCV's default cross-validation, and prints the best score and
    the best parameter combination.

    Parameters
    ----------
    X : array-like
        Training feature matrix.
    y : numpy.ndarray
        Training targets; flattened to 1-D before fitting.

    Returns
    -------
    The `best_estimator_` of the fitted grid search.
    """
    # "boosting_type" (gbdt / dart / goss / rf) was considered but is not searched
    search_space = {
        "n_estimators": [50, 100, 150, 200, 250, 300, 350],
        "max_depth": [2, 3, 4, 5, 6],
        "learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1],
    }

    search = GridSearchCV(
        lgb.LGBMRegressor(random_state=23, n_jobs=-1),
        search_space,
        scoring='neg_mean_absolute_error',
    )
    search.fit(X, y.reshape(-1))

    for summary in (search.best_score_, search.best_params_):
        print(summary)

    return search.best_estimator_


# Run the LightGBM grid search on the training split; LGB is the best estimator it returns
LGB = choose_LGB(features_train, labels_train)

-0.0205232781156
{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}
Wall time: 1min 10s


In [33]:
%%time
# Iteratively refit LGB, each round dropping the training row it fits worst, then
# predict the test set with the last fit.
# The loop used to be `range(73)` with a `break` at i == 72; one constant now states
# how many rows are removed.
n_trim = 72

res = []  # training MAE after each refit

# np.array(...) copies, so features_train / labels_train themselves are not shortened
tmp_f = np.array(features_train)
tmp_l = np.array(labels_train)

for i in range(n_trim + 1):
    LGB.fit(tmp_f, tmp_l.reshape(-1))
    prediction = LGB.predict(tmp_f).reshape(-1, 1)

    # NOTE(review): this error is measured on the rows the model was just fitted on,
    # so it is an in-sample figure, not a validation score.
    res.append(mean_absolute_error(tmp_l, prediction))

    # The final round only refits; no further row is removed
    if i == n_trim:
        break

    # Remove the single row with the largest absolute error
    tmp1 = np.abs(tmp_l - prediction)
    d = tmp1.argmax()

    tmp_f = np.delete(tmp_f, d, 0)
    tmp_l = np.delete(tmp_l, d, 0)

# np.argmin instead of the bare `argmin`, which only exists through `%pylab inline`
print(min(res), np.argmin(res))

prediction = LGB.predict(features_test)

results['LGB'] = prediction.reshape(-1)

0.00880652030721 72
Wall time: 14.7 s


### Gradient Boosting

In [25]:
def choose_GBM(X, y):
    """Grid-search a GradientBoostingRegressor and return the best estimator found.

    The search scores candidates by negative mean absolute error using
    GridSearchCV's default cross-validation, and prints the best score and
    the best parameter combination.

    Parameters
    ----------
    X : array-like
        Training feature matrix.
    y : numpy.ndarray
        Training targets; flattened to 1-D before fitting.

    Returns
    -------
    The `best_estimator_` of the fitted grid search.
    """
    # A wider n_estimators range starting at 250 was tried earlier
    search_space = {
        "n_estimators": [350, 400, 450, 500, 550, 600],
        "min_samples_split": [2, 3, 4, 5, 6],
        "min_samples_leaf": [1, 2, 3, 4],
    }

    search = GridSearchCV(
        GradientBoostingRegressor(random_state=23),
        search_space,
        scoring='neg_mean_absolute_error',
    )
    search.fit(X, y.reshape(-1))

    for summary in (search.best_score_, search.best_params_):
        print(summary)

    return search.best_estimator_


# Run the gradient-boosting grid search on the training split; GBM is the best estimator it returns
GBM = choose_GBM(features_train, labels_train)

-0.0202452826565
{'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 600}


In [34]:
%%time
# Iteratively refit GBM, each round dropping the training row it fits worst, then
# predict the test set with the last fit.
# The loop used to be `range(75)` with a `break` at i == 70, so only 70 rows were ever
# removed; one constant now states that.
n_trim = 70

res = []  # training MAE after each refit

# np.array(...) copies, so features_train / labels_train themselves are not shortened
tmp_f = np.array(features_train)
tmp_l = np.array(labels_train)

for i in range(n_trim + 1):
    GBM.fit(tmp_f, tmp_l.reshape(-1))
    prediction = GBM.predict(tmp_f).reshape(-1, 1)

    # NOTE(review): this error is measured on the rows the model was just fitted on,
    # so it is an in-sample figure, not a validation score.
    res.append(mean_absolute_error(tmp_l, prediction))

    # The final round only refits; no further row is removed
    if i == n_trim:
        break

    # Remove the single row with the largest absolute error
    tmp1 = np.abs(tmp_l - prediction)
    d = tmp1.argmax()

    tmp_f = np.delete(tmp_f, d, 0)
    tmp_l = np.delete(tmp_l, d, 0)

# np.argmin instead of the bare `argmin`, which only exists through `%pylab inline`
print(min(res), np.argmin(res))

prediction = GBM.predict(features_test)

results['GBM'] = prediction.reshape(-1)

0.00490467338095 70
Wall time: 5min 51s


### Random Forest

In [27]:
%%time
# Grid-search a random forest, scored by negative mean absolute error with
# GridSearchCV's default cross-validation; RFR ends up as the best estimator found.
RFR = RandomForestRegressor(n_jobs=-1, random_state=23)

parameters_grid = dict(
    n_estimators=[200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700],
    min_samples_split=[2, 3, 4, 5, 6],
    min_samples_leaf=[1, 2, 3, 4],
)

gcv = GridSearchCV(estimator=RFR, param_grid=parameters_grid, scoring='neg_mean_absolute_error')
gcv.fit(features_train, labels_train.reshape(-1))

for summary in (gcv.best_score_, gcv.best_params_):
    print(summary)

RFR = gcv.best_estimator_

-0.0231520228587
{'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 700}
Wall time: 25min


In [36]:
%%time
# Iteratively refit RFR, each round dropping the training row it fits worst, then
# predict the test set with the last fit.
# The loop used to be `range(75)` with a `break` at i == 73, so only 73 rows were ever
# removed; one constant now states that.
n_trim = 73

res = []  # training MAE after each refit

# np.array(...) copies, so features_train / labels_train themselves are not shortened
tmp_f = np.array(features_train)
tmp_l = np.array(labels_train)

for i in range(n_trim + 1):
    RFR.fit(tmp_f, tmp_l.reshape(-1))
    prediction = RFR.predict(tmp_f).reshape(-1, 1)

    # NOTE(review): this error is measured on the rows the model was just fitted on,
    # so it is an in-sample figure, not a validation score.
    res.append(mean_absolute_error(tmp_l, prediction))

    # The final round only refits; no further row is removed
    if i == n_trim:
        break

    # Remove the single row with the largest absolute error
    tmp1 = np.abs(tmp_l - prediction)
    d = tmp1.argmax()

    tmp_f = np.delete(tmp_f, d, 0)
    tmp_l = np.delete(tmp_l, d, 0)

# np.argmin instead of the bare `argmin`, which only exists through `%pylab inline`
print(min(res), np.argmin(res))

prediction = RFR.predict(features_test)

results['RF'] = prediction.reshape(-1)

0.00654207637137 73
Wall time: 6min 54s


### Kernel Ridge

In [29]:
def choose_KernelRidge(X, y):
    """Grid-search a KernelRidge regressor and return the best estimator found.

    The search covers the kernel type, the regularisation strength `alpha` and
    the kernel coefficient `gamma`. Candidates are scored by negative mean
    absolute error using GridSearchCV's default cross-validation; the best
    score and best parameter combination are printed.

    Parameters
    ----------
    X : array-like
        Training feature matrix.
    y : numpy.ndarray
        Training targets; flattened to 1-D before fitting.

    Returns
    -------
    The `best_estimator_` of the fitted grid search.
    """
    search_space = {
        "kernel": ['polynomial', 'rbf'],
        "alpha": [1e0, 0.1, 1e-2, 1e-3],
        # ten values spaced logarithmically between 1e-3 and 1e3
        "gamma": np.logspace(-3, 3, 10),
    }

    search = GridSearchCV(KernelRidge(), search_space, scoring='neg_mean_absolute_error')
    search.fit(X, y.reshape(-1))

    for summary in (search.best_score_, search.best_params_):
        print(summary)

    return search.best_estimator_


# Run the kernel-ridge grid search on the training split; KRG is the best estimator it returns
KRG = choose_KernelRidge(features_train, labels_train)

-0.0226646722288
{'alpha': 0.1, 'gamma': 0.0046415888336127772, 'kernel': 'polynomial'}


In [35]:
%%time
# Iteratively refit KRG, each round dropping the training row it fits worst, then
# predict the test set with the last fit.
# The loop used to be `range(75)` with a `break` at i == 74 (its last iteration);
# one constant now states how many rows are removed.
n_trim = 74

res = []  # training MAE after each refit

# np.array(...) copies, so features_train / labels_train themselves are not shortened
tmp_f = np.array(features_train)
tmp_l = np.array(labels_train)

for i in range(n_trim + 1):
    KRG.fit(tmp_f, tmp_l.reshape(-1))
    prediction = KRG.predict(tmp_f).reshape(-1, 1)

    # NOTE(review): this error is measured on the rows the model was just fitted on,
    # so it is an in-sample figure, not a validation score.
    res.append(mean_absolute_error(tmp_l, prediction))

    # The final round only refits; no further row is removed
    if i == n_trim:
        break

    # Remove the single row with the largest absolute error
    tmp1 = np.abs(tmp_l - prediction)
    d = tmp1.argmax()

    tmp_f = np.delete(tmp_f, d, 0)
    tmp_l = np.delete(tmp_l, d, 0)

# np.argmin instead of the bare `argmin`, which only exists through `%pylab inline`
print(min(res), np.argmin(res))

prediction = KRG.predict(features_test)

results['KRG'] = prediction.reshape(-1)

0.00823247822089 74
Wall time: 24.9 s


### XGBoost

In [31]:
def choose_XGB(X, y):
    """Grid-search an XGBoost regressor and return the best estimator found.

    The search scores candidates by negative mean absolute error using
    GridSearchCV's default cross-validation, and prints the best score and
    the best parameter combination.

    Parameters
    ----------
    X : array-like
        Training feature matrix.
    y : numpy.ndarray
        Training targets; flattened to 1-D before fitting.

    Returns
    -------
    The `best_estimator_` of the fitted grid search.
    """
    # Earlier experiments also used a shorter n_estimators list, depths up to 8 and
    # a learning_rate grid; only the two parameters below are searched now.
    search_space = {
        "n_estimators": [300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000],
        "max_depth": [2, 3, 4, 5],
    }

    search = GridSearchCV(xgboost.XGBRegressor(), search_space, scoring='neg_mean_absolute_error')
    search.fit(X, y.reshape(-1))

    for summary in (search.best_score_, search.best_params_):
        print(summary)

    return search.best_estimator_


# Run the XGBoost grid search on the training split; XGB is the best estimator it returns
XGB = choose_XGB(features_train, labels_train)

-0.0202368960687
{'max_depth': 2, 'n_estimators': 700}


In [37]:
%%time
# Iteratively refit XGB, each round dropping the training row it fits worst, then
# predict the test set with the last fit.
# The loop used to be `range(75)` with a `break` at i == 73, so only 73 rows were ever
# removed; one constant now states that.
n_trim = 73

res = []  # training MAE after each refit

# np.array(...) copies, so features_train / labels_train themselves are not shortened
tmp_f = np.array(features_train)
tmp_l = np.array(labels_train)

for i in range(n_trim + 1):
    XGB.fit(tmp_f, tmp_l.reshape(-1))
    prediction = XGB.predict(tmp_f).reshape(-1, 1)

    # NOTE(review): this error is measured on the rows the model was just fitted on,
    # so it is an in-sample figure, not a validation score.
    res.append(mean_absolute_error(tmp_l, prediction))

    # The final round only refits; no further row is removed
    if i == n_trim:
        break

    # Remove the single row with the largest absolute error
    tmp1 = np.abs(tmp_l - prediction)
    d = tmp1.argmax()

    tmp_f = np.delete(tmp_f, d, 0)
    tmp_l = np.delete(tmp_l, d, 0)

# np.argmin instead of the bare `argmin`, which only exists through `%pylab inline`
print(min(res), np.argmin(res))

prediction = XGB.predict(features_test)

results['XGB'] = prediction.reshape(-1)

0.00866033397388 73
Wall time: 6min 44s


### Averaging  

In [38]:
# Show the first rows of the per-model test predictions (still divided by labels_max)
results.head()

Unnamed: 0,LGB,KRG,XGB,GBM,RF
0,0.172551,0.158953,0.167144,0.163088,0.179363
1,0.211509,0.212709,0.218139,0.227571,0.209968
2,0.250716,0.236492,0.242999,0.249454,0.241507
3,0.248756,0.257584,0.254926,0.264987,0.252983
4,0.249427,0.244794,0.239664,0.235327,0.252389


In [40]:
# Weighted average of the five models' test predictions; the weights add up to 1
blend = 0.3*results['KRG'] + 0.1*results['RF'] + 0.1*results['LGB'] + 0.2*results['GBM'] + 0.3*results['XGB']
tmp = np.array(blend.tolist())

# Multiplying by labels_max undoes the label scaling, giving prices again
write_to_submission_file(
    tmp * labels_max,
    test.Id,
    out_file="submission.Short+WA.csv",
    target='SalePrice',
    index_label="Id",
)