In [190]:
import pandas as pd
import numpy as np
import math as mt

from sklearn.metrics import accuracy_score, r2_score, mean_squared_error, mean_absolute_error, precision_score, recall_score
from sklearn.metrics import roc_curve, roc_auc_score

from sklearn.ensemble import GradientBoostingClassifier, RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.cluster import KMeans
from sklearn.linear_model import Lasso, Ridge, LogisticRegression, LinearRegression, SGDRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR

from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

from sklearn.neural_network import MLPClassifier

import xgboost
import lightgbm as lgb

from scipy.stats import pearsonr

from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder, StandardScaler, PolynomialFeatures

%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [191]:
# Load House train data
train = pd.read_csv('train.csv')

# Put the labels aside
labels_orig = train.SalePrice.as_matrix().reshape(-1, 1)

# Which columns have NaN values?
count = 0

#for col in train.columns:
#    if(train[col].isnull().sum() > 0):
#        print(col)
#        count = count + 1
        
print("Total", count, "columns with NaNs")

# Define variables
cols = []
cols_count = 0

# Handling years as categorial 
use_fe_2 = False

# Using kind of a total sum of square feet
use_fe_3 = True

# Using new feature of Clustering
use_fe_4 = False

Total 0 columns with NaNs


In [192]:
train[train.SalePrice > 500000][['LotArea', '1stFlrSF', '2ndFlrSF', 'TotalBsmtSF',
                                'GarageArea', 'MasVnrArea', 'WoodDeckSF', 'OpenPorchSF', '3SsnPorch', 
                                'ScreenPorch', 'SalePrice']]

Unnamed: 0,LotArea,1stFlrSF,2ndFlrSF,TotalBsmtSF,GarageArea,MasVnrArea,WoodDeckSF,OpenPorchSF,3SsnPorch,ScreenPorch,SalePrice
178,17423,2234,0,2216,1166,748.0,0,60,0,0,501837
440,15431,2402,0,3094,672,200.0,0,72,0,170,555000
691,21535,2444,1872,2444,832,1170.0,382,50,0,0,755000
769,53504,1690,1589,1650,841,603.0,503,36,0,210,538000
803,13891,1734,1088,1734,1020,424.0,52,170,0,192,582933
898,12919,2364,0,2330,820,760.0,0,67,0,0,611657
1046,16056,1992,876,1992,716,208.0,214,108,0,0,556581
1169,35760,1831,1796,1930,807,1378.0,361,76,0,0,625000
1182,15623,2411,2065,2396,813,0.0,171,78,0,0,745000


In [193]:
train.SalePrice.max()

755000

### Data Preprocessing

In [194]:
train.iloc[:, :9].head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl


In [195]:
# MSSubClass
enc = OneHotEncoder(sparse=False)
features = enc.fit_transform(train.MSSubClass.values.reshape(-1, 1))
df_train = pd.get_dummies(train.MSSubClass, prefix="MSSubClass")
cols_count = cols_count + 1

# MSZoning
#train.MSZoning.fillna("RM", inplace=True)
train.MSZoning.fillna("Unknown", inplace=True)
features = np.concatenate( [features, pd.get_dummies(train.MSZoning).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.MSZoning, prefix="MSZoning")], axis=1)
cols_count = cols_count + 1

# LotFrontage
train.LotFrontage.fillna(0.0, inplace=True)
features = np.concatenate( [features, train.LotFrontage.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.LotFrontage], axis=1)
cols_count = cols_count + 1

# LotArea
features = np.concatenate( [features, train.LotArea.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.LotArea], axis=1)
cols_count = cols_count + 1

# Street
features = np.concatenate( [features, pd.get_dummies(train.Street).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.Street, prefix="Street")], axis=1)
cols_count = cols_count + 1

# Alley
train.Alley.fillna("NA", inplace=True)
features = np.concatenate( [features, pd.get_dummies(train.Alley).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.Alley, prefix="Alley")], axis=1)
cols_count = cols_count + 1

# LotShape
features = np.concatenate( [features, pd.get_dummies(train.LotShape).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.LotShape, prefix="LotShape")], axis=1)
cols_count = cols_count + 1

# LandContour
features = np.concatenate( [features, pd.get_dummies(train.LandContour).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.LandContour, prefix="LandContour")], axis=1)
cols_count = cols_count + 1

In [196]:
train.iloc[:, 9:19].head()

Unnamed: 0,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond
0,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5
1,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8
2,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5
3,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5
4,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5


In [197]:
# Utilities
#train.Utilities.fillna("AllPub", inplace=True)
train.Utilities.fillna("Unknown", inplace=True)
features = np.concatenate( [features, pd.get_dummies(train.Utilities).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.Utilities, prefix="Utilities")], axis=1)
cols_count = cols_count + 1

# LotConfig
features = np.concatenate( [features, pd.get_dummies(train.LotConfig).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.LotConfig, prefix="LotConfig")], axis=1)
cols_count = cols_count + 1

# LandSlope
features = np.concatenate( [features, pd.get_dummies(train.LandSlope).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.LandSlope, prefix="LandSlope")], axis=1)
cols_count = cols_count + 1

# Neighborhood 
features = np.concatenate( [features, pd.get_dummies(train.Neighborhood).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.Neighborhood, prefix="Neighborhood")], axis=1)
cols_count = cols_count + 1

# Condition1
features = np.concatenate( [features, pd.get_dummies(train.Condition1).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.Condition1, prefix="Condition1")], axis=1)
cols_count = cols_count + 1

# Condition2
features = np.concatenate( [features, pd.get_dummies(train.Condition2).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.Condition2, prefix="Condition2")], axis=1)
cols_count = cols_count + 1

# BldgType
features = np.concatenate( [features, pd.get_dummies(train.BldgType).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.BldgType, prefix="BldgType")], axis=1)
cols_count = cols_count + 1

# HouseStyle
features = np.concatenate( [features, pd.get_dummies(train.HouseStyle).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.HouseStyle, prefix="HouseStyle")], axis=1)
cols_count = cols_count + 1

# OverallQual
#enc = OneHotEncoder(sparse=False)
#features = np.concatenate( [features, enc.fit_transform(train.OverallQual.values.reshape(-1, 1))], axis=1 )
features = np.concatenate( [features, train.OverallQual.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.OverallQual], axis=1)
cols_count = cols_count + 1

# OverallCond
#enc = OneHotEncoder(sparse=False)
#features = np.concatenate( [features, enc.fit_transform(train.OverallCond.values.reshape(-1, 1))], axis=1 )
features = np.concatenate( [features, train.OverallCond.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.OverallCond], axis=1)
cols_count = cols_count + 1

In [198]:
train.iloc[:, 19:29].head()

Unnamed: 0,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond
0,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA
1,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA
2,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA
3,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA
4,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA


In [199]:
# YearBuilt
#if (use_fe_2):
#    features = np.concatenate( [features, pd.get_dummies(train.YearBuilt).as_matrix()], axis=1 )
#else:
#    features = np.concatenate( [features, train.YearBuilt.as_matrix().reshape(-1, 1)], axis=1 )
train['HouseAge'] = train.YrSold - train.YearBuilt
features = np.concatenate( [features, train.HouseAge.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.HouseAge], axis=1)
cols_count = cols_count + 1

# YearRemodAdd
#if (False):
#    features = np.concatenate( [features, pd.get_dummies(train.YearRemodAdd).as_matrix()], axis=1 )
#else:
#    features = np.concatenate( [features, train.YearRemodAdd.as_matrix().reshape(-1, 1)], axis=1 )
train['AgeSinceRemod'] = train.YrSold - train.YearRemodAdd
features = np.concatenate( [features, train.AgeSinceRemod.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.AgeSinceRemod], axis=1)
cols_count = cols_count + 1

# RoofStyle
features = np.concatenate( [features, pd.get_dummies(train.RoofStyle).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.RoofStyle, prefix="RoofStyle")], axis=1)
cols_count = cols_count + 1

# RoofMatl
features = np.concatenate( [features, pd.get_dummies(train.RoofMatl).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.RoofMatl, prefix="RoofMatl")], axis=1)
cols_count = cols_count + 1

# Exterior1st
#train.Exterior1st.fillna("Wd Sdng", inplace=True)
train.Exterior1st.fillna("Unknown", inplace=True)
features = np.concatenate( [features, pd.get_dummies(train.Exterior1st).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.Exterior1st, prefix="Exterior1st")], axis=1)
cols_count = cols_count + 1

# Exterior2nd
#train.Exterior2nd.fillna("Wd Sdng", inplace=True)
train.Exterior2nd.fillna("Unknown", inplace=True)
features = np.concatenate( [features, pd.get_dummies(train.Exterior2nd).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.Exterior2nd, prefix="Exterior2nd")], axis=1)
cols_count = cols_count + 1

# MasVnrType
#train.MasVnrType.fillna("None", inplace=True)
train.MasVnrType.fillna("Unknown", inplace=True)
features = np.concatenate( [features, pd.get_dummies(train.MasVnrType).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.MasVnrType, prefix="MasVnrType")], axis=1)
cols_count = cols_count + 1

# MasVnrArea
train.MasVnrArea.fillna(0.0, inplace=True)
features = np.concatenate( [features, train.MasVnrArea.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.MasVnrArea], axis=1)
cols_count = cols_count + 1

# ExterQual
di = {"Ex": 5.0, "Gd": 4.0, "TA": 3.0, "Fa": 2.0, "Po": 1.0}
train["ExterQual"].replace(di, inplace=True)
#features = np.concatenate( [features, pd.get_dummies(train.ExterQual).as_matrix()], axis=1 )
features = np.concatenate( [features, train.ExterQual.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.ExterQual], axis=1)
cols_count = cols_count + 1

# ExterCond
di = {"Ex": 5.0, "Gd": 4.0, "TA": 3.0, "Fa": 2.0, "Po": 1.0}
train["ExterCond"].replace(di, inplace=True)
#features = np.concatenate( [features, pd.get_dummies(train.ExterCond).as_matrix()], axis=1 )
features = np.concatenate( [features, train.ExterCond.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.ExterCond], axis=1)
cols_count = cols_count + 1

In [200]:
train.iloc[:, 29:39].head()

Unnamed: 0,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF
0,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856
1,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262
2,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920
3,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756
4,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145


In [201]:
# Foundation
features = np.concatenate( [features, pd.get_dummies(train.Foundation).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.Foundation, prefix="Foundation")], axis=1)
cols_count = cols_count + 1

# BsmtQual
train.BsmtQual.fillna("NA", inplace=True)
di = {"Ex": 5.0, "Gd": 4.0, "TA": 3.0, "Fa": 2.0, "Po": 1.0, "NA": 0.0}
train["BsmtQual"].replace(di, inplace=True)
#features = np.concatenate( [features, pd.get_dummies(train.BsmtQual).as_matrix()], axis=1 )
features = np.concatenate( [features, train.BsmtQual.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.BsmtQual], axis=1)
cols_count = cols_count + 1

# BsmtCond
train.BsmtCond.fillna("NA", inplace=True)
di = {"Ex": 5.0, "Gd": 4.0, "TA": 3.0, "Fa": 2.0, "Po": 1.0, "NA": 0.0}
train["BsmtCond"].replace(di, inplace=True)
#features = np.concatenate( [features, pd.get_dummies(train.BsmtCond).as_matrix()], axis=1 )
features = np.concatenate( [features, train.BsmtCond.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.BsmtCond], axis=1)
cols_count = cols_count + 1

# BsmtExposure
train.BsmtExposure.fillna("NA", inplace=True)
features = np.concatenate( [features, pd.get_dummies(train.BsmtExposure).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.BsmtExposure, prefix="BsmtExposure")], axis=1)
cols_count = cols_count + 1

# BsmtFinType1
train.BsmtFinType1.fillna("NA", inplace=True)
di = {"GLQ": 6.0, "ALQ": 5.0, "BLQ": 4.0, "Rec": 3.0, "LwQ": 2.0, "Unf": 1.0, "NA": 0.0}
train["BsmtFinType1"].replace(di, inplace=True)
#features = np.concatenate( [features, pd.get_dummies(train.BsmtFinType1).as_matrix()], axis=1 )
features = np.concatenate( [features, train.BsmtFinType1.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.BsmtFinType1], axis=1)
cols_count = cols_count + 1

# BsmtFinSF1
train.BsmtFinSF1.fillna(0.0, inplace=True)
features = np.concatenate( [features, train.BsmtFinSF1.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.BsmtFinSF1], axis=1)
cols_count = cols_count + 1

# BsmtFinType2
train.BsmtFinType2.fillna("NA", inplace=True)
di = {"GLQ": 6.0, "ALQ": 5.0, "BLQ": 4.0, "Rec": 3.0, "LwQ": 2.0, "Unf": 1.0, "NA": 0.0}
train["BsmtFinType2"].replace(di, inplace=True)
#features = np.concatenate( [features, pd.get_dummies(train.BsmtFinType2).as_matrix()], axis=1 )
features = np.concatenate( [features, train.BsmtFinType2.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.BsmtFinType2], axis=1)
cols_count = cols_count + 1

# BsmtFinSF2
train.BsmtFinSF2.fillna(0.0, inplace=True)
features = np.concatenate( [features, train.BsmtFinSF2.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.BsmtFinSF2], axis=1)
cols_count = cols_count + 1

# BsmtUnfSF
train.BsmtUnfSF.fillna(0.0, inplace=True)
features = np.concatenate( [features, train.BsmtUnfSF.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.BsmtUnfSF], axis=1)
cols_count = cols_count + 1

# TotalBsmtSF
train.TotalBsmtSF.fillna(0.0, inplace=True)
features = np.concatenate( [features, train.TotalBsmtSF.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.TotalBsmtSF], axis=1)
cols_count = cols_count + 1

In [202]:
train.iloc[:, 39:49].head()

Unnamed: 0,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath
0,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0
1,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1
2,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0
3,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0
4,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0


In [203]:
# Heating
features = np.concatenate( [features, pd.get_dummies(train.Heating).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.Heating, prefix="Heating")], axis=1)
cols_count = cols_count + 1

# HeatingQC
#features = np.concatenate( [features, pd.get_dummies(train.HeatingQC).as_matrix()], axis=1 )
di = {"Ex": 5.0, "Gd": 4.0, "TA": 3.0, "Fa": 2.0, "Po": 1.0}
train["HeatingQC"].replace(di, inplace=True)
features = np.concatenate( [features, train.HeatingQC.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.HeatingQC], axis=1)
cols_count = cols_count + 1

# CentralAir
features = np.concatenate( [features, pd.get_dummies(train.CentralAir).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.CentralAir, prefix="CentralAir")], axis=1)
cols_count = cols_count + 1

# Electrical
train.Electrical.fillna("Unknown", inplace=True)
features = np.concatenate( [features, pd.get_dummies(train.Electrical).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.Electrical, prefix="Electrical")], axis=1)
cols_count = cols_count + 1

# 1stFlrSF
features = np.concatenate( [features, train['1stFlrSF'].as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train['1stFlrSF']], axis=1)
cols_count = cols_count + 1

# 2ndFlrSF
features = np.concatenate( [features, train['2ndFlrSF'].as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train['2ndFlrSF']], axis=1)
cols_count = cols_count + 1

# LowQualFinSF
features = np.concatenate( [features, train.LowQualFinSF.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.LowQualFinSF], axis=1)
cols_count = cols_count + 1

# GrLivArea
features = np.concatenate( [features, train.GrLivArea.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.GrLivArea], axis=1)
cols_count = cols_count + 1

# BsmtFullBath
train.BsmtFullBath.fillna(0.0, inplace=True)
features = np.concatenate( [features, train.BsmtFullBath.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.BsmtFullBath], axis=1)
cols_count = cols_count + 1

# BsmtHalfBath
train.BsmtHalfBath.fillna(0.0, inplace=True)
features = np.concatenate( [features, train.BsmtHalfBath.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.BsmtHalfBath], axis=1)
cols_count = cols_count + 1

In [204]:
train.iloc[:, 49:59].head()

Unnamed: 0,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType
0,2,1,3,1,Gd,8,Typ,0,,Attchd
1,2,0,3,1,TA,6,Typ,1,TA,Attchd
2,2,1,3,1,Gd,6,Typ,1,TA,Attchd
3,1,0,3,1,Gd,7,Typ,1,Gd,Detchd
4,2,1,4,1,Gd,9,Typ,1,TA,Attchd


In [205]:
# FullBath
features = np.concatenate( [features, train.FullBath.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.FullBath], axis=1)
cols_count = cols_count + 1

# HalfBath
features = np.concatenate( [features, train.HalfBath.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.HalfBath], axis=1)
cols_count = cols_count + 1

# BedroomAbvGr
features = np.concatenate( [features, train.BedroomAbvGr.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.BedroomAbvGr], axis=1)
cols_count = cols_count + 1

# KitchenAbvGr
features = np.concatenate( [features, train.KitchenAbvGr.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.KitchenAbvGr], axis=1)
cols_count = cols_count + 1

# KitchenQual
#train.KitchenQual.fillna("TA", inplace=True)
train.KitchenQual.fillna("Unknown", inplace=True)
#features = np.concatenate( [features, pd.get_dummies(train.KitchenQual).as_matrix()], axis=1 )
di = {"Ex": 5.0, "Gd": 4.0, "TA": 3.0, "Fa": 2.0, "Po": 1.0, "Unknown": 0.0}
train["KitchenQual"].replace(di, inplace=True)
features = np.concatenate( [features, train.KitchenQual.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.KitchenQual], axis=1)
cols_count = cols_count + 1

# TotRmsAbvGrd
features = np.concatenate( [features, train.TotRmsAbvGrd.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.TotRmsAbvGrd], axis=1)
cols_count = cols_count + 1

# Functional
#train.Functional.fillna("Typ", inplace=True)
train.Functional.fillna("Unknown", inplace=True)
features = np.concatenate( [features, pd.get_dummies(train.Functional).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.Functional, prefix="Functional")], axis=1)
cols_count = cols_count + 1

# Fireplaces
features = np.concatenate( [features, train.Fireplaces.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.Fireplaces], axis=1)
cols_count = cols_count + 1

# FireplaceQu
train.FireplaceQu.fillna("NA", inplace=True)
features = np.concatenate( [features, pd.get_dummies(train.FireplaceQu).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.FireplaceQu, prefix="Functional")], axis=1)
cols_count = cols_count + 1

# GarageType
train.GarageType.fillna("NA", inplace=True)
features = np.concatenate( [features, pd.get_dummies(train.GarageType).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.GarageType, prefix="Functional")], axis=1)
cols_count = cols_count + 1

In [206]:
train.iloc[:, 59:69].head()

Unnamed: 0,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch
0,2003.0,RFn,2,548,TA,TA,Y,0,61,0
1,1976.0,RFn,2,460,TA,TA,Y,298,0,0
2,2001.0,RFn,2,608,TA,TA,Y,0,42,0
3,1998.0,Unf,3,642,TA,TA,Y,0,35,272
4,2000.0,RFn,3,836,TA,TA,Y,192,84,0


In [207]:
# GarageYrBlt
train.GarageYrBlt.fillna(train.YearBuilt, inplace=True)

#if (False):
#    features = np.concatenate( [features, pd.get_dummies(train.GarageYrBlt).as_matrix()], axis=1 )
#else:
#    features = np.concatenate( [features, train.GarageYrBlt.as_matrix().reshape(-1, 1)], axis=1 )

train['GarageAge'] = train.YrSold - train.GarageYrBlt
features = np.concatenate( [features, train.GarageAge.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.GarageAge], axis=1)
cols_count = cols_count + 1

# GarageFinish
train.GarageFinish.fillna("NA", inplace=True)
features = np.concatenate( [features, pd.get_dummies(train.GarageFinish).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.GarageFinish, prefix="GarageFinish")], axis=1)
cols_count = cols_count + 1

# GarageCars
train.GarageCars.fillna(0.0, inplace=True)
features = np.concatenate( [features, train.GarageCars.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.GarageCars], axis=1)
cols_count = cols_count + 1

# GarageArea
train.GarageArea.fillna(0.0, inplace=True)
features = np.concatenate( [features, train.GarageArea.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.GarageArea], axis=1)
cols_count = cols_count + 1

# GarageQual
train.GarageQual.fillna("NA", inplace=True)
#features = np.concatenate( [features, pd.get_dummies(train.GarageQual).as_matrix()], axis=1 )
di = {"Ex": 5.0, "Gd": 4.0, "TA": 3.0, "Fa": 2.0, "Po": 1.0, "NA": 0.0}
train["GarageQual"].replace(di, inplace=True)
features = np.concatenate( [features, train.GarageQual.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.GarageQual], axis=1)
cols_count = cols_count + 1

# GarageCond
train.GarageCond.fillna("NA", inplace=True)
#features = np.concatenate( [features, pd.get_dummies(train.GarageCond).as_matrix()], axis=1 )
di = {"Ex": 5.0, "Gd": 4.0, "TA": 3.0, "Fa": 2.0, "Po": 1.0, "NA": 0.0}
train["GarageCond"].replace(di, inplace=True)
features = np.concatenate( [features, train.GarageCond.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.GarageCond], axis=1)
cols_count = cols_count + 1

# PavedDrive
features = np.concatenate( [features, pd.get_dummies(train.PavedDrive).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.PavedDrive, prefix="PavedDrive")], axis=1)
cols_count = cols_count + 1

# WoodDeckSF
features = np.concatenate( [features, train.WoodDeckSF.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.WoodDeckSF], axis=1)
cols_count = cols_count + 1

# OpenPorchSF
features = np.concatenate( [features, train.OpenPorchSF.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.OpenPorchSF], axis=1)
cols_count = cols_count + 1

# EnclosedPorch
features = np.concatenate( [features, train.EnclosedPorch.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.EnclosedPorch], axis=1)
cols_count = cols_count + 1

In [208]:
train.iloc[:, 69:].head()

Unnamed: 0,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,HouseAge,AgeSinceRemod,GarageAge
0,0,0,0,,,,0,2,2008,WD,Normal,208500,5,5,5.0
1,0,0,0,,,,0,5,2007,WD,Normal,181500,31,31,31.0
2,0,0,0,,,,0,9,2008,WD,Normal,223500,7,6,7.0
3,0,0,0,,,,0,2,2006,WD,Abnorml,140000,91,36,8.0
4,0,0,0,,,,0,12,2008,WD,Normal,250000,8,8,8.0


In [209]:
# 3SsnPorch
features = np.concatenate( [features, train['3SsnPorch'].as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train['3SsnPorch']], axis=1)
cols_count = cols_count + 1

# ScreenPorch
features = np.concatenate( [features, train.ScreenPorch.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.ScreenPorch], axis=1)
cols_count = cols_count + 1

# PoolArea
features = np.concatenate( [features, train.PoolArea.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.PoolArea], axis=1)
cols_count = cols_count + 1

# PoolQC
train.PoolQC.fillna("NA", inplace=True)
#features = np.concatenate( [features, pd.get_dummies(train.PoolQC).as_matrix()], axis=1 )
di = {"Ex": 5.0, "Gd": 4.0, "TA": 3.0, "Fa": 2.0, "Po": 1.0, "NA": 0.0}
train["PoolQC"].replace(di, inplace=True)
features = np.concatenate( [features, train.PoolQC.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.PoolQC], axis=1)
cols_count = cols_count + 1

# Fence
train.Fence.fillna("NA", inplace=True)
features = np.concatenate( [features, pd.get_dummies(train.Fence).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.Fence, prefix="Fence")], axis=1)
cols_count = cols_count + 1

# MiscFeature
train.MiscFeature.fillna("NA", inplace=True)
features = np.concatenate( [features, pd.get_dummies(train.MiscFeature).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.MiscFeature, prefix="MiscFeature")], axis=1)
cols_count = cols_count + 1

# MiscVal
features = np.concatenate( [features, train.MiscVal.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.MiscVal], axis=1)
cols_count = cols_count + 1

# MoSold
if (use_fe_2):
    features = np.concatenate( [features, pd.get_dummies(train.MoSold).as_matrix()], axis=1 )
    df_train = pd.concat([df_train, pd.get_dummies(train.MoSold, prefix="MoSold")], axis=1)
else:
    features = np.concatenate( [features, train.MoSold.as_matrix().reshape(-1, 1)], axis=1 )
    df_train = pd.concat([df_train, train.MoSold], axis=1)
    
cols_count = cols_count + 1

# YrSold
if (use_fe_2):
    features = np.concatenate( [features, pd.get_dummies(train.YrSold).as_matrix()], axis=1 )
    df_train = pd.concat([df_train, pd.get_dummies(train.YrSold, prefix="YrSold")], axis=1)
else:
    features = np.concatenate( [features, train.YrSold.as_matrix().reshape(-1, 1)], axis=1 )
    df_train = pd.concat([df_train, train.YrSold], axis=1)
    
cols_count = cols_count + 1

# SaleType
#train.SaleType.fillna("WD", inplace=True)
train.SaleType.fillna("Unknown", inplace=True)
features = np.concatenate( [features, pd.get_dummies(train.SaleType).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.SaleType, prefix="SaleType")], axis=1)
cols_count = cols_count + 1

# SaleCondition
features = np.concatenate( [features, pd.get_dummies(train.SaleCondition).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.SaleCondition, prefix="SaleCondition")], axis=1)
cols_count = cols_count + 1        

In [210]:
# LivingAreaSF
if (use_fe_3):
    train['LivingAreaSF'] = train['1stFlrSF'] + train['2ndFlrSF'] + train['TotalBsmtSF'] + \
                            train['GarageArea'] + train['MasVnrArea'] + train['WoodDeckSF'] + \
                            train['OpenPorchSF'] + train['3SsnPorch'] + train['ScreenPorch']

    print(pearsonr(train['LivingAreaSF'].values.reshape(-1, 1), train.SalePrice.values.reshape(-1, 1))[0][0])
    
    features = np.concatenate( [features, train.LivingAreaSF.as_matrix().reshape(-1, 1)], axis=1 )
    df_train = pd.concat([df_train, train.LivingAreaSF], axis=1)                      
    cols_count = cols_count + 1
    
    train['LandRatio'] = train['LivingAreaSF'] / train['LotArea']
    features = np.concatenate( [features, train.LandRatio.as_matrix().reshape(-1, 1)], axis=1 )
    df_train = pd.concat([df_train, train.LandRatio], axis=1)   
    cols_count = cols_count + 1

0.823860406285


### Scaling features and transforming labels

In [211]:
labels_orig = train.SalePrice

# Transform labels
mn = train.SalePrice.min()
mx = train.SalePrice.max()

labels = (train.SalePrice - mn) // 10000

# Scale features
count = 0
    
for jj in range(features.shape[1]):
    if((features[:, jj] > 25.).sum() > 0):
        mx = float(features[:, jj].max())

        features[:, jj] = features[:, jj] / mx
        count = count + 1
        
print("Total", count, "features scaled")

Total 24 features scaled


### Подготовка наборов для обучения и тестирования

In [212]:
# Split for train and test sets
features_train, features_test, labels_train, labels_test, labels_orig_train, labels_orig_test = \
    train_test_split(features, labels, labels_orig, test_size=0.3, random_state=23)

print("Total features processed:", cols_count)
print("")
print(features_train.shape)
print(labels_train.shape)

Total features processed: 81

(1022, 273)
(1022,)


### Multi-Layer Perceptron

In [1200]:
%%time
def choose_MLP(X, y):
    MLP = MLPClassifier(random_state=23)

    parameters_grid = {
        "hidden_layer_sizes": [(150, 100, 50), (150, 30, 20), (150, 50, 50), (150, 50, 10), (150, 75, 25)],
        #"n_estimators": [50, 100, 150, 200, 250, 300, 350],
        #"max_depth": [2, 3, 4, 5, 6],
        #"learning_rate": [0.05, 0.1, 0.5, 1.0]
        #"min_samples_split": [2, 3, 4, 5, 6],
        #"min_samples_leaf": [1, 2, 3, 4]
    }

    gcv = GridSearchCV(MLP, parameters_grid, scoring='neg_mean_absolute_error')

    gcv.fit(X, y.reshape(-1))

    print(gcv.best_score_)
    print(gcv.best_params_)
    
    return gcv.best_estimator_


clf = choose_MLP(features_train, labels_train)

clf.fit(features_train, labels_train.reshape(-1))
prediction = clf.predict(features_test)

print(mean_absolute_error(labels_test, prediction))

-0.0378903574145
{'hidden_layer_sizes': (150, 50, 10)}
0.0303985022085
Wall time: 8.78 s


In [186]:
%%time
clf = MLPClassifier(hidden_layer_sizes=(150, 150, 150, 150, 100), solver="adam", activation="relu", max_iter=500, random_state=23)
clf.fit(features_train, labels_train)

prediction = clf.predict(features_train)
print(accuracy_score(labels_train, prediction))

prediction = clf.predict(features_test)
print(accuracy_score(labels_test, prediction))

0.503913894325
0.168949771689
Wall time: 2.82 s


In [185]:
print(mean_absolute_error( labels_orig_test, (prediction * 10000) + mn ))

20752.0022831


### Light GBM

In [1688]:
%%time
def choose_LGB(X, y):
    LGB = lgb.LGBMRegressor(random_state=23, n_jobs=-1)

    parameters_grid = {
        #"boosting_type": ["gbdt", "dart", "goss", "rf"],
        "n_estimators": [50, 100, 150, 200, 250, 300, 350],
        "max_depth": [2, 3, 4, 5, 6],
        "learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1],
    }

    gcv = GridSearchCV(LGB, parameters_grid, scoring='neg_mean_absolute_error')

    gcv.fit(X, y.reshape(-1))

    print(gcv.best_score_)
    print(gcv.best_params_)
    
    return gcv.best_estimator_    


clf = choose_LGB(features_train, labels_train)

clf.fit(features_train, labels_train.reshape(-1))
prediction = clf.predict(features_test)

print(mean_absolute_error(labels_test, prediction))

results['LGB'] = prediction.reshape(-1)

-0.0218135205778
{'learning_rate': 0.075, 'max_depth': 4, 'n_estimators': 200}
0.0205182885608
Wall time: 1min 39s


In [1250]:
#
# Make several classifiers for Averaging
#
col = 1

for p1 in [200, 250, 300]:
    for p2 in [2, 3, 4, 5, 6]:
        for p3 in [0.01, 0.025, 0.05, 0.075, 0.1]:
            clf = lgb.LGBMRegressor(n_estimators=p1, max_depth=p2, learning_rate=p3, random_state=23, n_jobs=-1)
            clf.fit(features_train, labels_train.reshape(-1))
            prediction = clf.predict(features_test)
            results['LGB' + str(col)] = prediction.reshape(-1)
            col = col + 1

(438, 77)

### Gradient Boosting

In [1611]:
def choose_GBM(X, y):
    GBR = GradientBoostingRegressor(random_state=23)

    parameters_grid = {
        "n_estimators": [250, 300, 350],
        "min_samples_split": [2, 3, 4, 5, 6],
        "min_samples_leaf": [1, 2, 3, 4]
    }

    gcv = GridSearchCV(GBR, parameters_grid, scoring='neg_mean_absolute_error')

    gcv.fit(X, y.reshape(-1))

    print(gcv.best_score_)
    print(gcv.best_params_)
    
    return gcv.best_estimator_

In [1612]:
%%time

clf = choose_GBM(features_train, labels_train)

clf.fit(features_train, labels_train.reshape(-1))
prediction = clf.predict(features_test)

print(mean_absolute_error(labels_test, prediction))

results['GBM'] = prediction.reshape(-1)

-0.0222876966681
{'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 250}
0.0210613365987
Wall time: 3min 22s


In [213]:
clf = GradientBoostingClassifier(n_estimators=300, min_samples_split=5, min_samples_leaf=2, random_state=23)
clf.fit(features_train, labels_train)

prediction = clf.predict(features_train)
print(accuracy_score(labels_train, prediction))

prediction = clf.predict(features_test)
print(accuracy_score(labels_test, prediction))

1.0
0.180365296804


In [214]:
print(mean_absolute_error( labels_orig_test, (prediction * 10000) + mn ))

27675.9520548


In [1253]:
#
# Make several classifiers for Averaging
#
col = 1

for p1 in [250, 300, 350]:
    for p2 in [2, 3, 4, 5, 6]:
        for p3 in [1, 2, 3, 4]:
            clf = GradientBoostingRegressor(n_estimators=p1, min_samples_split=p2, min_samples_leaf=p3, random_state=23)
            clf.fit(features_train, labels_train.reshape(-1))
            prediction = clf.predict(features_test)
            results['GBM' + str(col)] = prediction.reshape(-1)
            col = col + 1

results.shape

(438, 137)

In [1606]:
for col, val in zip(df_train.columns, clf.feature_importances_):
    print(col, val)

MSSubClass_20 0.0
MSSubClass_30 0.000575188414835
MSSubClass_40 0.0
MSSubClass_45 0.0
MSSubClass_50 0.00215518175752
MSSubClass_60 0.000869593148986
MSSubClass_70 0.00540094227902
MSSubClass_75 0.000313234828535
MSSubClass_80 0.0
MSSubClass_85 0.0
MSSubClass_90 0.00075292493171
MSSubClass_120 0.0
MSSubClass_160 0.0
MSSubClass_180 0.0
MSSubClass_190 0.0
MSZoning_C (all) 0.00663362189177
MSZoning_FV 0.00401219174223
MSZoning_RH 0.0
MSZoning_RL 0.000571831522152
MSZoning_RM 0.00014579852328
LotFrontage 0.0183977059698
LotArea 0.0496223467631
Street_Grvl 0.0
Street_Pave 0.0
Alley_Grvl 0.0
Alley_NA 0.0
Alley_Pave 0.000885666040154
LotShape_IR1 0.0028146738555
LotShape_IR2 0.00288619978484
LotShape_IR3 0.0
LotShape_Reg 0.00143887170057
LandContour_Bnk 0.00286821412508
LandContour_HLS 0.00119150915355
LandContour_Low 0.000669630736189
LandContour_Lvl 0.000987984064423
Utilities_AllPub 0.0
Utilities_NoSeWa 0.0
LotConfig_Corner 0.0
LotConfig_CulDSac 0.00312707747217
LotConfig_FR2 0.003281735321

In [1287]:
%%time
res = []

tmp_f = np.array(features)
tmp_l = np.array(labels)

for i in range(75):
    clf = GradientBoostingRegressor(n_estimators=300, min_samples_split=5, min_samples_leaf=2, random_state=23)
    clf.fit(tmp_f, tmp_l.reshape(-1))
    prediction = clf.predict(tmp_f).reshape(-1, 1)

    res.append(mean_absolute_error(tmp_l, prediction))
    
    if (i == 46):
        pass

    tmp1 = np.abs(tmp_l - prediction)
    d = tmp1.argmax()

    tmp_f = np.delete(tmp_f, d, 0)
    tmp_l = np.delete(tmp_l, d, 0)
    
print(min(res), argmin(res))

0.00821708373924 72
Wall time: 6min 57s


In [902]:
f_train, f_test, l_train, l_test = train_test_split(tmp_f, tmp_l, test_size=0.3, random_state=23)

clf.fit(f_train, l_train.reshape(-1))
prediction = clf.predict(f_test)

print(mean_absolute_error(l_test, prediction))

0.0174893337057


### Random Forest

In [1614]:
%%time
RFR = RandomForestRegressor(n_jobs=-1, random_state=23)

parameters_grid = {
    "n_estimators": [200, 250, 300, 350, 400],
    "min_samples_split": [2, 3, 4, 5, 6],
    "min_samples_leaf": [1, 2, 3, 4]
}

gcv = GridSearchCV(RFR, parameters_grid, scoring='neg_mean_absolute_error')

gcv.fit(features, labels.reshape(-1))

print(gcv.best_score_)
print(gcv.best_params_)

-0.023588186645
{'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 400}
Wall time: 11min 21s


In [1615]:
clf = gcv.best_estimator_
#clf = RandomForestRegressor(n_estimators=350, min_samples_leaf=2, min_samples_split=2, n_jobs=-1, random_state=23)
clf.fit(features_train, labels_train.reshape(-1))
prediction = clf.predict(features_test)

print(mean_absolute_error(labels_test, prediction))

#results['RF'] = prediction.reshape(-1)

0.0234985014601


In [1254]:
#
# Make several classifiers for Averaging
#
col = 1

for p1 in [300, 350, 400]:
    for p2 in [2, 3, 4, 5, 6]:
        for p3 in [1, 2, 3, 4]:
            clf = RandomForestRegressor(n_estimators=p1, min_samples_split=p2, min_samples_leaf=p3, random_state=23, n_jobs=-1)
            clf.fit(features_train, labels_train.reshape(-1))
            prediction = clf.predict(features_test)
            results['RF' + str(col)] = prediction.reshape(-1)
            col = col + 1
            
results.shape

(438, 197)

### Kernel Ridge

In [821]:
def choose_KernelRidge(X, y):
    clf = KernelRidge()

    parameters_grid = {
        "kernel": ['polynomial', 'rbf'], 
        "alpha": [1e0, 0.1, 1e-2, 1e-3],
        "gamma": np.logspace(-3, 3, 10)    
    }

    gcv = GridSearchCV(clf, parameters_grid, scoring='neg_mean_absolute_error')

    gcv.fit(X, y.reshape(-1))

    print(gcv.best_score_)
    print(gcv.best_params_)
    
    return gcv.best_estimator_

In [1609]:
clf = choose_KernelRidge(features_train, labels_train)

clf.fit(features_train, labels_train.reshape(-1))
prediction = clf.predict(features_test)

print(mean_absolute_error(labels_test, prediction))

results['K_Ridge'] = prediction.reshape(-1)

-0.0252060026344
{'alpha': 0.1, 'gamma': 0.0046415888336127772, 'kernel': 'polynomial'}
0.0233070963741


In [1255]:
#
# Make several classifiers for Averaging
#
col = 1

for p1 in ['polynomial', 'rbf']:
    for p2 in [1e0, 0.1, 1e-2, 1e-3]:
        for p3 in np.logspace(-3, 3, 10):
            clf = KernelRidge(kernel=p1, alpha=p2, gamma=p3)
            clf.fit(features_train, labels_train.reshape(-1))
            prediction = clf.predict(features_test)
            results['KRG' + str(col)] = prediction.reshape(-1)
            col = col + 1
            
results.shape

(438, 277)

### XGBoost

In [525]:
def choose_XGB(X, y):
    XGB = xgboost.XGBRegressor()

    parameters_grid = {
        #"n_estimators": [50, 100, 150, 200, 250, 300],
        "n_estimators": [250, 300, 350],
        #"max_depth": [2, 3, 4, 5, 6, 7, 8],
        "max_depth": [3, 4, 5],
        #"learning_rate": [0.1, 0.05, 0.01]
        #"learning_rate": [0.1],
    }

    gcv = GridSearchCV(XGB, parameters_grid, scoring='neg_mean_absolute_error')

    gcv.fit(X, y.reshape(-1))

    print(gcv.best_score_)
    print(gcv.best_params_)
    
    return gcv.best_estimator_

In [1610]:
%%time

clf = choose_XGB(features_train, labels_train)

clf.fit(features_train, labels_train.reshape(-1))
prediction = clf.predict(features_test)

print(mean_absolute_error(labels_test, prediction))

#results['XGB'] = prediction.reshape(-1)

-0.0227337351622
{'max_depth': 4, 'n_estimators': 300}
0.0204025974813
Wall time: 53.1 s


In [1690]:
XGB = xgboost.XGBRegressor(max_depth=3, n_estimators=350)
XGB.fit(features_train, labels_train.reshape(-1))

prediction = XGB.predict(features_test)

print(mean_absolute_error(labels_test, prediction))

#results['XGB'] = prediction.reshape(-1)

0.0200851372074


In [1257]:
#
# Make several classifiers for Averaging
#
col = 1

for p1 in [250, 300, 350]:
    for p2 in [3, 4, 5]:
        clf = xgboost.XGBRegressor(n_estimators=p1, max_depth=p2)
        clf.fit(features_train, labels_train.reshape(-1))
        prediction = clf.predict(features_test)
        results['XGB' + str(col)] = prediction.reshape(-1)
        col = col + 1
        
results.shape

(438, 286)