In [108]:
import pandas as pd
import numpy as np
import math as mt

from sklearn.metrics import accuracy_score, r2_score, mean_squared_error, mean_absolute_error, precision_score, recall_score
from sklearn.metrics import roc_curve, roc_auc_score

from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.cluster import KMeans
from sklearn.linear_model import Lasso, Ridge, LogisticRegression, LinearRegression, SGDRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR

from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

from sklearn.neural_network import MLPRegressor

import xgboost
import lightgbm as lgb

from scipy.stats import pearsonr

from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder, StandardScaler, PolynomialFeatures

%pylab inline

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [109]:
# Load House train data
train = pd.read_csv('train.csv')

# Put the labels aside
labels_orig = train.SalePrice.as_matrix().reshape(-1, 1)

# Which columns have NaN values?
count = 0

#for col in train.columns:
#    if(train[col].isnull().sum() > 0):
#        print(col)
#        count = count + 1
        
#print("Total", count, "columns with NaNs")

# Define variables
cols = []
cols_count = 0

# Handling years as categorial 
use_fe_2 = False

# Using kind of a total sum of square feet
use_fe_3 = True

# Using new feature of Clustering
use_fe_4 = False

### Data Preprocessing

In [110]:
train.iloc[:, :9].head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl


In [111]:
# MSSubClass
enc = OneHotEncoder(sparse=False)
features = enc.fit_transform(train.MSSubClass.values.reshape(-1, 1))
df_train = pd.get_dummies(train.MSSubClass, prefix="MSSubClass")
cols_count = cols_count + 1

# MSZoning
#train.MSZoning.fillna("RM", inplace=True)
train.MSZoning.fillna("Unknown", inplace=True)
features = np.concatenate( [features, pd.get_dummies(train.MSZoning).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.MSZoning, prefix="MSZoning")], axis=1)
cols_count = cols_count + 1

# LotFrontage
train.LotFrontage.fillna(0.0, inplace=True)
features = np.concatenate( [features, train.LotFrontage.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.LotFrontage], axis=1)
cols_count = cols_count + 1

# LotArea
features = np.concatenate( [features, train.LotArea.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.LotArea], axis=1)
cols_count = cols_count + 1

# Street
features = np.concatenate( [features, pd.get_dummies(train.Street).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.Street, prefix="Street")], axis=1)
cols_count = cols_count + 1

# Alley
train.Alley.fillna("NA", inplace=True)
features = np.concatenate( [features, pd.get_dummies(train.Alley).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.Alley, prefix="Alley")], axis=1)
cols_count = cols_count + 1

# LotShape
features = np.concatenate( [features, pd.get_dummies(train.LotShape).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.LotShape, prefix="LotShape")], axis=1)
cols_count = cols_count + 1

# LandContour
features = np.concatenate( [features, pd.get_dummies(train.LandContour).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.LandContour, prefix="LandContour")], axis=1)
cols_count = cols_count + 1

In [112]:
train.iloc[:, 9:19].head()

Unnamed: 0,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond
0,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5
1,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8
2,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5
3,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5
4,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5


In [113]:
# Utilities
#train.Utilities.fillna("AllPub", inplace=True)
train.Utilities.fillna("Unknown", inplace=True)
features = np.concatenate( [features, pd.get_dummies(train.Utilities).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.Utilities, prefix="Utilities")], axis=1)
cols_count = cols_count + 1

# LotConfig
features = np.concatenate( [features, pd.get_dummies(train.LotConfig).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.LotConfig, prefix="LotConfig")], axis=1)
cols_count = cols_count + 1

# LandSlope
features = np.concatenate( [features, pd.get_dummies(train.LandSlope).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.LandSlope, prefix="LandSlope")], axis=1)
cols_count = cols_count + 1

# Neighborhood 
features = np.concatenate( [features, pd.get_dummies(train.Neighborhood).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.Neighborhood, prefix="Neighborhood")], axis=1)
cols_count = cols_count + 1

# Condition1
features = np.concatenate( [features, pd.get_dummies(train.Condition1).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.Condition1, prefix="Condition1")], axis=1)
cols_count = cols_count + 1

# Condition2
features = np.concatenate( [features, pd.get_dummies(train.Condition2).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.Condition2, prefix="Condition2")], axis=1)
cols_count = cols_count + 1

# BldgType
features = np.concatenate( [features, pd.get_dummies(train.BldgType).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.BldgType, prefix="BldgType")], axis=1)
cols_count = cols_count + 1

# HouseStyle
features = np.concatenate( [features, pd.get_dummies(train.HouseStyle).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.HouseStyle, prefix="HouseStyle")], axis=1)
cols_count = cols_count + 1

# OverallQual
#enc = OneHotEncoder(sparse=False)
#features = np.concatenate( [features, enc.fit_transform(train.OverallQual.values.reshape(-1, 1))], axis=1 )
features = np.concatenate( [features, train.OverallQual.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.OverallQual], axis=1)
cols_count = cols_count + 1

# OverallCond
#enc = OneHotEncoder(sparse=False)
#features = np.concatenate( [features, enc.fit_transform(train.OverallCond.values.reshape(-1, 1))], axis=1 )
features = np.concatenate( [features, train.OverallCond.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.OverallCond], axis=1)
cols_count = cols_count + 1

In [114]:
train.iloc[:, 19:29].head()

Unnamed: 0,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond
0,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA
1,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA
2,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA
3,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA
4,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA


In [115]:
# YearBuilt
#if (use_fe_2):
#    features = np.concatenate( [features, pd.get_dummies(train.YearBuilt).as_matrix()], axis=1 )
#else:
#    features = np.concatenate( [features, train.YearBuilt.as_matrix().reshape(-1, 1)], axis=1 )
train['HouseAge'] = train.YrSold - train.YearBuilt
features = np.concatenate( [features, train.HouseAge.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.HouseAge], axis=1)
cols_count = cols_count + 1

# YearRemodAdd
#if (False):
#    features = np.concatenate( [features, pd.get_dummies(train.YearRemodAdd).as_matrix()], axis=1 )
#else:
#    features = np.concatenate( [features, train.YearRemodAdd.as_matrix().reshape(-1, 1)], axis=1 )
train['AgeSinceRemod'] = train.YrSold - train.YearRemodAdd
features = np.concatenate( [features, train.AgeSinceRemod.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.AgeSinceRemod], axis=1)
cols_count = cols_count + 1

# RoofStyle
features = np.concatenate( [features, pd.get_dummies(train.RoofStyle).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.RoofStyle, prefix="RoofStyle")], axis=1)
cols_count = cols_count + 1

# RoofMatl
features = np.concatenate( [features, pd.get_dummies(train.RoofMatl).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.RoofMatl, prefix="RoofMatl")], axis=1)
cols_count = cols_count + 1

# Exterior1st
#train.Exterior1st.fillna("Wd Sdng", inplace=True)
train.Exterior1st.fillna("Unknown", inplace=True)
features = np.concatenate( [features, pd.get_dummies(train.Exterior1st).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.Exterior1st, prefix="Exterior1st")], axis=1)
cols_count = cols_count + 1

# Exterior2nd
#train.Exterior2nd.fillna("Wd Sdng", inplace=True)
train.Exterior2nd.fillna("Unknown", inplace=True)
features = np.concatenate( [features, pd.get_dummies(train.Exterior2nd).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.Exterior2nd, prefix="Exterior2nd")], axis=1)
cols_count = cols_count + 1

# MasVnrType
#train.MasVnrType.fillna("None", inplace=True)
train.MasVnrType.fillna("Unknown", inplace=True)
features = np.concatenate( [features, pd.get_dummies(train.MasVnrType).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.MasVnrType, prefix="MasVnrType")], axis=1)
cols_count = cols_count + 1

# MasVnrArea
train.MasVnrArea.fillna(0.0, inplace=True)
features = np.concatenate( [features, train.MasVnrArea.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.MasVnrArea], axis=1)
cols_count = cols_count + 1

# ExterQual
di = {"Ex": 5.0, "Gd": 4.0, "TA": 3.0, "Fa": 2.0, "Po": 1.0}
train["ExterQual"].replace(di, inplace=True)
#features = np.concatenate( [features, pd.get_dummies(train.ExterQual).as_matrix()], axis=1 )
features = np.concatenate( [features, train.ExterQual.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.ExterQual], axis=1)
cols_count = cols_count + 1

# ExterCond
di = {"Ex": 5.0, "Gd": 4.0, "TA": 3.0, "Fa": 2.0, "Po": 1.0}
train["ExterCond"].replace(di, inplace=True)
#features = np.concatenate( [features, pd.get_dummies(train.ExterCond).as_matrix()], axis=1 )
features = np.concatenate( [features, train.ExterCond.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.ExterCond], axis=1)
cols_count = cols_count + 1

In [116]:
train.iloc[:, 29:39].head()

Unnamed: 0,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF
0,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856
1,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262
2,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920
3,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756
4,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145


In [117]:
# Foundation
features = np.concatenate( [features, pd.get_dummies(train.Foundation).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.Foundation, prefix="Foundation")], axis=1)
cols_count = cols_count + 1

# BsmtQual
train.BsmtQual.fillna("NA", inplace=True)
di = {"Ex": 5.0, "Gd": 4.0, "TA": 3.0, "Fa": 2.0, "Po": 1.0, "NA": 0.0}
train["BsmtQual"].replace(di, inplace=True)
#features = np.concatenate( [features, pd.get_dummies(train.BsmtQual).as_matrix()], axis=1 )
features = np.concatenate( [features, train.BsmtQual.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.BsmtQual], axis=1)
cols_count = cols_count + 1

# BsmtCond
train.BsmtCond.fillna("NA", inplace=True)
di = {"Ex": 5.0, "Gd": 4.0, "TA": 3.0, "Fa": 2.0, "Po": 1.0, "NA": 0.0}
train["BsmtCond"].replace(di, inplace=True)
#features = np.concatenate( [features, pd.get_dummies(train.BsmtCond).as_matrix()], axis=1 )
features = np.concatenate( [features, train.BsmtCond.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.BsmtCond], axis=1)
cols_count = cols_count + 1

# BsmtExposure
train.BsmtExposure.fillna("NA", inplace=True)
features = np.concatenate( [features, pd.get_dummies(train.BsmtExposure).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.BsmtExposure, prefix="BsmtExposure")], axis=1)
cols_count = cols_count + 1

# BsmtFinType1
train.BsmtFinType1.fillna("NA", inplace=True)
di = {"GLQ": 6.0, "ALQ": 5.0, "BLQ": 4.0, "Rec": 3.0, "LwQ": 2.0, "Unf": 1.0, "NA": 0.0}
train["BsmtFinType1"].replace(di, inplace=True)
#features = np.concatenate( [features, pd.get_dummies(train.BsmtFinType1).as_matrix()], axis=1 )
features = np.concatenate( [features, train.BsmtFinType1.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.BsmtFinType1], axis=1)
cols_count = cols_count + 1

# BsmtFinSF1
train.BsmtFinSF1.fillna(0.0, inplace=True)
features = np.concatenate( [features, train.BsmtFinSF1.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.BsmtFinSF1], axis=1)
cols_count = cols_count + 1

# BsmtFinType2
train.BsmtFinType2.fillna("NA", inplace=True)
di = {"GLQ": 6.0, "ALQ": 5.0, "BLQ": 4.0, "Rec": 3.0, "LwQ": 2.0, "Unf": 1.0, "NA": 0.0}
train["BsmtFinType2"].replace(di, inplace=True)
#features = np.concatenate( [features, pd.get_dummies(train.BsmtFinType2).as_matrix()], axis=1 )
features = np.concatenate( [features, train.BsmtFinType2.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.BsmtFinType2], axis=1)
cols_count = cols_count + 1

# BsmtFinSF2
train.BsmtFinSF2.fillna(0.0, inplace=True)
features = np.concatenate( [features, train.BsmtFinSF2.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.BsmtFinSF2], axis=1)
cols_count = cols_count + 1

# BsmtUnfSF
train.BsmtUnfSF.fillna(0.0, inplace=True)
features = np.concatenate( [features, train.BsmtUnfSF.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.BsmtUnfSF], axis=1)
cols_count = cols_count + 1

# TotalBsmtSF
train.TotalBsmtSF.fillna(0.0, inplace=True)
features = np.concatenate( [features, train.TotalBsmtSF.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.TotalBsmtSF], axis=1)
cols_count = cols_count + 1

In [118]:
train.iloc[:, 39:49].head()

Unnamed: 0,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath
0,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0
1,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1
2,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0
3,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0
4,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0


In [119]:
# Heating
features = np.concatenate( [features, pd.get_dummies(train.Heating).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.Heating, prefix="Heating")], axis=1)
cols_count = cols_count + 1

# HeatingQC
#features = np.concatenate( [features, pd.get_dummies(train.HeatingQC).as_matrix()], axis=1 )
di = {"Ex": 5.0, "Gd": 4.0, "TA": 3.0, "Fa": 2.0, "Po": 1.0}
train["HeatingQC"].replace(di, inplace=True)
features = np.concatenate( [features, train.HeatingQC.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.HeatingQC], axis=1)
cols_count = cols_count + 1

# CentralAir
features = np.concatenate( [features, pd.get_dummies(train.CentralAir).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.CentralAir, prefix="CentralAir")], axis=1)
cols_count = cols_count + 1

# Electrical
train.Electrical.fillna("Unknown", inplace=True)
features = np.concatenate( [features, pd.get_dummies(train.Electrical).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.Electrical, prefix="Electrical")], axis=1)
cols_count = cols_count + 1

# 1stFlrSF
features = np.concatenate( [features, train['1stFlrSF'].as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train['1stFlrSF']], axis=1)
cols_count = cols_count + 1

# 2ndFlrSF
features = np.concatenate( [features, train['2ndFlrSF'].as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train['2ndFlrSF']], axis=1)
cols_count = cols_count + 1

# LowQualFinSF
features = np.concatenate( [features, train.LowQualFinSF.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.LowQualFinSF], axis=1)
cols_count = cols_count + 1

# GrLivArea
features = np.concatenate( [features, train.GrLivArea.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.GrLivArea], axis=1)
cols_count = cols_count + 1

# BsmtFullBath
train.BsmtFullBath.fillna(0.0, inplace=True)
features = np.concatenate( [features, train.BsmtFullBath.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.BsmtFullBath], axis=1)
cols_count = cols_count + 1

# BsmtHalfBath
train.BsmtHalfBath.fillna(0.0, inplace=True)
features = np.concatenate( [features, train.BsmtHalfBath.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.BsmtHalfBath], axis=1)
cols_count = cols_count + 1

In [120]:
train.iloc[:, 49:59].head()

Unnamed: 0,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType
0,2,1,3,1,Gd,8,Typ,0,,Attchd
1,2,0,3,1,TA,6,Typ,1,TA,Attchd
2,2,1,3,1,Gd,6,Typ,1,TA,Attchd
3,1,0,3,1,Gd,7,Typ,1,Gd,Detchd
4,2,1,4,1,Gd,9,Typ,1,TA,Attchd


In [121]:
# FullBath
features = np.concatenate( [features, train.FullBath.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.FullBath], axis=1)
cols_count = cols_count + 1

# HalfBath
features = np.concatenate( [features, train.HalfBath.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.HalfBath], axis=1)
cols_count = cols_count + 1

# BedroomAbvGr
features = np.concatenate( [features, train.BedroomAbvGr.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.BedroomAbvGr], axis=1)
cols_count = cols_count + 1

# KitchenAbvGr
features = np.concatenate( [features, train.KitchenAbvGr.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.KitchenAbvGr], axis=1)
cols_count = cols_count + 1

# KitchenQual
#train.KitchenQual.fillna("TA", inplace=True)
train.KitchenQual.fillna("Unknown", inplace=True)
#features = np.concatenate( [features, pd.get_dummies(train.KitchenQual).as_matrix()], axis=1 )
di = {"Ex": 5.0, "Gd": 4.0, "TA": 3.0, "Fa": 2.0, "Po": 1.0, "Unknown": 0.0}
train["KitchenQual"].replace(di, inplace=True)
features = np.concatenate( [features, train.KitchenQual.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.KitchenQual], axis=1)
cols_count = cols_count + 1

# TotRmsAbvGrd
features = np.concatenate( [features, train.TotRmsAbvGrd.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.TotRmsAbvGrd], axis=1)
cols_count = cols_count + 1

# Functional
#train.Functional.fillna("Typ", inplace=True)
train.Functional.fillna("Unknown", inplace=True)
features = np.concatenate( [features, pd.get_dummies(train.Functional).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.Functional, prefix="Functional")], axis=1)
cols_count = cols_count + 1

# Fireplaces
features = np.concatenate( [features, train.Fireplaces.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.Fireplaces], axis=1)
cols_count = cols_count + 1

# FireplaceQu
train.FireplaceQu.fillna("NA", inplace=True)
features = np.concatenate( [features, pd.get_dummies(train.FireplaceQu).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.FireplaceQu, prefix="Functional")], axis=1)
cols_count = cols_count + 1

# GarageType
train.GarageType.fillna("NA", inplace=True)
features = np.concatenate( [features, pd.get_dummies(train.GarageType).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.GarageType, prefix="Functional")], axis=1)
cols_count = cols_count + 1

In [122]:
train.iloc[:, 59:69].head()

Unnamed: 0,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch
0,2003.0,RFn,2,548,TA,TA,Y,0,61,0
1,1976.0,RFn,2,460,TA,TA,Y,298,0,0
2,2001.0,RFn,2,608,TA,TA,Y,0,42,0
3,1998.0,Unf,3,642,TA,TA,Y,0,35,272
4,2000.0,RFn,3,836,TA,TA,Y,192,84,0


In [123]:
# GarageYrBlt
train.GarageYrBlt.fillna(train.YearBuilt, inplace=True)

#if (False):
#    features = np.concatenate( [features, pd.get_dummies(train.GarageYrBlt).as_matrix()], axis=1 )
#else:
#    features = np.concatenate( [features, train.GarageYrBlt.as_matrix().reshape(-1, 1)], axis=1 )

train['GarageAge'] = train.YrSold - train.GarageYrBlt
features = np.concatenate( [features, train.GarageAge.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.GarageAge], axis=1)
cols_count = cols_count + 1

# GarageFinish
train.GarageFinish.fillna("NA", inplace=True)
features = np.concatenate( [features, pd.get_dummies(train.GarageFinish).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.GarageFinish, prefix="GarageFinish")], axis=1)
cols_count = cols_count + 1

# GarageCars
train.GarageCars.fillna(0.0, inplace=True)
features = np.concatenate( [features, train.GarageCars.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.GarageCars], axis=1)
cols_count = cols_count + 1

# GarageArea
train.GarageArea.fillna(0.0, inplace=True)
features = np.concatenate( [features, train.GarageArea.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.GarageArea], axis=1)
cols_count = cols_count + 1

# GarageQual
train.GarageQual.fillna("NA", inplace=True)
#features = np.concatenate( [features, pd.get_dummies(train.GarageQual).as_matrix()], axis=1 )
di = {"Ex": 5.0, "Gd": 4.0, "TA": 3.0, "Fa": 2.0, "Po": 1.0, "NA": 0.0}
train["GarageQual"].replace(di, inplace=True)
features = np.concatenate( [features, train.GarageQual.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.GarageQual], axis=1)
cols_count = cols_count + 1

# GarageCond
train.GarageCond.fillna("NA", inplace=True)
#features = np.concatenate( [features, pd.get_dummies(train.GarageCond).as_matrix()], axis=1 )
di = {"Ex": 5.0, "Gd": 4.0, "TA": 3.0, "Fa": 2.0, "Po": 1.0, "NA": 0.0}
train["GarageCond"].replace(di, inplace=True)
features = np.concatenate( [features, train.GarageCond.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.GarageCond], axis=1)
cols_count = cols_count + 1

# PavedDrive
features = np.concatenate( [features, pd.get_dummies(train.PavedDrive).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.PavedDrive, prefix="PavedDrive")], axis=1)
cols_count = cols_count + 1

# WoodDeckSF
features = np.concatenate( [features, train.WoodDeckSF.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.WoodDeckSF], axis=1)
cols_count = cols_count + 1

# OpenPorchSF
features = np.concatenate( [features, train.OpenPorchSF.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.OpenPorchSF], axis=1)
cols_count = cols_count + 1

# EnclosedPorch
features = np.concatenate( [features, train.EnclosedPorch.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.EnclosedPorch], axis=1)
cols_count = cols_count + 1

In [124]:
train.iloc[:, 69:].head()

Unnamed: 0,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,HouseAge,AgeSinceRemod,GarageAge
0,0,0,0,,,,0,2,2008,WD,Normal,208500,5,5,5.0
1,0,0,0,,,,0,5,2007,WD,Normal,181500,31,31,31.0
2,0,0,0,,,,0,9,2008,WD,Normal,223500,7,6,7.0
3,0,0,0,,,,0,2,2006,WD,Abnorml,140000,91,36,8.0
4,0,0,0,,,,0,12,2008,WD,Normal,250000,8,8,8.0


In [125]:
# 3SsnPorch
features = np.concatenate( [features, train['3SsnPorch'].as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train['3SsnPorch']], axis=1)
cols_count = cols_count + 1

# ScreenPorch
features = np.concatenate( [features, train.ScreenPorch.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.ScreenPorch], axis=1)
cols_count = cols_count + 1

# PoolArea
features = np.concatenate( [features, train.PoolArea.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.PoolArea], axis=1)
cols_count = cols_count + 1

# PoolQC
train.PoolQC.fillna("NA", inplace=True)
#features = np.concatenate( [features, pd.get_dummies(train.PoolQC).as_matrix()], axis=1 )
di = {"Ex": 5.0, "Gd": 4.0, "TA": 3.0, "Fa": 2.0, "Po": 1.0, "NA": 0.0}
train["PoolQC"].replace(di, inplace=True)
features = np.concatenate( [features, train.PoolQC.as_matrix().reshape(-1, 1)], axis=1 )
df_train = pd.concat([df_train, train.PoolQC], axis=1)
cols_count = cols_count + 1

# Fence
train.Fence.fillna("NA", inplace=True)
features = np.concatenate( [features, pd.get_dummies(train.Fence).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.Fence, prefix="Fence")], axis=1)
cols_count = cols_count + 1

# MiscFeature
train.MiscFeature.fillna("NA", inplace=True)
features = np.concatenate( [features, pd.get_dummies(train.MiscFeature).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.MiscFeature, prefix="MiscFeature")], axis=1)
cols_count = cols_count + 1

# MiscVal
#features = np.concatenate( [features, train.MiscVal.as_matrix().reshape(-1, 1)], axis=1 )
#df_train = pd.concat([df_train, train.MiscVal], axis=1)
#cols_count = cols_count + 1

# MoSold
if (use_fe_2):
    features = np.concatenate( [features, pd.get_dummies(train.MoSold).as_matrix()], axis=1 )
    df_train = pd.concat([df_train, pd.get_dummies(train.MoSold, prefix="MoSold")], axis=1)
else:
    features = np.concatenate( [features, train.MoSold.as_matrix().reshape(-1, 1)], axis=1 )
    df_train = pd.concat([df_train, train.MoSold], axis=1)
    
cols_count = cols_count + 1

# YrSold
if (use_fe_2):
    features = np.concatenate( [features, pd.get_dummies(train.YrSold).as_matrix()], axis=1 )
    df_train = pd.concat([df_train, pd.get_dummies(train.YrSold, prefix="YrSold")], axis=1)
else:
    features = np.concatenate( [features, train.YrSold.as_matrix().reshape(-1, 1)], axis=1 )
    df_train = pd.concat([df_train, train.YrSold], axis=1)
    
cols_count = cols_count + 1

# SaleType
#train.SaleType.fillna("WD", inplace=True)
train.SaleType.fillna("Unknown", inplace=True)
features = np.concatenate( [features, pd.get_dummies(train.SaleType).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.SaleType, prefix="SaleType")], axis=1)
cols_count = cols_count + 1

# SaleCondition
features = np.concatenate( [features, pd.get_dummies(train.SaleCondition).as_matrix()], axis=1 )
df_train = pd.concat([df_train, pd.get_dummies(train.SaleCondition, prefix="SaleCondition")], axis=1)
cols_count = cols_count + 1        

In [126]:
# LivingAreaSF
if (use_fe_3):
    train['LivingAreaSF'] = train['1stFlrSF'] + train['2ndFlrSF'] + train['TotalBsmtSF'] + \
                            train['GarageArea'] + train['MasVnrArea'] + train['WoodDeckSF'] + \
                            train['OpenPorchSF'] + train['3SsnPorch'] + train['ScreenPorch']

    print(pearsonr(train['LivingAreaSF'].values.reshape(-1, 1), train.SalePrice.values.reshape(-1, 1))[0][0])
    
    features = np.concatenate( [features, train.LivingAreaSF.as_matrix().reshape(-1, 1)], axis=1 )
    df_train = pd.concat([df_train, train.LivingAreaSF], axis=1)                      
    cols_count = cols_count + 1
    
    train['LandRatio'] = train['LivingAreaSF'] / train['LotArea']
    features = np.concatenate( [features, train.LandRatio.as_matrix().reshape(-1, 1)], axis=1 )
    df_train = pd.concat([df_train, train.LandRatio], axis=1)   
    cols_count = cols_count + 1

0.823860406285


### Scaling

In [127]:
# Prepare labels
#labels = (train.SalePrice - train.MiscVal).values.reshape(-1, 1)
labels = train.SalePrice.values.reshape(-1, 1)

In [128]:
# First scale labels
labels = labels.astype(float)
labels_max = labels.max()

labels = labels / labels_max

count = 0
    
for jj in range(features.shape[1]):
    if((features[:, jj] > 25.).sum() > 0):
        mx = float(features[:, jj].max())

        features[:, jj] = features[:, jj] / mx
        count = count + 1
        
print("Total", count, "features scaled")


Total 23 features scaled


### Подготовка наборов для обучения и тестирования

In [129]:
# Split for train and test sets
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.3, random_state=23)

print("Total features processed:", cols_count)
print("")
print(features_train.shape)
print(labels_train.shape)

Total features processed: 80

(1022, 272)
(1022, 1)


## Algorithm Selection  
### Baseline

GradientBoostingRegressor(n_estimators=300, min_samples_split=5, min_samples_leaf=2, random_state=23)  
На полном наборе данных + общая площадь:  

#### 0.020075944788

In [130]:
#
# For Averaging
#

results = pd.DataFrame()
results['Target'] = labels_test.reshape(-1)

clfs = []

In [146]:
results_ada = pd.DataFrame()
results_ada['Target'] = labels_test.reshape(-1)

### Multi-Layer Perceptron

In [1200]:
%%time
def choose_MLP(X, y):
    MLP = MLPRegressor(random_state=23)

    parameters_grid = {
        "hidden_layer_sizes": [(150, 100, 50), (150, 30, 20), (150, 50, 50), (150, 50, 10), (150, 75, 25)],
        #"n_estimators": [50, 100, 150, 200, 250, 300, 350],
        #"max_depth": [2, 3, 4, 5, 6],
        #"learning_rate": [0.05, 0.1, 0.5, 1.0]
        #"min_samples_split": [2, 3, 4, 5, 6],
        #"min_samples_leaf": [1, 2, 3, 4]
    }

    gcv = GridSearchCV(MLP, parameters_grid, scoring='neg_mean_absolute_error')

    gcv.fit(X, y.reshape(-1))

    print(gcv.best_score_)
    print(gcv.best_params_)
    
    return gcv.best_estimator_


clf = choose_MLP(features_train, labels_train)

clf.fit(features_train, labels_train.reshape(-1))
prediction = clf.predict(features_test)

print(mean_absolute_error(labels_test, prediction))

-0.0378903574145
{'hidden_layer_sizes': (150, 50, 10)}
0.0303985022085
Wall time: 8.78 s


In [1249]:
clf = MLPRegressor(hidden_layer_sizes=(150, 50, 10), random_state=23)
clf.fit(features_train, labels_train.reshape(-1))
prediction = clf.predict(features_test)

print(mean_absolute_error(labels_test, prediction))

results['MLP'] = prediction.reshape(-1)

0.0303985022085


### Light GBM

In [174]:
%%time
def choose_LGB(X, y):
    LGB = lgb.LGBMRegressor(random_state=23, n_jobs=-1)

    parameters_grid = {
        #"boosting_type": ["gbdt", "dart", "goss", "rf"],
        "n_estimators": [50, 100, 150, 200, 250, 300, 350],
        "max_depth": [2, 3, 4, 5, 6],
        "learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1],
    }

    gcv = GridSearchCV(LGB, parameters_grid, scoring='neg_mean_absolute_error')

    gcv.fit(X, y.reshape(-1))

    print(gcv.best_score_)
    print(gcv.best_params_)
    
    return gcv.best_estimator_    


clf = choose_LGB(features_train, labels_train)

clf.fit(features_train, labels_train.reshape(-1))
prediction = clf.predict(features_test)

print(mean_absolute_error(labels_test, prediction))

results['LGB'] = prediction.reshape(-1)

-0.0218094142142
{'learning_rate': 0.075, 'max_depth': 4, 'n_estimators': 200}
0.0204735668589
Wall time: 1min 26s


In [177]:
%%time
#
# AdaBoost
#
adb_clf = AdaBoostRegressor(base_estimator=clf, n_estimators=350, learning_rate=0.7, random_state=23)
adb_clf.fit(features_train, labels_train.reshape(-1))
prediction = adb_clf.predict(features_test)

print(mean_absolute_error(labels_test, prediction))

#results_ada['LGB'] = prediction.reshape(-1)

0.0196013179077
Wall time: 1min 19s


### Gradient Boosting

In [1611]:
def choose_GBM(X, y):
    GBR = GradientBoostingRegressor(random_state=23)

    parameters_grid = {
        "n_estimators": [250, 300, 350],
        "min_samples_split": [2, 3, 4, 5, 6],
        "min_samples_leaf": [1, 2, 3, 4]
    }

    gcv = GridSearchCV(GBR, parameters_grid, scoring='neg_mean_absolute_error')

    gcv.fit(X, y.reshape(-1))

    print(gcv.best_score_)
    print(gcv.best_params_)
    
    return gcv.best_estimator_

In [1612]:
%%time

clf = choose_GBM(features_train, labels_train)

clf.fit(features_train, labels_train.reshape(-1))
prediction = clf.predict(features_test)

print(mean_absolute_error(labels_test, prediction))

results['GBM'] = prediction.reshape(-1)

-0.0222876966681
{'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 250}
0.0210613365987
Wall time: 3min 22s


In [192]:
clf = GradientBoostingRegressor(n_estimators=300, min_samples_split=5, min_samples_leaf=2, random_state=23)
#clf = GradientBoostingRegressor(n_estimators=300, learning_rate=0.1, max_depth=3, random_state=23)
clf.fit(features_train, labels_train.reshape(-1))
prediction = clf.predict(features_test)

print(mean_absolute_error(labels_test, prediction))

results['GBM'] = prediction.reshape(-1)

0.0203621230414


In [None]:
%%time
#
# AdaBoost
#
adb_clf = AdaBoostRegressor(base_estimator=clf, n_estimators=350, learning_rate=0.7, random_state=23)
adb_clf.fit(features_train, labels_train.reshape(-1))
prediction = adb_clf.predict(features_test)

print(mean_absolute_error(labels_test, prediction))

results_ada['GBM'] = prediction.reshape(-1)

In [187]:
%%time
res = []

tmp_f = np.array(features)
tmp_l = np.array(labels)

for i in range(75):
    clf = GradientBoostingRegressor(n_estimators=300, min_samples_split=5, min_samples_leaf=2, random_state=23)
    clf.fit(tmp_f, tmp_l.reshape(-1))
    prediction = clf.predict(tmp_f).reshape(-1, 1)

    res.append(mean_absolute_error(tmp_l, prediction))
    
    if (i == 42):
        break

    tmp1 = np.abs(tmp_l - prediction)
    d = tmp1.argmax()

    tmp_f = np.delete(tmp_f, d, 0)
    tmp_l = np.delete(tmp_l, d, 0)
    
print(min(res), argmin(res))

prediction = clf.predict(features_test)

print(mean_absolute_error(labels_test, prediction))

0.00786508173164 42
0.0105238355887
Wall time: 2min 7s


In [188]:
f_train, f_test, l_train, l_test = train_test_split(tmp_f, tmp_l, test_size=0.3, random_state=23)

clf.fit(f_train, l_train.reshape(-1))
prediction = clf.predict(f_test)

print(mean_absolute_error(l_test, prediction))

0.0174589900129


### Linear Regression + Polynomial features

In [484]:
%%time
poly = PolynomialFeatures(degree=2)
features_poly = poly.fit_transform(features)

Wall time: 1.11 s


In [485]:
# Split for train and test sets
features_train, features_test, labels_train, labels_test = train_test_split(features_poly, labels, test_size=0.3, random_state=23)

In [489]:
%%time
#clf = LinearRegression(fit_intercept=False, n_jobs=-1)
clf = Lasso(alpha = 0.1)
clf.fit(features_train, labels_train.reshape(-1))
prediction = clf.predict(features_test)

print(mean_absolute_error(labels_test, prediction))

0.0464094046792
Wall time: 1.61 s


### Decision Tree

In [588]:
%%time
clf = DecisionTreeRegressor(random_state=23)

parameters_grid = {
    "max_depth": [2, 3, 4, 5, 6, 7, 8, 9, 10],
    "min_samples_split": [2, 3, 4, 5, 6, 7, 8, 9],
    "min_samples_leaf": [1, 2, 3, 4, 5, 6, 7],
}

gcv = GridSearchCV(clf, parameters_grid, scoring='neg_mean_absolute_error')

gcv.fit(features_train, labels_train.reshape(-1))

print(gcv.best_score_)
print(gcv.best_params_)

-0.0327128646758
{'max_depth': 6, 'min_samples_leaf': 7, 'min_samples_split': 2}
Wall time: 28.8 s


### Random Forest

In [1614]:
%%time
RFR = RandomForestRegressor(n_jobs=-1, random_state=23)

parameters_grid = {
    "n_estimators": [200, 250, 300, 350, 400],
    "min_samples_split": [2, 3, 4, 5, 6],
    "min_samples_leaf": [1, 2, 3, 4]
}

gcv = GridSearchCV(RFR, parameters_grid, scoring='neg_mean_absolute_error')

gcv.fit(features, labels.reshape(-1))

print(gcv.best_score_)
print(gcv.best_params_)

-0.023588186645
{'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 400}
Wall time: 11min 21s


In [179]:
#clf = gcv.best_estimator_
clf = RandomForestRegressor(n_estimators=350, min_samples_leaf=2, min_samples_split=2, n_jobs=-1, random_state=23)
clf.fit(features_train, labels_train.reshape(-1))
prediction = clf.predict(features_test)

print(mean_absolute_error(labels_test, prediction))

results['RF'] = prediction.reshape(-1)

0.0223665344854


In [180]:
%%time
#
# AdaBoost
#
adb_clf = AdaBoostRegressor(base_estimator=clf, n_estimators=300, learning_rate=0.7, random_state=23)
adb_clf.fit(features_train, labels_train.reshape(-1))
prediction = adb_clf.predict(features_test)

print(mean_absolute_error(labels_test, prediction))

results_ada['RF'] = prediction.reshape(-1)

0.0207490017856
Wall time: 14min 57s


### K Nearest Neighbors

In [3120]:
%%time
clf = KNeighborsRegressor(n_jobs=-1)

parameters_grid = {
    "n_neighbors": list(range(1, 15)),
}

gcv = GridSearchCV(clf, parameters_grid, scoring='neg_mean_absolute_error')

gcv.fit(features_train, labels_train.reshape(-1))

print(gcv.best_score_)
print(gcv.best_params_)

-0.0315106462462
{'n_neighbors': 4}
Wall time: 13.7 s


In [3121]:
clf = gcv.best_estimator_
clf.fit(features_train, labels_train.reshape(-1))
prediction = clf.predict(features_test)
results['KNN'] = prediction.reshape(-1)

### Kernel Ridge

In [135]:
def choose_KernelRidge(X, y):
    clf = KernelRidge()

    parameters_grid = {
        "kernel": ['polynomial', 'rbf'], 
        "alpha": [1e0, 0.1, 1e-2, 1e-3],
        "gamma": np.logspace(-3, 3, 10)    
    }

    gcv = GridSearchCV(clf, parameters_grid, scoring='neg_mean_absolute_error')

    gcv.fit(X, y.reshape(-1))

    print(gcv.best_score_)
    print(gcv.best_params_)
    
    return gcv.best_estimator_

In [161]:
clf = choose_KernelRidge(features_train, labels_train)

clf.fit(features_train, labels_train.reshape(-1))
prediction = clf.predict(features_test)

print(mean_absolute_error(labels_test, prediction))

results['KRG'] = prediction.reshape(-1)

-0.0251215396392
{'alpha': 0.1, 'gamma': 0.0046415888336127772, 'kernel': 'polynomial'}
0.0230851795018


In [162]:
%%time
#
# AdaBoost
#
adb_clf = AdaBoostRegressor(base_estimator=clf, n_estimators=100, learning_rate=1.0, random_state=23)
adb_clf.fit(features_train, labels_train.reshape(-1))
prediction = adb_clf.predict(features_test)

print(mean_absolute_error(labels_test, prediction))

results_ada['KRG'] = prediction.reshape(-1)

0.0262928641152
Wall time: 23 s


In [163]:
results_ada['KRG'] = results['KRG']

### SVR

In [3126]:
%%time
clf = SVR()

parameters_grid = {
    "kernel": ['rbf', 'poly'],
    "C": [1e0, 1e1, 1e2, 1e3],
    "gamma": np.logspace(-2, 2, 5)
}

gcv = GridSearchCV(clf, parameters_grid, scoring='neg_mean_absolute_error')

gcv.fit(features_train, labels_train.reshape(-1))

print(gcv.best_score_)
print(gcv.best_params_)

-0.0529154749451
{'C': 10.0, 'gamma': 0.01, 'kernel': 'rbf'}
Wall time: 23.2 s


In [3127]:
clf = gcv.best_estimator_
clf.fit(features_train, labels_train.reshape(-1))
prediction = clf.predict(features_test)
results['SVR'] = prediction.reshape(-1)

### XGBoost

In [525]:
def choose_XGB(X, y):
    XGB = xgboost.XGBRegressor()

    parameters_grid = {
        #"n_estimators": [50, 100, 150, 200, 250, 300],
        "n_estimators": [250, 300, 350],
        #"max_depth": [2, 3, 4, 5, 6, 7, 8],
        "max_depth": [3, 4, 5],
        #"learning_rate": [0.1, 0.05, 0.01]
        #"learning_rate": [0.1],
    }

    gcv = GridSearchCV(XGB, parameters_grid, scoring='neg_mean_absolute_error')

    gcv.fit(X, y.reshape(-1))

    print(gcv.best_score_)
    print(gcv.best_params_)
    
    return gcv.best_estimator_

In [1610]:
%%time

clf = choose_XGB(features_train, labels_train)

clf.fit(features_train, labels_train.reshape(-1))
prediction = clf.predict(features_test)

print(mean_absolute_error(labels_test, prediction))

#results['XGB'] = prediction.reshape(-1)

-0.0227337351622
{'max_depth': 4, 'n_estimators': 300}
0.0204025974813
Wall time: 53.1 s


In [154]:
XGB = xgboost.XGBRegressor(max_depth=3, n_estimators=350)
XGB.fit(features_train, labels_train.reshape(-1))

prediction = XGB.predict(features_test)

print(mean_absolute_error(labels_test, prediction))

results['XGB'] = prediction.reshape(-1)

0.0200851372074


In [191]:
%%time
#
# AdaBoost
#
adb_clf = AdaBoostRegressor(base_estimator=XGB, n_estimators=300, learning_rate=0.5, random_state=23)
adb_clf.fit(features_train, labels_train.reshape(-1))
prediction = adb_clf.predict(features_test)

print(mean_absolute_error(labels_test, prediction))

results_ada['XGB'] = prediction.reshape(-1)

0.0194865040989
Wall time: 12min 55s


In [190]:
%%time
XGB = xgboost.XGBRegressor(max_depth=3, n_estimators=350)
adb_clf = AdaBoostRegressor(base_estimator=XGB, random_state=23)

parameters_grid = {
    "n_estimators": [300, 350, 400],
    "learning_rate": [0.5, 0.6, 0.7],
}

gcv = GridSearchCV(adb_clf, parameters_grid, scoring='neg_mean_absolute_error')

gcv.fit(features_train, labels_train.reshape(-1))

print(gcv.best_score_)
print(gcv.best_params_)

-0.0217873663087
{'learning_rate': 0.5, 'n_estimators': 300}
Wall time: 4h 27min 34s


### Averaging  

In [182]:
%%time
res = []
wgh = []

for c1 in np.linspace(0.0, 0.9, 10):
    for c2 in np.linspace(0.1, 0.6, 6):
        if(c1 + c2 > 1.01):
            break
        else:
            for c3 in np.linspace(0.0, 0.9, 10):
                if(c1 + c2 + c3 > 1.01):
                    break
                else:
                    for c4 in np.linspace(0.0, 0.9, 10):
                        if(c1 + c2 + c3 + c4 > 1.01):
                            break
                        else:
                            for c5 in np.linspace(0.0, 0.9, 10):
                                if(c1 + c2 + c3 + c4 + c5 > 1.01):
                                    break
                                else:
                                    tmp = c1*results_ada['KRG'] + c2*results_ada['RF'] + c3*results_ada['LGB'] + c4*results_ada['GBM'] + c5*results_ada['XGB']
                                    res.append(mean_absolute_error(results.Target, tmp))
                                    wgh.append( (c1, c2, c3, c4, c5) )
                                    
print(min(res))
idx = argmin(res)
print(wgh[idx])

0.018971133894
(0.20000000000000001, 0.30000000000000004, 0.0, 0.0, 0.5)
Wall time: 2.14 s


In [171]:
np.linspace(0.0, 0.6, 7)

array([ 0. ,  0.1,  0.2,  0.3,  0.4,  0.5,  0.6])

In [167]:
average = np.mean(results_ada[['LGB', 'GBM', 'XGB', 'KRG', 'RF']], axis=1)

print("Average vs Target", mean_absolute_error(results_ada.Target, average))

Average vs Target 0.0190728184582


In [839]:
average = np.mean(results[['GBM', 'RF', 'XGB', 'K_Ridge']], axis=1)

print("Average vs Target", mean_absolute_error(results.Target, average))
print("GBM vs Target", mean_absolute_error(results.Target, results.GBM))
print("Manual_kNN vs Target", mean_absolute_error(results.Target, results.Manual_kNN))

Average vs Target 0.0193590982883
GBM vs Target 0.0201937590676
Manual_kNN vs Target 0.0287775614624


In [1208]:
tmp = np.mean(results[['GBM', 'K_Ridge']], axis=1)
print("Average vs Target", mean_absolute_error(results.Target, tmp))

Average vs Target 0.0197486344607


In [140]:
#average = np.mean(results[['MLP', 'LGB', 'GBM', 'RF', 'XGB', 'K_Ridge']], axis=1)
average = np.mean(results[['LGB', 'GBM', 'XGB', 'KRG', 'RF']], axis=1)

print("Average vs Target", mean_absolute_error(results.Target, average))

Average vs Target 0.0191947267387


In [1259]:
average = np.mean(results.loc[:, results.columns != 'Target'], axis=1)

print("Average vs Target", mean_absolute_error(results.Target, average))

Average vs Target 0.0269612525144


### Stacking

In [189]:
LR = LogisticRegression(n_jobs=-1, random_state=23)

stack_f_train = results_ada.loc[:, results_ada.columns != 'Target'].as_matrix()
stack_l_train = results_ada['Target'].as_matrix().reshape(-1, 1)

LR.fit(stack_f_train, stack_l_train)
prediction = LR.predict(stack_f_train)

print(mean_absolute_error(results_ada['Target'], prediction))

  y = column_or_1d(y, warn=True)


ValueError: Unknown label type: 'continuous'

### Еще раз поищем "близкие" варианты для "плохих" предсказаний

In [785]:
tmp1 = pd.DataFrame(features_test)
tmp1["SalePrice"] = labels_test
tmp1["Prediction"] = prediction
tmp1["Dif"] = tmp1["SalePrice"] - tmp1["Prediction"]
tmp1[tmp1["Dif"] > 0.05]
#tmp1

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,618,619,620,621,622,623,624,SalePrice,Prediction,Dif
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.253404,0.284106,0.226146,0.05796
9,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.336283,0.582781,0.486312,0.096469
23,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.303438,0.434437,0.356958,0.077479
36,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.294078,0.249007,0.192849,0.056157
49,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.260892,0.370861,0.286522,0.084339
129,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.249404,0.309934,0.246559,0.063375
141,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.285824,0.389404,0.330432,0.058972
160,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.233833,0.319868,0.266475,0.053392
241,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.291014,0.493248,0.40909,0.084158
245,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.257573,0.339073,0.276054,0.063019


In [789]:
bad_guy = features_test[323, :]

dct = {}

for ii in range(features_train.shape[0]):
    dct[ii] = np.linalg.norm(features_train[ii, :] - bad_guy)

In [790]:
sorted(dct.items(), key=lambda x: x[1])

[(679, 4.0572058981167798),
 (460, 4.1037883500069876),
 (48, 4.2623789384980393),
 (397, 4.274175333909576),
 (190, 4.3767181932109667),
 (57, 4.4168297180683345),
 (64, 4.4428056354746825),
 (226, 4.5846531235730303),
 (157, 4.596488056804513),
 (930, 4.6632828726130615),
 (47, 4.6999548322582809),
 (283, 4.7014405236404277),
 (75, 4.7381908611740302),
 (634, 4.7580281455803854),
 (68, 4.7615236139951094),
 (757, 4.784261782273119),
 (15, 4.8071201074491876),
 (140, 4.8135197646579488),
 (974, 4.8370012529865019),
 (997, 4.8398494076218439),
 (212, 4.8497802279082824),
 (625, 4.8538997766605068),
 (994, 4.8723637666820618),
 (311, 4.8766821801573776),
 (672, 4.9114913636506818),
 (633, 4.9362610722979419),
 (767, 4.9406670382759019),
 (1004, 4.9642315957389869),
 (90, 4.9657207627467672),
 (245, 4.9857175656754915),
 (424, 5.0168802429649624),
 (861, 5.017664613541565),
 (870, 5.0507985691112127),
 (851, 5.1177925848144046),
 (155, 5.1377546699455685),
 (539, 5.1561584209492919),
 (1

In [854]:
pred_2 = np.zeros((438, 1))

cluster_size = 6

for ii in range(features_test.shape[0]):
    
    dct = {}
    
    for iii in range(features_train.shape[0]):
        dct[iii] = np.linalg.norm(features_train[iii, :] - features_test[ii, :])
        
    dd = sorted(dct.items(), key=lambda x: x[1])[:cluster_size]
    
    counter = 0.0
    
    for i in range(cluster_size):
        counter = counter + labels_train[ dd[i][0] ]
    
    pred_2[ii] = counter / cluster_size
    
print(mean_absolute_error(labels_test, pred_2))

0.0286100527181


### Снижение размерности

In [738]:
%%time
gbr = GradientBoostingRegressor(n_estimators=300, min_samples_split=5, min_samples_leaf=2, random_state=23)

pipe = Pipeline([
    ('reduce_dim', PCA()),
    ('classify', gbr)
])

parameters_grid = {
        'reduce_dim__n_components': list(range(10, 625, 5)),
        #'reduce_dim__n_components': [600],
    }
    
gcv = GridSearchCV(pipe, parameters_grid, cv=3, scoring='neg_mean_absolute_error', n_jobs=-1)

gcv.fit(features, labels.reshape(-1))

print(gcv.best_score_)
print(gcv.best_params_)

-0.0273277097972
{'reduce_dim__n_components': 265}
Wall time: 20min 27s


In [766]:
%%time
gbr = GradientBoostingRegressor(min_samples_split=5, min_samples_leaf=2, random_state=23)

parameters_grid = {
        'n_estimators': [250, 300, 350, 400, 450, 500, 550, 600],
    }
    
gcv = GridSearchCV(gbr, parameters_grid, cv=3, scoring='neg_mean_absolute_error', n_jobs=-1)

gcv.fit(features, labels.reshape(-1))

print(gcv.best_score_)
print(gcv.best_params_)

-0.0206620912125
{'n_estimators': 600}
Wall time: 2min 16s


### Соберем и усредним 6 лучших классификаторов на сокращенном наборе данных

In [1330]:
%%time

clf1 = GradientBoostingRegressor(n_estimators=300, min_samples_split=5, min_samples_leaf=2, random_state=23)
clf2 = MLPRegressor(hidden_layer_sizes=(150, 50, 10), random_state=23)
clf3 = lgb.LGBMRegressor(n_estimators=250, max_depth=5, learning_rate=0.075, random_state=23, n_jobs=-1)
clf4 = RandomForestRegressor(n_estimators=350, min_samples_leaf=2, min_samples_split=2, n_jobs=-1, random_state=23)
clf5 = KernelRidge(kernel='rbf', alpha=0.01, gamma=0.0046415888336127772)
clf6 = xgboost.XGBRegressor(max_depth=3, n_estimators=350)

clfs = {
    (clf1, "GBR"),
    (clf2, "MLP"),
    (clf3, "LGB"),
    (clf4, "RF"),
    (clf5, "KRG"),
    (clf6, "XGB")
}

results = pd.DataFrame()
results["Target"] = labels_test.reshape(-1)

MAX_DEALS_TO_DROP = 100

for item in clfs:
    clf = item[0]
    clf_name = item[1]

    tmp_f = np.array(features_train)
    tmp_l = np.array(labels_train)
    
    tmp_scores = []
    tmp_predictions = pd.DataFrame()

    for i in range(MAX_DEALS_TO_DROP):
        clf.fit(tmp_f, tmp_l.reshape(-1))

        prediction_train = clf.predict(tmp_f).reshape(-1, 1)
        tmp_scores.append(mean_absolute_error(tmp_l, prediction_train))
        
        tmp1 = np.abs(tmp_l - prediction_train)
        d = tmp1.argmax()

        tmp_f = np.delete(tmp_f, d, 0)
        tmp_l = np.delete(tmp_l, d, 0)
        
        prediction_test = clf.predict(features_test).reshape(-1)
        tmp_predictions[clf_name + str(i)] = prediction_test
    
    idx = argmin(tmp_scores)
    
    print("For {} best fit to train data is {} at {}".format(clf_name, min(tmp_scores), idx))
    
    results[clf_name + str(idx)] = tmp_predictions[clf_name + str(idx)]

For KRG best fit to train data is 0.004414931048544484 at 99
For LGB best fit to train data is 0.003320830661547149 at 98
For MLP best fit to train data is 0.020899522036716148 at 87
For GBR best fit to train data is 0.006609260722143669 at 99
For XGB best fit to train data is 0.007482646769804072 at 97
For RF best fit to train data is 0.006544310230829068 at 99
Wall time: 21min 49s


In [1347]:
#del results["MLP87"]
results.head(40)

Unnamed: 0,Target,KRG99,LGB98,GBR99,XGB97,RF99,GBM
0,0.246358,0.215891,0.220931,0.217291,0.207699,0.233758,0.208321
1,0.284106,0.250369,0.243277,0.241411,0.233683,0.252862,0.228792
2,0.234437,0.201216,0.216678,0.232361,0.223333,0.211115,0.230963
3,0.245033,0.231642,0.243733,0.231486,0.24891,0.210354,0.220751
4,0.07947,0.130078,0.16367,0.148244,0.154188,0.183551,0.142222
5,0.296026,0.299347,0.29072,0.272892,0.28096,0.289386,0.27571
6,0.08543,0.151984,0.138162,0.142027,0.141186,0.139402,0.130217
7,0.579012,0.548356,0.582324,0.573224,0.638096,0.523348,0.630222
8,0.178808,0.180147,0.177298,0.177105,0.174363,0.175416,0.174033
9,0.582781,0.446261,0.48661,0.473377,0.472188,0.411903,0.486946


In [1345]:
average = (results["XGB97"] + results["LGB98"] + results["GBR99"]) / 3.0

print("Average vs Target", mean_absolute_error(results.Target, average))

Average vs Target 0.0198273191639


In [1694]:
results.head(50)

Unnamed: 0,Target,KRG99,LGB98,GBR99,XGB97,RF99,GBM,LGB,K_Ridge
0,0.246358,0.215891,0.220931,0.217291,0.207699,0.233758,0.211308,0.218134,0.206438
1,0.284106,0.250369,0.243277,0.241411,0.233683,0.252862,0.23172,0.230721,0.200416
2,0.234437,0.201216,0.216678,0.232361,0.223333,0.211115,0.212378,0.204213,0.199507
3,0.245033,0.231642,0.243733,0.231486,0.24891,0.210354,0.221657,0.244341,0.21467
4,0.07947,0.130078,0.16367,0.148244,0.154188,0.183551,0.142103,0.145544,0.12391
5,0.296026,0.299347,0.29072,0.272892,0.28096,0.289386,0.272607,0.275519,0.286952
6,0.08543,0.151984,0.138162,0.142027,0.141186,0.139402,0.127285,0.144077,0.142821
7,0.579012,0.548356,0.582324,0.573224,0.638096,0.523348,0.656476,0.58351,0.567856
8,0.178808,0.180147,0.177298,0.177105,0.174363,0.175416,0.17133,0.182721,0.178252
9,0.582781,0.446261,0.48661,0.473377,0.472188,0.411903,0.490589,0.485691,0.45884


### Feature Engineering 1: handling features mostly correlated with target  
Как показала практика, эта штука не помогает, а наоборот: сильно ухудшает

In [201]:
fe_1_cols

[544, 564, 567, 606]

In [202]:
if (use_fe_1):
    for col in fe_1_cols:
        print("Pearson r for col", col, "is", pearsonr(features_train[:, col].reshape(-1, 1), labels_train.reshape(-1, 1))[0][0])
        
        # Train DT
        DTR = DecisionTreeRegressor(random_state=23)
        DTR.fit(features_train[:, col].reshape(-1, 1), labels_train)

        DTR_prediction1 = DTR.predict(features_train[:, col].reshape(-1, 1))
        DTR_prediction2 = DTR.predict(features_test[:, col].reshape(-1, 1))

        features_train = np.concatenate( [features_train, DTR_prediction1.reshape(-1, 1)], axis=1)
        features_test = np.concatenate( [features_test, DTR_prediction2.reshape(-1, 1)], axis=1)
        
print("New features shape is", features_train.shape)

Pearson r for col 544 is 0.59507792562
Pearson r for col 564 is 0.59725533954
Pearson r for col 567 is 0.69953753481
Pearson r for col 606 is 0.622064164667
New features shape is (1022, 679)


### Feature Selection

In [2707]:
def my_mutual_info(X, y):
    return mutual_info_regression(X, y, random_state=23)

In [2708]:
%%time

for num in list(range(10, 301, 5)):
    sel = SelectKBest(score_func=my_mutual_info, k=num)
    sel.fit(features_train, labels_train.reshape(-1))
    
    best_features_train = sel.transform(features_train)
    best_features_test = sel.transform(features_test)
    
    clf = GradientBoostingRegressor(n_estimators=250, min_samples_split=5, min_samples_leaf=2, random_state=23)
    #clf = xgboost.XGBRegressor(max_depth=4, n_estimators=300)
    
    clf.fit(best_features_train, labels_train.reshape(-1))

    prediction = clf.predict(best_features_test)

    print("Num of features:", num, "and the error:", mean_absolute_error(labels_test, prediction))

Num of features: 10 and the error: 0.0281051605919
Num of features: 15 and the error: 0.026373524992
Num of features: 20 and the error: 0.0252935764034
Num of features: 25 and the error: 0.0228834465052
Num of features: 30 and the error: 0.0228548795961
Num of features: 35 and the error: 0.0220050909192
Num of features: 40 and the error: 0.0214715726029
Num of features: 45 and the error: 0.0217378356126
Num of features: 50 and the error: 0.021290717931
Num of features: 55 and the error: 0.0219111232816
Num of features: 60 and the error: 0.0215221542437
Num of features: 65 and the error: 0.0215535434889
Num of features: 70 and the error: 0.0212812832387
Num of features: 75 and the error: 0.0215981896786
Num of features: 80 and the error: 0.0211061671218
Num of features: 85 and the error: 0.021094033704
Num of features: 90 and the error: 0.0210766351066
Num of features: 95 and the error: 0.0208754660513
Num of features: 100 and the error: 0.0205909384025
Num of features: 105 and the erro

#### Наилучший результат GBM = 0.01999 на 155 признаках  
XGB работает похуже

In [2709]:
sel = SelectKBest(score_func=my_mutual_info, k=155)
sel.fit(features_train, labels_train.reshape(-1))

best_features_train = sel.transform(features_train)
best_features_test = sel.transform(features_test)

#### Прогоним алгоритмы на сокращенном наборе и усредним

In [2710]:
results = pd.DataFrame()
results['Target'] = labels_test.reshape(-1)

In [2696]:
%%time

clf = choose_GBM(best_features_train, labels_train)

clf.fit(best_features_train, labels_train.reshape(-1))
prediction = clf.predict(best_features_test)

print(mean_absolute_error(labels_test, prediction))


-0.0234871402589
{'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 350}
0.0202941592297
Wall time: 2min 27s


In [2711]:
clf = GradientBoostingRegressor(n_estimators=250, min_samples_split=5, min_samples_leaf=2, random_state=23)
clf.fit(best_features_train, labels_train.reshape(-1))

prediction = clf.predict(best_features_test)

print(mean_absolute_error(labels_test, prediction))

results['GBM'] = prediction.reshape(-1)

0.0199961119705


In [2712]:
clf = choose_XGB(best_features_train, labels_train)

clf.fit(best_features_train, labels_train.reshape(-1))
prediction = clf.predict(best_features_test)

print(mean_absolute_error(labels_test, prediction))

results['XGB'] = prediction.reshape(-1)

-0.0240306834833
{'max_depth': 4, 'n_estimators': 300}
0.0210248258735


In [2713]:
clf = choose_KernelRidge(best_features_train, labels_train)

clf.fit(best_features_train, labels_train.reshape(-1))
prediction = clf.predict(best_features_test)

print(mean_absolute_error(labels_test, prediction))

results['K_Ridge'] = prediction.reshape(-1)

Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 5.2198795484543084e-21
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 5.219879548454294e-21


-0.0247435408483
{'alpha': 1.0, 'gamma': 0.021544346900318832, 'kernel': 'polynomial'}
0.0249165079801


In [2717]:
average = np.mean(results[['GBM', 'K_Ridge', 'XGB']], axis=1)

print("Average vs Target", mean_absolute_error(results.Target, average))
print("GBM vs Target", mean_absolute_error(results.Target, results.GBM))

Average vs Target 0.0200525379468
GBM vs Target 0.0199961119705


#### Усреднение на сокращенном наборе признаков не даёт заметного преимущества

### Feature Engineering 4: use Cluster # as additional feature

In [225]:
%%time

if (False):
    for num_of_clusters in range(2, 32):
        k_means = KMeans(n_clusters=num_of_clusters, random_state=23, n_jobs=-1,)
        k_means.fit(features_train)

        prediction_train = k_means.predict(features_train)
        prediction_test = k_means.predict(features_test)

        features_train_cluster = np.concatenate((features_train, prediction_train.reshape(-1, 1)), axis=1)
        features_test_cluster = np.concatenate((features_test, prediction_test.reshape(-1, 1)), axis=1)

        clf = GradientBoostingRegressor(n_estimators=300, min_samples_split=5, min_samples_leaf=2, random_state=23)
        #clf = xgboost.XGBRegressor(max_depth=3, n_estimators=350)
        clf.fit(features_train_cluster, labels_train.reshape(-1))

        prediction = clf.predict(features_test_cluster)

        print("Number of clusters:", num_of_clusters, ", and the error is", mean_absolute_error(labels_test, prediction))    

Wall time: 0 ns


In [226]:
if (False):
    for num_of_clusters in range(2, 32):
        k_means = KMeans(n_clusters=num_of_clusters, random_state=23, n_jobs=-1)
        k_means.fit(features)
        prediction = k_means.predict(features)

        features_cluster = np.concatenate((features, prediction.reshape(-1, 1)), axis=1)

        f_train, f_test, l_train, l_test = train_test_split(features_cluster, labels, test_size=0.3, random_state=23)

        clf = GradientBoostingRegressor(n_estimators=300, min_samples_split=5, min_samples_leaf=2, random_state=23)
        clf.fit(f_train, l_train.reshape(-1))
        prediction = clf.predict(f_test)

        print("Number of clusters:", num_of_clusters, ", and the error is", mean_absolute_error(l_test, prediction))

In [227]:
if (use_fe_4):
    k_means = KMeans(n_clusters=22, random_state=23, n_jobs=-1)
    k_means.fit(features)
    prediction = k_means.predict(features)
    features = np.concatenate((features, prediction.reshape(-1, 1)), axis=1)