In [58]:
import pandas as pd
import numpy as np
import math as mt

from sklearn.metrics import accuracy_score, r2_score, mean_squared_error, mean_absolute_error
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_curve, roc_auc_score

from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.cluster import KMeans
from sklearn.linear_model import Lasso, Ridge, LogisticRegression, LinearRegression
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR

from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

import xgboost

from scipy.stats import pearsonr

from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder, LabelBinarizer, PolynomialFeatures

%pylab inline

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [59]:
#
# Helpers
#
def write_to_submission_file(predicted_labels, sale_ids, out_file="submission.csv", target='SalePrice', index_label="Id"):
    
    # turn predictions into data frame and save as csv file
    predicted_df = pd.DataFrame(predicted_labels,
                                index = sale_ids,
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [60]:
# Load House train and test data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [61]:
# Put the labels aside
labels_orig = train.SalePrice.as_matrix().reshape(-1, 1)

In [62]:
# This will be needed after pre-processing
size_of_train = train.shape[0]
print(size_of_train)

# Merge datasets
del train['SalePrice']

train = pd.concat([train, test])

1460


In [63]:
# Which columns have NaN values?
count = 0

for col in train.columns:
    if(train[col].isnull().sum() > 0):
        #print(col)
        count = count + 1
        
print("Total", count, "columns with NaNs")

Total 34 columns with NaNs


In [64]:
# Define variables
cols = []
cols_count = 0

# Handling years as categorial 
use_fe_2 = True

# Using kind of a total sum of square feet
use_fe_3 = True

# Using new feature of Clustering
use_fe_4 = False

### Data Preprocessing

In [65]:
train.iloc[:, :9].head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl


In [66]:
# MSSubClass
enc = OneHotEncoder(sparse=False)
features = enc.fit_transform(train.MSSubClass.values.reshape(-1, 1))
cols_count = cols_count + 1

# MSZoning
#train.MSZoning.fillna("RM", inplace=True)
train.MSZoning.fillna("Unknown", inplace=True)
features = np.concatenate( [features, pd.get_dummies(train.MSZoning).as_matrix()], axis=1 )
cols_count = cols_count + 1

# LotFrontage
train.LotFrontage.fillna(0.0, inplace=True)
features = np.concatenate( [features, train.LotFrontage.as_matrix().reshape(-1, 1)], axis=1 )
cols_count = cols_count + 1

# LotArea
features = np.concatenate( [features, train.LotArea.as_matrix().reshape(-1, 1)], axis=1 )
cols_count = cols_count + 1

# Street
features = np.concatenate( [features, pd.get_dummies(train.Street).as_matrix()], axis=1 )
cols_count = cols_count + 1

# Alley
train.Alley.fillna("NA", inplace=True)
features = np.concatenate( [features, pd.get_dummies(train.Alley).as_matrix()], axis=1 )
cols_count = cols_count + 1

# LotShape
features = np.concatenate( [features, pd.get_dummies(train.LotShape).as_matrix()], axis=1 )
cols_count = cols_count + 1

# LandContour
features = np.concatenate( [features, pd.get_dummies(train.LandContour).as_matrix()], axis=1 )
cols_count = cols_count + 1

In [67]:
train.iloc[:, 9:19].head()

Unnamed: 0,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond
0,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5
1,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8
2,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5
3,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5
4,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5


In [68]:
# Utilities
#train.Utilities.fillna("AllPub", inplace=True)
train.Utilities.fillna("Unknown", inplace=True)
features = np.concatenate( [features, pd.get_dummies(train.Utilities).as_matrix()], axis=1 )
cols_count = cols_count + 1

# LotConfig
features = np.concatenate( [features, pd.get_dummies(train.LotConfig).as_matrix()], axis=1 )
cols_count = cols_count + 1

# LandSlope
features = np.concatenate( [features, pd.get_dummies(train.LandSlope).as_matrix()], axis=1 )
cols_count = cols_count + 1

# Neighborhood 
features = np.concatenate( [features, pd.get_dummies(train.Neighborhood).as_matrix()], axis=1 )
cols_count = cols_count + 1

# Condition1
features = np.concatenate( [features, pd.get_dummies(train.Condition1).as_matrix()], axis=1 )
cols_count = cols_count + 1

# Condition2
features = np.concatenate( [features, pd.get_dummies(train.Condition2).as_matrix()], axis=1 )
cols_count = cols_count + 1

# BldgType
features = np.concatenate( [features, pd.get_dummies(train.BldgType).as_matrix()], axis=1 )
cols_count = cols_count + 1

# HouseStyle
features = np.concatenate( [features, pd.get_dummies(train.HouseStyle).as_matrix()], axis=1 )
cols_count = cols_count + 1

# OverallQual
#enc = OneHotEncoder(sparse=False)
#features = np.concatenate( [features, enc.fit_transform(train.OverallQual.values.reshape(-1, 1))], axis=1 )
features = np.concatenate( [features, train.OverallQual.as_matrix().reshape(-1, 1)], axis=1 )
cols_count = cols_count + 1

# OverallCond
#enc = OneHotEncoder(sparse=False)
#features = np.concatenate( [features, enc.fit_transform(train.OverallCond.values.reshape(-1, 1))], axis=1 )
features = np.concatenate( [features, train.OverallCond.as_matrix().reshape(-1, 1)], axis=1 )
cols_count = cols_count + 1

In [69]:
train.iloc[:, 19:29].head()

Unnamed: 0,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond
0,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA
1,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA
2,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA
3,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA
4,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA


In [70]:
# YearBuilt
#if (use_fe_2):
#    features = np.concatenate( [features, pd.get_dummies(train.YearBuilt).as_matrix()], axis=1 )
#else:
#    features = np.concatenate( [features, train.YearBuilt.as_matrix().reshape(-1, 1)], axis=1 )
train.HouseAge = train.YrSold - train.YearBuilt
features = np.concatenate( [features, train.HouseAge.as_matrix().reshape(-1, 1)], axis=1 )
cols_count = cols_count + 1

# YearRemodAdd
#if (False):
#    features = np.concatenate( [features, pd.get_dummies(train.YearRemodAdd).as_matrix()], axis=1 )
#else:
#    features = np.concatenate( [features, train.YearRemodAdd.as_matrix().reshape(-1, 1)], axis=1 )
train.AgeSinceRemod = train.YrSold - train.YearRemodAdd
features = np.concatenate( [features, train.AgeSinceRemod.as_matrix().reshape(-1, 1)], axis=1 )
cols_count = cols_count + 1

# RoofStyle
features = np.concatenate( [features, pd.get_dummies(train.RoofStyle).as_matrix()], axis=1 )
cols_count = cols_count + 1

# RoofMatl
features = np.concatenate( [features, pd.get_dummies(train.RoofMatl).as_matrix()], axis=1 )
cols_count = cols_count + 1

# Exterior1st
#train.Exterior1st.fillna("Wd Sdng", inplace=True)
train.Exterior1st.fillna("Unknown", inplace=True)
features = np.concatenate( [features, pd.get_dummies(train.Exterior1st).as_matrix()], axis=1 )
cols_count = cols_count + 1

# Exterior2nd
#train.Exterior2nd.fillna("Wd Sdng", inplace=True)
train.Exterior2nd.fillna("Unknown", inplace=True)
features = np.concatenate( [features, pd.get_dummies(train.Exterior2nd).as_matrix()], axis=1 )
cols_count = cols_count + 1

# MasVnrType
#train.MasVnrType.fillna("None", inplace=True)
train.MasVnrType.fillna("Unknown", inplace=True)
features = np.concatenate( [features, pd.get_dummies(train.MasVnrType).as_matrix()], axis=1 )
cols_count = cols_count + 1

# MasVnrArea
train.MasVnrArea.fillna(0.0, inplace=True)
features = np.concatenate( [features, pd.get_dummies(train.MasVnrArea).as_matrix()], axis=1 )
cols_count = cols_count + 1

# ExterQual
di = {"Ex": 5.0, "Gd": 4.0, "TA": 3.0, "Fa": 2.0, "Po": 1.0}
train["ExterQual"].replace(di, inplace=True)
#features = np.concatenate( [features, pd.get_dummies(train.ExterQual).as_matrix()], axis=1 )
features = np.concatenate( [features, train.ExterQual.as_matrix().reshape(-1, 1)], axis=1 )
cols_count = cols_count + 1

# ExterCond
di = {"Ex": 5.0, "Gd": 4.0, "TA": 3.0, "Fa": 2.0, "Po": 1.0}
train["ExterCond"].replace(di, inplace=True)
#features = np.concatenate( [features, pd.get_dummies(train.ExterCond).as_matrix()], axis=1 )
features = np.concatenate( [features, train.ExterCond.as_matrix().reshape(-1, 1)], axis=1 )
cols_count = cols_count + 1

In [71]:
train.iloc[:, 29:39].head()

Unnamed: 0,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF
0,PConc,Gd,TA,No,GLQ,706.0,Unf,0.0,150.0,856.0
1,CBlock,Gd,TA,Gd,ALQ,978.0,Unf,0.0,284.0,1262.0
2,PConc,Gd,TA,Mn,GLQ,486.0,Unf,0.0,434.0,920.0
3,BrkTil,TA,Gd,No,ALQ,216.0,Unf,0.0,540.0,756.0
4,PConc,Gd,TA,Av,GLQ,655.0,Unf,0.0,490.0,1145.0


In [72]:
# Foundation
features = np.concatenate( [features, pd.get_dummies(train.Foundation).as_matrix()], axis=1 )
cols_count = cols_count + 1

# BsmtQual
train.BsmtQual.fillna("NA", inplace=True)
di = {"Ex": 5.0, "Gd": 4.0, "TA": 3.0, "Fa": 2.0, "Po": 1.0, "NA": 0.0}
train["BsmtQual"].replace(di, inplace=True)
#features = np.concatenate( [features, pd.get_dummies(train.BsmtQual).as_matrix()], axis=1 )
features = np.concatenate( [features, train.BsmtQual.as_matrix().reshape(-1, 1)], axis=1 )
cols_count = cols_count + 1

# BsmtCond
train.BsmtCond.fillna("NA", inplace=True)
di = {"Ex": 5.0, "Gd": 4.0, "TA": 3.0, "Fa": 2.0, "Po": 1.0, "NA": 0.0}
train["BsmtCond"].replace(di, inplace=True)
#features = np.concatenate( [features, pd.get_dummies(train.BsmtCond).as_matrix()], axis=1 )
features = np.concatenate( [features, train.BsmtCond.as_matrix().reshape(-1, 1)], axis=1 )
cols_count = cols_count + 1

# BsmtExposure
train.BsmtExposure.fillna("NA", inplace=True)
features = np.concatenate( [features, pd.get_dummies(train.BsmtExposure).as_matrix()], axis=1 )
cols_count = cols_count + 1

# BsmtFinType1
train.BsmtFinType1.fillna("NA", inplace=True)
features = np.concatenate( [features, pd.get_dummies(train.BsmtFinType1).as_matrix()], axis=1 )
cols_count = cols_count + 1

# BsmtFinSF1
train.BsmtFinSF1.fillna(0.0, inplace=True)
features = np.concatenate( [features, train.BsmtFinSF1.as_matrix().reshape(-1, 1)], axis=1 )
cols_count = cols_count + 1

# BsmtFinType2
train.BsmtFinType2.fillna("NA", inplace=True)
features = np.concatenate( [features, pd.get_dummies(train.BsmtFinType2).as_matrix()], axis=1 )
cols_count = cols_count + 1

# BsmtFinSF2
train.BsmtFinSF2.fillna(0.0, inplace=True)
features = np.concatenate( [features, train.BsmtFinSF2.as_matrix().reshape(-1, 1)], axis=1 )
cols_count = cols_count + 1

# BsmtUnfSF
train.BsmtUnfSF.fillna(0.0, inplace=True)
features = np.concatenate( [features, train.BsmtUnfSF.as_matrix().reshape(-1, 1)], axis=1 )
cols_count = cols_count + 1

# TotalBsmtSF
train.TotalBsmtSF.fillna(0.0, inplace=True)
features = np.concatenate( [features, train.TotalBsmtSF.as_matrix().reshape(-1, 1)], axis=1 )
cols_count = cols_count + 1

In [73]:
train.iloc[:, 39:49].head()

Unnamed: 0,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath
0,GasA,Ex,Y,SBrkr,856,854,0,1710,1.0,0.0
1,GasA,Ex,Y,SBrkr,1262,0,0,1262,0.0,1.0
2,GasA,Ex,Y,SBrkr,920,866,0,1786,1.0,0.0
3,GasA,Gd,Y,SBrkr,961,756,0,1717,1.0,0.0
4,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1.0,0.0


In [74]:
# Heating
features = np.concatenate( [features, pd.get_dummies(train.Heating).as_matrix()], axis=1 )
cols_count = cols_count + 1

# HeatingQC
#features = np.concatenate( [features, pd.get_dummies(train.HeatingQC).as_matrix()], axis=1 )
di = {"Ex": 5.0, "Gd": 4.0, "TA": 3.0, "Fa": 2.0, "Po": 1.0}
train["HeatingQC"].replace(di, inplace=True)
features = np.concatenate( [features, train.HeatingQC.as_matrix().reshape(-1, 1)], axis=1 )
cols_count = cols_count + 1

# CentralAir
features = np.concatenate( [features, pd.get_dummies(train.CentralAir).as_matrix()], axis=1 )
cols_count = cols_count + 1

# Electrical
train.Electrical.fillna("Unknown", inplace=True)
features = np.concatenate( [features, pd.get_dummies(train.Electrical).as_matrix()], axis=1 )
cols_count = cols_count + 1

# 1stFlrSF
features = np.concatenate( [features, train['1stFlrSF'].as_matrix().reshape(-1, 1)], axis=1 )
cols_count = cols_count + 1

# 2ndFlrSF
features = np.concatenate( [features, train['2ndFlrSF'].as_matrix().reshape(-1, 1)], axis=1 )
cols_count = cols_count + 1

# LowQualFinSF
features = np.concatenate( [features, train.LowQualFinSF.as_matrix().reshape(-1, 1)], axis=1 )
cols_count = cols_count + 1

# GrLivArea
features = np.concatenate( [features, train.GrLivArea.as_matrix().reshape(-1, 1)], axis=1 )
cols_count = cols_count + 1

# BsmtFullBath
train.BsmtFullBath.fillna(0.0, inplace=True)
features = np.concatenate( [features, train.BsmtFullBath.as_matrix().reshape(-1, 1)], axis=1 )
cols_count = cols_count + 1

# BsmtHalfBath
train.BsmtHalfBath.fillna(0.0, inplace=True)
features = np.concatenate( [features, train.BsmtHalfBath.as_matrix().reshape(-1, 1)], axis=1 )
cols_count = cols_count + 1

In [75]:
train.iloc[:, 49:59].head()

Unnamed: 0,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType
0,2,1,3,1,Gd,8,Typ,0,,Attchd
1,2,0,3,1,TA,6,Typ,1,TA,Attchd
2,2,1,3,1,Gd,6,Typ,1,TA,Attchd
3,1,0,3,1,Gd,7,Typ,1,Gd,Detchd
4,2,1,4,1,Gd,9,Typ,1,TA,Attchd


In [76]:
# FullBath
features = np.concatenate( [features, train.FullBath.as_matrix().reshape(-1, 1)], axis=1 )
cols_count = cols_count + 1

# HalfBath
features = np.concatenate( [features, train.HalfBath.as_matrix().reshape(-1, 1)], axis=1 )
cols_count = cols_count + 1

# BedroomAbvGr
features = np.concatenate( [features, train.BedroomAbvGr.as_matrix().reshape(-1, 1)], axis=1 )
cols_count = cols_count + 1

# KitchenAbvGr
features = np.concatenate( [features, train.KitchenAbvGr.as_matrix().reshape(-1, 1)], axis=1 )
cols_count = cols_count + 1

# KitchenQual
#train.KitchenQual.fillna("TA", inplace=True)
train.KitchenQual.fillna("Unknown", inplace=True)
#features = np.concatenate( [features, pd.get_dummies(train.KitchenQual).as_matrix()], axis=1 )
di = {"Ex": 5.0, "Gd": 4.0, "TA": 3.0, "Fa": 2.0, "Po": 1.0, "Unknown": 0.0}
train["KitchenQual"].replace(di, inplace=True)
features = np.concatenate( [features, train.KitchenQual.as_matrix().reshape(-1, 1)], axis=1 )
cols_count = cols_count + 1

# TotRmsAbvGrd
features = np.concatenate( [features, train.TotRmsAbvGrd.as_matrix().reshape(-1, 1)], axis=1 )
cols_count = cols_count + 1

# Functional
#train.Functional.fillna("Typ", inplace=True)
train.Functional.fillna("Unknown", inplace=True)
features = np.concatenate( [features, pd.get_dummies(train.Functional).as_matrix()], axis=1 )
cols_count = cols_count + 1

# Fireplaces
features = np.concatenate( [features, train.Fireplaces.as_matrix().reshape(-1, 1)], axis=1 )
cols_count = cols_count + 1

# FireplaceQu
train.FireplaceQu.fillna("NA", inplace=True)
features = np.concatenate( [features, pd.get_dummies(train.FireplaceQu).as_matrix()], axis=1 )
cols_count = cols_count + 1

# GarageType
train.GarageType.fillna("NA", inplace=True)
features = np.concatenate( [features, pd.get_dummies(train.GarageType).as_matrix()], axis=1 )
cols_count = cols_count + 1

In [77]:
train.iloc[:, 59:69].head()

Unnamed: 0,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch
0,2003.0,RFn,2.0,548.0,TA,TA,Y,0,61,0
1,1976.0,RFn,2.0,460.0,TA,TA,Y,298,0,0
2,2001.0,RFn,2.0,608.0,TA,TA,Y,0,42,0
3,1998.0,Unf,3.0,642.0,TA,TA,Y,0,35,272
4,2000.0,RFn,3.0,836.0,TA,TA,Y,192,84,0


In [78]:
# GarageYrBlt
train.GarageYrBlt.fillna(train.YearBuilt, inplace=True)

#if (False):
#    features = np.concatenate( [features, pd.get_dummies(train.GarageYrBlt).as_matrix()], axis=1 )
#else:
#    features = np.concatenate( [features, train.GarageYrBlt.as_matrix().reshape(-1, 1)], axis=1 )

train.GarageAge = train.YrSold - train.GarageYrBlt
features = np.concatenate( [features, train.GarageAge.as_matrix().reshape(-1, 1)], axis=1 )
cols_count = cols_count + 1

# GarageFinish
train.GarageFinish.fillna("NA", inplace=True)
features = np.concatenate( [features, pd.get_dummies(train.GarageFinish).as_matrix()], axis=1 )
cols_count = cols_count + 1

# GarageCars
train.GarageCars.fillna(0.0, inplace=True)
features = np.concatenate( [features, train.GarageCars.as_matrix().reshape(-1, 1)], axis=1 )
cols_count = cols_count + 1

# GarageArea
train.GarageArea.fillna(0.0, inplace=True)
features = np.concatenate( [features, train.GarageArea.as_matrix().reshape(-1, 1)], axis=1 )
cols_count = cols_count + 1

# GarageQual
train.GarageQual.fillna("NA", inplace=True)
#features = np.concatenate( [features, pd.get_dummies(train.GarageQual).as_matrix()], axis=1 )
di = {"Ex": 5.0, "Gd": 4.0, "TA": 3.0, "Fa": 2.0, "Po": 1.0, "NA": 0.0}
train["GarageQual"].replace(di, inplace=True)
features = np.concatenate( [features, train.GarageQual.as_matrix().reshape(-1, 1)], axis=1 )
cols_count = cols_count + 1

# GarageCond
train.GarageCond.fillna("NA", inplace=True)
#features = np.concatenate( [features, pd.get_dummies(train.GarageCond).as_matrix()], axis=1 )
di = {"Ex": 5.0, "Gd": 4.0, "TA": 3.0, "Fa": 2.0, "Po": 1.0, "NA": 0.0}
train["GarageCond"].replace(di, inplace=True)
features = np.concatenate( [features, train.GarageCond.as_matrix().reshape(-1, 1)], axis=1 )
cols_count = cols_count + 1

# PavedDrive
features = np.concatenate( [features, pd.get_dummies(train.PavedDrive).as_matrix()], axis=1 )
cols_count = cols_count + 1

# WoodDeckSF
features = np.concatenate( [features, train.WoodDeckSF.as_matrix().reshape(-1, 1)], axis=1 )
cols_count = cols_count + 1

# OpenPorchSF
features = np.concatenate( [features, train.OpenPorchSF.as_matrix().reshape(-1, 1)], axis=1 )
cols_count = cols_count + 1

# EnclosedPorch
features = np.concatenate( [features, train.EnclosedPorch.as_matrix().reshape(-1, 1)], axis=1 )
cols_count = cols_count + 1

In [79]:
train.iloc[:, 69:].head()

Unnamed: 0,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,0,0,0,,,,0,2,2008,WD,Normal
1,0,0,0,,,,0,5,2007,WD,Normal
2,0,0,0,,,,0,9,2008,WD,Normal
3,0,0,0,,,,0,2,2006,WD,Abnorml
4,0,0,0,,,,0,12,2008,WD,Normal


In [80]:
# 3SsnPorch
features = np.concatenate( [features, train['3SsnPorch'].as_matrix().reshape(-1, 1)], axis=1 )
cols_count = cols_count + 1

# ScreenPorch
features = np.concatenate( [features, train.ScreenPorch.as_matrix().reshape(-1, 1)], axis=1 )
cols_count = cols_count + 1

# PoolArea
features = np.concatenate( [features, train.PoolArea.as_matrix().reshape(-1, 1)], axis=1 )
cols_count = cols_count + 1

# PoolQC
train.PoolQC.fillna("NA", inplace=True)
#features = np.concatenate( [features, pd.get_dummies(train.PoolQC).as_matrix()], axis=1 )
di = {"Ex": 5.0, "Gd": 4.0, "TA": 3.0, "Fa": 2.0, "Po": 1.0, "NA": 0.0}
train["PoolQC"].replace(di, inplace=True)
features = np.concatenate( [features, train.PoolQC.as_matrix().reshape(-1, 1)], axis=1 )
cols_count = cols_count + 1

# Fence
train.Fence.fillna("NA", inplace=True)
features = np.concatenate( [features, pd.get_dummies(train.Fence).as_matrix()], axis=1 )
cols_count = cols_count + 1

# MiscFeature
train.MiscFeature.fillna("NA", inplace=True)
features = np.concatenate( [features, pd.get_dummies(train.MiscFeature).as_matrix()], axis=1 )
cols_count = cols_count + 1

# MiscVal
features = np.concatenate( [features, train.MiscVal.as_matrix().reshape(-1, 1)], axis=1 )
cols_count = cols_count + 1

# MoSold
features = np.concatenate( [features, pd.get_dummies(train.MoSold).as_matrix()], axis=1 )
cols_count = cols_count + 1

# YrSold
if (use_fe_2):
    features = np.concatenate( [features, pd.get_dummies(train.YrSold).as_matrix()], axis=1 )
else:
    features = np.concatenate( [features, train.YrSold.as_matrix().reshape(-1, 1)], axis=1 )
    
cols_count = cols_count + 1

# SaleType
#train.SaleType.fillna("WD", inplace=True)
train.SaleType.fillna("Unknown", inplace=True)
features = np.concatenate( [features, pd.get_dummies(train.SaleType).as_matrix()], axis=1 )
cols_count = cols_count + 1

# SaleCondition
features = np.concatenate( [features, pd.get_dummies(train.SaleCondition).as_matrix()], axis=1 )
cols_count = cols_count + 1


#### Total Living Area  

In [81]:
# LivingAreaSF
if (use_fe_3):
    train['LivingAreaSF'] = train['1stFlrSF'] + train['2ndFlrSF'] + train['TotalBsmtSF']
    features = np.concatenate( [features, train.LivingAreaSF.as_matrix().reshape(-1, 1)], axis=1 )
    cols_count = cols_count + 1

### Scaling

In [82]:
# Prepare labels
labels = labels_orig

In [83]:
# First scale labels
labels = labels.astype(float)
labels_max = labels.max()

labels = labels / labels_max
     
count = 0
    
for jj in range(features.shape[1]):
    if((features[:, jj] > 25.).sum() > 0):
        mx = float(features[:, jj].max())

        features[:, jj] = features[:, jj] / mx
        count = count + 1
        
print("Total", count, "features scaled")

Total 22 features scaled


### Подготовка наборов для обучения и тестирования

In [84]:
# Split for train and test sets
features_train = features[:size_of_train, :]
labels_train = labels
features_test = features[size_of_train:, :]

print("Total features processed:", cols_count)
print("")
print(features_train.shape)
print(labels_train.shape)
print(features_test.shape)

Total features processed: 80

(1460, 749)
(1460, 1)
(1459, 749)


## Algorithm Selection  
### Baseline

GradientBoostingRegressor(n_estimators=300, min_samples_split=5, min_samples_leaf=2, random_state=23)  
На полном наборе данных + общая площадь:  

#### 0.020075944788

In [87]:
#
# For Averaging
#

results = pd.DataFrame()
#results['Target'] = labels_train.reshape(-1)

#overfit = pd.DataFrame()
#overfit['Target'] = labels_train.reshape(-1)

### Gradient Boosting

In [325]:
def choose_GBM(X, y):
    GBR = GradientBoostingRegressor(random_state=23)

    parameters_grid = {
        "n_estimators": [250, 300, 350],
        "min_samples_split": [2, 3, 4, 5, 6],
        "min_samples_leaf": [1, 2, 3, 4]
    }

    gcv = GridSearchCV(GBR, parameters_grid, scoring='neg_mean_absolute_error')

    gcv.fit(X, y.reshape(-1))

    print(gcv.best_score_)
    print(gcv.best_params_)
    
    return gcv.best_estimator_

In [326]:
%%time

clf = choose_GBM(features_train, labels_train)

clf.fit(features_train, labels_train.reshape(-1))
prediction = clf.predict(features_test)

print(mean_absolute_error(labels_test, prediction))

results['GBM'] = prediction.reshape(-1)

-0.022993850418
{'min_samples_leaf': 1, 'min_samples_split': 6, 'n_estimators': 350}
0.0202307205639
Wall time: 6min 14s


In [819]:
clf = GradientBoostingRegressor(n_estimators=300, min_samples_split=5, min_samples_leaf=2, random_state=23)
clf.fit(features_train, labels_train.reshape(-1))
prediction = clf.predict(features_test)

print(mean_absolute_error(labels_test, prediction))

results['GBM'] = prediction.reshape(-1)

0.0201937590676


In [85]:
%%time
res = []

tmp_f = np.array(features_train)
tmp_l = np.array(labels_train)

for i in range(50):
    clf = GradientBoostingRegressor(n_estimators=300, min_samples_split=5, min_samples_leaf=2, random_state=23)
    clf.fit(tmp_f, tmp_l.reshape(-1))
    prediction = clf.predict(tmp_f).reshape(-1, 1)

    res.append(mean_absolute_error(tmp_l, prediction))
    
    if (i == 46):
        break

    tmp1 = np.abs(tmp_l - prediction)
    d = tmp1.argmax()

    tmp_f = np.delete(tmp_f, d, 0)
    tmp_l = np.delete(tmp_l, d, 0)
    
print(min(res), argmin(res))

0.00855643408885 46
Wall time: 5min 39s


In [100]:
prediction = clf.predict(features_test)
results['Best_GBM-46'] = prediction.reshape(-1)

In [39]:
prediction = clf.predict(features_test)

# Baseline Submission
write_to_submission_file(prediction * labels_max, test.Id, out_file="submission.GBM.csv", target='SalePrice', index_label="Id")

In [46]:
clf = GradientBoostingRegressor(n_estimators=300, min_samples_split=5, min_samples_leaf=2, random_state=23)
clf.fit(tmp_f, tmp_l.reshape(-1))
prediction = clf.predict(tmp_f).reshape(-1, 1)

print(mean_absolute_error(tmp_l, prediction))

results['Target'] = tmp_l.reshape(-1)
results['GBM'] = prediction

0.00855643408885


### Decision Tree

In [588]:
%%time
clf = DecisionTreeRegressor(random_state=23)

parameters_grid = {
    "max_depth": [2, 3, 4, 5, 6, 7, 8, 9, 10],
    "min_samples_split": [2, 3, 4, 5, 6, 7, 8, 9],
    "min_samples_leaf": [1, 2, 3, 4, 5, 6, 7],
}

gcv = GridSearchCV(clf, parameters_grid, scoring='neg_mean_absolute_error')

gcv.fit(features_train, labels_train.reshape(-1))

print(gcv.best_score_)
print(gcv.best_params_)

-0.0327128646758
{'max_depth': 6, 'min_samples_leaf': 7, 'min_samples_split': 2}
Wall time: 28.8 s


### Random Forest

In [772]:
%%time
RFR = RandomForestRegressor(n_jobs=-1, random_state=23)

parameters_grid = {
    "n_estimators": [200, 250, 300, 350, 400],
    "min_samples_split": [2, 3, 4, 5, 6],
    "min_samples_leaf": [1, 2, 3, 4]
}

gcv = GridSearchCV(RFR, parameters_grid, scoring='neg_mean_absolute_error')

gcv.fit(features, labels.reshape(-1))

print(gcv.best_score_)
print(gcv.best_params_)

-0.0229257649225
{'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 350}
Wall time: 14min 34s


In [47]:
#clf = gcv.best_estimator_
clf = RandomForestRegressor(n_estimators=350, min_samples_leaf=2, min_samples_split=2, n_jobs=-1, random_state=23)
clf.fit(tmp_f, tmp_l.reshape(-1))
prediction = clf.predict(tmp_f)

print(mean_absolute_error(tmp_l, prediction))

results['RF'] = prediction.reshape(-1)

0.00849999180324


### K Nearest Neighbors

In [3120]:
%%time
clf = KNeighborsRegressor(n_jobs=-1)

parameters_grid = {
    "n_neighbors": list(range(1, 15)),
}

gcv = GridSearchCV(clf, parameters_grid, scoring='neg_mean_absolute_error')

gcv.fit(features_train, labels_train.reshape(-1))

print(gcv.best_score_)
print(gcv.best_params_)

-0.0315106462462
{'n_neighbors': 4}
Wall time: 13.7 s


In [3121]:
clf = gcv.best_estimator_
clf.fit(features_train, labels_train.reshape(-1))
prediction = clf.predict(features_test)
results['KNN'] = prediction.reshape(-1)

### Ridge

In [522]:
%%time
clf = Ridge(random_state=23)

parameters_grid = {
    "alpha": [0.1, 1.0, 2.0, 2.5, 5.0, 10.0],
}

gcv = GridSearchCV(clf, parameters_grid, scoring='neg_mean_absolute_error')

gcv.fit(features_train, labels_train.reshape(-1))

print(gcv.best_score_)
print(gcv.best_params_)

-0.0294672954111
{'alpha': 10.0}
Wall time: 868 ms


In [3123]:
clf = gcv.best_estimator_
clf.fit(features_train, labels_train.reshape(-1))
prediction = clf.predict(features_test)
results['Ridge'] = prediction.reshape(-1)

### Kernel Ridge

In [48]:
def choose_KernelRidge(X, y):
    clf = KernelRidge()

    parameters_grid = {
        "kernel": ['polynomial', 'rbf'], 
        "alpha": [1e0, 0.1, 1e-2, 1e-3],
        "gamma": np.logspace(-3, 3, 10)    
    }

    gcv = GridSearchCV(clf, parameters_grid, scoring='neg_mean_absolute_error')

    gcv.fit(X, y.reshape(-1))

    print(gcv.best_score_)
    print(gcv.best_params_)
    
    return gcv.best_estimator_

In [56]:
clf = choose_KernelRidge(tmp_f, tmp_l)

clf.fit(tmp_f, tmp_l.reshape(-1))
prediction = clf.predict(tmp_f)

print(mean_absolute_error(tmp_l, prediction))

results['K_Ridge'] = prediction.reshape(-1)

0.00397581049921


In [57]:
prediction = clf.predict(features_test)

# Baseline Submission
write_to_submission_file(prediction * labels_max, test.Id, out_file="submission.KRG.csv", target='SalePrice', index_label="Id")

### SVR

In [3126]:
%%time
clf = SVR()

parameters_grid = {
    "kernel": ['rbf', 'poly'],
    "C": [1e0, 1e1, 1e2, 1e3],
    "gamma": np.logspace(-2, 2, 5)
}

gcv = GridSearchCV(clf, parameters_grid, scoring='neg_mean_absolute_error')

gcv.fit(features_train, labels_train.reshape(-1))

print(gcv.best_score_)
print(gcv.best_params_)

-0.0529154749451
{'C': 10.0, 'gamma': 0.01, 'kernel': 'rbf'}
Wall time: 23.2 s


In [3127]:
clf = gcv.best_estimator_
clf.fit(features_train, labels_train.reshape(-1))
prediction = clf.predict(features_test)
results['SVR'] = prediction.reshape(-1)

### XGBoost

In [525]:
def choose_XGB(X, y):
    XGB = xgboost.XGBRegressor()

    parameters_grid = {
        #"n_estimators": [50, 100, 150, 200, 250, 300],
        "n_estimators": [250, 300, 350],
        #"max_depth": [2, 3, 4, 5, 6, 7, 8],
        "max_depth": [3, 4, 5],
        #"learning_rate": [0.1, 0.05, 0.01]
        #"learning_rate": [0.1],
    }

    gcv = GridSearchCV(XGB, parameters_grid, scoring='neg_mean_absolute_error')

    gcv.fit(X, y.reshape(-1))

    print(gcv.best_score_)
    print(gcv.best_params_)
    
    return gcv.best_estimator_

In [526]:
%%time

clf = choose_XGB(features_train, labels_train)

clf.fit(features_train, labels_train.reshape(-1))
prediction = clf.predict(features_test)

print(mean_absolute_error(labels_test, prediction))

#results['XGB'] = prediction.reshape(-1)

-0.0223130876503
{'max_depth': 4, 'n_estimators': 350}
0.0203756575505
Wall time: 2min 4s


In [50]:
XGB = xgboost.XGBRegressor(max_depth=3, n_estimators=350)
XGB.fit(tmp_f, tmp_l.reshape(-1))

prediction = XGB.predict(tmp_f)

print(mean_absolute_error(tmp_l, prediction))

results['XGB'] = prediction.reshape(-1)

0.00881604693124


### Averaging  
В итоге именно это дало самый сильный прирост  
Усреднил три алгоритма: XGB, GBM (с кластерами), Kernel Ridge

In [51]:
results.head(20)

Unnamed: 0,Target,GBM,RF,K_Ridge,XGB
0,0.276159,0.270008,0.275295,0.270691,0.27303
1,0.240397,0.23285,0.236023,0.242654,0.236462
2,0.296026,0.28409,0.292368,0.293084,0.28403
3,0.18543,0.201781,0.203373,0.198417,0.200532
4,0.331126,0.349993,0.341797,0.343984,0.34541
5,0.189404,0.192066,0.198601,0.195735,0.195272
6,0.406623,0.389847,0.392448,0.396643,0.393723
7,0.264901,0.281341,0.289906,0.264174,0.2815
8,0.172053,0.172429,0.185338,0.172835,0.172715
9,0.156291,0.165322,0.160869,0.156223,0.16787


In [53]:
average = np.mean(results[['GBM', 'RF', 'XGB', 'K_Ridge']], axis=1)

print("Average vs Target", mean_absolute_error(results.Target, average))
print("GBM vs Target", mean_absolute_error(results.Target, results.GBM))
#print("Manual_kNN vs Target", mean_absolute_error(results.Target, results.Manual_kNN))

Average vs Target 0.00678051117954
GBM vs Target 0.00855643408885


In [878]:
tmp = np.mean(results[['GBM', 'K_Ridge']], axis=1)
print("Average vs Target", mean_absolute_error(results.Target, tmp))

Average vs Target 0.0197486344607


In [855]:
results["Manual_kNN"] = pred_2
results["Pred3"] = 0

In [861]:
trsh = 0.06

for ii in range(results.shape[0]):
    knn = results.loc[ii, "Manual_kNN"]
    gbm = results.loc[ii, "GBM"]
    xgb = results.loc[ii, "XGB"]
    rf = results.loc[ii, "RF"]
    krg = results.loc[ii, "K_Ridge"]
    
    if (( np.abs(gbm-knn)/gbm > trsh ) or (np.abs(xgb-knn)/xgb > trsh) or (np.abs(rf-knn)/rf > trsh) or (np.abs(krg-knn)/krg > trsh)):
        results.loc[ii, "Pred3"] = (gbm + xgb + rf + krg) / 4
    else:
        results.loc[ii, "Pred3"] = (gbm + xgb + rf + krg + knn) / 5
    
print("Average vs Target", mean_absolute_error(results.Target, results.Pred3))

Average vs Target 0.0193192488507


### Еще раз поищем "близкие" варианты для "плохих" предсказаний

In [90]:
results.head()

Unnamed: 0,Best_GBM-46
0,0.164089
1,0.21911
2,0.243043
3,0.259845
4,0.242814


In [790]:
sorted(dct.items(), key=lambda x: x[1])

[(679, 4.0572058981167798),
 (460, 4.1037883500069876),
 (48, 4.2623789384980393),
 (397, 4.274175333909576),
 (190, 4.3767181932109667),
 (57, 4.4168297180683345),
 (64, 4.4428056354746825),
 (226, 4.5846531235730303),
 (157, 4.596488056804513),
 (930, 4.6632828726130615),
 (47, 4.6999548322582809),
 (283, 4.7014405236404277),
 (75, 4.7381908611740302),
 (634, 4.7580281455803854),
 (68, 4.7615236139951094),
 (757, 4.784261782273119),
 (15, 4.8071201074491876),
 (140, 4.8135197646579488),
 (974, 4.8370012529865019),
 (997, 4.8398494076218439),
 (212, 4.8497802279082824),
 (625, 4.8538997766605068),
 (994, 4.8723637666820618),
 (311, 4.8766821801573776),
 (672, 4.9114913636506818),
 (633, 4.9362610722979419),
 (767, 4.9406670382759019),
 (1004, 4.9642315957389869),
 (90, 4.9657207627467672),
 (245, 4.9857175656754915),
 (424, 5.0168802429649624),
 (861, 5.017664613541565),
 (870, 5.0507985691112127),
 (851, 5.1177925848144046),
 (155, 5.1377546699455685),
 (539, 5.1561584209492919),
 (1

In [94]:
results['Closest'] = 0.0

cluster_size = 6

for ii in range(features_test.shape[0]):
    
    dct = {}
    
    for iii in range(features_train.shape[0]):
        dct[iii] = np.linalg.norm(features_train[iii, :] - features_test[ii, :])
        
    dd = sorted(dct.items(), key=lambda x: x[1])[:cluster_size]
    
    closest_ix = dd[0][0]
    results.loc[ii, 'Closest'] = labels_train[closest_ix]

In [108]:
results.head(20)

Unnamed: 0,Best_GBM-46,Closest
0,0.164089,0.184106
1,0.21911,0.211921
2,0.243043,0.254305
3,0.259845,0.242649
4,0.242814,0.238411
5,0.226986,0.230464
6,0.232517,0.229139
7,0.225256,0.243046
8,0.257503,0.194702
9,0.168512,0.171523


In [112]:
prediction = np.mean(results, axis=1)

# Submission
write_to_submission_file(prediction.as_matrix() * labels_max, test.Id, out_file="submission.Best_GBM+mkNN.csv", target='SalePrice', index_label="Id")

In [113]:
prediction = results.Closest.as_matrix()

# Submission
write_to_submission_file(prediction * labels_max, test.Id, out_file="submission.mkNN.csv", target='SalePrice', index_label="Id")

In [114]:
results['Shaman'] = 0.0

for i in range(results.shape[0]):
    if (results.loc[i, 'Best_GBM-46'] < results.loc[i, 'Closest']):
        results.loc[i, 'Shaman'] = results.loc[i, 'Best_GBM-46'] - 0.01
    else:
        results.loc[i, 'Shaman'] = results.loc[i, 'Best_GBM-46'] + 0.01
        
results.head(20)

Unnamed: 0,Best_GBM-46,Closest,Shaman
0,0.164089,0.184106,0.154089
1,0.21911,0.211921,0.22911
2,0.243043,0.254305,0.233043
3,0.259845,0.242649,0.269845
4,0.242814,0.238411,0.252814
5,0.226986,0.230464,0.216986
6,0.232517,0.229139,0.242517
7,0.225256,0.243046,0.215256
8,0.257503,0.194702,0.267503
9,0.168512,0.171523,0.158512


In [115]:
prediction = results.Shaman.as_matrix()

# Submission
write_to_submission_file(prediction * labels_max, test.Id, out_file="submission.Shamanstvo.csv", target='SalePrice', index_label="Id")

### Снижение размерности

In [738]:
%%time
gbr = GradientBoostingRegressor(n_estimators=300, min_samples_split=5, min_samples_leaf=2, random_state=23)

pipe = Pipeline([
    ('reduce_dim', PCA()),
    ('classify', gbr)
])

parameters_grid = {
        'reduce_dim__n_components': list(range(10, 625, 5)),
        #'reduce_dim__n_components': [600],
    }
    
gcv = GridSearchCV(pipe, parameters_grid, cv=3, scoring='neg_mean_absolute_error', n_jobs=-1)

gcv.fit(features, labels.reshape(-1))

print(gcv.best_score_)
print(gcv.best_params_)

-0.0273277097972
{'reduce_dim__n_components': 265}
Wall time: 20min 27s


In [766]:
%%time
gbr = GradientBoostingRegressor(min_samples_split=5, min_samples_leaf=2, random_state=23)

parameters_grid = {
        'n_estimators': [250, 300, 350, 400, 450, 500, 550, 600],
    }
    
gcv = GridSearchCV(gbr, parameters_grid, cv=3, scoring='neg_mean_absolute_error', n_jobs=-1)

gcv.fit(features, labels.reshape(-1))

print(gcv.best_score_)
print(gcv.best_params_)

-0.0206620912125
{'n_estimators': 600}
Wall time: 2min 16s


In [2777]:
train[train['Id'] == 432]

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,LivingAreaSF
431,432,50,RM,60.0,5586,Pave,,IR1,Bnk,AllPub,...,,MnPrv,,0,9,2008,ConLD,Abnorml,79900,2099


In [2780]:
tmp1 = train[(train['LivingAreaSF'] >= 2000) & (train['LivingAreaSF'] <= 2200)]
tmp1[['SaleCondition', 'SalePrice']].groupby('SaleCondition').agg([pd.np.min, pd.np.max, pd.np.mean])

Unnamed: 0_level_0,SalePrice,SalePrice,SalePrice
Unnamed: 0_level_1,amin,amax,mean
SaleCondition,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Abnorml,67000,158900,118257.066667
Alloca,89471,89471,89471.0
Family,107000,135000,120000.0
Normal,76500,190000,140262.439716
Partial,113000,179665,160249.571429


In [2778]:
train['SaleCondition'].value_counts()

Normal     1198
Partial     125
Abnorml     101
Family       20
Alloca       12
AdjLand       4
Name: SaleCondition, dtype: int64

In [2776]:
313.0 * 0.19169329

59.999999769999995

### Feature Engineering 1: handling features mostly correlated with target  
Как показала практика, эта штука не помогает, а наоборот: сильно ухудшает

In [201]:
fe_1_cols

[544, 564, 567, 606]

In [202]:
if (use_fe_1):
    for col in fe_1_cols:
        print("Pearson r for col", col, "is", pearsonr(features_train[:, col].reshape(-1, 1), labels_train.reshape(-1, 1))[0][0])
        
        # Train DT
        DTR = DecisionTreeRegressor(random_state=23)
        DTR.fit(features_train[:, col].reshape(-1, 1), labels_train)

        DTR_prediction1 = DTR.predict(features_train[:, col].reshape(-1, 1))
        DTR_prediction2 = DTR.predict(features_test[:, col].reshape(-1, 1))

        features_train = np.concatenate( [features_train, DTR_prediction1.reshape(-1, 1)], axis=1)
        features_test = np.concatenate( [features_test, DTR_prediction2.reshape(-1, 1)], axis=1)
        
print("New features shape is", features_train.shape)

Pearson r for col 544 is 0.59507792562
Pearson r for col 564 is 0.59725533954
Pearson r for col 567 is 0.69953753481
Pearson r for col 606 is 0.622064164667
New features shape is (1022, 679)


### Feature Selection

In [2707]:
def my_mutual_info(X, y):
    return mutual_info_regression(X, y, random_state=23)

In [2708]:
%%time

for num in list(range(10, 301, 5)):
    sel = SelectKBest(score_func=my_mutual_info, k=num)
    sel.fit(features_train, labels_train.reshape(-1))
    
    best_features_train = sel.transform(features_train)
    best_features_test = sel.transform(features_test)
    
    clf = GradientBoostingRegressor(n_estimators=250, min_samples_split=5, min_samples_leaf=2, random_state=23)
    #clf = xgboost.XGBRegressor(max_depth=4, n_estimators=300)
    
    clf.fit(best_features_train, labels_train.reshape(-1))

    prediction = clf.predict(best_features_test)

    print("Num of features:", num, "and the error:", mean_absolute_error(labels_test, prediction))

Num of features: 10 and the error: 0.0281051605919
Num of features: 15 and the error: 0.026373524992
Num of features: 20 and the error: 0.0252935764034
Num of features: 25 and the error: 0.0228834465052
Num of features: 30 and the error: 0.0228548795961
Num of features: 35 and the error: 0.0220050909192
Num of features: 40 and the error: 0.0214715726029
Num of features: 45 and the error: 0.0217378356126
Num of features: 50 and the error: 0.021290717931
Num of features: 55 and the error: 0.0219111232816
Num of features: 60 and the error: 0.0215221542437
Num of features: 65 and the error: 0.0215535434889
Num of features: 70 and the error: 0.0212812832387
Num of features: 75 and the error: 0.0215981896786
Num of features: 80 and the error: 0.0211061671218
Num of features: 85 and the error: 0.021094033704
Num of features: 90 and the error: 0.0210766351066
Num of features: 95 and the error: 0.0208754660513
Num of features: 100 and the error: 0.0205909384025
Num of features: 105 and the erro

#### Наилучший результат GBM = 0.01999 на 155 признаках  
XGB работает похуже

In [2709]:
sel = SelectKBest(score_func=my_mutual_info, k=155)
sel.fit(features_train, labels_train.reshape(-1))

best_features_train = sel.transform(features_train)
best_features_test = sel.transform(features_test)

#### Прогоним алгоритмы на сокращенном наборе и усредним

In [2710]:
results = pd.DataFrame()
results['Target'] = labels_test.reshape(-1)

In [2696]:
%%time

clf = choose_GBM(best_features_train, labels_train)

clf.fit(best_features_train, labels_train.reshape(-1))
prediction = clf.predict(best_features_test)

print(mean_absolute_error(labels_test, prediction))


-0.0234871402589
{'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 350}
0.0202941592297
Wall time: 2min 27s


In [2711]:
clf = GradientBoostingRegressor(n_estimators=250, min_samples_split=5, min_samples_leaf=2, random_state=23)
clf.fit(best_features_train, labels_train.reshape(-1))

prediction = clf.predict(best_features_test)

print(mean_absolute_error(labels_test, prediction))

results['GBM'] = prediction.reshape(-1)

0.0199961119705


In [2712]:
clf = choose_XGB(best_features_train, labels_train)

clf.fit(best_features_train, labels_train.reshape(-1))
prediction = clf.predict(best_features_test)

print(mean_absolute_error(labels_test, prediction))

results['XGB'] = prediction.reshape(-1)

-0.0240306834833
{'max_depth': 4, 'n_estimators': 300}
0.0210248258735


In [2713]:
clf = choose_KernelRidge(best_features_train, labels_train)

clf.fit(best_features_train, labels_train.reshape(-1))
prediction = clf.predict(best_features_test)

print(mean_absolute_error(labels_test, prediction))

results['K_Ridge'] = prediction.reshape(-1)

Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 5.2198795484543084e-21
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 5.219879548454294e-21


-0.0247435408483
{'alpha': 1.0, 'gamma': 0.021544346900318832, 'kernel': 'polynomial'}
0.0249165079801


In [2717]:
average = np.mean(results[['GBM', 'K_Ridge', 'XGB']], axis=1)

print("Average vs Target", mean_absolute_error(results.Target, average))
print("GBM vs Target", mean_absolute_error(results.Target, results.GBM))

Average vs Target 0.0200525379468
GBM vs Target 0.0199961119705


#### Усреднение на сокращенном наборе признаков не даёт заметного преимущества

### Feature Engineering 4: use Cluster # as additional feature

In [225]:
%%time

if (False):
    for num_of_clusters in range(2, 32):
        k_means = KMeans(n_clusters=num_of_clusters, random_state=23, n_jobs=-1,)
        k_means.fit(features_train)

        prediction_train = k_means.predict(features_train)
        prediction_test = k_means.predict(features_test)

        features_train_cluster = np.concatenate((features_train, prediction_train.reshape(-1, 1)), axis=1)
        features_test_cluster = np.concatenate((features_test, prediction_test.reshape(-1, 1)), axis=1)

        clf = GradientBoostingRegressor(n_estimators=300, min_samples_split=5, min_samples_leaf=2, random_state=23)
        #clf = xgboost.XGBRegressor(max_depth=3, n_estimators=350)
        clf.fit(features_train_cluster, labels_train.reshape(-1))

        prediction = clf.predict(features_test_cluster)

        print("Number of clusters:", num_of_clusters, ", and the error is", mean_absolute_error(labels_test, prediction))    

Wall time: 0 ns


In [226]:
if (False):
    for num_of_clusters in range(2, 32):
        k_means = KMeans(n_clusters=num_of_clusters, random_state=23, n_jobs=-1)
        k_means.fit(features)
        prediction = k_means.predict(features)

        features_cluster = np.concatenate((features, prediction.reshape(-1, 1)), axis=1)

        f_train, f_test, l_train, l_test = train_test_split(features_cluster, labels, test_size=0.3, random_state=23)

        clf = GradientBoostingRegressor(n_estimators=300, min_samples_split=5, min_samples_leaf=2, random_state=23)
        clf.fit(f_train, l_train.reshape(-1))
        prediction = clf.predict(f_test)

        print("Number of clusters:", num_of_clusters, ", and the error is", mean_absolute_error(l_test, prediction))

In [227]:
if (use_fe_4):
    k_means = KMeans(n_clusters=22, random_state=23, n_jobs=-1)
    k_means.fit(features)
    prediction = k_means.predict(features)
    features = np.concatenate((features, prediction.reshape(-1, 1)), axis=1)