In [1]:
import pandas as pd
import numpy as np
import scipy.stats 
from sklearn import preprocessing, decomposition, feature_selection, cross_validation, metrics
from sklearn import cluster
from sklearn import linear_model, svm, ensemble
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
import matplotlib.pyplot as plt
import datetime

# COLLECTING DATA

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


# EXPLORING DATA

## Defining Variable Types

In [5]:
nominal = ['MSSubClass','MSZoning','Street','Alley','LotShape','LandContour','Utilities','LotConfig','LandSlope','Neighborhood',
           'Condition1','Condition2','BldgType','HouseStyle','RoofStyle','RoofMatl','Exterior1st','Exterior2nd','MasVnrType',
           'Foundation','Heating','CentralAir','Electrical','Functional','GarageType','GarageFinish','PavedDrive','Fence',
           'MiscFeature','SaleType','SaleCondition']

ordinal = ['OverallQual','OverallCond','ExterQual','ExterCond','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1',
           'BsmtFinType2','HeatingQC','KitchenQual','FireplaceQu','GarageQual','GarageCond','PoolQC','MoSold','YrSold']

continuous = ['LotFrontage','LotArea','MasVnrArea','BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF','1stFlrSF','2ndFlrSF',
              'LowQualFinSF','GrLivArea','GarageArea','WoodDeckSF','OpenPorchSF','EnclosedPorch','3SsnPorch','ScreenPorch',
              'PoolArea','MiscVal','SalePrice']

discrete = ['YearBuilt','YearRemodAdd','BsmtFullBath','BsmtHalfBath','FullBath','HalfBath','BedroomAbvGr','KitchenAbvGr',
            'TotRmsAbvGrd','Fireplaces','GarageYrBlt','GarageCars']

categorical = nominal + ordinal
numeric = continuous + discrete

## Data Types

In [6]:
train.dtypes[:40]

Id                int64
MSSubClass        int64
MSZoning         object
LotFrontage     float64
LotArea           int64
Street           object
Alley            object
LotShape         object
LandContour      object
Utilities        object
LotConfig        object
LandSlope        object
Neighborhood     object
Condition1       object
Condition2       object
BldgType         object
HouseStyle       object
OverallQual       int64
OverallCond       int64
YearBuilt         int64
YearRemodAdd      int64
RoofStyle        object
RoofMatl         object
Exterior1st      object
Exterior2nd      object
MasVnrType       object
MasVnrArea      float64
ExterQual        object
ExterCond        object
Foundation       object
BsmtQual         object
BsmtCond         object
BsmtExposure     object
BsmtFinType1     object
BsmtFinSF1        int64
BsmtFinType2     object
BsmtFinSF2        int64
BsmtUnfSF         int64
TotalBsmtSF       int64
Heating          object
dtype: object

In [7]:
train.dtypes[40:]

HeatingQC         object
CentralAir        object
Electrical        object
1stFlrSF           int64
2ndFlrSF           int64
LowQualFinSF       int64
GrLivArea          int64
BsmtFullBath       int64
BsmtHalfBath       int64
FullBath           int64
HalfBath           int64
BedroomAbvGr       int64
KitchenAbvGr       int64
KitchenQual       object
TotRmsAbvGrd       int64
Functional        object
Fireplaces         int64
FireplaceQu       object
GarageType        object
GarageYrBlt      float64
GarageFinish      object
GarageCars         int64
GarageArea         int64
GarageQual        object
GarageCond        object
PavedDrive        object
WoodDeckSF         int64
OpenPorchSF        int64
EnclosedPorch      int64
3SsnPorch          int64
ScreenPorch        int64
PoolArea           int64
PoolQC            object
Fence             object
MiscFeature       object
MiscVal            int64
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object


## Description

### numbers

In [8]:
train.select_dtypes(exclude=['object']).describe().transpose()



Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Id,1460.0,730.5,421.610009,1.0,365.75,730.5,1095.25,1460.0
MSSubClass,1460.0,56.89726,42.300571,20.0,20.0,50.0,70.0,190.0
LotFrontage,1201.0,70.049958,24.284752,21.0,,,,313.0
LotArea,1460.0,10516.828082,9981.264932,1300.0,7553.5,9478.5,11601.5,215245.0
OverallQual,1460.0,6.099315,1.382997,1.0,5.0,6.0,7.0,10.0
OverallCond,1460.0,5.575342,1.112799,1.0,5.0,5.0,6.0,9.0
YearBuilt,1460.0,1971.267808,30.202904,1872.0,1954.0,1973.0,2000.0,2010.0
YearRemodAdd,1460.0,1984.865753,20.645407,1950.0,1967.0,1994.0,2004.0,2010.0
MasVnrArea,1452.0,103.685262,181.066207,0.0,,,,1600.0
BsmtFinSF1,1460.0,443.639726,456.098091,0.0,0.0,383.5,712.25,5644.0


### objects

In [9]:
train.select_dtypes(include=['object']).describe().transpose()

Unnamed: 0,count,unique,top,freq
MSZoning,1460,5,RL,1151
Street,1460,2,Pave,1454
Alley,91,2,Grvl,50
LotShape,1460,4,Reg,925
LandContour,1460,4,Lvl,1311
Utilities,1460,2,AllPub,1459
LotConfig,1460,5,Inside,1052
LandSlope,1460,3,Gtl,1382
Neighborhood,1460,25,NAmes,225
Condition1,1460,9,Norm,1260


## Percent Null

In [10]:
nulls = train.isnull().sum().sort_values(ascending=False)
nulls = nulls[nulls != 0].sort_values(ascending=False)
nulls = nulls / len(train)
nulls

PoolQC          0.995205
MiscFeature     0.963014
Alley           0.937671
Fence           0.807534
FireplaceQu     0.472603
LotFrontage     0.177397
GarageYrBlt     0.055479
GarageCond      0.055479
GarageType      0.055479
GarageFinish    0.055479
GarageQual      0.055479
BsmtExposure    0.026027
BsmtFinType2    0.026027
BsmtFinType1    0.025342
BsmtCond        0.025342
BsmtQual        0.025342
MasVnrType      0.005479
MasVnrArea      0.005479
Electrical      0.000685
dtype: float64

## Percent Zero

In [11]:
nonzeros = train.select_dtypes(exclude=['object']).astype(bool).sum()
zeros = len(train) - nonzeros
zeros = zeros[zeros != 0].sort_values(ascending=False)
zeros = zeros / len(train)
zeros

PoolArea         0.995205
3SsnPorch        0.983562
LowQualFinSF     0.982192
MiscVal          0.964384
BsmtHalfBath     0.943836
ScreenPorch      0.920548
BsmtFinSF2       0.885616
EnclosedPorch    0.857534
HalfBath         0.625342
MasVnrArea       0.589726
BsmtFullBath     0.586301
2ndFlrSF         0.567808
WoodDeckSF       0.521233
Fireplaces       0.472603
OpenPorchSF      0.449315
BsmtFinSF1       0.319863
BsmtUnfSF        0.080822
GarageCars       0.055479
GarageArea       0.055479
TotalBsmtSF      0.025342
FullBath         0.006164
BedroomAbvGr     0.004110
KitchenAbvGr     0.000685
dtype: float64

## Correlation

In [12]:
pearson = train[numeric].corr()
spearman = train[numeric].corr(method='spearman')

pearson.to_csv('correlation\\pearson.csv')
spearman.to_csv('correlation\\spearman.csv')

print '\n\nSEE CORRELATION FOLDER FOR OUTPUT\n\n'



SEE CORRELATION FOLDER FOR OUTPUT




## Scatter Plots

In [13]:
def scatter_img(df, features):
    combos = set()
    x = 'SalePrice'
    for y in features:
        combo = [y,x]
        combos.add(tuple(combo))
    
    for combo in combos:
        plt.scatter(df[combo[0]], df[combo[1]])
        plt.xlabel(combo[0])
        plt.ylabel(combo[1])
        plt.savefig('scatter plots\\' + combo[0] + ' - ' + combo[1] + '.png')
        plt.close('all')
    print '\n\nSEE SCATTER PLOTS FOLDER FOR OUTPUT\n\n'

In [14]:
scatter_img(train, numeric)



SEE SCATTER PLOTS FOLDER FOR OUTPUT




## Histograms

In [15]:
def hist_bins(df, series):
    try:    
        feature = series
        series = df[series].dropna()
        full_range = series.max() - series.min()
        iqr = series.describe()[6] - series.describe()[4]
        n = float(len(series))
        bin_size = 2 * iqr * (n ** (-1.0/3.0)) # Freedman Diaconis Estimator
        bins = int(full_range / bin_size) + 1
    except:
        bins = 5     
    bins = 5 if bins < 5 else bins
    return bins

def hist_img(df, features):
    for feature in features:
            series = df[feature].dropna()
            plt.hist(series, bins=hist_bins(df, feature))
            plt.title(feature)
            plt.savefig('histograms\\' + feature + '.png')
            plt.close('all')
    print '\n\nSEE HISTOGRAMS FOLDER FOR OUTPUT\n\n'

In [16]:
hist_img(train, numeric)



SEE HISTOGRAMS FOLDER FOR OUTPUT




## Skewness

In [17]:
train[continuous].skew().sort_values(ascending=False)

MiscVal          24.476794
PoolArea         14.828374
LotArea          12.207688
3SsnPorch        10.304342
LowQualFinSF      9.011341
BsmtFinSF2        4.255261
ScreenPorch       4.122214
EnclosedPorch     3.089872
MasVnrArea        2.669084
OpenPorchSF       2.364342
LotFrontage       2.163569
SalePrice         1.882876
BsmtFinSF1        1.685503
WoodDeckSF        1.541376
TotalBsmtSF       1.524255
1stFlrSF          1.376757
GrLivArea         1.366560
BsmtUnfSF         0.920268
2ndFlrSF          0.813030
GarageArea        0.179981
dtype: float64

## Bar Plots

In [18]:
def bar_img(df, features):
    for feature in features:
            series = df[feature]
            series.value_counts().plot.bar()
            plt.title(feature)
            plt.savefig('bar plots\\' + feature + '.png')
            plt.close('all')
    print '\n\nSEE BAR PLOTS FOLDER FOR OUTPUT\n\n'

In [19]:
bar_img(train, categorical)



SEE BAR PLOTS FOLDER FOR OUTPUT




# FEATURE ENGINEERING

## Make a copy of data

In [20]:
eng_train = train.copy()
eng_test = test.copy()

## Removing training records that have outliers

In [21]:
eng_train = eng_train.loc[train['BedroomAbvGr'] <= 6, :]
eng_train = eng_train.loc[train['BsmtFinSF1'] <= 5000, :]
eng_train = eng_train.loc[train['GrLivArea'] <= 4500, :]
eng_train = eng_train.loc[train['LotArea'] <= 100000, :]

## Hand selecting out-of-the-box features Realtors typically use for CMA (will create more below and add them in)

In [22]:
nominal = ['HouseStyle']

ordinal = ['OverallQual','OverallCond']

continuous = ['LotArea','BsmtUnfSF','GrLivArea']

discrete = ['YearBuilt','YearRemodAdd','BedroomAbvGr','GarageCars']

categorical = nominal + ordinal
numeric = continuous + discrete

## Replacing MoSold and YrSold with FHFA House Price Index

#### Converting MoSold and YrSold to date string

In [23]:
eng_train['SaleMonth'] = eng_train.MoSold.apply(lambda x: str(x)) + '/1/' + eng_train.YrSold.apply(lambda x: str(x))
eng_test['SaleMonth'] = eng_test.MoSold.apply(lambda x: str(x)) + '/1/' + eng_test.YrSold.apply(lambda x: str(x))

#### Reading in House Price Index (HPI) data

In [24]:
#hpi = pd.read_excel('https://www.fhfa.gov/DataTools/Downloads/Documents/HPI/HPI_PO_monthly_hist.xls', header=0, skiprows=[0,1,2])
hpi = pd.read_excel('HPI_PO_monthly_hist.xls', header=0, skiprows=[0,1,2])
hpi = hpi.dropna()
hpi.head()

Unnamed: 0,Month,East North Central (NSA),East North Central (SA),East South Central (NSA),East South Central (SA),Middle Atlantic (NSA),Middle Atlantic (SA),Mountain (NSA),Mountain (SA),New England (NSA),...,Pacific (NSA),Pacific (SA),South Atlantic (NSA),South Atlantic (SA),West North Central (NSA),West North Central (SA),West South Central (NSA),West South Central (SA),USA (NSA),USA (SA)
1,1991-01-01,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
2,1991-02-01,101.04,101.12,100.93,100.65,100.08,100.21,98.51,98.7,102.07,...,100.25,100.55,100.49,100.43,100.69,100.78,100.02,99.81,100.47,100.52
3,1991-03-01,101.41,101.08,101.03,100.65,99.93,99.75,100.22,100.57,101.7,...,100.04,100.18,100.9,100.54,100.35,100.09,100.84,100.54,100.72,100.51
4,1991-04-01,101.8,101.12,101.02,100.76,99.37,99.06,100.5,100.4,101.09,...,100.2,100.06,100.93,100.49,100.95,100.47,100.51,100.14,100.74,100.37
5,1991-05-01,102.45,101.5,101.25,100.8,99.58,99.12,100.71,100.38,99.76,...,100.4,100.1,100.81,100.6,100.73,100.28,100.94,100.33,100.89,100.42


#### Converting HPI Month to date string

In [25]:
hpi['SaleMonth'] = hpi.Month.apply(lambda x: str(x.month)) + '/1/' + hpi.Month.apply(lambda x: str(x.year))

#### Renaming target column (West North Central NSA) to simply HPI

In [26]:
hpi.rename(columns={'West North Central\n(NSA)':'HPI'}, inplace=True)

#### Keeping only the columns we need from HPI

In [27]:
hpi = hpi[['SaleMonth','HPI']]

#### Joining sales data with HPI data on SaleMonth

In [28]:
eng_train = eng_train.merge(hpi, on='SaleMonth')
eng_test = eng_test.merge(hpi, on='SaleMonth')

#### Removing MoSold, YrSold, and SaleMonth

In [29]:
eng_train.drop(['MoSold','YrSold','SaleMonth'], axis=1, inplace=True)
eng_test.drop(['MoSold','YrSold','SaleMonth'], axis=1, inplace=True)

continuous.append('HPI')

## Creating binary (one-hot) features from existing features

#### Creating IrregularLot from LotShape where Reg = 0 else 1

In [30]:
eng_train['IrregularLot'] = 1.0
eng_test['IrregularLot'] = 1.0
eng_train.loc[eng_train.LotShape == 'Reg', 'IrregularLot'] = 0.0
eng_test.loc[eng_test.LotShape == 'Reg', 'IrregularLot'] = 0.0

binary = ['IrregularLot']

#### Creating CornerLot from LotConfig where Corner = 1 else 0

In [31]:
eng_train['CornerLot'] = 0.0
eng_test['CornerLot'] = 0.0
eng_train.loc[eng_train.LotConfig == 'Corner', 'CornerLot'] = 1.0
eng_test.loc[eng_test.LotConfig == 'Corner', 'CornerLot'] = 1.0

binary.append('CornerLot')

#### Creating CulDeSac from LotConfig where CulDeSac = 1 else 0

In [32]:
eng_train['CulDeSac'] = 0.0
eng_test['CulDeSac'] = 0.0
eng_train.loc[eng_train.LotConfig == 'CulDSac', 'CulDeSac'] = 1.0
eng_test.loc[eng_test.LotConfig == 'CulDSac', 'CulDeSac'] = 1.0

binary.append('CulDeSac')

#### Creating BadLocation from Condition1 where Norm, PosN, or PosA = 0 else 1

In [33]:
eng_train['BadLocation'] = 1.0
eng_test['BadLocation'] = 1.0
eng_train.loc[(eng_train.Condition1 == 'Norm') | (eng_train.Condition1 == 'PosN') | (eng_train.Condition1 == 'PosA'),
              'BadLocation'] = 0.0
eng_test.loc[(eng_test.Condition1 == 'Norm') | (eng_test.Condition1 == 'PosN') | (eng_test.Condition1 == 'PosA'), 
             'BadLocation'] = 0.0

binary.append('BadLocation')

#### Creating NeedExterRepair from ExterCond where Fa or Po = 1 else 0

In [34]:
eng_train['NeedExterRepair'] = 0.0
eng_test['NeedExterRepair'] = 0.0
eng_train.loc[(eng_train.ExterCond == 'Fa') | (eng_train.ExterCond == 'Po'), 'NeedExterRepair'] = 1.0
eng_test.loc[(eng_test.ExterCond == 'Fa') | (eng_test.ExterCond == 'Po'), 'NeedExterRepair'] = 1.0

binary.append('NeedExterRepair')

#### Creating NeedBsmtRepair from ExterCond where Fa or Po = 1 else 0

In [35]:
eng_train['NeedBsmtRepair'] = 0.0
eng_test['NeedBsmtRepair'] = 0.0
eng_train.loc[(eng_train.BsmtCond == 'Fa') | (eng_train.BsmtCond == 'Po'), 'NeedBsmtRepair'] = 1.0
eng_test.loc[(eng_test.BsmtCond == 'Fa') | (eng_test.BsmtCond == 'Po'), 'NeedBsmtRepair'] = 1.0

binary.append('NeedBsmtRepair')

#### Creating HasWalkoutBsmt from BsmtExposure where Gd = 1 else 0

In [36]:
eng_train['HasWalkoutBsmt'] = 0.0
eng_test['HasWalkoutBsmt'] = 0.0
eng_train.loc[eng_train.BsmtExposure == 'Gd', 'HasWalkoutBsmt'] = 1.0
eng_test.loc[eng_test.BsmtExposure == 'Gd', 'HasWalkoutBsmt'] = 1.0

binary.append('HasWalkoutBsmt')

#### Creating NeedHeatReplace from HeatingQC where Fa or Po = 1 else 0

In [37]:
eng_train['NeedHeatReplace'] = 0.0
eng_test['NeedHeatReplace'] = 0.0
eng_train.loc[(eng_train.HeatingQC == 'Fa') | (eng_train.HeatingQC == 'Po'), 'NeedHeatReplace'] = 1.0
eng_test.loc[(eng_test.HeatingQC == 'Fa') | (eng_test.HeatingQC == 'Po'), 'NeedHeatReplace'] = 1.0

binary.append('NeedHeatReplace')

#### Creating NoAC from CentralAir where N = 1 else 0

In [38]:
eng_train['NoAC'] = 0.0
eng_test['NoAC'] = 0.0
eng_train.loc[eng_train.CentralAir == 'N', 'NoAC'] = 1.0
eng_test.loc[eng_test.CentralAir == 'N', 'NoAC'] = 1.0

binary.append('NoAC')

#### Creating NeedElectricUpgrade from Electrical where SBrkr or FuseA = 0 else 1

In [39]:
eng_train['NeedElectricUpgrade'] = 1.0
eng_test['NeedElectricUpgrade'] = 1.0
eng_train.loc[(eng_train.Electrical == 'SBrkr') | (eng_train.Electrical == 'FuseA'), 'NeedElectricUpgrade'] = 0.0
eng_test.loc[(eng_test.Electrical == 'SBrkr') | (eng_test.Electrical == 'FuseA'), 'NeedElectricUpgrade'] = 0.0

binary.append('NeedElectricUpgrade')

#### Creating HasMultiKitchens from KitchenAbvGr

In [40]:
eng_train['HasMultiKitchens'] = 0.0
eng_test['HasMultiKitchens'] = 0.0
eng_train.loc[eng_train.KitchenAbvGr > 1.0, 'HasMultiKitchens'] = 1.0
eng_test.loc[eng_test.KitchenAbvGr > 1.0, 'HasMultiKitchens'] = 1.0

binary.append('HasMultiKitchens')

#### Creating HasHighEndKitchen from KitchenQual where Ex = 1 else 0

In [41]:
eng_train['HasHighEndKitchen'] = 0.0
eng_test['HasHighEndKitchen'] = 0.0
eng_train.loc[eng_train.KitchenQual == 'Ex', 'HasHighEndKitchen'] = 1.0
eng_test.loc[eng_test.KitchenQual == 'Ex', 'HasHighEndKitchen'] = 1.0

binary.append('HasHighEndKitchen')

#### Creating NeedGeneralRepair from Functional where Typ = 0 else 1

In [42]:
eng_train['NeedGeneralRepair'] = 1.0
eng_test['NeedGeneralRepair'] = 1.0
eng_train.loc[eng_train.Functional == 'Typ', 'NeedGeneralRepair'] = 0.0
eng_test.loc[eng_test.Functional == 'Typ', 'NeedGeneralRepair'] = 0.0

binary.append('NeedGeneralRepair')

#### Creating HasFireplace from Fireplaces

In [43]:
eng_train['HasFireplace'] = 0.0
eng_test['HasFireplace'] = 0.0
eng_train.loc[eng_train.Fireplaces > 0.0, 'HasFireplace'] = 1.0
eng_test.loc[eng_test.Fireplaces > 0.0, 'HasFireplace'] = 1.0

binary.append('HasFireplace')

#### Creating NeedGarageRepair from GarageCond where Fa or Po = 1 else 0

In [44]:
eng_train['NeedGarageRepair'] = 0.0
eng_test['NeedGarageRepair'] = 0.0
eng_train.loc[(eng_train.GarageCond == 'Fa') | (eng_train.GarageCond == 'Po'), 'NeedGarageRepair'] = 1.0
eng_test.loc[(eng_test.GarageCond == 'Fa') | (eng_test.GarageCond == 'Po'), 'NeedGarageRepair'] = 1.0

binary.append('NeedGarageRepair')

#### Creating HasDeck from WoodDeckSF

In [45]:
eng_train['HasDeck'] = 0.0
eng_test['HasDeck'] = 0.0
eng_train.loc[eng_train.WoodDeckSF > 0.0, 'HasDeck'] = 1.0
eng_test.loc[eng_test.WoodDeckSF > 0.0, 'HasDeck'] = 1.0

binary.append('HasDeck')

#### Creating HasPorch from OpenPorchSF

In [46]:
eng_train['HasPorch'] = 0.0
eng_test['HasPorch'] = 0.0
eng_train.loc[eng_train.OpenPorchSF > 0.0, 'HasPorch'] = 1.0
eng_test.loc[eng_test.OpenPorchSF > 0.0, 'HasPorch'] = 1.0

binary.append('HasPorch')

#### Creating HasEnclPorch from EnclosedPorch

In [47]:
eng_train['HasEnclPorch'] = 0.0
eng_test['HasEnclPorch'] = 0.0
eng_train.loc[eng_train.EnclosedPorch > 0.0, 'HasEnclPorch'] = 1.0
eng_test.loc[eng_test.EnclosedPorch > 0.0, 'HasEnclPorch'] = 1.0

binary.append('HasEnclPorch')

#### Creating Has3SeasonRoom from 3SsnPorch

In [48]:
eng_train['Has3SeasonRoom'] = 0.0
eng_test['Has3SeasonRoom'] = 0.0
eng_train.loc[eng_train['3SsnPorch'] > 0.0, 'Has3SeasonRoom'] = 1.0
eng_test.loc[eng_test['3SsnPorch'] > 0.0, 'Has3SeasonRoom'] = 1.0

binary.append('Has3SeasonRoom')

#### Creating HasScreenPorch from ScreenPorch

In [49]:
eng_train['HasScreenPorch'] = 0.0
eng_test['HasScreenPorch'] = 0.0
eng_train.loc[eng_train.ScreenPorch > 0.0, 'HasScreenPorch'] = 1.0
eng_test.loc[eng_test.ScreenPorch > 0.0, 'HasScreenPorch'] = 1.0

binary.append('HasScreenPorch')

#### Creating HasPool from PoolArea

In [50]:
eng_train['HasPool'] = 0.0
eng_test['HasPool'] = 0.0
eng_train.loc[eng_train.PoolArea > 0.0, 'HasPool'] = 1.0
eng_test.loc[eng_test.PoolArea > 0.0, 'HasPool'] = 1.0

binary.append('HasPool')

#### Creating HasShed from MiscFeature

In [51]:
eng_train['HasShed'] = 0.0
eng_test['HasShed'] = 0.0
eng_train.loc[eng_train.MiscFeature == 'Shed', 'HasShed'] = 1.0
eng_test.loc[eng_test.MiscFeature == 'Shed', 'HasShed'] = 1.0

binary.append('HasShed')

## Creating new numeric features

#### Creating feature to measure desirability of neighborhoods

In [52]:
# values obtained from the multiplier values on the City of Ames Residential Assessment Neighborhoods map
# (http://www.cityofames.org/home/showdocument?id=1024)

nbrhood_multiplier = {'Blmngtn': 105.,
                      'Blueste':  99.,
                      'BrDale':  102.,
                      'BrkSide': 106.,
                      'ClearCr': 103.,
                      'CollgCr':  98.,
                      'Crawfor': 106.,
                      'Edwards':  98.,
                      'Gilbert':  97.,
                      'IDOTRR':  102.,
                      'MeadowV':  90.,
                      'Mitchel':  99.,
                      'NAmes':   100.,
                      'NPkVill': 109.,
                      'NWAmes':   99.,
                      'NoRidge': 101.,
                      'NridgHt': 104.,
                      'OldTown': 102.,
                      'SWISU':    99.,
                      'Sawyer':  101.,
                      'SawyerW':  98.,
                      'Somerst': 101.,
                      'StoneBr': 104.,
                      'Timber':  103.,
                      'Veenker':  98.}

In [53]:
eng_train['NbrhoodMultiplier'] = eng_train.Neighborhood.map(nbrhood_multiplier)
eng_test['NbrhoodMultiplier'] = eng_test.Neighborhood.map(nbrhood_multiplier)

discrete.append('NbrhoodMultiplier')

#### Creating BsmtFinSF from BsmtFinSF1 + BsmtFinSF2

In [54]:
eng_train['BsmtFinSF'] = eng_train.BsmtFinSF1 + eng_train.BsmtFinSF2
eng_test['BsmtFinSF'] = eng_test.BsmtFinSF1 + eng_test.BsmtFinSF2

continuous.append('BsmtFinSF')

#### Combining all Bath features into one

In [55]:
eng_train['TotalBaths'] = eng_train.BsmtFullBath.fillna(0) + (eng_train.BsmtHalfBath.fillna(0) / 2) + eng_train.FullBath + \
                            (eng_train.HalfBath / 2)
eng_test['TotalBaths'] = eng_test.BsmtFullBath.fillna(0) + (eng_test.BsmtHalfBath.fillna(0) / 2) + eng_test.FullBath + \
                            (eng_test.HalfBath / 2)

discrete.append('TotalBaths')

## Reducing levels in categorical features

#### Reducing levels in BldgType

In [56]:
eng_train['BldgTypeSimple'] = 'multifam'
eng_test['BldgTypeSimple'] = 'multifam'
eng_train.loc[eng_train.BldgType == '1Fam', 'BldgTypeSimple'] = '1Fam'
eng_test.loc[eng_test.BldgType == '1Fam', 'BldgTypeSimple'] = '1Fam'

nominal.append('BldgTypeSimple')

#### Reducing levels in SaleCondition

In [57]:
eng_train['SaleCondSimple'] = 'Partial'
eng_test['SaleCondSimple'] = 'Partial'

eng_train.loc[(eng_train.SaleCondition == 'Normal') | (eng_train.SaleCondition == 'Alloca'), 'SaleCondSimple'] = 'Normal'
eng_test.loc[(eng_test.SaleCondition == 'Normal') | (eng_test.SaleCondition == 'Alloca'), 'SaleCondSimple'] = 'Normal'

eng_train.loc[(eng_train.SaleCondition == 'Abnormal') | (eng_train.SaleCondition == 'Family') | \
              (eng_train.SaleCondition == 'AdjLand'), 'SaleCondSimple'] = 'Abnormal'
eng_test.loc[(eng_test.SaleCondition == 'Abnormal') | (eng_test.SaleCondition == 'Family') | \
              (eng_test.SaleCondition == 'AdjLand'), 'SaleCondSimple'] = 'Abnormal'

nominal.append('SaleCondSimple')

## Impute Null Values

#### Set Null GarageCars to 0 in test data

In [58]:
eng_test.loc[eng_test['GarageCars'].isnull(), 'GarageCars'] = 0.0

#### Same thing for basement in test data

In [59]:
eng_test.loc[eng_test['BsmtFinSF'].isnull(), 'BsmtFinSF'] = 0.0
eng_test.loc[eng_test['BsmtUnfSF'].isnull(), 'BsmtUnfSF'] = 0.0

## Use log for skewed continuous variables and SalePrice

In [60]:
skew = pd.DataFrame({'skew': eng_train[continuous + ['SalePrice']].skew().values, 
                     'log_skew': eng_train[continuous + ['SalePrice']].apply(lambda x: np.log1p(x)).skew().values}, 
                      index=eng_train[continuous + ['SalePrice']].skew().index)

In [61]:
for col in skew.columns.values:
    skew['abs_%s' % col] = skew[col].abs()

In [62]:
improved_skew = skew[skew.abs_log_skew < skew.abs_skew].index.tolist()

In [63]:
for feature in improved_skew:
    eng_train[feature] = eng_train[feature].apply(lambda x: np.log1p(x))

improved_skew.remove('SalePrice')
for feature in improved_skew:
    eng_test[feature] = eng_test[feature].apply(lambda x: np.log1p(x))

## Keeping only the columns we want

In [64]:
features = nominal + ordinal + discrete + continuous + binary
target = ['SalePrice']
index = ['Id']

eng_train = eng_train[features + target + index]
eng_test = eng_test[features + index]

## One hot encoding categorical features

In [65]:
one_hot_train = pd.get_dummies(eng_train, columns=nominal, dummy_na=False)
one_hot_test  = pd.get_dummies(eng_test,  columns=nominal, dummy_na=False)

In [66]:
# remove columns from train that aren't also in test after encoding
one_hot_train.drop(list(set(one_hot_train.columns) - set(one_hot_test.columns)), axis=1, inplace=True)

In [67]:
# remove columns from test that aren't also in train after encoding
one_hot_test.drop(list(set(one_hot_test.columns) - set(one_hot_train.columns)), axis=1, inplace=True)

In [68]:
len(one_hot_train.columns) == len(one_hot_test.columns)

True

In [69]:
# adding SalePrice back into train
one_hot_train['SalePrice'] = eng_train.SalePrice

In [70]:
one_hot_train.shape

(1453, 49)

## Outputting for a quick check of the transformed data

In [71]:
one_hot_train.to_csv('one_hot_train.csv')

# EXPLORING LINEAR MODELS

## Random Selection Helper

In [72]:
class RandomSelectionHelper:
    def __init__(self, models, params):
        if not set(models.keys()).issubset(set(params.keys())):
            missing_params = list(set(models.keys()) - set(params.keys()))
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.random_searches = {}
    
    def fit(self, X, y, n_iter=60, cv=3, n_jobs=1, verbose=1, scoring=None, refit=True, random_state=None):
        for key in self.keys:
            print("Running RandomizedSearchCV for %s." % key)
            model = self.models[key]
            params = self.params[key]
            rs = RandomizedSearchCV(model, params, n_iter=n_iter, cv=cv, n_jobs=n_jobs, 
                              verbose=verbose, scoring=scoring, refit=refit, random_state=random_state)
            rs.fit(X,y)
            self.random_searches[key] = rs
    
    def score_summary(self, sort_by='mean_score'):
        def row(key, scores, params):
            d = {
                 'estimator': key,
                 'min_score': np.min(scores),
                 'max_score': np.max(scores),
                 'mean_score': np.mean(scores),
                 'std_score': np.std(scores),
            }
            return pd.Series(dict(params.items() + d.items()))
                      
        rows = [row(k, rsc.cv_validation_scores, rsc.parameters) 
                for k in self.keys
                for rsc in self.random_searches[k].grid_scores_]
        df = pd.concat(rows, axis=1).T.sort([sort_by], ascending=False)
        
        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]
        
        return df[columns]

## Features

In [73]:
features = one_hot_train.columns.tolist()
features.remove('SalePrice')
features.remove('Id')
target = 'SalePrice'

## Models to Search

In [74]:
models = { 
    'Ridge': linear_model.Ridge(),
    'Lasso': linear_model.Lasso(),
    'ElasticNet': linear_model.ElasticNet()
}

## Parameters to Search

In [75]:
params = { 
    'Ridge':      {'alpha': scipy.stats.expon(scale=3),
                   'normalize': [True,False]},
    'Lasso':      {'alpha': scipy.stats.expon(scale=3),
                   'normalize': [True,False]},
    'ElasticNet': {'alpha': scipy.stats.expon(scale=3),
                   'l1_ratio': scipy.stats.uniform(0,1),
                   'normalize': [True,False]}
}

## Fitting models for each of two BldgType classes

### Features

In [76]:
bldgType_features = list(features)
bldgType_features.remove('BldgTypeSimple_1Fam')
bldgType_features.remove('BldgTypeSimple_multifam')

### Single Family

In [77]:
single_family_X = one_hot_train.loc[one_hot_train.BldgTypeSimple_1Fam == 1, bldgType_features]
single_family_y = one_hot_train.loc[one_hot_train.BldgTypeSimple_1Fam == 1, target]

#### Fit Models

In [78]:
single_family = RandomSelectionHelper(models,params)
single_family.fit(single_family_X, single_family_y, scoring='mean_squared_error', cv=10, n_iter=200, n_jobs=4)

Running RandomizedSearchCV for ElasticNet.
Fitting 10 folds for each of 200 candidates, totalling 2000 fits


[Parallel(n_jobs=4)]: Done  45 tasks      | elapsed:    9.5s
[Parallel(n_jobs=4)]: Done 932 tasks      | elapsed:   21.3s
[Parallel(n_jobs=4)]: Done 2000 out of 2000 | elapsed:   35.8s finished


Running RandomizedSearchCV for Ridge.
Fitting 10 folds for each of 200 candidates, totalling 2000 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   10.5s
[Parallel(n_jobs=4)]: Done 910 tasks      | elapsed:   20.9s
[Parallel(n_jobs=4)]: Done 2000 out of 2000 | elapsed:   32.4s finished


Running RandomizedSearchCV for Lasso.
Fitting 10 folds for each of 200 candidates, totalling 2000 fits


[Parallel(n_jobs=4)]: Done  46 tasks      | elapsed:    9.3s
[Parallel(n_jobs=4)]: Done 1222 tasks      | elapsed:   23.5s
[Parallel(n_jobs=4)]: Done 2000 out of 2000 | elapsed:   32.4s finished


#### Compare Models

In [79]:
single_family.score_summary(sort_by='mean_score').head(20)



Unnamed: 0,estimator,min_score,mean_score,max_score,std_score,alpha,l1_ratio,normalize
284,Ridge,-0.0210519,-0.0144752,-0.00853154,0.0037046,1.86301,,False
385,Ridge,-0.021052,-0.0144752,-0.00853147,0.00370465,1.86854,,False
362,Ridge,-0.0210489,-0.0144753,-0.00853269,0.00370377,1.76843,,False
287,Ridge,-0.0210483,-0.0144753,-0.00853293,0.00370362,1.75071,,False
343,Ridge,-0.0210483,-0.0144753,-0.00853293,0.00370362,1.75053,,False
376,Ridge,-0.0210481,-0.0144753,-0.00853302,0.00370356,1.7439,,False
367,Ridge,-0.0210552,-0.0144754,-0.00853047,0.00370549,1.96674,,False
267,Ridge,-0.0210559,-0.0144754,-0.0085303,0.00370565,1.98546,,False
277,Ridge,-0.0210595,-0.0144758,-0.00852945,0.00370655,2.09279,,False
237,Ridge,-0.0210426,-0.0144761,-0.00853581,0.00370193,1.56386,,False


### Multi Family

In [80]:
multi_family_X = one_hot_train.loc[one_hot_train.BldgTypeSimple_multifam == 1, bldgType_features]
multi_family_y = one_hot_train.loc[one_hot_train.BldgTypeSimple_multifam == 1, target]

#### Fit Models

In [81]:
multi_family = RandomSelectionHelper(models,params)
multi_family.fit(multi_family_X, multi_family_y, scoring='mean_squared_error', cv=10, n_iter=200, n_jobs=4)

Running RandomizedSearchCV for ElasticNet.
Fitting 10 folds for each of 200 candidates, totalling 2000 fits


[Parallel(n_jobs=4)]: Done  48 tasks      | elapsed:    9.8s
[Parallel(n_jobs=4)]: Done 1484 tasks      | elapsed:   16.1s
[Parallel(n_jobs=4)]: Done 2000 out of 2000 | elapsed:   18.1s finished


Running RandomizedSearchCV for Ridge.
Fitting 10 folds for each of 200 candidates, totalling 2000 fits


[Parallel(n_jobs=4)]: Done  49 tasks      | elapsed:    9.0s
[Parallel(n_jobs=4)]: Done 2000 out of 2000 | elapsed:   23.3s finished


Running RandomizedSearchCV for Lasso.
Fitting 10 folds for each of 200 candidates, totalling 2000 fits


[Parallel(n_jobs=4)]: Done  50 tasks      | elapsed:    8.5s
[Parallel(n_jobs=4)]: Done 1802 tasks      | elapsed:   15.4s
[Parallel(n_jobs=4)]: Done 2000 out of 2000 | elapsed:   15.9s finished


#### Compare Models

In [82]:
multi_family.score_summary(sort_by='mean_score').head(20)



Unnamed: 0,estimator,min_score,mean_score,max_score,std_score,alpha,l1_ratio,normalize
335,Ridge,-0.0197526,-0.0128436,-0.00433618,0.00393988,2.97599,,False
328,Ridge,-0.0197824,-0.0128448,-0.0043308,0.0039431,3.12865,,False
355,Ridge,-0.0197234,-0.012845,-0.00434477,0.00393757,2.80745,,False
259,Ridge,-0.0197868,-0.0128451,-0.00433021,0.00394362,3.15006,,False
216,Ridge,-0.0197106,-0.0128469,-0.00435017,0.003937,2.72351,,False
279,Ridge,-0.0197097,-0.0128471,-0.00435059,0.00393697,2.71755,,False
374,Ridge,-0.0197004,-0.0128493,-0.0043556,0.00393687,2.64961,,False
379,Ridge,-0.0196961,-0.0128506,-0.00435831,0.00393695,2.6156,,False
258,Ridge,-0.0198633,-0.0128548,-0.00432588,0.00395386,3.49207,,False
221,Ridge,-0.0196787,-0.0128596,-0.00437369,0.00393866,2.44953,,False


## Fitting models for each of three SaleCondition classes

### Features

In [83]:
saleCond_features = list(features)
saleCond_features.remove('SaleCondSimple_Normal')
saleCond_features.remove('SaleCondSimple_Abnormal')
saleCond_features.remove('SaleCondSimple_Partial')

### Normal Sale

In [84]:
normal_X = one_hot_train.loc[one_hot_train.SaleCondSimple_Normal == 1, saleCond_features]
normal_y = one_hot_train.loc[one_hot_train.SaleCondSimple_Normal == 1, target]

#### Fit Models

In [85]:
normal = RandomSelectionHelper(models,params)
normal.fit(normal_X, normal_y, scoring='mean_squared_error', cv=10, n_iter=200, n_jobs=4)

Running RandomizedSearchCV for ElasticNet.
Fitting 10 folds for each of 200 candidates, totalling 2000 fits


[Parallel(n_jobs=4)]: Done  43 tasks      | elapsed:   10.8s
[Parallel(n_jobs=4)]: Done 917 tasks      | elapsed:   22.6s
[Parallel(n_jobs=4)]: Done 2000 out of 2000 | elapsed:   36.2s finished


Running RandomizedSearchCV for Ridge.
Fitting 10 folds for each of 200 candidates, totalling 2000 fits


[Parallel(n_jobs=4)]: Done  48 tasks      | elapsed:    9.1s
[Parallel(n_jobs=4)]: Done 1796 tasks      | elapsed:   30.3s
[Parallel(n_jobs=4)]: Done 2000 out of 2000 | elapsed:   32.7s finished


Running RandomizedSearchCV for Lasso.
Fitting 10 folds for each of 200 candidates, totalling 2000 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   10.7s
[Parallel(n_jobs=4)]: Done 626 tasks      | elapsed:   17.1s
[Parallel(n_jobs=4)]: Done 1626 tasks      | elapsed:   28.7s
[Parallel(n_jobs=4)]: Done 2000 out of 2000 | elapsed:   32.9s finished


#### Compare Models

In [86]:
normal.score_summary(sort_by='mean_score').head(20)



Unnamed: 0,estimator,min_score,mean_score,max_score,std_score,alpha,l1_ratio,normalize
294,Ridge,-0.0193582,-0.0101168,-0.0058024,0.00392384,1.56587,,False
315,Ridge,-0.0193578,-0.0101168,-0.00580202,0.00392366,1.57589,,False
358,Ridge,-0.0193629,-0.0101168,-0.00580688,0.00392589,1.4517,,False
314,Ridge,-0.0193553,-0.0101169,-0.00579967,0.00392259,1.63904,,False
291,Ridge,-0.0193547,-0.010117,-0.00579908,0.00392232,1.65522,,False
267,Ridge,-0.0193546,-0.010117,-0.00579896,0.00392226,1.65859,,False
348,Ridge,-0.0193518,-0.0101173,-0.00579612,0.00392095,1.73866,,False
253,Ridge,-0.0193514,-0.0101173,-0.00579579,0.0039208,1.74829,,False
347,Ridge,-0.01937,-0.0101174,-0.00581335,0.00392883,1.29837,,False
303,Ridge,-0.0193707,-0.0101175,-0.00581396,0.00392911,1.28442,,False


### Abnormal Sales

In [87]:
abnormal_X = one_hot_train.loc[one_hot_train.SaleCondSimple_Abnormal == 1, saleCond_features]
abnormal_y = one_hot_train.loc[one_hot_train.SaleCondSimple_Abnormal == 1, target]

#### Fit Models

In [88]:
abnormal = RandomSelectionHelper(models,params)
abnormal.fit(abnormal_X, abnormal_y, scoring='mean_squared_error', cv=3, n_iter=200, n_jobs=4)

Running RandomizedSearchCV for ElasticNet.
Fitting 3 folds for each of 200 candidates, totalling 600 fits


[Parallel(n_jobs=4)]: Done  48 tasks      | elapsed:   10.3s
[Parallel(n_jobs=4)]: Done 600 out of 600 | elapsed:   12.5s finished


Running RandomizedSearchCV for Ridge.
Fitting 3 folds for each of 200 candidates, totalling 600 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    9.5s
[Parallel(n_jobs=4)]: Done 600 out of 600 | elapsed:   14.8s finished


Running RandomizedSearchCV for Lasso.
Fitting 3 folds for each of 200 candidates, totalling 600 fits


[Parallel(n_jobs=4)]: Done  49 tasks      | elapsed:    8.7s
[Parallel(n_jobs=4)]: Done 600 out of 600 | elapsed:   10.8s finished


#### Compare Models

In [89]:
abnormal.score_summary(sort_by='mean_score').head(20)



Unnamed: 0,estimator,min_score,mean_score,max_score,std_score,alpha,l1_ratio,normalize
558,Lasso,-0.117409,-0.0775544,-0.0276022,0.0373521,4.28993,,False
566,Lasso,-0.117431,-0.0776397,-0.0278691,0.0372381,4.35466,,False
555,Lasso,-0.117443,-0.0776872,-0.0280176,0.0371748,4.39064,,False
516,Lasso,-0.117477,-0.0778162,-0.0284207,0.0370029,4.48658,,False
539,Lasso,-0.117486,-0.0778532,-0.028536,0.0369537,4.51398,,False
533,Lasso,-0.117326,-0.077961,-0.0287799,0.0368091,4.04937,,False
449,Lasso,-0.117515,-0.0779647,-0.0288841,0.0368055,4.59597,,False
564,Lasso,-0.117536,-0.0780476,-0.0291424,0.0366956,4.6562,,False
584,Lasso,-0.11731,-0.0780866,-0.0291486,0.0366415,4.00473,,False
522,Lasso,-0.117299,-0.0781834,-0.0294331,0.0365123,3.97122,,False


### Partial Sales

In [90]:
partial_X = one_hot_train.loc[one_hot_train.SaleCondSimple_Partial == 1, saleCond_features]
partial_y = one_hot_train.loc[one_hot_train.SaleCondSimple_Partial == 1, target]

#### Fit Models

In [91]:
partial = RandomSelectionHelper(models,params)
partial.fit(partial_X, partial_y, scoring='mean_squared_error', cv=10, n_iter=200, n_jobs=4)

Running RandomizedSearchCV for ElasticNet.
Fitting 10 folds for each of 200 candidates, totalling 2000 fits


[Parallel(n_jobs=4)]: Done  48 tasks      | elapsed:   10.3s
[Parallel(n_jobs=4)]: Done 1532 tasks      | elapsed:   16.7s
[Parallel(n_jobs=4)]: Done 2000 out of 2000 | elapsed:   18.4s finished


Running RandomizedSearchCV for Ridge.
Fitting 10 folds for each of 200 candidates, totalling 2000 fits


[Parallel(n_jobs=4)]: Done  45 tasks      | elapsed:    9.2s
[Parallel(n_jobs=4)]: Done 1233 tasks      | elapsed:   19.6s
[Parallel(n_jobs=4)]: Done 2000 out of 2000 | elapsed:   26.5s finished


Running RandomizedSearchCV for Lasso.
Fitting 10 folds for each of 200 candidates, totalling 2000 fits


[Parallel(n_jobs=4)]: Done  50 tasks      | elapsed:    8.7s
[Parallel(n_jobs=4)]: Done 1550 tasks      | elapsed:   14.3s
[Parallel(n_jobs=4)]: Done 2000 out of 2000 | elapsed:   15.9s finished


#### Compare Models

In [92]:
partial.score_summary(sort_by='mean_score').head(20)



Unnamed: 0,estimator,min_score,mean_score,max_score,std_score,alpha,l1_ratio,normalize
345,Ridge,-0.0592889,-0.0340482,-0.0174724,0.0127527,19.0099,,False
311,Ridge,-0.0588327,-0.0342985,-0.0166323,0.0126973,10.4024,,False
245,Ridge,-0.0587345,-0.0343509,-0.01655,0.0126891,9.59152,,False
211,Ridge,-0.0587329,-0.0343518,-0.0165487,0.0126889,9.5791,,False
372,Ridge,-0.0585862,-0.0344282,-0.0164473,0.0126774,8.57377,,False
352,Ridge,-0.0585844,-0.0344292,-0.0164461,0.0126773,8.56223,,False
263,Ridge,-0.0585592,-0.0344421,-0.0164308,0.0126754,8.40895,,False
282,Ridge,-0.0585113,-0.0344666,-0.0164031,0.0126718,8.13059,,False
314,Ridge,-0.0584475,-0.0344989,-0.0163687,0.0126672,7.78302,,False
257,Ridge,-0.0584258,-0.0345097,-0.0163577,0.0126657,7.67053,,False


# FIT BLENDED MODELS TO TEST DATA

## Using Normal Sales fit for normal sales and Single/Multi Family fits for other sales

In [93]:
best_normal = normal.random_searches['Ridge'].best_estimator_
best_single = single_family.random_searches['Ridge'].best_estimator_
best_multi  = multi_family.random_searches['Ridge'].best_estimator_

predict_normal = best_normal.predict(one_hot_test[saleCond_features])
predict_single = best_single.predict(one_hot_test[bldgType_features])
predict_multi  = best_multi.predict(one_hot_test[bldgType_features])

In [94]:
submission = one_hot_test.loc[:,['Id','SaleCondSimple_Normal','BldgTypeSimple_1Fam']]
submission['predict_normal'] = predict_normal
submission['predict_single'] = predict_single
submission['predict_multi'] = predict_multi

In [95]:
submission['SalePrice'] = 0.0
submission.loc[submission.SaleCondSimple_Normal == 1, 'SalePrice'] = submission.predict_normal
submission.loc[(submission.SaleCondSimple_Normal == 0) & (submission.BldgTypeSimple_1Fam == 1), 'SalePrice'] = submission.predict_single
submission.loc[(submission.SaleCondSimple_Normal == 0) & (submission.BldgTypeSimple_1Fam == 0), 'SalePrice'] = submission.predict_multi

In [96]:
submission['SalePrice'] = submission.SalePrice.apply(lambda x: np.exp(x))

In [97]:
submission = submission.loc[:,['Id','SalePrice']]
submission.to_csv('Ridge_Blend_JD_2016_12_27.csv', index=False)