In [1]:
## memo try function based munging next time

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import sklearn

from scipy.stats import skew
from scipy.stats.stats import pearsonr

%config InlineBackend.figure_format = 'retina' #set 'png' here when working on notebook
%matplotlib inline

In [2]:
train_df = pd.read_csv("./train.csv")
test_df = pd.read_csv("./test.csv")
train_df.head()
# test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
train_df.shape, test_df.shape

((1460, 81), (1459, 80))

In [4]:
# There are a few houses with more than 4000 sq ft living area that are
# outliers, so we drop them from the training data. (There is also one in
# the test set but we obviously can't drop that one.)
train_df.drop(train_df[train_df['GrLivArea'] > 4000].index, inplace=True)

# The test example with ID 666 has GarageArea, GarageCars, and GarageType 
# but none of the other fields, so use the mode and median to fill them in.
test_df.loc[666, 'GarageQual'] = 'TA'
test_df.loc[666, 'GarageCond'] = 'TA'
test_df.loc[666, 'GarageFinish'] = 'Unf'
test_df.loc[666, 'GarageYrBlt'] = 1980

# The test example 1116 only has GarageType but no other information. We'll 
# assume it does not have a garage.
test_df.loc[1116, 'GarageType'] = np.nan

# For imputing missing values: fill in missing LotFrontage values by the median
# LotFrontage of the neighborhood.

In [5]:
# combine the data and take all of the feature. This is for easier feature engineering. 
all_data = pd.concat((train_df.iloc[:,1:-1], test_df.iloc[:,1:]), axis = 0)
Y = np.log(train_df["SalePrice"])

# remember the length of train data
trainLen = len(train_df)
all_data.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [6]:
# There is a na TotalBsmtSF value in the test set for 
all_data.loc[all_data["TotalBsmtSF"].isnull(),'TotalBsmtSF'] = 0

# Add a variable for total living area
all_data['TotalLivArea'] = all_data['GrLivArea'] + all_data['TotalBsmtSF'] + all_data['GarageArea']

# Add a variable for total porch Area
all_data['TotalPorchSF'] = all_data[['OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch']].sum(axis = 1)

In [7]:
# Fill in missing values
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(all_data.loc[all_data['LotFrontage'].notnull(), ['LotArea']],
        all_data.loc[all_data['LotFrontage'].notnull(), ['LotFrontage']])
all_data.loc[all_data['LotFrontage'].isnull(), ['LotFrontage']] = reg.predict(
    all_data.loc[all_data['LotFrontage'].isnull(), ['LotArea']])

In [8]:
# Add some feature
all_data['IsRegularLotShape'] = (all_data['LotShape'] == 'Reg') * 1
all_data['IsLandLevel'] = (all_data['LandContour'] == 'Lv1') * 1
all_data['IsLandSlopeGentle'] = (all_data['LandSlope'] == 'Gtl') * 1
all_data['IsElectricalBrkr'] = (all_data['Electrical'] == 'SBrkr') * 1
all_data['IsGarageDetached'] = (all_data['GarageType'] == 'Detchd') * 1
all_data['IsPavedDrive'] = (all_data['PavedDrive'] == 'Y') * 1
all_data['HasShed'] = (all_data['MiscFeature'] == 'Shed') * 1
all_data['Remodeled'] = (all_data['YearRemodAdd'] != all_data['YearBuilt']) * 1
all_data['RecentRemodel'] = (all_data['YearRemodAdd'] == all_data['YearBuilt']) * 1
all_data['VeryNewHouse'] = (all_data['YearBuilt'] == all_data['YrSold']) * 1
all_data['Has2ndFloor'] = (all_data['2ndFlrSF'] > 0) * 1
all_data['HasMasVnr'] = (all_data['MasVnrArea'] > 0) * 1
all_data['HasWoodDeck'] = (all_data['WoodDeckSF'] > 0) * 1
all_data['HasOpenPorch'] = (all_data['OpenPorchSF'] > 0) * 1
all_data['HasEnclosedPorch'] = (all_data['EnclosedPorch'] > 0) * 1
all_data['Has3SsnPorch'] = (all_data['3SsnPorch'] > 0) * 1
all_data['HasScreenPorch'] = (all_data['ScreenPorch'] > 0) * 1

In [9]:
qual_dict = {None: 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5, 'NA' : 0}
QualFeats = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 
             'HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC']
for feat in QualFeats:
    all_data[feat + '_Quant'] = all_data[feat].map(qual_dict).astype(int)

all_data['LotShape_Quant'] = all_data['LotShape'].replace([None, 'IR3','IR2','IR1' ,'Reg'], [0,1,2,3,4])
all_data['LandContour_Quant'] = all_data['LandContour'].replace([None, 'Low', 'HLS', 'Bnk','Lvl'], [0,1,2,3,4])
all_data['Utilities_Quant'] = all_data['Utilities'].replace([None, 'ELO','NoSeWa','NoSewr','AllPub'], [0,1,2,3,4])
all_data['LandSlope_Quant'] = all_data['LandSlope'].replace([None, 'Sev' , 'Mod', 'Gtl'], [0,1,2,3])
all_data['BsmtExposure_Quant'] = all_data['BsmtExposure'].replace([None, 'No', 'Mn', 'Av', 'Gd'], [0,1,2,3,4])
all_data['BsmtFinType1_Quant'] = all_data['BsmtFinType1'].replace([None, 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'], [0,1,2,3,4,5,6])
all_data['BsmtFinType2_Quant'] = all_data['BsmtFinType2'].replace([None, 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'], [0,1,2,3,4,5,6])
all_data['Functional_Quant'] = all_data['Functional'].replace([None, 'Sal','Sev','Maj2','Maj1','Mod','Min2','Min1','Typ'], [
        0,1,2,3,4,5,6,7,8])
all_data['GarageFinish_Quant'] = all_data['GarageFinish'].replace([None, 'Unf', 'RFn', 'Fin'], [0,1,2,3])
all_data['Fence_Quant'] = all_data['Fence'].replace([None, 'MnWw', 'GdWo', 'MnPrv', 'GdPrv'], [0,1,2,3,4])
all_data['CentralAir_Quant'] = all_data['CentralAir'].replace([None,'N','Y'], [0,0,1])
all_data['NewerDwelling'] = all_data['MSSubClass'].map({20: 1, 30: 0, 40: 0, 45: 0,50: 0, 60: 1,
                                                        70: 0, 75: 0, 80: 0, 85: 0, 90: 0, 120: 1,
                                                        150: 0, 160: 0, 180: 0, 190: 0}).astype(int)
all_data['GoodNeighborhood'] = 0
all_data['GoodNeighborhood'] = ((all_data['Neighborhood'] == 'NridgHt') | 
                                (all_data['Neighborhood'] == 'Crawfor') |
                                (all_data['Neighborhood'] == 'StoneBr') |
                                (all_data['Neighborhood'] == 'Somerst') |
                                (all_data['Neighborhood'] == 'NoRidge')) * 1
all_data['SaleCondition_PriceDown'] = all_data['SaleCondition'].replace(
        {'Abnorml': 1, 'Alloca': 2, 'AdjLand': 3, 'Family': 4, 'Normal': 5, 'Partial': 0})
all_data['BoughtOffPlan'] = all_data['SaleCondition'].replace(
        {'Abnorml' : 0, 'Alloca' : 0, 'AdjLand' : 0, 'Family' : 0, 'Normal' : 0, 'Partial' : 1})



all_data['MSZoning'].fillna('RL', inplace=True)
all_data['Exterior1st'].fillna('Other', inplace=True)
all_data['Exterior2nd'].fillna('Other', inplace=True)
all_data['MasVnrType'].fillna('None', inplace=True)
all_data['SaleType'].fillna('Oth', inplace=True)

In [10]:
# Let's add time interval in year between build and sold time.
all_data['Age'] = 2010 -  all_data['YearBuilt']
all_data['TimeSinceSold'] = 2010 -  all_data['YearRemodAdd']
all_data['BltYears'] = all_data['YrSold'] -  all_data['YearBuilt']
all_data['RemYears'] = all_data['YrSold'] -  all_data['YearRemodAdd']
all_data['GaBltyears'] = all_data['YrSold'] -  all_data['GarageYrBlt']

In [11]:
# MoSold seems more like a categorical data, let's check the distribution
# sns.boxplot(y = Y, x = all_data[:trainLen]['MoSold'])
all_data['HighSeason'] = all_data['MoSold'].map({1: 0, 2: 0, 3: 0, 4: 1, 5: 1, 6: 1, 
                                                 7: 1, 8: 0, 9: 0, 10: 0, 11: 0, 12: 0}).astype(int)
all_data['SeasonSold'] = all_data['MoSold'].map({12:0, 1:0, 2:0, 3:1, 4:1, 5:1, 
                                                  6:2, 7:2, 8:2, 9:3, 10:3, 11:3}).astype(int)



In [12]:
# Summary district
neighborhood_map = {
        'MeadowV' : 0,  #  88000
        'IDOTRR' : 1,   # 103000
        'BrDale' : 1,   # 106000
        'OldTown' : 1,  # 119000
        'Edwards' : 1,  # 119500
        'BrkSide' : 1,  # 124300
        'Sawyer' : 1,   # 135000
        'Blueste' : 1,  # 137500
        'SWISU' : 2,    # 139500
        'NAmes' : 2,    # 140000
        'NPkVill' : 2,  # 146000
        'Mitchel' : 2,  # 153500
        'SawyerW' : 2,  # 179900
        'Gilbert' : 2,  # 181000
        'NWAmes' : 2,   # 182900
        'Blmngtn' : 2,  # 191000
        'CollgCr' : 2,  # 197200
        'ClearCr' : 3,  # 200250
        'Crawfor' : 3,  # 200624
        'Veenker' : 3,  # 218000
        'Somerst' : 3,  # 225500
        'Timber' : 3,   # 228475
        'StoneBr' : 4,  # 278000
        'NoRidge' : 4,  # 290000
        'NridgHt' : 4,  # 315000
}

all_data['NeighborhoodBin'] = all_data['Neighborhood'].map(neighborhood_map).astype(int)

In [13]:
# check na value count
nullCnt = pd.DataFrame({'nullNums' : all_data.isnull().sum()})
nullCnt['DataType'] = all_data[nullCnt.index].dtypes
print nullCnt[nullCnt['nullNums'] > 0].sort_values(by = 'DataType')

              nullNums DataType
GaBltyears         158  float64
TotalLivArea         1  float64
BsmtHalfBath         2  float64
BsmtFullBath         2  float64
GarageCars           1  float64
BsmtUnfSF            1  float64
BsmtFinSF2           1  float64
GarageArea           1  float64
MasVnrArea          23  float64
BsmtFinSF1           1  float64
GarageYrBlt        158  float64
GarageFinish       158   object
GarageQual         158   object
GarageCond         158   object
PoolQC            2907   object
Fence             2345   object
MiscFeature       2810   object
Alley             2717   object
FireplaceQu       1420   object
Functional           2   object
Electrical           1   object
BsmtFinType2        80   object
BsmtFinType1        79   object
BsmtExposure        82   object
BsmtCond            82   object
BsmtQual            81   object
Utilities            2   object
GarageType         158   object
KitchenQual          1   object


In [14]:
# fill numerical na with 0, category na with 'None'
numeric_feature = all_data.columns[all_data.dtypes != 'object']
all_data[numeric_feature] = all_data[numeric_feature].fillna(0)

category_feature = all_data.columns[all_data.dtypes == 'object']
all_data[category_feature] = all_data[category_feature].fillna('None')

In [15]:
all_data['MoSold_Cate'] = all_data['MoSold'].astype('category')
all_data['MSSubClass'] = all_data['MSSubClass'].astype('category')

In [16]:
numeric_feature = all_data.columns[(all_data.dtypes != 'object') & (all_data.dtypes != 'category')]
skewed_feats = all_data[numeric_feature].apply(lambda x: skew(x[x > 0].astype(float)))
skewed_feats = skewed_feats[skewed_feats > 0.9].index
all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

In [17]:
# Additional processing: scale the data.   
from sklearn.preprocessing import StandardScaler, RobustScaler
scaler = StandardScaler()
scaler.fit(all_data[numeric_feature])

all_data.loc[:, numeric_feature] = scaler.transform(all_data[numeric_feature])

In [18]:
# There're several pairs of feature that should be merged when one-hot encoded

# Deal with Exterior
for name in all_data["Exterior1st"].dropna().unique():
    all_data["Exterior" + "-" + name] = 1 * ((all_data["Exterior1st"] == name) | (all_data["Exterior2nd"] == name))

# Deal with Condition
for name in all_data["Condition1"].dropna().unique():
    all_data["Condition" + "-" + name] = 1 * ((all_data["Condition1"] == name) | (all_data["Condition2"] == name))
    
all_data = all_data.drop(['Exterior1st','Exterior2nd'], axis=1)
all_data = all_data.drop(['Condition1','Condition2'], axis=1)

In [19]:
# one-hot encode category data
all_data = pd.get_dummies(all_data)
all_data.shape
# all_data.head()

(2915, 361)

In [20]:
# Several types of features should be avoid.
# 1. All zero feature in either train set or test set
# 2. Extremely sparse features
trainShape = all_data[:trainLen].shape
testShape = all_data[trainLen:].shape
trainShape, testShape

((1456, 361), (1459, 361))

In [21]:
# Drop low variance candidates
from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=(0 * (1 - 0)))
sel.fit(all_data[:trainLen])
train_feats = sel.get_support()
sel.fit(all_data[trainLen:])
test_feats = sel.get_support()

In [22]:
all_data = all_data.loc[:, train_feats & test_feats]

In [23]:
all_data.shape

(2915, 344)

In [24]:
X_train = all_data[:trainLen]
X_test  = all_data[trainLen:]

## Model

### Regularized model
Same as before try both ridge and lasso

In [25]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn import model_selection, metrics   #Additional scklearn functions

In [27]:
lasso_param_test = {
    'alpha':[0.0002,0.0005,0.001,0.005,0.01,0.05]
#     'alpha': np.arange(0.0002,0.0007,0.00005)
}

lasso_search = GridSearchCV(
    estimator = Lasso(max_iter = 10000),
    param_grid = lasso_param_test, 
    scoring='neg_mean_squared_error',
    n_jobs=4,
    iid=False, 
    cv=5
)
lasso_search.fit(X_train,Y)
zip(np.sqrt(-lasso_search.cv_results_['mean_test_score']),lasso_search.cv_results_['params']), lasso_search.best_params_,np.sqrt(- lasso_search.best_score_)

([(0.1111355571460618, {'alpha': 0.0002}),
  (0.10892105240246958, {'alpha': 0.0005}),
  (0.10931306187247626, {'alpha': 0.001}),
  (0.11511128694902441, {'alpha': 0.005}),
  (0.11934623731003474, {'alpha': 0.01}),
  (0.15775502473448547, {'alpha': 0.05})],
 {'alpha': 0.0005},
 0.10892105240246958)

As before LASSO works better. Let's take a look at the top features first.

### Xgboost

In [28]:
import xgboost as xgb
from xgboost.sklearn import XGBRegressor

In [32]:
# All tuning would be done within this cell
xgb_param_test = {
#     'max_depth':range(3,10,3),
#     'min_child_weight':range(1,8,2)
#     'gamma' : [0.0005, 0.001,0.005,0.01,0.05,0.1]
#     'gamma' : np.arange(0.003, 0.01, 0.0005)
#     'subsample': np.arange(0.1,1,0.2),
#     'colsample_bytree': np.arange(0.1,1,0.2),
#     'subsample': np.arange(0.4,0.6,0.04),
#     'colsample_bytree': np.arange(0.8,1,0.04),
    'reg_alpha' : [0.0001,0.0005,0.001,0.005,0.01,0.05],
    'reg_lambda' : [0.001,0.005,0.01,0.05,0.1,0.5,1,3,10]
#     'reg_alpha' : np.arange(7e-6,9e-6,3e-7) # 8.2e-6
#     'reg_lambda' : np.arange(0.7,1.1,0.03)
}
xgb_search = GridSearchCV(
    estimator = XGBRegressor(
        learning_rate =0.1,
        n_estimators=140,
        max_depth=3,
        min_child_weight=3,
        gamma=0.005,
        subsample=0.7,
        colsample_bytree=0.3,
        objective= 'reg:linear',
        nthread=4,
        scale_pos_weight=1,
        reg_alpha = 0.005,
        reg_lambda = 0.005,        
        seed=100
    ),
    param_grid = xgb_param_test,
    scoring = 'neg_mean_squared_error',
    n_jobs = 1,
    iid = False,
    cv = 5
)
xgb_search.fit(X_train, Y)
zip(np.sqrt(-xgb_search.cv_results_['mean_test_score']),xgb_search.cv_results_['params']), xgb_search.best_params_,np.sqrt(- xgb_search.best_score_)

([(0.11319861710467638, {'reg_alpha': 0.0001, 'reg_lambda': 0.001}),
  (0.11308659679927889, {'reg_alpha': 0.0001, 'reg_lambda': 0.005}),
  (0.11349974314414901, {'reg_alpha': 0.0001, 'reg_lambda': 0.01}),
  (0.11384893478982977, {'reg_alpha': 0.0001, 'reg_lambda': 0.05}),
  (0.11518235258281116, {'reg_alpha': 0.0001, 'reg_lambda': 0.1}),
  (0.11740754442979882, {'reg_alpha': 0.0001, 'reg_lambda': 0.5}),
  (0.11717870661418714, {'reg_alpha': 0.0001, 'reg_lambda': 1}),
  (0.11689497069836971, {'reg_alpha': 0.0001, 'reg_lambda': 3}),
  (0.11684925931108561, {'reg_alpha': 0.0001, 'reg_lambda': 10}),
  (0.11316236516284806, {'reg_alpha': 0.0005, 'reg_lambda': 0.001}),
  (0.11311912999353588, {'reg_alpha': 0.0005, 'reg_lambda': 0.005}),
  (0.11398938147372378, {'reg_alpha': 0.0005, 'reg_lambda': 0.01}),
  (0.11388538153360214, {'reg_alpha': 0.0005, 'reg_lambda': 0.05}),
  (0.11493284909429179, {'reg_alpha': 0.0005, 'reg_lambda': 0.1}),
  (0.1175859180264322, {'reg_alpha': 0.0005, 'reg_lambd

### Average Result

In [34]:
from sklearn.ensemble import BaggingRegressor
from sklearn.model_selection import KFold
model_lasso = Lasso(alpha = 0.0005, max_iter = 50000)
model_xgb = XGBRegressor( # use all features
    learning_rate =0.01,
    n_estimators=5000,
    max_depth=3,
    min_child_weight=3,
    gamma=0.01,
    subsample=0.7,
    colsample_bytree=0.3,
    objective= 'reg:linear',
    nthread=4,
    scale_pos_weight=1,
    reg_alpha = 0.005,
    reg_lambda = 0.005, 
    seed=100
)

In [35]:
kf = KFold(n_splits=5)
idxcnt = 1
result = pd.DataFrame({'dummy' : test_df.Id})
result['dummy'] = 0

for train_index, test_index in kf.split(X_train):
    model_lasso.fit(X_train.iloc[train_index,:], Y.iloc[train_index])
    model_xgb.fit(X_train.iloc[train_index,:], Y.iloc[train_index])
    res_lasso = pd.DataFrame({'Lasso_' + str(idxcnt) : np.exp(model_lasso.predict(X_test))})
    res_xgb   = pd.DataFrame({'Xgb_' + str(idxcnt)   : np.exp(model_xgb.predict(X_test))})
    result = pd.concat((result, res_lasso, res_xgb), axis = 1)
    idxcnt += 1
result = result.drop('dummy', axis = 1)
result.head()

Unnamed: 0,Lasso_1,Xgb_1,Lasso_2,Xgb_2,Lasso_3,Xgb_3,Lasso_4,Xgb_4,Lasso_5,Xgb_5
0,123029.099069,130193.023438,123965.674301,131800.203125,122162.774977,130005.304688,125169.398832,134009.375,122732.051415,129670.859375
1,152447.086669,155931.109375,150675.94569,160202.78125,152428.100069,157354.4375,152265.035783,152372.71875,152989.591729,153003.078125
2,179837.083944,184983.203125,182815.728922,186337.34375,176837.98728,188354.296875,176839.281491,186370.046875,180338.432921,193164.984375
3,196528.202382,199833.203125,196887.069518,196749.15625,196214.581294,193698.65625,192684.056586,194015.171875,194788.109483,199242.53125
4,197185.702364,182855.8125,196766.023775,178076.1875,196597.795574,177957.1875,200147.729911,184053.90625,197198.146859,189413.125


In [36]:
final_result = pd.DataFrame({'Id' : test_df.Id, 'SalePrice' : result.mean(axis = 1)})

In [37]:
final_result.to_csv("final_result.csv", index = False)