# House Prices - Advanced Regression Techniques

In [20]:
# Imports
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from scipy import stats

from xgboost import XGBRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Third-party library
import eda

In [2]:
# Load data
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")

Id = test_df['Id']

# For easier manipulating the dataframes
combined = [train_df, test_df]

train_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


## Exploratory Data Analysis

In [3]:
# Get a describtion of the data
desc_df = eda.describe(train_df.loc[:, train_df.columns != 'SalePrice'],  train_df['SalePrice'], type = 'Regression')
desc_df

Unnamed: 0,type,MDP,var,std,min,25%,50%,75%,max,mean,MI,RFE,Importance,highestCorr,corr
Id,int64,0.00,1.777550e+05,421.610,1.0,365.75,730.5,1095.25,1460.0,730.500,0.000,55.0,0.000442,PoolQC,0.067
MSSubClass,int64,0.00,1.789338e+03,42.301,20.0,20.00,50.0,70.00,190.0,56.897,0.060,52.0,0.000805,BldgType,0.771
MSZoning,object,0.00,,,,,,,,,0.132,33.0,0.002876,Alley,0.399
LotFrontage,float64,17.74,5.897490e+02,24.285,21.0,59.00,69.0,80.00,313.0,70.050,,,0.000901,BldgType,0.471
LotArea,int64,0.00,9.962565e+07,9981.265,1300.0,7553.50,9478.5,11601.50,215245.0,10516.828,0.096,23.0,0.003047,LandSlope,0.437
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MiscVal,int64,0.00,2.461381e+05,496.123,0.0,0.00,0.0,0.00,15500.0,43.489,0.000,60.0,0.000036,MiscFeature,0.548
MoSold,int64,0.00,7.310000e+00,2.704,1.0,5.00,6.0,8.00,12.0,6.322,0.000,32.0,0.000789,YrSold,0.146
YrSold,int64,0.00,1.764000e+00,1.328,2006.0,2007.00,2008.0,2009.00,2010.0,2007.816,0.004,44.0,0.000377,MoSold,0.146
SaleType,object,0.00,,,,,,,,,0.079,40.0,0.001562,SaleCondition,0.232


### Data Dimensionality Reduction

In [4]:
features_to_remove = set()

In [5]:
# Select features with heavy NAs (more than 40% MDP)
features = desc_df.loc[desc_df['MDP'] > 40].index
features_to_remove.update(features)

print('Features to remove:', features)


Features to remove: Index(['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature'], dtype='object')


In [6]:
# Select features with 0 mutual information
features = desc_df.loc[desc_df['MI'] == 0].index
features_to_remove.update(features)

print('Features to remove:', list(features))


Features to remove: ['Id', 'Utilities', 'RoofMatl', 'Exterior1st', 'BsmtFinSF2', 'PoolArea', 'MiscVal', 'MoSold']


In [7]:
# Select features ranking last 15 for RFE ranking
features = desc_df.sort_values(by = 'RFE', ascending = False).iloc[:15].index
features_to_remove.update(features)

print('Features to remove:', list(features))


Features to remove: ['Utilities', 'MiscVal', 'Heating', 'Foundation', 'PavedDrive', 'LandSlope', 'Id', 'Street', 'LowQualFinSF', 'MSSubClass', 'LotShape', 'HalfBath', 'BldgType', 'HeatingQC', 'LotConfig']


In [8]:
# Select 15 features with lowest importance
features = desc_df.sort_values(by = 'Importance', ascending = True).iloc[:15].index
features_to_remove.update(features)

print('Features to remove:', list(features))


Features to remove: ['Utilities', 'PoolQC', 'GarageCond', 'Street', 'MiscVal', 'MiscFeature', 'Heating', 'ExterCond', 'BsmtHalfBath', 'Electrical', 'YrSold', 'ExterQual', 'MasVnrType', 'Id', 'HouseStyle']


In [9]:
# Drop gathered features
print('Overall features to remove:', list(features_to_remove))

for df in combined:
    df.drop(columns = features_to_remove, inplace = True)    

Overall features to remove: ['Utilities', 'HeatingQC', 'ExterQual', 'FireplaceQu', 'Exterior1st', 'BsmtFinSF2', 'Id', 'MasVnrType', 'MoSold', 'LotShape', 'HalfBath', 'YrSold', 'MiscFeature', 'MSSubClass', 'Foundation', 'HouseStyle', 'GarageCond', 'LandSlope', 'PavedDrive', 'LotConfig', 'Alley', 'ExterCond', 'Fence', 'Street', 'Electrical', 'PoolQC', 'Heating', 'BsmtHalfBath', 'PoolArea', 'BldgType', 'RoofMatl', 'MiscVal', 'LowQualFinSF']


### Feature Engineering

In [10]:
# Remove the two abnormal 'GrLivArea' records
indexes = train_df.sort_values(by = ['GrLivArea'], ascending = False).iloc[:2].index
train_df.drop(indexes, axis = 0, inplace = True)

# Create a new feature 'TotalBstm' feature
for df in combined:
    df['TotalSF'] = df['1stFlrSF'] + df['2ndFlrSF'] + df['TotalBsmtSF']

In [11]:
# Basement is important and is something I consider when buying a house, so I will keep it. But it's not normal again and in fact, there are many houses with no basements!
# So I'll create a new variabe that indicates wheter the house has a basement or not. Then I'll transform the ones with basement it to log.

# Since 'GrLivArea' is not normal, we will transform it to log to fit it as much as possible

for df in combined:

    # Feature Extraction
    df['HasBsmt'] = 0
    df.loc[df['TotalBsmtSF'] > 0, 'HasBsmt'] = 1

    # 'TotalBsmtSF' Log Transformation
    df.loc[df['HasBsmt'] == 1,'TotalBsmtSF'] = np.log(df['TotalBsmtSF'])

    # 'GrLivArea' Log Transformation
    df['GrLivArea'] = np.log(df['GrLivArea'])

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [12]:
# Get a describtion of the data
desc_df = eda.describe(train_df.loc[:, train_df.columns != 'SalePrice'],  train_df['SalePrice'], type = 'Regression')
desc_df[desc_df['corr'] > 0.8].sort_values(by = 'corr', ascending = False)

Unnamed: 0,type,MDP,var,std,min,25%,50%,75%,max,mean,MI,RFE,Importance,highestCorr,corr
TotalBsmtSF,float64,0.0,1.311,1.145,0.0,6.679,6.899,7.168,8.073,6.747,0.397,35.0,0.000971,HasBsmt,0.951
HasBsmt,int64,0.0,0.025,0.157,0.0,1.0,1.0,1.0,1.0,0.975,0.02,38.0,0.0,TotalBsmtSF,0.951
GarageCars,int64,0.0,0.558,0.747,0.0,1.0,2.0,2.0,4.0,1.766,0.366,4.0,0.052462,GarageArea,0.887
GarageArea,int64,0.0,45045.499,212.239,0.0,331.5,479.5,576.0,1390.0,472.05,0.362,18.0,0.003186,GarageCars,0.887
GrLivArea,float64,0.0,0.109,0.33,5.811,7.029,7.287,7.482,8.406,7.266,0.432,12.0,0.008582,TotalSF,0.856
TotalSF,int64,0.0,599245.986,774.11,334.0,2008.5,2473.0,3002.25,6872.0,2557.15,0.681,2.0,0.234203,GrLivArea,0.856
YearBuilt,int64,0.0,911.663,30.194,1872.0,1954.0,1972.5,2000.0,2010.0,1971.218,0.374,7.0,0.017473,GarageYrBlt,0.825
GarageYrBlt,float64,5.56,609.245,24.683,1900.0,1961.0,1980.0,2002.0,2010.0,1978.464,,,0.002263,YearBuilt,0.825
TotRmsAbvGrd,int64,0.0,2.611,1.616,2.0,5.0,6.0,7.0,14.0,6.511,0.217,15.0,0.009305,GrLivArea,0.823


In [13]:
# Since 'GarageArea' and 'GarageCars' are correlated, we remove 'GarageCars' because it proviedes fewer information.
# 'GarageYrBlt' is also highly correlated with 'YearBuilt', and between the two, it provides less information.

for df in combined:
    df.drop(columns = ['GarageCars', 'GarageYrBlt'], inplace = True)

### Feature Selection

In [14]:
# Get a describtion of the data
desc_df = eda.describe(train_df.loc[:, train_df.columns != 'SalePrice'],  train_df['SalePrice'], type = 'Regression')
desc_df.sort_values(by = ['MI', 'Importance'], ascending = False)

# Running the EDA once more, we can see the two features "OverallQual" and "GrLivArea" are the first two features according to MI and Importance

Unnamed: 0,type,MDP,var,std,min,25%,50%,75%,max,mean,MI,RFE,Importance,highestCorr,corr
TotalSF,int64,0.0,599246.0,774.11,334.0,2008.5,2473.0,3002.25,6872.0,2557.15,0.683,2.0,0.258275,GrLivArea,0.856
OverallQual,int64,0.0,1.894,1.376,1.0,5.0,6.0,7.0,10.0,6.094,0.571,1.0,0.43296,TotalSF,0.677
Neighborhood,object,0.0,,,,,,,,,0.498,24.0,0.002397,GarageArea,0.201
GrLivArea,float64,0.0,0.109,0.33,5.811,7.029,7.287,7.482,8.406,7.266,0.435,9.0,0.006792,TotalSF,0.856
TotalBsmtSF,float64,0.0,1.311,1.145,0.0,6.679,6.899,7.168,8.073,6.747,0.399,33.0,0.001284,HasBsmt,0.951
YearBuilt,int64,0.0,911.663,30.194,1872.0,1954.0,1972.5,2000.0,2010.0,1971.218,0.369,8.0,0.018833,YearRemodAdd,0.592
GarageArea,int64,0.0,45045.5,212.239,0.0,331.5,479.5,576.0,1390.0,472.05,0.367,17.0,0.004907,TotalSF,0.558
KitchenQual,object,0.0,,,,,,,,,0.327,11.0,0.006114,YearRemodAdd,0.383
1stFlrSF,int64,0.0,138413.4,372.039,334.0,882.0,1086.0,1390.75,3228.0,1158.851,0.312,23.0,0.001357,TotalSF,0.782
FullBath,int64,0.0,0.302,0.55,0.0,1.0,2.0,2.0,3.0,1.564,0.255,28.0,0.002758,GrLivArea,0.653


### Modeling

In [15]:
# Evaluation function

def rmsle_cv(model, X, y, n_folds = 5):    
    kf = KFold(n_folds, shuffle = True, random_state = 42).get_n_splits(X)
    rmse = np.sqrt(-cross_val_score(model, X, y, scoring = "neg_mean_squared_error", cv = kf))
    return(rmse)

In [16]:
# Final features to use for the model(s)
features = ['OverallQual', 'GrLivArea', 'TotalSF', 'HasBsmt']

In [26]:
# XGBoost Regressor
XG_reg = XGBRegressor(n_estimators = 1000, learning_rate = 0.05)
XG_reg.fit(train_df[features], train_df['SalePrice'])

# Get RMSLE score
XGB_score = rmsle_cv(XG_reg, train_df[features], train_df['SalePrice'])
print(f'XGB score: {round(XGB_score.mean(), 3)}')

XGB score: 34851.818


In [22]:
# RandomForest Regressor

XG_reg = RandomForestRegressor()
XG_reg.fit(train_df[features], train_df['SalePrice'])

# Get RMSLE score
RandomForest_score = rmsle_cv(XG_reg, train_df[features], train_df['SalePrice'])
print(f'RandomForest score: {round(RandomForest_score.mean(), 3)}')

RandomForest score: 32878.776


In [18]:
pred = XG_reg.predict(test_df[features])

submission_df = pd.DataFrame(data = {
    'Id': Id,
    'SalePrice': pred
})

submission_df.to_csv(f'submission.csv', index = False)