# House Prices - Advanced Regression Techniques

In [1]:
# Imports
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from scipy import stats

from xgboost import XGBRegressor

# Third-party library
import eda

In [2]:
# Load data
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")

# Split data into train and test
train_X = train_df.loc[:, train_df.columns != 'SalePrice'].copy()
train_y = train_df['SalePrice'].copy()
test_X = test_df.copy()

# For easier manipulating the dataframes
combined = [train_X, test_X]

train_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


## Exploratory Data Analysis

In [3]:
# Get a describtion of the data
desc_df = eda.describe(train_X, train_y, type = 'Regression')
desc_df

Unnamed: 0,type,MDP,var,std,min,25%,50%,75%,max,mean,MI,RFE,Importance,highestCorr,corr
Id,int64,0.00,1.777550e+05,421.610,1.0,365.75,730.5,1095.25,1460.0,730.500,0.000,55.0,0.000442,PoolQC,0.067
MSSubClass,int64,0.00,1.789338e+03,42.301,20.0,20.00,50.0,70.00,190.0,56.897,0.061,52.0,0.000805,BldgType,0.771
MSZoning,object,0.00,,,,,,,,,0.127,33.0,0.002876,Alley,0.399
LotFrontage,float64,17.74,5.897490e+02,24.285,21.0,59.00,69.0,80.00,313.0,70.050,,,0.000901,BldgType,0.471
LotArea,int64,0.00,9.962565e+07,9981.265,1300.0,7553.50,9478.5,11601.50,215245.0,10516.828,0.094,23.0,0.003047,LandSlope,0.437
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MiscVal,int64,0.00,2.461381e+05,496.123,0.0,0.00,0.0,0.00,15500.0,43.489,0.000,60.0,0.000036,MiscFeature,0.548
MoSold,int64,0.00,7.310000e+00,2.704,1.0,5.00,6.0,8.00,12.0,6.322,0.000,32.0,0.000789,YrSold,0.146
YrSold,int64,0.00,1.764000e+00,1.328,2006.0,2007.00,2008.0,2009.00,2010.0,2007.816,0.009,44.0,0.000377,MoSold,0.146
SaleType,object,0.00,,,,,,,,,0.086,40.0,0.001562,SaleCondition,0.232


### Data Dimensionality Reduction

In [4]:
features_to_remove = set()

In [5]:
# Select features with heavy NAs (more than 40% MDP)
features = desc_df.loc[desc_df['MDP'] > 40].index
features_to_remove.update(features)

print('Features to remove:', features)
print('Reason: high NA entries (More than 40%)')


Features to remove: Index(['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature'], dtype='object')
Reason: high NA entries (More than 40%)


In [6]:
# Select features with 0 mutual information
features = desc_df.loc[desc_df['MI'] == 0].index
features_to_remove.update(features)

print('Features to remove:', list(features))
print('Reason: Mutual Information of 0')


Features to remove: ['Id', 'LandSlope', 'Condition2', 'RoofMatl', 'BsmtFinSF2', 'LowQualFinSF', '3SsnPorch', 'PoolArea', 'MiscVal', 'MoSold']
Reason: Mutual Information of 0


In [7]:
# Select features ranking last 15 for RFE ranking
features = desc_df.sort_values(by = 'RFE', ascending = False).iloc[:15].index
features_to_remove.update(features)

print('Features to remove:', list(features))
print('Reason: Bad RFE ranking')

Features to remove: ['Utilities', 'MiscVal', 'Heating', 'Foundation', 'PavedDrive', 'LandSlope', 'Id', 'Street', 'LowQualFinSF', 'MSSubClass', 'LotShape', 'HalfBath', 'BldgType', 'HeatingQC', 'LotConfig']
Reason: Bad RFE ranking


In [8]:
# Select 15 features with lowest importance
features = desc_df.sort_values(by = 'Importance', ascending = True).iloc[:15].index
features_to_remove.update(features)

print('Features to remove:', list(features))
print('Reason: Low Importance')

Features to remove: ['Utilities', 'PoolQC', 'GarageCond', 'Street', 'MiscVal', 'MiscFeature', 'Heating', 'ExterCond', 'BsmtHalfBath', 'Electrical', 'YrSold', 'ExterQual', 'MasVnrType', 'Id', 'HouseStyle']
Reason: Low Importance


In [9]:
# Drop gathered features
print('Overall features to remove:', list(features_to_remove))

for df in combined:
    df.drop(columns = features_to_remove, inplace = True)    

Overall features to remove: ['HeatingQC', 'YrSold', 'Foundation', 'PavedDrive', 'BldgType', 'LotShape', 'Alley', 'Fence', 'Street', 'PoolArea', 'MiscFeature', 'HouseStyle', 'MoSold', 'FireplaceQu', 'Heating', 'MiscVal', 'BsmtHalfBath', 'GarageCond', 'BsmtFinSF2', 'RoofMatl', 'Utilities', 'LotConfig', 'LowQualFinSF', 'LandSlope', 'PoolQC', 'Id', 'MSSubClass', 'ExterCond', 'MasVnrType', '3SsnPorch', 'Electrical', 'Condition2', 'ExterQual', 'HalfBath']


### Feature Engineering

In [10]:
# Remove the two abnormal 'GrLivArea' records

indexes = train_X.sort_values(by = ['GrLivArea'], ascending = False).iloc[:2].index
train_X.drop(indexes, axis = 0, inplace = True)


In [11]:
# Basement is important and is something I consider when buying a house, so I will keep it. But it's not normal again and in fact, there are many houses with no basements!
# So I'll create a new variabe that indicates wheter the house has a basement or not. Then I'll transform the ones with basement it to log.

# Since 'GrLivArea' is not normal, we will transform it to log to fit it as much as possible

for df in combined:

    # Feature Extraction
    df['HasBsmt'] = 0
    df.loc[df['TotalBsmtSF'] > 0, 'HasBsmt'] = 1

    # 'TotalBsmtSF' Log Transformation
    df.loc[df['HasBsmt'] == 1,'TotalBsmtSF'] = np.log(df['TotalBsmtSF'])

    # 'GrLivArea' Log Transformation
    df['GrLivArea'] = np.log(df['GrLivArea'])


# Log Transformation for 'SalePrice'
train_y = np.log(train_y)

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [12]:
# Get a describtion of the data
desc_df = eda.describe(train_X, train_y, type = 'Regression')
desc_df[desc_df['corr'] > 0.8].sort_values(by = 'corr', ascending = False)

ValueError: Found input variables with inconsistent numbers of samples: [1458, 1460]

In [None]:
# Since 'GarageArea' and 'GarageCars' are correlated, we remove 'GarageCars' because it proviedes fewer information.
# 'GarageYrBlt' is also highly correlated with 'YearBuilt', and between the two, it provides less information.

for df in combined:
    df.drop(columns = ['GarageCars', 'GarageYrBlt'], inplace = True)

### Feature Selection

In [None]:
desc_df = eda.describe(train_X, train_y.ravel(), type = 'Regression')
desc_df.sort_values(by = ['MI', 'Importance'], ascending = False)

# Running the EDA once more, we can see the two features "OverallQual" and "GrLivArea" are the first two features according to MI and Importance

In [None]:
features = ['OverallQual', 'GrLivArea', 'TotalBsmtSF', 'HasBsmt']

XG_reg = XGBRegressor(n_estimators = 1000, learning_rate = 0.05)

XG_reg.fit(train_X[features], train_y)
XG_reg.score(train_X[features], train_y)

In [None]:
pred = XG_reg.predict(test_X[features])

submission_df = pd.DataFrame(data = {
    'Id': test_df['Id'],
    'SalePrice': pred
})

submission_df.to_csv(f'submission.csv', index = False)