# Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

# Load Data

In [2]:
X = pd.read_csv('train.csv', index_col='Id')
X_test_full = pd.read_csv('test.csv', index_col='Id')

In [3]:
X.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


The goal of this exercise is to predict the SalePrice of a house given the other features.

It is a good training ground to learn the XGBoost method.

# Preprocessing

In [4]:
# Remove rows with missing target, separate target from predictors
X.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X.SalePrice              
X.drop(['SalePrice'], axis=1, inplace=True)

# Break off validation set from training data
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                                random_state=0)

In [5]:
cat_cols = X_train_full[[cname for cname in X_train_full.columns if 
                        X_train_full[cname].dtype == "object"]] #categorical columns

Some of the categorical columns have too many values. We need to remove them, or else one-hot-encoding will generate too many new entries.

In [6]:
cat_cols.nunique() #Some of the categorical columns

MSZoning          5
Street            2
Alley             2
LotShape          4
LandContour       4
Utilities         2
LotConfig         5
LandSlope         3
Neighborhood     25
Condition1        9
Condition2        6
BldgType          5
HouseStyle        8
RoofStyle         6
RoofMatl          7
Exterior1st      15
Exterior2nd      16
MasVnrType        3
ExterQual         4
ExterCond         5
Foundation        6
BsmtQual          4
BsmtCond          4
BsmtExposure      4
BsmtFinType1      6
BsmtFinType2      6
Heating           6
HeatingQC         5
CentralAir        2
Electrical        5
KitchenQual       4
Functional        6
FireplaceQu       5
GarageType        6
GarageFinish      3
GarageQual        5
GarageCond        5
PavedDrive        3
PoolQC            3
Fence             4
MiscFeature       3
SaleType          9
SaleCondition     6
dtype: int64

In [7]:
to_drop=['Neighborhood','Exterior1st','Exterior2nd'] #problematic categorical variables

X_train = X_train_full.drop(to_drop, axis = 1) # we drop them
X_valid = X_valid_full.drop(to_drop, axis = 1)
X_test = X_test_full.drop(to_drop, axis = 1)

X_train = pd.get_dummies(X_train) #one-hot encoding the categorical variables
X_valid = pd.get_dummies(X_valid)
X_test = pd.get_dummies(X_test)

X_train, X_valid = X_train.align(X_valid, join='left', axis=1) #align the columns
X_train, X_test = X_train.align(X_test, join='left', axis=1)

# Model

In [8]:
# Define the model
model = XGBRegressor(n_estimators=1000, learning_rate=0.05,  early_stopping_rounds=5) 

#n_estimators is the number of trees that we use
# learning rate makes the trees that we add at the end less important

# Fit the model
model.fit(X_train, y_train, 
             eval_set=[(X_valid, y_valid)], verbose = False) # Your code here # Your code here

# early_stopping stops adding trees once the accuracy starts getting reduced

predictions = model.predict(X_valid) 
print(mean_absolute_error(predictions, y_valid))#mean absolute error
print(y_valid.mean())

17032.769063035103
181370.38356164383


On average, our model is 17 thousand dollars off, which is not bad since the average price is 181370.