## XGBoost Model
- Popular tree-boosting model

In [1]:
import pandas as pd
from pandas_summary import DataFrameSummary
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from nbutils import utils
%matplotlib inline

In [2]:
data_dir = '_data/' 
df_train = pd.read_csv(data_dir + 'train.csv')
target_col = 'SalePrice'

def drop_cols(df):
    # Pre-process - drop attributes as most properties(>80%) do not have these features...
    return df.drop(['Alley', 'Fence','MiscFeature','PoolArea', 'PoolQC'], axis=1)

df_train = df_train.drop('Id', axis=1)
df_train1 = drop_cols(df_train)


In [3]:
from sklearn.model_selection import train_test_split 
from sklearn import metrics
from scipy import stats

def hot_encode_categorial_variables(df, required_cols=[]):
    df = pd.get_dummies(df)
    if len(required_cols) > 0:
        for c in required_cols:
            if c not in df.columns.values:
                df[c] = np.nan
    return df

def predict_score(mymodel, xtrain, xtest, ytrain, ytest, verbose=True):
    mymodel.fit(xtrain, ytrain)
    #Predicting the prices
    pred = mymodel.predict(xtest)
    err_rms = np.sqrt(metrics.mean_squared_error(ytest, pred))/1000
    return {'model':mymodel, 'err':err_rms}

def xgb_predict(seeds,xtrain, xtest, ytrain, ytest):
    """
        Train and predict.  Show 
    """
    rms = list()
    model = None
    for s in seeds:
        p = xgb.XGBRegressor(objective='reg:squarederror', random_state=s)
        results = predict_score(p, xtrain, xtest, ytrain, ytest)
        err = results['err']
        rms.append(err)
        if model is None or err < np.min(rms):
            model = results['model']
    print(stats.describe(np.array(rms)))
    return model
    
def xgb_split_and_predict(seeds, df):
    X = df.drop(target_col, axis=1)
    y = df[target_col].values
    xtrain, xtest, ytrain, ytest = train_test_split(X,y,test_size=1/4, random_state=0)
    return xgb_predict(seeds, xtrain, xtest, ytrain, ytest)

In [4]:
import xgboost as xgb

print('\nAttempt 6: XGBoost')
seeds=(np.random.random_sample((10,))*100).astype(int)
xgboost1 = xgb_split_and_predict(seeds, hot_encode_categorial_variables(df_train))

print('\nAttempt 6: XGBoost with less attributes')
xgboost2_train = hot_encode_categorial_variables(df_train1)
xgboost2 = xgb_split_and_predict(seeds, xgboost2_train)



Attempt 6: XGBoost
DescribeResult(nobs=10, minmax=(33.914644656927635, 33.914644656927635), mean=33.914644656927635, variance=0.0, skewness=0.0, kurtosis=-3.0)

Attempt 6: XGBoost with less attributes
DescribeResult(nobs=10, minmax=(33.71647608069665, 33.71647608069665), mean=33.71647608069666, variance=5.609677548238306e-29, skewness=-1.0, kurtosis=-2.0)


In [5]:
# Let's see if we can improve this score by imputating missing NA value using multivariate imputation

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor

estimators = [
    BayesianRidge(),
    DecisionTreeRegressor(max_features='sqrt', random_state=0),
    ExtraTreesRegressor(n_estimators=10, random_state=0),
    KNeighborsRegressor(n_neighbors=15)
]

def get_imputer(estimator, X):
    print('Imputation estimator=' + estimator.__class__.__name__)
    imp = IterativeImputer(random_state=0, estimator=estimator)
    imp.fit(X)
    print('Imputation done')
    return imp

def impute_and_predict(imputer, X, y):
    X_transformed = imputer.transform(X)
    xtrain, xtest, ytrain, ytest = train_test_split(X_transformed,y,test_size=1/4, random_state=0)
    model = xgb_predict(seeds,xtrain, xtest, ytrain, ytest)
    return model

In [6]:
X = pd.get_dummies(df_train.drop(target_col, axis=1))
y = df_train[target_col]
for e in estimators:
    impute_and_predict(get_imputer(e, X),X,y)

Imputation estimator=BayesianRidge
Imputation done
DescribeResult(nobs=10, minmax=(31.248909682255505, 31.248909682255505), mean=31.248909682255505, variance=0.0, skewness=0.0, kurtosis=-3.0)
Imputation estimator=DecisionTreeRegressor




Imputation done
DescribeResult(nobs=10, minmax=(33.1674164379457, 33.1674164379457), mean=33.1674164379457, variance=0.0, skewness=0.0, kurtosis=-3.0)
Imputation estimator=ExtraTreesRegressor
Imputation done
DescribeResult(nobs=10, minmax=(31.720118299893862, 31.720118299893862), mean=31.72011829989386, variance=1.4024193870595766e-29, skewness=1.0, kurtosis=-2.0)
Imputation estimator=KNeighborsRegressor
Imputation done
DescribeResult(nobs=10, minmax=(31.718221422070982, 31.718221422070982), mean=31.718221422070986, variance=1.4024193870595766e-29, skewness=-1.0, kurtosis=-2.0)


**Imputated the training data lead to slightly better results (around 6%)**

In [7]:
train_cols = xgboost2_train.drop('SalePrice', axis=1).columns
test = pd.read_csv(data_dir + 'test.csv')
test_encoded =  hot_encode_categorial_variables(drop_cols(test), train_cols)
testy = xgboost2.predict(test_encoded[train_cols])
results = pd.DataFrame()
results['SalePrice'] = testy
results['Id'] = test_encoded.Id
results.to_csv(data_dir + 'xgb2_prediction.csv', index=False)

In [8]:
# In the kaggle forum, users suggest that imputate using the concantenation of the training set and test set lead to better results
X_all = X.append(test[X.columns], sort=False)

for e in estimators:
    impute_and_predict(get_imputer(e, X_all),X,y)

KeyError: "['BsmtFinType1_Rec', 'RoofMatl_ClyTile', 'Exterior2nd_CBlock', 'SaleType_CWD', 'LotShape_IR2', 'HouseStyle_SFoyer', 'RoofStyle_Gable', 'Condition1_PosA', 'Heating_Floor', 'MasVnrType_None', 'Neighborhood_SawyerW', 'BldgType_TwnhsE', 'Heating_Grav', 'Functional_Mod', 'Electrical_FuseA', 'Neighborhood_SWISU', 'SaleCondition_Abnorml', 'Foundation_Wood', 'LotConfig_Inside', 'Neighborhood_Veenker', 'Condition1_Feedr', 'GarageQual_Gd', 'Heating_Wall', 'Street_Pave', 'GarageQual_Ex', 'Fence_GdPrv', 'Foundation_Slab', 'SaleType_Con', 'SaleType_ConLD', 'Electrical_FuseP', 'Exterior1st_Stucco', 'Fence_MnWw', 'Heating_OthW', 'Electrical_SBrkr', 'MSZoning_RM', 'Neighborhood_Gilbert', 'Exterior1st_Plywood', 'Exterior2nd_AsbShng', 'CentralAir_N', 'KitchenQual_Fa', 'Exterior1st_MetalSd', 'Neighborhood_Blueste', 'Neighborhood_Somerst', 'Condition2_PosN', 'Condition1_RRNn', 'BsmtExposure_Mn', 'RoofMatl_Membran', 'RoofMatl_CompShg', 'RoofStyle_Hip', 'PavedDrive_P', 'KitchenQual_Gd', 'ExterCond_Po', 'GarageType_BuiltIn', 'BldgType_Duplex', 'Exterior1st_WdShing', 'Exterior1st_AsbShng', 'BsmtFinType2_GLQ', 'SaleType_New', 'MiscFeature_TenC', 'Neighborhood_NPkVill', 'GarageQual_Po', 'ExterQual_TA', 'ExterQual_Fa', 'Neighborhood_BrDale', 'Foundation_BrkTil', 'Neighborhood_Edwards', 'Condition2_RRAe', 'Exterior2nd_ImStucc', 'HouseStyle_2Story', 'Exterior1st_HdBoard', 'Neighborhood_Crawfor', 'Neighborhood_Mitchel', 'KitchenQual_Ex', 'Exterior1st_BrkFace', 'Functional_Typ', 'FireplaceQu_Ex', 'GarageType_Detchd', 'Exterior1st_ImStucc', 'Neighborhood_NridgHt', 'LotShape_Reg', 'SaleCondition_Partial', 'FireplaceQu_Gd', 'SaleType_WD', 'LotConfig_FR3', 'Electrical_Mix', 'BldgType_Twnhs', 'Exterior2nd_Wd Shng', 'BsmtFinType2_Unf', 'Alley_Pave', 'Condition2_RRNn', 'BsmtQual_Ex', 'Exterior1st_Wd Sdng', 'PavedDrive_N', 'Exterior1st_Stone', 'HeatingQC_Gd', 'HeatingQC_Ex', 'Condition1_Artery', 'FireplaceQu_TA', 'GarageQual_Fa', 'GarageQual_TA', 'RoofMatl_WdShngl', 'Heating_GasA', 'LandSlope_Gtl', 'MasVnrType_BrkCmn', 'Condition1_PosN', 'GarageFinish_Fin', 'Neighborhood_Blmngtn', 'RoofMatl_Metal', 'Neighborhood_IDOTRR', 'Functional_Maj1', 'RoofStyle_Mansard', 'PavedDrive_Y', 'BsmtFinType2_ALQ', 'Fence_GdWo', 'MiscFeature_Othr', 'Exterior2nd_Stone', 'Functional_Min2', 'Electrical_FuseF', 'GarageFinish_Unf', 'Exterior2nd_CmentBd', 'RoofMatl_Tar&Grv', 'Exterior2nd_Wd Sdng', 'LotConfig_Corner', 'SaleCondition_Normal', 'Exterior1st_BrkComm', 'LotConfig_CulDSac', 'RoofStyle_Shed', 'Exterior1st_CBlock', 'MasVnrType_Stone', 'Neighborhood_BrkSide', 'ExterCond_Gd', 'BsmtFinType1_BLQ', 'SaleType_Oth', 'BsmtExposure_No', 'BldgType_1Fam', 'GarageCond_Fa', 'GarageType_2Types', 'LandContour_Lvl', 'Neighborhood_NAmes', 'Neighborhood_ClearCr', 'MSZoning_RL', 'Foundation_Stone', 'Street_Grvl', 'Exterior2nd_Brk Cmn', 'BsmtCond_Fa', 'GarageType_Basment', 'Exterior2nd_HdBoard', 'FireplaceQu_Fa', 'BsmtFinType1_Unf', 'GarageCond_Gd', 'BsmtQual_Gd', 'Condition1_RRNe', 'RoofMatl_Roll', 'Functional_Min1', 'Functional_Sev', 'SaleCondition_AdjLand', 'Condition2_Norm', 'Condition1_Norm', 'Exterior2nd_VinylSd', 'ExterQual_Ex', 'BsmtQual_TA', 'LandContour_Bnk', 'LandContour_Low', 'CentralAir_Y', 'SaleType_ConLI', 'HeatingQC_TA', 'GarageFinish_RFn', 'RoofMatl_WdShake', 'MSZoning_RH', 'Condition2_RRAn', 'BsmtFinType2_BLQ', 'HouseStyle_2.5Fin', 'Exterior2nd_Other', 'ExterCond_TA', 'LotConfig_FR2', 'Alley_Grvl', 'Condition2_Feedr', 'Neighborhood_Sawyer', 'HouseStyle_1.5Fin', 'PoolQC_Gd', 'BsmtExposure_Gd', 'Functional_Maj2', 'BldgType_2fmCon', 'Exterior2nd_AsphShn', 'LotShape_IR1', 'Exterior2nd_Stucco', 'BsmtFinType2_Rec', 'Condition2_PosA', 'Neighborhood_Timber', 'Neighborhood_CollgCr', 'MiscFeature_Gar2', 'BsmtFinType1_ALQ', 'FireplaceQu_Po', 'HouseStyle_SLvl', 'Neighborhood_MeadowV', 'RoofStyle_Flat', 'Utilities_NoSeWa', 'ExterCond_Ex', 'BsmtExposure_Av', 'BsmtCond_TA', 'BsmtFinType1_LwQ', 'KitchenQual_TA', 'GarageCond_TA', 'MSZoning_C (all)', 'PoolQC_Ex', 'MiscFeature_Shed', 'MasVnrType_BrkFace', 'Fence_MnPrv', 'Condition1_RRAe', 'BsmtFinType2_LwQ', 'HeatingQC_Fa', 'SaleType_COD', 'LandSlope_Mod', 'LotShape_IR3', 'HeatingQC_Po', 'ExterQual_Gd', 'RoofStyle_Gambrel', 'Neighborhood_OldTown', 'ExterCond_Fa', 'PoolQC_Fa', 'Exterior2nd_MetalSd', 'Exterior1st_AsphShn', 'GarageType_Attchd', 'LandContour_HLS', 'Heating_GasW', 'HouseStyle_2.5Unf', 'Condition2_Artery', 'Exterior1st_CemntBd', 'Exterior1st_VinylSd', 'BsmtFinType1_GLQ', 'SaleCondition_Alloca', 'BsmtCond_Po', 'SaleCondition_Family', 'HouseStyle_1Story', 'Neighborhood_StoneBr', 'Neighborhood_NoRidge', 'Exterior2nd_Plywood', 'Foundation_CBlock', 'Foundation_PConc', 'BsmtQual_Fa', 'BsmtCond_Gd', 'MSZoning_FV', 'SaleType_ConLw', 'Neighborhood_NWAmes', 'GarageType_CarPort', 'GarageCond_Ex', 'Condition1_RRAn', 'HouseStyle_1.5Unf', 'Utilities_AllPub', 'GarageCond_Po', 'Exterior2nd_BrkFace', 'LandSlope_Sev'] not in index"

**The mean error is higher than before, so it seems using the test set for the imputation process doesn't provide much benefit in this case.  Still I will create another set of predictions and have it scored.**

In [None]:
# Generate another submission
estimator = KNeighborsRegressor(n_neighbors=15)
imputer = get_imputer(estimator, X_all)


In [None]:
mymodel = xgb.XGBRegressor(objective='reg:squarederror')
mymodel.fit(imputer.transform(X), y)
train_cols = X.columns
test_encoded =  hot_encode_categorial_variables(drop_cols(test), train_cols)
testy = xgboost2.predict(test_encoded[train_cols])

In [None]:
len(test_encoded.columns)

In [None]:
results = pd.DataFrame()
results['SalePrice'] = testy
results['Id'] = test_encoded.Id
results.to_csv(data_dir + 'xgb3_prediction.csv', index=False)