## XGBoost Model
- Popular tree-boosting model (especially on Kaggle)

In [1]:
import pandas as pd
from pandas_summary import DataFrameSummary
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [117]:
data_dir = '_data/' 
df_train = pd.read_csv(data_dir + 'train1.csv')
target_col = 'SalePrice'

# def preprocess(df):
#     # Find categorical variables
#     types_df = pd.DataFrame(df.dtypes).reset_index()
#     cat_cols = types_df[types_df[0] == 'object']['index'].values
#     # Assign NA as Other for these attributes, usually NA indicates Not Present
#     #df[cat_cols] = df[cat_cols].fillna('Other')
#     return df

def drop_cols(df):
    # Pre-process - drop attributes as most properties(>80%) do not have these features...
    return df.drop(['Alley', 'Fence','MiscFeature','PoolArea', 'PoolQC'], axis=1)

df_train = df_train.drop('Id', axis=1)
df_train1 = drop_cols(df_train)


In [79]:
from sklearn.model_selection import train_test_split 
from sklearn import metrics
from scipy import stats

def hot_encode_categorial_variables(df, required_cols=[]):
    df = pd.get_dummies(df)
    if len(required_cols) > 0:
        for c in required_cols:
            if c not in df.columns.values:
                df[c] = 0
    return df

def predict_score(mymodel, xtrain, xtest, ytrain, ytest, verbose=True):
    mymodel.fit(xtrain, ytrain)
    #Predicting the prices
    pred = mymodel.predict(xtest)
    err_rms = np.sqrt(metrics.mean_squared_error(ytest, pred))/1000
    return {'model':mymodel, 'err':err_rms}

def xgb_predict(seeds,xtrain, xtest, ytrain, ytest):
    rms = list()
    model = None
    for s in seeds:
        p = xgb.XGBRegressor(s)
        results = predict_score(p, xtrain, xtest, ytrain, ytest)
        err = results['err']
        rms.append(err)
        if model is None or err < np.min(rms):
            model = results['model']
    print(stats.describe(np.array(rms)))
    return model
    
def xgb_split_and_predict(seeds, df):
    X = df.drop(target_col, axis=1)
    y = df[target_col].values
    xtrain, xtest, ytrain, ytest = train_test_split(X,y,test_size=1/4, random_state=0)
    return xgb_predict(seeds, xtrain, xtest, ytrain, ytest)

In [122]:
import xgboost as xgb

print('\nAttempt 6: XGBoost')
seeds=(np.random.random_sample((10,))*100).astype(int)
xgboost1 = xgb_split_and_predict(seeds, hot_encode_categorial_variables(df_train))

print('\nAttempt 6: XGBoost with less attributes')
xgboost2_train = hot_encode_categorial_variables(df_train1)
xgboost2 = xgb_split_and_predict(seeds, xgboost2_train)



Attempt 6: XGBoost
DescribeResult(nobs=10, minmax=(29.804433210279093, 30.26136839134928), mean=30.135484506927135, variance=0.014544722019494959, skewness=-2.279346177966918, kurtosis=4.15844086001757)

Attempt 6: XGBoost with less attributes
DescribeResult(nobs=10, minmax=(28.856005810332096, 29.278142335242553), mean=29.187269170831524, variance=0.023305546984801605, skewness=-1.253318016148924, kurtosis=0.07506844685155878)


In [121]:
# Let's see if we can improve this score by imputating missing NA value using multivariate imputation

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor

estimators = [
    BayesianRidge(),
    DecisionTreeRegressor(max_features='sqrt', random_state=0),
    ExtraTreesRegressor(n_estimators=10, random_state=0),
    KNeighborsRegressor(n_neighbors=15)
]

def impute_and_predict(estimator, X, y):
    print('Imputation estimator=' + estimator.__class__.__name__)
    imp = IterativeImputer(random_state=0, estimator=estimator)
    imp.fit(X)
    X_transformed = imp.transform(X)
    xtrain, xtest, ytrain, ytest = train_test_split(X_transformed,y,test_size=1/4, random_state=0)
    xgb_predict(seeds,xtrain, xtest, ytrain, ytest)
    return imp

X = hot_encode_categorial_variables(pd.get_dummies(df_train.drop(target_col, axis=1)))
y = df_train[target_col]
for e in estimators:
    impute_and_predict(e,X,y)

Exception ignored in: <bound method DMatrix.__del__ of <xgboost.core.DMatrix object at 0x7fd38b7286a0>>
Traceback (most recent call last):
  File "/home/hungap/programs/anaconda3/lib/python3.6/site-packages/xgboost/core.py", line 366, in __del__
    if self.handle is not None:
AttributeError: 'DMatrix' object has no attribute 'handle'


Imputation estimator=BayesianRidge
DescribeResult(nobs=10, minmax=(28.570264107191022, 81.25570685806929), mean=34.84732478961672, variance=266.48609667014375, skewness=2.655731304840336, kurtosis=5.0763580867134)
Imputation estimator=DecisionTreeRegressor




DescribeResult(nobs=10, minmax=(29.73919697781643, 81.25570685806929), mean=35.15557875539179, variance=262.8448824298886, skewness=2.657880450276656, kurtosis=5.082990211712058)
Imputation estimator=ExtraTreesRegressor
DescribeResult(nobs=10, minmax=(29.59127321770937, 81.25570685806929), mean=35.326458700275296, variance=260.5470800467616, skewness=2.6644576624661225, kurtosis=5.10420866709884)
Imputation estimator=KNeighborsRegressor
DescribeResult(nobs=10, minmax=(29.41266468828967, 81.25570685806929), mean=34.899836409073295, variance=266.0081609339686, skewness=2.653575221789534, kurtosis=5.068879212402145)


In [123]:
# Use second XGBoost model to submit first predictions
test = pd.read_csv(data_dir + 'test.csv')
test_train =  hot_encode_categorial_variables(drop_cols(preprocess(test)), xgboost2_train.columns)
test_train = test_train.drop(['Id','SalePrice'], axis=1)
testy = xgboost2.predict(test_train[xgboost2_train.drop('SalePrice', axis=1).columns])
test['SalePrice'] = testy
test[['Id', 'SalePrice']].to_csv(data_dir + 'prediction.csv', index=False)

In [None]:
# Impute with test set concatenated with training set
X_all = X.append(test_train)
for e in estimators:
    print('Imputation estimator=' + e.__class__.__name__)
    imp = IterativeImputer(random_state=0, estimator=e)
    imp.fit(X_all)
    xtrain, xtest, ytrain, ytest = train_test_split(imp.transform(X),y,test_size=1/4, random_state=0)
    xgb_predict(seeds,xtrain, xtest, ytrain, ytest)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Imputation estimator=BayesianRidge
DescribeResult(nobs=10, minmax=(26.166011981512124, 26.73323052804383), mean=26.448759425151728, variance=0.02223945921456005, skewness=0.23399876642647374, kurtosis=0.5241395830990787)
Imputation estimator=DecisionTreeRegressor




DescribeResult(nobs=10, minmax=(26.166011981512124, 26.73323052804383), mean=26.448759425151728, variance=0.02223945921456005, skewness=0.23399876642647374, kurtosis=0.5241395830990787)
Imputation estimator=ExtraTreesRegressor


In [116]:
# Generate another submission wtih amputation
testx = pd.get_dummies(test.drop(target_col, axis=1))
imp_model = xgboost2.predict(imp.transform(testx))


ValueError: X has 271 features per sample, expected 288