## XGBoost Model
- Popular tree-boosting model (especially on Kaggle)

In [1]:
import pandas as pd
from pandas_summary import DataFrameSummary
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from nbutils import utils
%matplotlib inline

In [2]:
# Name of the regression target and location of the Kaggle CSV files.
target_col = 'SalePrice'
data_dir = '_data/'
# Raw training data, exactly as read from disk.
df_train = pd.read_csv(data_dir + 'train.csv')

def drop_cols(df):
    """
    Return a copy of ``df`` without the sparsely populated attributes.

    Most properties (>80%) do not have these features, so they are removed
    as a pre-processing step.  The input frame is left untouched; a
    ``KeyError`` is raised by pandas if any of the columns is absent.
    """
    sparse_attributes = ['Alley', 'Fence', 'MiscFeature', 'PoolArea', 'PoolQC']
    return df.drop(columns=sparse_attributes)

# 'Id' is dropped from the training frame before any modelling.
df_train = df_train.drop(columns='Id')
# Second variant of the training frame, without the sparsely populated attributes.
df_train1 = drop_cols(df_train)


In [3]:
from sklearn.model_selection import train_test_split 
from sklearn import metrics
from scipy import stats

def hot_encode_categorial_variables(df, required_cols=None, fill_value=np.nan):
    """
    One-hot encode the categorical columns of ``df`` with ``pd.get_dummies``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode.  It is not modified; a new frame is returned.
    required_cols : iterable of column names, optional
        Columns that must exist in the result (typically the columns of the
        encoded training set).  Any that are missing after encoding are
        appended as constant columns.
    fill_value : scalar, optional
        Value used for the appended columns.  Defaults to NaN, which is the
        historical behaviour; pass ``0`` to mark "category not present" for
        dummy columns.

    Returns
    -------
    pandas.DataFrame
        The encoded frame, followed by any appended required columns.
    """
    encoded = pd.get_dummies(df)
    if required_cols is None:
        return encoded

    # Collect each missing column once, preserving the order in required_cols.
    known = set(encoded.columns)
    missing = []
    for col in required_cols:
        if col not in known:
            known.add(col)
            missing.append(col)

    if missing:
        # Append all missing columns in a single concat instead of inserting
        # them one by one, which fragments the frame when many are missing.
        filler = pd.DataFrame(fill_value, index=encoded.index, columns=missing)
        encoded = pd.concat([encoded, filler], axis=1)
    return encoded

def predict_score(mymodel, xtrain, xtest, ytrain, ytest, verbose=True):
    """
    Fit ``mymodel`` on the training split and score it on the test split.

    Returns a dict with two keys: ``'model'`` (the same estimator object,
    now fitted) and ``'err'`` (the root-mean-squared error of the test
    predictions divided by 1000).

    ``verbose`` is accepted but not used by this function.
    """
    mymodel.fit(xtrain, ytrain)
    # Predict the prices for the held-out rows.
    predictions = mymodel.predict(xtest)
    mse = metrics.mean_squared_error(ytest, predictions)
    return {'model': mymodel, 'err': np.sqrt(mse) / 1000}

def xgb_predict(seeds,xtrain, xtest, ytrain, ytest):
    """
    Train one XGBoost regressor per seed and return the best one.

    For every value in ``seeds`` an ``XGBRegressor`` is fitted on
    (``xtrain``, ``ytrain``) and scored on (``xtest``, ``ytest``) through
    ``predict_score``.  Summary statistics of all the errors are printed
    with ``scipy.stats.describe``.

    Returns the fitted model with the lowest error (the first such model on
    ties), or ``None`` if the loop body never ran.
    """
    rms = list()
    best_model = None
    best_err = None
    for s in seeds:
        p = xgb.XGBRegressor(objective='reg:squarederror', random_state=s)
        results = predict_score(p, xtrain, xtest, ytrain, ytest)
        err = results['err']
        rms.append(err)
        # Compare against the best error seen so far.  Comparing against
        # np.min(rms) after the append can never succeed, because the minimum
        # already includes err, so the first model would always be returned.
        if best_err is None or err < best_err:
            best_err = err
            best_model = results['model']
    print(stats.describe(np.array(rms)))
    return best_model
    
def xgb_split_and_predict(seeds, df):
    """
    Split ``df`` into train/test parts and run ``xgb_predict`` on them.

    The column named by the module-level ``target_col`` is the target; all
    other columns are features.  A quarter of the rows is held out for
    testing, with a fixed ``random_state`` so the split is repeatable.
    Returns whatever ``xgb_predict`` returns.
    """
    features = df.drop(columns=target_col)
    target = df[target_col].values
    # train_test_split yields (xtrain, xtest, ytrain, ytest) in that order.
    split = train_test_split(features, target, test_size=1/4, random_state=0)
    return xgb_predict(seeds, *split)

In [4]:
import xgboost as xgb

print('\nAttempt 6: XGBoost')
# Draw the ten model seeds from a fixed-seed generator so that a
# Restart & Run All reproduces the same seeds instead of depending on the
# state of the global NumPy random generator.
seed_rng = np.random.RandomState(0)
seeds = (seed_rng.random_sample((10,)) * 100).astype(int)
xgboost1 = xgb_split_and_predict(seeds, hot_encode_categorial_variables(df_train))

print('\nAttempt 6: XGBoost with less attributes')
# Same experiment on the frame with the sparsely populated attributes removed;
# the encoded frame is kept because later cells read its columns.
xgboost2_train = hot_encode_categorial_variables(df_train1)
xgboost2 = xgb_split_and_predict(seeds, xgboost2_train)



Attempt 6: XGBoost
DescribeResult(nobs=10, minmax=(33.914644656927635, 33.914644656927635), mean=33.914644656927635, variance=0.0, skewness=0.0, kurtosis=-3.0)

Attempt 6: XGBoost with less attributes
DescribeResult(nobs=10, minmax=(33.71647608069665, 33.71647608069665), mean=33.71647608069666, variance=5.609677548238306e-29, skewness=-1.0, kurtosis=-2.0)


In [5]:
# Let's see if we can improve this score by imputing the missing (NA) values using multivariate imputation

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor

# Regressors tried, one at a time, as the estimator inside IterativeImputer.
estimators = [
    BayesianRidge(),
    DecisionTreeRegressor(random_state=0, max_features='sqrt'),
    ExtraTreesRegressor(random_state=0, n_estimators=10),
    KNeighborsRegressor(n_neighbors=15),
]

def get_imputer(estimator, X):
    """
    Build an ``IterativeImputer`` around ``estimator`` and fit it on ``X``.

    Prints the estimator's class name before fitting and a completion
    message afterwards.  Returns the fitted imputer.
    """
    print('Imputation estimator=' + type(estimator).__name__)
    # fit() returns the imputer itself, so construction and fitting can be chained.
    fitted = IterativeImputer(estimator=estimator, random_state=0).fit(X)
    print('Imputation done')
    return fitted

def impute_and_predict(imputer, X, y):
    """
    Impute ``X`` with an already-fitted ``imputer``, then train and score XGBoost.

    The imputed features and ``y`` are split with a quarter of the rows held
    out (fixed ``random_state``), and the parts are handed to ``xgb_predict``.
    Note that the seeds come from the module-level ``seeds`` variable, which
    must exist before this function is called.  Returns the model returned
    by ``xgb_predict``.
    """
    imputed = imputer.transform(X)
    parts = train_test_split(imputed, y, test_size=1/4, random_state=0)
    xtrain, xtest, ytrain, ytest = parts
    return xgb_predict(seeds, xtrain, xtest, ytrain, ytest)

In [6]:
# One-hot encoded features of the full training frame, and the matching target.
y = df_train[target_col]
X = pd.get_dummies(df_train.drop(columns=target_col))
# Fit each candidate imputer on the training features, then score XGBoost on the imputed data.
for imputation_estimator in estimators:
    fitted_imputer = get_imputer(imputation_estimator, X)
    impute_and_predict(fitted_imputer, X, y)

Imputation estimator=BayesianRidge
Imputation done
DescribeResult(nobs=10, minmax=(31.248909682255505, 31.248909682255505), mean=31.248909682255505, variance=0.0, skewness=0.0, kurtosis=-3.0)
Imputation estimator=DecisionTreeRegressor




Imputation done
DescribeResult(nobs=10, minmax=(33.1674164379457, 33.1674164379457), mean=33.1674164379457, variance=0.0, skewness=0.0, kurtosis=-3.0)
Imputation estimator=ExtraTreesRegressor
Imputation done
DescribeResult(nobs=10, minmax=(31.720118299893862, 31.720118299893862), mean=31.72011829989386, variance=1.4024193870595766e-29, skewness=1.0, kurtosis=-2.0)
Imputation estimator=KNeighborsRegressor
Imputation done
DescribeResult(nobs=10, minmax=(31.718221422070982, 31.718221422070982), mean=31.718221422070986, variance=1.4024193870595766e-29, skewness=-1.0, kurtosis=-2.0)


**Imputing the training data led to slightly better results (around 6%)**

In [19]:
# Score the Kaggle test set with the reduced-attribute model and write the submission file.
test = pd.read_csv(data_dir + 'test.csv')
train_cols = xgboost2_train.drop(columns='SalePrice').columns
# Encode the test set and make sure every training column exists in it.
test_encoded = hot_encode_categorial_variables(drop_cols(test), train_cols)
testy = xgboost2.predict(test_encoded[train_cols])
results = pd.DataFrame(
    {'SalePrice': testy, 'Id': test_encoded.Id},
    columns=['SalePrice', 'Id'],
)
results.to_csv(data_dir + 'xgb2_prediction.csv', index=False)

In [10]:
# In the Kaggle forum, users suggest that imputing on the concatenation of the
# training set and the test set leads to better results.
# X is one-hot encoded, so the raw test frame has to be encoded the same way and
# aligned to X's columns before the two can be stacked.  Columns of X that the
# encoded test set lacks are filled with 0; existing NaNs are left for the imputer.
test_X = pd.get_dummies(test).reindex(columns=X.columns, fill_value=0)
# DataFrame.append was removed in pandas 2.0; pd.concat does the same stacking.
X_all = pd.concat([X, test_X], sort=False)

for e in estimators:
    impute_and_predict(get_imputer(e, X_all),X,y)

Imputation estimator=BayesianRidge
Imputation done
DescribeResult(nobs=10, minmax=(32.118017227283225, 32.118017227283225), mean=32.11801722728323, variance=5.609677548238306e-29, skewness=-1.0, kurtosis=-2.0)
Imputation estimator=DecisionTreeRegressor




Imputation done
DescribeResult(nobs=10, minmax=(32.120165183997685, 32.120165183997685), mean=32.120165183997685, variance=0.0, skewness=0.0, kurtosis=-3.0)
Imputation estimator=ExtraTreesRegressor




Imputation done
DescribeResult(nobs=10, minmax=(32.31380808915051, 32.31380808915051), mean=32.313808089150506, variance=5.609677548238306e-29, skewness=1.0, kurtosis=-2.0)
Imputation estimator=KNeighborsRegressor
Imputation done
DescribeResult(nobs=10, minmax=(31.995452188962105, 31.995452188962105), mean=31.995452188962098, variance=5.609677548238306e-29, skewness=1.0, kurtosis=-2.0)


**The mean error is higher than before, so it seems using the test set for the imputation process doesn't provide much benefit in this case.  Still I will create another set of predictions and have it scored.**

In [None]:
# Generate another submission: fit a KNN-based imputer on the combined train + test features.
estimator = KNeighborsRegressor(n_neighbors=15)
imputer = get_imputer(X=X_all, estimator=estimator)


Imputation estimator=KNeighborsRegressor


In [None]:
# Train the final model on the imputed training features.
mymodel = xgb.XGBRegressor(objective='reg:squarederror')
mymodel.fit(imputer.transform(X), y)
train_cols = X.columns

# Encode the full test set.  drop_cols is deliberately not applied here, because
# X (and therefore the imputer and the model) was built from the frame that
# still contains those attributes.  'Id' stays in test_encoded for the
# submission file.
test_encoded = pd.get_dummies(test)
# Add the columns of X that the encoded test set lacks, filled with 0.
missing_cols = [c for c in train_cols if c not in test_encoded.columns]
test_encoded = test_encoded.reindex(
    columns=list(test_encoded.columns) + missing_cols, fill_value=0)

# Predict with the model trained above (not xgboost2), on test features run
# through the same imputer and in the same column order as the training data.
testy = mymodel.predict(imputer.transform(test_encoded[train_cols]))

In [None]:
# Number of columns in the encoded test frame.
test_encoded.shape[1]

In [None]:
# Assemble the submission frame (predicted price plus row Id) and write it to disk.
results = pd.DataFrame(
    {'SalePrice': testy, 'Id': test_encoded.Id},
    columns=['SalePrice', 'Id'],
)
results.to_csv(data_dir + 'xgb3_prediction.csv', index=False)