# XGBoost
XGBoost stands for Extreme Gradient Boost, which works through gradient descent.

## Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

## Loading data

In [2]:
# Load the .csv files into pandas dataframes
X_full = pd.read_csv('train.csv', index_col='Id')
X_test_full = pd.read_csv('test.csv', index_col='Id')

# Remove rows (axis=0) with null ('NaN') values for the target variable
X_full.dropna(axis=0, subset=['SalePrice'], inplace=True)

# Separate target and predictor variables
y = X_full.SalePrice
X = X_full.drop('SalePrice', axis=1, inplace=False)

# Split X into training and validation datasets
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size = 0.8, test_size = 0.2, random_state=0)

In [3]:
# Investigating cardinality of categorical datasets
cat_cols = [col for col in X_train_full.columns if X_train_full[col].dtypes == 'object' and X_train_full[col].nunique() < 10]
num_cols = [col for col in X_train_full.columns if X_train_full[col].dtypes in ['int64', 'float64']]

# Selecting my columns
my_cols = cat_cols + num_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

# Running some tests
# s = X_train_full.dtypes == 'object'
# print(len(s[s == False].index), len(s[s == True].index))
# print(X_train_full.dtypes.unique())
# print(len(num_cols), len(cat_cols))

## Using pipelines to further clean my data
* Numerical columns: impute missing data
* Categorical columns: impute missing data, one-hot encoding

In [4]:
numerical_transformer = SimpleImputer(strategy='mean')

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_transformer, cat_cols),
    ('num', numerical_transformer, num_cols)
])

my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05)

my_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('my_model', my_model)
])

# help(XGBRegressor)

## Training my model

    pipeline_name.set_params(...).fit()
    
* Use this method to set parameters for specific transformers in your pipeline using name__parameter notation

In [5]:
my_pipeline.set_params(my_model__early_stopping_rounds=1, my_model__eval_set=[(X_valid, y_valid)]).fit(X_train, y_train)

predictions = my_pipeline.predict(X_valid)
mae = mean_absolute_error(y_valid, predictions)
print('MAE:', mae)

MAE: 15989.218027611301


## Testing the model with model predictions

In [9]:
# No need to worry about cleaning X, y; since pre-processing is part of my_pipeline
score = cross_val_score(my_pipeline, X, y,
                        cv=5,
                        scoring='neg_mean_absolute_error'
                        )

print('Cross-validation score:', -1 * score.mean())

Cross-validation score: 16259.513230415241
