# XGBoost Alternative Base Learners

#### Loading Libraries

In [1]:
# Numerical Computing
import numpy as np
# Data Manipulation
import pandas as pd
# Data Visualization
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# DateTime Library
import datetime as dt

# Machine Learning Libraries
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, cross_val_score, StratifiedKFold, KFold
from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingClassifier

# Datasets
from sklearn import datasets
from sklearn.datasets import load_diabetes

# Model Metrics
from sklearn.metrics import mean_squared_error as MSE, accuracy_score, confusion_matrix, classification_report, recall_score

#Extreme Gradient Boosting
from xgboost import XGBRegressor, XGBRFRegressor
from xgboost import XGBClassifier, XGBRFClassifier

# Warnings
import warnings

# Timing
import time

In [2]:
# Silence all warnings for the rest of the session.
# NOTE(review): this also hides library warnings (e.g. about unrecognised
# estimator parameters) — consider narrowing the filter.
warnings.filterwarnings(action='ignore')

### Applying gblinear

#### Loading Diabetes Data

In [3]:
# Predictors (X) and target (y) of the scikit-learn diabetes dataset
X, y = load_diabetes(
    return_X_y=True,
)

In [4]:
# One shared, seeded 5-fold splitter so every model is scored on the same folds
kfold = KFold(n_splits=5, shuffle=True, random_state=2)

In [5]:
# Cross-validated RMSE helper
def regression_model(model):
    """Return the mean cross-validated RMSE of ``model``.

    The estimator is scored on the notebook-level ``X`` and ``y`` (whatever
    they are bound to when the function is called) using the shared ``kfold``
    splitter.

    Parameters
    ----------
    model : estimator
        Unfitted estimator passed to ``cross_val_score``.

    Returns
    -------
    float
        Mean over folds of the square root of the per-fold MSE.
    """
    fold_neg_mse = cross_val_score(
        estimator=model,
        X=X,
        y=y,
        scoring='neg_mean_squared_error',
        cv=kfold,
    )
    # The scorer returns negated MSE, so flip the sign before taking the root.
    fold_rmse = (-fold_neg_mse) ** 0.5
    return fold_rmse.mean()

In [6]:
# XGBRegressor with the linear base learner (booster='gblinear')
regression_model(model=XGBRegressor(booster='gblinear'))

55.52093240797664

In [7]:
# Baseline: plain LinearRegression (ordinary least squares, no regularization term)
regression_model(model=LinearRegression())

55.50936875436023

In [8]:
# Lasso (absolute-value / L1 regularization) with default settings
regression_model(model=Lasso())

62.64904114426349

In [9]:
# Ridge (squared / L2 regularization) with default settings
regression_model(model=Ridge())

58.835292374356676

In [10]:
# XGBRegressor with the tree base learner (booster='gbtree') for comparison
regression_model(model=XGBRegressor(booster='gbtree'))

66.8205077933539

#### gblinear grid search

In [11]:
# Grid-search helper
def grid_search(params, reg=None):
    """Run a grid search over ``params`` and print the best result.

    The search is fitted on the notebook-level ``X`` and ``y`` using the
    shared ``kfold`` splitter and the ``neg_mean_squared_error`` scorer.

    Parameters
    ----------
    params : dict
        Parameter grid handed to ``GridSearchCV``.
    reg : estimator, optional
        Estimator to tune. When omitted, a fresh
        ``XGBRegressor(booster='gblinear')`` is created for this call, so no
        estimator instance is shared between calls.

    Returns
    -------
    None
        The best parameters and the best score (as RMSE) are printed.
    """
    if reg is None:
        reg = XGBRegressor(booster='gblinear')

    grid_reg = GridSearchCV(reg, params, scoring='neg_mean_squared_error', cv=kfold)
    grid_reg.fit(X, y)

    best_params = grid_reg.best_params_
    print("Best params: ", best_params)

    # best_score_ is a negated MSE; convert it to RMSE for reporting.
    best_score = np.sqrt(-grid_reg.best_score_)
    print('Best score: ', best_score)

In [12]:
# Grid over reg_alpha for the default gblinear regressor
grid_search(params=dict(reg_alpha=[0.001, 0.01, 0.1, 0.5, 1, 5]))

Best params:  {'reg_alpha': 0.01}
Best score:  55.4740863429363


In [13]:
# Grid over reg_lambda for the default gblinear regressor
grid_search(params=dict(reg_lambda=[0.001, 0.01, 0.1, 0.5, 1, 5]))

Best params:  {'reg_lambda': 0.001}
Best score:  56.17170031099451


In [14]:
# Single-point grid: feature_selector='shuffle'
grid_search(params=dict(feature_selector=['shuffle']))

Best params:  {'feature_selector': 'shuffle'}
Best score:  55.503338945521286


In [15]:
# Compare the random, greedy and thrifty feature selectors, all with the
# coord_descent updater
grid_search(
    params=dict(
        feature_selector=['random', 'greedy', 'thrifty'],
        updater=['coord_descent'],
    )
)

Best params:  {'feature_selector': 'thrifty', 'updater': 'coord_descent'}
Best score:  55.48980199825518


In [16]:
# Grid over top_k for the greedy and thrifty selectors (coord_descent updater)
grid_search(
    params=dict(
        feature_selector=['greedy', 'thrifty'],
        updater=['coord_descent'],
        top_k=[3, 5, 7, 9],
    )
)

Best params:  {'feature_selector': 'thrifty', 'top_k': 3, 'updater': 'coord_descent'}
Best score:  55.47154721588882


#### Linear Datasets

In [17]:
# Synthetic predictor: the integers 1..99 (step of 1 spelled out)
X = np.arange(1, 100, 1)

In [18]:
# Seed NumPy's global generator so the synthetic target is reproducible
np.random.seed(seed=2)

In [19]:
# Start with an empty list; the target values are appended in the next cell
y = list()

In [20]:
# Each target is its x value scaled by a fresh uniform draw from [-0.2, 0.2)
y.extend(x_value * np.random.uniform(-0.2, 0.2) for x_value in X)

In [21]:
# Convert the list of targets into a NumPy array for the ML models
y = np.asarray(y)

In [22]:
# Turn both arrays into single-column 2-D arrays (one row per sample)
X = X.reshape((len(X), 1))
y = y.reshape((len(y), 1))

In [23]:
# gblinear booster with the squared-error objective on the synthetic linear data
regression_model(XGBRegressor(booster='gblinear', objective='reg:squarederror'))

6.214942988206784

In [24]:
# gbtree booster with the squared-error objective on the same synthetic data
regression_model(XGBRegressor(booster='gbtree', objective='reg:squarederror'))

9.320187242566508

In [25]:
# Plain scikit-learn LinearRegression on the synthetic linear data
regression_model(model=LinearRegression())

6.214962315808842

## Comparing DART

#### DART with XGBRegressor

In [26]:
# Rebind X and y to the diabetes dataset (they held the synthetic data above)
X, y = load_diabetes(
    return_X_y=True,
)

In [28]:
# DART booster with the squared-error objective on the diabetes data
regression_model(XGBRegressor(booster='dart', objective='reg:squarederror'))

66.82050720095658

#### DART with XGBClassifier

In [29]:
# Read the cleaned census data from the working directory and preview it
df_census = pd.read_csv(filepath_or_buffer='census_cleaned.csv')
df_census.head(n=5)

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,...,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia,income_ >50K
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,38,215646,9,0,0,40,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,53,234721,7,0,0,40,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,28,338409,13,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
# Predictors are every column but the last; the last column is the target
X_census, y_census = df_census.iloc[:, :-1], df_census.iloc[:, -1]

In [31]:
# Cross-validated accuracy helper
def classification_model(model):
    """Return the mean cross-validated accuracy of ``model``.

    The estimator is scored on the notebook-level ``X_census`` and
    ``y_census`` using the shared ``kfold`` splitter.

    Parameters
    ----------
    model : estimator
        Unfitted classifier passed to ``cross_val_score``.

    Returns
    -------
    float
        Mean of the per-fold accuracy scores.
    """
    fold_accuracies = cross_val_score(
        estimator=model,
        X=X_census,
        y=y_census,
        scoring='accuracy',
        cv=kfold,
    )
    mean_accuracy = fold_accuracies.mean()
    return mean_accuracy

In [None]:
# Baseline with the booster argument omitted, so XGBoost's default booster is
# used (an empty string is not a valid booster name).
classification_model(XGBClassifier())

In [33]:
# XGBClassifier with the tree base learner (booster='gbtree')
classification_model(model=XGBClassifier(booster='gbtree'))

0.8706121776481058

In [35]:
# XGBClassifier with the DART booster (booster='dart')
classification_model(model=XGBClassifier(booster='dart'))

0.8706121776481058

In [36]:
# XGBClassifier with the linear base learner (booster='gblinear')
classification_model(model=XGBClassifier(booster='gblinear'))

0.8501275562652808

In [37]:
# scikit-learn LogisticRegression for comparison, with the iteration cap set to 1000
classification_model(model=LogisticRegression(max_iter=1000))

0.833021841405075

#### Modifying DART Hyperparameters

In [38]:
# DART classifier with one_drop=1
classification_model(model=XGBClassifier(booster='dart', one_drop=1))

0.8725471189543047

In [39]:
# DART on the diabetes data with sample_type='weighted'
regression_model(XGBRegressor(booster='dart', objective='reg:squarederror', sample_type='weighted'))

66.82050720095658

In [40]:
# DART on the diabetes data with normalize_type='forest'
regression_model(XGBRegressor(booster='dart', objective='reg:squarederror', normalize_type='forest'))

66.82050720095658

In [41]:
# DART on the diabetes data with one_drop=1
regression_model(XGBRegressor(booster='dart', objective='reg:squarederror', one_drop=1))

65.07987076021041

In [42]:
# Grid over rate_drop for a DART regressor with one_drop=1
grid_search(
    params={'rate_drop': [0.01, 0.1, 0.2, 0.4]},
    reg=XGBRegressor(booster='dart', objective='reg:squarederror', one_drop=1),
)

Best params:  {'rate_drop': 0.1}
Best score:  63.33053166103637


In [43]:
# Grid over skip_drop for a DART regressor
grid_search(
    params={'skip_drop': [0.01, 0.1, 0.2, 0.4]},
    reg=XGBRegressor(booster='dart', objective='reg:squarederror'),
)

Best params:  {'skip_drop': 0.01}
Best score:  66.86765429925147


In [44]:
# TODO: placeholder for one more DART hyperparameter search (parameter name and grid not filled in yet):
# grid_search(params={'': []})

## Finding XGBoost Random Forest

#### As base learner

In [45]:
# gbtree regressor with num_parallel_tree=25
regression_model(XGBRegressor(booster='gbtree', objective='reg:squarederror', num_parallel_tree=25))

66.73225316649118

In [46]:
# gbtree regressor with num_parallel_tree=5
regression_model(XGBRegressor(booster='gbtree', objective='reg:squarederror', num_parallel_tree=5))

66.80212003856249

#### Random Forest as XGBoost Models

In [47]:
# XGBoost's random-forest regressor with the squared-error objective
regression_model(XGBRFRegressor(objective='reg:squarederror'))

59.768436395297364

In [48]:
# scikit-learn RandomForestRegressor for comparison; random_state is fixed so
# the score is reproducible from run to run.
regression_model(RandomForestRegressor(random_state=2))

59.44526192030271

In [53]:
# XGBoost's random-forest classifier with default settings
classification_model(model=XGBRFClassifier())

0.8555328249190526

In [52]:
# scikit-learn RandomForestClassifier for comparison; random_state is fixed so
# the score is reproducible from run to run.
classification_model(RandomForestClassifier(random_state=2))

0.8563313082774162