# XGBoost Alternative Base Learners

#### Loading Libraries

In [5]:
# Numerical Computing
import numpy as np
# Data Manipulation
import pandas as pd
# Data Visualization
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# DateTime Library
import datetime as dt

# Machine Learning Libraries
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, cross_val_score, StratifiedKFold, KFold
from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingClassifier

# Datasets
from sklearn import datasets
from sklearn.datasets import load_diabetes

# Model Metrics
from sklearn.metrics import mean_squared_error as MSE, accuracy_score, confusion_matrix, classification_report, recall_score

#Extreme Gradient Boosting
from xgboost import XGBRegressor, XGBRFRegressor
from xgboost import XGBClassifier, XGBRFClassifier

# Warnings
import warnings

# Timing
import time

In [6]:
# Silence every warning category for the rest of the session to keep cell output compact.
# NOTE(review): a blanket 'ignore' also hides library warnings about deprecated or
# unrecognised arguments -- consider a narrower filter while developing.
warnings.filterwarnings('ignore')

### Applying gblinear

#### Loading Diabetes Data

In [7]:
# Load the diabetes regression data as a (predictors, target) pair
X, y = load_diabetes(return_X_y=True)

In [8]:
# Shared 5-fold splitter: shuffling with a fixed seed gives every model the same folds
kfold = KFold(n_splits=5, shuffle=True, random_state=2)

In [9]:
# A function for Cross-Validation score:
def regression_model(model):
    """Return the mean cross-validated RMSE of `model`.

    The model is scored on the module-level `X` and `y` using the shared
    `kfold` splitter, so the result reflects whatever data those names are
    bound to when the function is called.

    Parameters
    ----------
    model : estimator
        Any estimator accepted by `cross_val_score`.

    Returns
    -------
    float
        Average of the per-fold root-mean-squared errors.
    """
    # The scorer reports *negated* MSE, one value per fold
    neg_mse_per_fold = cross_val_score(
        model,
        X,
        y,
        cv=kfold,
        scoring='neg_mean_squared_error',
    )
    # Flip the sign back and take the square root to get RMSE per fold
    rmse_per_fold = (-neg_mse_per_fold) ** 0.5
    return rmse_per_fold.mean()

In [15]:
# Cross-validated RMSE of XGBoost with the linear base learner (booster='gblinear')
regression_model(XGBRegressor(booster='gblinear'))

55.50365945475089

In [16]:
# Baseline: ordinary least squares via LinearRegression (no regularization term)
regression_model(LinearRegression())

55.50936875436023

In [17]:
# Lasso regression (L1 / absolute-value penalty) with default settings
regression_model(Lasso())

62.64904114426349

In [18]:
# Ridge regression (L2 / squared penalty) with default settings
regression_model(Ridge())

58.835292374356676

In [21]:
# For comparison: XGBoost with the tree base learner (booster='gbtree')
regression_model(XGBRegressor(booster='gbtree'))

66.8205077933539

#### gblinear grid search

In [22]:
# Setting a Grid-Search function
def grid_search(params, reg=XGBRegressor(booster='gblinear')):
    """Grid-search `params` for `reg` and print the best setting and its RMSE.

    The search is fitted on the module-level `X` and `y` with the shared
    `kfold` splitter and is scored by negated mean squared error.

    Parameters
    ----------
    params : dict
        Parameter grid handed to `GridSearchCV`.
    reg : estimator, optional
        Estimator to tune; defaults to an `XGBRegressor` using the
        'gblinear' booster.

    Returns
    -------
    None
        The best parameters and best score are printed, not returned.
    """
    searcher = GridSearchCV(
        estimator=reg,
        param_grid=params,
        scoring='neg_mean_squared_error',
        cv=kfold,
    )
    searcher.fit(X, y)

    # Winning parameter combination
    print("Best params: ", searcher.best_params_)
    # best_score_ is a negated MSE; negate and take the root to report RMSE
    print('Best score: ', np.sqrt(-searcher.best_score_))

In [23]:
# Tune reg_alpha (L1 regularization strength) over a coarse grid
grid_search(params={'reg_alpha': [0.001, 0.01, 0.1, 0.5, 1, 5]})

Best params:  {'reg_alpha': 0.001}
Best score:  55.48890574749087


In [24]:
# Tune reg_lambda (L2 regularization strength) over the same coarse grid
grid_search(params={'reg_lambda': [0.001, 0.01, 0.1, 0.5, 1, 5]})

Best params:  {'reg_lambda': 0.001}
Best score:  56.17170025588034


In [25]:
# Score feature_selector='shuffle' on its own (a single-value grid)
grid_search(params={'feature_selector': ['shuffle']})

Best params:  {'feature_selector': 'shuffle'}
Best score:  55.49607728794356


In [27]:
# Compare the 'random', 'greedy' and 'thrifty' feature selectors, all with updater='coord_descent'
grid_search(params={'feature_selector': ['random', 'greedy', 'thrifty'], 'updater': ['coord_descent']})

Best params:  {'feature_selector': 'thrifty', 'updater': 'coord_descent'}
Best score:  55.48980199825518


In [28]:
# Additionally tune top_k for the 'greedy' and 'thrifty' selectors
grid_search(params={'feature_selector': ['greedy', 'thrifty'], 'updater': ['coord_descent'], 'top_k': [3, 5, 7, 9]})

Best params:  {'feature_selector': 'thrifty', 'top_k': 3, 'updater': 'coord_descent'}
Best score:  55.47154721588882


#### Linear Datasets

In [30]:
# Synthetic predictor: the integers 1 through 99.
# Note: this rebinds the global X that regression_model() reads, so scores
# computed after this point refer to the synthetic data.
X = np.arange(1, 100)

In [31]:
# Seed NumPy's global random generator so the random draws that follow are reproducible
np.random.seed(2)

In [32]:
# Start the synthetic target as an empty list; the next cell populates y
y = []

In [33]:
# Noisy target: each value of X is scaled by its own uniform draw from [-0.2, 0.2).
# y is built in a single expression instead of appending to a list created in
# another cell, so re-running this cell always leaves one target per element of X
# rather than growing y.
y = [i * np.random.uniform(-0.2, 0.2) for i in X]

In [35]:
# Convert the target list into a NumPy array for use with the ML estimators
y = np.array(y)

In [36]:
# Reshape both arrays into single-column 2-D form: (number of samples, 1)
X = X.reshape(X.shape[0], 1)
y = y.reshape(y.shape[0], 1)

In [37]:
# Cross-validated RMSE of the linear base learner (booster='gblinear') on the synthetic data,
# with squared-error loss set explicitly through the `objective` keyword
regression_model(XGBRegressor(booster='gblinear', objective='reg:squarederror'))

6.214942988206784

In [38]:
# Cross-validated RMSE of the tree base learner (booster='gbtree') on the same synthetic data,
# with squared-error loss set explicitly through the `objective` keyword
regression_model(XGBRegressor(booster='gbtree', objective='reg:squarederror'))

9.320187242566508

In [40]:
# Reference point: plain LinearRegression scored the same way on the synthetic data
regression_model(LinearRegression())

6.214962315808842

## Comparing DART

#### DART with XGBRegressor