# From Gradient Boosting to XGBoost

#### Loading Libraries

In [94]:
# Numerical Computing
import numpy as np
# Data Manipulation
import pandas as pd
# Data Visualization
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# DateTime Library
import datetime as dt

# Machine Learning Libraries
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

# Model Metrics
from sklearn.metrics import mean_squared_error as MSE, accuracy_score

#Extreme Gradient Boosting
from xgboost import XGBRegressor
from xgboost import XGBClassifier

# Warnings
import warnings

# Timing
import time

#### Loading Data - Processing Bike Rentals

In [2]:
warnings.filterwarnings('ignore')

In [3]:
df_bikes = pd.read_csv('bike_rentals_cleaned.csv')
df_bikes.head()

Unnamed: 0,instant,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
0,1,1.0,0.0,1.0,0.0,6.0,0.0,2,0.344167,0.363625,0.805833,0.160446,985
1,2,1.0,0.0,1.0,0.0,0.0,0.0,2,0.363478,0.353739,0.696087,0.248539,801
2,3,1.0,0.0,1.0,0.0,1.0,1.0,1,0.196364,0.189405,0.437273,0.248309,1349
3,4,1.0,0.0,1.0,0.0,2.0,1.0,1,0.2,0.212122,0.590435,0.160296,1562
4,5,1.0,0.0,1.0,0.0,3.0,1.0,1,0.226957,0.22927,0.436957,0.1869,1600


In [26]:
# Setting Variables (Predictors & Target)
X_bikes = df_bikes.iloc[:, :-1]
y_bikes = df_bikes.iloc[:,-1]

In [27]:
#Splitting Process
X_train, X_test, y_train, y_test = train_test_split(X_bikes, y_bikes, random_state=2)

In [28]:
# Building The Model
tree_1 = DecisionTreeRegressor(max_depth=2, random_state=2)

In [29]:
# Fitting model
tree_1.fit(X_train, y_train)

In [30]:
y_train_pred = tree_1.predict(X_train)

In [31]:
y2_train = y_train - y_train_pred

In [32]:
tree_2 = DecisionTreeRegressor(max_depth=2, random_state=2)

In [33]:
tree_2.fit(X_train, y_train)

In [34]:
y2_train_pred = tree_2.predict(X_train)

In [35]:
y3_train = y2_train - y2_train_pred

In [36]:
tree_3 = DecisionTreeRegressor(max_depth=2, random_state=2)

In [40]:
tree_3.fit(X_train, y_train)

In [41]:
y1_pred = tree_1.predict(X_test)
y2_pred =tree_2.predict(X_test)
y3_pred = tree_3.predict(X_test)

In [42]:
y_pred = y1_pred + y2_pred +y3_pred

In [43]:
# Computing MSE
MSE(y_test, y_pred)**0.5

9735.280503063783

### Gradient Boosting with Scikit-Learn

In [45]:
# Model Initialization
gbr = GradientBoostingRegressor(max_depth=2, n_estimators=3, random_state=2, learning_rate=1.0)

In [47]:
# Fitting Model
gbr.fit(X_train, y_train)

In [49]:
# Predicting
y_pred = gbr.predict(X_test)

In [51]:
MSE(y_test, y_pred)**0.5

911.0479538776439

#### Increasing estimators

In [52]:
# Model Re-Initialization
gbr = GradientBoostingRegressor(max_depth=2, n_estimators=30, random_state=2, learning_rate=1.0)

In [53]:
# Fitting Model
gbr.fit(X_train, y_train)

In [54]:
y_pred = gbr.predict(X_test)

In [55]:
MSE(y_test, y_pred)**0.5

857.1072323426944

In [56]:
# Checking on 300 estimators

# Model Re-Initialization
gbr = GradientBoostingRegressor(max_depth=2, n_estimators=300, random_state=2, learning_rate=1.0)

# Fitting Model
gbr.fit(X_train, y_train)

In [57]:
y_pred = gbr.predict(X_test)

In [58]:
MSE(y_test, y_pred)**0.5

936.3617413678853

In [59]:
# Removing Learning rate

# Model Re-Initialization
gbr = GradientBoostingRegressor(max_depth=2, n_estimators=300, random_state=2)

# Fitting Model
gbr.fit(X_train, y_train)

In [60]:
y_pred = gbr.predict(X_test)

In [61]:
MSE(y_test, y_pred)**0.5

653.7456840231495

### Modifying Gradient Boosting Hyperparameters

#### Learning Rate

In [62]:
# Learning Rate Range Set-Up
learning_rate_values = [0.001, 0.01, 0.05, 0.1, 0.15, 0.2, 0.3, 0.5, 1.0]

In [63]:
for value in learning_rate_values:
    gbr = GradientBoostingRegressor(max_depth=2, n_estimators=300, random_state=2, learning_rate=value)
    gbr.fit(X_train, y_train)
    y_pred = gbr.predict(X_test)
    rmse = MSE(y_test, y_pred)**0.5
    print('Learning rate:', value, ',Score: ', rmse)

Learning rate: 0.001 ,Score:  1633.0261400367258
Learning rate: 0.01 ,Score:  831.5430182728547
Learning rate: 0.05 ,Score:  685.0192988749717
Learning rate: 0.1 ,Score:  653.7456840231495
Learning rate: 0.15 ,Score:  687.666134269379
Learning rate: 0.2 ,Score:  664.312804425697
Learning rate: 0.3 ,Score:  689.4190385930236
Learning rate: 0.5 ,Score:  693.8856905068778
Learning rate: 1.0 ,Score:  936.3617413678853


In [64]:
# Checkingo on estimator's number
n_estimator_values = [30, 300, 3000]

In [66]:
for value in n_estimator_values:
    gbr = GradientBoostingRegressor(max_depth=2, n_estimators=300, random_state=2, learning_rate=0.1)
    gbr.fit(X_train, y_train)
    y_pred = gbr.predict(X_test)
    rmse = MSE(y_test, y_pred)**0.5
    print('n_estimators for:', value, ',Score: ', rmse)

n_estimators for: 30 ,Score:  653.7456840231495
n_estimators for: 300 ,Score:  653.7456840231495
n_estimators for: 3000 ,Score:  653.7456840231495


#### Base Learner

In [67]:
depths = [None, 1, 2, 3, 4]

In [69]:
for depth in depths:
    gbr = GradientBoostingRegressor(max_depth=depth, n_estimators=300, random_state=2)
    gbr.fit(X_train, y_train)
    y_pred = gbr.predict(X_test)
    rmse = MSE(y_test, y_pred)**0.5
    print('Max Depth:', depth, ',Score: ', rmse)

Max Depth: None ,Score:  869.2788645118395
Max Depth: 1 ,Score:  707.8261886858736
Max Depth: 2 ,Score:  653.7456840231495
Max Depth: 3 ,Score:  646.4045923317708
Max Depth: 4 ,Score:  663.048387855927


#### Subsample

In [70]:
samples = [1, 0.9, 0.8, 0.7, 0.6, 0.5]

In [71]:
for sample in samples:
    gbr = GradientBoostingRegressor(max_depth=3, n_estimators=300, subsample=sample, random_state=2)
    gbr.fit(X_train, y_train)
    y_pred = gbr.predict(X_test)
    rmse = MSE(y_test, y_pred)**0.5
    print('Sub-sample:', sample, ',Score: ', rmse)

Sub-sample: 1 ,Score:  646.4045923317708
Sub-sample: 0.9 ,Score:  620.1819001443569
Sub-sample: 0.8 ,Score:  617.2355650565677
Sub-sample: 0.7 ,Score:  612.9879156983139
Sub-sample: 0.6 ,Score:  622.6385116402317
Sub-sample: 0.5 ,Score:  626.9974073227554


### RandomizedSearchCV

In [72]:
params = {'subsample': [0.65, 0.7, 0.75],
          'n_estimators': [300, 500, 1000],
          'learning_rate': [0.05, 0.075, 0.1]}

In [73]:
# Initializing GB Model
gbr = GradientBoostingRegressor(max_depth=3, random_state=2)

In [74]:
# Initializing RandomSearch
rand_reg = RandomizedSearchCV(gbr, params, n_iter=10, scoring='neg_mean_squared_error', cv=5, n_jobs=-1, random_state=2)

In [79]:
rand_reg.fit(X_train, y_train)
best_model = rand_reg.best_estimator_
best_params = rand_reg.best_params_
print("Best_params:", best_params)
best_score = np.sqrt(-rand_reg.best_score_)
print("Training score: {:.3f}".format(best_score))
y_pred = best_model.predict(X_test)
rmse_test = MSE(y_test, y_pred)**0.5
print('Test set score: {:.3f}'.format(rmse_test))

Best_params: {'subsample': 0.65, 'n_estimators': 300, 'learning_rate': 0.05}
Training score: 636.200
Test set score: 625.985


In [80]:
gbr = GradientBoostingRegressor(max_depth=3, n_estimators=1600, subsample=0.75, learning_rate=0.02, random_state=2)
gbr.fit(X_train, y_train)

In [81]:
y_pred = gbr.predict(X_test)

In [82]:
MSE(y_test, y_pred)**0.5

596.9544588974487

### XGBoost

In [83]:
# Model Initialization
xg_reg = XGBRegressor(max_depth=3, n_estimators=1600, eta=0.02, subsample=0.75, random_state=2)

In [84]:
xg_reg.fit(X_train, y_train)

In [85]:
y_pred = xg_reg.predict(X_test)

In [86]:
MSE(y_test, y_pred)**0.5

588.1964972637095

## Exoplanets Dataset

#### Loading Data

In [87]:
df = pd.read_csv('exoplanets.csv')
df.head()

Unnamed: 0,LABEL,FLUX.1,FLUX.2,FLUX.3,FLUX.4,FLUX.5,FLUX.6,FLUX.7,FLUX.8,FLUX.9,...,FLUX.3188,FLUX.3189,FLUX.3190,FLUX.3191,FLUX.3192,FLUX.3193,FLUX.3194,FLUX.3195,FLUX.3196,FLUX.3197
0,2,93.85,83.81,20.1,-26.98,-39.56,-124.71,-135.18,-96.27,-79.89,...,-78.07,-102.15,-102.15,25.13,48.57,92.54,39.32,61.42,5.08,-39.54
1,2,-38.88,-33.83,-58.54,-40.09,-79.31,-72.81,-86.55,-85.33,-83.97,...,-3.28,-32.21,-32.21,-24.89,-4.86,0.76,-11.7,6.46,16.0,19.93
2,2,532.64,535.92,513.73,496.92,456.45,466.0,464.5,486.39,436.56,...,-71.69,13.31,13.31,-29.89,-20.88,5.06,-11.8,-28.91,-70.02,-96.67
3,2,326.52,347.39,302.35,298.13,317.74,312.7,322.33,311.31,312.42,...,5.71,-3.73,-3.73,30.05,20.03,-12.67,-8.77,-17.31,-17.35,13.98
4,2,-1107.21,-1112.59,-1118.95,-1095.1,-1057.55,-1034.48,-998.34,-1022.71,-989.57,...,-594.37,-401.66,-401.66,-357.24,-443.76,-438.54,-399.71,-384.65,-411.79,-510.54


In [89]:
# Checking Info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5087 entries, 0 to 5086
Columns: 3198 entries, LABEL to FLUX.3197
dtypes: float64(3197), int64(1)
memory usage: 124.1 MB


In [90]:
# Checking on Null-Values
df.isnull().sum().sum()

0

In [91]:
# Setting Variables (Predictors & Target)
X = df.iloc[:, 1:]
y = df.iloc[:, 0]

In [92]:
# Splitting Process
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

#### Timing Model

In [95]:
start = time.time()
df.info()
end = time.time()
elapsed = end - start
print('\nRun Time: ' + str(elapsed) + 'seconds.')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5087 entries, 0 to 5086
Columns: 3198 entries, LABEL to FLUX.3197
dtypes: float64(3197), int64(1)
memory usage: 124.1 MB

Run Time: 0.026938676834106445seconds.


#### Comparing Speed

In [97]:
# Running on Gradient Boosting Classifier
start = time.time()
gbr = GradientBoostingClassifier(n_estimators=100, max_depth=2, random_state=2)
gbr.fit(X_train, y_train)
y_pred = gbr.predict(X_test)
score = accuracy_score(y_pred, y_test)
print('Score: ' +str(score))
end = time.time()
elapsed = end -start
print('\nRun Time: ' +str(elapsed) + 'seconds')

Score: 0.9874213836477987

Run Time: 137.65067505836487seconds


In [101]:
y_train = y_train.map({1: 0, 2: 1})
y_test = y_test.map({1: 0, 2: 1})

In [102]:
# Now! on XGBClassifier
start = time.time()
xg_reg = XGBClassifier(n_estimators=100, max_depth=2, random_state=2)
xg_reg.fit(X_train, y_train)
y_pred = xg_reg.predict(X_test)
score = accuracy_score(y_pred, y_test)
print('Score: ' +str(score))
end = time.time()
elapsed = end -start
print('\nRun Time: ' +str(elapsed) + 'seconds')

Score: 0.9913522012578616

Run Time: 2.215780258178711seconds
