# XGBoost Unveiled

#### Loading Libraries

In [81]:
# Numerical Computing
import numpy as np
# Data Manipulation
import pandas as pd
# Data Visualization
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# DateTime Library
import datetime as dt

# Machine Learning Libraries
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import datasets

# Model Metrics
from sklearn.metrics import mean_squared_error as MSE, accuracy_score

#Extreme Gradient Boosting
from xgboost import XGBRegressor
from xgboost import XGBClassifier

# Warnings
import warnings

# Timing
import time

#### Loading Iris Dataset

In [82]:
warnings.filterwarnings('ignore')

In [83]:
# Retrieving from the Iris dataset Library
iris = datasets.load_iris()

In [84]:
# Setting on Pandas frame
df = pd.DataFrame(data= np.c_[iris['data'],
                  iris['target']], columns=iris['feature_names'] + ['target'])

In [85]:
# Checking on
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0


In [86]:
# Splitting Data Procedure
X_train, X_test, y_train, y_test = train_test_split(iris['data'], iris['target'], random_state=2)

#### XGBoost Classification Template

In [87]:
# Model Initialization
xgb = XGBClassifier(booster='gbtree',
                    objetive='multi:softprob',
                    max_depth=6,
                    learning_rate=0.1,
                    n_estimators=100,
                    random_state=2,
                    n_jobs=-1)

In [88]:
# Fitting the Classifier
xgb.fit(X_train, y_train)

In [89]:
y_pred = xgb.predict(X_test)

In [90]:
score = accuracy_score(y_pred, y_test)

In [91]:
print('Score: ' + str(score))

Score: 0.9736842105263158


### Loading Diabites Data

In [92]:
# Setting Data
X, y = datasets.load_diabetes(return_X_y=True)

#### XGBoost Regresor Template (Cross-Validation)

In [93]:
# Model Initialization
xgb = XGBRegressor(booster='gbtree',
                   objetive='reg:squarederror',
                   max_depth=6,
                   learning_rate=0.1,
                   n_estimators=100,
                   random_state=2, n_jobs=-1)

In [94]:
scores = cross_val_score(xgb, X, y, scoring='neg_mean_squared_error', cv=5)

In [95]:
rmse = np.sqrt(-scores)

In [96]:
print('RMSE: ', np.round(rmse, 3))
print('RMSE mean: %0.3f' % (rmse.mean()))

RMSE:  [59.397 60.322 69.036 63.211 66.953]
RMSE mean: 63.784


In [97]:
pd.DataFrame(y).describe()

Unnamed: 0,0
count,442.0
mean,152.133484
std,77.093005
min,25.0
25%,87.0
50%,140.5
75%,211.5
max,346.0


## Finding Higgs Boson

#### Loading Higgs Boson Data

In [98]:
df = pd.read_csv('atlas-higgs-challenge-2014-v2.csv.gz', nrows=250000, compression='gzip')
df.head()

Unnamed: 0,EventId,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,...,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt,Weight,Label,KaggleSet,KaggleWeight
0,100000,138.47,51.655,97.827,27.98,0.91,124.711,2.666,3.064,41.928,...,2.15,0.444,46.062,1.24,-2.475,113.497,0.000814,s,t,0.002653
1,100001,160.937,68.768,103.235,48.146,-999.0,-999.0,-999.0,3.473,2.078,...,0.725,1.158,-999.0,-999.0,-999.0,46.226,0.681042,b,t,2.233584
2,100002,-999.0,162.172,125.953,35.635,-999.0,-999.0,-999.0,3.148,9.336,...,2.053,-2.028,-999.0,-999.0,-999.0,44.251,0.715742,b,t,2.347389
3,100003,143.905,81.417,80.943,0.414,-999.0,-999.0,-999.0,3.31,0.414,...,-999.0,-999.0,-999.0,-999.0,-999.0,-0.0,1.660654,b,t,5.446378
4,100004,175.864,16.915,134.805,16.405,-999.0,-999.0,-999.0,3.891,16.405,...,-999.0,-999.0,-999.0,-999.0,-999.0,0.0,1.904263,b,t,6.245333


In [99]:
# Dropping Columns
del df['Weight']
del df['KaggleSet']

In [100]:
df = df.rename(columns={'KaggleWeight': 'Weight'})

In [101]:
label_col = df['Label']

In [102]:
del df['Label']

In [103]:
df['Label'] = label_col

In [104]:
df.head()

Unnamed: 0,EventId,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,...,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt,Weight,Label
0,100000,138.47,51.655,97.827,27.98,0.91,124.711,2.666,3.064,41.928,...,2,67.435,2.15,0.444,46.062,1.24,-2.475,113.497,0.002653,s
1,100001,160.937,68.768,103.235,48.146,-999.0,-999.0,-999.0,3.473,2.078,...,1,46.226,0.725,1.158,-999.0,-999.0,-999.0,46.226,2.233584,b
2,100002,-999.0,162.172,125.953,35.635,-999.0,-999.0,-999.0,3.148,9.336,...,1,44.251,2.053,-2.028,-999.0,-999.0,-999.0,44.251,2.347389,b
3,100003,143.905,81.417,80.943,0.414,-999.0,-999.0,-999.0,3.31,0.414,...,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-0.0,5.446378,b
4,100004,175.864,16.915,134.805,16.405,-999.0,-999.0,-999.0,3.891,16.405,...,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0,6.245333,b


In [105]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 33 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   EventId                      250000 non-null  int64  
 1   DER_mass_MMC                 250000 non-null  float64
 2   DER_mass_transverse_met_lep  250000 non-null  float64
 3   DER_mass_vis                 250000 non-null  float64
 4   DER_pt_h                     250000 non-null  float64
 5   DER_deltaeta_jet_jet         250000 non-null  float64
 6   DER_mass_jet_jet             250000 non-null  float64
 7   DER_prodeta_jet_jet          250000 non-null  float64
 8   DER_deltar_tau_lep           250000 non-null  float64
 9   DER_pt_tot                   250000 non-null  float64
 10  DER_sum_pt                   250000 non-null  float64
 11  DER_pt_ratio_lep_tau         250000 non-null  float64
 12  DER_met_phi_centrality       250000 non-null  float64
 13 

In [106]:
# Switching 'Label' column data with replace method
df['Label'].replace(('s', 'b'), (1, 0), inplace=True)

In [107]:
# Splitting Procedure
X = df.iloc[:, 1:31]
y = df.iloc[:, -1]

#### Weights

In [108]:
df['test_Weight'] = df['Weight'] * 550000 / len(y)

In [109]:
# Scaling Factor Procedure
s = np.sum(df[df['Label']==1]['test_Weight'])
b = np.sum(df[df['Label']==0]['test_Weight'])

In [110]:
b/s

593.9401931492318

#### Modeling

In [111]:
import xgboost as xgb

In [121]:
# Initializing Model Classifier
xgb_clf = xgb.DMatrix(X, y, missing=-999.0, weight=df['test_Weight'])

In [122]:
# Additional Params
param = {}

In [123]:
param['objetive'] = 'binary:logitraw'
param['scale_pos_weight'] = b/s
param['eta'] = 0.1
param['max_depth'] = 6
param['eval_metric'] = 'auc'

In [124]:
plst = list(param.items()) + [('eval_metric', 'ams@0.15')]

In [125]:
watchlist = [ (xgb_clf, 'train') ]

In [126]:
num_round = 120

In [131]:
print('Loading data end, start to boost trees')
bst = xgb.train(plst, xgb_clf, num_round, watchlist)
bst.save_model('higgs.model')
print('finish training')

Loading data end, start to boost trees
[0]	train-auc:0.91089	train-ams@0.15:3.83927
[1]	train-auc:0.91510	train-ams@0.15:3.85330
[2]	train-auc:0.91739	train-ams@0.15:3.96310
[3]	train-auc:0.91914	train-ams@0.15:4.11078
[4]	train-auc:0.92023	train-ams@0.15:4.27650
[5]	train-auc:0.92145	train-ams@0.15:4.28261
[6]	train-auc:0.92253	train-ams@0.15:4.34832
[7]	train-auc:0.92313	train-ams@0.15:4.33971
[8]	train-auc:0.92382	train-ams@0.15:4.36745
[9]	train-auc:0.92450	train-ams@0.15:4.41225
[10]	train-auc:0.92502	train-ams@0.15:4.43893
[11]	train-auc:0.92557	train-ams@0.15:4.50385
[12]	train-auc:0.92614	train-ams@0.15:4.49381
[13]	train-auc:0.92660	train-ams@0.15:4.52376
[14]	train-auc:0.92732	train-ams@0.15:4.53521
[15]	train-auc:0.92782	train-ams@0.15:4.56550
[16]	train-auc:0.92833	train-ams@0.15:4.59922
[17]	train-auc:0.92865	train-ams@0.15:4.61073
[18]	train-auc:0.92916	train-ams@0.15:4.64614
[19]	train-auc:0.92948	train-ams@0.15:4.65615
[20]	train-auc:0.92986	train-ams@0.15:4.67435
[21]	