# XGBoost - Higgs Boson (Classification)
[ch5-advanced-xgboost-unveiled.ipynb](https://github.com/kyopark2014/ML-Algorithms/blob/main/xgboost/src/ch5-advanced-xgboost-unveiled.ipynb)

In [1]:
import xgboost as xgb
# Set XGBoost's global verbosity to 0 so its log messages stay out of the cell outputs.
xgb.set_config(verbosity=0)

In [2]:
import pandas as pd
import numpy as np

# Read the first 250,000 rows of the gzip-compressed ATLAS Higgs challenge CSV.
# To use a local copy instead, point DATA_URL at
# 'atlas-higgs-challenge-2014-v2.csv.gz'.
DATA_URL = 'https://raw.githubusercontent.com/rickiepark/handson-gb/main/Chapter05/atlas-higgs-challenge-2014-v2.csv.gz'
df = pd.read_csv(DATA_URL, compression='gzip', nrows=250000)

# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,EventId,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,...,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt,Weight,Label,KaggleSet,KaggleWeight
0,100000,138.47,51.655,97.827,27.98,0.91,124.711,2.666,3.064,41.928,...,2.15,0.444,46.062,1.24,-2.475,113.497,0.000814,s,t,0.002653
1,100001,160.937,68.768,103.235,48.146,-999.0,-999.0,-999.0,3.473,2.078,...,0.725,1.158,-999.0,-999.0,-999.0,46.226,0.681042,b,t,2.233584
2,100002,-999.0,162.172,125.953,35.635,-999.0,-999.0,-999.0,3.148,9.336,...,2.053,-2.028,-999.0,-999.0,-999.0,44.251,0.715742,b,t,2.347389
3,100003,143.905,81.417,80.943,0.414,-999.0,-999.0,-999.0,3.31,0.414,...,-999.0,-999.0,-999.0,-999.0,-999.0,-0.0,1.660654,b,t,5.446378
4,100004,175.864,16.915,134.805,16.405,-999.0,-999.0,-999.0,3.891,16.405,...,-999.0,-999.0,-999.0,-999.0,-999.0,0.0,1.904263,b,t,6.245333


In [3]:
# Keep the Kaggle weight as the only weight column (renamed to 'Weight'),
# drop the original 'Weight' and 'KaggleSet' columns, and move 'Label' to the
# last position by dropping it and re-attaching it at the end.
df = (
    df.drop(columns=['Weight', 'KaggleSet', 'Label'])
      .rename(columns={'KaggleWeight': 'Weight'})
      .assign(Label=df['Label'])
)

In [4]:
# Re-display the frame to confirm the reshuffle: the Kaggle weight is now
# named 'Weight' and 'Label' is the last column.
df.head()

Unnamed: 0,EventId,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,...,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt,Weight,Label
0,100000,138.47,51.655,97.827,27.98,0.91,124.711,2.666,3.064,41.928,...,2,67.435,2.15,0.444,46.062,1.24,-2.475,113.497,0.002653,s
1,100001,160.937,68.768,103.235,48.146,-999.0,-999.0,-999.0,3.473,2.078,...,1,46.226,0.725,1.158,-999.0,-999.0,-999.0,46.226,2.233584,b
2,100002,-999.0,162.172,125.953,35.635,-999.0,-999.0,-999.0,3.148,9.336,...,1,44.251,2.053,-2.028,-999.0,-999.0,-999.0,44.251,2.347389,b
3,100003,143.905,81.417,80.943,0.414,-999.0,-999.0,-999.0,3.31,0.414,...,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-0.0,5.446378,b
4,100004,175.864,16.915,134.805,16.405,-999.0,-999.0,-999.0,3.891,16.405,...,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0,6.245333,b


In [5]:
# Inspect column dtypes and non-null counts. The -999.0 placeholder values
# seen in the preview are ordinary floats, so they count as non-null here.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 33 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   EventId                      250000 non-null  int64  
 1   DER_mass_MMC                 250000 non-null  float64
 2   DER_mass_transverse_met_lep  250000 non-null  float64
 3   DER_mass_vis                 250000 non-null  float64
 4   DER_pt_h                     250000 non-null  float64
 5   DER_deltaeta_jet_jet         250000 non-null  float64
 6   DER_mass_jet_jet             250000 non-null  float64
 7   DER_prodeta_jet_jet          250000 non-null  float64
 8   DER_deltar_tau_lep           250000 non-null  float64
 9   DER_pt_tot                   250000 non-null  float64
 10  DER_sum_pt                   250000 non-null  float64
 11  DER_pt_ratio_lep_tau         250000 non-null  float64
 12  DER_met_phi_centrality       250000 non-null  float64
 13 

In [6]:
# Encode the target as integers: 's' -> 1 and 'b' -> 0 (signal / background
# in the challenge's labelling).
#
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the selection df['Label']: that chained
# in-place form is deprecated in pandas 2.x and has no effect on `df` once
# Copy-on-Write is active, which would leave the labels as strings.
#
# The identity entries (1 -> 1, 0 -> 0) keep the cell safe to re-run on an
# already-encoded column; any unexpected label maps to NaN and makes the
# astype() call fail loudly instead of slipping through.
label_codes = {'s': 1, 'b': 0, 1: 1, 0: 0}
df['Label'] = df['Label'].map(label_codes).astype('int64')

In [7]:
# Feature matrix: columns 1-30 by position, i.e. everything between the
# leading 'EventId' column and the trailing 'Weight' / 'Label' columns.
X = df.iloc[:, 1:31]

# Select the target by name rather than by position (-1): a later cell
# appends a 'test_Weight' column to df, so re-running this cell with
# positional indexing would silently use the weights as the target.
y = df['Label']

In [8]:
# Display the target to confirm it is integer-coded (1 / 0).
y

0         1
1         0
2         0
3         0
4         0
         ..
249995    0
249996    0
249997    1
249998    0
249999    0
Name: Label, Length: 250000, dtype: int64

### Split Train/Test dataset

In [9]:
from sklearn.model_selection import train_test_split

# Split features and target into train/test parts with a fixed seed so the
# split is reproducible. No test_size is given, so scikit-learn's default
# split ratio applies.
# NOTE(review): the cells visible below fit and score on the full X, y rather
# than on these splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

### XGBoost

In [10]:
# Rescale every event weight by 550000 / (number of rows in y).
# NOTE(review): 550000 is presumably the size of the challenge's test set, as
# in the official XGBoost Higgs demo -- confirm and consider naming it.
df['test_Weight'] = df['Weight'].mul(550000).div(len(y))

In [11]:
# Total rescaled weight per class: `s` sums the rows with Label == 1,
# `b` sums the rows with Label == 0.
s = df.loc[df['Label'] == 1, 'test_Weight'].sum()
b = df.loc[df['Label'] == 0, 'test_Weight'].sum()

In [12]:
# Ratio of total Label == 0 weight to total Label == 1 weight; the classifier
# below receives this value as scale_pos_weight.
b/s

593.9401931492318

In [13]:
from xgboost import XGBClassifier

# Weighted binary classifier.
# - missing=-999.0 tells XGBoost to treat the -999.0 placeholder values in the
#   features as missing.
# - scale_pos_weight=b/s up-weights the positive class by the ratio of total
#   Label == 0 weight to total Label == 1 weight.
# - eval_metric is set on the estimator instead of being passed to fit():
#   the fit() keyword has been deprecated since XGBoost 1.6 and was removed in
#   later releases, where the original call raises a TypeError. Passing it to
#   the constructor works on both older and current versions.
clf = XGBClassifier(n_estimators=120, learning_rate=0.1, missing=-999.0,
                    scale_pos_weight=b/s,
                    eval_metric=['auc', 'ams@0.15'])

# The evaluation set is the training data itself, so the logged
# "validation_0" metrics are training metrics, not held-out performance.
clf.fit(X, y, sample_weight=df['test_Weight'],
        eval_set=[(X, y)],
        sample_weight_eval_set=[df['test_Weight']])

# Optional: persist the trained model.
#clf.save_model('higgs-sklearn.model')



[0]	validation_0-auc:0.91088	validation_0-ams@0.15:3.70349
[1]	validation_0-auc:0.91528	validation_0-ams@0.15:3.97016
[2]	validation_0-auc:0.91771	validation_0-ams@0.15:4.06973
[3]	validation_0-auc:0.91931	validation_0-ams@0.15:4.21078
[4]	validation_0-auc:0.92011	validation_0-ams@0.15:4.13576
[5]	validation_0-auc:0.92099	validation_0-ams@0.15:4.17622
[6]	validation_0-auc:0.92191	validation_0-ams@0.15:4.26380
[7]	validation_0-auc:0.92231	validation_0-ams@0.15:4.26293
[8]	validation_0-auc:0.92330	validation_0-ams@0.15:4.32536
[9]	validation_0-auc:0.92415	validation_0-ams@0.15:4.38135
[10]	validation_0-auc:0.92470	validation_0-ams@0.15:4.39102
[11]	validation_0-auc:0.92528	validation_0-ams@0.15:4.40832
[12]	validation_0-auc:0.92589	validation_0-ams@0.15:4.44617
[13]	validation_0-auc:0.92630	validation_0-ams@0.15:4.44726
[14]	validation_0-auc:0.92692	validation_0-ams@0.15:4.48756
[15]	validation_0-auc:0.92739	validation_0-ams@0.15:4.51991
[16]	validation_0-auc:0.92804	validation_0-ams@0.1

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.1, max_delta_step=0,
              max_depth=6, min_child_weight=1, missing=-999.0,
              monotone_constraints='()', n_estimators=120, n_jobs=16,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=593.9401931492318,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [14]:
# Per-iteration history of each evaluation metric recorded during fit,
# keyed by evaluation set ('validation_0' is the single eval_set entry).
clf.evals_result()

{'validation_0': OrderedDict([('auc',
               [0.910879,
                0.915277,
                0.917712,
                0.919314,
                0.920109,
                0.920992,
                0.921914,
                0.922306,
                0.923298,
                0.924153,
                0.924703,
                0.925281,
                0.925889,
                0.926301,
                0.926918,
                0.927393,
                0.928042,
                0.928428,
                0.928996,
                0.929478,
                0.929999,
                0.930363,
                0.93069,
                0.931093,
                0.931384,
                0.931678,
                0.932015,
                0.932341,
                0.932681,
                0.933036,
                0.933275,
                0.93359,
                0.933943,
                0.934161,
                0.934456,
                0.934762,
                0.935109,
  

In [15]:
# Score the classifier on the same X, y it was fitted on, with no
# sample_weight passed -- so this is an unweighted training-set score, not an
# estimate of held-out performance.
clf.score(X, y)

0.800476