## 5.1 XGBoost 구조

### 5.1.1 역사

### 5.1.2 주요 기능

## 5.2 XGBoost 파라미터 최적화 

### 5.2.1 학습 목적

### 5.3.1 붓꽃 데이터셋 

In [2]:
import pandas as pd
import numpy as np
from sklearn import datasets
iris = datasets.load_iris()

In [3]:
df = pd.DataFrame(data=np.c_[iris['data'], iris['target']], columns=iris['feature_names'] + ['target'])

In [4]:
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0


In [5]:
from sklearn.model_selection import train_test_split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(iris['data'], iris['target'], random_state=2)

In [8]:
from xgboost import XGBClassifier

In [9]:
from sklearn.metrics import accuracy_score

In [10]:
xgb = XGBClassifier(booster='gbtree', objective='multi:softprob', max_depth=6, learning_rate=0.1, n_estimators=100, n_jobs=-1)

In [11]:
xgb.fit(X_train, y_train)

In [12]:
y_pred = xgb.predict(X_test)

In [13]:
score = accuracy_score(y_pred, y_test)

In [15]:
print('점수: ', str(score))

점수:  0.9736842105263158


In [16]:
xgb.score(X_test, y_test)

0.9736842105263158

In [21]:
import xgboost as xgb

In [22]:
dtrain = xgb.DMatrix(X_train, y_train)
dtest = xgb.DMatrix(X_test[:5])

param = {'objective': 'multi:softprob', 'num_class': 3}
bstr = xgb.train(param, dtrain, 10)
bstr.predict(dtest)

array([[0.9486482 , 0.02711029, 0.02424142],
       [0.9486482 , 0.02711029, 0.02424142],
       [0.02841366, 0.05416913, 0.9174172 ],
       [0.9486482 , 0.02711029, 0.02424142],
       [0.9486482 , 0.02711029, 0.02424142]], dtype=float32)

In [23]:
param = {'objective': 'multi:softmax', 'num_class': 3}
bstr = xgb.train(param, dtrain, 10)
bstr.predict(dtest)

array([0., 0., 2., 0., 0.], dtype=float32)

In [26]:
### 5.3.2 당뇨병 데이터셋

In [27]:
X, y = datasets.load_diabetes(return_X_y=True)

In [28]:
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor

In [34]:
xgb = XGBRegressor(booster='gbtree', objective='reg:squarederror', max_depth=6, learning_rate=0.1, n_estimators=100, n_jobs=-1)

In [36]:
scores = cross_val_score(xgb, X, y, scoring='neg_mean_squared_error', cv=5)

In [37]:
rmse = np.sqrt(-scores)

In [38]:
print('RMSE:', np.round(rmse, 3))

RMSE: [59.397 60.322 69.036 63.211 66.953]


In [39]:
print('RMSE 평균: %0.3f' % (rmse.mean()))

RMSE 평균: 63.784


In [40]:
pd.DataFrame(y).describe()

Unnamed: 0,0
count,442.0
mean,152.133484
std,77.093005
min,25.0
25%,87.0
50%,140.5
75%,211.5
max,346.0


## 5.4 힉스 보손 찾기 - 사례 연구

### 5.4.1 물리학적 배경

### 5.4.2 캐글 대회

### 5.4.3 XGBoost와 힉스 보손 대회 

### 5.4.4 데이터

In [54]:
df = pd.read_csv('data/atlas-higgs-challenge-2014-v2.csv.gz', nrows=250000, compression='gzip')
df.head()

Unnamed: 0,EventId,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,...,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt,Weight,Label,KaggleSet,KaggleWeight
0,100000,138.47,51.655,97.827,27.98,0.91,124.711,2.666,3.064,41.928,...,2.15,0.444,46.062,1.24,-2.475,113.497,0.000814,s,t,0.002653
1,100001,160.937,68.768,103.235,48.146,-999.0,-999.0,-999.0,3.473,2.078,...,0.725,1.158,-999.0,-999.0,-999.0,46.226,0.681042,b,t,2.233584
2,100002,-999.0,162.172,125.953,35.635,-999.0,-999.0,-999.0,3.148,9.336,...,2.053,-2.028,-999.0,-999.0,-999.0,44.251,0.715742,b,t,2.347389
3,100003,143.905,81.417,80.943,0.414,-999.0,-999.0,-999.0,3.31,0.414,...,-999.0,-999.0,-999.0,-999.0,-999.0,-0.0,1.660654,b,t,5.446378
4,100004,175.864,16.915,134.805,16.405,-999.0,-999.0,-999.0,3.891,16.405,...,-999.0,-999.0,-999.0,-999.0,-999.0,0.0,1.904263,b,t,6.245333


In [55]:
del df['Weight']
del df['KaggleSet']
df = df.rename(columns={'KaggleWeight': 'Weight'})

In [56]:
label_col = df['Label']
del df['Label']
df['Label'] = label_col

In [57]:
df.head()

Unnamed: 0,EventId,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,...,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt,Weight,Label
0,100000,138.47,51.655,97.827,27.98,0.91,124.711,2.666,3.064,41.928,...,2,67.435,2.15,0.444,46.062,1.24,-2.475,113.497,0.002653,s
1,100001,160.937,68.768,103.235,48.146,-999.0,-999.0,-999.0,3.473,2.078,...,1,46.226,0.725,1.158,-999.0,-999.0,-999.0,46.226,2.233584,b
2,100002,-999.0,162.172,125.953,35.635,-999.0,-999.0,-999.0,3.148,9.336,...,1,44.251,2.053,-2.028,-999.0,-999.0,-999.0,44.251,2.347389,b
3,100003,143.905,81.417,80.943,0.414,-999.0,-999.0,-999.0,3.31,0.414,...,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-0.0,5.446378,b
4,100004,175.864,16.915,134.805,16.405,-999.0,-999.0,-999.0,3.891,16.405,...,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0,6.245333,b


In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 33 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   EventId                      250000 non-null  int64  
 1   DER_mass_MMC                 250000 non-null  float64
 2   DER_mass_transverse_met_lep  250000 non-null  float64
 3   DER_mass_vis                 250000 non-null  float64
 4   DER_pt_h                     250000 non-null  float64
 5   DER_deltaeta_jet_jet         250000 non-null  float64
 6   DER_mass_jet_jet             250000 non-null  float64
 7   DER_prodeta_jet_jet          250000 non-null  float64
 8   DER_deltar_tau_lep           250000 non-null  float64
 9   DER_pt_tot                   250000 non-null  float64
 10  DER_sum_pt                   250000 non-null  float64
 11  DER_pt_ratio_lep_tau         250000 non-null  float64
 12  DER_met_phi_centrality       250000 non-null  float64
 13 

In [59]:
df['Label'].replace(('s', 'b'), (1, 0), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Label'].replace(('s', 'b'), (1, 0), inplace=True)
  df['Label'].replace(('s', 'b'), (1, 0), inplace=True)


In [60]:
X = df.iloc[:, 1:31]
y = df.iloc[:, -1]

### 5.4.5 측정 지표 

### 5.4.6 가중치

In [61]:
df['test_Weight'] = df['Weight'] * 550000 / len(y)

In [66]:
s = np.sum(df[df['Label']==1]['test_Weight'])
b = np.sum(df[df['Label']==0]['test_Weight'])

In [67]:
b/s

np.float64(593.9401931492318)

In [69]:
b/s

np.float64(593.9401931492318)

In [98]:
clf = XGBClassifier(n_estimators=120, learning_rate=0.1, missing=-999.0, scale_pos_wight=b/s)

In [100]:
clf.fit(X, y, 
        sample_weight=df['test_Weight'], 
        eval_set=[(X, y)],
        # eval_metric=['auc', 'ams@0.15'],
        sample_weight_eval_set=[df['test_Weight']]
       )

[0]	validation_0-logloss:0.11846
[1]	validation_0-logloss:0.10720
[2]	validation_0-logloss:0.09714
[3]	validation_0-logloss:0.08816
[4]	validation_0-logloss:0.08012
[5]	validation_0-logloss:0.07291
[6]	validation_0-logloss:0.06646
[7]	validation_0-logloss:0.06067
[8]	validation_0-logloss:0.05547
[9]	validation_0-logloss:0.05081
[10]	validation_0-logloss:0.04662
[11]	validation_0-logloss:0.04285
[12]	validation_0-logloss:0.03946
[13]	validation_0-logloss:0.03642
[14]	validation_0-logloss:0.03369
[15]	validation_0-logloss:0.03123
[16]	validation_0-logloss:0.02901
[17]	validation_0-logloss:0.02702
[18]	validation_0-logloss:0.02523


Parameters: { "scale_pos_wight" } are not used.



[19]	validation_0-logloss:0.02362
[20]	validation_0-logloss:0.02217
[21]	validation_0-logloss:0.02087
[22]	validation_0-logloss:0.01970
[23]	validation_0-logloss:0.01864
[24]	validation_0-logloss:0.01769
[25]	validation_0-logloss:0.01684
[26]	validation_0-logloss:0.01607
[27]	validation_0-logloss:0.01539
[28]	validation_0-logloss:0.01477
[29]	validation_0-logloss:0.01421
[30]	validation_0-logloss:0.01371
[31]	validation_0-logloss:0.01326
[32]	validation_0-logloss:0.01286
[33]	validation_0-logloss:0.01250
[34]	validation_0-logloss:0.01217
[35]	validation_0-logloss:0.01187
[36]	validation_0-logloss:0.01161
[37]	validation_0-logloss:0.01138
[38]	validation_0-logloss:0.01116
[39]	validation_0-logloss:0.01097
[40]	validation_0-logloss:0.01080
[41]	validation_0-logloss:0.01064
[42]	validation_0-logloss:0.01051
[43]	validation_0-logloss:0.01038
[44]	validation_0-logloss:0.01027
[45]	validation_0-logloss:0.01017
[46]	validation_0-logloss:0.01008
[47]	validation_0-logloss:0.01000
[48]	validatio

In [101]:
clf.evals_result()

{'validation_0': OrderedDict([('logloss',
               [0.1184552077933575,
                0.10719550509303788,
                0.09714362549808203,
                0.0881569884906602,
                0.08011579741100654,
                0.07291338709391215,
                0.06645788063444048,
                0.06066853534233743,
                0.05547268728987874,
                0.05080717614206492,
                0.04661693902457795,
                0.04284881071570033,
                0.03946495926992465,
                0.03642301225843319,
                0.03368784093257031,
                0.03122573309837283,
                0.02901221924516596,
                0.02702217465241052,
                0.02523135409781013,
                0.02362159628026857,
                0.02217240434773232,
                0.02086927122408731,
                0.01969584938729907,
                0.01864222877473106,
                0.01769495619569992,
                0.01684346511097822