# 붓꽃 데이터 분류 모델 (Iris)

In [1]:
import pandas as pd 
import numpy as np
from sklearn import datasets 
iris = datasets.load_iris()

In [2]:
df = pd.DataFrame(data=np.c_[iris['data'], iris['target']],
                  columns=iris['feature_names'] + ['target'])

In [3]:
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0


In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=  train_test_split(iris['data'], iris['target'], random_state=2)

In [5]:
from xgboost import XGBClassifier

In [6]:
from sklearn.metrics import accuracy_score

In [7]:
xgb = XGBClassifier(booster='gbtree', objective='multi:softprob',
                    max_depth=6, learning_rate=0.1, n_estimators=100,
                    n_jobs=-1)

In [8]:
xgb.fit(X_train, y_train)

XGBClassifier(max_depth=6, n_jobs=-1, objective='multi:softprob')

In [9]:
y_pred = xgb.predict(X_test)

In [10]:
score = accuracy_score(y_pred, y_test)

In [11]:
print('점수: ' + str(score))

점수: 0.9736842105263158


In [12]:
xgb.score(X_test, y_test)

0.9736842105263158

In [13]:
import xgboost as xgb

dtrain = xgb.DMatrix(X_train, y_train)
dtest= xgb.DMatrix(X_test[:5])

param = {'objective': 'multi:softprob', 'num_class': 3}
bstr = xgb.train(param, dtrain, 10)
bstr.predict(dtest)

array([[0.9486482 , 0.0271103 , 0.02424143],
       [0.9486482 , 0.0271103 , 0.02424143],
       [0.02841365, 0.05416912, 0.9174172 ],
       [0.9486482 , 0.0271103 , 0.02424143],
       [0.9486482 , 0.0271103 , 0.02424143]], dtype=float32)

In [14]:
param = {'objective': 'multi:softmax', 'num_class': 3}
bstr = xgb.train(param, dtrain, 10)
bstr.predict(dtest)

array([0., 0., 2., 0., 0.], dtype=float32)

# 당뇨병 데이터셋 회귀 모델 (diabetes)

In [15]:
X, y = datasets.load_diabetes(return_X_y=True)

In [16]:
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor

In [17]:
xgb = XGBRegressor(booster='gbtree', objective='reg:squarederror',
                   max_depth=6, learning_rate=0.1, n_estimators=100,
                   n_jobs=-1)

In [18]:
scores = cross_val_score(xgb, X, y, 
                   scoring='neg_mean_squared_error', cv=5)

In [19]:
rmse = np.sqrt(-scores)
print('RMSE:', np.round(rmse, 3))
print('RMSE 평규니 %0.3F' % (rmse.mean()))

RMSE: [63.033 59.689 64.538 63.699 64.661]
RMSE 평규니 63.124


In [20]:
pd.DataFrame(y).describe()

Unnamed: 0,0
count,442.0
mean,152.133484
std,77.093005
min,25.0
25%,87.0
50%,140.5
75%,211.5
max,346.0


# 힉스 보손 찾기 - 사례연구

In [21]:
df = pd.read_csv('atlas-higgs-challenge-2014-v2.csv.gz', nrows=250000,
                 compression='gzip')
df.head()

Unnamed: 0,EventId,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,...,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt,Weight,Label,KaggleSet,KaggleWeight
0,100000,138.47,51.655,97.827,27.98,0.91,124.711,2.666,3.064,41.928,...,2.15,0.444,46.062,1.24,-2.475,113.497,0.000814,s,t,0.002653
1,100001,160.937,68.768,103.235,48.146,-999.0,-999.0,-999.0,3.473,2.078,...,0.725,1.158,-999.0,-999.0,-999.0,46.226,0.681042,b,t,2.233584
2,100002,-999.0,162.172,125.953,35.635,-999.0,-999.0,-999.0,3.148,9.336,...,2.053,-2.028,-999.0,-999.0,-999.0,44.251,0.715742,b,t,2.347389
3,100003,143.905,81.417,80.943,0.414,-999.0,-999.0,-999.0,3.31,0.414,...,-999.0,-999.0,-999.0,-999.0,-999.0,-0.0,1.660654,b,t,5.446378
4,100004,175.864,16.915,134.805,16.405,-999.0,-999.0,-999.0,3.891,16.405,...,-999.0,-999.0,-999.0,-999.0,-999.0,0.0,1.904263,b,t,6.245333


In [22]:
del df['Weight']
del df['KaggleSet']
df = df.rename(columns={"KaggleWeight": "Weight"})

In [23]:
label_col = df['Label']
del df['Label']
df['Label'] = label_col 

In [25]:
df_h = pd.read_csv('atlas-higgs-challenge-2014-v2.csv.gz', nrows=250000,
                   compression='gzip')

df_new = df_h.drop(['Weight', 'KaggleSet', 'Label'], axis=1).assign(Label=df_h['Label']) 
df_new = df_new.rename(columns={'KaggleWeight': 'Weight'})

In [26]:
df.head()

Unnamed: 0,EventId,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,...,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt,Weight,Label
0,100000,138.47,51.655,97.827,27.98,0.91,124.711,2.666,3.064,41.928,...,2,67.435,2.15,0.444,46.062,1.24,-2.475,113.497,0.002653,s
1,100001,160.937,68.768,103.235,48.146,-999.0,-999.0,-999.0,3.473,2.078,...,1,46.226,0.725,1.158,-999.0,-999.0,-999.0,46.226,2.233584,b
2,100002,-999.0,162.172,125.953,35.635,-999.0,-999.0,-999.0,3.148,9.336,...,1,44.251,2.053,-2.028,-999.0,-999.0,-999.0,44.251,2.347389,b
3,100003,143.905,81.417,80.943,0.414,-999.0,-999.0,-999.0,3.31,0.414,...,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-0.0,5.446378,b
4,100004,175.864,16.915,134.805,16.405,-999.0,-999.0,-999.0,3.891,16.405,...,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0,6.245333,b


In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 33 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   EventId                      250000 non-null  int64  
 1   DER_mass_MMC                 250000 non-null  float64
 2   DER_mass_transverse_met_lep  250000 non-null  float64
 3   DER_mass_vis                 250000 non-null  float64
 4   DER_pt_h                     250000 non-null  float64
 5   DER_deltaeta_jet_jet         250000 non-null  float64
 6   DER_mass_jet_jet             250000 non-null  float64
 7   DER_prodeta_jet_jet          250000 non-null  float64
 8   DER_deltar_tau_lep           250000 non-null  float64
 9   DER_pt_tot                   250000 non-null  float64
 10  DER_sum_pt                   250000 non-null  float64
 11  DER_pt_ratio_lep_tau         250000 non-null  float64
 12  DER_met_phi_centrality       250000 non-null  float64
 13 

In [28]:
df['Label'].replace(('s', 'b'), (1, 0), inplace=True)

In [29]:
X = df.iloc[:,1:31]
y = df.iloc[:,-1]

In [30]:
X_new = df_new.loc[:, ~df_new.columns.isin(['EventId', 'Weight', 'Label'])]
y_new = df_new.loc[:, 'Label']

In [31]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_new = le.fit_transform(df_new['Label'])

In [32]:
print(le.classes_)

['b' 's']


In [33]:
xgb = XGBClassifier(n_estimators=5)
xgb.fit(X_train, y_train,
        eval_set = [(X_train, y_train), (X_test, y_test)],
        eval_metric='mlogloss')

[0]	validation_0-mlogloss:0.967606	validation_1-mlogloss:0.971525
[1]	validation_0-mlogloss:0.857707	validation_1-mlogloss:0.865211
[2]	validation_0-mlogloss:0.764221	validation_1-mlogloss:0.77506
[3]	validation_0-mlogloss:0.68386	validation_1-mlogloss:0.697841
[4]	validation_0-mlogloss:0.614213	validation_1-mlogloss:0.630577


XGBClassifier(n_estimators=5, objective='multi:softprob')

In [38]:
xgb.evals_result

<bound method XGBClassifier.evals_result of XGBClassifier(n_estimators=5, objective='multi:softprob')>

In [39]:
df['test_Weight'] = df['Weight'] * 55000 / len(y)

In [42]:
s = np.sum(df[df['Label']==1]['test_Weight'])
b = np.sum(df[df['Label']==0]['test_Weight'])

In [43]:
b/s

593.940193149232

In [58]:
import xgboost as xgb

In [59]:
xgmat = xgb.DMatrix(X, y, missing = -999.0, weight=df['test_Weight'])

In [60]:
param = {}

In [61]:
param['objective'] = 'binary:logitraw'

In [62]:
param['scale_pos_weight'] = b/s

In [63]:
param['eta'] = 0.1

In [64]:
param['max_depth'] = 6

In [65]:
param['eval_metric'] = 'auc'

In [66]:
plst = list(param.items())+[('eval_metric', 'ams@0.15')]

In [67]:
watchlist = [(xgmat,'train')]

In [68]:
num_round = 120

In [69]:
print('데이터 로딩 완료, 트리 부스팅 시작')
bst = xgb.train(plst, xgmat, num_round, watchlist)
bst.save_model('higgs.model')
print('훈련 종료')

데이터 로딩 완료, 트리 부스팅 시작
[0]	train-auc:0.910806	train-ams@0.15:1.16745
[1]	train-auc:0.915198	train-ams@0.15:1.25307
[2]	train-auc:0.917728	train-ams@0.15:1.28329
[3]	train-auc:0.919317	train-ams@0.15:1.32925
[4]	train-auc:0.92028	train-ams@0.15:1.32867
[5]	train-auc:0.921232	train-ams@0.15:1.35806
[6]	train-auc:0.921844	train-ams@0.15:1.35772
[7]	train-auc:0.922429	train-ams@0.15:1.35772
[8]	train-auc:0.923007	train-ams@0.15:1.36704
[9]	train-auc:0.92397	train-ams@0.15:1.37769
[10]	train-auc:0.924614	train-ams@0.15:1.39123
[11]	train-auc:0.92524	train-ams@0.15:1.38977
[12]	train-auc:0.925802	train-ams@0.15:1.39777
[13]	train-auc:0.92618	train-ams@0.15:1.4068
[14]	train-auc:0.926768	train-ams@0.15:1.41061
[15]	train-auc:0.927291	train-ams@0.15:1.42229
[16]	train-auc:0.927851	train-ams@0.15:1.4278
[17]	train-auc:0.928207	train-ams@0.15:1.42082
[18]	train-auc:0.928758	train-ams@0.15:1.44203
[19]	train-auc:0.929333	train-ams@0.15:1.45897
[20]	train-auc:0.929732	train-ams@0.15:1.47128
[21]	tra

In [71]:
clf = XGBClassifier(n_estimators=120, learning_rate=0.1, missing=-999.0,
                    scale_pos_weight=b/s)

clf.fit(X, y, sample_weight=df['test_Weight'],
        eval_set=[(X, y)], eval_metric=['auc', 'ams@0.15'],
        sample_weight_eval_set=[df['test_Weight']])

[0]	validation_0-auc:0.865637	validation_0-ams@0.15:0.629598
[1]	validation_0-auc:0.880259	validation_0-ams@0.15:0.692053
[2]	validation_0-auc:0.891856	validation_0-ams@0.15:0.86749
[3]	validation_0-auc:0.898697	validation_0-ams@0.15:0.975143
[4]	validation_0-auc:0.899739	validation_0-ams@0.15:0.98151
[5]	validation_0-auc:0.900191	validation_0-ams@0.15:0.987044
[6]	validation_0-auc:0.901801	validation_0-ams@0.15:1.02609
[7]	validation_0-auc:0.903606	validation_0-ams@0.15:1.07812
[8]	validation_0-auc:0.90548	validation_0-ams@0.15:1.14783
[9]	validation_0-auc:0.906781	validation_0-ams@0.15:1.20854
[10]	validation_0-auc:0.907533	validation_0-ams@0.15:1.20124
[11]	validation_0-auc:0.908574	validation_0-ams@0.15:1.25933
[12]	validation_0-auc:0.909159	validation_0-ams@0.15:1.25872
[13]	validation_0-auc:0.91024	validation_0-ams@0.15:1.28615
[14]	validation_0-auc:0.911317	validation_0-ams@0.15:1.28796
[15]	validation_0-auc:0.912097	validation_0-ams@0.15:1.29827
[16]	validation_0-auc:0.912565	v

XGBClassifier(missing=-999.0, n_estimators=120,
              scale_pos_weight=593.940193149232)

In [73]:
clf.save_model('higgs-sklearn.model')
clf.evals_result()

{'validation_0': {'ams@0.15': [0.629598,
   0.692053,
   0.86749,
   0.975143,
   0.98151,
   0.987044,
   1.026095,
   1.078117,
   1.147827,
   1.208539,
   1.201239,
   1.259331,
   1.258715,
   1.286155,
   1.287962,
   1.298267,
   1.297956,
   1.313196,
   1.305574,
   1.322971,
   1.323687,
   1.320077,
   1.329252,
   1.326062,
   1.33925,
   1.338482,
   1.347531,
   1.363397,
   1.367683,
   1.365909,
   1.373831,
   1.374321,
   1.387134,
   1.388808,
   1.399369,
   1.402973,
   1.398468,
   1.404984,
   1.408445,
   1.418936,
   1.42507,
   1.428449,
   1.434896,
   1.43594,
   1.438923,
   1.445116,
   1.449471,
   1.458251,
   1.46064,
   1.466337,
   1.465328,
   1.46523,
   1.468959,
   1.478852,
   1.478443,
   1.479323,
   1.484654,
   1.484919,
   1.493765,
   1.493424,
   1.497615,
   1.50528,
   1.500732,
   1.503211,
   1.511929,
   1.51664,
   1.511169,
   1.516191,
   1.519797,
   1.517912,
   1.516353,
   1.518272,
   1.521291,
   1.523937,
   1.521898,
   1.5