In [211]:
import pandas as pd
import random
import os
import numpy as np
import matplotlib.pyplot as plt
import warnings; warnings.filterwarnings(action='ignore')
import pickle

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
import sklearn.svm as svm
import xgboost as xgb

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from bayes_opt import BayesianOptimization

from xgboost import plot_importance
from scipy.stats import randint
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [212]:
def seed_everything(seed):
    random.seed(seed) #파이썬 자체 모듈 random 모듈의 시드 고정
    os.environ['PYTHONHASHSEED'] = str(seed) 
    np.random.seed(seed)

seed_everything(37) # Seed 고정

# Data Load

In [213]:
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')

In [214]:
sub = pd.read_csv('./sample_submission.csv')
sub2 = pd.read_csv('./sample_submission.csv')
sub3 = pd.read_csv('./sample_submission.csv')
sub4 = pd.read_csv('./sample_submission.csv')
sub5 = pd.read_csv('./sample_submission.csv')

# Data Preprocessing

X

In [228]:
train_010 = train_df[(train_df['LINE']=='T010306') | (train_df['LINE']=='T010305')]
train_050 = train_df[(train_df['LINE']=='T050304') | (train_df['LINE']=='T050307')]
train_100 = train_df[(train_df['LINE']=='T100306') | (train_df['LINE']=='T100304')]

In [229]:
test_010 = test_df[(test_df['LINE']=='T010306') | (test_df['LINE']=='T010305')]
test_050 = test_df[(test_df['LINE']=='T050304') | (test_df['LINE']=='T050307')]
test_100 = test_df[(test_df['LINE']=='T100304') | (test_df['LINE']=='T100306')]

In [237]:
print(len(train_010)) #196 245
print(len(train_050))
print(len(train_100))

129
120
349


In [238]:
print(len(test_010)) #196 245
print(len(test_050))
print(len(test_100))

28
39
243


In [256]:
train_y = train_100['Y_Class']

In [257]:
train_x = train_100.drop(columns = ['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality'])
test_x = test_100.drop(columns=['PRODUCT_ID', 'TIMESTAMP'])

In [258]:
p1 = test_x.index.tolist() #28 #39 # 
p1

[0,
 1,
 2,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 55,
 56,
 57,
 58,
 59,
 60,
 67,
 68,
 69,
 70,
 73,
 74,
 75,
 76,
 77,
 78,
 80,
 81,
 82,
 83,
 84,
 85,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 110,
 111,
 112,
 113,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
 185,
 186,
 187,
 188,
 189,
 190,
 191,
 192,
 193,
 194,
 195,
 196,
 197,
 198,
 199,
 200,
 201,
 202,
 203,
 204,
 205,
 206,
 207,
 208,
 209,
 210,
 211,
 212,
 213,
 214,
 215,
 216,
 217,
 218,
 219,
 220,
 221,
 222,
 223,
 224,
 225,

범주형 데이터를 수치 데이터로 전환하기 위해 LabelEncoder 활용

In [259]:
# qualitative to quantitative
qual_col = ['LINE', 'PRODUCT_CODE']

for i in qual_col:
    le = LabelEncoder()
    le = le.fit(train_x[i]) #원래 column 값을 기준으로 fit.
    train_x[i] = le.transform(train_x[i]) #수치화, 수치로 변형

    for label in np.unique(test_x[i]): 
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    test_x[i] = le.transform(test_x[i]) 
print('Done.')

Done.


# Modeling

학습용과 테스트용 데이터 세트를 위해 별도의 Dmatrix를 생성

In [260]:
#t7_0.53219 ########성능 0.683 testsize만 0.2로 바꿨더니 0.653
X_train, X_test, y_train, y_test = train_test_split(train_x, train_y, 
                                                    test_size=0.2, 
                                                    stratify=train_y, 
                                                    random_state=37)


dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)
dtest_x = xgb.DMatrix(data=test_x)

params = {'max_depth' : 6,
          'eta': 0.035,
          'objective':'multi:softmax',
          'num_class':3,
          'eval_metric':'mlogloss'
          }
num_rounds = 400

wlist = [(dtrain, 'train'), (dtest, 'eval')]
xgb_model = xgb.train(params=params, 
                      dtrain=dtrain, 
                      num_boost_round=num_rounds, 
                      early_stopping_rounds=100, 
                      evals=wlist)

[0]	train-mlogloss:1.05831	eval-mlogloss:1.06865
[1]	train-mlogloss:1.02135	eval-mlogloss:1.03825
[2]	train-mlogloss:0.98582	eval-mlogloss:1.01160
[3]	train-mlogloss:0.95312	eval-mlogloss:0.98509
[4]	train-mlogloss:0.92077	eval-mlogloss:0.96043
[5]	train-mlogloss:0.89129	eval-mlogloss:0.93739
[6]	train-mlogloss:0.86193	eval-mlogloss:0.91788
[7]	train-mlogloss:0.83444	eval-mlogloss:0.89757
[8]	train-mlogloss:0.80886	eval-mlogloss:0.87742
[9]	train-mlogloss:0.78349	eval-mlogloss:0.86147
[10]	train-mlogloss:0.75970	eval-mlogloss:0.84404
[11]	train-mlogloss:0.73745	eval-mlogloss:0.82866
[12]	train-mlogloss:0.71576	eval-mlogloss:0.81398
[13]	train-mlogloss:0.69549	eval-mlogloss:0.80028
[14]	train-mlogloss:0.67507	eval-mlogloss:0.78717
[15]	train-mlogloss:0.65485	eval-mlogloss:0.77542
[16]	train-mlogloss:0.63566	eval-mlogloss:0.76296
[17]	train-mlogloss:0.61755	eval-mlogloss:0.75108
[18]	train-mlogloss:0.59985	eval-mlogloss:0.74139
[19]	train-mlogloss:0.58289	eval-mlogloss:0.73190
[20]	train

In [249]:
# pred_probs = xgb_model.predict(dtest_x, ntree_limit=xgb_model.best_ntree_limit)
y_preds = xgb_model.predict(dtest_x, ntree_limit=xgb_model.best_ntree_limit)
preds_1 = np.round(y_preds).astype(int)
preds1 = pd.DataFrame(preds_1)
p1 = pd.Series(data = p1)
preds10 = pd.concat([preds1, p1], axis = 1)
preds10.columns = ['y_class', 'p1']
preds10.index = preds10['p1']
del preds10['p1']
preds10

Unnamed: 0_level_0,y_class
p1,Unnamed: 1_level_1
3,1
4,1
5,1
6,1
9,1
10,1
11,1
12,1
37,0
38,0


In [255]:
y_preds = xgb_model.predict(dtest_x, ntree_limit=xgb_model.best_ntree_limit)
preds_2 = np.round(y_preds).astype(int)
preds2 = pd.DataFrame(preds_2)
p1 = pd.Series(data = p1)
preds20 = pd.concat([preds2, p1], axis = 1)
preds20.columns = ['y_class', 'p1']
preds20.index = preds20['p1']
del preds20['p1']
preds20

Unnamed: 0_level_0,y_class
p1,Unnamed: 1_level_1
7,0
8,1
13,2
14,0
35,0
36,2
41,0
42,0
52,0
53,0


In [261]:
y_preds = xgb_model.predict(dtest_x, ntree_limit=xgb_model.best_ntree_limit)
preds_3 = np.round(y_preds).astype(int)
preds3 = pd.DataFrame(preds_3)
p1 = pd.Series(data = p1)
preds30 = pd.concat([preds3, p1], axis = 1)
preds30.columns = ['y_class', 'p1']
preds30.index = preds30['p1']
del preds30['p1']
preds30

Unnamed: 0_level_0,y_class
p1,Unnamed: 1_level_1
0,1
1,2
2,1
15,1
16,1
...,...
305,2
306,1
307,1
308,2


In [272]:
preds30[40:80]

Unnamed: 0_level_0,y_class
p1,Unnamed: 1_level_1
69,1
70,1
73,1
74,1
75,1
76,1
77,1
78,2
80,1
81,1


In [262]:
preds = pd.concat([preds10, preds20, preds30], axis = 0)
preds = preds.sort_index(ascending=True)

In [265]:
preds

Unnamed: 0_level_0,y_class
p1,Unnamed: 1_level_1
0,1
1,2
2,1
3,1
4,1
...,...
305,2
306,1
307,1
308,2


In [269]:
sub2['Y_Class'] = preds['y_class']
sub2.to_csv('./t30_XGB_sep_LINEdata_com.csv', index = False)

In [270]:
preds

Unnamed: 0_level_0,y_class
p1,Unnamed: 1_level_1
0,1
1,2
2,1
3,1
4,1
...,...
305,2
306,1
307,1
308,2


In [268]:
subm = pd.read_csv('/Users/kimminyoung/Desktop/Dacon_SmartFactory/t30_XGB_sep_LINEdata_com.csv')
subm.describe()

Unnamed: 0,Y_Class
count,310.0
mean,0.980645
std,0.425269
min,0.0
25%,1.0
50%,1.0
75%,1.0
max,2.0


In [98]:
y_pred = xgb_model.predict(dtest, ntree_limit=xgb_model.best_ntree_limit)
preds = np.round(y_pred).astype(int)
print(f1_score(y_test, preds, average = 'macro'))

0.6530173736877647


In [99]:
#t7_0.53219 ########성능 0.683 원래대로 진행 0.782
X_train, X_test, y_train, y_test = train_test_split(train_x, train_y, 
                                                    test_size=0.1, 
                                                    stratify=train_y, 
                                                    random_state=37)


dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)
#dtest_x = xgb.DMatrix(data=test_x)

params = {'max_depth' : 6,
          'eta': 0.04,
          'objective':'multi:softmax',
          'num_class':3,
          'eval_metric':'mlogloss',
#           'eval_set': [(X_test, y_test)], #적용이 안 되고 있다고 뜸
#           'early_stopping':100 #적용이 안 되고 있다고 뜸
          }
num_rounds = 400

wlist = [(dtrain, 'train'), (dtest, 'eval')]
xgb_model = xgb.train(params=params, 
                      dtrain=dtrain, 
                      num_boost_round=num_rounds, 
                      early_stopping_rounds=100, 
                      evals=wlist)

[0]	train-mlogloss:1.05928	eval-mlogloss:1.06846
[1]	train-mlogloss:1.02224	eval-mlogloss:1.03964
[2]	train-mlogloss:0.98753	eval-mlogloss:1.01551
[3]	train-mlogloss:0.95449	eval-mlogloss:0.99104
[4]	train-mlogloss:0.92392	eval-mlogloss:0.96798
[5]	train-mlogloss:0.89469	eval-mlogloss:0.94454
[6]	train-mlogloss:0.86730	eval-mlogloss:0.92149
[7]	train-mlogloss:0.84109	eval-mlogloss:0.90036
[8]	train-mlogloss:0.81629	eval-mlogloss:0.88033
[9]	train-mlogloss:0.79239	eval-mlogloss:0.85956
[10]	train-mlogloss:0.77003	eval-mlogloss:0.84263
[11]	train-mlogloss:0.74875	eval-mlogloss:0.82553
[12]	train-mlogloss:0.72855	eval-mlogloss:0.80987
[13]	train-mlogloss:0.70990	eval-mlogloss:0.79619
[14]	train-mlogloss:0.69126	eval-mlogloss:0.78199
[15]	train-mlogloss:0.67458	eval-mlogloss:0.76859
[16]	train-mlogloss:0.65735	eval-mlogloss:0.75758
[17]	train-mlogloss:0.64150	eval-mlogloss:0.74715
[18]	train-mlogloss:0.62598	eval-mlogloss:0.73522
[19]	train-mlogloss:0.61060	eval-mlogloss:0.72494
[20]	train

[163]	train-mlogloss:0.06951	eval-mlogloss:0.54001
[164]	train-mlogloss:0.06887	eval-mlogloss:0.53961
[165]	train-mlogloss:0.06811	eval-mlogloss:0.53860
[166]	train-mlogloss:0.06752	eval-mlogloss:0.53814
[167]	train-mlogloss:0.06689	eval-mlogloss:0.53932
[168]	train-mlogloss:0.06617	eval-mlogloss:0.53933
[169]	train-mlogloss:0.06552	eval-mlogloss:0.53895
[170]	train-mlogloss:0.06484	eval-mlogloss:0.54014
[171]	train-mlogloss:0.06418	eval-mlogloss:0.54057
[172]	train-mlogloss:0.06358	eval-mlogloss:0.54042
[173]	train-mlogloss:0.06300	eval-mlogloss:0.54039
[174]	train-mlogloss:0.06247	eval-mlogloss:0.54069
[175]	train-mlogloss:0.06187	eval-mlogloss:0.54067
[176]	train-mlogloss:0.06126	eval-mlogloss:0.53972
[177]	train-mlogloss:0.06071	eval-mlogloss:0.54002
[178]	train-mlogloss:0.06011	eval-mlogloss:0.54000
[179]	train-mlogloss:0.05962	eval-mlogloss:0.53936
[180]	train-mlogloss:0.05904	eval-mlogloss:0.53939
[181]	train-mlogloss:0.05853	eval-mlogloss:0.53913
[182]	train-mlogloss:0.05807	ev

In [100]:
y_pred = xgb_model.predict(dtest, ntree_limit=xgb_model.best_ntree_limit)
preds = np.round(y_pred).astype(int)
print(f1_score(y_test, preds, average = 'macro'))

0.782559847895601


In [103]:
#t7_0.53219 ########성능 0.683 원래에서 max_depth 7로 0.787 과적합 가능성 있음
X_train, X_test, y_train, y_test = train_test_split(train_x, train_y, 
                                                    test_size=0.1, 
                                                    stratify=train_y, 
                                                    random_state=37)


dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)
#dtest_x = xgb.DMatrix(data=test_x)

params = {'max_depth' : 7,
          'eta': 0.04,
          'objective':'multi:softmax',
          'num_class':3,
          'eval_metric':'mlogloss',
#           'eval_set': [(X_test, y_test)], #적용이 안 되고 있다고 뜸
#           'early_stopping':100 #적용이 안 되고 있다고 뜸
          }
num_rounds = 400

wlist = [(dtrain, 'train'), (dtest, 'eval')]
xgb_model = xgb.train(params=params, 
                      dtrain=dtrain, 
                      num_boost_round=num_rounds, 
                      early_stopping_rounds=100, 
                      evals=wlist)

[0]	train-mlogloss:1.05734	eval-mlogloss:1.06639
[1]	train-mlogloss:1.01836	eval-mlogloss:1.03465
[2]	train-mlogloss:0.98192	eval-mlogloss:1.00666
[3]	train-mlogloss:0.94688	eval-mlogloss:0.98095
[4]	train-mlogloss:0.91448	eval-mlogloss:0.95759
[5]	train-mlogloss:0.88443	eval-mlogloss:0.93223
[6]	train-mlogloss:0.85517	eval-mlogloss:0.90750
[7]	train-mlogloss:0.82821	eval-mlogloss:0.88531
[8]	train-mlogloss:0.80172	eval-mlogloss:0.86370
[9]	train-mlogloss:0.77609	eval-mlogloss:0.84379
[10]	train-mlogloss:0.75217	eval-mlogloss:0.82476
[11]	train-mlogloss:0.72932	eval-mlogloss:0.80693
[12]	train-mlogloss:0.70798	eval-mlogloss:0.79104
[13]	train-mlogloss:0.68855	eval-mlogloss:0.77681
[14]	train-mlogloss:0.66958	eval-mlogloss:0.76155
[15]	train-mlogloss:0.65132	eval-mlogloss:0.74939
[16]	train-mlogloss:0.63192	eval-mlogloss:0.73638
[17]	train-mlogloss:0.61405	eval-mlogloss:0.72359
[18]	train-mlogloss:0.59767	eval-mlogloss:0.71347
[19]	train-mlogloss:0.58050	eval-mlogloss:0.70147
[20]	train

[163]	train-mlogloss:0.04885	eval-mlogloss:0.52874
[164]	train-mlogloss:0.04832	eval-mlogloss:0.52860
[165]	train-mlogloss:0.04781	eval-mlogloss:0.52807
[166]	train-mlogloss:0.04726	eval-mlogloss:0.52934
[167]	train-mlogloss:0.04663	eval-mlogloss:0.53003
[168]	train-mlogloss:0.04610	eval-mlogloss:0.53017
[169]	train-mlogloss:0.04570	eval-mlogloss:0.53035
[170]	train-mlogloss:0.04516	eval-mlogloss:0.53122
[171]	train-mlogloss:0.04463	eval-mlogloss:0.53193
[172]	train-mlogloss:0.04414	eval-mlogloss:0.53294
[173]	train-mlogloss:0.04362	eval-mlogloss:0.53284
[174]	train-mlogloss:0.04310	eval-mlogloss:0.53328
[175]	train-mlogloss:0.04268	eval-mlogloss:0.53409
[176]	train-mlogloss:0.04219	eval-mlogloss:0.53437
[177]	train-mlogloss:0.04169	eval-mlogloss:0.53506
[178]	train-mlogloss:0.04123	eval-mlogloss:0.53469
[179]	train-mlogloss:0.04078	eval-mlogloss:0.53437


In [104]:
y_pred = xgb_model.predict(dtest, ntree_limit=xgb_model.best_ntree_limit)
preds = np.round(y_pred).astype(int)
print(f1_score(y_test, preds, average = 'macro'))

0.7877394636015325


In [105]:
#t7_0.53219 ########성능 0.683 원래에서 max_depth 10로 0.791
X_train, X_test, y_train, y_test = train_test_split(train_x, train_y, 
                                                    test_size=0.1, 
                                                    stratify=train_y, 
                                                    random_state=37)


dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)
#dtest_x = xgb.DMatrix(data=test_x)

params = {'max_depth' : 10,
          'eta': 0.03,
          'objective':'multi:softmax',
          'num_class':3,
          'eval_metric':'mlogloss',
#           'eval_set': [(X_test, y_test)], #적용이 안 되고 있다고 뜸
#           'early_stopping':100 #적용이 안 되고 있다고 뜸
          }
num_rounds = 400

wlist = [(dtrain, 'train'), (dtest, 'eval')]
xgb_model = xgb.train(params=params, 
                      dtrain=dtrain, 
                      num_boost_round=num_rounds, 
                      early_stopping_rounds=100, 
                      evals=wlist)

[0]	train-mlogloss:1.06473	eval-mlogloss:1.07224
[1]	train-mlogloss:1.03262	eval-mlogloss:1.05003
[2]	train-mlogloss:1.00198	eval-mlogloss:1.02696
[3]	train-mlogloss:0.97269	eval-mlogloss:1.00597
[4]	train-mlogloss:0.94468	eval-mlogloss:0.98503
[5]	train-mlogloss:0.91779	eval-mlogloss:0.96462
[6]	train-mlogloss:0.89202	eval-mlogloss:0.94676
[7]	train-mlogloss:0.86716	eval-mlogloss:0.92772
[8]	train-mlogloss:0.84327	eval-mlogloss:0.90891
[9]	train-mlogloss:0.82052	eval-mlogloss:0.89083
[10]	train-mlogloss:0.79841	eval-mlogloss:0.87388
[11]	train-mlogloss:0.77796	eval-mlogloss:0.85915
[12]	train-mlogloss:0.75698	eval-mlogloss:0.84447
[13]	train-mlogloss:0.73748	eval-mlogloss:0.83089
[14]	train-mlogloss:0.71823	eval-mlogloss:0.81684
[15]	train-mlogloss:0.69980	eval-mlogloss:0.80418
[16]	train-mlogloss:0.68107	eval-mlogloss:0.79105
[17]	train-mlogloss:0.66354	eval-mlogloss:0.77746
[18]	train-mlogloss:0.64592	eval-mlogloss:0.76592
[19]	train-mlogloss:0.62957	eval-mlogloss:0.75423
[20]	train

[163]	train-mlogloss:0.04835	eval-mlogloss:0.51480
[164]	train-mlogloss:0.04785	eval-mlogloss:0.51524
[165]	train-mlogloss:0.04734	eval-mlogloss:0.51564
[166]	train-mlogloss:0.04686	eval-mlogloss:0.51650
[167]	train-mlogloss:0.04635	eval-mlogloss:0.51691
[168]	train-mlogloss:0.04586	eval-mlogloss:0.51747
[169]	train-mlogloss:0.04536	eval-mlogloss:0.51727
[170]	train-mlogloss:0.04490	eval-mlogloss:0.51772
[171]	train-mlogloss:0.04442	eval-mlogloss:0.51807
[172]	train-mlogloss:0.04397	eval-mlogloss:0.51866
[173]	train-mlogloss:0.04353	eval-mlogloss:0.51901
[174]	train-mlogloss:0.04311	eval-mlogloss:0.51940
[175]	train-mlogloss:0.04267	eval-mlogloss:0.52013
[176]	train-mlogloss:0.04224	eval-mlogloss:0.51985
[177]	train-mlogloss:0.04184	eval-mlogloss:0.52027
[178]	train-mlogloss:0.04142	eval-mlogloss:0.51979
[179]	train-mlogloss:0.04101	eval-mlogloss:0.51993
[180]	train-mlogloss:0.04061	eval-mlogloss:0.52042
[181]	train-mlogloss:0.04022	eval-mlogloss:0.52069
[182]	train-mlogloss:0.03984	ev

In [106]:
y_pred = xgb_model.predict(dtest, ntree_limit=xgb_model.best_ntree_limit)
preds = np.round(y_pred).astype(int)
print(f1_score(y_test, preds, average = 'macro'))

0.7918075276087446


In [125]:
#t7_0.53219 ########성능 0.683 원래에서 eta 0.035로 0.807
X_train, X_test, y_train, y_test = train_test_split(train_x, train_y, 
                                                    test_size=0.1, 
                                                    stratify=train_y, 
                                                    random_state=37)


dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)
dtest_x = xgb.DMatrix(data=test_x)

params = {'max_depth' : 6,
          'eta': 0.035,
          'objective':'multi:softmax',
          'num_class':3,
          'eval_metric':'mlogloss'
          }
num_rounds = 400

wlist = [(dtrain, 'train'), (dtest, 'eval')]
xgb_model = xgb.train(params=params, 
                      dtrain=dtrain, 
                      num_boost_round=num_rounds, 
                      early_stopping_rounds=100, 
                      evals=wlist)

[0]	train-mlogloss:1.06414	eval-mlogloss:1.07218
[1]	train-mlogloss:1.03142	eval-mlogloss:1.04668
[2]	train-mlogloss:1.00052	eval-mlogloss:1.02519
[3]	train-mlogloss:0.97054	eval-mlogloss:1.00240
[4]	train-mlogloss:0.94299	eval-mlogloss:0.98364
[5]	train-mlogloss:0.91671	eval-mlogloss:0.96251
[6]	train-mlogloss:0.89149	eval-mlogloss:0.94173
[7]	train-mlogloss:0.86732	eval-mlogloss:0.92338
[8]	train-mlogloss:0.84442	eval-mlogloss:0.90463
[9]	train-mlogloss:0.82251	eval-mlogloss:0.88676
[10]	train-mlogloss:0.80135	eval-mlogloss:0.86907
[11]	train-mlogloss:0.78150	eval-mlogloss:0.85287
[12]	train-mlogloss:0.76243	eval-mlogloss:0.83708
[13]	train-mlogloss:0.74417	eval-mlogloss:0.82313
[14]	train-mlogloss:0.72705	eval-mlogloss:0.80971
[15]	train-mlogloss:0.71031	eval-mlogloss:0.79851
[16]	train-mlogloss:0.69456	eval-mlogloss:0.78730
[17]	train-mlogloss:0.67880	eval-mlogloss:0.77663
[18]	train-mlogloss:0.66375	eval-mlogloss:0.76521
[19]	train-mlogloss:0.64977	eval-mlogloss:0.75537
[20]	train

[163]	train-mlogloss:0.08658	eval-mlogloss:0.54319
[164]	train-mlogloss:0.08580	eval-mlogloss:0.54360
[165]	train-mlogloss:0.08513	eval-mlogloss:0.54335
[166]	train-mlogloss:0.08432	eval-mlogloss:0.54375
[167]	train-mlogloss:0.08355	eval-mlogloss:0.54421
[168]	train-mlogloss:0.08263	eval-mlogloss:0.54343
[169]	train-mlogloss:0.08186	eval-mlogloss:0.54386
[170]	train-mlogloss:0.08103	eval-mlogloss:0.54450
[171]	train-mlogloss:0.08021	eval-mlogloss:0.54430
[172]	train-mlogloss:0.07954	eval-mlogloss:0.54411
[173]	train-mlogloss:0.07879	eval-mlogloss:0.54337
[174]	train-mlogloss:0.07806	eval-mlogloss:0.54366
[175]	train-mlogloss:0.07740	eval-mlogloss:0.54397
[176]	train-mlogloss:0.07670	eval-mlogloss:0.54462
[177]	train-mlogloss:0.07605	eval-mlogloss:0.54556
[178]	train-mlogloss:0.07541	eval-mlogloss:0.54595
[179]	train-mlogloss:0.07477	eval-mlogloss:0.54635
[180]	train-mlogloss:0.07410	eval-mlogloss:0.54706
[181]	train-mlogloss:0.07353	eval-mlogloss:0.54824
[182]	train-mlogloss:0.07288	ev

In [126]:
y_pred = xgb_model.predict(dtest, ntree_limit=xgb_model.best_ntree_limit)
preds = np.round(y_pred).astype(int)
print(f1_score(y_test, preds, average = 'macro'))

0.8070874861572537


In [127]:
pred_probs = xgb_model.predict(dtest_x, ntree_limit=xgb_model.best_ntree_limit)
preds = np.round(pred_probs).astype(int)
sub3['Y_Class'] = preds
sub3.to_csv('./t30_XGB_Dmatrix_Earlystopping_eta0035_testsize01.csv', index = False)

In [123]:
y_pred = xgb_model.predict(dtest, ntree_limit=xgb_model.best_ntree_limit)
preds = np.round(y_pred).astype(int)
print(f1_score(y_test, preds, average = 'macro'))

0.7557720057720058


In [190]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

xgb_model = XGBClassifier(n_estimators=200)

X_train, X_val, y_train, y_val = train_test_split(train_x, train_y, 
                                                    test_size=0.1, 
                                                    stratify=train_y, 
                                                    random_state=37)

# 후보 파라미터 선정
params = {'max_depth':[6], 'min_child_weight':[1,3,5], 'colsample_bytree':[0.75,1]}
          #'learning_rate' :[0.01, 0.02, 0.03, 0.035]

# gridsearchcv 객체 정보 입력(어떤 모델, 파라미터 후보, 교차검증 몇 번)
gridcv = GridSearchCV(xgb_model, param_grid=params, cv=3)

# 파라미터 튜닝 시작
gridcv.fit(train_x, train_y, early_stopping_rounds=20, eval_metric='mlogloss', eval_set=[(X_val, y_val)])

#튜닝된 파라미터 출력
print(gridcv.best_params_)

[0]	validation_0-mlogloss:0.95927
[1]	validation_0-mlogloss:0.85181
[2]	validation_0-mlogloss:0.75319
[3]	validation_0-mlogloss:0.72759
[4]	validation_0-mlogloss:0.72671
[5]	validation_0-mlogloss:0.69939
[6]	validation_0-mlogloss:0.66953
[7]	validation_0-mlogloss:0.65160
[8]	validation_0-mlogloss:0.65946
[9]	validation_0-mlogloss:0.61611
[10]	validation_0-mlogloss:0.61855
[11]	validation_0-mlogloss:0.60403
[12]	validation_0-mlogloss:0.59930
[13]	validation_0-mlogloss:0.59560
[14]	validation_0-mlogloss:0.57280
[15]	validation_0-mlogloss:0.56851
[16]	validation_0-mlogloss:0.55139
[17]	validation_0-mlogloss:0.54871
[18]	validation_0-mlogloss:0.55955
[19]	validation_0-mlogloss:0.57247
[20]	validation_0-mlogloss:0.57112
[21]	validation_0-mlogloss:0.57502
[22]	validation_0-mlogloss:0.57584
[23]	validation_0-mlogloss:0.57856
[24]	validation_0-mlogloss:0.57598
[25]	validation_0-mlogloss:0.57346
[26]	validation_0-mlogloss:0.57668
[27]	validation_0-mlogloss:0.57791
[28]	validation_0-mlogloss:0.5

[28]	validation_0-mlogloss:0.53232
[29]	validation_0-mlogloss:0.53315
[30]	validation_0-mlogloss:0.52233
[31]	validation_0-mlogloss:0.52261
[32]	validation_0-mlogloss:0.53075
[33]	validation_0-mlogloss:0.53251
[34]	validation_0-mlogloss:0.52875
[35]	validation_0-mlogloss:0.52810
[36]	validation_0-mlogloss:0.54207
[37]	validation_0-mlogloss:0.54290
[38]	validation_0-mlogloss:0.53631
[0]	validation_0-mlogloss:0.89777
[1]	validation_0-mlogloss:0.76063
[2]	validation_0-mlogloss:0.66868
[3]	validation_0-mlogloss:0.60883
[4]	validation_0-mlogloss:0.55450
[5]	validation_0-mlogloss:0.52922
[6]	validation_0-mlogloss:0.50608
[7]	validation_0-mlogloss:0.48786
[8]	validation_0-mlogloss:0.47597
[9]	validation_0-mlogloss:0.46468
[10]	validation_0-mlogloss:0.44858
[11]	validation_0-mlogloss:0.44312
[12]	validation_0-mlogloss:0.44223
[13]	validation_0-mlogloss:0.44209
[14]	validation_0-mlogloss:0.43977
[15]	validation_0-mlogloss:0.43805
[16]	validation_0-mlogloss:0.43049
[17]	validation_0-mlogloss:0.4

[37]	validation_0-mlogloss:0.55547
[38]	validation_0-mlogloss:0.56283
[39]	validation_0-mlogloss:0.55983
[40]	validation_0-mlogloss:0.56468
[41]	validation_0-mlogloss:0.55980
[42]	validation_0-mlogloss:0.56168
[43]	validation_0-mlogloss:0.56228
[44]	validation_0-mlogloss:0.56161
[45]	validation_0-mlogloss:0.56492
[46]	validation_0-mlogloss:0.56491
[47]	validation_0-mlogloss:0.56115
[48]	validation_0-mlogloss:0.56128
[49]	validation_0-mlogloss:0.56020
[50]	validation_0-mlogloss:0.55857
[0]	validation_0-mlogloss:0.84658
[1]	validation_0-mlogloss:0.67239
[2]	validation_0-mlogloss:0.58583
[3]	validation_0-mlogloss:0.51811
[4]	validation_0-mlogloss:0.45745
[5]	validation_0-mlogloss:0.42760
[6]	validation_0-mlogloss:0.40417
[7]	validation_0-mlogloss:0.38103
[8]	validation_0-mlogloss:0.37547
[9]	validation_0-mlogloss:0.37695
[10]	validation_0-mlogloss:0.37481
[11]	validation_0-mlogloss:0.37520
[12]	validation_0-mlogloss:0.36853
[13]	validation_0-mlogloss:0.37304
[14]	validation_0-mlogloss:0.3

[33]	validation_0-mlogloss:0.04302
[34]	validation_0-mlogloss:0.04144
[35]	validation_0-mlogloss:0.04025
[36]	validation_0-mlogloss:0.03941
[37]	validation_0-mlogloss:0.03851
[38]	validation_0-mlogloss:0.03717
[39]	validation_0-mlogloss:0.03742
[40]	validation_0-mlogloss:0.03671
[41]	validation_0-mlogloss:0.03590
[42]	validation_0-mlogloss:0.03564
[43]	validation_0-mlogloss:0.03554
[44]	validation_0-mlogloss:0.03430
[45]	validation_0-mlogloss:0.03389
[46]	validation_0-mlogloss:0.03307
[47]	validation_0-mlogloss:0.03239
[48]	validation_0-mlogloss:0.03231
[49]	validation_0-mlogloss:0.03147
[50]	validation_0-mlogloss:0.03135
[51]	validation_0-mlogloss:0.03054
[52]	validation_0-mlogloss:0.03072
[53]	validation_0-mlogloss:0.03039
[54]	validation_0-mlogloss:0.03011
[55]	validation_0-mlogloss:0.02979
[56]	validation_0-mlogloss:0.02947
[57]	validation_0-mlogloss:0.02919
[58]	validation_0-mlogloss:0.02880
[59]	validation_0-mlogloss:0.02841
[60]	validation_0-mlogloss:0.02806
[61]	validation_0-ml

In [None]:
# 1차적으로 튜닝된 파라미터를 가지고 객체 생성
xgb_model = XGBClassifier(n_estimators=1000, learning_rate=0.02, max_depth=7, min_child_weight=1, colsample_bytree=0.75, reg_alpha=0.03)

# 학습
xgb_model.fit(X_train, y_train, early_stopping_rounds=200, eval_metric='mlogloss', eval_set=[(X_val, y_val)])

In [185]:
#t7_0.53219 ########성능 0.683 train데이터내에서 train validation test 나눠서
X_train, X_test, y_train, y_test = train_test_split(train_x, train_y, 
                                                    test_size=0.1, 
                                                    stratify=train_y, 
                                                    random_state=37)

dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)
dtest_x = xgb.DMatrix(data=test_x)

params = {'max_depth' : 6,
          'eta': 0.035,
          'objective':'multi:softmax',
          'num_class':3,
          'eval_metric':'mlogloss',
          'eval_set': [(X_test, y_test)], #적용이 안 되고 있다고 뜸
          'early_stopping':100 #적용이 안 되고 있다고 뜸
          }
num_rounds = 400

wlist = [(dtrain, 'train'), (dtest, 'eval')]
xgb_model = xgb.train(params=params, 
                      dtrain=dtrain, 
                      num_boost_round=num_rounds, 
                      early_stopping_rounds=100, 
                      evals=wlist)

Parameters: { "early_stopping", "eval_set" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	train-mlogloss:1.05972	eval-mlogloss:1.07173
[1]	train-mlogloss:1.02242	eval-mlogloss:1.04669
[2]	train-mlogloss:0.98684	eval-mlogloss:1.02306
[3]	train-mlogloss:0.95344	eval-mlogloss:1.00009
[4]	train-mlogloss:0.92220	eval-mlogloss:0.97894
[5]	train-mlogloss:0.89228	eval-mlogloss:0.95901
[6]	train-mlogloss:0.86367	eval-mlogloss:0.94035
[7]	train-mlogloss:0.83676	eval-mlogloss:0.92235
[8]	train-mlogloss:0.81038	eval-mlogloss:0.90631
[9]	train-mlogloss:0.78413	eval-mlogloss:0.89372
[10]	train-mlogloss:0.76006	eval-mlogloss:0.87879
[11]	train-mlogloss:0.73582	eval-mlogloss:0.86668
[12]	train-mlogloss:0.71379	eval-mlogloss:0.85385
[13]	train-mlogloss:0.69135

In [71]:
print(xgb_model.best_iteration)
print(xgb_model.best_ntree_limit)
print(xgb_model.best_score)

43
44
0.7244108068943024


In [12]:
#모델 저장
with open('model_XGB_0683.pickle','wb') as fw:
    pickle.dump(xgb_model, fw)