In [14]:
import pandas as pd
import random
import os
import numpy as np
import matplotlib.pyplot as plt
import warnings; warnings.filterwarnings(action='ignore')
import pickle

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
import sklearn.svm as svm
import xgboost as xgb

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from bayes_opt import BayesianOptimization

from xgboost import plot_importance
from scipy.stats import randint
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [3]:
def seed_everything(seed):
    random.seed(seed) #파이썬 자체 모듈 random 모듈의 시드 고정
    os.environ['PYTHONHASHSEED'] = str(seed) 
    np.random.seed(seed)

seed_everything(37) # Seed 고정

# Data Load

In [4]:
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')

In [5]:
sub = pd.read_csv('./sample_submission.csv')

# Data Preprocessing

X

In [6]:
train_y = train_df['Y_Class']

In [7]:
train_x = train_df.drop(columns = ['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality'])
test_x = test_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP'])

범주형 데이터를 수치 데이터로 전환하기 위해 LabelEncoder 활용

In [8]:
# qualitative to quantitative
qual_col = ['LINE', 'PRODUCT_CODE']

for i in qual_col:
    le = LabelEncoder()
    le = le.fit(train_x[i]) #원래 column 값을 기준으로 fit.
    train_x[i] = le.transform(train_x[i]) #수치화, 수치로 변형

    for label in np.unique(test_x[i]): 
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    test_x[i] = le.transform(test_x[i]) 
print('Done.')

Done.


# Modeling

학습용과 테스트용 데이터 세트를 위해 별도의 Dmatrix를 생성

In [9]:
#t7_0.53219 ########성능 0.683
X_train, X_test, y_train, y_test = train_test_split(train_x, train_y, 
                                                    test_size=0.1, 
                                                    stratify=train_y, 
                                                    random_state=37)

dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)
dtest_x = xgb.DMatrix(data=test_x)

params = {'max_depth' : 6,
          'eta': 0.04,
          'objective':'multi:softmax',
          'num_class':3,
          'eval_metric':'mlogloss',
          'eval_set': [(X_test, y_test)], #적용이 안 되고 있다고 뜸
          'early_stopping':100 #적용이 안 되고 있다고 뜸
          }
num_rounds = 400

wlist = [(dtrain, 'train'), (dtest, 'eval')]
xgb_model = xgb.train(params=params, 
                      dtrain=dtrain, 
                      num_boost_round=num_rounds, 
                      early_stopping_rounds=100, 
                      evals=wlist)

Parameters: { "early_stopping", "eval_set" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	train-mlogloss:1.05928	eval-mlogloss:1.06846
[1]	train-mlogloss:1.02224	eval-mlogloss:1.03964
[2]	train-mlogloss:0.98753	eval-mlogloss:1.01551
[3]	train-mlogloss:0.95449	eval-mlogloss:0.99104
[4]	train-mlogloss:0.92392	eval-mlogloss:0.96798
[5]	train-mlogloss:0.89469	eval-mlogloss:0.94454
[6]	train-mlogloss:0.86730	eval-mlogloss:0.92149
[7]	train-mlogloss:0.84109	eval-mlogloss:0.90036
[8]	train-mlogloss:0.81629	eval-mlogloss:0.88033
[9]	train-mlogloss:0.79239	eval-mlogloss:0.85956
[10]	train-mlogloss:0.77003	eval-mlogloss:0.84263
[11]	train-mlogloss:0.74875	eval-mlogloss:0.82553
[12]	train-mlogloss:0.72855	eval-mlogloss:0.80987
[13]	train-mlogloss:0.70990

[154]	train-mlogloss:0.07586	eval-mlogloss:0.53859
[155]	train-mlogloss:0.07515	eval-mlogloss:0.53809
[156]	train-mlogloss:0.07432	eval-mlogloss:0.53885
[157]	train-mlogloss:0.07356	eval-mlogloss:0.53911
[158]	train-mlogloss:0.07286	eval-mlogloss:0.53812
[159]	train-mlogloss:0.07214	eval-mlogloss:0.53908
[160]	train-mlogloss:0.07149	eval-mlogloss:0.53970
[161]	train-mlogloss:0.07085	eval-mlogloss:0.53975
[162]	train-mlogloss:0.07019	eval-mlogloss:0.53901
[163]	train-mlogloss:0.06951	eval-mlogloss:0.54001
[164]	train-mlogloss:0.06887	eval-mlogloss:0.53961
[165]	train-mlogloss:0.06811	eval-mlogloss:0.53860
[166]	train-mlogloss:0.06752	eval-mlogloss:0.53814
[167]	train-mlogloss:0.06689	eval-mlogloss:0.53932
[168]	train-mlogloss:0.06617	eval-mlogloss:0.53933
[169]	train-mlogloss:0.06552	eval-mlogloss:0.53895
[170]	train-mlogloss:0.06484	eval-mlogloss:0.54014
[171]	train-mlogloss:0.06418	eval-mlogloss:0.54057
[172]	train-mlogloss:0.06358	eval-mlogloss:0.54042
[173]	train-mlogloss:0.06300	ev

In [10]:
print(xgb_model.best_iteration)
print(xgb_model.best_ntree_limit)
print(xgb_model.best_score)

137
138
0.5321924893185497


In [28]:
pred_probs = xgb_model.predict(dtest_x, ntree_limit=xgb_model.best_ntree_limit)
preds = np.round(pred_probs).astype(int)
sub['Y_Class'] = preds
sub.to_csv('./t30_XGB_Dmatrix_Earlystopping_eta004_testsize01.csv', index = False)

In [12]:
#모델 저장
with open('model_XGB_0683.pickle','wb') as fw:
    pickle.dump(xgb_model, fw)