In [129]:
import pandas as pd
import random
import os
import numpy as np
import matplotlib.pyplot as plt
import warnings; warnings.filterwarnings(action='ignore')
import pickle

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
import sklearn.svm as svm
import xgboost as xgb

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from bayes_opt import BayesianOptimization

from xgboost import plot_importance
from scipy.stats import randint
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [130]:
def seed_everything(seed):
    random.seed(seed) #파이썬 자체 모듈 random 모듈의 시드 고정
    os.environ['PYTHONHASHSEED'] = str(seed) 
    np.random.seed(seed)

seed_everything(37) # Seed 고정

# Data Load

In [131]:
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')

In [132]:
sub = pd.read_csv('./sample_submission.csv')
sub2 = pd.read_csv('./sample_submission.csv')
sub3 = pd.read_csv('./sample_submission.csv')
sub4 = pd.read_csv('./sample_submission.csv')
sub5 = pd.read_csv('./sample_submission.csv')

# Data Preprocessing

X

In [176]:
train_010 = train_df[(train_df['LINE']=='T010306') | (train_df['LINE']=='T010305')]
train_050 = train_df[(train_df['LINE']=='T050304') | (train_df['LINE']=='T050307')]
train_100 = train_df[(train_df['LINE']=='T010306') | (train_df['LINE']=='T100304')]

In [177]:
test_010 = test_df[(test_df['LINE']=='T010306') | (test_df['LINE']=='T010305')]
test_050 = test_df[(test_df['LINE']=='T050304') | (test_df['LINE']=='T050307')]
test_100 = test_df[(test_df['LINE']=='T100304') | (test_df['LINE']=='T100306')]

In [181]:
train_y = train_100['Y_Class']

In [182]:
train_x = train_100.drop(columns = ['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality'])
test_x = test_100.drop(columns=['PRODUCT_ID', 'TIMESTAMP'])

In [183]:
p1 = test_x.index.tolist() #28 #39 #243
len(p1)

243

범주형 데이터를 수치 데이터로 전환하기 위해 LabelEncoder 활용

In [184]:
# qualitative to quantitative
qual_col = ['LINE', 'PRODUCT_CODE']

for i in qual_col:
    le = LabelEncoder()
    le = le.fit(train_x[i]) #원래 column 값을 기준으로 fit.
    train_x[i] = le.transform(train_x[i]) #수치화, 수치로 변형

    for label in np.unique(test_x[i]): 
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    test_x[i] = le.transform(test_x[i]) 
print('Done.')

Done.


# Modeling

학습용과 테스트용 데이터 세트를 위해 별도의 Dmatrix를 생성

In [185]:
#t7_0.53219 ########성능 0.683 train데이터내에서 train validation test 나눠서
X_train, X_test, y_train, y_test = train_test_split(train_x, train_y, 
                                                    test_size=0.1, 
                                                    stratify=train_y, 
                                                    random_state=37)

dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)
dtest_x = xgb.DMatrix(data=test_x)

params = {'max_depth' : 6,
          'eta': 0.035,
          'objective':'multi:softmax',
          'num_class':3,
          'eval_metric':'mlogloss',
          'eval_set': [(X_test, y_test)], #적용이 안 되고 있다고 뜸
          'early_stopping':100 #적용이 안 되고 있다고 뜸
          }
num_rounds = 400

wlist = [(dtrain, 'train'), (dtest, 'eval')]
xgb_model = xgb.train(params=params, 
                      dtrain=dtrain, 
                      num_boost_round=num_rounds, 
                      early_stopping_rounds=100, 
                      evals=wlist)

Parameters: { "early_stopping", "eval_set" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	train-mlogloss:1.05972	eval-mlogloss:1.07173
[1]	train-mlogloss:1.02242	eval-mlogloss:1.04669
[2]	train-mlogloss:0.98684	eval-mlogloss:1.02306
[3]	train-mlogloss:0.95344	eval-mlogloss:1.00009
[4]	train-mlogloss:0.92220	eval-mlogloss:0.97894
[5]	train-mlogloss:0.89228	eval-mlogloss:0.95901
[6]	train-mlogloss:0.86367	eval-mlogloss:0.94035
[7]	train-mlogloss:0.83676	eval-mlogloss:0.92235
[8]	train-mlogloss:0.81038	eval-mlogloss:0.90631
[9]	train-mlogloss:0.78413	eval-mlogloss:0.89372
[10]	train-mlogloss:0.76006	eval-mlogloss:0.87879
[11]	train-mlogloss:0.73582	eval-mlogloss:0.86668
[12]	train-mlogloss:0.71379	eval-mlogloss:0.85385
[13]	train-mlogloss:0.69135

In [71]:
print(xgb_model.best_iteration)
print(xgb_model.best_ntree_limit)
print(xgb_model.best_score)

43
44
0.7244108068943024


In [148]:
# pred_probs = xgb_model.predict(dtest_x, ntree_limit=xgb_model.best_ntree_limit)
y_preds = xgb_model.predict(dtest_x, ntree_limit=xgb_model.best_ntree_limit)
preds_1 = np.round(y_preds).astype(int)
preds1 = pd.DataFrame(preds_1)
p1 = pd.Series(data = p1)
preds10 = pd.concat([preds1, p1], axis = 1)
preds10.columns = ['y_class', 'p1']
preds10.index = preds10['p1']
del preds10['p1']
preds10

Unnamed: 0_level_0,y_class
p1,Unnamed: 1_level_1
3,1
4,1
5,1
6,1
9,1
10,1
11,1
12,1
37,0
38,0


In [154]:
y_preds = xgb_model.predict(dtest_x, ntree_limit=xgb_model.best_ntree_limit)
preds_2 = np.round(y_preds).astype(int)
preds2 = pd.DataFrame(preds_2)
p1 = pd.Series(data = p1)
preds20 = pd.concat([preds2, p1], axis = 1)
preds20.columns = ['y_class', 'p1']
preds20.index = preds20['p1']
del preds20['p1']
preds20

Unnamed: 0_level_0,y_class
p1,Unnamed: 1_level_1
7,0
8,1
13,2
14,0
35,0
36,2
41,0
42,0
52,0
53,0


In [186]:
y_preds = xgb_model.predict(dtest_x, ntree_limit=xgb_model.best_ntree_limit)
preds_3 = np.round(y_preds).astype(int)
preds3 = pd.DataFrame(preds_3)
p1 = pd.Series(data = p1)
preds30 = pd.concat([preds3, p1], axis = 1)
preds30.columns = ['y_class', 'p1']
preds30.index = preds30['p1']
del preds30['p1']
preds30

Unnamed: 0_level_0,y_class
p1,Unnamed: 1_level_1
0,1
1,1
2,1
15,1
16,1
...,...
305,1
306,1
307,1
308,1


In [187]:
preds = pd.concat([preds10, preds20, preds30], axis = 0)
preds = preds.sort_index(ascending=True)


In [188]:
sub2['Y_Class'] = preds['y_class']
sub2.to_csv('./t30_XGB_sep_LINEdata_com.csv', index = False)

In [12]:
#모델 저장
with open('model_XGB_0683.pickle','wb') as fw:
    pickle.dump(xgb_model, fw)