In [1]:
import pandas as pd
import random
import os
import numpy as np
import matplotlib.pyplot as plt
import warnings; warnings.filterwarnings(action='ignore')
import pickle

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
import sklearn.svm as svm
import xgboost as xgb

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from bayes_opt import BayesianOptimization

from xgboost import plot_importance
from scipy.stats import randint
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [2]:
def seed_everything(seed):
    random.seed(seed) #파이썬 자체 모듈 random 모듈의 시드 고정
    os.environ['PYTHONHASHSEED'] = str(seed) 
    np.random.seed(seed)

seed_everything(37) # Seed 고정

# Data Load

In [3]:
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')
sub = pd.read_csv('./sample_submission.csv')

# Data Preprocessing

결측값을 모두 -100으로 채운다.

In [4]:
train_df = train_df.fillna(-100)
test_df = test_df.fillna(-100)

In [5]:
train_y = train_df['Y_Class']

In [6]:
train_x = train_df.drop(columns = ['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality'])
test_x = test_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP'])

범주형 데이터를 수치 데이터로 전환하기 위해 LabelEncoder 활용

In [7]:
# qualitative to quantitative
qual_col = ['LINE', 'PRODUCT_CODE']

for i in qual_col:
    le = LabelEncoder()
    le = le.fit(train_x[i]) #원래 column 값을 기준으로 fit.
    train_x[i] = le.transform(train_x[i]) #수치화, 수치로 변형
    
    for label in np.unique(test_x[i]): 
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    test_x[i] = le.transform(test_x[i]) 
print('Done.')

Done.


# Modeling

### GBM

### RamdomSearchCV 파라미터 최적화

In [207]:
param_distribs = {
    'n_estimators' : randint(low=1, high=200),
    'min_samples_split': randint(low = 1, high = 10),
    'min_samples_leaf': randint(low = 1, high = 17)
}

GBM= GradientBoostingClassifier(random_state=37)

rand_cv = RandomizedSearchCV(GBM, 
                            param_distributions=param_distribs,
                            cv = 5, 
                            n_iter = 50,
                            scoring = 'f1_macro',
                            n_jobs = -1,
                            verbose=3)
rand_cv.fit(train_x, train_y)
preds = rand_cv.predict(test_x)

sub['Y_Class'] = preds
sub.to_csv('./t10_GBM_RandomSearchCV_niter50.csv', index=False)

print(f'최적 하이퍼 파라미터: {rand_cv.best_params_}')
print(f'최고 예측 정확도: {(rand_cv.best_score_)*100:.4f}')
print(f'time elapsed: {end-start}')

[0.78333333 0.79166667 0.78333333 0.78991597 0.76470588]
Fitting 5 folds for each of 50 candidates, totalling 250 fits
최적 하이퍼 파라미터: {'min_samples_leaf': 3, 'min_samples_split': 4, 'n_estimators': 157}
최고 예측 정확도: 41.3178
time elapsed: 1106.4138910770416
