## Setup

In [1]:
import os
os.environ['PYTHONHASHSEED'] = str(37)

import pandas as pd
import random

import numpy as np
from collections import Counter
from matplotlib import pyplot
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
import optuna
from sklearn.model_selection import StratifiedKFold
from optuna import Trial
from optuna.samplers import TPESampler
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier

from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

def seed_everything(seed):
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)    
    np.random.seed(seed)

In [2]:
class CFG:
    debug = False

## Create Data Pipeline

In [3]:
seed_everything(37) # Seed 고정
train = pd.read_csv('../../datasets/train.csv')
test = pd.read_csv('../../datasets/test.csv')

'''
train_x = train_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality'])
train_y = train_df['Y_Class']

test_x = test_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP'])
train_x = train_x.fillna(0) # NaN 0으로 채우기
test_x = test_x.fillna(0)
qual_col = ['LINE', 'PRODUCT_CODE']

for i in qual_col:
    le = LabelEncoder()
    le = le.fit(train_x[i])
    train_x[i] = le.transform(train_x[i])
    # test_x 데이터에만 존재하는 새로 출현한 데이터를 신규 클래스로 추가한다 (중요!!!)
    for label in np.unique(test_x[i]):
        if label not in le.classes_: # unseen label 데이터인 경우( )
            le.classes_ = np.append(le.classes_,label) # 미처리 시 ValueError발생
    test_x[i] = le.transform(test_x[i])
'''

col_list = train.columns
nan_list = []
nan_cnt = []
nan_col = []
full_list = []
for col in col_list:
    if train[col].isnull().sum() == 0 :
        full_list.append(col)
        continue
    nan_list.append([col, train[col].isnull().sum()])
    nan_cnt.append(train[col].isnull().sum())
    nan_col.append(col)
    
'''모든값이 결측값이면 제거'''
del_col = []
for col in nan_list :
    if col[1] == 598 :
        del_col.append(col[0])
train = train.drop(columns=del_col)
test = test.drop(columns=del_col)
train.head(3)

trainA_31 = train[train['PRODUCT_CODE']=='A_31']
train_T_31 = train[train['PRODUCT_CODE']=='T_31']
train_O_31 = train[train['PRODUCT_CODE']=='O_31']

testA_31 = test[test['PRODUCT_CODE']=='A_31']
test_T_31 = test[test['PRODUCT_CODE']=='T_31']
test_O_31 = test[test['PRODUCT_CODE']=='O_31']

## Preprocessing

In [4]:
col_list = train.columns
nan_listA_31 = []
nan_cntA_31 = []
nan_colA_31 = []
full_listA_31 = []
for col in col_list:
    if trainA_31[col].isnull().sum() == 0 :
        full_listA_31.append(col)
        continue
    nan_listA_31.append([col, trainA_31[col].isnull().sum()])
    nan_cntA_31.append(trainA_31[col].isnull().sum())
    nan_colA_31.append(col)
    

del_col = []
for col in nan_listA_31 :
    if col[1] == len(trainA_31) :
        del_col.append(col[0])
trainA_31 = trainA_31.drop(columns=del_col)
testA_31 = testA_31.drop(columns=del_col)


del_col = []
col_list = trainA_31.columns
for col in col_list[6:] :
    if trainA_31[col].nunique()==1 :
        del_col.append(col)
trainA_31 = trainA_31.drop(columns=del_col)
testA_31 = testA_31.drop(columns=del_col)


col_list = train.columns
nan_listO = []
nan_cntO = []
nan_colO = []
full_listO = []
for col in col_list:
    if train_O_31[col].isnull().sum() == 0 :
        full_listO.append(col)
        continue
    nan_listO.append([col, train_O_31[col].isnull().sum()])
    nan_cntO.append(train_O_31[col].isnull().sum())
    nan_colO.append(col)
    

del_col = []
for col in nan_listO :
    if col[1] == len(train_O_31) :
        del_col.append(col[0])
train_O_31 = train_O_31.drop(columns=del_col)
test_O_31 = test_O_31.drop(columns=del_col)


del_col = []
col_list = train_O_31.columns
for col in col_list[6:] :
    if train_O_31[col].nunique()==1 :
        del_col.append(col)
train_O_31 = train_O_31.drop(columns=del_col)
test_O_31 = test_O_31.drop(columns=del_col)


col_list = train.columns
nan_listT = []
nan_cntT = []
nan_colT = []
full_listT = []
for col in col_list:
    if train_T_31[col].isnull().sum() == 0 :
        full_listT.append(col)
        continue
    nan_listT.append([col, train_T_31[col].isnull().sum()])
    nan_cntT.append(train_T_31[col].isnull().sum())
    nan_colT.append(col)

## Feature Engineering

In [5]:
'''
trainA_31_x = trainA_31.drop(columns=['PRODUCT_ID','TIMESTAMP','PRODUCT_CODE','Y_Class','Y_Quality'])
testA_31_x = testA_31.drop(columns=['PRODUCT_ID','TIMESTAMP','PRODUCT_CODE'])
train_T_31_x = train_T_31.drop(columns=['PRODUCT_ID','TIMESTAMP','Y_Class','Y_Quality','PRODUCT_CODE'])
test_T_31_x = test_T_31.drop(columns=['PRODUCT_ID','TIMESTAMP','PRODUCT_CODE'])
train_O_31_x = train_O_31.drop(columns=['PRODUCT_ID','TIMESTAMP','PRODUCT_CODE','Y_Class','Y_Quality'])
test_O_31_x = test_O_31.drop(columns=['PRODUCT_ID','TIMESTAMP','PRODUCT_CODE'])
'''

#클래스 살리기
trainA_31_x = trainA_31.drop(columns=['PRODUCT_ID','TIMESTAMP','PRODUCT_CODE','Y_Quality'])
testA_31_x = testA_31.drop(columns=['PRODUCT_ID','TIMESTAMP','PRODUCT_CODE'])
train_T_31_x = train_T_31.drop(columns=['PRODUCT_ID','TIMESTAMP','Y_Quality','PRODUCT_CODE'])
test_T_31_x = test_T_31.drop(columns=['PRODUCT_ID','TIMESTAMP','PRODUCT_CODE'])
train_O_31_x = train_O_31.drop(columns=['PRODUCT_ID','TIMESTAMP','PRODUCT_CODE','Y_Quality'])
test_O_31_x = test_O_31.drop(columns=['PRODUCT_ID','TIMESTAMP','PRODUCT_CODE'])

# classification
trainA_31_y_c = trainA_31['Y_Class']
train_T_31_y_c = train_T_31['Y_Class']
train_O_31_y_c = train_O_31['Y_Class']

# regression
trainA_31_y_r = trainA_31['Y_Quality']
train_T_31_y_r = train_T_31['Y_Quality']
train_O_31_y_r = train_O_31['Y_Quality']

train_T_31_y_r = pd.DataFrame(train_T_31_y_r,columns = ['Y_Quality'])
train_T_31_y_r = train_T_31_y_r.reset_index(drop = True)
test_T = train_T_31_y_r

trainA_31_x=trainA_31_x.fillna(-1)
testA_31_x=testA_31_x.fillna(-1)
train_T_31_x=train_T_31_x.fillna(-1)
test_T_31_x=test_T_31_x.fillna(-1)
train_O_31_x=train_O_31_x.fillna(-1)
test_O_31_x=test_O_31_x.fillna(-1)

trainA_31_x['level0'] = 0
trainA_31_x['level1'] = 0
trainA_31_x['level2'] = 0
trainA_31_x['level3'] = 0
trainA_31_x['level4'] = 0
trainA_31_x['level0'][trainA_31_x['LINE'] == 'T050304'] = 1
trainA_31_x['level1'][trainA_31_x['LINE'] == 'T050307'] = 1
trainA_31_x['level2'][trainA_31_x['LINE'] == 'T010305'] = 1
trainA_31_x['level3'][trainA_31_x['LINE'] == 'T010306'] = 1
trainA_31_x['level4'][(trainA_31_x['LINE'] == 'T010306') & (trainA_31_x['LINE'] == 'T010305')] = 1

testA_31_x['level0'] = 0
testA_31_x['level1'] = 0
testA_31_x['level2'] = 0
testA_31_x['level3'] = 0
testA_31_x['level4'] = 0
testA_31_x['level0'][testA_31_x['LINE'] == 'T050304'] = 1
testA_31_x['level1'][testA_31_x['LINE'] == 'T050307'] = 1
testA_31_x['level2'][testA_31_x['LINE'] == 'T010305'] = 1
testA_31_x['level3'][testA_31_x['LINE'] == 'T010306'] = 1
testA_31_x['level4'][(testA_31_x['LINE'] == 'T010306') & (testA_31_x['LINE'] == 'T010305')] = 1

le_list = []
# qualitative to quantitative
qual_col = ['LINE']
for i in qual_col:
    le = LabelEncoder()
    le = le.fit(trainA_31_x[i])
    trainA_31_x[i] = le.transform(trainA_31_x[i])
    
    for label in np.unique(testA_31_x[i]): 
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    testA_31_x[i] = le.transform(testA_31_x[i])
le_list.append(le)
print('Done.')


# qualitative to quantitative
qual_col = ['LINE']
for i in qual_col:
    le = LabelEncoder()
    le = le.fit(train_T_31_x[i])
    train_T_31_x[i] = le.transform(train_T_31_x[i])
    
    for label in np.unique(test_T_31_x[i]): 
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    test_T_31_x[i] = le.transform(test_T_31_x[i])
le_list.append(le)
print('Done.')


# qualitative to quantitative
qual_col = ['LINE']
for i in qual_col:
    le = LabelEncoder()
    le = le.fit(train_O_31_x[i])
    train_O_31_x[i] = le.transform(train_O_31_x[i])
    
    for label in np.unique(test_O_31_x[i]): 
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    test_O_31_x[i] = le.transform(test_O_31_x[i])
le_list.append(le)
print('Done.')

Done.
Done.
Done.


## Define model & Training

In [6]:
from catboost import *

def objective(trial, train_x, train_y, valid_x, valid_y, output_container):
    params = {
            'iterations':trial.suggest_int("iterations", 300, 1000),
            'learning_rate' : trial.suggest_uniform('learning_rate',0.1, 1),
            'depth': trial.suggest_int('depth',5, 16),
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf',1,30),
            'reg_lambda': trial.suggest_uniform('reg_lambda',30,100),
            'subsample': trial.suggest_uniform('subsample',0.3, 1),
            'random_strength': trial.suggest_uniform('random_strength',10,100),
            'od_wait':trial.suggest_int('od_wait', 10, 150),
            'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations',1,20),
            'bagging_temperature' :trial.suggest_loguniform('bagging_temperature', 1, 100),
            "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.0, 1.0),
            'random_state' : 1234,
            'verbose' : 0,
        }

    cat = CatBoostRegressor(**params)
    cat.fit(train_x, train_y, eval_set=[(valid_x, valid_y)], verbose=False, use_best_model=True)
    cat_pred = cat.predict(valid_x)
    
    y_valid = pd.DataFrame(cat_pred, columns=['Y_Quality'])
    y_valid = y_valid.reset_index(drop = True)
    y_valid['Y_Class2'] = 1
    y_valid['Y_Class2'] = quality_to_class(y_valid['Y_Quality'], quality_threshold)

    score = f1_score(valid_line_y_cls, y_valid['Y_Class2'], average='macro')
    if score > output_container["best_score"]:
        output_container["model"] = cat
        output_container["best_score"] = score
    return score

def quality_to_class(x, threshold):
    y_pred_class = []
    for i in x:
        tmp_class = len(threshold)
        for k, v in threshold.items():
            if i < v:
                tmp_class = k
                break
        y_pred_class.append(tmp_class)
    return y_pred_class

quality_threshold = {
    0: 0.52507,
    1: 0.53490
}

In [7]:
output_full = {
    "A_31": np.zeros(len(trainA_31_x), dtype="float32"),
    "T_31": np.zeros(len(train_T_31_x), dtype="float32"),
    "O_31": np.zeros(len(train_O_31_x), dtype="float32"),
}
output_test = {
    "A_31": np.zeros(len(testA_31_x), dtype="float32"),
    "T_31": np.zeros(len(test_T_31_x), dtype="float32"),
    "O_31": np.zeros(len(test_O_31_x), dtype="float32"),
}

output_y_pred = {
    "A_31": [],
    "T_31": [],
    "O_31": [],
}
output_y_true = {
    "A_31": [],
    "T_31": [],
    "O_31": [],
}
output_y_true_cls = {
    "A_31": [],
    "T_31": [],
    "O_31": [],
}

tmp_score = {"MAE": None, "R2": None, "Accuracy": None, "F1": None}
score_dic = {i: tmp_score.copy() for i in ["A_31", "T_31", "O_31"]}

## A_31 Training

In [None]:
output_container = {"model": None, "best_score": -np.inf}
line_split = list(range(len(le_list[0].classes_)))

seed_everything(37)
for fold in line_split:
    
    valid_line_x = trainA_31_x[trainA_31_x["LINE"].isin([fold]).values].drop("Y_Class", axis=1)
    valid_line_y = trainA_31_y_r[trainA_31_x["LINE"].isin([fold]).values]
    valid_line_y_cls = trainA_31_y_c[trainA_31_x["LINE"].isin([fold]).values]
    train_line_x = trainA_31_x[~(trainA_31_x["LINE"].isin([fold])).values].drop("Y_Class", axis=1)
    train_line_y = trainA_31_y_r[~(trainA_31_x["LINE"].isin([fold])).values]
    train_line_y_cls = trainA_31_y_c[~(trainA_31_x["LINE"].isin([fold])).values]
    
    study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=37))
    study.optimize(
            lambda trial: objective(
                trial, train_line_x, train_line_y, valid_line_x, valid_line_y, output_container
            ), n_trials=1 if CFG.debug else 500, show_progress_bar=True
    )
    
    best_model = output_container["model"]
    output_y_true["A_31"].append(valid_line_y.values)
    output_y_true_cls["A_31"].append(valid_line_y_cls.values)
    output_y_pred["A_31"].append(best_model.predict(valid_line_x))
    output_full["A_31"][:] += best_model.predict(trainA_31_x) / len(line_split)
    output_test["A_31"][:] += best_model.predict(testA_31_x) / len(line_split)

In [7]:
y_pred = np.concatenate(output_y_pred["A_31"])
y_pred_cls = quality_to_class(y_pred, quality_threshold)
y_true = np.concatenate(output_y_true["A_31"])
y_true_cls = np.concatenate(output_y_true_cls["A_31"])

score_dic["A_31"]["MAE"] = mean_absolute_error(y_true, y_pred)
score_dic["A_31"]["R2"] = r2_score(y_true, y_pred)
score_dic["A_31"]["Accuracy"] = accuracy_score(y_true_cls, y_pred_cls)
score_dic["A_31"]["F1"] = f1_score(y_true_cls, y_pred_cls, average="macro")

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



ERROR! Session/line number was not unique in database. History logging moved to new session 362
Traceback (most recent call last):
  File "c:\users\flash\pycharmprojects\pythonproject\venv\lib\site-packages\IPython\core\interactiveshell.py", line 3444, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\flash\AppData\Local\Temp\ipykernel_18096\994115778.py", line 1, in <module>
    y_pred = np.concatenate(output_y_pred["A_31"])
  File "<__array_function__ internals>", line 5, in concatenate
ValueError: need at least one array to concatenate

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\users\flash\pycharmprojects\pythonproject\venv\lib\site-packages\IPython\core\interactiveshell.py", line 2064, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'ValueError' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "c:\users\flash\pycharmprojects\pythonproject\venv\lib\site-packages\IPython\core\interactiveshell.py", line 3444, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\flash\AppData\Local\Temp\ipykernel_18096\994115778.py", line 1, in <module>
    y_pred = np.concatenate(output_y_pred["A_31"])
  File "<__array_function__ internals>", line 5, in concatenate
ValueError: need at least one array to concatenate

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\users\flash\pycharmprojects\pythonproject\venv\lib\site-packages\IPython\core\interactiveshell.py", line 2064, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'ValueError' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\users\flash\pycharmprojects\pythonproject\venv\

TypeError: object of type 'NoneType' has no len()

In [None]:
score_dic["A_31"]

## T_31 Training

In [None]:
output_container = {"model": None, "best_score": -np.inf}
line_split = list(range(len(le_list[1].classes_)))
output = np.zeros(len(train_T_31_x), dtype="float32")

seed_everything(37)
for fold in line_split:
    
    valid_line_x = train_T_31_x[train_T_31_x["LINE"].isin([fold]).values].drop("Y_Class", axis=1)
    valid_line_y = train_T_31_y_r[train_T_31_x["LINE"].isin([fold]).values]
    valid_line_y_cls = train_T_31_y_c[train_T_31_x["LINE"].isin([fold]).values]
    train_line_x = train_T_31_x[~(train_T_31_x["LINE"].isin([fold])).values].drop("Y_Class", axis=1)
    train_line_y = train_T_31_y_r[~(train_T_31_x["LINE"].isin([fold])).values]
    train_line_y_cls = train_T_31_y_c[~(train_T_31_x["LINE"].isin([fold])).values]
    
    study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=37))
    study.optimize(
            lambda trial: objective(
                trial, train_line_x, train_line_y, valid_line_x, valid_line_y, output_container
            ), n_trials=1 if CFG.debug else 500, show_progress_bar=True
    )
    
    best_model = output_container["model"]
    output_y_true["T_31"].append(valid_line_y)
    output_y_true_cls["T_31"].append(valid_line_y_cls)
    output_y_pred["T_31"].append(best_model.predict(valid_line_x))
    output_full["T_31"][:] += best_model.predict(train_T_31_x) / len(line_split)
    output_test["T_31"][:] += best_model.predict(test_T_31_x) / len(line_split)

In [None]:
y_pred = np.concatenate(output_y_pred["T_31"])
y_pred_cls = quality_to_class(y_pred, quality_threshold)
y_true = np.concatenate(output_y_true["T_31"])
y_true_cls = np.concatenate(output_y_true_cls["T_31"])

score_dic["T_31"]["MAE"] = mean_absolute_error(y_true, y_pred)
score_dic["T_31"]["R2"] = r2_score(y_true, y_pred)
score_dic["T_31"]["Accuracy"] = accuracy_score(y_true_cls, y_pred_cls)
score_dic["T_31"]["F1"] = f1_score(y_true_cls, y_pred_cls, average="macro")

In [None]:
score_dic["T_31"]

## O_31 Training

In [None]:
output_container = {"model": None, "best_score": -np.inf}
line_split = list(range(len(le_list[2].classes_)))
output = np.zeros(len(train_O_31_x), dtype="float32")

seed_everything(37)
for fold in line_split:
    
    valid_line_x = train_O_31_x[train_O_31_x["LINE"].isin([fold]).values].drop("Y_Class", axis=1)
    valid_line_y = train_O_31_y_r[train_O_31_x["LINE"].isin([fold]).values]
    valid_line_y_cls = train_O_31_y_c[train_O_31_x["LINE"].isin([fold]).values]
    train_line_x = train_O_31_x[~(train_O_31_x["LINE"].isin([fold])).values].drop("Y_Class", axis=1)
    train_line_y = train_O_31_y_r[~(train_O_31_x["LINE"].isin([fold])).values]
    train_line_y_cls = train_O_31_y_c[~(train_O_31_x["LINE"].isin([fold])).values]
    
    study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=37))
    study.optimize(
            lambda trial: objective(
                trial, train_line_x, train_line_y, valid_line_x, valid_line_y, output_container
            ), n_trials=1 if CFG.debug else 500, show_progress_bar=True
    )
    
    best_model = output_container["model"]
    output_y_true["O_31"].append(valid_line_y)
    output_y_true_cls["O_31"].append(valid_line_y_cls)
    output_y_pred["O_31"].append(best_model.predict(valid_line_x))
    output_full["O_31"][:] += best_model.predict(train_O_31_x) / len(line_split)
    output_test["O_31"][:] += best_model.predict(test_O_31_x) / len(line_split)

In [None]:
y_pred = np.concatenate(output_y_pred["O_31"])
y_pred_cls = quality_to_class(y_pred, quality_threshold)
y_true = np.concatenate(output_y_true["O_31"])
y_true_cls = np.concatenate(output_y_true_cls["O_31"])

score_dic["O_31"]["MAE"] = mean_absolute_error(y_true, y_pred)
score_dic["O_31"]["R2"] = r2_score(y_true, y_pred)
score_dic["O_31"]["Accuracy"] = accuracy_score(y_true_cls, y_pred_cls)
score_dic["O_31"]["F1"] = f1_score(y_true_cls, y_pred_cls, average="macro")

In [None]:
score_dic["O_31"]

## Save result

In [None]:
tmp_test = test[["PRODUCT_CODE"]]
tmp_test["Y_Quality"] = 0.5
tmp_test["Y_Class"] = 1

In [None]:
for k, v in output_test.items():
    tmp_test.loc[tmp_test["PRODUCT_CODE"] == k, "Y_Quality"] = v

In [None]:
tmp_test["Y_Class"] = quality_to_class(tmp_test["Y_Quality"], quality_threshold)

In [None]:
sns.set_palette("Set2")

fig, ax = plt.subplots(figsize=(8, 6))
graph = sns.histplot(x=tmp_test["Y_Quality"], edgecolor="grey")

plt.xlabel(ax.get_xlabel(), fontsize=16, labelpad=20)
plt.ylabel("", fontsize=16, labelpad=20)

plt.xticks(fontsize=14)
plt.yticks(fontsize=14)

plt.title("y_quality Histogram", fontsize=18, fontweight="bold", pad=20)

plt.show()

In [None]:
for i in ['A_31', 'T_31', 'O_31']:
    print(f"\n=== {i} distribution ===")
    print(tmp_test.loc[tmp_test["PRODUCT_CODE"] == i, "Y_Class"].value_counts())
    print(tmp_test.loc[tmp_test["PRODUCT_CODE"] == i, "Y_Class"].value_counts(normalize=True))

In [None]:
tmp_test["Y_Class"].value_counts()

In [None]:
tmp_test["Y_Class"].value_counts(normalize=True)

## Submission

In [None]:
submission = pd.read_csv("../../datasets/sample_submission.csv")
submission["Y_Class"] = tmp_test["Y_Class"].values

In [None]:
submission

In [None]:
submission.to_csv("주혁님_cat_linefold.csv", index=False)