## Setup

In [1]:
GLOBAL_SEED = 42

import os
os.environ['PYTHONHASHSEED'] = str(GLOBAL_SEED)
import sys

import gc
from tqdm import tqdm
import datetime
import pickle
import random as rnd
from glob import glob
import pandas as pd
import numpy as np
from numpy import random as np_rnd
import warnings
from math import ceil

import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib import rcParams
from itertools import combinations
from collections import Counter

import lightgbm as lgb
import xgboost as xgb
import catboost as cat

import optuna
from optuna import Trial, create_study
from optuna.samplers import TPESampler

from sklearn import linear_model as lm
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold

from sklearn.impute import KNNImputer
# explicitly require this experimental feature
from sklearn.experimental import enable_iterative_imputer  # noqa
# now you can import normally from sklearn.impute
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from itertools import product

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

from scipy.stats import f_oneway
from scipy.stats import pearsonr

# display setting
warnings.filterwarnings(action='ignore')
rcParams['axes.unicode_minus'] = False

In [2]:
def seed_everything(seed=42):
    """Seed every random number generator this notebook may rely on.

    Always seeds Python's ``random`` module (imported as ``rnd``) and NumPy's
    legacy global generator (``np_rnd``), and exports ``PYTHONHASHSEED``.
    TensorFlow, CuPy and PyTorch are seeded on a best-effort basis: if the
    corresponding name is not defined in the notebook (or seeding fails) the
    step is skipped silently.

    Parameters
    ----------
    seed : int, default 42
        Seed value applied to every generator.
    """
    # NOTE(review): exporting PYTHONHASHSEED here presumably only affects
    # subprocesses started afterwards, not the running interpreter - confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)
    # python random
    rnd.seed(seed)
    # numpy random
    np_rnd.seed(seed)
    # Optional frameworks: `except Exception` (instead of a bare `except:`)
    # keeps the best-effort behaviour but no longer swallows
    # KeyboardInterrupt / SystemExit.
    # tf random
    try:
        tf_rnd.set_seed(seed)
    except Exception:
        pass
    # RAPIDS random
    try:
        cupy.random.seed(seed)
    except Exception:
        pass
    # pytorch random
    try:
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        torch.backends.cudnn.deterministic = True
    except Exception:
        pass

def pickleIO(obj, src, op="w"):
    """Write ``obj`` to, or read an object from, the pickle file ``src``.

    Parameters
    ----------
    obj : object
        Object to serialise when ``op == "w"``; ignored when reading.
    src : str
        Path of the pickle file.
    op : {"w", "r"}, default "w"
        "w" dumps ``obj`` to ``src``; "r" loads and returns the stored object.

    Returns
    -------
    The unpickled object for "r", ``None`` for "w". For any other ``op`` a
    message is printed and ``obj`` is returned unchanged.
    """
    if op not in ("w", "r"):
        print("unknown operation")
        return obj
    with open(src, op + "b") as handle:
        if op == "r":
            # Only load pickles from trusted sources: unpickling can run code.
            return pickle.load(handle)
        pickle.dump(obj, handle)

def findIdx(data_x, col_names):
    """Return the positions (as ints) of the items of ``data_x`` found in ``col_names``."""
    positions = []
    for pos, name in enumerate(data_x):
        if name in col_names:
            positions.append(int(pos))
    return positions

def diff(first, second):
    """Return the items of ``first`` that are not in ``second``, keeping order and repeats."""
    excluded = set(second)
    kept = []
    for item in first:
        if item in excluded:
            continue
        kept.append(item)
    return kept

def createFolder(directory):
    """Create ``directory`` (including parents) unless the path already exists.

    An ``OSError`` raised while creating it is reported with a printed message
    instead of being propagated.
    """
    try:
        if os.path.exists(directory):
            return
        os.makedirs(directory)
    except OSError:
        print('Error: Creating directory. ' + directory)
        
def week_of_month(dt):
    """
        Return the 1-based week of the month containing ``dt``.

        The day of month is shifted by ``(weekday of the 1st + 1) % 7`` before
        dividing by 7, i.e. weeks are counted as starting on Sunday.
    """
    month_start = dt.replace(day=1)
    # weekday() is 0 for Monday ... 6 for Sunday; +1 mod 7 makes Sunday the 0 offset.
    offset = (month_start.weekday() + 1) % 7
    return int(ceil((dt.day + offset) / 7.0))

def quality_to_class(x, threshold):
    """Map each value of ``x`` to a class label using upper-bound thresholds.

    ``threshold`` maps class label -> exclusive upper bound and is scanned in
    its own iteration order: a value receives the first label whose bound it is
    strictly below. Values below no bound receive ``len(threshold)``.

    Returns a list of labels, one per element of ``x``.
    """
    fallback = len(threshold)
    return [
        next((label for label, bound in threshold.items() if value < bound), fallback)
        for value in x
    ]

In [3]:
class CFG:
    """Notebook-wide configuration constants."""

    # Run switches
    debug = False  # when True the parameter grids below are cut to 3 presets
    TF = True

    # Product letter -> process lines it runs on
    product_mapper = {
        "A": ["T010305", "T010306", "T050304", "T050307"],
        "O": ["T100304", "T100306"],
        "T": ["T100304", "T100306"],
    }

    # Process line -> product group ("O_T" covers both O and T products)
    line_mapper = {
        "T010305": "A",
        "T010306": "A",
        "T050304": "A",
        "T050307": "A",
        "T100304": "O_T",
        "T100306": "O_T",
    }

    # Lines grouped pairwise
    line_groups = [
        ["T010305", "T010306"],
        ["T050304", "T050307"],
        ["T100304", "T100306"],
    ]

    # Target class labels
    classes = [0, 1, 2]

    # Names of the engineered timestamp features
    time_features = [
        "month", "day", "weekday", "week_of_month",
        "hour", "office_hour",
        "sec_in_day", "sin_in_day", "cos_in_day",
    ]

In [4]:
# Load the stored quality thresholds and add an entry keyed by the four A lines;
# each bound is the midpoint of two y_quality values.
# NOTE(review): the next cell rebinds `quality_threshold` to a literal dict, so
# nothing computed here survives, yet this cell still needs the pickle file to
# exist for Run All to succeed - consider removing one of the two cells.
quality_threshold = pickleIO(None, "../datasets/dataset_valid/quality_threshold.pkl", "r")
quality_threshold['T010305_T010306_T050304_T050307'] = {0: (0.525046 + 0.525086) / 2, 1: (0.534843 + 0.535279) / 2}

In [5]:
# Same class bounds for every line pair: class 0 below 0.52507, class 1 below 0.53490.
quality_threshold = {
    line_pair: {0: 0.52507, 1: 0.53490}
    for line_pair in ("T010305_T010306", "T050304_T050307", "T100304_T100306")
}

In [6]:
# Display the thresholds currently in effect.
quality_threshold

{'T010305_T010306': {0: 0.52507, 1: 0.5349},
 'T050304_T050307': {0: 0.52507, 1: 0.5349},
 'T100304_T100306': {0: 0.52507, 1: 0.5349}}

## Loading Data

In [7]:
# information Provided by Dacon

# PRODUCT_ID : 제품의 고유 ID
# Y_Class : 제품 품질 상태(Target) 
# 0 : 적정 기준 미달 (부적합)
# 1 : 적합
# 2 : 적정 기준 초과 (부적합)
# Y_Quality : 제품 품질 관련 정량적 수치
# TIMESTAMP : 제품이 공정에 들어간 시각
# LINE : 제품이 들어간 공정 LINE 종류 ('T050304', 'T050307', 'T100304', 'T100306', 'T010306', 'T010305' 존재)
# PRODUCT_CODE : 제품의 CODE 번호 ('A_31', 'T_31', 'O_31' 존재)
# X_1 ~ X_2875 : 공정 과정에서 추출되어 비식별화된 변수

In [8]:
# Read the training set and lower-case every column name.
df_full = pd.read_csv("../datasets/train.csv").rename(columns=str.lower)

In [9]:
# # time feature engineering
# df_full["month"] = df_full["timestamp"].dt.month
# df_full["day"] = df_full["timestamp"].dt.day
# df_full["weekday"] = df_full["timestamp"].dt.weekday
# df_full["week_of_month"] = df_full["timestamp"].apply(week_of_month)
# df_full["hour"] = df_full["timestamp"].dt.hour
# df_full["office_hour"] = df_full["hour"].apply(lambda x: 1 if ((x >= 9) & (x < 18)) else 0)
# df_full["sec_in_day"] = (df_full["timestamp"] - df_full["timestamp"].dt.normalize()).dt.total_seconds() / 3600
# df_full["sin_in_day"] = np.sin(2 * np.pi * df_full["sec_in_day"].values)
# df_full["cos_in_day"] = np.cos(2 * np.pi * df_full["sec_in_day"].values)

In [10]:
# df_train = pickleIO(None, "../../datasets/dataset_valid2/df_train.pkl", "r")
# df_valid = pickleIO(None, "../../datasets/dataset_valid2/df_valid.pkl", "r")

In [11]:
# # time feature engineering
# df_train["month"] = df_train["timestamp"].dt.month
# df_train["day"] = df_train["timestamp"].dt.day
# df_train["weekday"] = df_train["timestamp"].dt.weekday
# df_train["week_of_month"] = df_train["timestamp"].apply(week_of_month)
# df_train["hour"] = df_train["timestamp"].dt.hour
# df_train["office_hour"] = df_train["hour"].apply(lambda x: 1 if ((x >= 9) & (x < 18)) else 0)
# df_train["sec_in_day"] = (df_train["timestamp"] - df_train["timestamp"].dt.normalize()).dt.total_seconds() / 3600
# df_train["sin_in_day"] = np.sin(2 * np.pi * df_train["sec_in_day"].values)
# df_train["cos_in_day"] = np.cos(2 * np.pi * df_train["sec_in_day"].values)

In [12]:
# # time feature engineering
# df_valid["month"] = df_valid["timestamp"].dt.month
# df_valid["day"] = df_valid["timestamp"].dt.day
# df_valid["weekday"] = df_valid["timestamp"].dt.weekday
# df_valid["week_of_month"] = df_valid["timestamp"].apply(week_of_month)
# df_valid["hour"] = df_valid["timestamp"].dt.hour
# df_valid["office_hour"] = df_valid["hour"].apply(lambda x: 1 if ((x >= 9) & (x < 18)) else 0)
# df_valid["sec_in_day"] = (df_valid["timestamp"] - df_valid["timestamp"].dt.normalize()).dt.total_seconds() / 3600
# df_valid["sin_in_day"] = np.sin(2 * np.pi * df_valid["sec_in_day"].values)
# df_valid["cos_in_day"] = np.cos(2 * np.pi * df_valid["sec_in_day"].values)

In [13]:
# df_train.info()

In [14]:
# df_train.head()

In [15]:
# df_train["y_quality"]

In [16]:
# df_train.groupby(["line", "product_code"]).size()

In [17]:
# df_valid.info()

In [18]:
# df_valid.head()

In [19]:
# df_valid["y_quality"]

In [20]:
# df_valid.groupby(["line", "product_code"]).size()

In [74]:
# y_quality summary per (product flag, class); tmp = 1 for product A_31, 0 otherwise.
# The flag is a standalone Series, so df_full is not touched.
is_a31 = df_full["product_code"].eq("A_31").astype(int).rename("tmp")
display(df_full.groupby([is_a31, "y_class"])["y_quality"].describe().T)

tmp,0,0,0,1,1,1
y_class,0,1,2,0,1,2
count,28.0,289.0,32.0,60.0,118.0,71.0
mean,0.521246,0.530272,0.538753,0.520646,0.530209,0.543508
std,0.005765,0.002334,0.004931,0.00399,0.002705,0.008733
min,0.502517,0.525213,0.534951,0.500856,0.525086,0.535279
25%,0.520467,0.528483,0.535541,0.519388,0.527989,0.53733
50%,0.523422,0.530308,0.536237,0.521315,0.530353,0.539235
75%,0.524612,0.532119,0.539517,0.523522,0.532332,0.547506
max,0.525067,0.534837,0.551279,0.525046,0.534843,0.578841


In [75]:
# y_class summary per product flag; tmp = 1 for product A_31, 0 otherwise.
# The flag is a standalone Series, so df_full is not touched.
is_a31 = df_full["product_code"].eq("A_31").astype(int).rename("tmp")
display(df_full.groupby(is_a31)["y_class"].describe().T)

tmp,0,1
count,349.0,249.0
mean,1.011461,1.044177
std,0.415069,0.725442
min,0.0,0.0
25%,1.0,1.0
50%,1.0,1.0
75%,1.0,2.0
max,2.0,2.0


In [76]:
# Class proportions per product flag; tmp = 1 for product A_31, 0 otherwise.
# The flag is a standalone Series, so df_full is not touched.
is_a31 = df_full["product_code"].eq("A_31").astype(int).rename("tmp")
print(df_full.groupby(is_a31)["y_class"].value_counts(normalize=True))

tmp  y_class
0    1          0.828080
     2          0.091691
     0          0.080229
1    1          0.473896
     2          0.285141
     0          0.240964
Name: y_class, dtype: float64


In [22]:
# Read the test set and lower-case every column name.
df_test = pd.read_csv("../datasets/test.csv").rename(columns=str.lower)

In [23]:
# # time feature engineering
# df_test["month"] = df_test["timestamp"].dt.month
# df_test["day"] = df_test["timestamp"].dt.day
# df_test["weekday"] = df_test["timestamp"].dt.weekday
# df_test["week_of_month"] = df_test["timestamp"].apply(week_of_month)
# df_test["hour"] = df_test["timestamp"].dt.hour
# df_test["office_hour"] = df_test["hour"].apply(lambda x: 1 if ((x >= 9) & (x < 18)) else 0)
# df_test["sec_in_day"] = (df_test["timestamp"] - df_test["timestamp"].dt.normalize()).dt.total_seconds() / 3600
# df_test["sin_in_day"] = np.sin(2 * np.pi * df_test["sec_in_day"].values)
# df_test["cos_in_day"] = np.cos(2 * np.pi * df_test["sec_in_day"].values)

In [24]:
# Preview the first rows of the test set.
df_test.head()

Unnamed: 0,product_id,timestamp,line,product_code,x_1,x_2,x_3,x_4,x_5,x_6,...,x_2866,x_2867,x_2868,x_2869,x_2870,x_2871,x_2872,x_2873,x_2874,x_2875
0,TEST_000,2022-09-09 2:01,T100306,T_31,2.0,94.0,0.0,45.0,10.0,0.0,...,,,,,,,,,,
1,TEST_001,2022-09-09 2:09,T100304,T_31,2.0,93.0,0.0,45.0,11.0,0.0,...,,,,,,,,,,
2,TEST_002,2022-09-09 8:42,T100304,T_31,2.0,95.0,0.0,45.0,11.0,0.0,...,,,,,,,,,,
3,TEST_003,2022-09-09 10:56,T010305,A_31,,,,,,,...,,,,,,,,,,
4,TEST_004,2022-09-09 11:04,T010306,A_31,,,,,,,...,,,,,,,,,,


## Training by Lines

* "T010305", "T010306"
* "T050304", "T050307"
* "T100304", "T100306"

In [25]:
# Output directory for this experiment; created if it does not exist yet.
architecture_root_path = "./architectures/"
architecture_name = "daheeRef_targetQual_bylines_TF_cat_gbm_metaLearning_try2"
architecture_path = f"{architecture_root_path}{architecture_name}/"
createFolder(architecture_path)

In [26]:
# Shuffle all training rows reproducibly, then renumber the index from 0.
shuffled_rows = df_full.sample(frac=1, random_state=GLOBAL_SEED)
df_full = shuffled_rows.reset_index(drop=True)

In [27]:
# Label encoder for the `line` column, fitted on the six known line IDs.
known_lines = ["T010305", "T010306", "T050304", "T050307", "T100304", "T100306"]
lbe = LabelEncoder()
lbe.fit(known_lines)

LabelEncoder()

In [28]:
# One slot per base learner, filled later with [train predictions, test predictions].
df_meta = dict.fromkeys(["reg_qual", "cls_cls", "reg_cls"])

### Training - Regression (y_quality) 

In [29]:
# objective
# regression : "mae", "mse"
# classification - binary : "binary"
# classification - multiclass : "multiclass" (num_class=n)
# ranking : "xe_ndcg_mart"

# metric
# regression : "mae", "mse", "rmse"
# classification - binary : "binary_logloss", "binary_error", "auc"
# classification - multiclass : "multi_logloss", "multi_error"
# ranking : "ndcg", "map"

# Settings shared by every model in the ensemble.
fixed_params = {"n_estimators": 1500, "learning_rate": 0.033}

# Values combined exhaustively (Cartesian product) into one preset per model.
searching_params = {
    "reg_lambda": [3.0, 1.0],
    "min_child_samples": [1, 3],
    "rsm": [0.5, 0.8],
    "random_state": [1234, 42],
}

preset_params = []
for params in product(*searching_params.values()):
    print(params)
    # Fixed settings first, then the searched values in grid-key order.
    preset_params.append({**fixed_params, **dict(zip(searching_params, params))})

# Debug mode keeps only the first three presets.
if CFG.debug:
    preset_params = preset_params[:3]
len(preset_params)

(3.0, 1, 0.5, 1234)
(3.0, 1, 0.5, 42)
(3.0, 1, 0.8, 1234)
(3.0, 1, 0.8, 42)
(3.0, 3, 0.5, 1234)
(3.0, 3, 0.5, 42)
(3.0, 3, 0.8, 1234)
(3.0, 3, 0.8, 42)
(1.0, 1, 0.5, 1234)
(1.0, 1, 0.5, 42)
(1.0, 1, 0.8, 1234)
(1.0, 1, 0.8, 42)
(1.0, 3, 0.5, 1234)
(1.0, 3, 0.5, 42)
(1.0, 3, 0.8, 1234)
(1.0, 3, 0.8, 42)


16

In [30]:
# Product groups modelled separately.
line_split = [
    # A
    ["A_31"],
    # O, T
    ["T_31", "O_31"],
]
score_dic = {
    "mae": None,
    "r2": None,
    "accuracy": None,
    "f1": None,
}
line_split_f1 = {}
line_score = {"_".join(i): {"train": score_dic.copy(), "valid": score_dic.copy()} for i in line_split}
valid_pred = {"_".join(i): None for i in line_split}
test_pred = {"_".join(i): None for i in line_split}

for line in line_split:
    line_key = "_".join(line)
    seed_everything()
    # Training
    # === Preprocessing (Train) ===
    # .copy() so the column assignment below writes to an independent frame
    # rather than to a slice of df_full (SettingWithCopy).
    full_x = df_full[df_full["product_code"].isin(line)].copy()
    full_x["line"] = lbe.transform(full_x["line"])

    full_y = full_x["y_quality"].values
    full_y_cls = full_x["y_class"].values
    # Drop columns
    full_x = full_x.drop(["product_id", "y_class", "y_quality", "timestamp", "product_code"], axis=1)

    # Keep columns that are neither constant nor entirely missing, fill gaps
    # with -1 and drop duplicated columns.
    full_x = full_x[full_x.columns[~(full_x.var() == 0).values & ~full_x.isna().all().values]]
    full_x = full_x.fillna(-1.0)
    full_x = full_x.T.drop_duplicates().T
    # Rows are deliberately NOT de-duplicated: the targets were extracted above
    # and the meta-learning cell re-selects labels from df_full by product
    # code, so dropping rows here would desynchronise features from targets.
    full_x = full_x.reset_index(drop=True)

    selected_vars = full_x.columns
    cat_vars = ["line"]
    num_vars = diff(selected_vars, cat_vars)
    categoIdx = findIdx(selected_vars, cat_vars)

    full_x[num_vars] = full_x[num_vars].astype("float32")
    full_x[cat_vars] = full_x[cat_vars].astype("int32")
    print(full_x.shape)

    # === Preprocessing (Test) ===
    test_x = df_test[df_test["product_code"].isin(line)].copy()
    test_x["line"] = lbe.transform(test_x["line"])
    test_x = test_x[selected_vars]
    test_x = test_x.fillna(-1.0)

    test_x[num_vars] = test_x[num_vars].astype("float32")
    test_x[cat_vars] = test_x[cat_vars].astype("int32")

    # Average the predictions of one CatBoost regressor per preset.
    line_full_pred = np.zeros(len(full_x))
    line_test_pred = np.zeros(len(test_x))
    for params in preset_params:
        model = cat.CatBoostRegressor(verbose=int(params["n_estimators"] * 0.2), **params)
        model.fit(full_x, full_y)
        line_full_pred[:] += model.predict(full_x) / len(preset_params)
        line_test_pred[:] += model.predict(test_x) / len(preset_params)

    # Evaluation
    # NOTE(review): predictions are scored on the rows the models were fitted
    # on, so the values stored under "valid" are in-sample scores.
    y_pred = line_full_pred.copy()
    y_true = full_y.copy()
    eval_mae = mean_absolute_error(y_true, y_pred)
    eval_r2 = r2_score(y_true, y_pred)

    # Transform quality to class
    y_true_class = full_y_cls.copy()
    y_pred_class = quality_to_class(y_pred, {0: 0.52507, 1: 0.53490})
    eval_acc = accuracy_score(y_true_class, y_pred_class)
    eval_f1 = f1_score(y_true_class, y_pred_class, average="macro")

    # Save values
    line_score[line_key]["valid"]["mae"] = eval_mae
    line_score[line_key]["valid"]["r2"] = eval_r2
    line_score[line_key]["valid"]["accuracy"] = eval_acc
    line_score[line_key]["valid"]["f1"] = eval_f1
    valid_pred[line_key] = y_pred

    # Inference
    y_pred = line_test_pred.copy()

    # Save values
    test_pred[line_key] = y_pred

df_meta["reg_qual"] = [valid_pred, test_pred]

(249, 1173)
0:	learn: 0.0098407	total: 153ms	remaining: 3m 48s
300:	learn: 0.0019380	total: 9.55s	remaining: 38.1s
600:	learn: 0.0004322	total: 19.1s	remaining: 28.5s
900:	learn: 0.0001109	total: 27.6s	remaining: 18.3s
1200:	learn: 0.0000302	total: 38.9s	remaining: 9.68s
1499:	learn: 0.0000080	total: 47.2s	remaining: 0us
0:	learn: 0.0098596	total: 25.4ms	remaining: 38s
300:	learn: 0.0019299	total: 8.19s	remaining: 32.6s
600:	learn: 0.0004388	total: 16.2s	remaining: 24.3s
900:	learn: 0.0001154	total: 24.2s	remaining: 16.1s
1200:	learn: 0.0000324	total: 32.3s	remaining: 8.04s
1499:	learn: 0.0000088	total: 40.6s	remaining: 0us
0:	learn: 0.0098730	total: 93.7ms	remaining: 2m 20s
300:	learn: 0.0018343	total: 9.53s	remaining: 38s
600:	learn: 0.0004066	total: 19.5s	remaining: 29.2s
900:	learn: 0.0001074	total: 29.3s	remaining: 19.5s
1200:	learn: 0.0000287	total: 38.9s	remaining: 9.69s
1499:	learn: 0.0000069	total: 48.6s	remaining: 0us
0:	learn: 0.0098661	total: 56.2ms	remaining: 1m 24s
300:	l

900:	learn: 0.0000331	total: 13.1s	remaining: 8.74s
1200:	learn: 0.0000061	total: 17.4s	remaining: 4.33s
1499:	learn: 0.0000012	total: 21.7s	remaining: 0us
0:	learn: 0.0046962	total: 9ms	remaining: 13.5s
300:	learn: 0.0009626	total: 4.24s	remaining: 16.9s
600:	learn: 0.0001972	total: 8.35s	remaining: 12.5s
900:	learn: 0.0000384	total: 12.5s	remaining: 8.3s
1200:	learn: 0.0000078	total: 16.7s	remaining: 4.16s
1499:	learn: 0.0000016	total: 20.8s	remaining: 0us
0:	learn: 0.0046923	total: 7.41ms	remaining: 11.1s
300:	learn: 0.0009895	total: 3.51s	remaining: 14s
600:	learn: 0.0001958	total: 7.1s	remaining: 10.6s
900:	learn: 0.0000420	total: 10.5s	remaining: 6.97s
1200:	learn: 0.0000089	total: 14.2s	remaining: 3.52s
1499:	learn: 0.0000020	total: 17.8s	remaining: 0us
0:	learn: 0.0046895	total: 7.61ms	remaining: 11.4s
300:	learn: 0.0009711	total: 3.28s	remaining: 13.1s
600:	learn: 0.0002062	total: 6.63s	remaining: 9.91s
900:	learn: 0.0000423	total: 10.1s	remaining: 6.71s
1200:	learn: 0.0000090

### Training - Regression (y_class)

In [31]:
# objective
# regression : "mae", "mse"
# classification - binary : "binary"
# classification - multiclass : "multiclass" (num_class=n)
# ranking : "xe_ndcg_mart"

# metric
# regression : "mae", "mse", "rmse"
# classification - binary : "binary_logloss", "binary_error", "auc"
# classification - multiclass : "multi_logloss", "multi_error"
# ranking : "ndcg", "map"

# Settings shared by every model in the ensemble.
fixed_params = dict(n_estimators=1500, learning_rate=0.033)

# Values combined exhaustively (Cartesian product) into one preset per model.
searching_params = {
    "reg_lambda": [3.0, 1.0],
    "min_child_samples": [1, 3],
    "rsm": [0.5, 0.8],
    "random_state": [1234, 42],
}

preset_params = []
grid_keys = list(searching_params)
for params in product(*searching_params.values()):
    print(params)
    preset = dict(fixed_params)
    # Searched values are added after the fixed ones, in grid-key order.
    preset.update(zip(grid_keys, params))
    preset_params.append(preset)

# Debug mode keeps only the first three presets.
if CFG.debug:
    preset_params = preset_params[:3]
len(preset_params)

(3.0, 1, 0.5, 1234)
(3.0, 1, 0.5, 42)
(3.0, 1, 0.8, 1234)
(3.0, 1, 0.8, 42)
(3.0, 3, 0.5, 1234)
(3.0, 3, 0.5, 42)
(3.0, 3, 0.8, 1234)
(3.0, 3, 0.8, 42)
(1.0, 1, 0.5, 1234)
(1.0, 1, 0.5, 42)
(1.0, 1, 0.8, 1234)
(1.0, 1, 0.8, 42)
(1.0, 3, 0.5, 1234)
(1.0, 3, 0.5, 42)
(1.0, 3, 0.8, 1234)
(1.0, 3, 0.8, 42)


16

In [32]:
# Product groups modelled separately.
line_split = [
    # A
    ["A_31"],
    # O, T
    ["T_31", "O_31"],
]
score_dic = {
    "mae": None,
    "r2": None,
    "accuracy": None,
    "f1": None,
}
line_split_f1 = {}
line_score = {"_".join(i): {"train": score_dic.copy(), "valid": score_dic.copy()} for i in line_split}
valid_pred = {"_".join(i): None for i in line_split}
test_pred = {"_".join(i): None for i in line_split}

for line in line_split:
    line_key = "_".join(line)
    seed_everything()
    # Training
    # === Preprocessing (Train) ===
    # .copy() so the column assignment below writes to an independent frame
    # rather than to a slice of df_full (SettingWithCopy).
    full_x = df_full[df_full["product_code"].isin(line)].copy()

    # Label Encoding on Line
    full_x["line"] = lbe.transform(full_x["line"])

    full_y = full_x["y_quality"].values
    full_y_cls = full_x["y_class"].values
    # Drop columns
    full_x = full_x.drop(["product_id", "y_class", "y_quality", "timestamp", "product_code"], axis=1)

    # Keep columns that are neither constant nor entirely missing, fill gaps
    # with -1 and drop duplicated columns.
    full_x = full_x[full_x.columns[~(full_x.var() == 0).values & ~full_x.isna().all().values]]
    full_x = full_x.fillna(-1.0)
    full_x = full_x.T.drop_duplicates().T
    # Rows are deliberately NOT de-duplicated: the targets were extracted above
    # and the meta-learning cell re-selects labels from df_full by product
    # code, so dropping rows here would desynchronise features from targets.
    full_x = full_x.reset_index(drop=True)

    selected_vars = full_x.columns
    cat_vars = ["line"]
    num_vars = diff(selected_vars, cat_vars)
    categoIdx = findIdx(selected_vars, cat_vars)

    full_x[num_vars] = full_x[num_vars].astype("float32")
    full_x[cat_vars] = full_x[cat_vars].astype("int32")
    print(full_x.shape)

    # === Preprocessing (Test) ===
    test_x = df_test[df_test["product_code"].isin(line)].copy()
    test_x["line"] = lbe.transform(test_x["line"])
    test_x = test_x[selected_vars]
    test_x = test_x.fillna(-1.0)

    test_x[num_vars] = test_x[num_vars].astype("float32")
    test_x[cat_vars] = test_x[cat_vars].astype("int32")

    # Average the predictions of one CatBoost regressor per preset; the
    # regression target here is the class label itself.
    line_full_pred = np.zeros(len(full_x))
    line_test_pred = np.zeros(len(test_x))
    for params in preset_params:
        model = cat.CatBoostRegressor(verbose=int(params["n_estimators"] * 0.2), **params)
        model.fit(full_x, full_y_cls)
        line_full_pred[:] += model.predict(full_x) / len(preset_params)
        line_test_pred[:] += model.predict(test_x) / len(preset_params)

    # Evaluation
    # The models were fitted on y_class, so regression metrics are computed
    # against y_class (previously they were compared with y_quality).
    # NOTE(review): predictions are scored on the rows the models were fitted
    # on, so the values stored under "valid" are in-sample scores.
    y_pred = line_full_pred.copy()
    y_true = full_y_cls.copy()
    eval_mae = mean_absolute_error(y_true, y_pred)
    eval_r2 = r2_score(y_true, y_pred)

    # Round the regressed class to the nearest valid label (0, 1 or 2) and score
    # it against the true labels (previously the labels were compared with
    # themselves, which always yields 1.0).
    y_true_class = full_y_cls.copy()
    y_pred_class = np.clip(np.round(y_pred), 0, 2).astype(int)
    eval_acc = accuracy_score(y_true_class, y_pred_class)
    eval_f1 = f1_score(y_true_class, y_pred_class, average="macro")

    # Save values
    line_score[line_key]["valid"]["mae"] = eval_mae
    line_score[line_key]["valid"]["r2"] = eval_r2
    line_score[line_key]["valid"]["accuracy"] = eval_acc
    line_score[line_key]["valid"]["f1"] = eval_f1
    valid_pred[line_key] = y_pred

    # Inference
    y_pred = line_test_pred.copy()

    # Save values
    test_pred[line_key] = y_pred

df_meta["reg_cls"] = [valid_pred, test_pred]

(249, 1173)
0:	learn: 0.7186816	total: 15.8ms	remaining: 23.8s
300:	learn: 0.1621561	total: 7.79s	remaining: 31s
600:	learn: 0.0423705	total: 15.7s	remaining: 23.4s
900:	learn: 0.0112506	total: 23.6s	remaining: 15.7s
1200:	learn: 0.0029833	total: 31.3s	remaining: 7.79s
1499:	learn: 0.0008344	total: 38.9s	remaining: 0us
0:	learn: 0.7167165	total: 33.9ms	remaining: 50.8s
300:	learn: 0.1664601	total: 8.02s	remaining: 32s
600:	learn: 0.0469860	total: 15.8s	remaining: 23.7s
900:	learn: 0.0130129	total: 23.8s	remaining: 15.8s
1200:	learn: 0.0034731	total: 31.6s	remaining: 7.87s
1499:	learn: 0.0009195	total: 39.7s	remaining: 0us
0:	learn: 0.7196055	total: 45.2ms	remaining: 1m 7s
300:	learn: 0.1623294	total: 9.62s	remaining: 38.3s
600:	learn: 0.0419731	total: 19.2s	remaining: 28.7s
900:	learn: 0.0115889	total: 28.7s	remaining: 19.1s
1200:	learn: 0.0032031	total: 38.3s	remaining: 9.53s
1499:	learn: 0.0007542	total: 47.8s	remaining: 0us
0:	learn: 0.7185661	total: 76.2ms	remaining: 1m 54s
300:	le

900:	learn: 0.0063800	total: 13.7s	remaining: 9.14s
1200:	learn: 0.0014947	total: 18.5s	remaining: 4.6s
1499:	learn: 0.0003366	total: 22.9s	remaining: 0us
0:	learn: 0.4131414	total: 9.07ms	remaining: 13.6s
300:	learn: 0.1093332	total: 4.46s	remaining: 17.8s
600:	learn: 0.0251463	total: 8.85s	remaining: 13.2s
900:	learn: 0.0057536	total: 13.1s	remaining: 8.72s
1200:	learn: 0.0013759	total: 17.3s	remaining: 4.32s
1499:	learn: 0.0003179	total: 21.6s	remaining: 0us
0:	learn: 0.4122409	total: 7.14ms	remaining: 10.7s
300:	learn: 0.1054892	total: 3.39s	remaining: 13.5s
600:	learn: 0.0262601	total: 6.68s	remaining: 9.99s
900:	learn: 0.0068443	total: 10.3s	remaining: 6.84s
1200:	learn: 0.0015486	total: 13.7s	remaining: 3.41s
1499:	learn: 0.0003634	total: 17.1s	remaining: 0us
0:	learn: 0.4130141	total: 13.3ms	remaining: 19.9s
300:	learn: 0.1125514	total: 3.6s	remaining: 14.3s
600:	learn: 0.0275011	total: 7.12s	remaining: 10.7s
900:	learn: 0.0068876	total: 10.6s	remaining: 7.03s
1200:	learn: 0.00

### Training - Classification (y_class)

In [33]:
# objective
# regression : "mae", "mse"
# classification - binary : "binary"
# classification - multiclass : "multiclass" (num_class=n)
# ranking : "xe_ndcg_mart"

# metric
# regression : "mae", "mse", "rmse"
# classification - binary : "binary_logloss", "binary_error", "auc"
# classification - multiclass : "multi_logloss", "multi_error"
# ranking : "ndcg", "map"

# Settings shared by every classifier in the ensemble.
fixed_params = {
    "loss_function": "MultiClass",
    "n_estimators": 1500,
    "learning_rate": 0.033,
}

# Values combined exhaustively (Cartesian product) into one preset per model.
searching_params = {
    "reg_lambda": [3.0, 1.0],
    "min_child_samples": [1, 3],
    "rsm": [0.5, 0.8],
    "random_state": [1234, 42],
}

preset_params = []
for params in product(*searching_params.values()):
    print(params)
    searched = {name: value for name, value in zip(searching_params, params)}
    # Fixed settings first, then the searched values in grid-key order.
    preset_params.append({**fixed_params, **searched})

# Debug mode keeps only the first three presets.
if CFG.debug:
    preset_params = preset_params[:3]
len(preset_params)

(3.0, 1, 0.5, 1234)
(3.0, 1, 0.5, 42)
(3.0, 1, 0.8, 1234)
(3.0, 1, 0.8, 42)
(3.0, 3, 0.5, 1234)
(3.0, 3, 0.5, 42)
(3.0, 3, 0.8, 1234)
(3.0, 3, 0.8, 42)
(1.0, 1, 0.5, 1234)
(1.0, 1, 0.5, 42)
(1.0, 1, 0.8, 1234)
(1.0, 1, 0.8, 42)
(1.0, 3, 0.5, 1234)
(1.0, 3, 0.5, 42)
(1.0, 3, 0.8, 1234)
(1.0, 3, 0.8, 42)


16

In [34]:
# Product groups modelled separately.
line_split = [
    # A
    ["A_31"],
    # O, T
    ["T_31", "O_31"],
]
score_dic = {
    "mae": None,
    "r2": None,
    "accuracy": None,
    "f1": None,
}
line_split_f1 = {}
line_score = {"_".join(i): {"train": score_dic.copy(), "valid": score_dic.copy()} for i in line_split}
valid_pred = {"_".join(i): None for i in line_split}
test_pred = {"_".join(i): None for i in line_split}

for line in line_split:
    line_key = "_".join(line)
    seed_everything()
    # Training
    # === Preprocessing (Train) ===
    # .copy() so the column assignment below writes to an independent frame
    # rather than to a slice of df_full (SettingWithCopy).
    full_x = df_full[df_full["product_code"].isin(line)].copy()

    # Label Encoding on Line
    full_x["line"] = lbe.transform(full_x["line"])

    full_y = full_x["y_quality"].values
    full_y_cls = full_x["y_class"].values
    # Drop columns
    full_x = full_x.drop(["product_id", "y_class", "y_quality", "timestamp", "product_code"], axis=1)

    # Keep columns that are neither constant nor entirely missing, fill gaps
    # with -1 and drop duplicated columns.
    full_x = full_x[full_x.columns[~(full_x.var() == 0).values & ~full_x.isna().all().values]]
    full_x = full_x.fillna(-1.0)
    full_x = full_x.T.drop_duplicates().T
    # Rows are deliberately NOT de-duplicated: the targets were extracted above
    # and the meta-learning cell re-selects labels from df_full by product
    # code, so dropping rows here would desynchronise features from targets.
    full_x = full_x.reset_index(drop=True)

    selected_vars = full_x.columns
    cat_vars = ["line"]
    num_vars = diff(selected_vars, cat_vars)
    categoIdx = findIdx(selected_vars, cat_vars)

    full_x[num_vars] = full_x[num_vars].astype("float32")
    full_x[cat_vars] = full_x[cat_vars].astype("int32")
    print(full_x.shape)

    # === Preprocessing (Test) ===
    test_x = df_test[df_test["product_code"].isin(line)].copy()
    test_x["line"] = lbe.transform(test_x["line"])
    test_x = test_x[selected_vars]
    test_x = test_x.fillna(-1.0)

    test_x[num_vars] = test_x[num_vars].astype("float32")
    test_x[cat_vars] = test_x[cat_vars].astype("int32")

    # Average the class probabilities of one CatBoost classifier per preset.
    # NOTE(review): the 3-column buffers assume every group contains all three
    # classes - confirm for any new data split.
    line_full_pred = np.zeros((len(full_x), 3))
    line_test_pred = np.zeros((len(test_x), 3))
    for params in preset_params:
        model = cat.CatBoostClassifier(verbose=int(params["n_estimators"] * 0.2), **params)
        model.fit(full_x, full_y_cls)
        line_full_pred[:] += model.predict_proba(full_x) / len(preset_params)
        line_test_pred[:] += model.predict_proba(test_x) / len(preset_params)

    # Evaluation
    # NOTE(review): predictions are scored on the rows the models were fitted
    # on, so the values stored under "valid" are in-sample scores.
    y_pred = line_full_pred.copy()
    y_true = full_y_cls.copy()

    # Predicted class = most probable class
    y_true_class = y_true
    y_pred_class = y_pred.argmax(axis=1)
    eval_acc = accuracy_score(y_true_class, y_pred_class)
    eval_f1 = f1_score(y_true_class, y_pred_class, average="macro")

    # Save values
    # "mae" and "r2" stay None: this cell computes no regression metrics, and
    # the previous code stored whatever eval_mae / eval_r2 an earlier cell had
    # left in the kernel.
    line_score[line_key]["valid"]["accuracy"] = eval_acc
    line_score[line_key]["valid"]["f1"] = eval_f1
    valid_pred[line_key] = y_pred

    # Inference
    y_pred = line_test_pred.copy()

    # Save values
    test_pred[line_key] = y_pred

df_meta["cls_cls"] = [valid_pred, test_pred]

(249, 1173)
0:	learn: 1.0910120	total: 53.2ms	remaining: 1m 19s
300:	learn: 0.3119336	total: 20.5s	remaining: 1m 21s
600:	learn: 0.1270136	total: 38.9s	remaining: 58.2s
900:	learn: 0.0700619	total: 57.3s	remaining: 38.1s
1200:	learn: 0.0453913	total: 1m 15s	remaining: 18.9s
1499:	learn: 0.0327034	total: 1m 34s	remaining: 0us
0:	learn: 1.0877682	total: 33.6ms	remaining: 50.4s
300:	learn: 0.3067916	total: 18.7s	remaining: 1m 14s
600:	learn: 0.1269287	total: 37.1s	remaining: 55.4s
900:	learn: 0.0704734	total: 55.6s	remaining: 37s
1200:	learn: 0.0461881	total: 1m 14s	remaining: 18.5s
1499:	learn: 0.0333619	total: 1m 32s	remaining: 0us
0:	learn: 1.0908704	total: 71.3ms	remaining: 1m 46s
300:	learn: 0.3094341	total: 24.4s	remaining: 1m 37s
600:	learn: 0.1244020	total: 48.8s	remaining: 1m 12s
900:	learn: 0.0672832	total: 1m 13s	remaining: 48.7s
1200:	learn: 0.0437057	total: 1m 37s	remaining: 24.3s
1499:	learn: 0.0315469	total: 2m 1s	remaining: 0us
0:	learn: 1.0901989	total: 57.8ms	remaining: 

600:	learn: 0.0469393	total: 18.2s	remaining: 27.2s
900:	learn: 0.0242184	total: 27.4s	remaining: 18.2s
1200:	learn: 0.0148042	total: 36.6s	remaining: 9.12s
1499:	learn: 0.0102228	total: 45.9s	remaining: 0us
0:	learn: 1.0644993	total: 20.6ms	remaining: 30.8s
300:	learn: 0.1437133	total: 9.2s	remaining: 36.6s
600:	learn: 0.0483892	total: 18.3s	remaining: 27.4s
900:	learn: 0.0241092	total: 27.7s	remaining: 18.4s
1200:	learn: 0.0149748	total: 36.9s	remaining: 9.2s
1499:	learn: 0.0104069	total: 46.2s	remaining: 0us
0:	learn: 1.0676159	total: 15ms	remaining: 22.5s
300:	learn: 0.1447683	total: 7.57s	remaining: 30.2s
600:	learn: 0.0476796	total: 15.2s	remaining: 22.7s
900:	learn: 0.0243019	total: 22.7s	remaining: 15.1s
1200:	learn: 0.0147548	total: 30.3s	remaining: 7.55s
1499:	learn: 0.0104298	total: 37.8s	remaining: 0us
0:	learn: 1.0650127	total: 15.3ms	remaining: 23s
300:	learn: 0.1441078	total: 7.53s	remaining: 30s
600:	learn: 0.0502139	total: 15s	remaining: 22.4s
900:	learn: 0.0250885	tot

In [35]:
# Shallow snapshot: a new dict whose values are the same prediction lists as df_meta's.
df_meta_backup = dict(df_meta)

## Meta Learning (no class weight,  normalizing)

In [175]:
output_prob_valid = {}
output_prob_test = {}

# Stack only the three base learners, in this fixed order. Iterating over all
# of df_meta would also pick up "meta_learning" once this cell has run, so a
# re-run would train the meta learner on its own previous output.
base_learners = ["reg_qual", "cls_cls", "reg_cls"]


def _stack_base_outputs(split_idx, line_key):
    """Column-stack the base-learner outputs for one product group and rescale.

    ``split_idx`` is 0 for train predictions and 1 for test predictions.
    """
    parts = [df_meta[name][split_idx][line_key] for name in base_learners]
    stacked = np.concatenate([p.reshape(-1, 1) if len(p.shape) == 1 else p for p in parts], axis=1)
    # Column 0 holds reg_qual, columns 1-3 the cls_cls probabilities and
    # column 4 reg_cls; rescale the two regression outputs as before.
    stacked[:, 0] = (stacked[:, 0] - 0.5) * 10
    stacked[:, 4] = stacked[:, 4] / 3
    return stacked


for line in line_split:
    line_key = "_".join(line)
    full_x = _stack_base_outputs(0, line_key)
    full_y = df_full.loc[df_full["product_code"].isin(line), "y_class"].values
    test_x = _stack_base_outputs(1, line_key)

    # NOTE(review): the base-learner features are in-sample predictions, not
    # out-of-fold ones, so the meta learner sees optimistic inputs.
    meta_learner = lm.LogisticRegression(multi_class="multinomial", penalty='elasticnet', solver="saga", l1_ratio=0.5, class_weight=None, random_state=42)
    meta_learner.fit(full_x, full_y)
    output_prob_valid[line_key] = meta_learner.predict_proba(full_x)
    output_prob_test[line_key] = meta_learner.predict_proba(test_x)

df_meta["meta_learning"] = [output_prob_valid, output_prob_test]

## Threshold Optimization

### Create inferred value table for analysis

In [583]:
raw_output_container_train = []
raw_output_container_test = []

# One column per model output, in df_meta insertion order.
raw_output_columns = [
    "reg_qual",
    "cls_cls0", "cls_cls1", "cls_cls2",
    "reg_cls",
    "meta_learning_cls0", "meta_learning_cls1", "meta_learning_cls2",
]

for line in line_split:
    line_key = "_".join(line)
    train_rows = df_full["product_code"].isin(line)
    test_rows = df_test["product_code"].isin(line)

    # Collect every model's train / test predictions for this product group.
    train_x = [v[0][line_key] for v in df_meta.values()]
    test_x = [v[1][line_key] for v in df_meta.values()]

    # 1-D outputs become single columns before stacking side by side.
    train_x = np.concatenate([i.reshape(-1, 1) if len(i.shape) == 1 else i for i in train_x], axis=1)
    train_y = df_full.loc[train_rows, "y_class"].values
    test_x = np.concatenate([i.reshape(-1, 1) if len(i.shape) == 1 else i for i in test_x], axis=1)

    # Train table: model outputs, true class and product code.
    raw_output = pd.DataFrame(train_x, columns=raw_output_columns)
    raw_output["y_true"] = train_y
    raw_output["product_code"] = df_full.loc[train_rows, "product_code"].values
    raw_output_container_train.append(raw_output)

    # Test table: model outputs, product code and product id.
    raw_output = pd.DataFrame(test_x, columns=raw_output_columns)
    raw_output["product_code"] = df_test.loc[test_rows, "product_code"].values
    raw_output["product_id"] = df_test.loc[test_rows, "product_id"].values
    raw_output_container_test.append(raw_output)

In [584]:
# Merge the per-group tables into one frame. The isinstance guard makes the
# cell safe to re-run: once the name holds a DataFrame, concatenating it again
# would raise.
if isinstance(raw_output_container_train, list):
    raw_output_container_train = pd.concat(raw_output_container_train)
raw_output_container_train.head()

Unnamed: 0,reg_qual,cls_cls0,cls_cls1,cls_cls2,reg_cls,meta_learning_cls0,meta_learning_cls1,meta_learning_cls2,y_true,product_code
0,0.519452,0.981948,0.012431,0.005621,6.9e-05,0.976334,0.012306,0.01136,0,A_31
1,0.531042,0.002948,0.98366,0.013393,1.000342,0.00643,0.986936,0.006633,1,A_31
2,0.537461,0.009813,0.020241,0.969945,1.999295,0.01012,0.011211,0.978669,2,A_31
3,0.534444,0.007261,0.981813,0.010926,0.999474,0.00653,0.986839,0.006631,1,A_31
4,0.531511,0.003391,0.989771,0.006838,1.000364,0.006321,0.987264,0.006415,1,A_31


In [585]:
# Export per-class summary statistics of every model output to Excel:
# one sheet for all products, then one sheet per product group.
threshold_report_path = architecture_path + "train_threshold_analysis.xlsx"
with pd.ExcelWriter(threshold_report_path) as writer:
    overall_stats = raw_output_container_train.groupby("y_true").describe().T.reset_index()
    overall_stats.to_excel(writer, sheet_name="all")
    for line in line_split:
        in_group = raw_output_container_train["product_code"].isin(line)
        df_tmp = raw_output_container_train[in_group].groupby("y_true").describe().T.reset_index()
        df_tmp.to_excel(writer, sheet_name="_".join(line))

In [586]:
# Merge the per-group tables into one frame. The isinstance guard makes the
# cell safe to re-run: once the name holds a DataFrame, concatenating it again
# would raise.
if isinstance(raw_output_container_test, list):
    raw_output_container_test = pd.concat(raw_output_container_test)
raw_output_container_test.head()

Unnamed: 0,reg_qual,cls_cls0,cls_cls1,cls_cls2,reg_cls,meta_learning_cls0,meta_learning_cls1,meta_learning_cls2,product_code,product_id
0,0.524192,0.099548,0.88226,0.018192,0.575829,0.011901,0.979461,0.008638,A_31,TEST_003
1,0.531549,0.063362,0.826198,0.110439,1.058468,0.012233,0.973575,0.014191,A_31,TEST_004
2,0.532612,0.02529,0.910008,0.064702,1.139967,0.008311,0.982055,0.009634,A_31,TEST_005
3,0.532474,0.07456,0.887601,0.037839,0.848745,0.010505,0.980226,0.00927,A_31,TEST_006
4,0.52539,0.731943,0.202567,0.06549,0.461333,0.906487,0.058996,0.034517,A_31,TEST_007


In [587]:
# Put the raw model outputs back into the original test-row order.
# .copy() gives an independent frame, so the .loc assignment below does not
# write into a slice of df_test (SettingWithCopy).
df_tmp = df_test[["product_id", "product_code"]].copy()
# Every column except the trailing product_id, which df_tmp already has.
output_cols = raw_output_container_test.columns[:-1]
for line in line_split:
    dst_rows = df_tmp["product_code"].isin(line).values
    src_rows = raw_output_container_test["product_code"].isin(line).values
    df_tmp.loc[dst_rows, output_cols] = raw_output_container_test[src_rows].iloc[:, :-1].values

In [588]:
# Save next to the other artefacts of this architecture. os.path.join avoids
# the stray "./" the string concatenation put in the middle of the path.
df_tmp.to_csv(os.path.join(architecture_path, "test_raw_output.csv"), index=False)

## Inference with Threshold

In [589]:
# # 주혁님 threshold 계산 algorithm
# submit = pd.read_csv('/content/모델4개2.csv', encoding = 'cp949')
# submit['0.748 결과'][(submit['0.748 결과'] != 0) & (submit['class0'] > 0.45) & (submit['Class를 회귀로(모델3개)'] <= 0.75) & (submit['Class를 회귀로(모델1개)'] <= 0.75)] = 0
# s = pd.read_csv('/content/drive/MyDrive/LG_Aimers2/open (7)/sample_submission.csv')
# s['Y_Class'] = submit['0.748 결과']
# s.to_csv('SotaToCha_0.csv',index=False)

In [591]:
def get_threshold_params(x, norm_params=None):
    '''
        Derive, for each product group ("A_31" and "T_31_O_31" = every other product code),
        the [lower, upper] interval inside which a model output is labelled class 1.

        x : DataFrame with the columns "product_code", "y_class" and "y_quality".
        norm_params : optional dict of tuning knobs; missing keys fall back to the defaults
            {"reg_qual": 0.05, "reg_cls": 0.5, "cls_cls": 0.05, "meta_learning": 0.05}.
            reg_qual : fraction of the class-1 y_quality range (max - min) cut off at both ends
            reg_cls : multiplier of the y_class standard deviation (mean - std * alpha, mean + std * alpha)
            cls_cls : relative adjustment of the class-1 share; lower bound = share * (1 + value), upper bound = 1
            meta_learning : same rule as cls_cls

        Returns a dict {model name: {group: [lower, upper]}} and prints it.
        Raises KeyError when a group has no class-1 rows.
    '''
    # Merge into a fresh dict: no shared mutable default, and partial overrides are accepted.
    default_norm_params = {"reg_qual": 0.05, "reg_cls": 0.5, "cls_cls": 0.05, "meta_learning": 0.05}
    norm_params = {**default_norm_params, **(norm_params or {})}

    groups = ["A_31", "T_31_O_31"]
    threshold_dic = {"reg_qual": {}, "reg_cls": {}, "cls_cls": {}, "meta_learning": {}}

    df_tmp = x.copy()
    df_tmp["tmp"] = df_tmp["product_code"].apply(lambda code: "A_31" if code == "A_31" else "T_31_O_31")

    # reg_qual: shrink the observed class-1 quality range symmetrically
    quality_stats = df_tmp.groupby(["tmp", "y_class"])["y_quality"].describe().T
    for k in groups:
        lowest = quality_stats[k].loc["min", 1]
        highest = quality_stats[k].loc["max", 1]
        tmp_range = highest - lowest
        threshold_dic["reg_qual"][k] = [lowest + tmp_range * norm_params["reg_qual"], highest - tmp_range * norm_params["reg_qual"]]

    # reg_cls: band of +- alpha standard deviations around the mean class value
    class_stats = df_tmp.groupby("tmp")["y_class"].describe().T
    for k in groups:
        center = class_stats.loc["mean", k]
        spread = class_stats.loc["std", k] * norm_params["reg_cls"]
        threshold_dic["reg_cls"][k] = [center - spread, center + spread]

    # cls_cls / meta_learning: both scale the observed class-1 share, so compute the share once
    class_share = df_tmp.groupby("tmp")["y_class"].value_counts(normalize=True)
    for model_name in ["cls_cls", "meta_learning"]:
        for k in groups:
            threshold_dic[model_name][k] = [class_share[(k, 1)] * (1 + norm_params[model_name]), 1]

    for k, v in threshold_dic.items():
        print(k)
        print(v)

    return threshold_dic

In [592]:
def _label_by_interval(scores, bounds):
    '''Map regression scores to labels: 0 below bounds[0], 2 above bounds[1], otherwise 1 (NaN stays 1).'''
    scores = np.asarray(scores)
    # np.select takes the first matching condition, like the if / elif chain it replaces.
    return np.select([scores < bounds[0], scores > bounds[1]], [0, 2], default=1)


def _label_by_probability(probas, bounds):
    '''Label 1 when the class-1 probability lies in [bounds[0], bounds[1]]; otherwise 0 or 2, whichever is more probable (ties -> 0).'''
    probas = np.asarray(probas)
    class1_ok = (probas[:, 1] >= bounds[0]) & (probas[:, 1] <= bounds[1])
    # argmax over the (class 0, class 2) pair returns 0 on ties, so ties resolve to label 0.
    fallback = np.where(np.argmax(probas[:, [0, 2]], axis=1) == 0, 0, 2)
    return np.where(class1_ok, 1, fallback)


def get_inference_label(x, class_spliter):
    '''
        Turn the raw outputs of the four models into class labels (0 / 1 / 2) and a majority vote.

        x : DataFrame with "product_code", "reg_qual", "reg_cls", "cls_cls0".."cls_cls2"
            and "meta_learning_cls0".."meta_learning_cls2".
        class_spliter : {model name: {group: [lower, upper]}} as returned by get_threshold_params;
            groups are "A_31" and "T_31_O_31" (every product code other than "A_31").

        Returns a copy of x with the added columns "tmp", "label_reg_qual", "label_reg_cls",
        "label_cls_cls", "label_meta_learning" and "majority_vote". Rows whose group has no
        entry in class_spliter keep the default label 1.
        The vote counts exactly these four label columns; on a tie the label that appears
        first in that column order wins.
    '''
    df_tmp = x.copy()
    df_tmp["tmp"] = df_tmp["product_code"].apply(lambda code: "A_31" if code == "A_31" else "T_31_O_31")

    # (model name, labelling rule, input column(s)) in voting order
    voters = [
        ("reg_qual", _label_by_interval, "reg_qual"),
        ("reg_cls", _label_by_interval, "reg_cls"),
        ("cls_cls", _label_by_probability, ["cls_cls0", "cls_cls1", "cls_cls2"]),
        ("meta_learning", _label_by_probability, ["meta_learning_cls0", "meta_learning_cls1", "meta_learning_cls2"]),
    ]

    label_cols = []
    for model_name, labeler, input_cols in voters:
        labels = np.ones(len(df_tmp), dtype="int64")
        for k, v in class_spliter[model_name].items():
            in_group = (df_tmp["tmp"] == k).to_numpy()
            labels[in_group] = labeler(df_tmp.loc[in_group, input_cols].to_numpy(), v)
        df_tmp["label_" + model_name] = labels
        label_cols.append("label_" + model_name)

    # ensemble prediction
    # Vote over the explicit label columns: the former regex "label_*" matched every column
    # containing "label", so unrelated input columns could leak into the vote.
    votes = df_tmp[label_cols].to_numpy().tolist()
    df_tmp["majority_vote"] = np.array([Counter(row).most_common(1)[0][0] for row in votes], dtype="int64")

    return df_tmp

In [593]:
# Threshold tuning knobs handed to get_threshold_params (one entry per model).
params = dict(
    reg_qual=0.15,        # fraction of the class-1 y_quality range trimmed at each end
    reg_cls=0.6,          # multiplier of the y_class standard deviation around the mean
    cls_cls=-0.2,         # relative adjustment of the class-1 share (lower probability bound)
    meta_learning=0.175,  # same rule as cls_cls, applied to the meta-learner
)
class_spliter = get_threshold_params(df_full, params)

reg_qual
{'A_31': [0.52654928545, 0.53337928555], 'T_31_O_31': [0.5266562694999999, 0.5333929365]}
reg_cls
{'A_31': [0.6089113991488744, 1.4794420145057443], 'T_31_O_31': [0.762420077744715, 1.2605025583584368]}
cls_cls
{'A_31': [0.3791164658634538, 1], 'T_31_O_31': [0.6624641833810889, 1]}
meta_learning
{'A_31': [0.5568273092369478, 1], 'T_31_O_31': [0.9729942693409742, 1]}


In [594]:
# Label the train outputs with the thresholds in class_spliter, then show how each voter
# (and the majority vote) distributes its predictions: counts, shares, and shares per product code.
# Accuracy / F1 against y_true are printed by a separate cell.
train_inference = get_inference_label(raw_output_container_train, class_spliter)
train_label_cols = ["label_reg_qual", "label_reg_cls", "label_cls_cls", "label_meta_learning", "majority_vote"]
for i in train_label_cols:
    print(f"=== {i} ===")
    print("value counts")
    display(
        train_inference[i].value_counts(),
        train_inference[i].value_counts(normalize=True),
        train_inference.groupby("product_code")[i].value_counts(normalize=True),
    )
    print("\n")

=== label_reg_qual ===
value counts


1    328
2    151
0    119
Name: label_reg_qual, dtype: int64

1    0.548495
2    0.252508
0    0.198997
Name: label_reg_qual, dtype: float64

product_code  label_reg_qual
A_31          2                 0.365462
              1                 0.349398
              0                 0.285141
O_31          2                 0.500000
              1                 0.333333
              0                 0.166667
T_31          1                 0.696793
              2                 0.166181
              0                 0.137026
Name: label_reg_qual, dtype: float64



=== label_reg_cls ===
value counts


1    407
2    103
0     88
Name: label_reg_cls, dtype: int64

1    0.680602
2    0.172241
0    0.147157
Name: label_reg_cls, dtype: float64

product_code  label_reg_cls
A_31          1                0.473896
              2                0.285141
              0                0.240964
O_31          1                0.666667
              2                0.333333
T_31          1                0.830904
              2                0.087464
              0                0.081633
Name: label_reg_cls, dtype: float64



=== label_cls_cls ===
value counts


1    407
2    103
0     88
Name: label_cls_cls, dtype: int64

1    0.680602
2    0.172241
0    0.147157
Name: label_cls_cls, dtype: float64

product_code  label_cls_cls
A_31          1                0.473896
              2                0.285141
              0                0.240964
O_31          1                0.666667
              2                0.333333
T_31          1                0.830904
              2                0.087464
              0                0.081633
Name: label_cls_cls, dtype: float64



=== label_meta_learning ===
value counts


1    407
2    103
0     88
Name: label_meta_learning, dtype: int64

1    0.680602
2    0.172241
0    0.147157
Name: label_meta_learning, dtype: float64

product_code  label_meta_learning
A_31          1                      0.473896
              2                      0.285141
              0                      0.240964
O_31          1                      0.666667
              2                      0.333333
T_31          1                      0.830904
              2                      0.087464
              0                      0.081633
Name: label_meta_learning, dtype: float64



=== majority_vote ===
value counts


1    407
2    103
0     88
Name: majority_vote, dtype: int64

1    0.680602
2    0.172241
0    0.147157
Name: majority_vote, dtype: float64

product_code  majority_vote
A_31          1                0.473896
              2                0.285141
              0                0.240964
O_31          1                0.666667
              2                0.333333
T_31          1                0.830904
              2                0.087464
              0                0.081633
Name: majority_vote, dtype: float64





In [595]:
# Score the train labels of each voter against the ground truth (accuracy and macro F1).
# NOTE(review): "label_meta_learning" is not scored here although the distribution reports
# include it — confirm whether the omission is intentional.
train_y_true = train_inference["y_true"]
for i in ["label_reg_qual", "label_reg_cls", "label_cls_cls", "majority_vote"]:
    train_y_pred = train_inference[i]
    print(f"=== {i} ===")
    print("acc:", accuracy_score(train_y_true, train_y_pred))
    print("f1:", f1_score(train_y_true, train_y_pred, average="macro"))

=== label_reg_qual ===
acc: 0.8678929765886287
f1: 0.8512607249145616
=== label_reg_cls ===
acc: 1.0
f1: 1.0
=== label_cls_cls ===
acc: 1.0
f1: 1.0
=== majority_vote ===
acc: 1.0
f1: 1.0


In [596]:
# Label the test outputs with the same thresholds and show the prediction distribution of
# each voter and of the majority vote: counts, shares, and shares per product code.
# No accuracy / F1 here: the test output preview shows no y_true column to score against.
test_inference = get_inference_label(raw_output_container_test, class_spliter)
test_label_cols = ["label_reg_qual", "label_reg_cls", "label_cls_cls", "label_meta_learning", "majority_vote"]
for i in test_label_cols:
    print(f"=== {i} ===")
    print("value counts")
    display(
        test_inference[i].value_counts(),
        test_inference[i].value_counts(normalize=True),
        test_inference.groupby("product_code")[i].value_counts(normalize=True),
    )
    print("\n")

=== label_reg_qual ===
value counts


1    260
0     34
2     16
Name: label_reg_qual, dtype: int64

1    0.838710
0    0.109677
2    0.051613
Name: label_reg_qual, dtype: float64

product_code  label_reg_qual
A_31          1                 0.537313
              0                 0.343284
              2                 0.119403
O_31          1                 1.000000
T_31          1                 0.920502
              0                 0.046025
              2                 0.033473
Name: label_reg_qual, dtype: float64



=== label_reg_cls ===
value counts


1    246
0     47
2     17
Name: label_reg_cls, dtype: int64

1    0.793548
0    0.151613
2    0.054839
Name: label_reg_cls, dtype: float64

product_code  label_reg_cls
A_31          1                0.656716
              0                0.343284
O_31          1                1.000000
T_31          1                0.828452
              0                0.100418
              2                0.071130
Name: label_reg_cls, dtype: float64



=== label_cls_cls ===
value counts


1    255
0     43
2     12
Name: label_cls_cls, dtype: int64

1    0.822581
0    0.138710
2    0.038710
Name: label_cls_cls, dtype: float64

product_code  label_cls_cls
A_31          0                0.507463
              1                0.462687
              2                0.029851
O_31          1                1.000000
T_31          1                0.920502
              2                0.041841
              0                0.037657
Name: label_cls_cls, dtype: float64



=== label_meta_learning ===
value counts


1    252
0     45
2     13
Name: label_meta_learning, dtype: int64

1    0.812903
0    0.145161
2    0.041935
Name: label_meta_learning, dtype: float64

product_code  label_meta_learning
A_31          0                      0.537313
              1                      0.432836
              2                      0.029851
O_31          1                      1.000000
T_31          1                      0.916318
              2                      0.046025
              0                      0.037657
Name: label_meta_learning, dtype: float64



=== majority_vote ===
value counts


1    269
0     33
2      8
Name: majority_vote, dtype: int64

1    0.867742
0    0.106452
2    0.025806
Name: majority_vote, dtype: float64

product_code  majority_vote
A_31          1                0.671642
              0                0.313433
              2                0.014925
O_31          1                1.000000
T_31          1                0.920502
              0                0.050209
              2                0.029289
Name: majority_vote, dtype: float64





In [597]:
# Save the labelled test outputs, ordered by product_id, without the index column.
# Note the file name is "test_rawoutput.csv"; it differs from "test_raw_output.csv" written earlier.
test_inference.sort_values("product_id").to_csv(architecture_path + "test_rawoutput.csv", index=False)

## Submission

In [598]:
# Column of test_inference whose labels are submitted (the ensemble's majority vote).
inference_col = "majority_vote"

In [599]:
# Build the per-product prediction table for the submission.
# .copy() makes df_test_tmp an independent frame, so adding columns never targets a slice of df_test.
df_test_tmp = df_test[["product_id", "line"]].copy()
df_test_tmp["target_class"] = 1  # default label for rows not covered by any group in line_split

for line in line_split:
    # Take the row mask from df_test itself rather than from the leftover global df_tmp,
    # so this cell does not depend on what an earlier cell last stored in that name.
    target_rows = df_test["product_code"].isin(line)
    source_rows = test_inference["product_code"].isin(line)
    # Values are written positionally: this assumes both frames list the rows of a
    # group in the same order — TODO confirm.
    df_test_tmp.loc[target_rows, "target_class"] = test_inference.loc[source_rows, inference_col].values

df_test_tmp["target_class"] = df_test_tmp["target_class"].astype("int32")

In [600]:
# Display the assembled per-product predictions (product_id, line, target_class).
df_test_tmp

Unnamed: 0,product_id,line,target_class
0,TEST_000,T100306,1
1,TEST_001,T100304,1
2,TEST_002,T100304,1
3,TEST_003,T010305,0
4,TEST_004,T010306,1
...,...,...,...
305,TEST_305,T100306,1
306,TEST_306,T100304,1
307,TEST_307,T100306,1
308,TEST_308,T100306,1


In [601]:
# Fill the sample submission template with the predicted classes.
submission = pd.read_csv("../datasets/sample_submission.csv")
# The labels are written positionally, so fail loudly if the template rows are not
# in exactly the same product order as df_test_tmp.
assert submission["PRODUCT_ID"].tolist() == df_test_tmp["product_id"].tolist(), \
    "sample_submission rows are not aligned with df_test_tmp"
submission["Y_Class"] = df_test_tmp["target_class"].values

In [602]:
# Display the filled submission table (PRODUCT_ID, Y_Class).
submission

Unnamed: 0,PRODUCT_ID,Y_Class
0,TEST_000,1
1,TEST_001,1
2,TEST_002,1
3,TEST_003,0
4,TEST_004,1
...,...,...
305,TEST_305,1
306,TEST_306,1
307,TEST_307,1
308,TEST_308,1


In [603]:
# Write the submission to "submission_<architecture_name>.csv" under architecture_path, without the index column.
submission.to_csv(architecture_path + "submission_" + architecture_name + ".csv", index=False)