## Setup

In [1]:
GLOBAL_SEED = 42

import os
os.environ['PYTHONHASHSEED'] = str(GLOBAL_SEED)
import sys

import gc
from tqdm import tqdm
import datetime
import pickle
import random as rnd
from glob import glob
import pandas as pd
import numpy as np
from numpy import random as np_rnd
import warnings
from math import ceil

import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib import rcParams
from itertools import combinations
from collections import Counter

import lightgbm as lgb
import xgboost as xgb
import catboost as cat
from sklearn.utils.class_weight import compute_class_weight

import optuna
from optuna import Trial, create_study
from optuna.samplers import TPESampler

from sklearn import linear_model as lm
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold

from sklearn.impute import KNNImputer
# explicitly require this experimental feature
from sklearn.experimental import enable_iterative_imputer  # noqa
# now you can import normally from sklearn.impute
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from itertools import product

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

from scipy.stats import f_oneway
from scipy.stats import pearsonr

# display setting
warnings.filterwarnings(action='ignore')
rcParams['axes.unicode_minus'] = False

In [2]:
def seed_everything(seed=42):
    os.environ['PYTHONHASHSEED'] = str(seed)
    # python random
    rnd.seed(seed)
    # numpy random
    np_rnd.seed(seed)
    # tf random
    try:
        tf_rnd.set_seed(seed)
    except:
        pass
    # RAPIDS random
    try:
        cupy.random.seed(seed)
    except:
        pass
    # pytorch random
    try:
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        torch.backends.cudnn.deterministic = True
    except:
        pass

def pickleIO(obj, src, op="w"):
    if op=="w":
        with open(src, op + "b") as f:
            pickle.dump(obj, f)
    elif op=="r":
        with open(src, op + "b") as f:
            tmp = pickle.load(f)
        return tmp
    else:
        print("unknown operation")
        return obj

def findIdx(data_x, col_names):
    return [int(i) for i, j in enumerate(data_x) if j in col_names]

def diff(first, second):
    second = set(second)
    return [item for item in first if item not in second]

def createFolder(directory):
    try:
        if not os.path.exists(directory):
            os.makedirs(directory)
    except OSError:
        print('Error: Creating directory. ' + directory)
        
def week_of_month(dt):
    """ 
        Returns the week of the month for the specified date.
    """
    first_day = dt.replace(day=1)
    dom = dt.day
    adjusted_dom = dom + (1 + first_day.weekday()) % 7
    return int(ceil(adjusted_dom/7.0))

def quality_to_class(x, threshold):
    y_pred_class = []
    for i in x:
        tmp_class = len(threshold)
        for k, v in threshold.items():
            if i < v:
                tmp_class = k
                break
        y_pred_class.append(tmp_class)
    return y_pred_class

In [3]:
class CFG:
    debug = False
    TF = True
    product_mapper = {
        "A": ["T010305", "T010306", "T050304", "T050307"],
        "O": ["T100304", "T100306"],
        "T": ["T100304", "T100306"],
    }
    line_mapper = {
        "T010305": "A", "T010306": "A", "T050304": "A", "T050307": "A",
        "T100304": "O_T", "T100306": "O_T",
    }
    line_groups = [
        ["T010305", "T010306"],
        ["T050304", "T050307"],
        ["T100304", "T100306"],
    ]
    classes = [0, 1, 2]
    time_features = ["month", "day", "weekday", "week_of_month", "hour", "office_hour", "sec_in_day", "sin_in_day", "cos_in_day"]

In [4]:
quality_threshold = pickleIO(None, "../datasets/dataset_valid/quality_threshold.pkl", "r")
quality_threshold['T010305_T010306_T050304_T050307'] = {0: (0.525046 + 0.525086) / 2, 1: (0.534843 + 0.535279) / 2}

In [5]:
quality_threshold = {
    "T010305_T010306": {0: 0.52507, 1: 0.53490},
    "T050304_T050307": {0: 0.52507, 1: 0.53490},
    "T100304_T100306": {0: 0.52507, 1: 0.53490},
}

In [6]:
quality_threshold

{'T010305_T010306': {0: 0.52507, 1: 0.5349},
 'T050304_T050307': {0: 0.52507, 1: 0.5349},
 'T100304_T100306': {0: 0.52507, 1: 0.5349}}

## Loading Data

In [7]:
# information Provided by Dacon

# PRODUCT_ID : 제품의 고유 ID
# Y_Class : 제품 품질 상태(Target) 
# 0 : 적정 기준 미달 (부적합)
# 1 : 적합
# 2 : 적정 기준 초과 (부적합)
# Y_Quality : 제품 품질 관련 정량적 수치
# TIMESTAMP : 제품이 공정에 들어간 시각
# LINE : 제품이 들어간 공정 LINE 종류 ('T050304', 'T050307', 'T100304', 'T100306', 'T010306', 'T010305' 존재)
# PRODUCT_CODE : 제품의 CODE 번호 ('A_31', 'T_31', 'O_31' 존재)
# X_1 ~ X_2875 : 공정 과정에서 추출되어 비식별화된 변수

In [8]:
df_full = pd.read_csv("../datasets/train.csv")
df_full.columns = df_full.columns.str.lower()

In [9]:
# # time feature engineernig
# df_full["month"] = df_full["timestamp"].dt.month
# df_full["day"] = df_full["timestamp"].dt.day
# df_full["weekday"] = df_full["timestamp"].dt.weekday
# df_full["week_of_month"] = df_full["timestamp"].apply(week_of_month)
# df_full["hour"] = df_full["timestamp"].dt.hour
# df_full["office_hour"] = df_full["hour"].apply(lambda x: 1 if ((x >= 9) & (x < 18)) else 0)
# df_full["sec_in_day"] = (df_full["timestamp"] - df_full["timestamp"].dt.normalize()).dt.total_seconds() / 3600
# df_full["sin_in_day"] = np.sin(2 * np.pi * df_full["sec_in_day"].values)
# df_full["cos_in_day"] = np.cos(2 * np.pi * df_full["sec_in_day"].values)

In [10]:
# df_train = pickleIO(None, "../../datasets/dataset_valid2/df_train.pkl", "r")
# df_valid = pickleIO(None, "../../datasets/dataset_valid2/df_valid.pkl", "r")

In [11]:
# # time feature engineernig
# df_train["month"] = df_train["timestamp"].dt.month
# df_train["day"] = df_train["timestamp"].dt.day
# df_train["weekday"] = df_train["timestamp"].dt.weekday
# df_train["week_of_month"] = df_train["timestamp"].apply(week_of_month)
# df_train["hour"] = df_train["timestamp"].dt.hour
# df_train["office_hour"] = df_train["hour"].apply(lambda x: 1 if ((x >= 9) & (x < 18)) else 0)
# df_train["sec_in_day"] = (df_train["timestamp"] - df_train["timestamp"].dt.normalize()).dt.total_seconds() / 3600
# df_train["sin_in_day"] = np.sin(2 * np.pi * df_train["sec_in_day"].values)
# df_train["cos_in_day"] = np.cos(2 * np.pi * df_train["sec_in_day"].values)

In [12]:
# # time feature engineernig
# df_valid["month"] = df_valid["timestamp"].dt.month
# df_valid["day"] = df_valid["timestamp"].dt.day
# df_valid["weekday"] = df_valid["timestamp"].dt.weekday
# df_valid["week_of_month"] = df_valid["timestamp"].apply(week_of_month)
# df_valid["hour"] = df_valid["timestamp"].dt.hour
# df_valid["office_hour"] = df_valid["hour"].apply(lambda x: 1 if ((x >= 9) & (x < 18)) else 0)
# df_valid["sec_in_day"] = (df_valid["timestamp"] - df_valid["timestamp"].dt.normalize()).dt.total_seconds() / 3600
# df_valid["sin_in_day"] = np.sin(2 * np.pi * df_valid["sec_in_day"].values)
# df_valid["cos_in_day"] = np.cos(2 * np.pi * df_valid["sec_in_day"].values)

In [13]:
# df_train.info()

In [14]:
# df_train.head()

In [15]:
# df_train["y_quality"]

In [16]:
# df_train.groupby(["line", "product_code"]).size()

In [17]:
# df_valid.info()

In [18]:
# df_valid.head()

In [19]:
# df_valid["y_quality"]

In [20]:
# df_valid.groupby(["line", "product_code"]).size()

In [21]:
df_full["tmp"] = df_full["product_code"].apply(lambda x: 1 if x == "A_31" else 0)
display(df_full.groupby(["tmp", "y_class"])["y_quality"].describe().T)
df_full = df_full.drop("tmp", axis=1)

tmp,0,0,0,1,1,1
y_class,0,1,2,0,1,2
count,28.0,289.0,32.0,60.0,118.0,71.0
mean,0.521246,0.530272,0.538753,0.520646,0.530209,0.543508
std,0.005765,0.002334,0.004931,0.00399,0.002705,0.008733
min,0.502517,0.525213,0.534951,0.500856,0.525086,0.535279
25%,0.520467,0.528483,0.535541,0.519388,0.527989,0.53733
50%,0.523422,0.530308,0.536237,0.521315,0.530353,0.539235
75%,0.524612,0.532119,0.539517,0.523522,0.532332,0.547506
max,0.525067,0.534837,0.551279,0.525046,0.534843,0.578841


In [22]:
df_full["tmp"] = df_full["product_code"].apply(lambda x: 1 if x == "A_31" else 0)
display(df_full.groupby("tmp")["y_class"].describe().T)
df_full = df_full.drop("tmp", axis=1)

tmp,0,1
count,349.0,249.0
mean,1.011461,1.044177
std,0.415069,0.725442
min,0.0,0.0
25%,1.0,1.0
50%,1.0,1.0
75%,1.0,2.0
max,2.0,2.0


In [23]:
df_full["tmp"] = df_full["product_code"].apply(lambda x: 1 if x == "A_31" else 0)
print(df_full.groupby("tmp")["y_class"].value_counts(normalize=True))
df_full = df_full.drop("tmp", axis=1)

tmp  y_class
0    1          0.828080
     2          0.091691
     0          0.080229
1    1          0.473896
     2          0.285141
     0          0.240964
Name: y_class, dtype: float64


In [24]:
df_test = pd.read_csv("../datasets/test.csv")
df_test.columns = df_test.columns.str.lower()

In [25]:
# # time feature engineernig
# df_test["month"] = df_test["timestamp"].dt.month
# df_test["day"] = df_test["timestamp"].dt.day
# df_test["weekday"] = df_test["timestamp"].dt.weekday
# df_test["week_of_month"] = df_test["timestamp"].apply(week_of_month)
# df_test["hour"] = df_test["timestamp"].dt.hour
# df_test["office_hour"] = df_test["hour"].apply(lambda x: 1 if ((x >= 9) & (x < 18)) else 0)
# df_test["sec_in_day"] = (df_test["timestamp"] - df_test["timestamp"].dt.normalize()).dt.total_seconds() / 3600
# df_test["sin_in_day"] = np.sin(2 * np.pi * df_test["sec_in_day"].values)
# df_test["cos_in_day"] = np.cos(2 * np.pi * df_test["sec_in_day"].values)

In [26]:
df_test.head()

Unnamed: 0,product_id,timestamp,line,product_code,x_1,x_2,x_3,x_4,x_5,x_6,...,x_2866,x_2867,x_2868,x_2869,x_2870,x_2871,x_2872,x_2873,x_2874,x_2875
0,TEST_000,2022-09-09 2:01,T100306,T_31,2.0,94.0,0.0,45.0,10.0,0.0,...,,,,,,,,,,
1,TEST_001,2022-09-09 2:09,T100304,T_31,2.0,93.0,0.0,45.0,11.0,0.0,...,,,,,,,,,,
2,TEST_002,2022-09-09 8:42,T100304,T_31,2.0,95.0,0.0,45.0,11.0,0.0,...,,,,,,,,,,
3,TEST_003,2022-09-09 10:56,T010305,A_31,,,,,,,...,,,,,,,,,,
4,TEST_004,2022-09-09 11:04,T010306,A_31,,,,,,,...,,,,,,,,,,


## Training by Lines

* "T010305", "T010306"
* "T050304", "T050307"
* "T100304", "T100306"

In [27]:
architecture_root_path = "./architectures/"
architecture_name = "daheeRef_targetQual_bylines_TF_xgb_gbt_metaLearning_try2"
architecture_path = architecture_root_path + architecture_name + "/"
createFolder(architecture_path)

In [28]:
# Shuffling
df_full = df_full.sample(frac=1, random_state=GLOBAL_SEED).reset_index(drop=True)

In [29]:
# Label Encoding on Line
lbe = LabelEncoder()
lbe.fit(["T010305", "T010306", "T050304", "T050307", "T100304", "T100306"])

LabelEncoder()

In [30]:
df_meta = {"reg_qual": None, "cls_cls": None, "reg_cls": None}

### Training - Regression (y_quality) 

In [42]:
# objective
# regession : "mae", "mse"
# classification - binary : "binary"
# classification - binary : "multiclass" (num_class=n)
# ranking : "xe_ndcg_mart"

# metric
# regession : "mae", "mse", "rmse"
# classification - binary : "binary_logloss", "binary_error", "auc"
# classification - muticlass : "multi_logloss", "multi_error"
# ranking : "ndcg", "map"

fixed_params = {
    "n_estimators": 1000,
    "learning_rate": 1e-2,
    "max_depth": 6,
}

searching_params = {
    "reg_lambda": [1.0, 0.1],
    "subsample": [0.5, 0.8],
    "colsample_bytree": [0.5, 0.8],
    "random_state": [1234, 42],
}

preset_params = []
for params in list(product(*searching_params.values())):
    print(params)
    tmp = fixed_params.copy()
    tmp["reg_lambda"] = params[0]
    tmp["subsample"] = params[1]
    tmp["colsample_bytree"] = params[2]
    tmp["random_state"] = params[3]
    preset_params.append(tmp)

preset_params = preset_params[:3] if CFG.debug else preset_params
len(preset_params)

(1.0, 0.5, 0.5, 1234)
(1.0, 0.5, 0.5, 42)
(1.0, 0.5, 0.8, 1234)
(1.0, 0.5, 0.8, 42)
(1.0, 0.8, 0.5, 1234)
(1.0, 0.8, 0.5, 42)
(1.0, 0.8, 0.8, 1234)
(1.0, 0.8, 0.8, 42)
(0.1, 0.5, 0.5, 1234)
(0.1, 0.5, 0.5, 42)
(0.1, 0.5, 0.8, 1234)
(0.1, 0.5, 0.8, 42)
(0.1, 0.8, 0.5, 1234)
(0.1, 0.8, 0.5, 42)
(0.1, 0.8, 0.8, 1234)
(0.1, 0.8, 0.8, 42)


16

In [62]:
line_split = [
    # A
    ["A_31"],
    # O, T
    ["T_31", "O_31"],
]
score_dic = {
    "mae": None,
    "r2": None,
    "accuracy": None,
    "f1": None,
}
line_split_f1 = {}
line_score = {"_".join(i): {"train": score_dic.copy(), "valid": score_dic.copy()} for i in line_split}
valid_pred = {"_".join(i): None for i in line_split}
test_pred = {"_".join(i): None for i in line_split}

for line in line_split:
    seed_everything()
    # Training
    # === Preprocessing (Train) ===
    full_x = df_full[df_full["product_code"].isin(line)]
    full_x["line"] = lbe.transform(full_x["line"])
    
    full_y = full_x["y_quality"].values
    full_y_cls = full_x["y_class"].values
    # Drop columns
    full_x = full_x.drop(["product_id", "y_class", "y_quality", "timestamp", "product_code"], axis=1)
    
    full_x = full_x[full_x.columns[~(full_x.var() == 0).values & ~full_x.isna().all().values]]
    full_x = full_x.fillna(-1.0)
    full_x = full_x.T.drop_duplicates().T
    full_x = full_x.drop_duplicates().reset_index(drop=True)
    
    selected_vars = full_x.columns
    cat_vars = ["line"]
    num_vars = diff(selected_vars, cat_vars)
    categoIdx = findIdx(selected_vars, cat_vars)

    full_x[num_vars] = full_x[num_vars].astype("float32")
    full_x[cat_vars] = full_x[cat_vars].astype("int32")
    print(full_x.shape)
    
    # === Preprocessing (Test) ===
    test_x = df_test[df_test["product_code"].isin(line)]
    test_x["line"] = lbe.transform(test_x["line"])
    test_x = test_x[selected_vars]
    test_x = test_x.fillna(-1.0)
    
    test_x[num_vars] = test_x[num_vars].astype("float32")
    test_x[cat_vars] = test_x[cat_vars].astype("int32")
    
    class_weight = compute_class_weight(class_weight="balanced", classes=[0, 1, 2], y=full_y_cls)
    line_full_pred = np.zeros(len(full_x))
    line_test_pred = np.zeros(len(test_x))
    for params in tqdm(preset_params):
        model = xgb.XGBRegressor(verbosity=1, **params)
        model.fit(full_x, np.log1p(full_y), sample_weight=class_weight[full_y_cls], verbose=int(params["n_estimators"] * 0.2))
        line_full_pred[:] += np.expm1(model.predict(full_x)) / len(preset_params)
        line_test_pred[:] += np.expm1(model.predict(test_x)) / len(preset_params)

    # Evaluation
    y_pred = line_full_pred.copy()
    y_true = full_y.copy()
    eval_mae = mean_absolute_error(y_true, y_pred)
    eval_r2 = r2_score(y_true, y_pred)

    # Transform quality to class
    y_true_class = full_y_cls.copy()
    y_pred_class = quality_to_class(y_pred, {0: 0.52507, 1: 0.53490})
    eval_acc = accuracy_score(y_true_class, y_pred_class)
    eval_f1 = f1_score(y_true_class, y_pred_class, average="macro")
    
    # Save values
    line_score["_".join(line)]["valid"]["mae"] = eval_mae
    line_score["_".join(line)]["valid"]["r2"] = eval_r2
    line_score["_".join(line)]["valid"]["accuracy"] = eval_acc
    line_score["_".join(line)]["valid"]["f1"] = eval_f1
    valid_pred["_".join(line)] = y_pred
    
    # Inference
    y_pred = line_test_pred.copy()
    
    # Save values
    test_pred["_".join(line)] = y_pred

df_meta["reg_qual"] = [valid_pred, test_pred]

(249, 1173)


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [02:13<00:00,  8.37s/it]


(349, 452)


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [01:02<00:00,  3.90s/it]


In [63]:
print(line_score)

{'A_31': {'train': {'mae': None, 'r2': None, 'accuracy': None, 'f1': None}, 'valid': {'mae': 0.00041076571860552795, 'r2': 0.9973971129195781, 'accuracy': 0.9718875502008032, 'f1': 0.9723616789676588}}, 'T_31_O_31': {'train': {'mae': None, 'r2': None, 'accuracy': None, 'f1': None}, 'valid': {'mae': 0.0005373432574761392, 'r2': 0.9781928046144005, 'accuracy': 0.9885386819484241, 'f1': 0.9746106820710523}}}


### Training - Regression (y_class)

In [64]:
# objective
# regession : "mae", "mse"
# classification - binary : "binary"
# classification - binary : "multiclass" (num_class=n)
# ranking : "xe_ndcg_mart"

# metric
# regession : "mae", "mse", "rmse"
# classification - binary : "binary_logloss", "binary_error", "auc"
# classification - muticlass : "multi_logloss", "multi_error"
# ranking : "ndcg", "map"

fixed_params = {
    "n_estimators": 1000,
    "learning_rate": 1e-2,
    "max_depth": 6,
}

searching_params = {
    "reg_lambda": [1.0, 0.1],
    "subsample": [0.5, 0.8],
    "colsample_bytree": [0.5, 0.8],
    "random_state": [1234, 42],
}

preset_params = []
for params in list(product(*searching_params.values())):
    print(params)
    tmp = fixed_params.copy()
    tmp["reg_lambda"] = params[0]
    tmp["subsample"] = params[1]
    tmp["colsample_bytree"] = params[2]
    tmp["random_state"] = params[3]
    preset_params.append(tmp)

preset_params = preset_params[:3] if CFG.debug else preset_params
len(preset_params)

(1.0, 0.5, 0.5, 1234)
(1.0, 0.5, 0.5, 42)
(1.0, 0.5, 0.8, 1234)
(1.0, 0.5, 0.8, 42)
(1.0, 0.8, 0.5, 1234)
(1.0, 0.8, 0.5, 42)
(1.0, 0.8, 0.8, 1234)
(1.0, 0.8, 0.8, 42)
(0.1, 0.5, 0.5, 1234)
(0.1, 0.5, 0.5, 42)
(0.1, 0.5, 0.8, 1234)
(0.1, 0.5, 0.8, 42)
(0.1, 0.8, 0.5, 1234)
(0.1, 0.8, 0.5, 42)
(0.1, 0.8, 0.8, 1234)
(0.1, 0.8, 0.8, 42)


16

In [65]:
line_split = [
    # A
    ["A_31"],
    # O, T
    ["T_31", "O_31"],
]
score_dic = {
    "mae": None,
    "r2": None,
    "accuracy": None,
    "f1": None,
}
line_split_f1 = {}
line_score = {"_".join(i): {"train": score_dic.copy(), "valid": score_dic.copy()} for i in line_split}
valid_pred = {"_".join(i): None for i in line_split}
test_pred = {"_".join(i): None for i in line_split}

for line in line_split:
    seed_everything()
    # Training
    # === Preprocessing (Train) ===
    full_x = df_full[df_full["product_code"].isin(line)]
    
    # Label Encoding on Line
    full_x["line"] = lbe.transform(full_x["line"])
    
    full_y = full_x["y_quality"].values
    full_y_cls = full_x["y_class"].values
    # Drop columns
    full_x = full_x.drop(["product_id", "y_class", "y_quality", "timestamp", "product_code"], axis=1)
    
    full_x = full_x[full_x.columns[~(full_x.var() == 0).values & ~full_x.isna().all().values]]
    full_x = full_x.fillna(-1.0)
    full_x = full_x.T.drop_duplicates().T
    full_x = full_x.drop_duplicates().reset_index(drop=True)
    
    selected_vars = full_x.columns
    cat_vars = ["line"]
    num_vars = diff(selected_vars, cat_vars)
    categoIdx = findIdx(selected_vars, cat_vars)

    full_x[num_vars] = full_x[num_vars].astype("float32")
    full_x[cat_vars] = full_x[cat_vars].astype("int32")
    print(full_x.shape)
    
    # === Preprocessing (Test) ===
    test_x = df_test[df_test["product_code"].isin(line)]
    test_x["line"] = lbe.transform(test_x["line"])
    test_x = test_x[selected_vars]
    test_x = test_x.fillna(-1.0)
    
    test_x[num_vars] = test_x[num_vars].astype("float32")
    test_x[cat_vars] = test_x[cat_vars].astype("int32")
    
    class_weight = compute_class_weight(class_weight="balanced", classes=[0, 1, 2], y=full_y_cls)
    line_full_pred = np.zeros(len(full_x))
    line_test_pred = np.zeros(len(test_x))
    for params in tqdm(preset_params):
        model = xgb.XGBRegressor(verbosity=1, **params)
        model.fit(full_x, full_y_cls, sample_weight=class_weight[full_y_cls], verbose=int(params["n_estimators"] * 0.2))
        line_full_pred[:] += model.predict(full_x) / len(preset_params)
        line_test_pred[:] += model.predict(test_x) / len(preset_params)
    
    # Evaluation
    y_pred = line_full_pred.copy()
    y_true = full_y.copy()
    eval_mae = mean_absolute_error(y_true, y_pred)
    eval_r2 = r2_score(y_true, y_pred)

    # Transform quality to class
    y_true_class = full_y_cls.copy()
    y_pred_class = np.clip(np.round(y_true), 0, 2)
    eval_acc = accuracy_score(y_true_class, full_y_cls)
    eval_f1 = f1_score(y_true_class, full_y_cls, average="macro")
    
    # Save values
    line_score["_".join(line)]["valid"]["mae"] = eval_mae
    line_score["_".join(line)]["valid"]["r2"] = eval_r2
    line_score["_".join(line)]["valid"]["accuracy"] = eval_acc
    line_score["_".join(line)]["valid"]["f1"] = eval_f1
    valid_pred["_".join(line)] = y_pred
    
    # Inference
    y_pred = line_test_pred.copy()
    
    # Save values
    test_pred["_".join(line)] = y_pred

df_meta["reg_cls"] = [valid_pred, test_pred]

(249, 1173)
(349, 452)


In [66]:
print(line_score)

{'A_31': {'train': {'mae': None, 'r2': None, 'accuracy': None, 'f1': None}, 'valid': {'mae': 0.7580452113758052, 'r2': -7722.723816671717, 'accuracy': 1.0, 'f1': 1.0}}, 'T_31_O_31': {'train': {'mae': None, 'r2': None, 'accuracy': None, 'f1': None}, 'valid': {'mae': 0.5641397042119296, 'r2': -17822.474144681797, 'accuracy': 1.0, 'f1': 1.0}}}


### Training - Classfication (y_class)

In [67]:
# objective
# regession : "mae", "mse"
# classification - binary : "binary"
# classification - binary : "multiclass" (num_class=n)
# ranking : "xe_ndcg_mart"

# metric
# regession : "mae", "mse", "rmse"
# classification - binary : "binary_logloss", "binary_error", "auc"
# classification - muticlass : "multi_logloss", "multi_error"
# ranking : "ndcg", "map"

fixed_params = {
    "n_estimators": 1000,
    "learning_rate": 1e-2,
    "max_depth": 6,
}

searching_params = {
    "reg_lambda": [1.0, 0.1],
    "subsample": [0.5, 0.8],
    "colsample_bytree": [0.5, 0.8],
    "random_state": [1234, 42],
}

preset_params = []
for params in list(product(*searching_params.values())):
    print(params)
    tmp = fixed_params.copy()
    tmp["reg_lambda"] = params[0]
    tmp["subsample"] = params[1]
    tmp["colsample_bytree"] = params[2]
    tmp["random_state"] = params[3]
    preset_params.append(tmp)

preset_params = preset_params[:3] if CFG.debug else preset_params
len(preset_params)

(1.0, 0.5, 0.5, 1234)
(1.0, 0.5, 0.5, 42)
(1.0, 0.5, 0.8, 1234)
(1.0, 0.5, 0.8, 42)
(1.0, 0.8, 0.5, 1234)
(1.0, 0.8, 0.5, 42)
(1.0, 0.8, 0.8, 1234)
(1.0, 0.8, 0.8, 42)
(0.1, 0.5, 0.5, 1234)
(0.1, 0.5, 0.5, 42)
(0.1, 0.5, 0.8, 1234)
(0.1, 0.5, 0.8, 42)
(0.1, 0.8, 0.5, 1234)
(0.1, 0.8, 0.5, 42)
(0.1, 0.8, 0.8, 1234)
(0.1, 0.8, 0.8, 42)


16

In [68]:
line_split = [
    # A
    ["A_31"],
    # O, T
    ["T_31", "O_31"],
]
score_dic = {
    "mae": None,
    "r2": None,
    "accuracy": None,
    "f1": None,
}
line_split_f1 = {}
line_score = {"_".join(i): {"train": score_dic.copy(), "valid": score_dic.copy()} for i in line_split}
valid_pred = {"_".join(i): None for i in line_split}
test_pred = {"_".join(i): None for i in line_split}

for line in line_split:
    seed_everything()
    # Training
    # === Preprocessing (Train) ===
    full_x = df_full[df_full["product_code"].isin(line)]
    
    # Label Encoding on Line
    full_x["line"] = lbe.transform(full_x["line"])
    
    full_y = full_x["y_quality"].values
    full_y_cls = full_x["y_class"].values
    # Drop columns
    full_x = full_x.drop(["product_id", "y_class", "y_quality", "timestamp", "product_code"], axis=1)
    
    full_x = full_x[full_x.columns[~(full_x.var() == 0).values & ~full_x.isna().all().values]]
    full_x = full_x.fillna(-1.0)
    full_x = full_x.T.drop_duplicates().T
    full_x = full_x.drop_duplicates().reset_index(drop=True)
    
    selected_vars = full_x.columns
    cat_vars = ["line"]
    num_vars = diff(selected_vars, cat_vars)
    categoIdx = findIdx(selected_vars, cat_vars)

    full_x[num_vars] = full_x[num_vars].astype("float32")
    full_x[cat_vars] = full_x[cat_vars].astype("int32")
    print(full_x.shape)
    
    # === Preprocessing (Test) ===
    test_x = df_test[df_test["product_code"].isin(line)]
    test_x["line"] = lbe.transform(test_x["line"])
    test_x = test_x[selected_vars]
    test_x = test_x.fillna(-1.0)
    
    test_x[num_vars] = test_x[num_vars].astype("float32")
    test_x[cat_vars] = test_x[cat_vars].astype("int32")
    
    class_weight = compute_class_weight(class_weight="balanced", classes=[0, 1, 2], y=full_y_cls)
    line_full_pred = np.zeros((len(full_x), 3))
    line_test_pred = np.zeros((len(test_x), 3))
    for params in tqdm(preset_params):
        model = xgb.XGBClassifier(verbosity=1, **params)
        model.fit(full_x, full_y_cls, sample_weight=class_weight[full_y_cls], verbose=int(params["n_estimators"] * 0.2))
        line_full_pred[:] += model.predict_proba(full_x) / len(preset_params)
        line_test_pred[:] += model.predict_proba(test_x) / len(preset_params)

    # Evaluation
    y_pred = line_full_pred.copy()
    y_true = full_y_cls.copy()

    # Transform quality to class
    y_true_class = y_true
    y_pred_class = y_pred.argmax(axis=1)
    eval_acc = accuracy_score(y_true_class, y_pred_class)
    eval_f1 = f1_score(y_true_class, y_pred_class, average="macro")
    
    # Save values
    line_score["_".join(line)]["valid"]["mae"] = eval_mae
    line_score["_".join(line)]["valid"]["r2"] = eval_r2
    line_score["_".join(line)]["valid"]["accuracy"] = eval_acc
    line_score["_".join(line)]["valid"]["f1"] = eval_f1
    valid_pred["_".join(line)] = y_pred
    
    # Inference
    y_pred = line_test_pred.copy()
    
    # Save values
    test_pred["_".join(line)] = y_pred

df_meta["cls_cls"] = [valid_pred, test_pred]

(249, 1173)


  0%|                                                                                                                                | 0/16 [00:00<?, ?it/s]



  6%|███████▌                                                                                                                | 1/16 [00:11<02:52, 11.52s/it]



 12%|███████████████                                                                                                         | 2/16 [00:23<02:42, 11.64s/it]



 19%|██████████████████████▌                                                                                                 | 3/16 [00:36<02:42, 12.51s/it]



 25%|██████████████████████████████                                                                                          | 4/16 [00:50<02:35, 12.92s/it]



 31%|█████████████████████████████████████▌                                                                                  | 5/16 [01:02<02:20, 12.73s/it]



 38%|█████████████████████████████████████████████                                                                           | 6/16 [01:15<02:06, 12.67s/it]



 44%|████████████████████████████████████████████████████▌                                                                   | 7/16 [01:30<02:00, 13.44s/it]



 50%|████████████████████████████████████████████████████████████                                                            | 8/16 [01:45<01:51, 13.88s/it]



 56%|███████████████████████████████████████████████████████████████████▌                                                    | 9/16 [01:56<01:31, 13.13s/it]



 62%|██████████████████████████████████████████████████████████████████████████▍                                            | 10/16 [02:08<01:15, 12.61s/it]



 69%|█████████████████████████████████████████████████████████████████████████████████▊                                     | 11/16 [02:21<01:03, 12.76s/it]



 75%|█████████████████████████████████████████████████████████████████████████████████████████▎                             | 12/16 [02:34<00:51, 12.85s/it]



 81%|████████████████████████████████████████████████████████████████████████████████████████████████▋                      | 13/16 [02:46<00:37, 12.59s/it]



 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████▏              | 14/16 [02:58<00:24, 12.39s/it]



 94%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▌       | 15/16 [03:13<00:13, 13.16s/it]



100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [03:27<00:00, 12.97s/it]


(349, 452)


  0%|                                                                                                                                | 0/16 [00:00<?, ?it/s]



  6%|███████▌                                                                                                                | 1/16 [00:05<01:25,  5.71s/it]



 12%|███████████████                                                                                                         | 2/16 [00:11<01:20,  5.74s/it]



 19%|██████████████████████▌                                                                                                 | 3/16 [00:18<01:21,  6.28s/it]



 25%|██████████████████████████████                                                                                          | 4/16 [00:25<01:18,  6.54s/it]



 31%|█████████████████████████████████████▌                                                                                  | 5/16 [00:31<01:10,  6.41s/it]



 38%|█████████████████████████████████████████████                                                                           | 6/16 [00:37<01:03,  6.32s/it]



 44%|████████████████████████████████████████████████████▌                                                                   | 7/16 [00:45<01:00,  6.73s/it]



 50%|████████████████████████████████████████████████████████████                                                            | 8/16 [00:52<00:55,  6.97s/it]



 56%|███████████████████████████████████████████████████████████████████▌                                                    | 9/16 [00:58<00:46,  6.59s/it]



 62%|██████████████████████████████████████████████████████████████████████████▍                                            | 10/16 [01:04<00:38,  6.34s/it]



 69%|█████████████████████████████████████████████████████████████████████████████████▊                                     | 11/16 [01:11<00:32,  6.53s/it]



 75%|█████████████████████████████████████████████████████████████████████████████████████████▎                             | 12/16 [01:18<00:26,  6.66s/it]



 81%|████████████████████████████████████████████████████████████████████████████████████████████████▋                      | 13/16 [01:24<00:19,  6.44s/it]



 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████▏              | 14/16 [01:30<00:12,  6.29s/it]



 94%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▌       | 15/16 [01:37<00:06,  6.58s/it]



100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [01:44<00:00,  6.54s/it]


In [71]:
print(line_score)

{'A_31': {'train': {'mae': None, 'r2': None, 'accuracy': None, 'f1': None}, 'valid': {'mae': 0.5641397042119296, 'r2': -17822.474144681797, 'accuracy': 1.0, 'f1': 1.0}}, 'T_31_O_31': {'train': {'mae': None, 'r2': None, 'accuracy': None, 'f1': None}, 'valid': {'mae': 0.5641397042119296, 'r2': -17822.474144681797, 'accuracy': 1.0, 'f1': 1.0}}}


## Meta Learning (no class weight,  normalizing)

In [185]:
output_prob_valid = {}
output_prob_test = {}

for line in line_split:
    full_x = []
    test_x = []
    for k, v in df_meta.items():
        full_x.append(v[0]["_".join(line)])
        test_x.append(v[1]["_".join(line)])
    full_x = np.concatenate([i.reshape(-1, 1) if len(i.shape) == 1 else i for i in full_x], axis=1)
    full_x[: ,0] = (full_x[: ,0] - 0.5) * 10
    full_x[: ,4] = (full_x[: ,4]) / 3
    full_y = df_full.loc[df_full["product_code"].isin(line), "y_class"].values
    test_x = np.concatenate([i.reshape(-1, 1) if len(i.shape) == 1 else i for i in test_x], axis=1)
    test_x[: ,0] = (test_x[: ,0] - 0.5) * 10
    test_x[: ,4] = (test_x[: ,4]) / 3

    meta_learner = lm.LogisticRegression(multi_class="multinomial", penalty='elasticnet', solver="saga", l1_ratio=0.5, class_weight=None, random_state=42)
    meta_learner.fit(full_x, full_y)
    output_prob_valid["_".join(line)] = meta_learner.predict_proba(full_x)
    output_prob_test["_".join(line)] = meta_learner.predict_proba(test_x)
    
df_meta["meta_learning"] = [output_prob_valid, output_prob_test]

## Threshold Optimization

### Create infernced value table for analysis

In [217]:
raw_output_container_train = []
raw_output_container_test = []

for line in line_split:
    train_x = []
    test_x = []
    for k, v in df_meta.items():
        train_x.append(v[0]["_".join(line)])
        test_x.append(v[1]["_".join(line)])
    train_x = np.concatenate([i.reshape(-1, 1) if len(i.shape) == 1 else i for i in train_x], axis=1)
    train_y = df_full.loc[df_full["product_code"].isin(line), "y_class"].values
    test_x = np.concatenate([i.reshape(-1, 1) if len(i.shape) == 1 else i for i in test_x], axis=1)
    
    raw_output = pd.DataFrame(train_x, columns=["reg_qual", "cls_cls0", "cls_cls1", "cls_cls2", "reg_cls", "meta_learning_cls0", "meta_learning_cls1", "meta_learning_cls2"])
    raw_output["y_true"] = train_y
    raw_output["product_code"] = df_full.loc[df_full["product_code"].isin(line), "product_code"].values
    raw_output_container_train.append(raw_output)

    raw_output = pd.DataFrame(test_x, columns=["reg_qual", "cls_cls0", "cls_cls1", "cls_cls2", "reg_cls", "meta_learning_cls0", "meta_learning_cls1", "meta_learning_cls2"])
    raw_output["product_code"] = df_test.loc[df_test["product_code"].isin(line), "product_code"].values
    raw_output["product_id"] = df_test.loc[df_test["product_code"].isin(line), "product_id"].values
    raw_output_container_test.append(raw_output)

In [218]:
raw_output_container_train = pd.concat(raw_output_container_train)
raw_output_container_train.head()

Unnamed: 0,reg_qual,cls_cls0,cls_cls1,cls_cls2,reg_cls,meta_learning_cls0,meta_learning_cls1,meta_learning_cls2,y_true,product_code
0,0.519407,0.977511,0.014327,0.008162,0.00422,0.976175,0.012453,0.011371,0,A_31
1,0.532045,0.004169,0.965334,0.030497,1.026221,0.006327,0.986816,0.006857,1,A_31
2,0.537199,0.018024,0.02987,0.952105,1.983897,0.010572,0.011909,0.977518,2,A_31
3,0.533947,0.029058,0.951198,0.019743,0.986219,0.007024,0.986045,0.006931,1,A_31
4,0.532109,0.006753,0.977561,0.015687,1.008253,0.006155,0.987474,0.006371,1,A_31


In [219]:
with pd.ExcelWriter(architecture_path + "train_threshold_analysis.xlsx") as writer:
    raw_output_container_train.groupby("y_true").describe().T.reset_index().to_excel(writer, sheet_name="all")
    for line in line_split:
        df_tmp = raw_output_container_train[raw_output_container_train["product_code"].isin(line)].groupby("y_true").describe().T.reset_index()
        df_tmp.to_excel(writer, sheet_name="_".join(line))

In [220]:
raw_output_container_test = pd.concat(raw_output_container_test)
raw_output_container_test.head()

Unnamed: 0,reg_qual,cls_cls0,cls_cls1,cls_cls2,reg_cls,meta_learning_cls0,meta_learning_cls1,meta_learning_cls2,product_code,product_id
0,0.524071,0.121408,0.8644,0.014192,0.536509,0.012408,0.979189,0.008404,A_31,TEST_003
1,0.531568,0.054616,0.837202,0.108182,1.107915,0.010555,0.97679,0.012656,A_31,TEST_004
2,0.533111,0.023887,0.835881,0.140231,1.223501,0.009728,0.976171,0.014101,A_31,TEST_005
3,0.532295,0.073844,0.888689,0.037468,0.835941,0.009695,0.981736,0.00857,A_31,TEST_006
4,0.526138,0.877041,0.082427,0.040532,0.382443,0.959212,0.022046,0.018742,A_31,TEST_007


In [221]:
df_tmp = df_test[["product_id", "product_code"]]
for line in line_split:
    df_tmp.loc[df_tmp["product_code"].isin(line).values, raw_output_container_test.columns[:-1]] = raw_output_container_test[raw_output_container_test["product_code"].isin(line).values].iloc[:, :-1].values

## Inference with Threshold

In [222]:
# # 주혁님 threshold 계산 algorithm
# submit = pd.read_csv('/content/모델4개2.csv', encoding = 'cp949')
# submit['0.748 결과'][(submit['0.748 결과'] != 0) & (submit['class0'] > 0.45) & (submit['Class를 회귀로(모델3개)'] <= 0.75) & (submit['Class를 회귀로(모델1개)'] <= 0.75)] = 0
# s = pd.read_csv('/content/drive/MyDrive/LG_Aimers2/open (7)/sample_submission.csv')
# s['Y_Class'] = submit['0.748 결과']
# s.to_csv('SotaToCha_0.csv',index=False)

In [223]:
def get_threshold_params(x, norm_params={"reg_qual": 0.05, "reg_cls": 0.5, "cls_cls": 0.05, "meta_learning": 0.05}):
    '''
        reg_qual : percentage of reduction on upper and lower bound for class 1
        cls_reg : multiplier of standard deviation on upper and lower bound for class 1 (mean + std * alpha, mean - std * alpha)
        cls_cls : minimum probability for class 1
        meta_learning : minimum probability for class 1 (same as cls_cls)
    '''

    threshold_dic = {"reg_qual": {}, "reg_cls": {}, "cls_cls": {}, "meta_learning": {}}

    df_tmp = x.copy()
    df_tmp["tmp"] = df_tmp["product_code"].apply(lambda x: "A_31" if x == "A_31" else "T_31_O_31")

    # reg_qual
    df_stats = df_tmp.groupby(["tmp", "y_class"])["y_quality"].describe().T
    for k in ["A_31", "T_31_O_31"]:
        tmp_range = (df_stats[k].loc["max", 1] - df_stats[k].loc["min", 1])
        threshold_dic["reg_qual"][k] = [df_stats[k].loc["min", 1] + tmp_range * norm_params["reg_qual"], df_stats[k].loc["max", 1] - tmp_range * norm_params["reg_qual"]]

    # reg_cls
    df_stats = df_tmp.groupby("tmp")["y_class"].describe().T
    for k in ["A_31", "T_31_O_31"]:
        threshold_dic["reg_cls"][k] = [df_stats.loc["mean", k] - df_stats.loc["std", k] * norm_params["reg_cls"], df_stats.loc["mean", k] + df_stats.loc["std", k] * norm_params["reg_cls"]]

    # cls_cls
    df_stats = df_tmp.groupby("tmp")["y_class"].value_counts(normalize=True)
    for k in ["A_31", "T_31_O_31"]:
        threshold_dic["cls_cls"][k] = [df_stats[(k, 1)] * (1 + norm_params["cls_cls"]), 1]

    # meta_learning
    df_stats = df_tmp.groupby("tmp")["y_class"].value_counts(normalize=True)
    for k in ["A_31", "T_31_O_31"]:
        threshold_dic["meta_learning"][k] = [df_stats[(k, 1)] * (1 + norm_params["meta_learning"]), 1]

    for k, v in threshold_dic.items():
        print(k)
        print(v)

    return threshold_dic

In [224]:
def get_inference_label(x, class_spliter):
    df_tmp = x.copy()
    df_tmp["tmp"] = df_tmp["product_code"].apply(lambda x: "A_31" if x == "A_31" else "T_31_O_31")

    # reg_qual
    df_tmp["label_reg_qual"] = 1
    for k, v in class_spliter["reg_qual"].items():
        tmp_labeld = []
        for i in df_tmp.loc[df_tmp["tmp"] == k, "reg_qual"].values:
            if i < v[0]:
                tmp_labeld.append(0)
            elif i > v[1]:
                tmp_labeld.append(2)
            else:
                tmp_labeld.append(1)
        df_tmp.loc[df_tmp["tmp"] == k, "label_reg_qual"] = tmp_labeld

    # reg_cls
    df_tmp["label_reg_cls"] = 1
    for k, v in class_spliter["reg_cls"].items():
        tmp_labeld = []
        for i in df_tmp.loc[df_tmp["tmp"] == k, "reg_cls"].values:
            if i < v[0]:
                tmp_labeld.append(0)
            elif i > v[1]:
                tmp_labeld.append(2)
            else:
                tmp_labeld.append(1)
        df_tmp.loc[df_tmp["tmp"] == k, "label_reg_cls"] = tmp_labeld

    # cls_cls
    df_tmp["label_cls_cls"] = 1
    for k, v in class_spliter["cls_cls"].items():
        tmp_labeld = []
        for i in df_tmp.loc[df_tmp["tmp"] == k, ["cls_cls0", "cls_cls1", "cls_cls2"]].values:
            if (i[1] >= v[0]) & (i[1] <= v[1]):
                tmp_labeld.append(1)
            else:
                tmp_labeld.append(0 if np.argmax([i[0], i[2]]) == 0 else 2)
        df_tmp.loc[df_tmp["tmp"] == k, "label_cls_cls"] = tmp_labeld

    # meta_learning
    df_tmp["label_meta_learning"] = 1
    for k, v in class_spliter["meta_learning"].items():
        tmp_labeld = []
        for i in df_tmp.loc[df_tmp["tmp"] == k, ["meta_learning_cls0", "meta_learning_cls1", "meta_learning_cls2"]].values:
            if (i[1] >= v[0]) & (i[1] <= v[1]):
                tmp_labeld.append(1)
            else:
                tmp_labeld.append(0 if np.argmax([i[0], i[2]]) == 0 else 2)
        df_tmp.loc[df_tmp["tmp"] == k, "label_meta_learning"] = tmp_labeld
    
    # ensemble prediction
    df_tmp["counter"] = df_tmp.filter(regex="label_*").apply(lambda x: Counter(x).most_common(3), axis=1)
    df_tmp["majority_vote"] = df_tmp["counter"].apply(lambda x: x[0][0])
    casting_voter = "label_meta_learning"
    print((df_tmp["counter"].apply(len) > 1).sum())
    df_tmp.loc[(df_tmp["counter"].apply(len) > 1), "majority_vote"] = df_tmp.loc[(df_tmp["counter"].apply(len) > 1), casting_voter].values
    df_tmp = df_tmp.drop("counter", axis=1)
    
    return df_tmp

In [225]:
params = {"reg_qual": -0.05, "reg_cls": 0.5, "cls_cls": -0.05, "meta_learning": 0.175}
class_spliter = get_threshold_params(df_full, params)

reg_qual
{'A_31': [0.52459785685, 0.53533071415], 'T_31_O_31': [0.5247315075, 0.5353176984999999]}
reg_cls
{'A_31': [0.6814556170952801, 1.4068977965593386], 'T_31_O_31': [0.8039269511291918, 1.21899568497396]}
cls_cls
{'A_31': [0.4502008032128514, 1], 'T_31_O_31': [0.7866762177650429, 1]}
meta_learning
{'A_31': [0.5568273092369478, 1], 'T_31_O_31': [0.9729942693409742, 1]}


In [226]:
train_inference = get_inference_label(raw_output_container_train, class_spliter)
for i in ["label_reg_qual", "label_reg_cls", "label_cls_cls", "label_meta_learning", "majority_vote"]:
    print(f"=== {i} ===")
    print("value counts")
    display(train_inference[i].value_counts())
    display(train_inference[i].value_counts(normalize=True))
    display(train_inference.groupby("product_code")[i].value_counts(normalize=True))
    print("\n")
#     print("acc:", accuracy_score(train_inference["y_true"], test_inference[i]))
#     print("f1:", f1_score(train_inference["y_true"], train_inference[i], average="macro"))

26
=== label_reg_qual ===
value counts


1    433
2     93
0     72
Name: label_reg_qual, dtype: int64

1    0.724080
2    0.155518
0    0.120401
Name: label_reg_qual, dtype: float64

product_code  label_reg_qual
A_31          1                 0.518072
              2                 0.277108
              0                 0.204819
O_31          1                 1.000000
T_31          1                 0.868805
              2                 0.069971
              0                 0.061224
Name: label_reg_qual, dtype: float64



=== label_reg_cls ===
value counts


1    407
2    103
0     88
Name: label_reg_cls, dtype: int64

1    0.680602
2    0.172241
0    0.147157
Name: label_reg_cls, dtype: float64

product_code  label_reg_cls
A_31          1                0.473896
              2                0.285141
              0                0.240964
O_31          1                0.666667
              2                0.333333
T_31          1                0.830904
              2                0.087464
              0                0.081633
Name: label_reg_cls, dtype: float64



=== label_cls_cls ===
value counts


1    407
2    103
0     88
Name: label_cls_cls, dtype: int64

1    0.680602
2    0.172241
0    0.147157
Name: label_cls_cls, dtype: float64

product_code  label_cls_cls
A_31          1                0.473896
              2                0.285141
              0                0.240964
O_31          1                0.666667
              2                0.333333
T_31          1                0.830904
              2                0.087464
              0                0.081633
Name: label_cls_cls, dtype: float64



=== label_meta_learning ===
value counts


1    407
2    103
0     88
Name: label_meta_learning, dtype: int64

1    0.680602
2    0.172241
0    0.147157
Name: label_meta_learning, dtype: float64

product_code  label_meta_learning
A_31          1                      0.473896
              2                      0.285141
              0                      0.240964
O_31          1                      0.666667
              2                      0.333333
T_31          1                      0.830904
              2                      0.087464
              0                      0.081633
Name: label_meta_learning, dtype: float64



=== majority_vote ===
value counts


1    407
2    103
0     88
Name: majority_vote, dtype: int64

1    0.680602
2    0.172241
0    0.147157
Name: majority_vote, dtype: float64

product_code  majority_vote
A_31          1                0.473896
              2                0.285141
              0                0.240964
O_31          1                0.666667
              2                0.333333
T_31          1                0.830904
              2                0.087464
              0                0.081633
Name: majority_vote, dtype: float64





In [227]:
for i in ["label_reg_qual", "label_reg_cls", "label_cls_cls", "majority_vote"]:
    print(f"=== {i} ===")
    print("acc:", accuracy_score(train_inference["y_true"], train_inference[i]))
    print("f1:", f1_score(train_inference["y_true"], train_inference[i], average="macro"))

=== label_reg_qual ===
acc: 0.9565217391304348
f1: 0.9393424036281179
=== label_reg_cls ===
acc: 1.0
f1: 1.0
=== label_cls_cls ===
acc: 1.0
f1: 1.0
=== majority_vote ===
acc: 1.0
f1: 1.0


In [228]:
test_inference = get_inference_label(raw_output_container_test, class_spliter)
for i in ["label_reg_qual", "label_reg_cls", "label_cls_cls", "label_meta_learning", "majority_vote"]:
    print(f"=== {i} ===")
    print("value counts")
    display(test_inference[i].value_counts())
    display(test_inference[i].value_counts(normalize=True))
    display(test_inference.groupby("product_code")[i].value_counts(normalize=True))
    print("\n")
#     print("acc:", accuracy_score(train_inference["y_true"], test_inference[i]))
#     print("f1:", f1_score(train_inference["y_true"], train_inference[i], average="macro"))

86
=== label_reg_qual ===
value counts


1    271
0     22
2     17
Name: label_reg_qual, dtype: int64

1    0.874194
0    0.070968
2    0.054839
Name: label_reg_qual, dtype: float64

product_code  label_reg_qual
A_31          1                 0.731343
              0                 0.223881
              2                 0.044776
O_31          1                 1.000000
T_31          1                 0.912134
              2                 0.058577
              0                 0.029289
Name: label_reg_qual, dtype: float64



=== label_reg_cls ===
value counts


1    259
0     35
2     16
Name: label_reg_cls, dtype: int64

1    0.835484
0    0.112903
2    0.051613
Name: label_reg_cls, dtype: float64

product_code  label_reg_cls
A_31          1                0.611940
              0                0.388060
O_31          1                1.000000
T_31          1                0.895397
              2                0.066946
              0                0.037657
Name: label_reg_cls, dtype: float64



=== label_cls_cls ===
value counts


1    252
0     42
2     16
Name: label_cls_cls, dtype: int64

1    0.812903
0    0.135484
2    0.051613
Name: label_cls_cls, dtype: float64

product_code  label_cls_cls
A_31          0                0.611940
              1                0.328358
              2                0.059701
O_31          1                1.000000
T_31          1                0.945607
              2                0.050209
              0                0.004184
Name: label_cls_cls, dtype: float64



=== label_meta_learning ===
value counts


1    264
0     38
2      8
Name: label_meta_learning, dtype: int64

1    0.851613
0    0.122581
2    0.025806
Name: label_meta_learning, dtype: float64

product_code  label_meta_learning
A_31          0                      0.567164
              1                      0.358209
              2                      0.074627
O_31          1                      1.000000
T_31          1                      0.987448
              2                      0.012552
Name: label_meta_learning, dtype: float64



=== majority_vote ===
value counts


1    264
0     38
2      8
Name: majority_vote, dtype: int64

1    0.851613
0    0.122581
2    0.025806
Name: majority_vote, dtype: float64

product_code  majority_vote
A_31          0                0.567164
              1                0.358209
              2                0.074627
O_31          1                1.000000
T_31          1                0.987448
              2                0.012552
Name: majority_vote, dtype: float64





In [229]:
test_inference

Unnamed: 0,reg_qual,cls_cls0,cls_cls1,cls_cls2,reg_cls,meta_learning_cls0,meta_learning_cls1,meta_learning_cls2,product_code,product_id,tmp,label_reg_qual,label_reg_cls,label_cls_cls,label_meta_learning,majority_vote
0,0.524071,0.121408,0.864400,0.014192,0.536509,0.012408,0.979189,0.008404,A_31,TEST_003,A_31,0,0,1,1,1
1,0.531568,0.054616,0.837202,0.108182,1.107915,0.010555,0.976790,0.012656,A_31,TEST_004,A_31,1,1,1,1,1
2,0.533111,0.023887,0.835881,0.140231,1.223501,0.009728,0.976171,0.014101,A_31,TEST_005,A_31,1,1,1,1,1
3,0.532295,0.073844,0.888689,0.037468,0.835941,0.009695,0.981736,0.008570,A_31,TEST_006,A_31,1,1,1,1,1
4,0.526138,0.877041,0.082427,0.040532,0.382443,0.959212,0.022046,0.018742,A_31,TEST_007,A_31,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
238,0.527853,0.023972,0.953319,0.022709,0.936865,0.003062,0.993919,0.003018,T_31,TEST_305,T_31_O_31,1,1,1,1,1
239,0.527677,0.060022,0.902709,0.037269,0.902171,0.003894,0.992464,0.003642,T_31,TEST_306,T_31_O_31,1,1,1,1,1
240,0.528133,0.024509,0.954418,0.021073,1.026776,0.003020,0.993947,0.003033,T_31,TEST_307,T_31_O_31,1,1,1,1,1
241,0.528046,0.046015,0.897660,0.056326,1.017740,0.003790,0.992301,0.003909,T_31,TEST_308,T_31_O_31,1,1,1,1,1


In [230]:
test_inference.sort_values("product_id").to_csv(architecture_path + "test_rawoutput.csv", index=False)

## Submission

In [231]:
inference_col = "majority_vote"

In [232]:
df_test_tmp = df_test[["product_id", "line"]]
df_test_tmp["target_class"] = 1

for line in line_split:
    df_test_tmp.loc[df_tmp["product_code"].isin(line), "target_class"] = test_inference.loc[test_inference["product_code"].isin(line), inference_col].values

df_test_tmp["target_class"] = df_test_tmp["target_class"].astype("int32")

In [233]:
df_test_tmp

Unnamed: 0,product_id,line,target_class
0,TEST_000,T100306,1
1,TEST_001,T100304,1
2,TEST_002,T100304,1
3,TEST_003,T010305,1
4,TEST_004,T010306,1
...,...,...,...
305,TEST_305,T100306,1
306,TEST_306,T100304,1
307,TEST_307,T100306,1
308,TEST_308,T100306,1


In [234]:
submission = pd.read_csv("../datasets/sample_submission.csv")
submission["Y_Class"] = df_test_tmp["target_class"].values

In [235]:
submission

Unnamed: 0,PRODUCT_ID,Y_Class
0,TEST_000,1
1,TEST_001,1
2,TEST_002,1
3,TEST_003,1
4,TEST_004,1
...,...,...
305,TEST_305,1
306,TEST_306,1
307,TEST_307,1
308,TEST_308,1


In [237]:
submission.to_csv(architecture_path + "submission_" + architecture_name + ".csv", index=False)