## Setup

In [114]:
GLOBAL_SEED = 42

import os
os.environ['PYTHONHASHSEED'] = str(GLOBAL_SEED)
import sys

import gc
from tqdm import tqdm
import datetime
import pickle
import random as rnd
from glob import glob
import pandas as pd
import numpy as np
from numpy import random as np_rnd
import warnings
from math import ceil

import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib import rcParams
from itertools import combinations
from collections import Counter

import lightgbm as lgb
import xgboost as xgb
import catboost as cat

import optuna
from optuna import Trial, create_study
from optuna.samplers import TPESampler

from sklearn import linear_model as lm
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold

from sklearn.impute import KNNImputer
# explicitly require this experimental feature
from sklearn.experimental import enable_iterative_imputer  # noqa
# now you can import normally from sklearn.impute
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from itertools import product

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

from scipy.stats import f_oneway
from scipy.stats import pearsonr

# display setting
warnings.filterwarnings(action='ignore')
rcParams['axes.unicode_minus'] = False

In [115]:
def seed_everything(seed=42):
    os.environ['PYTHONHASHSEED'] = str(seed)
    # python random
    rnd.seed(seed)
    # numpy random
    np_rnd.seed(seed)
    # tf random
    try:
        tf_rnd.set_seed(seed)
    except:
        pass
    # RAPIDS random
    try:
        cupy.random.seed(seed)
    except:
        pass
    # pytorch random
    try:
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        torch.backends.cudnn.deterministic = True
    except:
        pass

def pickleIO(obj, src, op="w"):
    if op=="w":
        with open(src, op + "b") as f:
            pickle.dump(obj, f)
    elif op=="r":
        with open(src, op + "b") as f:
            tmp = pickle.load(f)
        return tmp
    else:
        print("unknown operation")
        return obj

def findIdx(data_x, col_names):
    return [int(i) for i, j in enumerate(data_x) if j in col_names]

def diff(first, second):
    second = set(second)
    return [item for item in first if item not in second]

def createFolder(directory):
    try:
        if not os.path.exists(directory):
            os.makedirs(directory)
    except OSError:
        print('Error: Creating directory. ' + directory)
        
def week_of_month(dt):
    """ 
        Returns the week of the month for the specified date.
    """
    first_day = dt.replace(day=1)
    dom = dt.day
    adjusted_dom = dom + (1 + first_day.weekday()) % 7
    return int(ceil(adjusted_dom/7.0))

def quality_to_class(x, threshold):
    y_pred_class = []
    for i in x:
        tmp_class = len(threshold)
        for k, v in threshold.items():
            if i < v:
                tmp_class = k
                break
        y_pred_class.append(tmp_class)
    return y_pred_class

In [116]:
class CFG:
    debug = False
    TF = True
    product_mapper = {
        "A": ["T010305", "T010306", "T050304", "T050307"],
        "O": ["T100304", "T100306"],
        "T": ["T100304", "T100306"],
    }
    line_mapper = {
        "T010305": "A", "T010306": "A", "T050304": "A", "T050307": "A",
        "T100304": "O_T", "T100306": "O_T",
    }
    line_groups = [
        ["T010305", "T010306"],
        ["T050304", "T050307"],
        ["T100304", "T100306"],
    ]
    classes = [0, 1, 2]
    time_features = ["month", "day", "weekday", "week_of_month", "hour", "office_hour", "sec_in_day", "sin_in_day", "cos_in_day"]

In [117]:
quality_threshold = pickleIO(None, "../datasets/dataset_valid/quality_threshold.pkl", "r")
quality_threshold['T010305_T010306_T050304_T050307'] = {0: (0.525046 + 0.525086) / 2, 1: (0.534843 + 0.535279) / 2}

In [118]:
quality_threshold = {
    "T010305_T010306": {0: 0.52507, 1: 0.53490},
    "T050304_T050307": {0: 0.52507, 1: 0.53490},
    "T100304_T100306": {0: 0.52507, 1: 0.53490},
}

In [119]:
quality_threshold

{'T010305_T010306': {0: 0.52507, 1: 0.5349},
 'T050304_T050307': {0: 0.52507, 1: 0.5349},
 'T100304_T100306': {0: 0.52507, 1: 0.5349}}

In [120]:
createFolder("./architectures/ensemble/")

## 1. Ensemble predictions with SOTA

In [121]:
architecture_root_path = "./architectures/"
architecture_name = "daheeRef_targetQual_bylines_TF_cat_gbm_metaLearning_try2"
architecture_path = architecture_root_path + architecture_name + "/"

In [122]:
# Loading my architecture
mynb = pd.read_csv(architecture_path + "test_rawoutput.csv")

In [123]:
mynb

Unnamed: 0,reg_qual,cls_cls0,cls_cls1,cls_cls2,reg_cls,meta_learning_cls0,meta_learning_cls1,meta_learning_cls2,product_code,product_id,tmp,label_reg_qual,label_reg_cls,label_cls_cls,label_meta_learning,majority_vote
0,0.530398,0.024127,0.948472,0.027402,0.992397,0.003376,0.993208,0.003416,T_31,TEST_000,T_31_O_31,1,1,1,1,1
1,0.532199,0.073577,0.596579,0.329845,1.236157,0.015790,0.952169,0.032040,T_31,TEST_001,T_31_O_31,1,1,2,2,1
2,0.531471,0.026486,0.905740,0.067774,1.100129,0.003844,0.991825,0.004331,T_31,TEST_002,T_31_O_31,1,1,1,1,1
3,0.524192,0.099548,0.882260,0.018192,0.575829,0.011901,0.979461,0.008638,A_31,TEST_003,A_31,0,0,1,1,0
4,0.531549,0.063362,0.826198,0.110439,1.058468,0.012233,0.973575,0.014191,A_31,TEST_004,A_31,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,0.528816,0.197093,0.643421,0.159485,1.071927,0.015831,0.969022,0.015147,T_31,TEST_305,T_31_O_31,1,1,0,0,1
306,0.527541,0.139535,0.727657,0.132809,1.085151,0.009392,0.980988,0.009620,T_31,TEST_306,T_31_O_31,1,1,1,1,1
307,0.529334,0.104771,0.736675,0.158555,1.269602,0.008200,0.981703,0.010097,T_31,TEST_307,T_31_O_31,1,2,1,1,1
308,0.528922,0.110445,0.691754,0.197801,1.207066,0.010218,0.976420,0.013362,T_31,TEST_308,T_31_O_31,1,1,1,1,1


In [124]:
public_nbs = pd.read_csv('./모델4개2.csv', encoding = 'cp949')
public_nbs.columns = public_nbs.columns.str.lower()

# get lb 0.748
public_nbs = public_nbs.rename(columns={"0.748 결과": "label_lb_0.748"})

# get lb 0.758
public_nbs["label_lb_0.758"] = public_nbs["label_lb_0.748"].values
public_nbs['label_lb_0.758'][(public_nbs['label_lb_0.758'] != 0) & (public_nbs['class0'] > 0.45) & (public_nbs['class를 회귀로(모델3개)'] <= 0.75) & (public_nbs['class를 회귀로(모델1개)'] <= 0.75)] = 0

# additional label
public_nbs["label_opt"] = public_nbs["label_lb_0.748"].values
public_nbs['label_opt'][(public_nbs["label_opt"] != 2) & (public_nbs['class0'] > 0.45) & (public_nbs['class를 회귀로(모델3개)'] <= 0.75) & (public_nbs['class를 회귀로(모델1개)'] <= 0.75)] = 0
public_nbs['label_opt'][(public_nbs["label_opt"] != 0) & (public_nbs['class2'] > 0.45) & (public_nbs['class를 회귀로(모델3개)'] >= 1.25) & (public_nbs['class를 회귀로(모델1개)'] >= 1.25)] = 2

In [125]:
print(public_nbs["label_lb_0.748"].value_counts(normalize=True))
print(public_nbs["label_lb_0.758"].value_counts(normalize=True))
print(public_nbs["label_opt"].value_counts(normalize=True))

1    0.887097
0    0.096774
2    0.016129
Name: label_lb_0.748, dtype: float64
1    0.845161
0    0.138710
2    0.016129
Name: label_lb_0.758, dtype: float64
1    0.812903
0    0.138710
2    0.048387
Name: label_opt, dtype: float64


### Get ensemble prediction

In [126]:
casting_voter = "label_lb_0.758"

In [127]:
ensemble = mynb.filter(regex="product_id|label_*").merge(public_nbs.filter(regex="product_id|label_*"), on="product_id")
ensemble["majority_voting"] = ensemble.filter(regex="label_*").apply(lambda x: Counter(x).most_common(3), axis=1)
ensemble["label"] = ensemble["majority_voting"].apply(lambda x: x[0][0])

In [128]:
same_rank_list = []
for i in ensemble["majority_voting"]:
    tmp_cls, tmp_score = list(zip(*i))
    if (max(tmp_score) == pd.Series(tmp_score)).sum() > 1:
        same_rank_list.append(True)
    else:
        same_rank_list.append(False)

# Casting Voter is lb_0.758
ensemble.loc[same_rank_list, "label"] = ensemble.loc[same_rank_list, casting_voter].values

In [129]:
print("Number of same rank1 :", sum(same_rank_list))
ensemble

Number of same rank1 : 0


Unnamed: 0,product_id,label_reg_qual,label_reg_cls,label_cls_cls,label_meta_learning,label_lb_0.748,label_lb_0.758,label_opt,majority_voting,label
0,TEST_000,1,1,1,1,1,1,1,"[(1, 7)]",1
1,TEST_001,1,1,2,2,1,1,2,"[(1, 4), (2, 3)]",1
2,TEST_002,1,1,1,1,1,1,1,"[(1, 7)]",1
3,TEST_003,0,0,1,1,0,0,0,"[(0, 5), (1, 2)]",0
4,TEST_004,1,1,1,1,1,1,1,"[(1, 7)]",1
...,...,...,...,...,...,...,...,...,...,...
305,TEST_305,1,1,0,0,1,1,1,"[(1, 5), (0, 2)]",1
306,TEST_306,1,1,1,1,1,1,1,"[(1, 7)]",1
307,TEST_307,1,2,1,1,1,1,1,"[(1, 6), (2, 1)]",1
308,TEST_308,1,1,1,1,1,1,1,"[(1, 7)]",1


In [130]:
print(ensemble["label"].value_counts())
print(ensemble["label"].value_counts(normalize=True))

1    259
0     43
2      8
Name: label, dtype: int64
1    0.835484
0    0.138710
2    0.025806
Name: label, dtype: float64


In [131]:
ensemble.to_csv("./architectures/ensemble/" + "ensemble1_rawoutput.csv")

## Submission

In [132]:
submission = pd.read_csv("../datasets/sample_submission.csv")
submission["Y_Class"] = ensemble["label"].values

In [133]:
submission

Unnamed: 0,PRODUCT_ID,Y_Class
0,TEST_000,1
1,TEST_001,1
2,TEST_002,1
3,TEST_003,0
4,TEST_004,1
...,...,...
305,TEST_305,1
306,TEST_306,1
307,TEST_307,1
308,TEST_308,1


In [134]:
submission.to_csv("./architectures/ensemble/" + "ensemble1_submission.csv", index=False)

## 2. Ensemble Prediction with Threshold Optimization

In [135]:
df_tmp = public_nbs.copy()
df_tmp = df_tmp.rename(columns={"class0": "pred_public_cls_cls0", "class1": "pred_public_cls_cls1", "class2": "pred_public_cls_cls2"})
df_tmp = df_tmp.rename(columns={"class를 회귀로(모델3개)": "pred_public_reg_cls_multimodels"})
df_tmp = df_tmp.rename(columns={"class를 회귀로(모델1개)": "pred_public_reg_cls_singlemodel"})
df_tmp

Unnamed: 0,product_id,label_lb_0.748,pred_public_cls_cls0,pred_public_cls_cls1,pred_public_cls_cls2,pred_public_reg_cls_multimodels,pred_public_reg_cls_singlemodel,label_lb_0.758,label_opt
0,TEST_000,1,0.050405,0.913329,0.036266,0.912529,0.896953,1,1
1,TEST_001,1,0.109743,0.259731,0.630526,1.340539,1.392440,1,2
2,TEST_002,1,0.098200,0.681939,0.219861,1.137469,1.228641,1,1
3,TEST_003,0,0.089931,0.891287,0.018782,0.653888,0.686387,0,0
4,TEST_004,1,0.075801,0.781753,0.142446,1.118894,1.016366,1,1
...,...,...,...,...,...,...,...,...,...
305,TEST_305,1,0.149982,0.676850,0.173169,1.020343,0.923551,1,1
306,TEST_306,1,0.226041,0.588804,0.185155,1.033545,0.938690,1,1
307,TEST_307,1,0.083189,0.735290,0.181522,1.259804,1.199065,1,1
308,TEST_308,1,0.101191,0.641677,0.257132,1.115454,1.079444,1,1


In [136]:
output = {"pred_public_cls_cls": None, "pred_public_reg_cls_multimodels": None, "pred_public_reg_cls_singlemodel": None}
public_pred_threshold = {"public_cls_cls": [0.4, 1], "public_reg_cls_multimodels": [0.65, 1.35], "public_reg_cls_singlemodel": [0.55, 1.45]}

In [137]:
# Classification
print("=== Classification ===")
tmp_labeld = []
for idx, row in df_tmp.filter(regex="pred_public_cls_cls*").iterrows():
    if (row.values[1] >= public_pred_threshold["public_cls_cls"][0]) & (row.values[1] <= public_pred_threshold["public_cls_cls"][1]):
        tmp_labeld.append(1)
    else:
        tmp_labeld.append(0 if np.argmax([row.values[0], row.values[2]]) == 0 else 2)
df_tmp["label_public_cls_cls"] = tmp_labeld
print(df_tmp["label_public_cls_cls"].value_counts())
print(df_tmp["label_public_cls_cls"].value_counts(normalize=True), "\n\n")

# Regression 3 models
print("=== Regression 3 models ===")
tmp_labeld = []
for row in df_tmp["pred_public_reg_cls_multimodels"].values:
    if row < public_pred_threshold["public_reg_cls_multimodels"][0]:
        tmp_labeld.append(0)
    elif row > public_pred_threshold["public_reg_cls_multimodels"][1]:
        tmp_labeld.append(2)
    else:
        tmp_labeld.append(1)
df_tmp["label_public_reg_cls_multimodels"] = tmp_labeld
print(df_tmp["label_public_reg_cls_multimodels"].value_counts())
print(df_tmp["label_public_reg_cls_multimodels"].value_counts(normalize=True), "\n\n")

# Regression 1 model
print("=== Regression 1 model ===")
tmp_labeld = []
for row in df_tmp["pred_public_reg_cls_singlemodel"].values:
    if row < public_pred_threshold["public_reg_cls_singlemodel"][0]:
        tmp_labeld.append(0)
    elif row > public_pred_threshold["public_reg_cls_singlemodel"][1]:
        tmp_labeld.append(2)
    else:
        tmp_labeld.append(1)
df_tmp["label_public_reg_cls_singlemodel"] = tmp_labeld
print(df_tmp["label_public_reg_cls_singlemodel"].value_counts())
print(df_tmp["label_public_reg_cls_singlemodel"].value_counts(normalize=True), "\n\n")

=== Classification ===
1    257
0     38
2     15
Name: label_public_cls_cls, dtype: int64
1    0.829032
0    0.122581
2    0.048387
Name: label_public_cls_cls, dtype: float64 


=== Regression 3 models ===
1    255
0     42
2     13
Name: label_public_reg_cls_multimodels, dtype: int64
1    0.822581
0    0.135484
2    0.041935
Name: label_public_reg_cls_multimodels, dtype: float64 


=== Regression 1 model ===
1    270
0     23
2     17
Name: label_public_reg_cls_singlemodel, dtype: int64
1    0.870968
0    0.074194
2    0.054839
Name: label_public_reg_cls_singlemodel, dtype: float64 




### Get ensemble prediction

In [138]:
casting_voter = "label_lb_0.758"

In [139]:
ensemble = mynb.filter(regex="product_id|label_*").merge(df_tmp.filter(regex="product_id|label_*"), on="product_id")
ensemble["majority_voting"] = ensemble.filter(regex="label_*").apply(lambda x: Counter(x).most_common(3), axis=1)
ensemble["label"] = ensemble["majority_voting"].apply(lambda x: x[0][0])

In [140]:
same_rank_list = []
for i in ensemble["majority_voting"]:
    tmp_cls, tmp_score = list(zip(*i))
    if (max(tmp_score) == pd.Series(tmp_score)).sum() > 1:
        same_rank_list.append(True)
    else:
        same_rank_list.append(False)

# Casting Voter is lb_0.758
ensemble.loc[same_rank_list, "label"] = ensemble.loc[same_rank_list, casting_voter].values

In [141]:
ensemble

Unnamed: 0,product_id,label_reg_qual,label_reg_cls,label_cls_cls,label_meta_learning,label_lb_0.748,label_lb_0.758,label_opt,label_public_cls_cls,label_public_reg_cls_multimodels,label_public_reg_cls_singlemodel,majority_voting,label
0,TEST_000,1,1,1,1,1,1,1,1,1,1,"[(1, 10)]",1
1,TEST_001,1,1,2,2,1,1,2,2,1,1,"[(1, 6), (2, 4)]",1
2,TEST_002,1,1,1,1,1,1,1,1,1,1,"[(1, 10)]",1
3,TEST_003,0,0,1,1,0,0,0,1,1,1,"[(0, 5), (1, 5)]",0
4,TEST_004,1,1,1,1,1,1,1,1,1,1,"[(1, 10)]",1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,TEST_305,1,1,0,0,1,1,1,1,1,1,"[(1, 8), (0, 2)]",1
306,TEST_306,1,1,1,1,1,1,1,1,1,1,"[(1, 10)]",1
307,TEST_307,1,2,1,1,1,1,1,1,1,1,"[(1, 9), (2, 1)]",1
308,TEST_308,1,1,1,1,1,1,1,1,1,1,"[(1, 10)]",1


In [142]:
print("Number of same rank1 :", sum(same_rank_list))
ensemble

Number of same rank1 : 12


Unnamed: 0,product_id,label_reg_qual,label_reg_cls,label_cls_cls,label_meta_learning,label_lb_0.748,label_lb_0.758,label_opt,label_public_cls_cls,label_public_reg_cls_multimodels,label_public_reg_cls_singlemodel,majority_voting,label
0,TEST_000,1,1,1,1,1,1,1,1,1,1,"[(1, 10)]",1
1,TEST_001,1,1,2,2,1,1,2,2,1,1,"[(1, 6), (2, 4)]",1
2,TEST_002,1,1,1,1,1,1,1,1,1,1,"[(1, 10)]",1
3,TEST_003,0,0,1,1,0,0,0,1,1,1,"[(0, 5), (1, 5)]",0
4,TEST_004,1,1,1,1,1,1,1,1,1,1,"[(1, 10)]",1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,TEST_305,1,1,0,0,1,1,1,1,1,1,"[(1, 8), (0, 2)]",1
306,TEST_306,1,1,1,1,1,1,1,1,1,1,"[(1, 10)]",1
307,TEST_307,1,2,1,1,1,1,1,1,1,1,"[(1, 9), (2, 1)]",1
308,TEST_308,1,1,1,1,1,1,1,1,1,1,"[(1, 10)]",1


In [143]:
print(ensemble["label"].value_counts())
print(ensemble["label"].value_counts(normalize=True))

1    262
0     42
2      6
Name: label, dtype: int64
1    0.845161
0    0.135484
2    0.019355
Name: label, dtype: float64


In [144]:
ensemble.to_csv("./architectures/ensemble/" + "ensemble2_rawoutput.csv")

## Submission

In [145]:
submission = pd.read_csv("../datasets/sample_submission.csv")
submission["Y_Class"] = ensemble["label"].values

In [146]:
submission

Unnamed: 0,PRODUCT_ID,Y_Class
0,TEST_000,1
1,TEST_001,1
2,TEST_002,1
3,TEST_003,0
4,TEST_004,1
...,...,...
305,TEST_305,1
306,TEST_306,1
307,TEST_307,1
308,TEST_308,1


In [147]:
submission.to_csv("./architectures/ensemble/" + "ensemble2_submission.csv", index=False)

## 3. Ensemble Prediction with Threshold Optimization (V2)

### My architectures

In [148]:
architecture_root_path = "./architectures/"
architecture_name = [
    "daheeRef_targetQual_bylines_TF_cat_gbm_metaLearning_try2",
    "daheeRef_bylines_dnn_try1",
    "daheeRef_bylines_dnn_jeju_try1",
]
# architecture_path = architecture_root_path + architecture_name + "/"

In [149]:
mynb = None
# Loading my architecture
for i in architecture_name:
    tmp = pd.read_csv(architecture_path + "test_rawoutput.csv")
    tmp.columns = [j + "_" + i if j != "product_id" else j for j in tmp.columns ]
    if mynb is None:
        mynb = tmp.filter(regex="product_id|label_*")
    else:
        mynb = mynb.merge(tmp.filter(regex="product_id|label_*"), on="product_id")

In [150]:
mynb

Unnamed: 0,product_id,label_reg_qual_daheeRef_targetQual_bylines_TF_cat_gbm_metaLearning_try2,label_reg_cls_daheeRef_targetQual_bylines_TF_cat_gbm_metaLearning_try2,label_cls_cls_daheeRef_targetQual_bylines_TF_cat_gbm_metaLearning_try2,label_meta_learning_daheeRef_targetQual_bylines_TF_cat_gbm_metaLearning_try2,label_reg_qual_daheeRef_bylines_dnn_try1,label_reg_cls_daheeRef_bylines_dnn_try1,label_cls_cls_daheeRef_bylines_dnn_try1,label_meta_learning_daheeRef_bylines_dnn_try1,label_reg_qual_daheeRef_bylines_dnn_jeju_try1,label_reg_cls_daheeRef_bylines_dnn_jeju_try1,label_cls_cls_daheeRef_bylines_dnn_jeju_try1,label_meta_learning_daheeRef_bylines_dnn_jeju_try1
0,TEST_000,1,1,1,1,1,1,1,1,1,1,1,1
1,TEST_001,1,1,2,2,1,1,2,2,1,1,2,2
2,TEST_002,1,1,1,1,1,1,1,1,1,1,1,1
3,TEST_003,0,0,1,1,0,0,1,1,0,0,1,1
4,TEST_004,1,1,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,TEST_305,1,1,0,0,1,1,0,0,1,1,0,0
306,TEST_306,1,1,1,1,1,1,1,1,1,1,1,1
307,TEST_307,1,2,1,1,1,2,1,1,1,2,1,1
308,TEST_308,1,1,1,1,1,1,1,1,1,1,1,1


### Team member architectures

In [151]:
public_nbs = pd.read_csv('./모델4개2.csv', encoding = 'cp949')
public_nbs.columns = public_nbs.columns.str.lower()

# get lb 0.748
public_nbs = public_nbs.rename(columns={"0.748 결과": "label_lb_0.748"})

# get lb 0.758
public_nbs["label_lb_0.758"] = public_nbs["label_lb_0.748"].values
public_nbs['label_lb_0.758'][(public_nbs['label_lb_0.758'] != 0) & (public_nbs['class0'] > 0.45) & (public_nbs['class를 회귀로(모델3개)'] <= 0.75) & (public_nbs['class를 회귀로(모델1개)'] <= 0.75)] = 0

# additional label
public_nbs["label_opt"] = public_nbs["label_lb_0.748"].values
public_nbs['label_opt'][(public_nbs["label_opt"] != 2) & (public_nbs['class0'] > 0.45) & (public_nbs['class를 회귀로(모델3개)'] <= 0.75) & (public_nbs['class를 회귀로(모델1개)'] <= 0.75)] = 0
public_nbs['label_opt'][(public_nbs["label_opt"] != 0) & (public_nbs['class2'] > 0.45) & (public_nbs['class를 회귀로(모델3개)'] >= 1.25) & (public_nbs['class를 회귀로(모델1개)'] >= 1.25)] = 2

In [152]:
df_tmp = public_nbs.copy()
df_tmp = df_tmp.rename(columns={"class0": "pred_public_cls_cls0", "class1": "pred_public_cls_cls1", "class2": "pred_public_cls_cls2"})
df_tmp = df_tmp.rename(columns={"class를 회귀로(모델3개)": "pred_public_reg_cls_multimodels"})
df_tmp = df_tmp.rename(columns={"class를 회귀로(모델1개)": "pred_public_reg_cls_singlemodel"})
df_tmp

Unnamed: 0,product_id,label_lb_0.748,pred_public_cls_cls0,pred_public_cls_cls1,pred_public_cls_cls2,pred_public_reg_cls_multimodels,pred_public_reg_cls_singlemodel,label_lb_0.758,label_opt
0,TEST_000,1,0.050405,0.913329,0.036266,0.912529,0.896953,1,1
1,TEST_001,1,0.109743,0.259731,0.630526,1.340539,1.392440,1,2
2,TEST_002,1,0.098200,0.681939,0.219861,1.137469,1.228641,1,1
3,TEST_003,0,0.089931,0.891287,0.018782,0.653888,0.686387,0,0
4,TEST_004,1,0.075801,0.781753,0.142446,1.118894,1.016366,1,1
...,...,...,...,...,...,...,...,...,...
305,TEST_305,1,0.149982,0.676850,0.173169,1.020343,0.923551,1,1
306,TEST_306,1,0.226041,0.588804,0.185155,1.033545,0.938690,1,1
307,TEST_307,1,0.083189,0.735290,0.181522,1.259804,1.199065,1,1
308,TEST_308,1,0.101191,0.641677,0.257132,1.115454,1.079444,1,1


In [153]:
output = {"pred_public_cls_cls": None, "pred_public_reg_cls_multimodels": None, "pred_public_reg_cls_singlemodel": None}
public_pred_threshold = {"public_cls_cls": [0.4, 1], "public_reg_cls_multimodels": [0.65, 1.35], "public_reg_cls_singlemodel": [0.55, 1.45]}

In [154]:
# Classification
print("=== Classification ===")
tmp_labeld = []
for idx, row in df_tmp.filter(regex="pred_public_cls_cls*").iterrows():
    if (row.values[1] >= public_pred_threshold["public_cls_cls"][0]) & (row.values[1] <= public_pred_threshold["public_cls_cls"][1]):
        tmp_labeld.append(1)
    else:
        tmp_labeld.append(0 if np.argmax([row.values[0], row.values[2]]) == 0 else 2)
df_tmp["label_public_cls_cls"] = tmp_labeld
print(df_tmp["label_public_cls_cls"].value_counts())
print(df_tmp["label_public_cls_cls"].value_counts(normalize=True), "\n\n")

# Regression 3 models
print("=== Regression 3 models ===")
tmp_labeld = []
for row in df_tmp["pred_public_reg_cls_multimodels"].values:
    if row < public_pred_threshold["public_reg_cls_multimodels"][0]:
        tmp_labeld.append(0)
    elif row > public_pred_threshold["public_reg_cls_multimodels"][1]:
        tmp_labeld.append(2)
    else:
        tmp_labeld.append(1)
df_tmp["label_public_reg_cls_multimodels"] = tmp_labeld
print(df_tmp["label_public_reg_cls_multimodels"].value_counts())
print(df_tmp["label_public_reg_cls_multimodels"].value_counts(normalize=True), "\n\n")

# Regression 1 model
print("=== Regression 1 model ===")
tmp_labeld = []
for row in df_tmp["pred_public_reg_cls_singlemodel"].values:
    if row < public_pred_threshold["public_reg_cls_singlemodel"][0]:
        tmp_labeld.append(0)
    elif row > public_pred_threshold["public_reg_cls_singlemodel"][1]:
        tmp_labeld.append(2)
    else:
        tmp_labeld.append(1)
df_tmp["label_public_reg_cls_singlemodel"] = tmp_labeld
print(df_tmp["label_public_reg_cls_singlemodel"].value_counts())
print(df_tmp["label_public_reg_cls_singlemodel"].value_counts(normalize=True), "\n\n")

=== Classification ===
1    257
0     38
2     15
Name: label_public_cls_cls, dtype: int64
1    0.829032
0    0.122581
2    0.048387
Name: label_public_cls_cls, dtype: float64 


=== Regression 3 models ===
1    255
0     42
2     13
Name: label_public_reg_cls_multimodels, dtype: int64
1    0.822581
0    0.135484
2    0.041935
Name: label_public_reg_cls_multimodels, dtype: float64 


=== Regression 1 model ===
1    270
0     23
2     17
Name: label_public_reg_cls_singlemodel, dtype: int64
1    0.870968
0    0.074194
2    0.054839
Name: label_public_reg_cls_singlemodel, dtype: float64 




### Get ensemble prediction

In [155]:
casting_voter = "label_lb_0.758"

In [156]:
ensemble = mynb.filter(regex="product_id|label_*").merge(df_tmp.filter(regex="product_id|label_*"), on="product_id")
ensemble["majority_voting"] = ensemble.filter(regex="label_*").apply(lambda x: Counter(x).most_common(3), axis=1)
ensemble["label"] = ensemble["majority_voting"].apply(lambda x: x[0][0])

In [157]:
same_rank_list = []
for i in ensemble["majority_voting"]:
    tmp_cls, tmp_score = list(zip(*i))
    if (max(tmp_score) == pd.Series(tmp_score)).sum() > 1:
        same_rank_list.append(True)
    else:
        same_rank_list.append(False)

# Casting Voter is lb_0.758
ensemble.loc[same_rank_list, "label"] = ensemble.loc[same_rank_list, casting_voter].values

In [158]:
ensemble

Unnamed: 0,product_id,label_reg_qual_daheeRef_targetQual_bylines_TF_cat_gbm_metaLearning_try2,label_reg_cls_daheeRef_targetQual_bylines_TF_cat_gbm_metaLearning_try2,label_cls_cls_daheeRef_targetQual_bylines_TF_cat_gbm_metaLearning_try2,label_meta_learning_daheeRef_targetQual_bylines_TF_cat_gbm_metaLearning_try2,label_reg_qual_daheeRef_bylines_dnn_try1,label_reg_cls_daheeRef_bylines_dnn_try1,label_cls_cls_daheeRef_bylines_dnn_try1,label_meta_learning_daheeRef_bylines_dnn_try1,label_reg_qual_daheeRef_bylines_dnn_jeju_try1,...,label_cls_cls_daheeRef_bylines_dnn_jeju_try1,label_meta_learning_daheeRef_bylines_dnn_jeju_try1,label_lb_0.748,label_lb_0.758,label_opt,label_public_cls_cls,label_public_reg_cls_multimodels,label_public_reg_cls_singlemodel,majority_voting,label
0,TEST_000,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,"[(1, 18)]",1
1,TEST_001,1,1,2,2,1,1,2,2,1,...,2,2,1,1,2,2,1,1,"[(1, 10), (2, 8)]",1
2,TEST_002,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,"[(1, 18)]",1
3,TEST_003,0,0,1,1,0,0,1,1,0,...,1,1,0,0,0,1,1,1,"[(0, 9), (1, 9)]",0
4,TEST_004,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,"[(1, 18)]",1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,TEST_305,1,1,0,0,1,1,0,0,1,...,0,0,1,1,1,1,1,1,"[(1, 12), (0, 6)]",1
306,TEST_306,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,"[(1, 18)]",1
307,TEST_307,1,2,1,1,1,2,1,1,1,...,1,1,1,1,1,1,1,1,"[(1, 15), (2, 3)]",1
308,TEST_308,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,"[(1, 18)]",1


In [159]:
print("Number of same rank1 :", sum(same_rank_list))
ensemble

Number of same rank1 : 11


Unnamed: 0,product_id,label_reg_qual_daheeRef_targetQual_bylines_TF_cat_gbm_metaLearning_try2,label_reg_cls_daheeRef_targetQual_bylines_TF_cat_gbm_metaLearning_try2,label_cls_cls_daheeRef_targetQual_bylines_TF_cat_gbm_metaLearning_try2,label_meta_learning_daheeRef_targetQual_bylines_TF_cat_gbm_metaLearning_try2,label_reg_qual_daheeRef_bylines_dnn_try1,label_reg_cls_daheeRef_bylines_dnn_try1,label_cls_cls_daheeRef_bylines_dnn_try1,label_meta_learning_daheeRef_bylines_dnn_try1,label_reg_qual_daheeRef_bylines_dnn_jeju_try1,...,label_cls_cls_daheeRef_bylines_dnn_jeju_try1,label_meta_learning_daheeRef_bylines_dnn_jeju_try1,label_lb_0.748,label_lb_0.758,label_opt,label_public_cls_cls,label_public_reg_cls_multimodels,label_public_reg_cls_singlemodel,majority_voting,label
0,TEST_000,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,"[(1, 18)]",1
1,TEST_001,1,1,2,2,1,1,2,2,1,...,2,2,1,1,2,2,1,1,"[(1, 10), (2, 8)]",1
2,TEST_002,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,"[(1, 18)]",1
3,TEST_003,0,0,1,1,0,0,1,1,0,...,1,1,0,0,0,1,1,1,"[(0, 9), (1, 9)]",0
4,TEST_004,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,"[(1, 18)]",1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,TEST_305,1,1,0,0,1,1,0,0,1,...,0,0,1,1,1,1,1,1,"[(1, 12), (0, 6)]",1
306,TEST_306,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,"[(1, 18)]",1
307,TEST_307,1,2,1,1,1,2,1,1,1,...,1,1,1,1,1,1,1,1,"[(1, 15), (2, 3)]",1
308,TEST_308,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,"[(1, 18)]",1


In [160]:
print(ensemble["label"].value_counts())
print(ensemble["label"].value_counts(normalize=True))

1    264
0     40
2      6
Name: label, dtype: int64
1    0.851613
0    0.129032
2    0.019355
Name: label, dtype: float64


In [161]:
ensemble.to_csv("./architectures/ensemble/" + "ensemble3_rawoutput.csv")

## Submission

In [162]:
submission = pd.read_csv("../datasets/sample_submission.csv")
submission["Y_Class"] = ensemble["label"].values

In [163]:
submission

Unnamed: 0,PRODUCT_ID,Y_Class
0,TEST_000,1
1,TEST_001,1
2,TEST_002,1
3,TEST_003,0
4,TEST_004,1
...,...,...
305,TEST_305,1
306,TEST_306,1
307,TEST_307,1
308,TEST_308,1


In [164]:
submission.to_csv("./architectures/ensemble/" + "ensemble3_submission.csv", index=False)