## Introduction

* This is a rule-based algorithm for the O_31 product.
* The idea comes from the observation that the target is ordinal rather than categorical.
* Inference is carried out by voting: each selected feature votes for a class according to where its value falls relative to split points computed on that feature.
* This should be a good ensemble source alongside ML/DL models.

## Setup

In [31]:
GLOBAL_SEED = 42

import os
os.environ['PYTHONHASHSEED'] = str(GLOBAL_SEED)
import sys

import gc
from tqdm import tqdm
import datetime
import pickle
import random as rnd
from glob import glob
import pandas as pd
import numpy as np
from numpy import random as np_rnd
import warnings
from math import ceil

import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib import rcParams
from itertools import combinations
from collections import Counter

from scipy.stats import f_oneway
from scipy.stats import pearsonr

# display setting
warnings.filterwarnings(action='ignore')
rcParams['axes.unicode_minus'] = False

In [32]:
def seed_everything(seed=42):
    """Seed every random number generator that is available in this session.

    Always seeds ``PYTHONHASHSEED``, Python's ``random`` module and NumPy's
    legacy global generator. TensorFlow, CuPy and PyTorch are seeded on a
    best-effort basis: if the corresponding name is not in scope (or seeding
    fails with an ordinary error) that framework is skipped silently.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # python random
    rnd.seed(seed)
    # numpy random
    np_rnd.seed(seed)
    # Optional frameworks: catch ``Exception`` rather than using a bare
    # ``except`` so that KeyboardInterrupt / SystemExit still propagate.
    # tf random
    try:
        tf_rnd.set_seed(seed)
    except Exception:
        pass
    # RAPIDS random
    try:
        cupy.random.seed(seed)
    except Exception:
        pass
    # pytorch random
    try:
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        torch.backends.cudnn.deterministic = True
    except Exception:
        pass

def create_get_ts(ts):
    """Return the whole number of seconds between ``CFG.ts_zero`` and ``ts``.

    ``ts`` has its ``tzinfo`` replaced by ``CFG.tz`` before the subtraction.

    NOTE(review): the ``CFG`` class visible in this notebook defines neither
    ``tz`` nor ``ts_zero``, so calling this as-is would presumably raise
    ``AttributeError`` - confirm whether those attributes are set elsewhere.
    """
    localized = ts.replace(tzinfo=CFG.tz)
    elapsed = localized - CFG.ts_zero
    return int(elapsed.total_seconds())

def pickleIO(obj, src, op="w"):
    """Write ``obj`` to, or read an object from, the pickle file ``src``.

    Args:
        obj: Object to serialise when ``op == "w"``; otherwise only used as
            the fallback return value for an unknown operation.
        src: Path of the pickle file.
        op: ``"w"`` to dump ``obj`` to ``src``, ``"r"`` to load from ``src``.

    Returns:
        ``None`` after writing, the unpickled object after reading, and
        ``obj`` unchanged (after printing a message) for any other ``op``.
    """
    if op not in ("w", "r"):
        print("unknown operation")
        return obj

    mode = op + "b"
    if op == "w":
        with open(src, mode) as handle:
            pickle.dump(obj, handle)
        return None

    # Caution: unpickling executes arbitrary code; only load trusted files.
    with open(src, mode) as handle:
        return pickle.load(handle)

def week_of_month(dt):
    """Return the 1-based week of the month that ``dt`` falls in.

    Weeks start on Sunday: the day of month is shifted by the number of days
    between the preceding Sunday and the first day of the month, then divided
    into blocks of seven.
    """
    month_start = dt.replace(day=1)
    # weekday() is Monday=0 .. Sunday=6, so this is 0 when the month starts on a Sunday.
    leading_days = (month_start.weekday() + 1) % 7
    shifted_day = dt.day + leading_days
    return int(ceil(shifted_day / 7.0))

In [33]:
class CFG:
    """Notebook-wide configuration constants."""

    # Debug switch (off by default).
    debug = False

    # Product family letter -> line IDs listed for that family.
    product_mapper = {
        "A": ["T010305", "T010306", "T050304", "T050307"],
        "O": ["T100304", "T100306"],
        "T": ["T100304", "T100306"],
    }

    # Line ID -> product group label; the O and T lines share the "O_T" label.
    line_mapper = {
        **dict.fromkeys(["T010305", "T010306", "T050304", "T050307"], "A"),
        **dict.fromkeys(["T100304", "T100306"], "O_T"),
    }

## Loading Data

In [34]:
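# Information provided by Dacon

# PRODUCT_ID : unique ID of the product
# Y_Class : product quality status (target)
# 0 : below the acceptable standard (nonconforming)
# 1 : conforming
# 2 : above the acceptable standard (nonconforming)
# Y_Quality : quantitative measure related to product quality
# TIMESTAMP : time at which the product entered the process
# LINE : type of process line the product entered (values: 'T050304', 'T050307', 'T100304', 'T100306', 'T010306', 'T010305')
# PRODUCT_CODE : product code (values: 'A_31', 'T_31', 'O_31')
# X_1 ~ X_2875 : de-identified variables extracted during the process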

In [35]:
# Dataset directory. Override it with the DACON_DATA_DIR environment variable;
# the default is the original author's local path, so behaviour is unchanged
# when the variable is not set.
DATA_DIR = os.environ.get(
    "DACON_DATA_DIR",
    "C:/Users/flash/PycharmProjects/pythonProject/projects/dacon_lgaimers2/datasets",
)

# Load the training set, normalise column names to lower case and parse the
# timestamp column into datetimes.
df_full = pd.read_csv(f"{DATA_DIR}/train.csv")
df_full.columns = df_full.columns.str.lower()
df_full["timestamp"] = pd.to_datetime(df_full["timestamp"])

In [36]:
# # time feature engineering
# df_full["month"] = df_full["timestamp"].dt.month
# df_full["day"] = df_full["timestamp"].dt.day
# df_full["weekday"] = df_full["timestamp"].dt.weekday
# df_full["week_of_month"] = df_full["timestamp"].apply(week_of_month)
# df_full["hour"] = df_full["timestamp"].dt.hour
# df_full["office_hour"] = df_full["hour"].apply(lambda x: 1 if ((x >= 9) & (x < 18)) else 0)
# df_full["sec_in_day"] = (df_full["timestamp"] - df_full["timestamp"].dt.normalize()).dt.total_seconds() / 3600
# df_full["sin_in_day"] = np.sin(2 * np.pi * df_full["sec_in_day"].values)
# df_full["cos_in_day"] = np.cos(2 * np.pi * df_full["sec_in_day"].values)

In [37]:
# Preview the full training frame (all product codes, lower-cased column names).
df_full

Unnamed: 0,product_id,y_class,y_quality,timestamp,line,product_code,x_1,x_2,x_3,x_4,...,x_2866,x_2867,x_2868,x_2869,x_2870,x_2871,x_2872,x_2873,x_2874,x_2875
0,TRAIN_000,1,0.533433,2022-06-13 05:14:00,T050304,A_31,,,,,...,39.34,40.89,32.56,34.09,77.77,,,,,
1,TRAIN_001,2,0.541819,2022-06-13 05:22:00,T050307,A_31,,,,,...,38.89,42.82,43.92,35.34,72.55,,,,,
2,TRAIN_002,1,0.531267,2022-06-13 05:30:00,T050304,A_31,,,,,...,39.19,36.65,42.47,36.53,78.35,,,,,
3,TRAIN_003,2,0.537325,2022-06-13 05:39:00,T050307,A_31,,,,,...,37.74,39.17,52.17,30.58,71.78,,,,,
4,TRAIN_004,1,0.531590,2022-06-13 05:47:00,T050304,A_31,,,,,...,38.70,41.89,46.93,33.09,76.97,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,TRAIN_593,1,0.526546,2022-09-08 14:30:00,T100306,T_31,2.0,95.0,0.0,45.0,...,,,,,,,,,,
594,TRAIN_594,0,0.524022,2022-09-08 22:38:00,T050304,A_31,,,,,...,49.47,53.07,50.89,55.10,66.49,1.0,,,,
595,TRAIN_595,0,0.521289,2022-09-08 22:47:00,T050304,A_31,,,,,...,,,,,,1.0,,,,
596,TRAIN_596,1,0.531375,2022-09-08 14:38:00,T100304,O_31,40.0,94.0,0.0,45.0,...,,,,,,,,,,


## Calculate Feature Rank

In [38]:
# Keep only O_31 rows and drop every column that contains a missing value.
df = df_full[df_full["product_code"] == "O_31"].dropna(axis=1)

# The first six columns are the metadata/target columns
# (product_id, y_class, y_quality, timestamp, line, product_code).
meta_cols = list(df.columns[:6])

# numeric_only=True makes the reduction explicit: pandas >= 2.0 raises on the
# string/datetime columns otherwise, while older versions dropped them silently.
df_var = df.var(numeric_only=True)

# Select non-constant features by name rather than by position, so the result
# does not depend on how many target columns survive the variance filter.
feature_cols = [col for col in df_var[df_var != 0].index if col not in meta_cols]

# .copy() detaches the result from df_full so later in-place column
# assignments operate on an independent frame.
df = df[meta_cols + feature_cols].copy()

In [39]:
# Inspect the O_31 subset: metadata columns plus the non-constant, NaN-free features.
df

Unnamed: 0,product_id,y_class,y_quality,timestamp,line,product_code,x_1,x_2,x_5,x_7,...,x_924,x_925,x_926,x_927,x_928,x_929,x_930,x_931,x_932,x_933
569,TRAIN_569,1,0.530533,2022-09-03 18:32:00,T100304,O_31,4.0,98.0,11.0,45.0,...,181.7,160.0,155.541935,149.8,327.0,312.018182,298.0,13.6,13.458065,13.3
570,TRAIN_570,2,0.534951,2022-09-03 18:40:00,T100306,O_31,6.0,90.0,10.0,51.0,...,176.0,161.9,157.464516,154.5,327.0,312.454545,298.0,13.7,13.454839,13.3
571,TRAIN_571,1,0.525916,2022-09-03 18:48:00,T100304,O_31,4.0,100.0,11.0,45.0,...,175.4,165.7,159.864516,148.7,326.0,311.763636,298.0,13.7,13.425806,13.2
572,TRAIN_572,2,0.535205,2022-09-03 18:56:00,T100306,O_31,6.0,89.0,10.0,51.0,...,166.1,166.0,161.046667,151.3,326.0,312.537037,298.0,13.7,13.44,13.2
596,TRAIN_596,1,0.531375,2022-09-08 14:38:00,T100304,O_31,40.0,94.0,11.0,45.0,...,189.2,179.5,173.190323,168.6,321.0,303.036364,289.0,13.6,13.406667,13.2
597,TRAIN_597,1,0.533702,2022-09-08 14:46:00,T100306,O_31,21.0,87.0,10.0,61.0,...,188.2,172.9,167.087097,161.1,326.0,304.818182,289.0,13.6,13.38,13.2


In [40]:
# Split the feature columns by class. DataFrame.filter applies re.search, so
# the previous pattern "x_*" ("x" followed by zero or more underscores) would
# match any column name containing an "x"; anchor it to the x_<number> features.
df_features = df.filter(regex=r"^x_\d+$")
df_c1 = df_features[df["y_class"] == 1]
df_c2 = df_features[df["y_class"] == 2]

In [41]:
# Feature values of the class-1 rows.
df_c1

Unnamed: 0,x_1,x_2,x_5,x_7,x_9,x_11,x_12,x_13,x_16,x_17,...,x_924,x_925,x_926,x_927,x_928,x_929,x_930,x_931,x_932,x_933
569,4.0,98.0,11.0,45.0,31.0,500.0,505.3,493.2,247.6,248.0,...,181.7,160.0,155.541935,149.8,327.0,312.018182,298.0,13.6,13.458065,13.3
571,4.0,100.0,11.0,45.0,31.0,500.4,504.8,492.4,247.5,248.1,...,175.4,165.7,159.864516,148.7,326.0,311.763636,298.0,13.7,13.425806,13.2
596,40.0,94.0,11.0,45.0,31.0,505.8,510.9,497.1,247.5,248.1,...,189.2,179.5,173.190323,168.6,321.0,303.036364,289.0,13.6,13.406667,13.2
597,21.0,87.0,10.0,61.0,52.0,470.6,474.9,462.9,247.5,248.0,...,188.2,172.9,167.087097,161.1,326.0,304.818182,289.0,13.6,13.38,13.2


In [42]:
# Feature values of the class-2 rows.
df_c2

Unnamed: 0,x_1,x_2,x_5,x_7,x_9,x_11,x_12,x_13,x_16,x_17,...,x_924,x_925,x_926,x_927,x_928,x_929,x_930,x_931,x_932,x_933
570,6.0,90.0,10.0,51.0,52.0,466.1,471.4,457.0,247.6,248.1,...,176.0,161.9,157.464516,154.5,327.0,312.454545,298.0,13.7,13.454839,13.3
572,6.0,89.0,10.0,51.0,52.0,465.9,471.4,457.3,247.5,248.0,...,166.1,166.0,161.046667,151.3,326.0,312.537037,298.0,13.7,13.44,13.2


In [43]:
# Pairwise direction score per feature: over every (class-1 value, class-2 value)
# pair add +1 when the class-2 value is larger, -1 when it is smaller and 0 on a
# tie. A score of +/- (number of pairs) means the two classes are fully separated
# on that feature.
tmp = {
    col: sum(
        int(c2_val > c1_val) - int(c2_val < c1_val)
        for c1_val in df_c1[col]
        for c2_val in df_c2[col]
    )
    for col in df_c1.columns
}

In [44]:
# Keep only the features whose direction is definitive, i.e. every
# (class-1, class-2) pair agrees on the ordering. The maximum absolute score is
# the number of pairs, derived from the data instead of hard-coding 8 (= 4 * 2).
n_pairs = len(df_c1) * len(df_c2)
pair_scores = pd.Series(tmp)
df_cor = pair_scores[pair_scores.abs() == n_pairs]

In [45]:
# Selected features and their pairwise direction score.
df_cor

x_11    -8
x_12    -8
x_13    -8
x_20     8
x_21     8
        ..
x_916   -8
x_919   -8
x_920   -8
x_923   -8
x_929    8
Length: 71, dtype: int64

In [46]:
# Negate the features that rank higher for class '1', so that for every selected
# feature a larger value points towards class '2'.
# Selecting on the sign avoids hard-coding the score magnitude (-8); df_cor only
# holds fully separated features, so a negative score is exactly that case.
# NOTE: this mutates df in place - re-running the cell flips the signs back.
negative_cols = df_cor[df_cor < 0].index
df[negative_cols] = (-1) * df[negative_cols].values

In [47]:
# Inspect the selected feature columns after the sign flip.
df[df_cor.index]

Unnamed: 0,x_11,x_12,x_13,x_20,x_21,x_22,x_38,x_98,x_101,x_102,...,x_832,x_852,x_861,x_899,x_901,x_916,x_919,x_920,x_923,x_929
569,-500.0,-505.3,-493.2,495.0,503.0,491.0,-0.079,-0.0013,3.9e-05,4.8e-05,...,-175.677419,27.46,-1.693548,6.836735,11.0,-188.0,-189.7,-182.254839,-186.687097,312.018182
570,-466.1,-471.4,-457.0,530.0,541.0,525.0,-0.076,-0.00081,7.1e-05,8.3e-05,...,-167.612903,27.48,-1.625806,7.244898,12.0,-184.0,-178.3,-173.067742,-181.483871,312.454545
571,-500.4,-504.8,-492.4,494.0,503.0,490.0,-0.079,-0.0013,4.2e-05,5e-05,...,-174.967742,27.46,-1.687097,7.22449,10.0,-187.0,-187.9,-174.351613,-183.548387,311.763636
572,-465.9,-471.4,-457.3,530.0,541.0,525.0,-0.075,-0.00082,7.4e-05,8.8e-05,...,-170.333333,27.49,-1.636667,7.387755,12.0,-186.1,-184.6,-174.06,-177.933333,312.537037
596,-505.8,-510.9,-497.1,489.0,498.0,485.0,-0.078,-0.0015,5.7e-05,7e-05,...,-178.133333,27.14,-1.733333,6.72,9.0,-197.2,-190.5,-181.432258,-196.76129,303.036364
597,-470.6,-474.9,-462.9,525.0,534.0,520.0,-0.077,-0.00084,6.3e-05,7.8e-05,...,-171.233333,27.09,-1.663333,7.163265,11.0,-196.0,-194.7,-178.725807,-193.683871,304.818182


In [48]:
# Per-class summary statistics (count/mean/std/min/quartiles/max) of every
# selected feature; the min and max are used to derive the split points.
selected_features = list(df_cor.index)
df_feature_rank = df.groupby("y_class")[selected_features].describe()

In [49]:
# Per-class descriptive statistics of the selected features.
df_feature_rank

Unnamed: 0_level_0,x_11,x_11,x_11,x_11,x_11,x_11,x_11,x_11,x_12,x_12,...,x_923,x_923,x_929,x_929,x_929,x_929,x_929,x_929,x_929,x_929
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
y_class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,4.0,-494.2,15.954101,-505.8,-501.75,-500.2,-492.65,-470.6,4.0,-498.975,...,-185.902419,-183.548387,4.0,307.909091,4.656155,303.036364,304.372727,308.290909,311.827273,312.018182
2,2.0,-466.0,0.141421,-466.1,-466.05,-466.0,-465.95,-465.9,2.0,-471.4,...,-178.820968,-177.933333,2.0,312.495791,0.05833,312.454545,312.475168,312.495791,312.516414,312.537037


In [50]:
# Split points per feature:
#   "0_1": the smallest class-1 value (anything below it votes for class 0),
#   "1_2": the midpoint between the largest class-1 and smallest class-2 value.
ranked_features = df_feature_rank.columns.get_level_values(0).unique()
split_point = {
    "0_1": {
        feat: df_feature_rank.loc[1, (feat, "min")]
        for feat in ranked_features
    },
    "1_2": {
        feat: (
            df_feature_rank.loc[1, (feat, "max")]
            + df_feature_rank.loc[2, (feat, "min")]
        ) / 2
        for feat in ranked_features
    },
}

In [51]:
# Show the computed split points for both class boundaries.
split_point

{'0_1': {'x_11': -505.8,
  'x_12': -510.9,
  'x_13': -497.1,
  'x_20': 489.0,
  'x_21': 498.0,
  'x_22': 485.0,
  'x_38': -0.079,
  'x_98': -0.0015,
  'x_101': 3.9e-05,
  'x_102': 4.8e-05,
  'x_103': 2.9e-05,
  'x_111': 1e-06,
  'x_119': 6.3e-06,
  'x_120': 33.9,
  'x_123': -56698.0,
  'x_385': 2.3233333330000003,
  'x_416': -171.9,
  'x_439': -400.0,
  'x_458': -2530.28,
  'x_482': -197.2,
  'x_483': -194.7,
  'x_495': 4.7127272730000005,
  'x_499': -182.2548387,
  'x_500': -196.7612903,
  'x_502': 303.0363636,
  'x_507': 389.0,
  'x_523': -1.733333333,
  'x_555': 8.6,
  'x_572': 19.0,
  'x_594': -0.4,
  'x_596': -0.3,
  'x_616': 2.3,
  'x_646': -182.2548387,
  'x_647': -196.7612903,
  'x_649': 303.0363636,
  'x_658': 0.0,
  'x_697': -2530.28,
  'x_698': 97.755,
  'x_701': -8.92,
  'x_702': -8.903333332999999,
  'x_703': -8.54,
  'x_704': -9.123333333,
  'x_705': -8.673333332999999,
  'x_706': -8.383333333,
  'x_707': -8.55,
  'x_708': -8.436666667,
  'x_717': -14.1,
  'x_746': 389.0,

## Inference

In [52]:
# Dataset directory. Override it with the DACON_DATA_DIR environment variable;
# the default is the original author's local path.
DATA_DIR = os.environ.get(
    "DACON_DATA_DIR",
    "C:/Users/flash/PycharmProjects/pythonProject/projects/dacon_lgaimers2/datasets",
)

# Load the test set and apply the same preprocessing as for training:
# lower-case column names, parsed timestamps and the sign flip on the features
# with a negative direction score (so a larger value always points to class 2).
df_test = pd.read_csv(f"{DATA_DIR}/test.csv")
df_test.columns = df_test.columns.str.lower()
df_test["timestamp"] = pd.to_datetime(df_test["timestamp"])
negative_cols = df_cor[df_cor < 0].index
df_test[negative_cols] = (-1) * df_test[negative_cols].values

# Every selected feature casts one equally weighted vote per row.
vote_weight = 1 / len(df_cor.index)

infer_list = []

for _, row in df_test[df_test["product_code"] == "O_31"].iterrows():
    votes = Counter()
    for feat in df_cor.index:
        # NOTE(review): a NaN feature value fails both comparisons and thus
        # votes for class 2 - confirm the test rows have no missing values here.
        if row[feat] < split_point["0_1"][feat]:
            votes[0] += vote_weight
        elif row[feat] < split_point["1_2"][feat]:
            votes[1] += vote_weight
        else:
            votes[2] += vote_weight
    # most_common orders tied classes by first insertion, so a tie is resolved
    # by whichever class received a vote first for this row.
    infer_list.append(votes.most_common(3))

In [54]:
# Per-row vote shares, ordered from most to least voted class.
infer_list

[[(1, 0.549295774647887), (0, 0.25352112676056343), (2, 0.19718309859154934)],
 [(1, 0.5633802816901405),
  (0, 0.38028169014084495),
  (2, 0.056338028169014086)],
 [(1, 0.42253521126760546),
  (0, 0.42253521126760546),
  (2, 0.15492957746478875)],
 [(1, 0.49295774647887297),
  (0, 0.36619718309859145),
  (2, 0.14084507042253522)]]

In [58]:
# The predicted class of each row is the class of its top-ranked vote entry.
output = [ranked_votes[0][0] for ranked_votes in infer_list]
output

[1, 1, 1, 1]