# 라이브러리 설치

# Import

In [139]:
import os
import random
import pandas as pd
import numpy as np
import sklearn
import catboost

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from datetime import datetime

In [140]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(37) # Seed 고정

# Data Load

In [141]:
train_df = pd.read_csv('../../data/dacon_open/train.csv')
test_df = pd.read_csv('../../data/dacon_open/test.csv')
submission_df = pd.read_csv("../../data/dacon_open/sample_submission.csv")

In [142]:
train_df

Unnamed: 0,PRODUCT_ID,Y_Class,Y_Quality,TIMESTAMP,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,TRAIN_000,1,0.533433,2022-06-13 5:14,T050304,A_31,,,,,...,39.34,40.89,32.56,34.09,77.77,,,,,
1,TRAIN_001,2,0.541819,2022-06-13 5:22,T050307,A_31,,,,,...,38.89,42.82,43.92,35.34,72.55,,,,,
2,TRAIN_002,1,0.531267,2022-06-13 5:30,T050304,A_31,,,,,...,39.19,36.65,42.47,36.53,78.35,,,,,
3,TRAIN_003,2,0.537325,2022-06-13 5:39,T050307,A_31,,,,,...,37.74,39.17,52.17,30.58,71.78,,,,,
4,TRAIN_004,1,0.531590,2022-06-13 5:47,T050304,A_31,,,,,...,38.70,41.89,46.93,33.09,76.97,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,TRAIN_593,1,0.526546,2022-09-08 14:30,T100306,T_31,2.0,95.0,0.0,45.0,...,,,,,,,,,,
594,TRAIN_594,0,0.524022,2022-09-08 22:38,T050304,A_31,,,,,...,49.47,53.07,50.89,55.10,66.49,1.0,,,,
595,TRAIN_595,0,0.521289,2022-09-08 22:47,T050304,A_31,,,,,...,,,,,,1.0,,,,
596,TRAIN_596,1,0.531375,2022-09-08 14:38,T100304,O_31,40.0,94.0,0.0,45.0,...,,,,,,,,,,


# Data Pre-processing

In [143]:
#TIMESTAMP 피쳐를 datetime으로 나누기
tr_month =[]
tr_day = []
tr_hour =[]
tr_minute = []
for i in range(train_df.shape[0]):
  tr_month.append(datetime.strptime(train_df.iloc[i,:]['TIMESTAMP'],'%Y-%m-%d %H:%M').month)
  tr_day.append(datetime.strptime(train_df.iloc[i,:]['TIMESTAMP'],'%Y-%m-%d %H:%M').day)
  tr_hour.append(datetime.strptime(train_df.iloc[i,:]['TIMESTAMP'],'%Y-%m-%d %H:%M').hour)
  tr_minute.append(datetime.strptime(train_df.iloc[i,:]['TIMESTAMP'],'%Y-%m-%d %H:%M').minute)

train_df['MONTH'] = tr_month
train_df['DAY'] = tr_day
train_df['HOUR'] = tr_hour
train_df['MINUTE'] = tr_minute

te_month =[]
te_day = []
te_hour =[]
te_minute = []
for i in range(test_df.shape[0]):
  te_month.append(datetime.strptime(test_df.iloc[i,:]['TIMESTAMP'],'%Y-%m-%d %H:%M').month)
  te_day.append(datetime.strptime(test_df.iloc[i,:]['TIMESTAMP'],'%Y-%m-%d %H:%M').day)
  te_hour.append(datetime.strptime(test_df.iloc[i,:]['TIMESTAMP'],'%Y-%m-%d %H:%M').hour)
  te_minute.append(datetime.strptime(test_df.iloc[i,:]['TIMESTAMP'],'%Y-%m-%d %H:%M').minute)

test_df['MONTH'] = te_month
test_df['DAY'] = te_day
test_df['HOUR'] = te_hour
test_df['MINUTE'] = te_minute

In [144]:
train_df

Unnamed: 0,PRODUCT_ID,Y_Class,Y_Quality,TIMESTAMP,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,...,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875,MONTH,DAY,HOUR,MINUTE
0,TRAIN_000,1,0.533433,2022-06-13 5:14,T050304,A_31,,,,,...,77.77,,,,,,6,13,5,14
1,TRAIN_001,2,0.541819,2022-06-13 5:22,T050307,A_31,,,,,...,72.55,,,,,,6,13,5,22
2,TRAIN_002,1,0.531267,2022-06-13 5:30,T050304,A_31,,,,,...,78.35,,,,,,6,13,5,30
3,TRAIN_003,2,0.537325,2022-06-13 5:39,T050307,A_31,,,,,...,71.78,,,,,,6,13,5,39
4,TRAIN_004,1,0.531590,2022-06-13 5:47,T050304,A_31,,,,,...,76.97,,,,,,6,13,5,47
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,TRAIN_593,1,0.526546,2022-09-08 14:30,T100306,T_31,2.0,95.0,0.0,45.0,...,,,,,,,9,8,14,30
594,TRAIN_594,0,0.524022,2022-09-08 22:38,T050304,A_31,,,,,...,66.49,1.0,,,,,9,8,22,38
595,TRAIN_595,0,0.521289,2022-09-08 22:47,T050304,A_31,,,,,...,,1.0,,,,,9,8,22,47
596,TRAIN_596,1,0.531375,2022-09-08 14:38,T100304,O_31,40.0,94.0,0.0,45.0,...,,,,,,,9,8,14,38


In [145]:
# TIMESTAMP 피쳐 제거
train_df=train_df.drop(columns=['TIMESTAMP'])
test_df=test_df.drop(columns=['TIMESTAMP'])

In [146]:
train_df

Unnamed: 0,PRODUCT_ID,Y_Class,Y_Quality,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,...,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875,MONTH,DAY,HOUR,MINUTE
0,TRAIN_000,1,0.533433,T050304,A_31,,,,,,...,77.77,,,,,,6,13,5,14
1,TRAIN_001,2,0.541819,T050307,A_31,,,,,,...,72.55,,,,,,6,13,5,22
2,TRAIN_002,1,0.531267,T050304,A_31,,,,,,...,78.35,,,,,,6,13,5,30
3,TRAIN_003,2,0.537325,T050307,A_31,,,,,,...,71.78,,,,,,6,13,5,39
4,TRAIN_004,1,0.531590,T050304,A_31,,,,,,...,76.97,,,,,,6,13,5,47
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,TRAIN_593,1,0.526546,T100306,T_31,2.0,95.0,0.0,45.0,10.0,...,,,,,,,9,8,14,30
594,TRAIN_594,0,0.524022,T050304,A_31,,,,,,...,66.49,1.0,,,,,9,8,22,38
595,TRAIN_595,0,0.521289,T050304,A_31,,,,,,...,,1.0,,,,,9,8,22,47
596,TRAIN_596,1,0.531375,T100304,O_31,40.0,94.0,0.0,45.0,11.0,...,,,,,,,9,8,14,38


In [147]:
#전부 NAN값인 피쳐 제거
a=[]
A = pd.DataFrame(train_df.isnull().sum(),columns=['sum'])
a = list(A[A['sum']==train_df.shape[0]].index)
train_df.drop(columns=a,inplace=True)
test_df.drop(columns=a, inplace=True)

In [148]:
train_df

Unnamed: 0,PRODUCT_ID,Y_Class,Y_Quality,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,MONTH,DAY,HOUR,MINUTE
0,TRAIN_000,1,0.533433,T050304,A_31,,,,,,...,39.34,40.89,32.56,34.09,77.77,,6,13,5,14
1,TRAIN_001,2,0.541819,T050307,A_31,,,,,,...,38.89,42.82,43.92,35.34,72.55,,6,13,5,22
2,TRAIN_002,1,0.531267,T050304,A_31,,,,,,...,39.19,36.65,42.47,36.53,78.35,,6,13,5,30
3,TRAIN_003,2,0.537325,T050307,A_31,,,,,,...,37.74,39.17,52.17,30.58,71.78,,6,13,5,39
4,TRAIN_004,1,0.531590,T050304,A_31,,,,,,...,38.70,41.89,46.93,33.09,76.97,,6,13,5,47
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,TRAIN_593,1,0.526546,T100306,T_31,2.0,95.0,0.0,45.0,10.0,...,,,,,,,9,8,14,30
594,TRAIN_594,0,0.524022,T050304,A_31,,,,,,...,49.47,53.07,50.89,55.10,66.49,1.0,9,8,22,38
595,TRAIN_595,0,0.521289,T050304,A_31,,,,,,...,,,,,,1.0,9,8,22,47
596,TRAIN_596,1,0.531375,T100304,O_31,40.0,94.0,0.0,45.0,11.0,...,,,,,,,9,8,14,38


In [149]:
#분산이 1인 피쳐 제거
a=[]
for c in train_df.columns:
  if  train_df[c].nunique()==1:
    a.append(c)

len(a)
train_df.drop(columns=a, inplace=True)
test_df.drop(columns=a, inplace=True)

In [150]:
train_df

Unnamed: 0,PRODUCT_ID,Y_Class,Y_Quality,LINE,PRODUCT_CODE,X_1,X_2,X_5,X_7,X_8,...,X_2865,X_2866,X_2867,X_2868,X_2869,X_2870,MONTH,DAY,HOUR,MINUTE
0,TRAIN_000,1,0.533433,T050304,A_31,,,,,,...,353.0,39.34,40.89,32.56,34.09,77.77,6,13,5,14
1,TRAIN_001,2,0.541819,T050307,A_31,,,,,,...,353.0,38.89,42.82,43.92,35.34,72.55,6,13,5,22
2,TRAIN_002,1,0.531267,T050304,A_31,,,,,,...,353.0,39.19,36.65,42.47,36.53,78.35,6,13,5,30
3,TRAIN_003,2,0.537325,T050307,A_31,,,,,,...,353.0,37.74,39.17,52.17,30.58,71.78,6,13,5,39
4,TRAIN_004,1,0.531590,T050304,A_31,,,,,,...,352.0,38.70,41.89,46.93,33.09,76.97,6,13,5,47
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,TRAIN_593,1,0.526546,T100306,T_31,2.0,95.0,10.0,50.0,10.0,...,,,,,,,9,8,14,30
594,TRAIN_594,0,0.524022,T050304,A_31,,,,,,...,353.0,49.47,53.07,50.89,55.10,66.49,9,8,22,38
595,TRAIN_595,0,0.521289,T050304,A_31,,,,,,...,352.0,,,,,,9,8,22,47
596,TRAIN_596,1,0.531375,T100304,O_31,40.0,94.0,11.0,45.0,10.0,...,,,,,,,9,8,14,38


In [151]:
non_null_counts_3

PRODUCT_ID      78
Y_Class         78
Y_Quality       78
LINE            78
PRODUCT_CODE    78
                ..
X_2870          67
MONTH           78
DAY             78
HOUR            78
MINUTE          78
Length: 2425, dtype: int64

In [152]:
# 3,4번 라인 피쳐 분석
groups_lines=train_df.groupby('LINE')
line=0
for name, group in groups_lines:
    line+=1
    globals()['group_line{}_df'.format(line)] = groups_lines.get_group(name)
non_null_counts_3 = group_line3_df.notnull().sum()
non_null_cols_3 = non_null_counts_3[non_null_counts_3 > 0].index.tolist()


grouped3 = group_line3_df.groupby(group_line3_df.isnull().apply(tuple, axis=1))
cnt=0
for name, group in grouped3:
    cnt+=1
    globals()['line3_{}_df'.format(cnt)] = grouped3.get_group(name)
for i in range(1,cnt+1):
  globals()['non_null_cols3_{}'.format(i)] = globals()['line3_{}_df'.format(i)].columns[globals()['line3_{}_df'.format(i)].notnull().all()].tolist()
  del globals()['non_null_cols3_{}'.format(i)][globals()['non_null_cols3_{}'.format(i)].index('PRODUCT_ID')]


grouped4 = group_line4_df.groupby(group_line4_df.isnull().apply(tuple, axis=1))
cnt=0
for name, group in grouped4:
    cnt+=1
    globals()['line4_{}_df'.format(cnt)] = grouped4.get_group(name)
for i in range(1,cnt+1):
  globals()['non_null_cols4_{}'.format(i)] = globals()['line4_{}_df'.format(i)].columns[globals()['line4_{}_df'.format(i)].notnull().all()].tolist()
  del globals()['non_null_cols4_{}'.format(i)][globals()['non_null_cols4_{}'.format(i)].index('PRODUCT_ID')]


# 3번라인과 4번라인 피쳐들의 합집합구하기
col_34=[]
for i in non_null_cols3_3:
  if i not in col_34:
    col_34.append(i)
for j in non_null_cols4_1:
  if j not in col_34:
    col_34.append(j)

# 3,4번 라인 튜닝용 데이터 셋 저장 
col_34_train = col_34
train_x = train_df[col_34_train]

col_34_test = col_34
del col_34_test[col_34_test.index('Y_Quality')]
del col_34_test[col_34_test.index('Y_Class')]

test_x = test_df[col_34_test]

train_x34 = train_x[train_x['LINE'].isin(['T050304','T050307'])]
test_x34 = test_x[test_x['LINE'].isin(['T050304','T050307'])]

In [153]:
train_df[col_34_train]

Unnamed: 0,LINE,PRODUCT_CODE,X_128,X_129,X_132,X_133,X_134,X_136,X_137,X_138,...,X_2541,X_2542,X_2545,X_2548,X_2551,X_2721,X_2722,X_2723,X_2773,X_2774
0,T050304,A_31,7813.0,7813.0,0.19,0.20,0.19,228.0,228.0,225.0,...,,,,,,,,,,
1,T050307,A_31,,,0.20,0.21,0.20,413.0,414.0,414.0,...,0.199812,0.197,1.1,1.0,1.144928,2.0,1.995500,1.97,0.0,0.0
2,T050304,A_31,7815.0,7815.0,0.19,0.20,0.19,228.0,228.0,225.0,...,,,,,,,,,,
3,T050307,A_31,,,0.20,0.21,0.20,414.0,414.0,414.0,...,0.199941,0.197,1.1,1.0,1.142647,2.0,1.993333,1.97,0.0,0.0
4,T050304,A_31,7817.0,7817.0,0.19,0.20,0.18,228.0,228.0,225.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,T100306,T_31,,,,,,,,,...,,,,,,,,,,
594,T050304,A_31,14810.0,14810.0,0.19,0.20,0.19,304.0,304.0,304.0,...,,,,,,,,,,
595,T050304,A_31,14813.0,14813.0,0.19,0.20,0.19,304.0,304.0,304.0,...,,,,,,,,,,
596,T100304,O_31,,,,,,,,,...,,,,,,,,,,


In [184]:
train_x34

Unnamed: 0,X_128,X_129,X_132,X_133,X_134,X_136,X_137,X_138,X_139,X_140,...,X_2541,X_2542,X_2545,X_2548,X_2551,X_2721,X_2722,X_2723,X_2773,X_2774
0,7813.0,7813.0,0.19,0.20,0.19,228.0,228.0,225.0,582.9,587.1,...,0.000000,0.000,0.0,0.0,0.000000,0.0,0.000000,0.00,0.0,0.0
1,0.0,0.0,0.20,0.21,0.20,413.0,414.0,414.0,589.3,595.8,...,0.199812,0.197,1.1,1.0,1.144928,2.0,1.995500,1.97,0.0,0.0
2,7815.0,7815.0,0.19,0.20,0.19,228.0,228.0,225.0,583.8,587.6,...,0.000000,0.000,0.0,0.0,0.000000,0.0,0.000000,0.00,0.0,0.0
3,0.0,0.0,0.20,0.21,0.20,414.0,414.0,414.0,589.8,596.1,...,0.199941,0.197,1.1,1.0,1.142647,2.0,1.993333,1.97,0.0,0.0
4,7817.0,7817.0,0.19,0.20,0.18,228.0,228.0,225.0,583.2,587.3,...,0.000000,0.000,0.0,0.0,0.000000,0.0,0.000000,0.00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
581,11864.0,11864.0,0.19,0.20,0.19,230.0,230.0,230.0,584.0,588.7,...,0.000000,0.000,0.0,0.0,0.000000,0.0,0.000000,0.00,0.0,0.0
582,11898.0,11898.0,0.18,0.20,0.18,230.0,230.0,230.0,583.3,588.4,...,0.000000,0.000,0.0,0.0,0.000000,0.0,0.000000,0.00,0.0,0.0
583,11920.0,11920.0,0.19,0.20,0.19,230.0,230.0,230.0,583.5,588.3,...,0.000000,0.000,0.0,0.0,0.000000,0.0,0.000000,0.00,0.0,0.0
594,14810.0,14810.0,0.19,0.20,0.19,304.0,304.0,304.0,588.9,594.6,...,0.000000,0.000,0.0,0.0,0.000000,0.0,0.000000,0.00,0.0,0.0


In [182]:
test_x34

Unnamed: 0,X_128,X_129,X_132,X_133,X_134,X_136,X_137,X_138,X_139,X_140,...,X_2541,X_2542,X_2545,X_2548,X_2551,X_2721,X_2722,X_2723,X_2773,X_2774
7,18031.0,18031.0,0.19,0.2,0.19,354.0,354.0,354.0,591.5,597.7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,18064.0,18064.0,0.19,0.2,0.19,355.0,354.0,355.0,591.5,597.7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,0.0,0.0,0.21,0.22,0.21,230.0,230.0,231.0,580.6,584.1,...,0.0,0.0,0.0,0.0,0.0,2.1,2.098846,2.07,0.0,0.0
14,295.0,295.0,0.19,0.21,0.19,60.0,60.0,60.0,558.4,561.9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35,8632.0,8632.0,0.2,0.21,0.2,249.0,249.0,249.0,583.1,587.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
36,0.0,0.0,0.21,0.22,0.21,99.0,99.0,99.0,581.5,585.0,...,0.0,0.0,0.0,0.0,0.0,2.1,2.091923,2.07,0.0,0.0
41,10712.0,10712.0,0.2,0.21,0.2,293.0,293.0,293.0,580.9,585.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42,10729.0,10729.0,0.2,0.21,0.19,293.0,293.0,293.0,581.0,585.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
52,12261.0,12261.0,0.19,0.21,0.19,330.0,330.0,330.0,582.9,587.9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53,12283.0,12283.0,0.2,0.21,0.2,330.0,330.0,330.0,583.4,588.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [154]:
train_x 

Unnamed: 0,Y_Class,Y_Quality,LINE,PRODUCT_CODE,X_128,X_129,X_132,X_133,X_134,X_136,...,X_2541,X_2542,X_2545,X_2548,X_2551,X_2721,X_2722,X_2723,X_2773,X_2774
0,1,0.533433,T050304,A_31,7813.0,7813.0,0.19,0.20,0.19,228.0,...,,,,,,,,,,
1,2,0.541819,T050307,A_31,,,0.20,0.21,0.20,413.0,...,0.199812,0.197,1.1,1.0,1.144928,2.0,1.995500,1.97,0.0,0.0
2,1,0.531267,T050304,A_31,7815.0,7815.0,0.19,0.20,0.19,228.0,...,,,,,,,,,,
3,2,0.537325,T050307,A_31,,,0.20,0.21,0.20,414.0,...,0.199941,0.197,1.1,1.0,1.142647,2.0,1.993333,1.97,0.0,0.0
4,1,0.531590,T050304,A_31,7817.0,7817.0,0.19,0.20,0.18,228.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,1,0.526546,T100306,T_31,,,,,,,...,,,,,,,,,,
594,0,0.524022,T050304,A_31,14810.0,14810.0,0.19,0.20,0.19,304.0,...,,,,,,,,,,
595,0,0.521289,T050304,A_31,14813.0,14813.0,0.19,0.20,0.19,304.0,...,,,,,,,,,,
596,1,0.531375,T100304,O_31,,,,,,,...,,,,,,,,,,


In [155]:
test_x

Unnamed: 0,LINE,PRODUCT_CODE,X_128,X_129,X_132,X_133,X_134,X_136,X_137,X_138,...,X_2541,X_2542,X_2545,X_2548,X_2551,X_2721,X_2722,X_2723,X_2773,X_2774
0,T100306,T_31,,,,,,,,,...,,,,,,,,,,
1,T100304,T_31,,,,,,,,,...,,,,,,,,,,
2,T100304,T_31,,,,,,,,,...,,,,,,,,,,
3,T010305,A_31,,,,,,,,,...,,,,,,,,,,
4,T010306,A_31,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,T100306,T_31,,,,,,,,,...,,,,,,,,,,
306,T100304,T_31,,,,,,,,,...,,,,,,,,,,
307,T100306,T_31,,,,,,,,,...,,,,,,,,,,
308,T100306,T_31,,,,,,,,,...,,,,,,,,,,


In [156]:
# 전체 학습용 데이터 결측값 전처리
train_x = train_df.drop(columns=['PRODUCT_ID'])
test_x = test_df.drop(columns=['PRODUCT_ID'])
train_x = train_x.fillna(0)
test_x = test_x.fillna(0)

In [157]:
train_x

Unnamed: 0,Y_Class,Y_Quality,LINE,PRODUCT_CODE,X_1,X_2,X_5,X_7,X_8,X_9,...,X_2865,X_2866,X_2867,X_2868,X_2869,X_2870,MONTH,DAY,HOUR,MINUTE
0,1,0.533433,T050304,A_31,0.0,0.0,0.0,0.0,0.0,0.0,...,353.0,39.34,40.89,32.56,34.09,77.77,6,13,5,14
1,2,0.541819,T050307,A_31,0.0,0.0,0.0,0.0,0.0,0.0,...,353.0,38.89,42.82,43.92,35.34,72.55,6,13,5,22
2,1,0.531267,T050304,A_31,0.0,0.0,0.0,0.0,0.0,0.0,...,353.0,39.19,36.65,42.47,36.53,78.35,6,13,5,30
3,2,0.537325,T050307,A_31,0.0,0.0,0.0,0.0,0.0,0.0,...,353.0,37.74,39.17,52.17,30.58,71.78,6,13,5,39
4,1,0.531590,T050304,A_31,0.0,0.0,0.0,0.0,0.0,0.0,...,352.0,38.70,41.89,46.93,33.09,76.97,6,13,5,47
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,1,0.526546,T100306,T_31,2.0,95.0,10.0,50.0,10.0,52.0,...,0.0,0.00,0.00,0.00,0.00,0.00,9,8,14,30
594,0,0.524022,T050304,A_31,0.0,0.0,0.0,0.0,0.0,0.0,...,353.0,49.47,53.07,50.89,55.10,66.49,9,8,22,38
595,0,0.521289,T050304,A_31,0.0,0.0,0.0,0.0,0.0,0.0,...,352.0,0.00,0.00,0.00,0.00,0.00,9,8,22,47
596,1,0.531375,T100304,O_31,40.0,94.0,11.0,45.0,10.0,31.0,...,0.0,0.00,0.00,0.00,0.00,0.00,9,8,14,38


In [158]:
test_x

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_5,X_7,X_8,X_9,X_11,X_12,...,X_2865,X_2866,X_2867,X_2868,X_2869,X_2870,MONTH,DAY,HOUR,MINUTE
0,T100306,T_31,2.0,94.0,10.0,51.0,10.0,52.0,469.6,474.4,...,0.0,0.0,0.0,0.0,0.0,0.0,9,9,2,1
1,T100304,T_31,2.0,93.0,11.0,45.0,10.0,31.0,506.6,511.1,...,0.0,0.0,0.0,0.0,0.0,0.0,9,9,2,9
2,T100304,T_31,2.0,95.0,11.0,45.0,10.0,31.0,506.6,511.4,...,0.0,0.0,0.0,0.0,0.0,0.0,9,9,8,42
3,T010305,A_31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,423.0,0.0,0.0,0.0,0.0,0.0,9,9,10,56
4,T010306,A_31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,423.0,0.0,0.0,0.0,0.0,0.0,9,9,11,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,T100306,T_31,2.0,91.0,10.0,51.0,10.0,52.0,502.5,511.9,...,0.0,0.0,0.0,0.0,0.0,0.0,11,5,11,18
306,T100304,T_31,2.0,96.0,11.0,45.0,10.0,31.0,513.7,518.9,...,0.0,0.0,0.0,0.0,0.0,0.0,11,5,16,39
307,T100306,T_31,2.0,91.0,10.0,50.0,10.0,52.0,502.8,511.6,...,0.0,0.0,0.0,0.0,0.0,0.0,11,5,16,47
308,T100306,T_31,2.0,95.0,10.0,51.0,10.0,52.0,503.2,512.6,...,0.0,0.0,0.0,0.0,0.0,0.0,11,5,20,53


In [159]:
qual_col = ['LINE', 'PRODUCT_CODE']
for i in qual_col:
    le = LabelEncoder()
    le = le.fit(train_x[i])
    train_x[i] = le.transform(train_x[i])
    
    for label in np.unique(test_x[i]): 
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    test_x[i] = le.transform(test_x[i]) 

train_y = train_x['Y_Quality']
train_yy = train_x['Y_Class']
train_x.drop(columns=['Y_Quality','Y_Class'],inplace=True)

In [160]:
for i,c in enumerate(train_x34.columns[~train_x34.columns.isin(['Y_Quality','Y_Class','LINE','PRODUCT_CODE'])]):
  train_x34[c].fillna(0,inplace=True)
  test_x34[c].fillna(0,inplace=True)

yy_train34 = train_x34['Y_Class']
train_x34 = train_x34.drop(columns=['Y_Class', 'Y_Quality','LINE','PRODUCT_CODE'])
test_x34 = test_x34.drop(columns=['LINE','PRODUCT_CODE'])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_x34[c].fillna(0,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_x34[c].fillna(0,inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform th

In [161]:
train_x34

Unnamed: 0,X_128,X_129,X_132,X_133,X_134,X_136,X_137,X_138,X_139,X_140,...,X_2541,X_2542,X_2545,X_2548,X_2551,X_2721,X_2722,X_2723,X_2773,X_2774
0,7813.0,7813.0,0.19,0.20,0.19,228.0,228.0,225.0,582.9,587.1,...,0.000000,0.000,0.0,0.0,0.000000,0.0,0.000000,0.00,0.0,0.0
1,0.0,0.0,0.20,0.21,0.20,413.0,414.0,414.0,589.3,595.8,...,0.199812,0.197,1.1,1.0,1.144928,2.0,1.995500,1.97,0.0,0.0
2,7815.0,7815.0,0.19,0.20,0.19,228.0,228.0,225.0,583.8,587.6,...,0.000000,0.000,0.0,0.0,0.000000,0.0,0.000000,0.00,0.0,0.0
3,0.0,0.0,0.20,0.21,0.20,414.0,414.0,414.0,589.8,596.1,...,0.199941,0.197,1.1,1.0,1.142647,2.0,1.993333,1.97,0.0,0.0
4,7817.0,7817.0,0.19,0.20,0.18,228.0,228.0,225.0,583.2,587.3,...,0.000000,0.000,0.0,0.0,0.000000,0.0,0.000000,0.00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
581,11864.0,11864.0,0.19,0.20,0.19,230.0,230.0,230.0,584.0,588.7,...,0.000000,0.000,0.0,0.0,0.000000,0.0,0.000000,0.00,0.0,0.0
582,11898.0,11898.0,0.18,0.20,0.18,230.0,230.0,230.0,583.3,588.4,...,0.000000,0.000,0.0,0.0,0.000000,0.0,0.000000,0.00,0.0,0.0
583,11920.0,11920.0,0.19,0.20,0.19,230.0,230.0,230.0,583.5,588.3,...,0.000000,0.000,0.0,0.0,0.000000,0.0,0.000000,0.00,0.0,0.0
594,14810.0,14810.0,0.19,0.20,0.19,304.0,304.0,304.0,588.9,594.6,...,0.000000,0.000,0.0,0.0,0.000000,0.0,0.000000,0.00,0.0,0.0


In [181]:
test_x34

Unnamed: 0,X_128,X_129,X_132,X_133,X_134,X_136,X_137,X_138,X_139,X_140,...,X_2541,X_2542,X_2545,X_2548,X_2551,X_2721,X_2722,X_2723,X_2773,X_2774
7,18031.0,18031.0,0.19,0.2,0.19,354.0,354.0,354.0,591.5,597.7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,18064.0,18064.0,0.19,0.2,0.19,355.0,354.0,355.0,591.5,597.7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,0.0,0.0,0.21,0.22,0.21,230.0,230.0,231.0,580.6,584.1,...,0.0,0.0,0.0,0.0,0.0,2.1,2.098846,2.07,0.0,0.0
14,295.0,295.0,0.19,0.21,0.19,60.0,60.0,60.0,558.4,561.9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35,8632.0,8632.0,0.2,0.21,0.2,249.0,249.0,249.0,583.1,587.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
36,0.0,0.0,0.21,0.22,0.21,99.0,99.0,99.0,581.5,585.0,...,0.0,0.0,0.0,0.0,0.0,2.1,2.091923,2.07,0.0,0.0
41,10712.0,10712.0,0.2,0.21,0.2,293.0,293.0,293.0,580.9,585.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42,10729.0,10729.0,0.2,0.21,0.19,293.0,293.0,293.0,581.0,585.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
52,12261.0,12261.0,0.19,0.21,0.19,330.0,330.0,330.0,582.9,587.9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53,12283.0,12283.0,0.2,0.21,0.2,330.0,330.0,330.0,583.4,588.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Model Fit & Inference

In [162]:
# regressor
reg = catboost.CatBoostRegressor(learning_rate=0.05, iterations=500, verbose=0)
reg.fit(train_x, train_y, early_stopping_rounds=100, cat_features=['PRODUCT_CODE', 'LINE'])

pre_preds = reg.predict(test_x)

a = train_df[['Y_Class','Y_Quality']].groupby('Y_Class').agg(['mean', 'min', 'max', 'count'])

preds=[]
for p in pre_preds:
  if p<=a[('Y_Quality','max')][0]:
    preds.append(0)
  elif p<=a[('Y_Quality','min')][2]:
    preds.append(1)
  else:
    preds.append(2)

RR = test_x

In [188]:
RR

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_5,X_7,X_8,X_9,X_11,X_12,...,X_2865,X_2866,X_2867,X_2868,X_2869,X_2870,MONTH,DAY,HOUR,MINUTE
0,5,2,2.0,94.0,10.0,51.0,10.0,52.0,469.6,474.4,...,0.0,0.0,0.0,0.0,0.0,0.0,9,9,2,1
1,4,2,2.0,93.0,11.0,45.0,10.0,31.0,506.6,511.1,...,0.0,0.0,0.0,0.0,0.0,0.0,9,9,2,9
2,4,2,2.0,95.0,11.0,45.0,10.0,31.0,506.6,511.4,...,0.0,0.0,0.0,0.0,0.0,0.0,9,9,8,42
3,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,423.0,0.0,0.0,0.0,0.0,0.0,9,9,10,56
4,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,423.0,0.0,0.0,0.0,0.0,0.0,9,9,11,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,5,2,2.0,91.0,10.0,51.0,10.0,52.0,502.5,511.9,...,0.0,0.0,0.0,0.0,0.0,0.0,11,5,11,18
306,4,2,2.0,96.0,11.0,45.0,10.0,31.0,513.7,518.9,...,0.0,0.0,0.0,0.0,0.0,0.0,11,5,16,39
307,5,2,2.0,91.0,10.0,50.0,10.0,52.0,502.8,511.6,...,0.0,0.0,0.0,0.0,0.0,0.0,11,5,16,47
308,5,2,2.0,95.0,10.0,51.0,10.0,52.0,503.2,512.6,...,0.0,0.0,0.0,0.0,0.0,0.0,11,5,20,53


In [164]:
# 3,4 line classifier
clf = catboost.CatBoostClassifier(random_seed=37,iterations=500, verbose=0)
clf.fit(train_x34, yy_train34)
y_preds = clf.predict(test_x34)

a = RR[(RR['LINE']==2)|(RR['LINE']==3)].index
temp = preds
for i in range(len(a)):
  temp[a[i]]=y_preds[i][0]

In [191]:
len(y_preds)

39

# Submit

In [165]:
submit = submission_df
submit['Y_Class'] = temp
submit
# submit.to_csv('./sample_submission.csv', index=False)

Unnamed: 0,PRODUCT_ID,Y_Class
0,TEST_000,1
1,TEST_001,1
2,TEST_002,1
3,TEST_003,0
4,TEST_004,1
...,...,...
305,TEST_305,1
306,TEST_306,0
307,TEST_307,1
308,TEST_308,1
