## import & version

In [11]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings(action='ignore')
import catboost
from catboost import *
import sklearn
from sklearn.preprocessing import LabelEncoder
import sys

In [17]:
# Report the library and interpreter versions this notebook was run with.
# Every '\n' is its own argument, so print's default single-space separator
# supplies the leading space on each following line.
version_report = [
    ' pandas version   : ', pd.__version__, '\n',
    'numpy version    : ', np.__version__, '\n',
    'catboost version : ', catboost.__version__, '\n',
    'sklearn version  : ', sklearn.__version__, '\n',
    'python version   : ', sys.version,
]
print(*version_report)

 pandas version   :  1.5.2 
 numpy version    :  1.23.5 
 catboost version :  1.0.6 
 sklearn version  :  1.2.0 
 python version   :  3.9.16 (main, Jan 11 2023, 10:02:19) 
[Clang 14.0.6 ]


## Feature Engineering
#### 함수 정의

In [40]:
def del_columns(train, test):
    """Drop uninformative columns from ``train`` and mirror the drop on ``test``.

    Both passes are decided on ``train`` only:

    1. Columns whose values are all missing are removed.
    2. Numeric columns with exactly one distinct non-missing value are
       removed, except ``Y_Class``, ``Y_Quality``, ``LINE`` and
       ``PRODUCT_CODE``, which are always kept.

    Args:
        train: Training frame; it decides which columns are dropped.
        test: Test frame; must contain every column that gets dropped.

    Returns:
        ``(train, test)`` as new frames; the frames passed in are not modified.

    Raises:
        KeyError: If a column selected for removal is absent from ``test``.
    """
    protected = {'Y_Class', 'Y_Quality', 'LINE', 'PRODUCT_CODE'}

    # Pass 1: columns with no observed value at all. The "> 0" guard keeps an
    # empty training frame from having every column classified as all-missing.
    missing_counts = train.isnull().sum()
    all_missing = missing_counts.index[
        (missing_counts > 0) & (missing_counts == len(train))
    ].tolist()
    train = train.drop(columns=all_missing)
    test = test.drop(columns=all_missing)

    # Pass 2: constant numeric columns. nunique() ignores NaN, so a column
    # holding one value plus missing entries also counts as constant.
    distinct_counts = train.select_dtypes(include=[np.number]).nunique()
    constant = [
        col for col in distinct_counts.index[distinct_counts == 1]
        if col not in protected
    ]
    train = train.drop(columns=constant)
    test = test.drop(columns=constant)

    return train, test
   
def make_train_test_dataset(train,test):
    """Split the raw frames into model inputs and training targets.

    Args:
        train: Training frame containing ``PRODUCT_ID``, ``TIMESTAMP``,
            ``PRODUCT_CODE``, ``Y_Class`` and ``Y_Quality``.
        test: Test frame containing ``PRODUCT_ID``, ``TIMESTAMP`` and
            ``PRODUCT_CODE``.

    Returns:
        ``(train_x, test_x, train_y, train_w)`` where ``train_x``/``test_x``
        are the frames without the identifier (and, for train, target)
        columns, ``train_y`` is the ``Y_Quality`` Series and ``train_w`` is a
        one-column DataFrame holding ``Y_Class``.
    """
    id_columns = ['PRODUCT_ID', 'TIMESTAMP', 'PRODUCT_CODE']
    target_columns = ['Y_Class', 'Y_Quality']

    features_train = train.drop(columns=id_columns + target_columns)
    features_test = test.drop(columns=id_columns)

    quality = train['Y_Quality']
    # Double brackets keep Y_Class as a DataFrame rather than a Series.
    class_frame = train[['Y_Class']]
    return features_train, features_test, quality, class_frame

def fillna(train,test,value):
    """Replace missing entries in both frames with ``value``.

    Args:
        train: Training frame.
        test: Test frame.
        value: Replacement handed to ``DataFrame.fillna``.

    Returns:
        ``(train, test)`` as filled copies; the inputs are left untouched.
    """
    filled = tuple(frame.fillna(value) for frame in (train, test))
    return filled

def labelencoder(train,test,col_list):
    """Integer-encode the given categorical columns of ``train`` and ``test``.

    For each column, the distinct labels found in ``train`` are sorted and
    numbered from 0. Labels that occur only in ``test`` are numbered after
    them, also in sorted order, so unseen test categories never raise.

    An explicit label -> code mapping is used instead of extending a fitted
    encoder's class list: that list is no longer sorted once labels are
    appended, and a sorted list is what numeric label lookup relies on.

    Missing values are not encoded; they stay NaN (and make the column float).

    Args:
        train: Training frame; its labels define the first block of codes.
        test: Test frame, encoded with the same mapping.
        col_list: Names of the columns to encode.

    Returns:
        ``(train, test)`` — the same frame objects that were passed in, with
        the listed columns overwritten in place by their integer codes.
    """
    for col in col_list:
        known = sorted(train[col].dropna().unique())
        # Set difference replaces a linear membership scan per test label.
        unseen = sorted(set(test[col].dropna().unique()) - set(known))
        codes = {label: code for code, label in enumerate(known + unseen)}

        train[col] = train[col].map(codes)
        test[col] = test[col].map(codes)
    return train, test

def pred_target(arr, low=-0.64421190232267, high=-0.6256814053066195):
    """Convert predicted scores into class labels 0, 1 or 2, in place.

    A score below ``low`` becomes 0, a score from ``low`` up to and including
    ``high`` becomes 1, and anything else becomes 2. NaN fails both
    comparisons and therefore also becomes 2.

    Args:
        arr: Mutable, integer-indexable sequence of scores (list or 1-D
            array). Its elements are overwritten with the labels.
        low: Lower cut point. The default is the value this notebook uses for
            models trained on log-transformed ``Y_Quality``.
        high: Upper cut point (inclusive for class 1); default as for ``low``.

    Returns:
        The same ``arr`` object. A float array keeps its dtype, so the labels
        are stored as 0.0 / 1.0 / 2.0 there.
    """
    for idx, score in enumerate(arr):
        if score < low:
            arr[idx] = 0
        elif score <= high:
            arr[idx] = 1
        else:
            arr[idx] = 2
    return arr

def _prepare_product_subset(train, test, product_code):
    """Build model-ready features and the target for one ``PRODUCT_CODE``."""
    train_part = train[train['PRODUCT_CODE'] == product_code]
    test_part = test[test['PRODUCT_CODE'] == product_code]

    # Re-run the column pruning per product: a column can be empty or
    # constant inside one product even when it is not across the whole data.
    train_part, test_part = del_columns(train_part, test_part)

    train_x, test_x, train_y, _ = make_train_test_dataset(train_part, test_part)
    train_x, test_x = labelencoder(train_x, test_x, ['LINE'])
    train_x, test_x = fillna(train_x, test_x, -1)
    return train_x, test_x, train_y, test_part


def make_dataset(train_path, test_path):
    """Load the CSVs and build one train/test data set per product code.

    ``Y_Quality`` is replaced by its natural logarithm, uninformative columns
    are pruned on the full data, and the rows are then split into the product
    codes ``A_31``, ``T_31`` and ``O_31``. Each subset is pruned again,
    stripped of identifier/target columns, has ``LINE`` integer-encoded and
    has its remaining missing values filled with -1. The resulting feature
    shapes are printed.

    Args:
        train_path: Path of the training CSV.
        test_path: Path of the test CSV.

    Returns:
        A 12-tuple: for A_31, then T_31, then O_31, the four items
        ``(train_x, test_x, train_y, test_subset)``. ``train_y`` is the
        log-scaled ``Y_Quality``; ``test_subset`` is the pruned test frame for
        that product, still carrying its identifier columns.
    """
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

    # y quality scaling (assumes Y_Quality is strictly positive)
    train['Y_Quality'] = train['Y_Quality'].map(np.log)

    train, test = del_columns(train, test)

    a_train_x, a_test_x, a_train_y, a_test = _prepare_product_subset(train, test, 'A_31')
    t_train_x, t_test_x, t_train_y, t_test = _prepare_product_subset(train, test, 'T_31')
    o_train_x, o_test_x, o_train_y, o_test = _prepare_product_subset(train, test, 'O_31')

    print(" train_a_shape : ",a_train_x.shape,a_test_x.shape,'\n',
          "train_t_shape : ",t_train_x.shape,t_test_x.shape,'\n',
          "train_o_shape : ",o_train_x.shape,o_test_x.shape)

    return (
        a_train_x, a_test_x, a_train_y, a_test,
        t_train_x, t_test_x, t_train_y, t_test,
        o_train_x, o_test_x, o_train_y, o_test,
    )

#### 실행

In [19]:
# Input CSV locations, given relative to the working directory.
train_input, test_input = 'train.csv', 'test.csv'

In [41]:
# Build, per product code, the feature matrices, the training target and the
# matching test subset (one row of names per product).
(
    train_A_31_x, test_A_31_x, train_A_31_y, test_A_31,
    train_T_31_x, test_T_31_x, train_T_31_y, test_T_31,
    train_O_31_x, test_O_31_x, train_O_31_y, test_O_31,
) = make_dataset(train_input, test_input)

 train_a_shape :  (249, 1866) (67, 1866) 
 train_t_shape :  (343, 550) (239, 550) 
 train_o_shape :  (6, 499) (4, 499)


## 학습 및 예측

In [24]:
# Single regressor configuration; the same object is fitted once per product
# code in the cells that follow.
model = CatBoostRegressor(
    iterations=1500,
    learning_rate=0.033,
    random_state=1234,
    verbose=500,  # progress line every 500 iterations
)

In [31]:
# Fit on the A_31 rows, then score the A_31 test rows.
model.fit(X=train_A_31_x, y=train_A_31_y)
pred_a = model.predict(data=test_A_31_x)

0:	learn: 0.0183320	total: 19.7ms	remaining: 29.6s
500:	learn: 0.0012455	total: 7.35s	remaining: 14.7s
1000:	learn: 0.0001219	total: 14.7s	remaining: 7.33s
1499:	learn: 0.0000129	total: 22.1s	remaining: 0us


In [32]:
# Threshold the A_31 regression scores into class labels 0 / 1 / 2.
pred_a = pred_target(pred_a)

In [33]:
# Fit the shared model on the T_31 rows, then score the T_31 test rows.
model.fit(X=train_T_31_x, y=train_T_31_y)
pred_t = model.predict(data=test_T_31_x)

0:	learn: 0.0089112	total: 8.27ms	remaining: 12.4s
500:	learn: 0.0009494	total: 2.9s	remaining: 5.78s
1000:	learn: 0.0001266	total: 5.71s	remaining: 2.84s
1499:	learn: 0.0000173	total: 8.58s	remaining: 0us


In [34]:
# Threshold the T_31 regression scores into class labels 0 / 1 / 2.
pred_t = pred_target(pred_t)

In [35]:
# Fit the shared model on the O_31 rows, then score the O_31 test rows.
model.fit(X=train_O_31_x, y=train_O_31_y)
pred_o = model.predict(data=test_O_31_x)

0:	learn: 0.0059697	total: 1.91ms	remaining: 2.86s
500:	learn: 0.0000906	total: 656ms	remaining: 1.31s
1000:	learn: 0.0000014	total: 1.26s	remaining: 626ms
1499:	learn: 0.0000000	total: 1.85s	remaining: 0us


In [36]:
# Threshold the O_31 regression scores into class labels 0 / 1 / 2.
pred_o = pred_target(pred_o)

In [47]:
# Attach the predicted classes to the per-product test subsets.
test_A_31['Y_Class'] = pred_a
test_T_31['Y_Class'] = pred_t
test_O_31['Y_Class'] = pred_o

# Read the submission template once; only its PRODUCT_ID column is used.
sample_ids = pd.read_csv('sample_submission.csv')[['PRODUCT_ID']]

# pd.merge defaults to an inner join, so each result keeps only the template
# ids present in that product subset. An id with no prediction in any of the
# three subsets is therefore silently missing from the final file.
submita = pd.merge(sample_ids,test_A_31[['PRODUCT_ID','Y_Class']],on='PRODUCT_ID')
submitt = pd.merge(sample_ids,test_T_31[['PRODUCT_ID','Y_Class']],on='PRODUCT_ID')
submito = pd.merge(sample_ids,test_O_31[['PRODUCT_ID','Y_Class']],on='PRODUCT_ID')

# Recombine the three products and order the rows by PRODUCT_ID.
submit_final = pd.concat([submita,submitt,submito]).sort_values(by='PRODUCT_ID')
# Store the 0 / 1 / 2 labels as integers before writing.
submit_final['Y_Class'] = submit_final['Y_Class'].astype('int')
submit_final.to_csv('final_test.csv',index=False)