In [23]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import warnings
import random
import os
plt.style.use('ggplot')
warnings.filterwarnings(action='ignore')

In [2]:
def seed_everything(seed):
    """Seed the random sources this notebook relies on.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and
    seeds both Python's ``random`` module and NumPy's global generator.

    Parameters
    ----------
    seed : int
        Value passed to ``random.seed`` and ``np.random.seed``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    random.seed(seed)


seed_everything(37)  # fix the seed

In [3]:
def cyclical_feature(df):
    """Add sine/cosine encodings of the ``HOUR`` column to ``df`` in place.

    The hour is mapped onto a 24-step circle and stored in two new
    columns, ``sin_time`` and ``cos_time``. Nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that already has an ``HOUR`` column; it is modified in place.
    """
    angle = 2*np.pi*df['HOUR']/24
    df['sin_time'] = np.sin(angle)
    df['cos_time'] = np.cos(angle)

In [5]:
def timestamp_split(df):
    """Break the ``TIMESTAMP`` string column into numeric parts, in place.

    The timestamp is split on ``-`` for the date fields and on whitespace
    for the time-of-day part. The following columns are added, in this
    order: ``YEAR``, ``MONTH``, ``DAY``, ``TIME`` (the raw time string),
    ``HOUR`` and ``TIME_MIN`` (minutes plus ``HOUR * 60``). Nothing is
    returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``TIMESTAMP`` column; it is modified in place.
    """
    stamp = df['TIMESTAMP']

    # Date fields: pieces of the timestamp separated by '-'.
    date_parts = stamp.str.split('-')
    df['YEAR'] = date_parts.str[0].astype('int')
    df['MONTH'] = date_parts.str[1].astype('int')
    # The third piece still carries the time, so keep only its first token.
    df['DAY'] = date_parts.str[2].str.split().str[0].astype('int')

    # Time of day: second whitespace-separated token, then split on ':'.
    df['TIME'] = stamp.str.split().str[1]
    clock_parts = df['TIME'].str.split(':')
    df['HOUR'] = clock_parts.str[0].astype('int')
    df['TIME_MIN'] = clock_parts.str[1].astype('int') + df['HOUR']*60

In [4]:
# Load the training and test tables from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [6]:
# Add the timestamp-derived columns, then the cyclical hour encoding,
# to both frames (each helper modifies its argument in place).
for frame in (train, test):
    timestamp_split(frame)
    cyclical_feature(frame)

In [8]:
# Count missing values for every column of the full training frame.
#   nan_list  : [column, missing count] pairs for columns with at least one NaN
#   nan_cnt   : the missing counts alone (same order as nan_list)
#   nan_col   : the column names alone (same order as nan_list)
#   full_list : columns without any missing value
col_list = train.columns
nan_list = []
nan_cnt = []
nan_col = []
full_list = []
for col in col_list:
    n_missing = train[col].isnull().sum()
    if n_missing == 0:
        full_list.append(col)
        continue
    nan_list.append([col, n_missing])
    nan_cnt.append(n_missing)
    nan_col.append(col)

# Drop the columns in which every value is missing.
# The comparison uses the actual row count of `train` rather than a
# hard-coded number, so the check stays correct if the data size changes.
n_rows = len(train)
del_col = [name for name, n_missing in nan_list if n_missing == n_rows]
train = train.drop(columns=del_col)
test = test.drop(columns=del_col)
train.head(3)

In [10]:
# Split both frames into the 'A_31' product and everything else.
# Each subset is an explicit copy: later cells add columns to these frames
# (e.g. 'Y_quanlity', 'Y_Class'), and assigning into a boolean-mask slice of
# `train` / `test` is the pattern pandas flags with SettingWithCopyWarning.
is_a31_train = train['PRODUCT_CODE'] == 'A_31'
trainA_31 = train[is_a31_train].copy()
train_T_O = train[~is_a31_train].copy()

is_a31_test = test['PRODUCT_CODE'] == 'A_31'
testA_31 = test[is_a31_test].copy()
test_T_O = test[~is_a31_test].copy()

In [11]:
# Per-column missing-value tally for the A_31 training subset.
#   nan_listA_31  : [column, missing count] pairs (columns with NaNs)
#   nan_cntA_31   : missing counts only
#   nan_colA_31   : column names only
#   full_listA_31 : columns with no missing value
col_list = train.columns
nan_listA_31, nan_cntA_31, nan_colA_31, full_listA_31 = [], [], [], []
for col in col_list:
    missing = trainA_31[col].isnull().sum()
    if missing != 0:
        nan_listA_31.append([col, missing])
        nan_cntA_31.append(missing)
        nan_colA_31.append(col)
    else:
        full_listA_31.append(col)

In [12]:
# Remove every column whose values are all missing in the A_31 training
# subset; the same columns are removed from the A_31 test subset.
n_rows_A_31 = len(trainA_31)
del_col = [name for name, missing in nan_listA_31 if missing == n_rows_A_31]
trainA_31 = trainA_31.drop(columns=del_col)
testA_31 = testA_31.drop(columns=del_col)
trainA_31.head(3)

Unnamed: 0,PRODUCT_ID,Y_Class,Y_Quality,TIMESTAMP,LINE,PRODUCT_CODE,X_128,X_129,X_130,X_131,...,X_2870,X_2871,YEAR,MONTH,DAY,TIME,HOUR,TIME_MIN,sin_time,cos_time
0,TRAIN_000,1,0.533433,2022-06-13 5:14,T050304,A_31,7813.0,7813.0,,,...,77.77,,2022,6,13,5:14,5,314,0.965926,0.258819
1,TRAIN_001,2,0.541819,2022-06-13 5:22,T050307,A_31,,,19854.0,19854.0,...,72.55,,2022,6,13,5:22,5,322,0.965926,0.258819
2,TRAIN_002,1,0.531267,2022-06-13 5:30,T050304,A_31,7815.0,7815.0,,,...,78.35,,2022,6,13,5:30,5,330,0.965926,0.258819


In [13]:
# Per-column missing-value tally for the non-A_31 training subset.
#   nan_listT_O  : [column, missing count] pairs (columns with NaNs)
#   nan_cntT_O   : missing counts only
#   nan_colT_O   : column names only
#   full_listT_O : columns with no missing value
col_list = train.columns
nan_listT_O, nan_cntT_O, nan_colT_O, full_listT_O = [], [], [], []
for col in col_list:
    missing = train_T_O[col].isnull().sum()
    if missing != 0:
        nan_listT_O.append([col, missing])
        nan_cntT_O.append(missing)
        nan_colT_O.append(col)
    else:
        full_listT_O.append(col)

In [14]:
# Remove every column whose values are all missing in the non-A_31 training
# subset; the same columns are removed from the matching test subset.
n_rows_T_O = len(train_T_O)
del_col = [name for name, missing in nan_listT_O if missing == n_rows_T_O]
train_T_O = train_T_O.drop(columns=del_col)
test_T_O = test_T_O.drop(columns=del_col)
train_T_O.head(3)

Unnamed: 0,PRODUCT_ID,Y_Class,Y_Quality,TIMESTAMP,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,...,X_932,X_933,YEAR,MONTH,DAY,TIME,HOUR,TIME_MIN,sin_time,cos_time
22,TRAIN_022,0,0.517719,2022-06-14 8:53,T100304,T_31,2.0,102.0,0.0,45.0,...,13.443333,13.2,2022,6,14,8:53,8,533,0.866025,-0.5
23,TRAIN_023,0,0.51909,2022-06-14 9:01,T100304,T_31,2.0,102.0,0.0,45.0,...,,,2022,6,14,9:01,9,541,0.707107,-0.707107
25,TRAIN_025,1,0.529362,2022-06-19 9:11,T100304,T_31,2.0,97.0,0.0,45.0,...,13.454839,13.2,2022,6,19,9:11,9,551,0.707107,-0.707107


In [15]:
# Columns that are excluded from the model inputs.
meta_cols = ['PRODUCT_ID', 'TIMESTAMP', 'PRODUCT_CODE', 'YEAR', 'MONTH', 'DAY', 'TIME']
target_cols = ['Y_Class', 'Y_Quality']

# Feature matrices: training frames also lose the two target columns.
trainA_31_x = trainA_31.drop(columns=meta_cols + target_cols)
testA_31_x = testA_31.drop(columns=meta_cols)
train_T_O_x = train_T_O.drop(columns=meta_cols + target_cols)
test_T_O_x = test_T_O.drop(columns=meta_cols)

# classification targets
trainA_31_y_c, train_T_O_y_c = trainA_31['Y_Class'], train_T_O['Y_Class']

# regression targets
trainA_31_y_r, train_T_O_y_r = trainA_31['Y_Quality'], train_T_O['Y_Quality']

In [16]:
# Show the (rows, columns) of each feature matrix, one per line.
feature_shapes = [frame.shape for frame in (trainA_31_x, testA_31_x, train_T_O_x, test_T_O_x)]
print(*feature_shapes, sep=' \n ')

(249, 2121) 
 (67, 2121) 
 (349, 682) 
 (243, 682)


In [17]:
# Replace the remaining missing values with the sentinel -1 in all four
# feature matrices.
trainA_31_x, testA_31_x, train_T_O_x, test_T_O_x = (
    frame.fillna(-1)
    for frame in (trainA_31_x, testA_31_x, train_T_O_x, test_T_O_x)
)

In [24]:
# qualitative to quantitative
# Convert the categorical columns of the A_31 feature matrices to integer codes.
qual_col = ['LINE']
for i in qual_col:
    # Learn the classes from the training column, then encode it.
    le = LabelEncoder().fit(trainA_31_x[i])
    trainA_31_x[i] = le.transform(trainA_31_x[i])

    # Labels that appear only in the test column are appended to the known
    # classes so that transform() can encode them as well.
    unseen = [label for label in np.unique(testA_31_x[i]) if label not in le.classes_]
    if unseen:
        le.classes_ = np.append(le.classes_, unseen)
    testA_31_x[i] = le.transform(testA_31_x[i])
print('Done.')

Done.


In [25]:
# qualitative to quantitative
# Convert the categorical columns of the non-A_31 feature matrices to integer codes.
qual_col = ['LINE']
for i in qual_col:
    # Learn the classes from the training column, then encode it.
    le = LabelEncoder().fit(train_T_O_x[i])
    train_T_O_x[i] = le.transform(train_T_O_x[i])

    # Labels that appear only in the test column are appended to the known
    # classes so that transform() can encode them as well.
    unseen = [label for label in np.unique(test_T_O_x[i]) if label not in le.classes_]
    if unseen:
        le.classes_ = np.append(le.classes_, unseen)
    test_T_O_x[i] = le.transform(test_T_O_x[i])
print('Done.')

Done.


In [19]:
from catboost import *

In [26]:
# Regress Y_Quality for the A_31 subset and predict on its test rows.
model = CatBoostRegressor(
    iterations=500,
    random_state=110,
    verbose=500,
)
model.fit(trainA_31_x, trainA_31_y_r)
pred_a = model.predict(testA_31_x)

Learning rate set to 0.051607
0:	learn: 0.0097677	total: 290ms	remaining: 2m 24s
499:	learn: 0.0001742	total: 37.3s	remaining: 0us


In [27]:
# Fit the existing `model` object again, this time on the non-A_31 subset,
# and predict Y_Quality for the matching test rows.
model.fit(X=train_T_O_x, y=train_T_O_y_r)
pred_t = model.predict(data=test_T_O_x)

Learning rate set to 0.054803
0:	learn: 0.0046901	total: 24.1ms	remaining: 12s
499:	learn: 0.0001228	total: 16.9s	remaining: 0us


In [28]:
# Store each subset's regression output in a 'Y_quanlity' column.
for frame, predicted in ((testA_31, pred_a), (test_T_O, pred_t)):
    frame['Y_quanlity'] = predicted

In [29]:
# Start every test row at class 1.
for frame in (testA_31, test_T_O):
    frame['Y_Class'] = 1

In [30]:
# Derive the class from the predicted quality using two fixed cut-offs:
# below the lower cut-off -> class 0, above the upper cut-off -> class 2.
# Rows in between keep the Y_Class value they already have.
lower_cut, upper_cut = 0.52507, 0.5349
for frame in (testA_31, test_T_O):
    frame.loc[frame['Y_quanlity'] < lower_cut, 'Y_Class'] = 0
    frame.loc[frame['Y_quanlity'] > upper_cut, 'Y_Class'] = 2

In [31]:
# Read the submission template twice: one frame per product subset.
submita, submitt = (pd.read_csv('sample_submission.csv') for _ in range(2))

In [32]:
# Attach the predicted class to the template IDs of each subset
# (merge on PRODUCT_ID with the default join type).
id_col = 'PRODUCT_ID'
submita = pd.merge(submita[[id_col]], testA_31[[id_col, 'Y_Class']], on=id_col)
submitt = pd.merge(submitt[[id_col]], test_T_O[[id_col, 'Y_Class']], on=id_col)

# Stack both subsets, order by PRODUCT_ID and write the submission file.
submission = pd.concat([submita, submitt]).sort_values(by=id_col)
submission.to_csv('sub0203_cat_reg.csv', index=False)

In [36]:
# Inspect the classes assigned to the A_31 test rows.
testA_31['Y_Class'].to_numpy()

array([0, 1, 1, 1, 1, 1, 1, 1, 2, 1, 0, 0, 1, 1, 0, 2, 0, 1, 1, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0,
       0], dtype=int64)

In [35]:
# Inspect the classes assigned to the non-A_31 test rows.
test_T_O['Y_Class'].to_numpy()

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1], dtype=int64)