In [2]:
import pandas as pd
import random
import os
import numpy as np
import matplotlib.pyplot as plt
import warnings; warnings.filterwarnings(action='ignore')
import pickle

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

In [3]:
def seed_everything(seed):
    random.seed(seed) #파이썬 자체 모듈 random 모듈의 시드 고정
    os.environ['PYTHONHASHSEED'] = str(seed) 
    np.random.seed(seed) #넘파이를 사용할 경우
#     torch.manual_seed(seed)
#     torch.cuda.manual_seed(seed)
#     torch.backends.cudnn.deterministic = True
    
seed_everything(37) # Seed 고정

# Data Load

In [4]:
#데이터프레임 불러오기
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')
submission = pd.read_csv('./sample_submission.csv')

In [5]:
train_df.head()

Unnamed: 0,PRODUCT_ID,Y_Class,Y_Quality,TIMESTAMP,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,TRAIN_000,1,0.533433,2022-06-13 5:14,T050304,A_31,,,,,...,39.34,40.89,32.56,34.09,77.77,,,,,
1,TRAIN_001,2,0.541819,2022-06-13 5:22,T050307,A_31,,,,,...,38.89,42.82,43.92,35.34,72.55,,,,,
2,TRAIN_002,1,0.531267,2022-06-13 5:30,T050304,A_31,,,,,...,39.19,36.65,42.47,36.53,78.35,,,,,
3,TRAIN_003,2,0.537325,2022-06-13 5:39,T050307,A_31,,,,,...,37.74,39.17,52.17,30.58,71.78,,,,,
4,TRAIN_004,1,0.53159,2022-06-13 5:47,T050304,A_31,,,,,...,38.7,41.89,46.93,33.09,76.97,,,,,


In [6]:
test_df.head()

Unnamed: 0,PRODUCT_ID,TIMESTAMP,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,TEST_000,2022-09-09 2:01,T100306,T_31,2.0,94.0,0.0,45.0,10.0,0.0,...,,,,,,,,,,
1,TEST_001,2022-09-09 2:09,T100304,T_31,2.0,93.0,0.0,45.0,11.0,0.0,...,,,,,,,,,,
2,TEST_002,2022-09-09 8:42,T100304,T_31,2.0,95.0,0.0,45.0,11.0,0.0,...,,,,,,,,,,
3,TEST_003,2022-09-09 10:56,T010305,A_31,,,,,,,...,,,,,,,,,,
4,TEST_004,2022-09-09 11:04,T010306,A_31,,,,,,,...,,,,,,,,,,


In [7]:
submission.head()

Unnamed: 0,PRODUCT_ID,Y_Class
0,TEST_000,0
1,TEST_001,0
2,TEST_002,0
3,TEST_003,0
4,TEST_004,0


위 세 가지 데이터 프레임을 보고 우리가 무엇을 예측해야하는지 알 수 있다.

# Data Preprocessing

데이터 전처리(Data preprocessing)의 목적은 

주어진원본 데이터를 신경망에 적용하기 쉽도록 만드는 것이다.

벡터화(vectorization), 정규화(normalization), 

특성 추출(Feature Engineering)등이 포함된다.

### Feature Engineering

데이터에는 수치형과 범주형 데이터가 있는데, 둘 중 수치치형만 사용하고 싶은 경우, 다른 범주형 데이터를 drop함수를 이용하여 다 삭제한다.(데이터 처리)

train_df에 적용시킨 사항은 test_df에도 똑같이 적용시켜준다.

In [16]:
## train data preprocessing(timestamp)
timestp_lst = []
for i in train_df.TIMESTAMP:
    print(i[-5:].split(':')[0])
    timestp_lst.append(i[-5:].split(':')[0])
# print('end')
# timestp_lst2 = []
# for i in timestp_lst:
#     i = i.split(' ')[1]
#     timestp_lst2.append(i)
#     print(i)
    
timestp = pd.Series(data=timestp_lst, name = 'timestamp') #여기서 timestamp는 column name이 된다.
timestp

 5
 5
 5
 5
 5
 5
 6
 6
 6
 6
 6
 6
 6
 7
 7
 7
 7
 7
 7
 7
 7
 5
 8
 9
23
 9
 9
20
20
23
23
21
 7
 7
17
17
 3
 3
 4
 4
12
12
 3
 6
 6
10
10
15
15
 1
 1
 4
 4
 6
 6
 8
 9
21
21
22
 6
 6
17
17
18
18
21
 3
 4
 6
 6
 7
 7
 8
 8
 8
 8
16
17
 0
 1
 1
 1
 7
 8
19
20
20
12
12
19
19
22
22
10
10
18
18
 3
 3
22
14
14
14
14
23
23
15
15
19
19
20
 2
13
13
15
15
17
17
22
22
10
10
13
14
 0
 0
11
11
17
19
 0
 1
 6
 6
 1
 1
 7
 7
 8
 9
11
11
 1
 2
 8
 8
12
13
14
14
 4
 4
13
13
23
23
 7
 7
 8
 8
19
19
 1
 1
10
 9
 9
18
18
 3
 3
14
14
 4
 4
19
19
21
21
 5
 5
17
18
18
23
23
 6
 6
14
14
16
16
22
22
 7
10
11
23
23
 3
 4
 4
 5
 5
 9
 9
15
15
 1
 1
 1
 2
 2
 8
 8
12
12
13
13
13
16
16
16
16
 2
 2
23
23
 2
 2
10
10
11
11
16
16
17
17
 0
 0
 2
 2
 2
 2
 8
 8
13
13
 6
 6
 2
 2
 6
 6
12
12
14
14
19
20
 6
 8
 8
11
11
19
19
 0
 0
 4
 5
 5
 6
10
10
12
12
14
14
19
19
19
20
 2
 2
 5
 5
 6
 6
 9
10
10
10
15
15
 0
 0
 2
 2
 3
 4
 4
 4
 4
 6
 7
 6
 9
 9
 9
 9
 9
13
14
14
15
17
18
18
18
19
22
 2
 3
 3
11
11
21
21
 9
 9
23
2

0       5
1       5
2       5
3       5
4       5
       ..
593    14
594    22
595    22
596    14
597    14
Name: timestamp, Length: 598, dtype: object

In [17]:
train_df = pd.concat([train_df, timestp], axis = 1) #concat함수 안에는 series 이름을 넣는다.
train_df

Unnamed: 0,PRODUCT_ID,Y_Class,Y_Quality,TIMESTAMP,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,...,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875,timestamp
0,TRAIN_000,1,0.533433,2022-06-13 5:14,T050304,A_31,,,,,...,40.89,32.56,34.09,77.77,,,,,,5
1,TRAIN_001,2,0.541819,2022-06-13 5:22,T050307,A_31,,,,,...,42.82,43.92,35.34,72.55,,,,,,5
2,TRAIN_002,1,0.531267,2022-06-13 5:30,T050304,A_31,,,,,...,36.65,42.47,36.53,78.35,,,,,,5
3,TRAIN_003,2,0.537325,2022-06-13 5:39,T050307,A_31,,,,,...,39.17,52.17,30.58,71.78,,,,,,5
4,TRAIN_004,1,0.531590,2022-06-13 5:47,T050304,A_31,,,,,...,41.89,46.93,33.09,76.97,,,,,,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,TRAIN_593,1,0.526546,2022-09-08 14:30,T100306,T_31,2.0,95.0,0.0,45.0,...,,,,,,,,,,14
594,TRAIN_594,0,0.524022,2022-09-08 22:38,T050304,A_31,,,,,...,53.07,50.89,55.10,66.49,1.0,,,,,22
595,TRAIN_595,0,0.521289,2022-09-08 22:47,T050304,A_31,,,,,...,,,,,1.0,,,,,22
596,TRAIN_596,1,0.531375,2022-09-08 14:38,T100304,O_31,40.0,94.0,0.0,45.0,...,,,,,,,,,,14


In [18]:
#test data preprocessing(timestamp)
t_lst = []
for i in test_df.TIMESTAMP:
    print(i[-5:].split(':')[0])
    t_lst.append(i[-5:].split(':')[0])
    
timestp2 = pd.Series(data=t_lst, name = 'timestamp')
timestp2

 2
 2
 8
10
11
19
19
12
12
14
14
13
13
 6
 6
 5
 5
23
23
 1
 1
 4
 4
 8
 8
14
19
19
22
22
 7
 7
11
11
14
 3
 4
 8
 8
19
19
23
23
14
14
20
20
13
13
13
13
13
17
17
18
20
21
 3
 4
 4
 4
21
21
 0
 0
 3
 8
19
19
 9
 9
22
22
 5
 6
 9
 9
17
17
20
 2
 3
 8
 8
10
10
14
14
17
17
18
21
22
 2
 3
 6
 6
22
22
 2
 2
11
11
20
20
23
23
 4
 4
 6
10
10
16
17
15
15
17
17
21
21
 5
 5
11
11
15
15
 3
 3
 8
 9
11
11
13
 0
 0
 5
 5
 5
14
16
16
20
20
21
21
 2
 2
11
11
14
14
15
16
18
19
19
20
20
 1
 1
 3
 3
 5
 6
11
11
16
16
21
21
 1
 1
 6
 6
 8
 8
12
12
21
21
 2
 2
 8
 8
10
10
15
16
16
16
16
16
16
22
22
 2
 2
 6
 6
12
12
16
16
 0
 0
 4
 4
11
11
18
18
23
23
 5
 5
14
14
17
17
17
20
20
 2
 2
 6
 6
10
10
14
14
20
21
12
12
18
18
 1
 1
20
20
 2
 2
 6
 6
12
12
18
18
 7
 7
10
10
10
 8
13
13
14
14
14
14
15
17
17
16
21
21
 2
 2
 7
 7
11
11
18
18
22
22
 2
 2
 9
 9
16
17
 3
 3
 9
10
11
20
20
20
23
23
 0
 0
 5
 5
 7
 7
11
11
15
15
20
20
11
11
16
16
20
21


0       2
1       2
2       8
3      10
4      11
       ..
305    11
306    16
307    16
308    20
309    21
Name: timestamp, Length: 310, dtype: object

In [19]:
test_df = pd.concat([test_df, timestp2], axis = 1)
test_df

Unnamed: 0,PRODUCT_ID,TIMESTAMP,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,...,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875,timestamp
0,TEST_000,2022-09-09 2:01,T100306,T_31,2.0,94.0,0.0,45.0,10.0,0.0,...,,,,,,,,,,2
1,TEST_001,2022-09-09 2:09,T100304,T_31,2.0,93.0,0.0,45.0,11.0,0.0,...,,,,,,,,,,2
2,TEST_002,2022-09-09 8:42,T100304,T_31,2.0,95.0,0.0,45.0,11.0,0.0,...,,,,,,,,,,8
3,TEST_003,2022-09-09 10:56,T010305,A_31,,,,,,,...,,,,,,,,,,10
4,TEST_004,2022-09-09 11:04,T010306,A_31,,,,,,,...,,,,,,,,,,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,TEST_305,2022-11-05 11:18,T100306,T_31,2.0,91.0,0.0,45.0,10.0,0.0,...,,,,,,,,,,11
306,TEST_306,2022-11-05 16:39,T100304,T_31,2.0,96.0,0.0,45.0,11.0,0.0,...,,,,,,,,,,16
307,TEST_307,2022-11-05 16:47,T100306,T_31,2.0,91.0,0.0,45.0,10.0,0.0,...,,,,,,,,,,16
308,TEST_308,2022-11-05 20:53,T100306,T_31,2.0,95.0,0.0,45.0,10.0,0.0,...,,,,,,,,,,20


### Feature Selection

모델링을 하기 앞서, 독립변수 X와 종속변수 y를 설정해야한다.

**학습에 사용할 변수 X와 예측할 변수 y를 분리**

In [20]:
train_y = train_df['Y_Class']
train_y

0      1
1      2
2      1
3      2
4      1
      ..
593    1
594    0
595    0
596    1
597    1
Name: Y_Class, Length: 598, dtype: int64

학습에 쓰이지 않을 column들을 제거한다.

In [21]:
train_x = train_df.drop(columns = ['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality'])
#모델 학습이 끝나고 예측에 쓰일 test데이터
test_x = test_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP'])

In [22]:
test_x

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875,timestamp
0,T100306,T_31,2.0,94.0,0.0,45.0,10.0,0.0,51.0,10.0,...,,,,,,,,,,2
1,T100304,T_31,2.0,93.0,0.0,45.0,11.0,0.0,45.0,10.0,...,,,,,,,,,,2
2,T100304,T_31,2.0,95.0,0.0,45.0,11.0,0.0,45.0,10.0,...,,,,,,,,,,8
3,T010305,A_31,,,,,,,,,...,,,,,,,,,,10
4,T010306,A_31,,,,,,,,,...,,,,,,,,,,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,T100306,T_31,2.0,91.0,0.0,45.0,10.0,0.0,51.0,10.0,...,,,,,,,,,,11
306,T100304,T_31,2.0,96.0,0.0,45.0,11.0,0.0,45.0,10.0,...,,,,,,,,,,16
307,T100306,T_31,2.0,91.0,0.0,45.0,10.0,0.0,50.0,10.0,...,,,,,,,,,,16
308,T100306,T_31,2.0,95.0,0.0,45.0,10.0,0.0,51.0,10.0,...,,,,,,,,,,20


데이터 전처리. 결측값을 0으로 채운다.

In [23]:
train_x = train_x.fillna(0)
test_x = test_x.fillna(0)

범주형 데이터를 수치 데이터로 전환하기 위해 LabelEncoder 활용

In [24]:
# qualitative to quantitative
qual_col = ['LINE', 'PRODUCT_CODE']

for i in qual_col:
    le = LabelEncoder()
    le = le.fit(train_x[i]) #원래 column 값을 기준으로 fit.
    train_x[i] = le.transform(train_x[i]) #수치화, 수치로 변형
    
    for label in np.unique(test_x[i]): 
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    test_x[i] = le.transform(test_x[i]) 
print('Done.')

Done.


### Modeling

1.모델 선택 - sklearn라이브러리 활용 - RandomForest 

2.모델 학습 - train_df를 활용하여 1번에서 정의한 모델로 학습

3.예측 - 학습된 모델을 바탕으로 test 데이터를 예측

4.정답 파일 생성 - 정답 파일 생성 및 제출 필요(경진대회를 위해 필요한 과정.)

In [25]:
#모델 선언(모델 선택) + 모델 학습
#random_state=37 로 시드 고정.
RF = RandomForestClassifier(random_state=37, ma).fit(train_x, train_y)
print('Done.')

Done.


In [26]:
#test 데이터로 예측
preds = RF.predict(test_x)
print('Done.')

Done.


In [27]:
#제출 파일에 예측 column 넣기
submission['Y_Class'] = preds
submission.to_csv('./baseline_submission2.csv', index=False)