## Import

In [1]:
import pandas as pd
import random
import os
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(37) # Seed 고정

## Data Load

In [4]:
train_df = pd.read_csv('/content/drive/MyDrive/LG/train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/LG/test.csv')

In [100]:
train_x = train_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality'])
train_y = train_df['Y_Class']

test_x = test_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP'])

## Data Pre-processing

In [101]:
train_x.head()

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,T050304,A_31,,,,,,,,,...,39.34,40.89,32.56,34.09,77.77,,,,,
1,T050307,A_31,,,,,,,,,...,38.89,42.82,43.92,35.34,72.55,,,,,
2,T050304,A_31,,,,,,,,,...,39.19,36.65,42.47,36.53,78.35,,,,,
3,T050307,A_31,,,,,,,,,...,37.74,39.17,52.17,30.58,71.78,,,,,
4,T050304,A_31,,,,,,,,,...,38.7,41.89,46.93,33.09,76.97,,,,,


In [109]:
test_x.head()

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,T100306,T_31,2.0,94.0,0.0,45.0,10.0,0.0,51.0,10.0,...,,,,,,,,,,
1,T100304,T_31,2.0,93.0,0.0,45.0,11.0,0.0,45.0,10.0,...,,,,,,,,,,
2,T100304,T_31,2.0,95.0,0.0,45.0,11.0,0.0,45.0,10.0,...,,,,,,,,,,
3,T010305,A_31,,,,,,,,,...,,,,,,,,,,
4,T010306,A_31,,,,,,,,,...,,,,,,,,,,


In [78]:
# train_x = train_x.fillna(0)
# test_x = test_x.fillna(0)

In [107]:
train_x.dropna(how='all', axis=1)

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2862,X_2863,X_2864,X_2865,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871
0,T050304,A_31,,,,,,,,,...,189.0,383.0,368.296296,353.0,39.34,40.89,32.56,34.09,77.77,
1,T050307,A_31,,,,,,,,,...,185.6,383.0,367.735849,353.0,38.89,42.82,43.92,35.34,72.55,
2,T050304,A_31,,,,,,,,,...,165.5,383.0,367.320755,353.0,39.19,36.65,42.47,36.53,78.35,
3,T050307,A_31,,,,,,,,,...,165.8,384.0,369.188679,353.0,37.74,39.17,52.17,30.58,71.78,
4,T050304,A_31,,,,,,,,,...,182.6,383.0,367.351852,352.0,38.70,41.89,46.93,33.09,76.97,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,T100306,T_31,2.0,95.0,0.0,45.0,10.0,0.0,50.0,10.0,...,,,,,,,,,,
594,T050304,A_31,,,,,,,,,...,168.7,384.0,369.811321,353.0,49.47,53.07,50.89,55.10,66.49,1.0
595,T050304,A_31,,,,,,,,,...,156.6,383.0,367.018868,352.0,,,,,,1.0
596,T100304,O_31,40.0,94.0,0.0,45.0,11.0,0.0,45.0,10.0,...,,,,,,,,,,


In [108]:
test_x.dropna(how='all',axis=1)

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,T100306,T_31,2.0,94.0,0.0,45.0,10.0,0.0,51.0,10.0,...,,,,,,,,,,
1,T100304,T_31,2.0,93.0,0.0,45.0,11.0,0.0,45.0,10.0,...,,,,,,,,,,
2,T100304,T_31,2.0,95.0,0.0,45.0,11.0,0.0,45.0,10.0,...,,,,,,,,,,
3,T010305,A_31,,,,,,,,,...,,,,,,,,,,
4,T010306,A_31,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,T100306,T_31,2.0,91.0,0.0,45.0,10.0,0.0,51.0,10.0,...,,,,,,,,,,
306,T100304,T_31,2.0,96.0,0.0,45.0,11.0,0.0,45.0,10.0,...,,,,,,,,,,
307,T100306,T_31,2.0,91.0,0.0,45.0,10.0,0.0,50.0,10.0,...,,,,,,,,,,
308,T100306,T_31,2.0,95.0,0.0,45.0,10.0,0.0,51.0,10.0,...,,,,,,,,,,


In [113]:
train_x.groupby(['LINE','PRODUCT_CODE']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,X_9,X_10,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
LINE,PRODUCT_CODE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
T010305,A_31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
T010306,A_31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
T050304,A_31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3454.34,3633.42,3260.57,3506.28,4467.96,67.0,0.0,0.0,0.0,0.0
T050307,A_31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1626.39,1727.35,1700.05,1659.7,2197.01,32.0,0.0,0.0,0.0,0.0
T100304,O_31,48.0,292.0,0.0,135.0,33.0,0.0,135.0,30.0,93.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
T100304,T_31,428.0,16857.0,0.0,7740.0,1854.0,0.0,7740.0,1720.0,5332.0,344.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
T100306,O_31,33.0,266.0,0.0,135.0,30.0,0.0,163.0,30.0,156.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
T100306,T_31,332.0,15783.0,0.0,7695.0,1710.0,0.0,8994.0,1727.0,8892.0,342.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [153]:
test_x.groupby(['LINE','PRODUCT_CODE']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,X_9,X_10,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
LINE,PRODUCT_CODE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
T010305,A_31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
T010306,A_31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
T050304,A_31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,583.62,585.51,546.4,585.97,698.1,13.0,0.0,0.0,0.0,0.0
T050307,A_31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1186.64,1216.99,1153.41,1256.91,1471.31,25.0,2231.9,2267.0,2189.7,2059.6
T100304,O_31,291.0,295.0,0.0,135.0,31.0,0.0,135.0,30.0,93.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
T100304,T_31,266.0,10533.0,0.0,4860.0,1145.0,0.0,4860.0,1080.0,3348.0,216.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
T100306,O_31,146.0,94.0,0.0,45.0,10.0,0.0,67.0,11.0,52.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
T100306,T_31,351.0,12029.0,0.0,5895.0,1310.0,0.0,6720.0,1312.0,6812.0,262.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [142]:
a = train_x[train_x['LINE']=='T010305']
a_deleted = a.dropna(how='all', axis=1)
print(a_deleted.info())
print(a_deleted.shape)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59 entries, 28 to 584
Columns: 888 entries, LINE to X_2865
dtypes: float64(886), object(2)
memory usage: 409.8+ KB
None
(59, 888)


In [165]:
a_test = test_x[test_x['LINE']=='T010305']
a_test_deleted = a_test.dropna(how='all', axis=1)
print(a_test_deleted.info())
print(a_test_deleted.shape)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14 entries, 3 to 142
Columns: 878 entries, LINE to X_2865
dtypes: float64(876), object(2)
memory usage: 96.1+ KB
None
(14, 878)


In [166]:
list(a_deleted.columns) - list(a_test_deleted.columns

ValueError: ignored

In [143]:
b = train_x[train_x['LINE']=='T010306']
b_deleted = b.dropna(how='all', axis=1)
print(b_deleted.info())
print(b_deleted.shape)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 70 entries, 27 to 585
Columns: 888 entries, LINE to X_2865
dtypes: float64(886), object(2)
memory usage: 486.2+ KB
None
(70, 888)


In [145]:
c = train_x[train_x['LINE']=='T050304']
c_deleted = c.dropna(how='all', axis=1)
print(c_deleted.info())
print(c_deleted.shape)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 78 entries, 0 to 595
Columns: 1970 entries, LINE to X_2871
dtypes: float64(1968), object(2)
memory usage: 1.2+ MB
None
(78, 1970)


In [146]:
d = train_x[train_x['LINE']=='T050307']
d_deleted = d.dropna(how='all', axis=1)
print(d_deleted.info())
print(d_deleted.shape)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42 entries, 1 to 555
Columns: 1977 entries, LINE to X_2871
dtypes: float64(1975), object(2)
memory usage: 649.0+ KB
None
(42, 1977)


In [148]:
# 해당하는 조건에 대응되는 Y_Class가 하나뿐임. 따라서 PRODUCT_CODE를 기준으로 한번 더 나누지 않고 LINE 선에서까지만 나누고 학습시키는게 좋을듯
e = train_x[(train_x['LINE']=='T100304')&(train_x['PRODUCT_CODE']=='O_31')]
e_deleted = e.dropna(how='all', axis=1)
print(e_deleted.info())
print(e_deleted.shape)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3 entries, 569 to 596
Columns: 663 entries, LINE to X_933
dtypes: float64(661), object(2)
memory usage: 15.6+ KB
None
(3, 663)


In [149]:
e2 = train_x[(train_x['LINE']=='T100304')&(train_x['PRODUCT_CODE']=='T_31')]
e2_deleted = e2.dropna(how='all', axis=1)
print(e2_deleted.info())
print(e2_deleted.shape)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 172 entries, 22 to 592
Columns: 673 entries, LINE to X_933
dtypes: float64(671), object(2)
memory usage: 905.7+ KB
None
(172, 673)


In [150]:
f = train_x[(train_x['LINE']=='T100306')&(train_x['PRODUCT_CODE']=='O_31')]
f_deleted = f.dropna(how='all', axis=1)
print(f_deleted.info())
print(f_deleted.shape)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3 entries, 570 to 597
Columns: 663 entries, LINE to X_933
dtypes: float64(661), object(2)
memory usage: 15.6+ KB
None
(3, 663)


In [151]:
f2 = train_x[(train_x['LINE']=='T100306')&(train_x['PRODUCT_CODE']=='T_31')]
f2_deleted = f2.dropna(how='all', axis=1)
print(f2_deleted.info())
print(f2_deleted.shape)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 171 entries, 26 to 593
Columns: 673 entries, LINE to X_933
dtypes: float64(671), object(2)
memory usage: 900.4+ KB
None
(171, 673)


In [154]:
#train 데이터셋과 test 데이터셋이 동일한 LINE과 PRODUCT_CODE의 조합으로 구성됨
# 해당하는 데이터셋으로 나누어서 생각해보기

In [163]:
np.unique(train_df[(train_df['LINE']=='T100306')&(train_df['PRODUCT_CODE']=='T_31')].Y_Class)

array([0, 1, 2])

In [164]:
g = train_x[train_x['PRODUCT_CODE']=='A_31']
g_deleted = g.dropna(how='all', axis=1)
print(g_deleted.info())
print(g_deleted.shape)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 249 entries, 0 to 595
Columns: 2118 entries, LINE to X_2871
dtypes: float64(2116), object(2)
memory usage: 4.0+ MB
None
(249, 2118)


# get dummies

In [None]:
#get_dummies만 추가해보기 -> 0.49로 0.54에서 0.05감소 : 아무래도 minmaxscaling이 필요한듯?

In [61]:
train_x_num = train_x.filter(regex='X')
test_x_num = test_x.filter(regex='X')
cat_encoding_train = pd.get_dummies(train_x[['LINE','PRODUCT_CODE']])
cat_encoding_test = pd.get_dummies(test_x[['LINE','PRODUCT_CODE']])
train_x = train_x_num.join(cat_encoding_train)
test_x = test_x_num.join(cat_encoding_test)

# minmaxscaling

In [None]:
#minmaxscaling만 해보기 -> 0.49에서 0.51로 0.02 증가 : get_dummies까지 해보자

In [65]:
train_x_num = train_x.filter(regex='X')
test_x_num = test_x.filter(regex='X')
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(train_x_num)
train_x_num_scaled = scaler.transform(train_x_num)
test_x_num_scaled = scaler.transform(test_x_num)
train_x_num_scaled = pd.DataFrame(train_x_num_scaled)
test_x_num_scaled = pd.DataFrame(test_x_num_scaled)
train_x = train_x_num_scaled.join(train_x[['LINE','PRODUCT_CODE']])
test_x = test_x_num_scaled.join(test_x[['LINE','PRODUCT_CODE']])
train_x

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2867,2868,2869,2870,2871,2872,2873,2874,LINE,PRODUCT_CODE
0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,...,0.538984,0.568830,0.975172,0.0,0.0,0.0,0.0,0.0,T050304,A_31
1,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,...,0.727032,0.589688,0.909718,0.0,0.0,0.0,0.0,0.0,T050307,A_31
2,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,...,0.703029,0.609544,0.982445,0.0,0.0,0.0,0.0,0.0,T050304,A_31
3,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,...,0.863599,0.510262,0.900063,0.0,0.0,0.0,0.0,0.0,T050307,A_31
4,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,...,0.776858,0.552144,0.965141,0.0,0.0,0.0,0.0,0.0,T050304,A_31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,0.019417,0.931373,0.0,1.0,0.909091,0.0,0.806452,0.909091,1.000000,1.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,T100306,T_31
594,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,...,0.842410,0.919406,0.833730,1.0,0.0,0.0,0.0,0.0,T050304,A_31
595,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,1.0,0.0,0.0,0.0,0.0,T050304,A_31
596,0.388350,0.921569,0.0,1.0,1.000000,0.0,0.725806,0.909091,0.596154,1.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,T100304,O_31


In [49]:
# qualitative to quantitative
qual_col = ['LINE', 'PRODUCT_CODE']

for i in qual_col:
    le = LabelEncoder()
    le = le.fit(train_x[i])
    train_x[i] = le.transform(train_x[i])
    for label in np.unique(test_x[i]): 
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    test_x[i] = le.transform(test_x[i]) 
print('Done.')

Done.


In [57]:
train_x

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2867,2868,2869,2870,2871,2872,2873,2874,LINE,PRODUCT_CODE
0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,...,0.538984,0.568830,0.975172,0.0,0.0,0.0,0.0,0.0,2,0
1,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,...,0.727032,0.589688,0.909718,0.0,0.0,0.0,0.0,0.0,3,0
2,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,...,0.703029,0.609544,0.982445,0.0,0.0,0.0,0.0,0.0,2,0
3,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,...,0.863599,0.510262,0.900063,0.0,0.0,0.0,0.0,0.0,3,0
4,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,...,0.776858,0.552144,0.965141,0.0,0.0,0.0,0.0,0.0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,0.019417,0.931373,0.0,1.0,0.909091,0.0,0.806452,0.909091,1.000000,1.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,5,2
594,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,...,0.842410,0.919406,0.833730,1.0,0.0,0.0,0.0,0.0,2,0
595,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,1.0,0.0,0.0,0.0,0.0,2,0
596,0.388350,0.921569,0.0,1.0,1.000000,0.0,0.725806,0.909091,0.596154,1.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,4,1


# pd_getdummies + minmaxscaling

In [None]:
#성능 : 0.477682 로 오히려 pd_Get_dummies 하기 전보다 떨어짐. 범주형 변수를 처리하는 방식이 잘못된건가?

In [66]:
cat_encoding_train = pd.get_dummies(train_x[['LINE','PRODUCT_CODE']])
cat_encoding_test = pd.get_dummies(test_x[['LINE','PRODUCT_CODE']])
train_x = train_x.drop(columns=['LINE','PRODUCT_CODE'])
test_x = test_x.drop(columns=['LINE','PRODUCT_CODE'])
train_x = train_x.join(cat_encoding_train)
test_x = test_x.join(cat_encoding_test)

In [67]:
train_x

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2874,LINE_T010305,LINE_T010306,LINE_T050304,LINE_T050307,LINE_T100304,LINE_T100306,PRODUCT_CODE_A_31,PRODUCT_CODE_O_31,PRODUCT_CODE_T_31
0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,...,0.0,0,0,1,0,0,0,1,0,0
1,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,...,0.0,0,0,0,1,0,0,1,0,0
2,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,...,0.0,0,0,1,0,0,0,1,0,0
3,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,...,0.0,0,0,0,1,0,0,1,0,0
4,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,...,0.0,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,0.019417,0.931373,0.0,1.0,0.909091,0.0,0.806452,0.909091,1.000000,1.0,...,0.0,0,0,0,0,0,1,0,0,1
594,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,...,0.0,0,0,1,0,0,0,1,0,0
595,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,...,0.0,0,0,1,0,0,0,1,0,0
596,0.388350,0.921569,0.0,1.0,1.000000,0.0,0.725806,0.909091,0.596154,1.0,...,0.0,0,0,0,0,1,0,0,1,0


## Classification Model Fit

In [68]:
RF = RandomForestClassifier(random_state=37).fit(train_x, train_y)
print('Done.')



Done.


## Inference

In [69]:
preds = RF.predict(test_x)
print('Done.')

Done.




## Submit

In [70]:
submit = pd.read_csv('/content/drive/MyDrive/LG/sample_submission.csv')

In [71]:
submit

Unnamed: 0,PRODUCT_ID,Y_Class
0,TEST_000,0
1,TEST_001,0
2,TEST_002,0
3,TEST_003,0
4,TEST_004,0
...,...,...
305,TEST_305,0
306,TEST_306,0
307,TEST_307,0
308,TEST_308,0


In [72]:
submit['Y_Class'] = preds

In [74]:
submit.to_csv('./0210_3.csv', index=False)

In [16]:
#minmaxscaling + labelencoder를 사용하는 것이 RandomForestClassifier모델에서는 가장 나은 모델인듯. 
#baseline코드보다 minmaxscaling을 추가했을 때 성능이 소폭 떨어진 것은 무슨 이유?? : 이상치가 있을 때 minmaxscaling을 실행하면 오히려 성능이 저하된다고 하네. 그러면 