## Import

In [None]:
import pandas as pd
import random
import os
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

In [None]:
def seed_everything(seed):
    """Seed the global random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator with
    ``seed`` and stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment
    variable.

    NOTE(review): the environment variable is written after the interpreter
    has already started, so it presumably only influences child processes,
    not hashing in the running kernel -- confirm.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)


seed_everything(37)  # fix the seed for the whole notebook

## Data Load

In [None]:
# Mount Google Drive (Colab only) so the competition files can be read from it.
from google.colab import drive

drive.mount('/content/drive')

# Input and output locations on the mounted drive.
trainname = '/content/drive/MyDrive/train.csv'
testname = '/content/drive/MyDrive/test.csv'
samplename = '/content/drive/MyDrive/sample_submission.csv'
submitname = '/content/drive/MyDrive/submit.csv'

# Read the train and test tables in one go.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (trainname, testname))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Training features: every column except the identifier, the timestamp and
# the two target columns.
train_x = train_df.drop(['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality'], axis=1)
# Classification label to predict.
train_y = train_df['Y_Class']

# The test table is only stripped of the identifier and the timestamp.
test_x = test_df.drop(['PRODUCT_ID', 'TIMESTAMP'], axis=1)

## Data Pre-processing

In [None]:
# plt.rcParams['axes.unicode_minus'] = False  # matplotlib minus-sign display

# train_x.boxplot()




In [None]:
# Preview the first rows of the raw training features (before imputation).
train_x.head()
#train_x.info()

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,T050304,A_31,,,,,,,,,...,39.34,40.89,32.56,34.09,77.77,,,,,
1,T050307,A_31,,,,,,,,,...,38.89,42.82,43.92,35.34,72.55,,,,,
2,T050304,A_31,,,,,,,,,...,39.19,36.65,42.47,36.53,78.35,,,,,
3,T050307,A_31,,,,,,,,,...,37.74,39.17,52.17,30.58,71.78,,,,,
4,T050304,A_31,,,,,,,,,...,38.7,41.89,46.93,33.09,76.97,,,,,


In [None]:
# Preview the first rows of the raw test features (before imputation).
test_x.head()
#test_x.info()

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,T100306,T_31,2.0,94.0,0.0,45.0,10.0,0.0,51.0,10.0,...,,,,,,,,,,
1,T100304,T_31,2.0,93.0,0.0,45.0,11.0,0.0,45.0,10.0,...,,,,,,,,,,
2,T100304,T_31,2.0,95.0,0.0,45.0,11.0,0.0,45.0,10.0,...,,,,,,,,,,
3,T010305,A_31,,,,,,,,,...,,,,,,,,,,
4,T010306,A_31,,,,,,,,,...,,,,,,,,,,


In [None]:
# train_x = train_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality'])
# test_x = test_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP'])

# Handles several columns in a single call.
def remove_out(dataframe, remove_col, rev_range=3):
    """Drop rows whose value is an IQR outlier in any of the given columns.

    Columns are processed one after another: the quartiles of each column are
    computed on the rows that survived the previous columns. A row is kept
    when its value lies inside
    ``[Q1 - rev_range * IQR, Q3 + rev_range * IQR]`` (both bounds inclusive).
    Rows whose value in a processed column is NaN fail the comparison and are
    therefore dropped as well.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input frame; it is not modified.
    remove_col : iterable of column labels
        Columns to filter on. Each must support ``Series.quantile``.
    rev_range : float, default 3
        Multiplier applied to the IQR; larger values remove fewer rows.

    Returns
    -------
    pandas.DataFrame
        The filtered frame with a fresh 0..n-1 index. When ``remove_col`` is
        empty, ``dataframe`` itself is returned unchanged.
    """
    filtered = dataframe
    for col in remove_col:
        q1 = filtered[col].quantile(0.25)
        q3 = filtered[col].quantile(0.75)
        margin = rev_range * (q3 - q1)
        # between() is inclusive on both ends and evaluates to False for NaN.
        keep = filtered[col].between(q1 - margin, q3 + margin)
        filtered = filtered[keep].reset_index(drop=True)
    return filtered

In [None]:
# Missing-value imputation. Alternatives that were tried stay commented out.

# (a) Fill missing values with 0.
# train_x = train_x.fillna(0)
# test_x = test_x.fillna(0)
#===========================

# (b) Fill missing values with the median of the observed (non-NaN) values.
# feature_medians = train_x.median(numeric_only=True)
# train_x = train_x.fillna(feature_medians).fillna(0)
# test_x = test_x.fillna(feature_medians).fillna(0)
#===============================================

# (c) ACTIVE: fill missing values with the mean of the observed (non-NaN) values.
#
# DataFrame.mean() skips NaN by default, so it already is the mean of the
# non-null entries. The earlier `train_x.notnull().mean()` averaged the boolean
# null-mask instead, i.e. every NaN was replaced by the *fraction of non-null
# rows* in its column rather than by a feature value.
#
# numeric_only=True keeps the string columns (LINE, PRODUCT_CODE) out of the
# mean. The statistics are computed on the training set and reused for the
# test set so both are imputed with the same values.
feature_means = train_x.mean(numeric_only=True)
train_x = train_x.fillna(feature_means)
test_x = test_x.fillna(feature_means)

# Columns that are entirely NaN in the training set have a NaN mean and are
# still unfilled at this point; fall back to 0 so no NaN reaches the model.
train_x = train_x.fillna(0)
test_x = test_x.fillna(0)
#===============================================

# (d) Fill missing values with the mean and remove outliers (not implemented).



In [None]:
# Preview the training features after imputation to check how NaNs were filled.
# NOTE(review): the saved output below shows 1.0 / 0.0 in the previously-NaN
# columns, which looks like a value derived from the null mask, not the data.
train_x.head()

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,T050304,A_31,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,39.34,40.89,32.56,34.09,77.77,0.0,0.0,0.0,0.0,0.0
1,T050307,A_31,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,38.89,42.82,43.92,35.34,72.55,0.0,0.0,0.0,0.0,0.0
2,T050304,A_31,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,39.19,36.65,42.47,36.53,78.35,0.0,0.0,0.0,0.0,0.0
3,T050307,A_31,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,37.74,39.17,52.17,30.58,71.78,0.0,0.0,0.0,0.0,0.0
4,T050304,A_31,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,38.7,41.89,46.93,33.09,76.97,0.0,0.0,0.0,0.0,0.0


In [None]:
# Preview the test features after imputation to check how NaNs were filled.
# NOTE(review): the saved output below shows 1.0 / 0.0 in the previously-NaN
# columns, which looks like a value derived from the null mask, not the data.
test_x.head()

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,T100306,T_31,2.0,94.0,0.0,45.0,10.0,0.0,51.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,T100304,T_31,2.0,93.0,0.0,45.0,11.0,0.0,45.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,T100304,T_31,2.0,95.0,0.0,45.0,11.0,0.0,45.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,T010305,A_31,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,T010306,A_31,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Encode the categorical columns as integer codes (qualitative -> quantitative).
qual_col = ['LINE', 'PRODUCT_CODE']

for col in qual_col:
    # Fit on the training values only, then encode the training column.
    le = LabelEncoder().fit(train_x[col])
    train_x[col] = le.transform(train_x[col])

    # Labels that occur only in the test set are appended to the known
    # classes before the test column is transformed.
    unseen = [lbl for lbl in np.unique(test_x[col]) if lbl not in le.classes_]
    if unseen:
        le.classes_ = np.append(le.classes_, unseen)
    test_x[col] = le.transform(test_x[col])
print('Done.')

Done.


## Classification Model Fit

In [None]:
# Build the random forest with a fixed random_state, then fit it on the
# prepared training features and labels.
RF = RandomForestClassifier(random_state=37)
RF.fit(train_x, train_y)
print('Done.')

Done.


## Inference

In [None]:
# Predict a class for every row of the prepared test features.
preds = RF.predict(test_x)
print('Done.')

Done.


## Submit

In [None]:
# Load the sample submission file; it is used as the template for the output.
submit = pd.read_csv(samplename)

In [None]:
# Write the predictions into the Y_Class column.
# NOTE(review): assumes the sample file's row order matches test.csv -- confirm.
submit['Y_Class'] = preds
#submit.head()

In [None]:
# Save the submission CSV to the path in `submitname`, without the index column.
submit.to_csv(submitname, index=False)