## Import

In [462]:
import pandas as pd
import random
import os
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

In [463]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(37) # Seed 고정

## Data Load

In [464]:
from google.colab import drive
drive.mount('/content/drive')
trainname='/content/drive/MyDrive/train.csv'
testname='/content/drive/MyDrive/test.csv'
samplename='/content/drive/MyDrive/sample_submission.csv'
submitname='/content/drive/MyDrive/submit.csv'

train_df = pd.read_csv(trainname)
test_df = pd.read_csv(testname)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [465]:
train_x = train_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality'])
train_y = train_df['Y_Class']

test_x = test_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP'])

## Data Pre-processing

In [466]:
train_T_00304=train_x[(train_x['LINE']=='T100304')&(train_x['PRODUCT_CODE']=='T_31')]
train_T_00306=train_x[(train_x['LINE']=='T100306')&(train_x['PRODUCT_CODE']=='T_31')]
train_A_50304=train_x[(train_x['LINE']=='T050304')&(train_x['PRODUCT_CODE']=='A_31')]
train_A_10306=train_x[(train_x['LINE']=='T010306')&(train_x['PRODUCT_CODE']=='A_31')]
train_A_10305=train_x[(train_x['LINE']=='T010305')&(train_x['PRODUCT_CODE']=='A_31')]
train_A_50307=train_x[(train_x['LINE']=='T050307')&(train_x['PRODUCT_CODE']=='A_31')]
train_O_00304=train_x[(train_x['LINE']=='T100304')&(train_x['PRODUCT_CODE']=='O_31')]
train_O_00306=train_x[(train_x['LINE']=='T100306')&(train_x['PRODUCT_CODE']=='O_31')]

test_T_00304=test_x[(test_x['LINE']=='T100304')&(test_x['PRODUCT_CODE']=='T_31')]
test_T_00306=test_x[(test_x['LINE']=='T100306')&(test_x['PRODUCT_CODE']=='T_31')]
test_A_50304=test_x[(test_x['LINE']=='T050304')&(test_x['PRODUCT_CODE']=='A_31')]
test_A_10306=test_x[(test_x['LINE']=='T010306')&(test_x['PRODUCT_CODE']=='A_31')]
test_A_10305=test_x[(test_x['LINE']=='T010305')&(test_x['PRODUCT_CODE']=='A_31')]
test_A_50307=test_x[(test_x['LINE']=='T050307')&(test_x['PRODUCT_CODE']=='A_31')]
test_O_00304=test_x[(test_x['LINE']=='T100304')&(test_x['PRODUCT_CODE']=='O_31')]
test_O_00306=test_x[(test_x['LINE']=='T100306')&(test_x['PRODUCT_CODE']=='O_31')]

In [467]:
# train_T_00304.head()



In [468]:
train_x.head()
#train_x.info()

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,T050304,A_31,,,,,,,,,...,39.34,40.89,32.56,34.09,77.77,,,,,
1,T050307,A_31,,,,,,,,,...,38.89,42.82,43.92,35.34,72.55,,,,,
2,T050304,A_31,,,,,,,,,...,39.19,36.65,42.47,36.53,78.35,,,,,
3,T050307,A_31,,,,,,,,,...,37.74,39.17,52.17,30.58,71.78,,,,,
4,T050304,A_31,,,,,,,,,...,38.7,41.89,46.93,33.09,76.97,,,,,


In [469]:
test_x.head()
#test_x.info()

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,T100306,T_31,2.0,94.0,0.0,45.0,10.0,0.0,51.0,10.0,...,,,,,,,,,,
1,T100304,T_31,2.0,93.0,0.0,45.0,11.0,0.0,45.0,10.0,...,,,,,,,,,,
2,T100304,T_31,2.0,95.0,0.0,45.0,11.0,0.0,45.0,10.0,...,,,,,,,,,,
3,T010305,A_31,,,,,,,,,...,,,,,,,,,,
4,T010306,A_31,,,,,,,,,...,,,,,,,,,,


In [470]:
# train_x = train_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality'])
# test_x = test_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP'])

#이거 column 여러개 설정하기
def remove_out(dataframe, remove_col):
    dff = dataframe
    for k in remove_col:
        level_1q = dff[k].quantile(0.25)
        level_3q = dff[k].quantile(0.75)
        IQR = level_3q - level_1q
        rev_range = 3  # 제거 범위 조절 변수
        dff = dff[(dff[k] <= level_3q + (rev_range * IQR)) & (dff[k] >= level_1q - (rev_range * IQR))]
        dff = dff.reset_index(drop=True)
    return dff

In [471]:
#결측치 값을 0으로 채운다.

# train_x = train_x.fillna(0)
# test_x = test_x.fillna(0)
#===========================

#결측치 값을 평균으로 채운다.

# train_x = train_x.fillna(train_x.mean())
# test_x = test_x.fillna(test_x.mean())
# train_x = train_x.fillna(0)
# test_x = test_x.fillna(0)
#==============================================

#결측치 값을 nan값이 아닌 값들의 평균으로 채운다.

train_x = train_x.fillna(train_x.notnull().mean())
test_x = test_x.fillna(test_x.notnull().mean())
train_x=train_x.replace(0,train_x.notnull().mean())
test_x=test_x.replace(0,test_x.notnull().mean())

#===============================================

#결측치 값을 nan값이 아닌 값들의 중간값으로 채운다.

# train_x = train_x.fillna(train_x.notnull().median())
# test_x = test_x.fillna(test_x.notnull().median())

#===============================================
#결측치 값을 평균으로 채우고 이상치 제거.

#===============================================
#결측치를 미리 채우고 데이터를 분류하면 전체의 평균이 되어 정확하지 않을수도있음

# train_T_00304=train_T_00304.fillna(train_T_00304.notnull().mean())
# train_A_50304=train_A_50304.fillna(train_A_50304.notnull().mean())
# train_A_10306=train_A_10306.fillna(train_A_10306.notnull().mean())
# train_T_00306=train_T_00306.fillna(train_T_00306.notnull().mean())
# train_A_10305=train_A_10305.fillna(train_A_10305.notnull().mean())
# train_A_50307=train_A_50307.fillna(train_A_50307.notnull().mean())
# train_O_00304=train_O_00304.fillna(train_O_00304.notnull().mean())
# train_O_00306=train_O_00306.fillna(train_O_00306.notnull().mean())

# train_T_00304=train_T_00304.loc[:, (train_T_00304 != 0).any(axis=0)]
# train_A_50304=train_A_50304.loc[:, (train_A_50304 != 0).any(axis=0)]
# train_A_10306=train_A_10306.loc[:, (train_A_10306 != 0).any(axis=0)]
# train_T_00306=train_T_00306.loc[:, (train_T_00306 != 0).any(axis=0)]
# train_A_10305=train_A_10305.loc[:, (train_A_10305 != 0).any(axis=0)]
# train_A_50307=train_T_00304.loc[:, (train_A_50307 != 0).any(axis=0)]
# train_O_00304=train_T_00304.loc[:, (train_O_00304 != 0).any(axis=0)]
# train_O_00306=train_O_00306.loc[:, (train_O_00306 != 0).any(axis=0)]
 
# test_T_00304=test_T_00304.fillna(test_T_00304.notnull().mean())
# test_A_50304=test_A_50304.fillna(test_A_50304.notnull().mean())
# test_A_10306=test_A_10306.fillna(test_A_10306.notnull().mean())
# test_T_00306=test_T_00306.fillna(test_T_00306.notnull().mean())
# test_A_10305=test_A_10305.fillna(test_A_10305.notnull().mean())
# test_A_50307=test_A_50307.fillna(test_A_50307.notnull().mean())
# test_O_00304=test_O_00304.fillna(test_O_00304.notnull().mean())
# test_O_00306=test_O_00306.fillna(test_O_00306.notnull().mean())

# test_T_00304=test_T_00304.loc[:, (test_T_00304 != 0).any(axis=0)]
# test_A_50304=test_A_50304.loc[:, (test_A_50304 != 0).any(axis=0)]
# test_A_10306=test_A_10306.loc[:, (test_A_10306 != 0).any(axis=0)]
# test_T_00306=test_T_00306.loc[:, (test_T_00306 != 0).any(axis=0)]
# test_A_10305=test_A_10305.loc[:, (test_A_10305 != 0).any(axis=0)]
# test_A_50307=test_A_50307.loc[:, (test_A_50307 != 0).any(axis=0)]
# test_O_00304=test_O_00304.loc[:, (test_O_00304 != 0).any(axis=0)]
# test_O_00306=test_O_00306.loc[:, (test_O_00306 != 0).any(axis=0)]

In [472]:
plt.rcParams['axes.unicode_minus'] = False  # matplotlib 마이너스기호 표시

# train_T_00304.boxplot()
# train_A_50304.boxplot()
# train_A_10306.boxplot()
# train_T_00306.boxplot()
# train_A_10305.boxplot()
# train_A_50307.boxplot()
# train_O_00304.boxplot()
# train_O_00306.boxplot()

# test_T_00304.boxplot()
# test_A_50304.boxplot()
# test_A_10306.boxplot()
# test_T_00306.boxplot()
# test_A_10305.boxplot()
# test_A_50307.boxplot()
# test_O_00304.boxplot()
# test_O_00306.boxplot()
#remove_out(train_T_00304,train_T_00304.loc[2:])

#train_A_10306.boxplot()

In [473]:
# def is_kor_outlier(df):
#   q3 = df.quantile(0.75) # df['국어'].quantile(0.75) 처럼 특정 열만 적용 가능
#   q1 = df.quantile(0.25)
#   iqr = q3 - q1
#   kor_score = df['국어']
#   if kor_score > q3['국어'] + 1.5 * iqr['국어'] or kor_score < q1['국어'] - 1.5 * iqr['국어']:
#     return True
#   else:
#     return False
#   df['isOutlier'] = df.apply(is_kor_outlier, axis = 1) # axis = 1 지정 필수
#   df = df.loc[df['isOutlier'] == False]
#   del df_trim['isOutlier']


In [474]:
# train_x =pd.concat([train_T_00304,train_T_00306,train_A_50304,train_A_10306,train_A_10305,train_A_50307,train_O_00304,train_O_00306],axis=1)
# test_x =pd.concat([test_T_00304,test_T_00306,test_A_50304,test_A_10306,test_A_10305,test_A_50307,test_O_00304,test_O_00306],axis=1)

# train_x = train_x.fillna(train_x.notnull().mean())
# test_x = test_x.fillna(test_x.notnull().mean())


In [475]:
train_x.head()

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,T050304,A_31,0.583612,0.583612,0.583612,0.583612,0.583612,0.583612,0.583612,0.583612,...,39.34,40.89,32.56,34.09,77.77,0.165552,1.0,1.0,1.0,1.0
1,T050307,A_31,0.583612,0.583612,0.583612,0.583612,0.583612,0.583612,0.583612,0.583612,...,38.89,42.82,43.92,35.34,72.55,0.165552,1.0,1.0,1.0,1.0
2,T050304,A_31,0.583612,0.583612,0.583612,0.583612,0.583612,0.583612,0.583612,0.583612,...,39.19,36.65,42.47,36.53,78.35,0.165552,1.0,1.0,1.0,1.0
3,T050307,A_31,0.583612,0.583612,0.583612,0.583612,0.583612,0.583612,0.583612,0.583612,...,37.74,39.17,52.17,30.58,71.78,0.165552,1.0,1.0,1.0,1.0
4,T050304,A_31,0.583612,0.583612,0.583612,0.583612,0.583612,0.583612,0.583612,0.583612,...,38.7,41.89,46.93,33.09,76.97,0.165552,1.0,1.0,1.0,1.0


In [476]:
test_x.head()

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,T100306,T_31,2.0,94.0,1.0,45.0,10.0,1.0,51.0,10.0,...,0.109677,0.109677,0.109677,0.109677,0.109677,0.125806,0.029032,0.029032,0.029032,0.029032
1,T100304,T_31,2.0,93.0,1.0,45.0,11.0,1.0,45.0,10.0,...,0.109677,0.109677,0.109677,0.109677,0.109677,0.125806,0.029032,0.029032,0.029032,0.029032
2,T100304,T_31,2.0,95.0,1.0,45.0,11.0,1.0,45.0,10.0,...,0.109677,0.109677,0.109677,0.109677,0.109677,0.125806,0.029032,0.029032,0.029032,0.029032
3,T010305,A_31,0.783871,0.783871,0.783871,0.783871,0.783871,0.783871,0.783871,0.783871,...,0.109677,0.109677,0.109677,0.109677,0.109677,0.125806,0.029032,0.029032,0.029032,0.029032
4,T010306,A_31,0.783871,0.783871,0.783871,0.783871,0.783871,0.783871,0.783871,0.783871,...,0.109677,0.109677,0.109677,0.109677,0.109677,0.125806,0.029032,0.029032,0.029032,0.029032


In [477]:
# qualitative to quantitative
qual_col = ['LINE', 'PRODUCT_CODE']

for i in qual_col:
    le = LabelEncoder()
    le = le.fit(train_x[i])
    train_x[i] = le.transform(train_x[i])
    
    for label in np.unique(test_x[i]): 
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    test_x[i] = le.transform(test_x[i]) 
print('Done.')

Done.


## Classification Model Fit

In [478]:
RF = RandomForestClassifier(random_state=37).fit(train_x, train_y)
print('Done.')

Done.


## Inference

In [479]:
preds = RF.predict(test_x)
print('Done.')

Done.


## Submit

In [480]:
submit = pd.read_csv(samplename)

In [481]:
submit['Y_Class'] = preds
#submit.head()

In [482]:
submit.to_csv(submitname, index=False)