## Import

In [1]:
import pandas as pd
import random
import os
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(37) # Seed 고정

## Data Load

In [3]:
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')

In [4]:
train_x = train_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality'])
train_y = train_df['Y_Class']

test_x = test_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP'])

## Data Pre-processing

In [5]:
# qualitative to quantitative
qual_col = ['LINE', 'PRODUCT_CODE']

for i in qual_col:
    le = LabelEncoder()
    le = le.fit(train_x[i])
    train_x[i] = le.transform(train_x[i])
    
    for label in np.unique(test_x[i]): 
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    test_x[i] = le.transform(test_x[i]) 
print('Done.')

Done.


In [6]:
#line과 product code로 나눔
pr_u=train_x['PRODUCT_CODE'].unique()
li_u=train_x['LINE'].unique()
print(pr_u, li_u)
print(test_x)

[0 2 1] [2 3 4 5 1 0]
     LINE  PRODUCT_CODE  X_1   X_2  X_3   X_4   X_5  X_6   X_7   X_8  ...  \
0       5             2  2.0  94.0  0.0  45.0  10.0  0.0  51.0  10.0  ...   
1       4             2  2.0  93.0  0.0  45.0  11.0  0.0  45.0  10.0  ...   
2       4             2  2.0  95.0  0.0  45.0  11.0  0.0  45.0  10.0  ...   
3       0             0  NaN   NaN  NaN   NaN   NaN  NaN   NaN   NaN  ...   
4       1             0  NaN   NaN  NaN   NaN   NaN  NaN   NaN   NaN  ...   
..    ...           ...  ...   ...  ...   ...   ...  ...   ...   ...  ...   
305     5             2  2.0  91.0  0.0  45.0  10.0  0.0  51.0  10.0  ...   
306     4             2  2.0  96.0  0.0  45.0  11.0  0.0  45.0  10.0  ...   
307     5             2  2.0  91.0  0.0  45.0  10.0  0.0  50.0  10.0  ...   
308     5             2  2.0  95.0  0.0  45.0  10.0  0.0  51.0  10.0  ...   
309     5             2  2.0  87.0  0.0  45.0  10.0  0.0  51.0  10.0  ...   

     X_2866  X_2867  X_2868  X_2869  X_2870  X_2871  

In [7]:
#598개 모두 NAN인 경우 0으로 채움
na_check=train_x.isna().sum()
x2=na_check[na_check==598].index
for i in x2:
    train_x[i]=train_x[i].fillna(0)
xx=na_check[na_check > 0].index
xx

Index(['X_1', 'X_2', 'X_3', 'X_4', 'X_5', 'X_6', 'X_7', 'X_8', 'X_9', 'X_10',
       ...
       'X_2866', 'X_2867', 'X_2868', 'X_2869', 'X_2870', 'X_2871', 'X_2872',
       'X_2873', 'X_2874', 'X_2875'],
      dtype='object', length=2875)

In [8]:
#598개 모두 NAN인 경우 0으로 채움
na_check2=test_x.isna().sum()
x22=na_check2[na_check2==len(test_x)].index
for i in x22:
    test_x[i]=test_x[i].fillna(0)
xx2=na_check2[na_check2 > 0].index
xx2

Index(['X_1', 'X_2', 'X_3', 'X_4', 'X_5', 'X_6', 'X_7', 'X_8', 'X_9', 'X_10',
       ...
       'X_2866', 'X_2867', 'X_2868', 'X_2869', 'X_2870', 'X_2871', 'X_2872',
       'X_2873', 'X_2874', 'X_2875'],
      dtype='object', length=2875)

In [9]:
print(train_x)

     LINE  PRODUCT_CODE   X_1   X_2  X_3   X_4   X_5  X_6   X_7   X_8  ...  \
0       2             0   NaN   NaN  NaN   NaN   NaN  NaN   NaN   NaN  ...   
1       3             0   NaN   NaN  NaN   NaN   NaN  NaN   NaN   NaN  ...   
2       2             0   NaN   NaN  NaN   NaN   NaN  NaN   NaN   NaN  ...   
3       3             0   NaN   NaN  NaN   NaN   NaN  NaN   NaN   NaN  ...   
4       2             0   NaN   NaN  NaN   NaN   NaN  NaN   NaN   NaN  ...   
..    ...           ...   ...   ...  ...   ...   ...  ...   ...   ...  ...   
593     5             2   2.0  95.0  0.0  45.0  10.0  0.0  50.0  10.0  ...   
594     2             0   NaN   NaN  NaN   NaN   NaN  NaN   NaN   NaN  ...   
595     2             0   NaN   NaN  NaN   NaN   NaN  NaN   NaN   NaN  ...   
596     4             1  40.0  94.0  0.0  45.0  11.0  0.0  45.0  10.0  ...   
597     5             1  21.0  87.0  0.0  45.0  10.0  0.0  61.0  10.0  ...   

     X_2866  X_2867  X_2868  X_2869  X_2870  X_2871  X_2872  X_

In [10]:
print(test_x)
print(len(test_x))

     LINE  PRODUCT_CODE  X_1   X_2  X_3   X_4   X_5  X_6   X_7   X_8  ...  \
0       5             2  2.0  94.0  0.0  45.0  10.0  0.0  51.0  10.0  ...   
1       4             2  2.0  93.0  0.0  45.0  11.0  0.0  45.0  10.0  ...   
2       4             2  2.0  95.0  0.0  45.0  11.0  0.0  45.0  10.0  ...   
3       0             0  NaN   NaN  NaN   NaN   NaN  NaN   NaN   NaN  ...   
4       1             0  NaN   NaN  NaN   NaN   NaN  NaN   NaN   NaN  ...   
..    ...           ...  ...   ...  ...   ...   ...  ...   ...   ...  ...   
305     5             2  2.0  91.0  0.0  45.0  10.0  0.0  51.0  10.0  ...   
306     4             2  2.0  96.0  0.0  45.0  11.0  0.0  45.0  10.0  ...   
307     5             2  2.0  91.0  0.0  45.0  10.0  0.0  50.0  10.0  ...   
308     5             2  2.0  95.0  0.0  45.0  10.0  0.0  51.0  10.0  ...   
309     5             2  2.0  87.0  0.0  45.0  10.0  0.0  51.0  10.0  ...   

     X_2866  X_2867  X_2868  X_2869  X_2870  X_2871  X_2872  X_2873  X_2874

In [11]:
from sklearn.impute import SimpleImputer

my_imputer = SimpleImputer(strategy="mean")
#imputer 생성
train_x = pd.DataFrame(my_imputer.fit_transform(train_x))
test_x = pd.DataFrame(my_imputer.fit_transform(test_x))

print(test_x)

     0     1         2         3     4     5          6     7          8     \
0     5.0   2.0  2.000000  94.00000   0.0  45.0  10.000000   0.0  51.000000   
1     4.0   2.0  2.000000  93.00000   0.0  45.0  11.000000   0.0  45.000000   
2     4.0   2.0  2.000000  95.00000   0.0  45.0  11.000000   0.0  45.000000   
3     0.0   0.0  4.337449  94.44856   0.0  45.0  10.271605   0.0  48.485597   
4     1.0   0.0  4.337449  94.44856   0.0  45.0  10.271605   0.0  48.485597   
..    ...   ...       ...       ...   ...   ...        ...   ...        ...   
305   5.0   2.0  2.000000  91.00000   0.0  45.0  10.000000   0.0  51.000000   
306   4.0   2.0  2.000000  96.00000   0.0  45.0  11.000000   0.0  45.000000   
307   5.0   2.0  2.000000  91.00000   0.0  45.0  10.000000   0.0  50.000000   
308   5.0   2.0  2.000000  95.00000   0.0  45.0  10.000000   0.0  51.000000   
309   5.0   2.0  2.000000  87.00000   0.0  45.0  10.000000   0.0  51.000000   

          9     ...       2867       2868       286

## Classification Model Fit

In [12]:
RF = RandomForestClassifier(random_state=37).fit(train_x, train_y)
print('Done.')

Done.


## Inference

In [13]:
preds = RF.predict(test_x)
print('Done.')

Done.


## Submit

In [14]:
submit = pd.read_csv('./sample_submission.csv')

In [15]:
submit['Y_Class'] = preds

In [16]:
submit.to_csv('./2_16.csv', index=False)