## Import

In [104]:
import pandas as pd
import random
import os
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

In [105]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(37) 

## Data Load

In [3]:
from google.colab import files
uploaded = files.upload()

Saving train.csv to train.csv


In [4]:
from google.colab import files
uploaded = files.upload()

Saving test.csv to test.csv


In [5]:
from google.colab import files
uploaded = files.upload()

Saving sample_submission.csv to sample_submission.csv


In [106]:
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')

In [107]:
train_x = train_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality'])
train_y = train_df['Y_Class']

test_x = test_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP'])

## Data Pre-processing

In [None]:
#train_x = train_x.fillna(0)
#test_x = test_x.fillna(0)

In [108]:
# qualitative to quantitative
# one-hot encoding

qual_col = ['LINE', 'PRODUCT_CODE']

for i in qual_col:
    le = LabelEncoder()
    le = le.fit(train_x[i])
    train_x[i] = le.transform(train_x[i])
    
    for label in np.unique(test_x[i]): 
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    test_x[i] = le.transform(test_x[i]) 
print('Done.')

Done.


In [109]:
#전부 결측치인 column 먼저 0으로 채우기

import math

for column in train_x:
  if math.isnan(train_x[column].mean()):
    train_x[column] = 0

for column in test_x:
  if math.isnan(test_x[column].mean()):
    test_x[column] = 0

#### 결측치 채우는 방법 3가지

In [110]:
#결측치 평균

import pandas as pd
from sklearn.impute import SimpleImputer

my_imputer = SimpleImputer(strategy="mean")
#imputer 생성
train_x = pd.DataFrame(my_imputer.fit_transform(train_x), columns = train_x.columns)
test_x = pd.DataFrame(my_imputer.fit_transform(test_x), columns = test_x.columns)

In [None]:
#결측치 중간값

import pandas as pd
from sklearn.impute import SimpleImputer

my_imputer = SimpleImputer(strategy="median")
#imputer 생성
train_x = pd.DataFrame(my_imputer.fit_transform(train_x))
test_x = pd.DataFrame(my_imputer.fit_transform(test_x))

In [None]:
#결측치 최빈값

import pandas as pd
from sklearn.impute import SimpleImputer

my_imputer = SimpleImputer(strategy="most_frequent")
#imputer 생성
train_x = pd.DataFrame(my_imputer.fit_transform(train_x))
test_x = pd.DataFrame(my_imputer.fit_transform(test_x))

In [49]:
train_x

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2867,2868,2869,2870,2871,2872,2873,2874,2875,2876
0,2.0,0.0,2.409742,95.123209,0.0,45.0,10.39255,0.0,48.802292,10.048711,...,39.3400,40.8900,32.5600,34.0900,77.7700,1.0,0.0,0.0,0.0,0.0
1,3.0,0.0,2.409742,95.123209,0.0,45.0,10.39255,0.0,48.802292,10.048711,...,38.8900,42.8200,43.9200,35.3400,72.5500,1.0,0.0,0.0,0.0,0.0
2,2.0,0.0,2.409742,95.123209,0.0,45.0,10.39255,0.0,48.802292,10.048711,...,39.1900,36.6500,42.4700,36.5300,78.3500,1.0,0.0,0.0,0.0,0.0
3,3.0,0.0,2.409742,95.123209,0.0,45.0,10.39255,0.0,48.802292,10.048711,...,37.7400,39.1700,52.1700,30.5800,71.7800,1.0,0.0,0.0,0.0,0.0
4,2.0,0.0,2.409742,95.123209,0.0,45.0,10.39255,0.0,48.802292,10.048711,...,38.7000,41.8900,46.9300,33.0900,76.9700,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,5.0,2.0,2.000000,95.000000,0.0,45.0,10.00000,0.0,50.000000,10.000000,...,50.8073,53.6077,49.6062,51.6598,66.6497,1.0,0.0,0.0,0.0,0.0
594,2.0,0.0,2.409742,95.123209,0.0,45.0,10.39255,0.0,48.802292,10.048711,...,49.4700,53.0700,50.8900,55.1000,66.4900,1.0,0.0,0.0,0.0,0.0
595,2.0,0.0,2.409742,95.123209,0.0,45.0,10.39255,0.0,48.802292,10.048711,...,50.8073,53.6077,49.6062,51.6598,66.6497,1.0,0.0,0.0,0.0,0.0
596,4.0,1.0,40.000000,94.000000,0.0,45.0,11.00000,0.0,45.000000,10.000000,...,50.8073,53.6077,49.6062,51.6598,66.6497,1.0,0.0,0.0,0.0,0.0


In [48]:
train_x.columns

RangeIndex(start=0, stop=2877, step=1)

In [61]:
train_x['PRODUCT_CODE'] == 0

0       True
1       True
2       True
3       True
4       True
       ...  
593    False
594     True
595     True
596    False
597    False
Name: PRODUCT_CODE, Length: 598, dtype: bool

#### 이상치 제거를 위해 나누기

In [62]:
#train_a = train_x[train_x['PRODUCT_CODE'] == 0]
#train_o = train_x[train_x['PRODUCT_CODE'] == 1]
#train_t = train_x[train_x['PRODUCT_CODE'] == 2]

In [73]:
#train_a = train_x[(train_x['PRODUCT_CODE'] == 0) & (train_x['LINE'] == 0.0)]
#train_a = train_x[(train_x['LINE'] == 0.0)]

In [111]:
line = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]
prod_code = [0, 1, 2]

In [79]:
type(df_temp['X_1'])

pandas.core.series.Series

In [82]:
np.percentile(train_x['X_1'],75)

2.4097421203438394

In [112]:
train_new_x = pd.DataFrame()

for i in line:
  for k in prod_code:
    print(k, i)
    df_temp = train_x[(train_x['PRODUCT_CODE'] == k) & (train_x['LINE'] == i)]
    
    if len(df_temp) != 0:
      for column in df_temp:
        iqr_value = np.percentile(df_temp[column],75) - np.percentile(df_temp[column],25) 
        upper_bound = iqr_value * 1.5 + np.percentile(df_temp[column],75)
        lower_bound = np.percentile(df_temp[column],25) - iqr_value * 1.5   
        df_temp = df_temp[(df_temp[column] <= upper_bound) & (df_temp[column] >= lower_bound)]
    
    train_new_x = train_new_x.append(df_temp)
      


0 0.0
1 0.0
2 0.0
0 1.0
1 1.0
2 1.0
0 2.0
1 2.0
2 2.0
0 3.0
1 3.0
2 3.0
0 4.0
1 4.0
2 4.0
0 5.0
1 5.0
2 5.0


In [88]:
train_new_x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23 entries, 220 to 492
Columns: 2877 entries, LINE to X_2875
dtypes: float64(2877)
memory usage: 517.1 KB


In [113]:
train_new_x

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
220,0.0,0.0,2.409742,95.123209,0.0,45.0,10.39255,0.0,48.802292,10.048711,...,50.8073,53.6077,49.6062,51.6598,66.6497,1.0,0.0,0.0,0.0,0.0
480,0.0,0.0,2.409742,95.123209,0.0,45.0,10.39255,0.0,48.802292,10.048711,...,50.8073,53.6077,49.6062,51.6598,66.6497,1.0,0.0,0.0,0.0,0.0
530,0.0,0.0,2.409742,95.123209,0.0,45.0,10.39255,0.0,48.802292,10.048711,...,50.8073,53.6077,49.6062,51.6598,66.6497,1.0,0.0,0.0,0.0,0.0
146,1.0,0.0,2.409742,95.123209,0.0,45.0,10.39255,0.0,48.802292,10.048711,...,50.8073,53.6077,49.6062,51.6598,66.6497,1.0,0.0,0.0,0.0,0.0
424,1.0,0.0,2.409742,95.123209,0.0,45.0,10.39255,0.0,48.802292,10.048711,...,50.8073,53.6077,49.6062,51.6598,66.6497,1.0,0.0,0.0,0.0,0.0
79,2.0,0.0,2.409742,95.123209,0.0,45.0,10.39255,0.0,48.802292,10.048711,...,53.49,55.59,60.0,58.18,66.79,1.0,0.0,0.0,0.0,0.0
199,2.0,0.0,2.409742,95.123209,0.0,45.0,10.39255,0.0,48.802292,10.048711,...,54.76,54.93,59.68,56.78,64.68,1.0,0.0,0.0,0.0,0.0
322,2.0,0.0,2.409742,95.123209,0.0,45.0,10.39255,0.0,48.802292,10.048711,...,55.99,52.08,40.9,55.31,62.8,1.0,0.0,0.0,0.0,0.0
7,3.0,0.0,2.409742,95.123209,0.0,45.0,10.39255,0.0,48.802292,10.048711,...,39.0,35.95,59.51,30.49,72.77,1.0,0.0,0.0,0.0,0.0
19,3.0,0.0,2.409742,95.123209,0.0,45.0,10.39255,0.0,48.802292,10.048711,...,50.8073,53.6077,49.6062,51.6598,66.6497,1.0,0.0,0.0,0.0,0.0


In [140]:
new_index = list(train_new_x.index)

In [141]:
new_index

[220,
 480,
 530,
 146,
 424,
 79,
 199,
 322,
 7,
 19,
 24,
 569,
 571,
 596,
 350,
 409,
 438,
 570,
 572,
 597,
 376,
 413,
 492]

In [114]:
train_y

0      1
1      2
2      1
3      2
4      1
      ..
593    1
594    0
595    0
596    1
597    1
Name: Y_Class, Length: 598, dtype: int64

In [120]:
train_y = pd.DataFrame(train_y, columns=['Y_Class'])

In [136]:
train_y.loc[[1,2]]

Unnamed: 0,Y_Class
1,2
2,1


In [145]:
train_new_y = pd.DataFrame() 

train_new_y = train_new_y.append(train_y.loc[new_index])

In [146]:
train_new_y

Unnamed: 0,Y_Class
220,1
480,1
530,1
146,1
424,1
79,2
199,1
322,0
7,2
19,2


## Classification Model Fit

In [147]:
RF = RandomForestClassifier(random_state=37).fit(train_new_x, train_new_y)
print('Done.')

  RF = RandomForestClassifier(random_state=37).fit(train_new_x, train_new_y)


Done.


In [154]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [155]:
model = DecisionTreeClassifier(max_depth = 20, random_state=37)
preds = model.fit(train_new_x, train_new_y).predict(test_x)

## Inference

In [148]:
preds = RF.predict(test_x)
print('Done.')

Done.


## Submit

In [156]:
submit = pd.read_csv('./sample_submission.csv')

In [157]:
submit['Y_Class'] = preds

In [158]:
submit.to_csv('./submission5.csv', index=False)