### Library import

In [1]:
import os
from pprint import pprint

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [2]:
# 출력 관련 세팅
# pd.set_option('display.max_seq_items', None)
# pd.set_option('display.width', 10)
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)

In [3]:
RANDOM_STATE = 110

train_df = pd.read_csv(r'D:\LGAimers\Hackerton\data\train.csv')
train_df

Unnamed: 0,Wip Line_Dam,Process Desc._Dam,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,Insp. Seq No._Dam,Insp Judge Code_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION X Unit Time_Dam,CURE END POSITION X Judge Value_Dam,...,Production Qty Collect Result_Fill2,Production Qty Unit Time_Fill2,Production Qty Judge Value_Fill2,Receip No Collect Result_Fill2,Receip No Unit Time_Fill2,Receip No Judge Value_Fill2,WorkMode Collect Result_Fill2,WorkMode Unit Time_Fill2,WorkMode Judge Value_Fill2,target
0,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,4F1XA938-1,1,OK,240.0,,,...,7,,,127,,,1,,,Normal
1,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,3KPM0016-2,1,OK,240.0,,,...,185,,,1,,,0,,,Normal
2,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1X9167-1,1,OK,1000.0,,,...,10,,,73,,,1,,,Normal
3,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3K1X0057-1,1,OK,1000.0,,,...,268,,,1,,,0,,,Normal
4,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3HPM0007-1,1,OK,240.0,,,...,121,,,1,,,0,,,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40501,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3J1XF434-2,1,OK,240.0,,,...,318,,,1,,,0,,,Normal
40502,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1XC796-1,1,OK,1000.0,,,...,14,,,197,,,1,,,Normal
40503,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,4C1XD438-1,1,OK,240.0,,,...,1,,,27,,,1,,,Normal
40504,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3I1XA258-1,1,OK,1000.0,,,...,117,,,1,,,0,,,Normal


In [4]:
train_df['target'].describe()

count      40506
unique         2
top       Normal
freq       38156
Name: target, dtype: object

### 결측치 처리

In [5]:
# 특정 값 (OK) null 로 대체
train_df = train_df.replace('OK', np.nan)

# Dataset에서 결측치 확인하기
missing_values = train_df.isnull().sum()

missing_values

  train_df = train_df.replace('OK', np.nan)


Wip Line_Dam                         0
Process Desc._Dam                    0
Equipment_Dam                        0
Model.Suffix_Dam                     0
Workorder_Dam                        0
                                 ...  
Receip No Judge Value_Fill2      40506
WorkMode Collect Result_Fill2        0
WorkMode Unit Time_Fill2         40506
WorkMode Judge Value_Fill2       40506
target                               0
Length: 464, dtype: int64

In [6]:
# 결측치 비율 계산
total_entries = len(train_df)
missing_percentage = (train_df.isnull().sum() / total_entries) *100

# 결측치 비율 확인
missing_percentage.sort_values(ascending=False)

WorkMode Judge Value_Fill2             100.0
WorkMode Unit Time_Fill2               100.0
Receip No Judge Value_Fill2            100.0
Receip No Unit Time_Fill2              100.0
Production Qty Judge Value_Fill2       100.0
                                       ...  
PalletID Collect Result_Fill2            0.0
Production Qty Collect Result_Fill2      0.0
Receip No Collect Result_Fill2           0.0
WorkMode Collect Result_Fill2            0.0
target                                   0.0
Length: 464, dtype: float64

In [7]:
# 결측치 비율이 일정 수준 이상인 속성들을 삭제
# 50%를 기준으로 설정
threshold = 70.0
columns_to_drop = missing_percentage[missing_percentage > threshold].index

# 해당 속성들을 데이터셋에서 삭제
cleaned_data = train_df.drop(columns = columns_to_drop)

# 새로운 데이터셋의 크기 확인
cleaned_data.shape

(40506, 173)

In [8]:
print(columns_to_drop)

Index(['Insp Judge Code_Dam', 'CURE END POSITION X Unit Time_Dam',
       'CURE END POSITION X Judge Value_Dam',
       'CURE END POSITION Z Unit Time_Dam',
       'CURE END POSITION Z Judge Value_Dam',
       'CURE END POSITION Θ Unit Time_Dam',
       'CURE END POSITION Θ Judge Value_Dam', 'CURE SPEED Unit Time_Dam',
       'CURE SPEED Judge Value_Dam', 'CURE STANDBY POSITION X Unit Time_Dam',
       ...
       'Machine Tact time Unit Time_Fill2',
       'Machine Tact time Judge Value_Fill2', 'PalletID Unit Time_Fill2',
       'PalletID Judge Value_Fill2', 'Production Qty Unit Time_Fill2',
       'Production Qty Judge Value_Fill2', 'Receip No Unit Time_Fill2',
       'Receip No Judge Value_Fill2', 'WorkMode Unit Time_Fill2',
       'WorkMode Judge Value_Fill2'],
      dtype='object', length=291)


In [9]:
cleaned_data['target'].describe()

count      40506
unique         2
top       Normal
freq       38156
Name: target, dtype: object

In [10]:
pd.set_option('display.max_seq_items', None)

In [11]:
# Dataset에서 결측치 확인하기
missing_values = cleaned_data.isnull().sum()

print(missing_values)

Wip Line_Dam                           0
Process Desc._Dam                      0
Equipment_Dam                          0
Model.Suffix_Dam                       0
Workorder_Dam                          0
                                      ..
PalletID Collect Result_Fill2          0
Production Qty Collect Result_Fill2    0
Receip No Collect Result_Fill2         0
WorkMode Collect Result_Fill2          0
target                                 0
Length: 173, dtype: int64


In [12]:
cleaned_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40506 entries, 0 to 40505
Columns: 173 entries, Wip Line_Dam to target
dtypes: float64(72), int64(77), object(24)
memory usage: 53.5+ MB


In [13]:
# 수치형 변수의 결측치 중앙값으로 대체
num_cols = cleaned_data.select_dtypes(include=['float64', 'int64']).columns
for col in num_cols:
    if cleaned_data[col].isnull().sum() > 0:
        cleaned_data[col].fillna(cleaned_data[col].median(), inplace = True)

# 범주형 변수의 결측치는 최빈값으로 대체
cat_cols = cleaned_data.select_dtypes(include=['object']).columns
for col in cat_cols:
    if cleaned_data[col].isnull().sum() > 0:
        cleaned_data[col].fillna(cleaned_data[col].mode()[0], inplace = True)

# 결측치 처리 후 데이터프레임
after_missing_data = cleaned_data

# 결측치 처리 후 확인
missing_data_after = cleaned_data.isnull().sum()
missing_data_after = missing_data_after[missing_data_after > 0]

missing_data_after

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  cleaned_data[col].fillna(cleaned_data[col].mode()[0], inplace = True)


Series([], dtype: int64)

### Label Encoding

In [14]:
from sklearn.preprocessing import LabelEncoder

# object type을 가지는 columns 추출
object_cols = after_missing_data.select_dtypes(include=['object']).columns

# object type columns에 label encoding 수행
label_encoder = LabelEncoder()
for col in object_cols:
    after_missing_data[col] = label_encoder.fit_transform(after_missing_data[col].astype(str))

after_missing_data

Unnamed: 0,Wip Line_Dam,Process Desc._Dam,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,Insp. Seq No._Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,CURE SPEED Collect Result_Dam,...,Head Clean Position Z Collect Result_Fill2,Head Purge Position X Collect Result_Fill2,Head Purge Position Y Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,WorkMode Collect Result_Fill2,target
0,0,0,0,3,657,1,240.0,2.5,-90,100,...,50.0,91.8,270,50,114.612,19.9,7,127,1,1
1,0,0,0,3,283,1,240.0,2.5,-90,70,...,91.8,270.0,50,85,19.600,7.0,185,1,0,1
2,0,0,1,0,589,1,1000.0,12.5,90,85,...,50.0,91.8,270,50,114.612,19.8,10,73,1,1
3,0,0,1,0,251,1,1000.0,12.5,90,70,...,91.8,270.0,50,85,19.900,12.0,268,1,0,1
4,0,0,0,0,142,1,240.0,2.5,-90,70,...,91.8,270.0,50,85,19.700,8.0,121,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40501,0,0,0,0,238,1,240.0,2.5,-90,70,...,91.8,270.0,50,85,19.200,1.0,318,1,0,1
40502,0,0,1,0,643,1,1000.0,12.5,90,100,...,50.0,91.8,270,50,114.612,20.5,14,197,1,1
40503,0,0,0,0,540,1,240.0,2.5,-90,100,...,50.0,91.8,270,50,85.000,19.7,1,27,1,1
40504,0,0,1,0,164,1,1000.0,12.5,90,70,...,91.8,270.0,50,85,20.100,13.0,117,1,0,1


In [15]:
# import matplotlib.pyplot as plt

# # 데이터 히스토그램
# plt.rcParams['figure.figsize'] = [20, 400]
# after_missing_data.hist(bins = 30, layout=(100, 2))
# plt.show()

In [16]:
after_missing_data['target'].describe()

count    40506.000000
mean         0.941984
std          0.233777
min          0.000000
25%          1.000000
50%          1.000000
75%          1.000000
max          1.000000
Name: target, dtype: float64

### 이상치 처리

In [17]:
# 수치형 변수의 이상치 탐지 및 처리
num_cols = after_missing_data.select_dtypes(include = ['float64', 'int64']).columns

# IQR 기반 이상치 탐지 및 처리
for col in num_cols:
    Q1 = after_missing_data[col].quantile(0.25)
    Q3 = after_missing_data[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # 이상치를 상한 및 하한으로 대체
    after_missing_data[col] = after_missing_data[col].apply(lambda x: lower_bound if x < lower_bound else x)
    after_missing_data[col] = after_missing_data[col].apply(lambda x: upper_bound if x > upper_bound else x)

# 이상치 처리 후 데이터 요약 통계 확인
outliers_processed_data = after_missing_data
outliers_processed_data.describe()

Unnamed: 0,Wip Line_Dam,Process Desc._Dam,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,Insp. Seq No._Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,CURE SPEED Collect Result_Dam,...,Head Clean Position Z Collect Result_Fill2,Head Purge Position X Collect Result_Fill2,Head Purge Position Y Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,WorkMode Collect Result_Fill2,target
count,40506.0,40506.0,40506.0,40506.0,40506.0,40506.0,40506.0,40506.0,40506.0,40506.0,...,40506.0,40506.0,40506.0,40506.0,40506.0,40506.0,40506.0,40506.0,40506.0,40506.0
mean,0.0,0.0,0.382067,0.0,307.362218,1.0,530.370809,6.320669,-21.227966,70.0,...,66.972414,164.156081,180.671505,64.211351,62.046416,14.385228,69.840357,108.320557,0.639473,1.0
std,0.0,0.0,0.485899,0.0,183.213263,0.0,369.283055,4.858988,87.461776,0.0,...,20.527895,87.513657,108.041552,17.188429,36.454902,7.25855,95.368614,129.579191,0.586801,0.0
min,0.0,0.0,0.0,0.0,0.0,1.0,240.0,2.5,-90.0,70.0,...,50.0,91.8,50.0,50.0,17.6,1.0,0.0,0.0,0.0,1.0
25%,0.0,0.0,0.0,0.0,162.0,1.0,240.0,2.5,-90.0,70.0,...,50.0,91.8,50.0,50.0,19.6,7.0,7.0,1.0,0.0,1.0
50%,0.0,0.0,0.0,0.0,298.0,1.0,240.0,2.5,-90.0,70.0,...,50.0,91.8,270.0,50.0,85.0,19.5,13.0,49.0,1.0,1.0
75%,0.0,0.0,1.0,0.0,449.0,1.0,1000.0,12.5,90.0,70.0,...,91.8,270.0,270.0,85.0,85.0,19.9,120.0,196.0,1.0,1.0
max,0.0,0.0,1.0,0.0,662.0,1.0,1000.0,12.5,90.0,70.0,...,91.8,270.0,270.0,85.0,183.1,39.25,289.5,488.5,2.5,1.0


In [18]:
after_missing_data['target'].describe()

count    40506.0
mean         1.0
std          0.0
min          1.0
25%          1.0
50%          1.0
75%          1.0
max          1.0
Name: target, dtype: float64

### Scaling 진행

In [19]:
from sklearn.preprocessing import StandardScaler, RobustScaler, Normalizer

# 수치형 변수 선택
num_cols = outliers_processed_data.select_dtypes(include=['float64', 'int64']).columns
num_cols = num_cols[num_cols != 'target']

# 각 스케일러를 사용하여 데이터 스케일링
scalers = {
    'StandardScaler': StandardScaler(),
    'RobustScaler': RobustScaler(),
    'Normalizer': Normalizer()
}

scaled_data = {}

# 원본 데이터 복사하여 스케일링 적용
for scaler_name, scaler in scalers.items():
    data_copy = outliers_processed_data.copy()
    data_copy[num_cols] = scaler.fit_transform(data_copy[num_cols])
    scaled_data[scaler_name] = data_copy

# 스케일링된 데이터의 확인을 위해 첫 5개의 데이터를 각각 확인
scaled_data_overview = {name: df.head() for name, df in scaled_data.items()}
scaled_data_overview

{'StandardScaler':    Wip Line_Dam  Process Desc._Dam  Equipment_Dam  Model.Suffix_Dam  \
 0           0.0                0.0      -0.786319               0.0   
 1           0.0                0.0      -0.786319               0.0   
 2           0.0                0.0       1.271748               0.0   
 3           0.0                0.0       1.271748               0.0   
 4           0.0                0.0      -0.786319               0.0   
 
    Workorder_Dam  Insp. Seq No._Dam  CURE END POSITION X Collect Result_Dam  \
 0       1.908389                0.0                               -0.786319   
 1      -0.132974                0.0                               -0.786319   
 2       1.537232                0.0                                1.271748   
 3      -0.307636                0.0                                1.271748   
 4      -0.902578                0.0                               -0.786319   
 
    CURE END POSITION Z Collect Result_Dam  \
 0                  

In [20]:
scaled_data['Normalizer']['target'].describe()

count    40506.0
mean         1.0
std          0.0
min          1.0
25%          1.0
50%          1.0
75%          1.0
max          1.0
Name: target, dtype: float64

In [21]:
from imblearn.over_sampling import SMOTE

# SMOTE 초기화
smote = SMOTE(random_state=42)

# 오버샘플링을 수행할 스케일링된 데이터셋 이름
scaler_names = ['StandardScaler', 'RobustScaler', 'Normalizer']

# 오버샘플링된 데이터를 저장할 딕셔너리
oversampled_data = {}

# 각 스케일링된 데이터셋에 대해 오버샘플링 수행
for scaler_name in scaler_names:
    data_copy = scaled_data[scaler_name]

    X = data_copy.drop('target', axis = 1)
    y = data_copy['target']

    print(y.describe())

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

    X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

    oversampled_data[scaler_name] = (X_resampled, y_resampled)

# 오버샘플링된 데이터 확인
for name, (X_res, y_res) in oversampled_data.items():
    print(f"{name} oversampled data class distribution:\n", y_res.value_counts(), "\n")


count    40506.0
mean         1.0
std          0.0
min          1.0
25%          1.0
50%          1.0
75%          1.0
max          1.0
Name: target, dtype: float64


ValueError: The target 'y' needs to have more than 1 class. Got 1 class instead