# 제품 이상여부 판별 프로젝트


## 데이터 불러오기


### 필수 라이브러리


In [1]:
import os
from pprint import pprint

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm

import random
import torch
import warnings
warnings.filterwarnings("ignore")

### 데이터 읽어오기


In [2]:
# Directory that holds train.csv and test.csv.
ROOT_DIR = "data"
# Seed passed to seed_everything() for all random number generators.
RANDOM_STATE = 736665
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch,
    and exports ``PYTHONHASHSEED`` to the environment.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # The hash seed of the running interpreter is fixed at startup, so this
    # only influences Python processes launched afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    # Use the argument, not the module-level RANDOM_STATE, so the function
    # honours whatever seed the caller passes.
    torch.manual_seed(seed)


# Seed the random number generators before the data is loaded.
seed_everything(RANDOM_STATE)
# Load data
train_data = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))
test_data = pd.read_csv(os.path.join(ROOT_DIR, "test.csv"))

In [3]:
# test_data has a target column, but every value in it is missing.
# test_data also has a Set ID column, so it has one more column than train_data.
train_data.shape, test_data.shape

((40506, 464), (17361, 465))

In [4]:
train_data

Unnamed: 0,Wip Line_Dam,Process Desc._Dam,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,Insp. Seq No._Dam,Insp Judge Code_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION X Unit Time_Dam,CURE END POSITION X Judge Value_Dam,...,Production Qty Collect Result_Fill2,Production Qty Unit Time_Fill2,Production Qty Judge Value_Fill2,Receip No Collect Result_Fill2,Receip No Unit Time_Fill2,Receip No Judge Value_Fill2,WorkMode Collect Result_Fill2,WorkMode Unit Time_Fill2,WorkMode Judge Value_Fill2,target
0,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,4F1XA938-1,1,OK,240.0,,,...,7,,,127,,,1,,,Normal
1,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,3KPM0016-2,1,OK,240.0,,,...,185,,,1,,,0,,,Normal
2,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1X9167-1,1,OK,1000.0,,,...,10,,,73,,,1,,,Normal
3,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3K1X0057-1,1,OK,1000.0,,,...,268,,,1,,,0,,,Normal
4,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3HPM0007-1,1,OK,240.0,,,...,121,,,1,,,0,,,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40501,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3J1XF434-2,1,OK,240.0,,,...,318,,,1,,,0,,,Normal
40502,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1XC796-1,1,OK,1000.0,,,...,14,,,197,,,1,,,Normal
40503,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,4C1XD438-1,1,OK,240.0,,,...,1,,,27,,,1,,,Normal
40504,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3I1XA258-1,1,OK,1000.0,,,...,117,,,1,,,0,,,Normal


In [5]:
train_data['target'].value_counts(normalize=True)

target
Normal      0.941984
AbNormal    0.058016
Name: proportion, dtype: float64

## 데이터 전처리

### 전체 행이 결측치이거나 똑같은 값인 칼럼 삭제

In [6]:
import pandas as pd

def drop_columns(df):
    """Drop uninformative columns from ``df`` and report which were removed.

    Two passes are applied to ``df`` in place:

    1. columns in which every value is missing;
    2. columns that hold exactly one distinct value and no missing values.

    Args:
        df: DataFrame to clean. It is modified in place.

    Returns:
        tuple: ``(df, drop_cols)`` where ``df`` is the same (mutated) object
        and ``drop_cols`` lists the removed column names in their original
        column order, all-missing columns first, then constant columns.
    """
    drop_cols = []

    # Pass 1: remove columns in which every row is missing.
    base_col = df.columns
    df.dropna(axis=1, how='all', inplace=True)
    # Walk the original columns in order (instead of a set difference) so the
    # reported list is deterministic.
    remaining = set(df.columns)
    dif_col = [col for col in base_col if col not in remaining]
    drop_cols.extend(dif_col)
    print('모든 값이 결측치인 열 제거 진행')
    print(f'drop한 컬럼 : {dif_col}')
    print(f'drop한 컬럼 개수 : {len(dif_col)}개')

    # Pass 2: remove columns with a single distinct value and no missing
    # values. Collect them first and drop once rather than dropping inside
    # the loop.
    dif_col = [
        col
        for col in df.columns
        if (df[col].nunique() == 1) and (df[col].isnull().sum() == 0)
    ]
    df.drop(columns=dif_col, inplace=True)
    drop_cols.extend(dif_col)
    print('모든 값이 같은 열 제거 진행')
    print(f'drop한 컬럼 : {dif_col}')
    print(f'drop한 컬럼 개수 : {len(dif_col)}개')

    return df, drop_cols

# Clean train_data and remember which columns were removed.
train_data, drop_cols = drop_columns(train_data)
# Remove the same columns from test_data so both frames keep matching features.
test_data = test_data.drop(columns=drop_cols)
# Notebook display: shapes after the drop.
train_data.shape, test_data.shape

모든 값이 결측치인 열 제거 진행
drop한 컬럼 : ['Head Purge Position Y Judge Value_Fill2', 'PalletID Judge Value_Dam', 'Stage2 Circle3 Distance Speed Judge Value_Dam', 'Stage1 Line2 Distance Speed Judge Value_Dam', 'CURE STANDBY POSITION Z Judge Value_Dam', 'Dispense Volume(Stage1) Unit Time_Fill2', 'Receip No Judge Value_Dam', 'Stage2 Circle2 Distance Speed Unit Time_Dam', 'PalletID Unit Time_Dam', 'HEAD Standby Position X Unit Time_Dam', 'Head Purge Position Z Judge Value_Fill1', 'Machine Tact time Judge Value_Fill1', 'HEAD NORMAL COORDINATE Z AXIS(Stage2) Judge Value_Fill1', 'WorkMode Unit Time_Dam', 'HEAD NORMAL COORDINATE X AXIS(Stage1) Unit Time_Dam', 'Stage3 Line4 Distance Speed Unit Time_Dam', 'DISCHARGED TIME OF RESIN(Stage2) Unit Time_Fill1', 'Head Purge Position Y Unit Time_Fill2', 'Machine Tact time Judge Value_Dam', 'Stage3 Line4 Distance Speed Judge Value_Dam', 'HEAD Standby Position Y Judge Value_Dam', 'HEAD NORMAL COORDINATE Z AXIS(Stage2) Unit Time_Fill2', 'CURE END POSITION X Unit Tim

((40506, 151), (17361, 152))

### 중복 컬럼 제거

In [7]:
# columns_dict: tuple of a column's values -> first column seen with those values.
# duplicate_dict: that first column -> later columns holding identical values.
columns_dict = {}
duplicate_dict = {}

for col in train_data.columns:
    # The tuple of every row value serves as the lookup key for the column.
    col_hash = tuple(train_data[col].tolist())

    # First time these values are seen: remember the column and move on.
    if col_hash not in columns_dict:
        columns_dict[col_hash] = col
        continue

    # Otherwise record this column as a duplicate of the earlier one.
    existing_col = columns_dict[col_hash]
    duplicate_dict.setdefault(existing_col, []).append(col)
duplicate_dict

{'CURE END POSITION Θ Collect Result_Dam': ['CURE START POSITION Θ Collect Result_Dam'],
 'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam': ['HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Dam'],
 'HEAD Standby Position Y Collect Result_Dam': ['Head Purge Position Y Collect Result_Dam'],
 'Stage1 Circle2 Distance Speed Collect Result_Dam': ['Stage1 Circle3 Distance Speed Collect Result_Dam',
  'Stage1 Circle4 Distance Speed Collect Result_Dam'],
 'Stage1 Line1 Distance Speed Collect Result_Dam': ['Stage1 Line3 Distance Speed Collect Result_Dam'],
 'Stage2 Circle2 Distance Speed Collect Result_Dam': ['Stage2 Circle3 Distance Speed Collect Result_Dam',
  'Stage2 Circle4 Distance Speed Collect Result_Dam',
  'Stage2 Line1 Distance Speed Collect Result_Dam'],
 'Stage3 Circle2 Distance Speed Collect Result_Dam': ['Stage3 Circle3 Distance Speed Collect Result_Dam',
  'Stage3 Circle4 Distance Speed Collect Result_Dam'],
 'Stage3 Line1 Distance Speed Collect Result_Dam': ['Stag

In [8]:
# Flatten every group of duplicated columns into a single list of columns to drop.
delete_cols = [dup for dup_group in duplicate_dict.values() for dup in dup_group]
delete_cols

['CURE START POSITION Θ Collect Result_Dam',
 'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Dam',
 'Head Purge Position Y Collect Result_Dam',
 'Stage1 Circle3 Distance Speed Collect Result_Dam',
 'Stage1 Circle4 Distance Speed Collect Result_Dam',
 'Stage1 Line3 Distance Speed Collect Result_Dam',
 'Stage2 Circle3 Distance Speed Collect Result_Dam',
 'Stage2 Circle4 Distance Speed Collect Result_Dam',
 'Stage2 Line1 Distance Speed Collect Result_Dam',
 'Stage3 Circle3 Distance Speed Collect Result_Dam',
 'Stage3 Circle4 Distance Speed Collect Result_Dam',
 'Stage3 Line3 Distance Speed Collect Result_Dam',
 'Model.Suffix_AutoClave',
 'Model.Suffix_Fill1',
 'Model.Suffix_Fill2',
 'Workorder_AutoClave',
 'Workorder_Fill1',
 'Workorder_Fill2',
 'GMES_ORIGIN_INSP_JUDGE_CODE Collect Result_AutoClave',
 'GMES_ORIGIN_INSP_JUDGE_CODE Judge Value_AutoClave',
 'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill1',
 'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill2',
 'Head Pur

In [9]:
# Remove the duplicated columns from both frames.
train_data.drop(columns=delete_cols, inplace=True)
test_data.drop(columns=delete_cols, inplace=True)
# Notebook display: shapes after the drop.
train_data.shape, test_data.shape

((40506, 125), (17361, 126))

### OK와 NaN만 있는 열 제거

In [10]:
# Per the section heading, this column holds only 'OK' or NaN, so it is
# dropped from both frames.
train_data.drop(columns= ['HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Dam'], inplace=True)
test_data.drop(columns= ['HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Dam'], inplace=True)
train_data.shape, test_data.shape

((40506, 124), (17361, 125))

### 파생 변수 생성

#### 상관관계가 1, -1인 열 확인 - 할지 말지는 선택
- 특정 열들의 unique값이 아주 적은 경우

In [11]:
# Numeric (int64 / float64) columns of train_data
num_cols = train_data.select_dtypes(include=["int64", "float64"]).columns
# Pairwise correlation matrix of those columns
corr_matrix = train_data[num_cols].corr()
corr_matrix

Unnamed: 0,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,CURE SPEED Collect Result_Dam,CURE START POSITION X Collect Result_Dam,DISCHARGED SPEED OF RESIN Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam,Dispense Volume(Stage1) Collect Result_Dam,...,Head Clean Position X Collect Result_Fill2,Head Clean Position Y Collect Result_Fill2,Head Clean Position Z Collect Result_Fill2,Head Purge Position X Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,WorkMode Collect Result_Fill2
CURE END POSITION X Collect Result_Dam,1.000000,1.000000,1.000000,0.072784,-1.000000,0.218303,0.274864,0.284963,0.273605,0.270166,...,-0.279744,0.279744,-0.279744,-0.279744,-0.279744,0.268463,0.415902,-0.167640,0.190253,0.148343
CURE END POSITION Z Collect Result_Dam,1.000000,1.000000,1.000000,0.072784,-1.000000,0.218303,0.274864,0.284963,0.273605,0.270166,...,-0.279744,0.279744,-0.279744,-0.279744,-0.279744,0.268463,0.415902,-0.167640,0.190253,0.148343
CURE END POSITION Θ Collect Result_Dam,1.000000,1.000000,1.000000,0.072784,-1.000000,0.218303,0.274864,0.284963,0.273605,0.270166,...,-0.279744,0.279744,-0.279744,-0.279744,-0.279744,0.268463,0.415902,-0.167640,0.190253,0.148343
CURE SPEED Collect Result_Dam,0.072784,0.072784,0.072784,1.000000,-0.072784,0.411677,0.049065,0.313518,0.036990,0.190544,...,-0.327282,0.327282,-0.327282,-0.327282,-0.327282,0.500878,0.322051,-0.245664,0.106939,0.172034
CURE START POSITION X Collect Result_Dam,-1.000000,-1.000000,-1.000000,-0.072784,1.000000,-0.218303,-0.274864,-0.284963,-0.273605,-0.270166,...,0.279744,-0.279744,0.279744,0.279744,0.279744,-0.268463,-0.415902,0.167640,-0.190253,-0.148343
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Machine Tact time Collect Result_Fill2,0.268463,0.268463,0.268463,0.500878,-0.268463,0.806875,0.277224,0.783062,0.267644,0.487390,...,-0.966389,0.966389,-0.966389,-0.966389,-0.966389,1.000000,0.911833,-0.723120,0.625772,0.505677
PalletID Collect Result_Fill2,0.415902,0.415902,0.415902,0.322051,-0.415902,0.751740,0.368828,0.762839,0.362118,0.518180,...,-0.935672,0.935672,-0.935672,-0.935672,-0.935672,0.911833,1.000000,-0.691297,0.632636,0.473422
Production Qty Collect Result_Fill2,-0.167640,-0.167640,-0.167640,-0.245664,0.167640,-0.595208,-0.177210,-0.587332,-0.171922,-0.304106,...,0.748361,-0.748361,0.748361,0.748361,0.748361,-0.723120,-0.691297,1.000000,-0.509699,-0.369916
Receip No Collect Result_Fill2,0.190253,0.190253,0.190253,0.106939,-0.190253,0.553829,0.165186,0.508740,0.161199,0.233516,...,-0.680962,0.680962,-0.680962,-0.680962,-0.680962,0.625772,0.632636,-0.509699,1.000000,0.312106


In [12]:
# Keep only entries whose correlation is (numerically) 1 or -1, i.e.
# |corr| >= 0.99999. The mask keeps the matrix shape, so every other cell
# becomes NaN.
strong_corrs = corr_matrix[(corr_matrix >= 0.99999) | (corr_matrix <= -0.99999) ]
strong_corrs

Unnamed: 0,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,CURE SPEED Collect Result_Dam,CURE START POSITION X Collect Result_Dam,DISCHARGED SPEED OF RESIN Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam,Dispense Volume(Stage1) Collect Result_Dam,...,Head Clean Position X Collect Result_Fill2,Head Clean Position Y Collect Result_Fill2,Head Clean Position Z Collect Result_Fill2,Head Purge Position X Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,WorkMode Collect Result_Fill2
CURE END POSITION X Collect Result_Dam,1.0,1.0,1.0,,-1.0,,,,,,...,,,,,,,,,,
CURE END POSITION Z Collect Result_Dam,1.0,1.0,1.0,,-1.0,,,,,,...,,,,,,,,,,
CURE END POSITION Θ Collect Result_Dam,1.0,1.0,1.0,,-1.0,,,,,,...,,,,,,,,,,
CURE SPEED Collect Result_Dam,,,,1.0,,,,,,,...,,,,,,,,,,
CURE START POSITION X Collect Result_Dam,-1.0,-1.0,-1.0,,1.0,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Machine Tact time Collect Result_Fill2,,,,,,,,,,,...,,,,,,1.0,,,,
PalletID Collect Result_Fill2,,,,,,,,,,,...,,,,,,,1.0,,,
Production Qty Collect Result_Fill2,,,,,,,,,,,...,,,,,,,,1.0,,
Receip No Collect Result_Fill2,,,,,,,,,,,...,,,,,,,,,1.0,


In [13]:
strong_corrs

Unnamed: 0,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,CURE SPEED Collect Result_Dam,CURE START POSITION X Collect Result_Dam,DISCHARGED SPEED OF RESIN Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam,Dispense Volume(Stage1) Collect Result_Dam,...,Head Clean Position X Collect Result_Fill2,Head Clean Position Y Collect Result_Fill2,Head Clean Position Z Collect Result_Fill2,Head Purge Position X Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,WorkMode Collect Result_Fill2
CURE END POSITION X Collect Result_Dam,1.0,1.0,1.0,,-1.0,,,,,,...,,,,,,,,,,
CURE END POSITION Z Collect Result_Dam,1.0,1.0,1.0,,-1.0,,,,,,...,,,,,,,,,,
CURE END POSITION Θ Collect Result_Dam,1.0,1.0,1.0,,-1.0,,,,,,...,,,,,,,,,,
CURE SPEED Collect Result_Dam,,,,1.0,,,,,,,...,,,,,,,,,,
CURE START POSITION X Collect Result_Dam,-1.0,-1.0,-1.0,,1.0,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Machine Tact time Collect Result_Fill2,,,,,,,,,,,...,,,,,,1.0,,,,
PalletID Collect Result_Fill2,,,,,,,,,,,...,,,,,,,1.0,,,
Production Qty Collect Result_Fill2,,,,,,,,,,,...,,,,,,,,1.0,,
Receip No Collect Result_Fill2,,,,,,,,,,,...,,,,,,,,,1.0,


In [14]:
train_data[['CURE END POSITION X Collect Result_Dam', 'CURE END POSITION Z Collect Result_Dam']].drop_duplicates()

Unnamed: 0,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam
0,240.0,2.5
2,1000.0,12.5


In [15]:
# Build the list of column groups whose members are perfectly correlated.
col_list = strong_corrs.columns
corr_list = []
for col in col_list:
    # Rows that are non-null in this column are the members of its group.
    partner_mask = strong_corrs[col].notnull()
    corr_cols = strong_corrs.loc[partner_mask].index.tolist()

    # Skip columns correlated only with themselves and groups already recorded.
    if len(corr_cols) <= 1 or corr_cols in corr_list:
        continue
    print('서로 상관관계가 1인 집단 크기 :', len(corr_cols))
    corr_list.append(corr_cols)

서로 상관관계가 1인 집단 크기 : 4
서로 상관관계가 1인 집단 크기 : 18
서로 상관관계가 1인 집단 크기 : 2
서로 상관관계가 1인 집단 크기 : 2
서로 상관관계가 1인 집단 크기 : 2
서로 상관관계가 1인 집단 크기 : 2
서로 상관관계가 1인 집단 크기 : 3


In [16]:
corr_list

[['CURE END POSITION X Collect Result_Dam',
  'CURE END POSITION Z Collect Result_Dam',
  'CURE END POSITION Θ Collect Result_Dam',
  'CURE START POSITION X Collect Result_Dam'],
 ['HEAD Standby Position Y Collect Result_Dam',
  'HEAD Standby Position Z Collect Result_Dam',
  'Head Clean Position X Collect Result_Dam',
  'Head Clean Position Y Collect Result_Dam',
  'Head Zero Position X Collect Result_Dam',
  'HEAD Standby Position Y Collect Result_Fill1',
  'HEAD Standby Position Z Collect Result_Fill1',
  'Head Clean Position X Collect Result_Fill1',
  'Head Clean Position Y Collect Result_Fill1',
  'Head Clean Position Z Collect Result_Fill1',
  'Head Purge Position X Collect Result_Fill1',
  'HEAD Standby Position Y Collect Result_Fill2',
  'HEAD Standby Position Z Collect Result_Fill2',
  'Head Clean Position X Collect Result_Fill2',
  'Head Clean Position Y Collect Result_Fill2',
  'Head Clean Position Z Collect Result_Fill2',
  'Head Purge Position X Collect Result_Fill2',
  'H

In [17]:
# For each group, print the number of distinct row combinations ("before")
# next to the number of distinct row-wise sums ("after").
print('집단 별로 합하기 전의 고유값과 합한 후의 고유값 비교')
for group in corr_list:
    group_frame = train_data[group]
    n_before = len(group_frame.drop_duplicates())
    n_after = group_frame.sum(axis=1).nunique()
    print('전', n_before, '후', n_after)

집단 별로 합하기 전의 고유값과 합한 후의 고유값 비교
전 2 후 2
전 2 후 2
전 5 후 5
전 10 후 10
전 26 후 22
전 2 후 1
전 3 후 2


In [18]:
# Groups whose unique counts are the same before and after summing are simply
# replaced by their row-wise sum (the first four groups of corr_list).
for i in range(4):
    group = corr_list[i]
    sum_col = 'SUM ' + group[0]
    train_data[sum_col] = train_data[group].sum(axis=1)
    test_data[sum_col] = test_data[group].sum(axis=1)
    print(group)
    # The summed feature replaces the original columns in both frames.
    train_data.drop(columns=group, inplace=True)
    test_data.drop(columns=group, inplace=True)
train_data.shape, test_data.shape

['CURE END POSITION X Collect Result_Dam', 'CURE END POSITION Z Collect Result_Dam', 'CURE END POSITION Θ Collect Result_Dam', 'CURE START POSITION X Collect Result_Dam']
['HEAD Standby Position Y Collect Result_Dam', 'HEAD Standby Position Z Collect Result_Dam', 'Head Clean Position X Collect Result_Dam', 'Head Clean Position Y Collect Result_Dam', 'Head Zero Position X Collect Result_Dam', 'HEAD Standby Position Y Collect Result_Fill1', 'HEAD Standby Position Z Collect Result_Fill1', 'Head Clean Position X Collect Result_Fill1', 'Head Clean Position Y Collect Result_Fill1', 'Head Clean Position Z Collect Result_Fill1', 'Head Purge Position X Collect Result_Fill1', 'HEAD Standby Position Y Collect Result_Fill2', 'HEAD Standby Position Z Collect Result_Fill2', 'Head Clean Position X Collect Result_Fill2', 'Head Clean Position Y Collect Result_Fill2', 'Head Clean Position Z Collect Result_Fill2', 'Head Purge Position X Collect Result_Fill2', 'Head Purge Position Z Collect Result_Fill2']

((40506, 102), (17361, 103))

In [19]:
train_data[corr_list[4]].drop_duplicates()

Unnamed: 0,HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1,HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1
0,430.0,429.8
1,430.5,430.8
2,1323.5,1322.5
3,1322.5,1322.8
4,430.5,430.5
6,430.8,430.5
24,430.3,430.0
25,431.1,430.8
48,429.8,430.0
56,1323.0,1322.8


In [20]:
# Preview a candidate feature on the distinct value pairs of group 4:
# first column + percentage difference of the second column relative to the first.
temp = train_data[corr_list[4]].drop_duplicates()
temp['sum'] =  temp[corr_list[4][0]] + ((temp[corr_list[4][1]] - temp[corr_list[4][0]]) / temp[corr_list[4][0]] * 100)
temp['sum']

0         429.953488
1         430.569686
2        1323.424443
3        1322.522684
4         430.500000
6         430.730362
24        430.230281
25        431.030411
48        429.846533
56       1322.984883
66       1323.447110
71       1324.924528
111      1325.524562
121      1323.147098
127       430.523229
450      1323.200000
569       430.183802
605      1324.462250
621       429.730200
661       430.960821
703      1323.262216
844      1322.707560
1432     1325.184908
1450     1324.762258
2687     1322.852918
37520    1332.274790
Name: sum, dtype: float64

In [21]:
# Replace the two columns of group 4 by one feature: the first column plus the
# percentage difference of the second column relative to the first.
first_col = corr_list[4][0]
second_col = corr_list[4][1]
for frame in (train_data, test_data):
    frame['DELTA ' + first_col] = frame[first_col] + ((frame[second_col] - frame[first_col]) / frame[first_col] * 100)
for frame in (train_data, test_data):
    frame.drop(columns=corr_list[4], inplace=True)
train_data.shape, test_data.shape

((40506, 101), (17361, 102))

In [22]:
train_data[corr_list[5]].drop_duplicates()

Unnamed: 0,CURE END POSITION X Collect Result_Fill2,CURE START POSITION X Collect Result_Fill2
0,240,1020
569,1020,240


In [23]:
# Replace the two columns of group 5 by their difference (first minus second).
first_col = corr_list[5][0]
second_col = corr_list[5][1]
for frame in (train_data, test_data):
    frame['DELTA ' + first_col] = frame[first_col] - frame[second_col]
for frame in (train_data, test_data):
    frame.drop(columns=corr_list[5], inplace=True)
train_data.shape, test_data.shape

((40506, 100), (17361, 101))

In [24]:
train_data[corr_list[6]].drop_duplicates()

Unnamed: 0,HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill2,HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill2,HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill2
0,428.0,427.9,243.7
1,427.9,428.0,243.7
2,1324.2,1324.2,243.5


In [25]:
# Replace the three columns of group 6 by one feature:
# (second column - first column) + third column.
group = corr_list[6]
for frame in (train_data, test_data):
    frame['DELTA ' + group[0]] = (frame[group[1]] - frame[group[0]] + frame[group[2]])
for frame in (train_data, test_data):
    frame.drop(columns=group, inplace=True)
train_data.shape, test_data.shape

((40506, 98), (17361, 99))

#### 상관관계 범위를 1, -1이 아닌 0.99, -0.99로 변경 시 확인

In [26]:
# Recompute the numeric columns and their correlation matrix, since columns
# were merged and dropped above.
num_cols = train_data.select_dtypes(include=["int64", "float64"]).columns
corr_matrix = train_data[num_cols].corr()

# Keep only entries with |corr| >= 0.99 (a looser threshold than the earlier
# 0.99999); every other cell becomes NaN.
strong_corrs = corr_matrix[(corr_matrix >= 0.99)  | (corr_matrix <= -0.99) ]
strong_corrs

Unnamed: 0,CURE SPEED Collect Result_Dam,DISCHARGED SPEED OF RESIN Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam,Dispense Volume(Stage1) Collect Result_Dam,Dispense Volume(Stage2) Collect Result_Dam,Dispense Volume(Stage3) Collect Result_Dam,HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam,HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam,...,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,WorkMode Collect Result_Fill2,SUM CURE END POSITION X Collect Result_Dam,SUM HEAD Standby Position Y Collect Result_Dam,SUM Head Clean Position Z Collect Result_Dam,SUM Stage2 Circle2 Distance Speed Collect Result_Dam,DELTA HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1,DELTA CURE END POSITION X Collect Result_Fill2,DELTA HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill2
CURE SPEED Collect Result_Dam,1.0,,,,,,,,,,...,,,,,,,,,,
DISCHARGED SPEED OF RESIN Collect Result_Dam,,1.0,,,,,,,,,...,,,,,,,,,,
DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam,,,1.000000,,0.999476,,,,,,...,,,,,,,,,,
DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam,,,,1.0,,,,,,,...,,,,,,,,,,
DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam,,,0.999476,,1.000000,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SUM Head Clean Position Z Collect Result_Dam,,,,,,,,,,,...,,,,,0.999443,1.0,,,,
SUM Stage2 Circle2 Distance Speed Collect Result_Dam,,,,,,,,,,,...,,,,,,,1.0,,,
DELTA HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1,,,,,,,,,,,...,,,,0.998797,,,,1.0,,
DELTA CURE END POSITION X Collect Result_Fill2,,,,,,,,,,,...,,,,,,,,,1.0,


In [27]:
# Correlation matrix of the numeric columns of train_data.
corr_matrix = train_data[num_cols].corr()

# Keep entries with |corr| >= 0.99 and zero out everything else, so that
# "non-zero" below means "strongly correlated".
high_corr_pairs = corr_matrix[(corr_matrix.abs() >= 0.99)].fillna(0)

# Walk the strict lower triangle so each pair is listed once and the diagonal
# is skipped.
correlated_pairs = [
    (high_corr_pairs.index[i], high_corr_pairs.columns[j], high_corr_pairs.iloc[i, j])
    for i in range(len(high_corr_pairs.columns))
    for j in range(i)
    if high_corr_pairs.iloc[i, j] != 0
]

# Print every strongly correlated pair with its coefficient.
for row_name, col_name, corr_value in correlated_pairs:
    print(f'{row_name}와 {col_name}의 상관관계: {corr_value}')

DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam와 DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam의 상관관계: 0.9994755091064483
Dispense Volume(Stage3) Collect Result_Dam와 Dispense Volume(Stage1) Collect Result_Dam의 상관관계: 0.9993794627149966
HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam와 HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam의 상관관계: 0.997222491026016
Head Zero Position Y Collect Result_Dam와 Head Purge Position X Collect Result_Dam의 상관관계: -0.9991435701922116
Head Zero Position Z Collect Result_Dam와 Head Purge Position X Collect Result_Dam의 상관관계: -0.9991490400388723
Head Zero Position Z Collect Result_Dam와 Head Zero Position Y Collect Result_Dam의 상관관계: 0.9999168250383242
Machine Tact time Collect Result_Dam와 Head Purge Position X Collect Result_Dam의 상관관계: -0.9914490717908745
Machine Tact time Collect Result_Dam와 Head Zero Position Y Collect Result_Dam의 상관관계: 0.9928418093572455
Machine Tact time Collect Result_Dam와 Head Zero Position Z Collect Result_Dam의 상관

In [28]:
# Count, per column, how many columns (itself included) pass the correlation
# threshold, then keep the columns that have at least one partner.
group_sizes = strong_corrs.notnull().sum()
temp = pd.DataFrame({'group_size': group_sizes})
has_partner = temp['group_size'] > 1
temp = temp[has_partner]
print(temp['group_size'].unique())
temp


[ 2  4  3 10  8  9]


Unnamed: 0,group_size
DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam,2
DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam,2
Dispense Volume(Stage1) Collect Result_Dam,2
Dispense Volume(Stage3) Collect Result_Dam,2
HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam,4
HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam,4
HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam,3
Head Purge Position X Collect Result_Dam,10
Head Zero Position Y Collect Result_Dam,10
Head Zero Position Z Collect Result_Dam,10


In [29]:
# Rebuild the list of correlated column groups for the 0.99 threshold.
col_list = strong_corrs.columns
corr_list = []
for col in col_list:
    # Rows that are non-null in this column are the members of its group.
    partner_mask = strong_corrs[col].notnull()
    corr_cols = strong_corrs.loc[partner_mask].index.tolist()

    # Skip columns correlated only with themselves and groups already recorded.
    if len(corr_cols) <= 1 or corr_cols in corr_list:
        continue
    print('서로 상관관계가 0.99이상인 집단 크기 :', len(corr_cols))
    corr_list.append(corr_cols)

서로 상관관계가 0.99이상인 집단 크기 : 2
서로 상관관계가 0.99이상인 집단 크기 : 2
서로 상관관계가 0.99이상인 집단 크기 : 4
서로 상관관계가 0.99이상인 집단 크기 : 3
서로 상관관계가 0.99이상인 집단 크기 : 10
서로 상관관계가 0.99이상인 집단 크기 : 8
서로 상관관계가 0.99이상인 집단 크기 : 3
서로 상관관계가 0.99이상인 집단 크기 : 3
서로 상관관계가 0.99이상인 집단 크기 : 8
서로 상관관계가 0.99이상인 집단 크기 : 3
서로 상관관계가 0.99이상인 집단 크기 : 9


In [30]:
# For each group, print the number of distinct row combinations ("before")
# next to the number of distinct row-wise sums ("after"); when both match,
# also print the group's position in corr_list.
print('집단 별로 합하기 전의 고유값과 합한 후의 고유값 비교')
for i, group in enumerate(corr_list):
    group_frame = train_data[group]
    n_before = len(group_frame.drop_duplicates())
    n_after = group_frame.sum(axis=1).nunique()
    print('전', n_before, '후', n_after)
    if n_before == n_after:
        print(i)

집단 별로 합하기 전의 고유값과 합한 후의 고유값 비교
전 46 후 33
전 49 후 36
전 49 후 49
2
전 46 후 43
전 388 후 375
전 327 후 321
전 572 후 535
전 660 후 642
전 13 후 12
전 14 후 14
9
전 76 후 75


In [31]:
train_data.shape, test_data.shape

((40506, 98), (17361, 99))

In [32]:
# Groups whose unique counts are the same before and after summing (positions
# 2 and 9 in corr_list) are simply replaced by their row-wise sum.
for i in [2, 9]:
    group = corr_list[i]
    sum_col = 'SUM ' + group[0]
    train_data[sum_col] = train_data[group].sum(axis=1)
    test_data[sum_col] = test_data[group].sum(axis=1)
    print(group)
    # The summed feature replaces the original columns in both frames.
    train_data.drop(columns=group, inplace=True)
    test_data.drop(columns=group, inplace=True)
train_data.shape, test_data.shape

['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam', 'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam', 'SUM CURE END POSITION X Collect Result_Dam', 'DELTA HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1']
['Stage2 Line2 Distance Speed Collect Result_Dam', 'Stage2 Line4 Distance Speed Collect Result_Dam', 'SUM Stage2 Circle2 Distance Speed Collect Result_Dam']


((40506, 93), (17361, 94))

In [33]:
# On the distinct rows of group 0, count how many distinct values the
# "first column + percentage difference of the second column" feature takes.
temp = train_data[corr_list[0]].drop_duplicates()
print(len(temp))
# Same formula as used for the earlier DELTA feature, without the redundant
# outer parentheses.
temp['sum'] =  temp[corr_list[0][0]] + (temp[corr_list[0][1]] - temp[corr_list[0][0]]) / temp[corr_list[0][0]] * 100
temp['sum'].nunique()

46


46

In [34]:
corr_list[1]

['Dispense Volume(Stage1) Collect Result_Dam',
 'Dispense Volume(Stage3) Collect Result_Dam']

In [35]:
# On the distinct rows of group 1, count how many distinct values the
# "first column + percentage difference of the second column" feature takes.
temp = train_data[corr_list[1]].drop_duplicates()
print(len(temp))
# The right-hand side is computed on all of train_data; the assignment aligns
# on the index, so temp only receives the values for its own rows.
temp['sum'] =  train_data[corr_list[1][0]] + (train_data[corr_list[1][1]] - train_data[corr_list[1][0]]) / train_data[corr_list[1][0]] * 100
temp['sum'].nunique()

49


49

In [36]:
# Replace the two columns of group 1 by one feature: the first column plus the
# percentage difference of the second column relative to the first.
first_col = corr_list[1][0]
second_col = corr_list[1][1]
for frame in (train_data, test_data):
    frame['DELTA ' + first_col] = frame[first_col] + (frame[second_col] - frame[first_col]) / frame[first_col] * 100
for frame in (train_data, test_data):
    frame.drop(columns=corr_list[1], inplace=True)
train_data.shape, test_data.shape

((40506, 92), (17361, 93))

In [37]:
corr_list[8]

['Stage1 Circle2 Distance Speed Collect Result_Dam',
 'Stage1 Line1 Distance Speed Collect Result_Dam',
 'Stage1 Line2 Distance Speed Collect Result_Dam',
 'Stage1 Line4 Distance Speed Collect Result_Dam',
 'Stage3 Circle2 Distance Speed Collect Result_Dam',
 'Stage3 Line1 Distance Speed Collect Result_Dam',
 'Stage3 Line2 Distance Speed Collect Result_Dam',
 'Stage3 Line4 Distance Speed Collect Result_Dam']

In [38]:
# On the distinct rows of group 8, count how many distinct values the
# "row sum of the group + its third column once more" feature takes.
temp = train_data[corr_list[8]].drop_duplicates()
print(len(temp))
# The right-hand side is computed on all of train_data; the assignment aligns
# on the index, so temp only receives the values for its own rows.
temp['sum'] = train_data[corr_list[8]].sum(axis=1) + train_data[corr_list[8][2]]
temp['sum'].nunique()

13


13

In [39]:
# Replace the columns of group 8 by one feature: the row-wise sum of the group
# with its third column added a second time.
group = corr_list[8]
for frame in (train_data, test_data):
    frame['DELTA ' + group[0]] = frame[group].sum(axis=1) + frame[group[2]]
for frame in (train_data, test_data):
    frame.drop(columns=group, inplace=True)
train_data.shape, test_data.shape

((40506, 85), (17361, 86))

In [40]:
corr_list[10]

['Head Purge Position X Collect Result_Dam',
 'Head Zero Position Y Collect Result_Dam',
 'Head Zero Position Z Collect Result_Dam',
 'WorkMode Collect Result_Dam',
 'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1',
 'HEAD Standby Position X Collect Result_Fill1',
 'HEAD Standby Position X Collect Result_Fill2',
 'SUM HEAD Standby Position Y Collect Result_Dam',
 'SUM Head Clean Position Z Collect Result_Dam']

In [41]:
# On the distinct rows of group 10, count how many distinct values the
# candidate feature takes: mean of members 5 and 6, plus the sum of the
# remaining seven members divided by 7.
temp = train_data[corr_list[10]].drop_duplicates()
print(len(temp))
# The right-hand side is computed on all of train_data; the assignment aligns
# on the index, so temp only receives the values for its own rows.
temp['sum'] = (train_data[corr_list[10][5]] + train_data[corr_list[10][6]])/2 + (train_data[corr_list[10][0:5]].sum(axis=1)+ train_data[corr_list[10][7:]].sum(axis=1)) / 7
temp['sum'].nunique()

76


76

In [42]:
# Replace the columns of group 10 by one feature: the mean of members 5 and 6
# plus the sum of the remaining members (0-4 and 7 onwards) divided by 7.
group = corr_list[10]
for frame in (train_data, test_data):
    frame['DELTA ' + group[0]] = (frame[group[5]] + frame[group[6]])/2 + (frame[group[0:5]].sum(axis=1)+ frame[group[7:]].sum(axis=1)) / 7
for frame in (train_data, test_data):
    frame.drop(columns=group, inplace=True)
train_data.shape, test_data.shape

((40506, 77), (17361, 78))

In [43]:
pd.DataFrame(train_data.isnull().sum()).sort_values(by=0, ascending=False)

Unnamed: 0,0
HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2,12766
HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1,12766
HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam,12766
Head Purge Position Z Collect Result_Fill1,0
CURE END POSITION Z Collect Result_Fill2,0
...,...
THICKNESS 3 Collect Result_Dam,0
THICKNESS 2 Collect Result_Dam,0
THICKNESS 1 Collect Result_Dam,0
Stage3 Circle1 Distance Speed Collect Result_Dam,0


### 범주형 변수 확인

In [44]:
# Build a per-column summary of train_data: dtype, number of distinct values,
# the distinct values themselves (as a string) and the number of missing values.
train_data_types = pd.DataFrame(train_data.dtypes, columns=["type"])
train_data_types['nunique'] = train_data.nunique()
train_data_types['unique'] = train_data.apply(lambda x: str(x.unique()), axis=0)
train_data_types['null_num'] = train_data.isnull().sum()
# Notebook display: how many columns there are of each dtype.
train_data_types['type'].value_counts()

type
float64    41
int64      26
object     10
Name: count, dtype: int64

In [45]:
train_data_types[train_data_types['type'] == 'object']

Unnamed: 0,type,nunique,unique,null_num
Equipment_Dam,object,2,['Dam dispenser #1' 'Dam dispenser #2'],0
Model.Suffix_Dam,object,7,['AJX75334505' 'AJX75334501' 'AJX75334502' 'AJ...,0
Workorder_Dam,object,663,['4F1XA938-1' '3KPM0016-2' '4E1X9167-1' '3K1X0...,0
HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam,object,7,[nan '550.3' 'OK' '162.4' '549' '549.5' '550' ...,12766
Chamber Temp. Judge Value_AutoClave,object,2,['OK' 'NG'],0
Equipment_Fill1,object,2,['Fill1 dispenser #1' 'Fill1 dispenser #2'],0
HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1,object,6,[nan '838.4' 'OK' '837.7' '837.9' '838.2' '837...,12766
Equipment_Fill2,object,2,['Fill2 dispenser #1' 'Fill2 dispenser #2'],0
HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2,object,3,[nan '835.5' 'OK' '305'],12766
target,object,2,['Normal' 'AbNormal'],0


In [46]:
train_data[['Equipment_Dam', 'Equipment_Fill1','Equipment_Fill2']].value_counts()

Equipment_Dam     Equipment_Fill1     Equipment_Fill2   
Dam dispenser #1  Fill1 dispenser #1  Fill2 dispenser #1    25011
Dam dispenser #2  Fill1 dispenser #2  Fill2 dispenser #2    15461
Dam dispenser #1  Fill1 dispenser #2  Fill2 dispenser #2       13
Dam dispenser #2  Fill1 dispenser #1  Fill2 dispenser #1       10
Dam dispenser #1  Fill1 dispenser #1  Fill2 dispenser #2        6
Dam dispenser #2  Fill1 dispenser #2  Fill2 dispenser #1        5
Name: count, dtype: int64

#### 범주형 변수 인코딩
- 라벨
    - Equipment_Dam, Equipment_Fill1, Equipment_Fill2, Chamber Temp. Judge Value_AutoClave
- 원핫
    - Model.Suffix_Dam , Workorder_Dam

In [47]:
le_cols = ['Equipment_Dam', 'Equipment_Fill1', 'Equipment_Fill2', 'Chamber Temp. Judge Value_AutoClave']
ohe_cols = ['Model.Suffix_Dam','Workorder_Dam']

##### 라벨인코딩

In [48]:
from sklearn.preprocessing import LabelEncoder
import numpy as np

def encode_with_unseen(x, le):
    """Return the integer code of label ``x`` under the fitted encoder ``le``.

    A label that is not among ``le.classes_`` is mapped to one extra code,
    ``len(le.classes_)``, instead of being passed to ``le.transform``.
    """
    if x not in le.classes_:
        return len(le.classes_)
    return le.transform([x])[0]

for col in le_cols:
    # A separate encoder per column, fitted on the training values only.
    le = LabelEncoder()
    train_codes = le.fit_transform(train_data[col])
    train_data[col] = train_codes

    # Encode the test column value by value so that labels the encoder has
    # not seen fall back to the extra code from encode_with_unseen.
    test_data[col] = test_data[col].apply(encode_with_unseen, args=(le,))

train_data.shape, test_data.shape

((40506, 77), (17361, 78))

In [49]:
# Print the distinct encoded values of every label-encoded column.
for col in le_cols:
    encoded_values = train_data[col].unique()
    print(f"{col}: {encoded_values}")

Equipment_Dam: [0 1]
Equipment_Fill1: [0 1]
Equipment_Fill2: [0 1]
Chamber Temp. Judge Value_AutoClave: [1 0]


##### 원핫인코딩

In [52]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# One-hot encoder; with handle_unknown='ignore' a category that was not
# present at fit time is encoded as an all-zero row instead of raising.
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

for col in ohe_cols:
    # Learn the categories from the training column only (the encoder needs
    # 2-D input, hence the double brackets) and encode both frames with them.
    train_encoded = ohe.fit_transform(train_data[[col]])
    test_encoded = ohe.transform(test_data[[col]])
    encoded_names = ohe.get_feature_names_out([col])

    # Give the encoded frames the index of the frame they came from, so the
    # column-wise concat below pairs rows correctly even when that index is
    # not a default RangeIndex.
    train_encoded_df = pd.DataFrame(train_encoded, columns=encoded_names, index=train_data.index)
    test_encoded_df = pd.DataFrame(test_encoded, columns=encoded_names, index=test_data.index)

    # Replace the original categorical column by its indicator columns.
    train_data = pd.concat([train_data.drop(columns=[col]), train_encoded_df], axis=1)
    test_data = pd.concat([test_data.drop(columns=[col]), test_encoded_df], axis=1)

train_data.shape, test_data.shape

((40506, 745), (17361, 746))

#### target label 변경

In [53]:
# Recode the string labels as integers: 'Normal' -> 0, 'AbNormal' -> 1.
label_codes = {'Normal': 0, 'AbNormal': 1}
for label, code in label_codes.items():
    train_data.loc[train_data['target'] == label, 'target'] = code
train_data['target'] = train_data['target'].astype(int)
train_data['target']

0        0
1        0
2        0
3        0
4        0
        ..
40501    0
40502    0
40503    0
40504    0
40505    1
Name: target, Length: 40506, dtype: int32

#### OK가 오염된 열 확인 및 처리
['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam',

'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1',
       
'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2']

아예 제거 or 다른 컬럼과의 관계를 보고 대체


##### 다른 컬럼과의 관계를 보고 대체

In [54]:
# X-axis coordinate columns that contain the stray string 'OK'.
ok_leakage_cols = [
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2',
]

# Turn every 'OK' entry into a missing value, in both frames.
for frame in (train_data, test_data):
    for col in ok_leakage_cols:
        frame[col] = frame[col].replace('OK', np.nan)

In [55]:
# Cast the three X-axis coordinate columns to float in both frames.
head_x_cols = (
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2',
)
for frame in (train_data, test_data):
    for col in head_x_cols:
        frame[col] = frame[col].astype(float)

In [56]:
# Distinct values (including NaN) of the Dam X coordinate in the training data.
train_data['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'].unique()

array([  nan, 550.3, 162.4, 549. , 549.5, 550. , 548.5])

In [57]:
import numpy as np

# Dam-stage head-coordinate columns: missing X values are imputed from Y and Z.
dam_x_col = 'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'
dam_y_col = 'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam'
dam_z_col = 'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam'

# Training rows in which the X coordinate is present.
not_null_data = train_data[train_data[dam_x_col].notnull()]

# Mean of (X + Y + Z) over those rows.
mean_value = (not_null_data[dam_x_col] +
              not_null_data[dam_y_col] +
              not_null_data[dam_z_col]).mean()

# Candidate fill values: the distinct X values observed in the training data,
# in order of first appearance (taken from the data instead of hard-coded).
unique_values = not_null_data[dam_x_col].unique()


def replace_with_closest_value(row, mean_value, unique_values,
                               x_col=dam_x_col, y_col=dam_y_col, z_col=dam_z_col):
    """Return the X value for ``row``, imputing it when it is NaN.

    A present X value is returned unchanged. A missing one is estimated as
    ``mean_value - Y - Z`` and snapped to the entry of ``unique_values``
    closest to that estimate (the first entry wins ties).
    """
    if not np.isnan(row[x_col]):
        return row[x_col]
    calculated_value = mean_value - row[y_col] - row[z_col]
    return unique_values[np.abs(unique_values - calculated_value).argmin()]


# Impute train and test with the statistics computed on the training data.
# Only rows with a missing X and only the three coordinate columns are passed
# to the row-wise apply, which avoids building a full-width row per record.
for frame in (train_data, test_data):
    missing_x = frame[dam_x_col].isna()
    if missing_x.any():
        filled = frame.loc[missing_x, [dam_x_col, dam_y_col, dam_z_col]].apply(
            lambda row: replace_with_closest_value(row, mean_value, unique_values), axis=1
        )
        # Assign positionally; the boolean mask preserves row order.
        frame.loc[missing_x, dam_x_col] = filled.to_numpy()


In [58]:
# Distinct values (including NaN) of the Fill1 X coordinate in the training data.
train_data['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'].unique()

array([  nan, 838.4, 837.7, 837.9, 838.2, 837.5])

In [59]:
import numpy as np

# Fill1-stage head-coordinate columns: missing X values are imputed from Y and Z.
fill1_x_col = 'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'
fill1_y_col = 'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1'
fill1_z_col = 'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill1'

# Training rows in which the Fill1 X coordinate is present.
not_null_data = train_data[train_data[fill1_x_col].notnull()]

# Mean of (X + Y + Z) over those rows.
mean_value = (not_null_data[fill1_x_col] +
              not_null_data[fill1_y_col] +
              not_null_data[fill1_z_col]).mean()

# Candidate fill values: the distinct X values observed in the training data,
# in order of first appearance (taken from the data instead of hard-coded).
unique_values = not_null_data[fill1_x_col].unique()


def replace_with_closest_value(row, mean_value, unique_values,
                               x_col=fill1_x_col, y_col=fill1_y_col, z_col=fill1_z_col):
    """Return the X value for ``row``, imputing it when it is NaN.

    A present X value is returned unchanged. A missing one is estimated as
    ``mean_value - Y - Z`` and snapped to the entry of ``unique_values``
    closest to that estimate (the first entry wins ties).
    """
    if not np.isnan(row[x_col]):
        return row[x_col]
    calculated_value = mean_value - row[y_col] - row[z_col]
    return unique_values[np.abs(unique_values - calculated_value).argmin()]


# Impute train and test with the statistics computed on the training data.
# Only rows with a missing X and only the three coordinate columns are passed
# to the row-wise apply, which avoids building a full-width row per record.
for frame in (train_data, test_data):
    missing_x = frame[fill1_x_col].isna()
    if missing_x.any():
        filled = frame.loc[missing_x, [fill1_x_col, fill1_y_col, fill1_z_col]].apply(
            lambda row: replace_with_closest_value(row, mean_value, unique_values), axis=1
        )
        # Assign positionally; the boolean mask preserves row order.
        frame.loc[missing_x, fill1_x_col] = filled.to_numpy()


In [60]:
# Distinct values (including NaN) of the Fill2 X coordinate in the training data.
train_data['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2'].unique()

array([  nan, 835.5, 305. ])

In [61]:
import numpy as np

# Fill2-stage head-coordinate columns: missing X values are imputed from Y and Z.
fill2_x_col = 'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2'
fill2_y_col = 'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill2'
fill2_z_col = 'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill2'

# Training rows in which the Fill2 X coordinate is present.
not_null_data = train_data[train_data[fill2_x_col].notnull()]

# Mean of (X + Y + Z) over those rows.
mean_value = (not_null_data[fill2_x_col] +
              not_null_data[fill2_y_col] +
              not_null_data[fill2_z_col]).mean()

# Candidate fill values: the distinct X values observed in the training data,
# in order of first appearance (taken from the data instead of hard-coded).
unique_values = not_null_data[fill2_x_col].unique()


def replace_with_closest_value(row, mean_value, unique_values,
                               x_col=fill2_x_col, y_col=fill2_y_col, z_col=fill2_z_col):
    """Return the X value for ``row``, imputing it when it is NaN.

    A present X value is returned unchanged. A missing one is estimated as
    ``mean_value - Y - Z`` and snapped to the entry of ``unique_values``
    closest to that estimate (the first entry wins ties).
    """
    if not np.isnan(row[x_col]):
        return row[x_col]
    calculated_value = mean_value - row[y_col] - row[z_col]
    return unique_values[np.abs(unique_values - calculated_value).argmin()]


# Impute train and test with the statistics computed on the training data.
# Only rows with a missing X and only the three coordinate columns are passed
# to the row-wise apply, which avoids building a full-width row per record.
for frame in (train_data, test_data):
    missing_x = frame[fill2_x_col].isna()
    if missing_x.any():
        filled = frame.loc[missing_x, [fill2_x_col, fill2_y_col, fill2_z_col]].apply(
            lambda row: replace_with_closest_value(row, mean_value, unique_values), axis=1
        )
        # Assign positionally; the boolean mask preserves row order.
        frame.loc[missing_x, fill2_x_col] = filled.to_numpy()


In [62]:
# Null counts per column for both frames; only columns with at least one
# missing value are printed.
train_missing = train_data.isna().sum()
test_missing = test_data.isna().sum()

print("Missing values in train data:\n", train_missing.loc[train_missing > 0])
print("\nMissing values in test data:\n", test_missing.loc[test_missing > 0])

Missing values in train data:
 Series([], dtype: int64)

Missing values in test data:
 target    17361
dtype: int64


##### 범주형 변수 잘 처리됐는지 확인

In [63]:
# Per-column summary of train_data: dtype, number of distinct values, the
# distinct values rendered as text, and the null count.
train_data_types = train_data.dtypes.to_frame(name="type")
train_data_types['nunique'] = train_data.nunique()
train_data_types['unique'] = train_data.apply(lambda column: str(column.unique()), axis=0)
train_data_types['null_num'] = train_data.isna().sum()
train_data_types['type'].value_counts()

type
float64    714
int64       26
int32        5
Name: count, dtype: int64

In [64]:
# Show any columns that still have dtype `object` after the encoding steps.
train_data_types[train_data_types['type'] == 'object']

Unnamed: 0,type,nunique,unique,null_num


In [66]:
# (rows, columns) of the training frame at this point.
train_data.shape

(40506, 745)

## 모델 구축

### k fold 적용


In [64]:
# stratify k fold 적용
from sklearn.model_selection import StratifiedKFold
from collections import Counter
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.combine import SMOTEENN, SMOTETomek
from collections import Counter

#### 언더 샘플링 비율 실험

In [65]:
# IPython shell escape: install the catboost package into the notebook environment.
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.5


In [66]:
import xgboost as xgb
import lightgbm as lgb
import catboost as cat

# Baseline classifiers, all seeded with RANDOM_STATE.
# NOTE: the `xgb` and `cat` assignments rebind the module aliases imported
# above to classifier instances, so any later `xgb.` / `cat.` module access
# needs a fresh import first.
rf = RandomForestClassifier(random_state=RANDOM_STATE)
xgb = xgb.XGBClassifier(random_state=RANDOM_STATE)
lgbm = lgb.LGBMClassifier(random_state=RANDOM_STATE, verbose=-1)
cat = cat.CatBoostClassifier(verbose=0, random_state=RANDOM_STATE)

In [67]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Every column of train_data except the target is used as a feature.
X = train_data.drop(columns=["target"])
y = train_data["target"]

# One record per (ratio, fold, model). The records are turned into a
# DataFrame once at the end so the metric columns get numeric dtypes.
under_ratio_records = []

for ratio in range(1, 6):
    # Number of normal rows kept per abnormal row in each training fold.
    normal_ratio = ratio

    for i, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        # Split into the training and validation part of this fold.
        print(f"Fold {i + 1}")
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        print(Counter(y_train), Counter(y_val))

        train = pd.concat([X_train, y_train], axis=1)

        df_normal = train[train["target"] == 0]
        df_abnormal = train[train["target"] == 1]
        print(f"Total: Normal: {len(df_normal)}, AbNormal: {len(df_abnormal)}")

        # Undersample the normal class of the training fold only; the
        # validation fold keeps its original class distribution.
        df_normal = df_normal.sample(n=int(len(df_abnormal) * normal_ratio), replace=False, random_state=RANDOM_STATE)
        df_concat = pd.concat([df_normal, df_abnormal], axis=0).reset_index(drop=True)
        X_under, y_under = df_concat.drop(columns=["target"]), df_concat["target"]

        print("언더샘플링 후 train의 클래스 비율:", Counter(y_under))

        print('모델 학습 진행')
        for model in [rf, xgb, lgbm, cat]:
            print(model.__class__.__name__)
            model.fit(X_under, y_under)
            y_pred = model.predict(X_val)
            under_ratio_records.append({
                "ratio": ratio,
                "fold": i + 1,
                "model": model.__class__.__name__,
                "f1": f1_score(y_val, y_pred),
                "precision": precision_score(y_val, y_pred),
                "recall": recall_score(y_val, y_pred),
                "accuracy": accuracy_score(y_val, y_pred),
            })

under_ratio_df = pd.DataFrame(
    under_ratio_records,
    columns=["ratio", 'fold', "model", "f1", "precision", "recall", "accuracy"],
)

Fold 1
Counter({0: 30524, 1: 1880}) Counter({0: 7632, 1: 470})
Total: Normal: 30524, AbNormal: 1880
언더샘플링 후 train의 클래스 비율: Counter({0: 1880, 1: 1880})
모델 학습 진행
RandomForestClassifier
XGBClassifier
LGBMClassifier
CatBoostClassifier
Fold 2
Counter({0: 30525, 1: 1880}) Counter({0: 7631, 1: 470})
Total: Normal: 30525, AbNormal: 1880
언더샘플링 후 train의 클래스 비율: Counter({0: 1880, 1: 1880})
모델 학습 진행
RandomForestClassifier
XGBClassifier
LGBMClassifier
CatBoostClassifier
Fold 3
Counter({0: 30525, 1: 1880}) Counter({0: 7631, 1: 470})
Total: Normal: 30525, AbNormal: 1880
언더샘플링 후 train의 클래스 비율: Counter({0: 1880, 1: 1880})
모델 학습 진행
RandomForestClassifier
XGBClassifier
LGBMClassifier
CatBoostClassifier
Fold 4
Counter({0: 30525, 1: 1880}) Counter({0: 7631, 1: 470})
Total: Normal: 30525, AbNormal: 1880
언더샘플링 후 train의 클래스 비율: Counter({0: 1880, 1: 1880})
모델 학습 진행
RandomForestClassifier
XGBClassifier
LGBMClassifier
CatBoostClassifier
Fold 5
Counter({0: 30525, 1: 1880}) Counter({0: 7631, 1: 470})
Total: Normal

In [68]:
# Average each metric over all folds and models, per undersampling ratio.
under_ratio_df.groupby(["ratio"])[['f1', 'precision', 'recall', 'accuracy']].mean()

Unnamed: 0_level_0,f1,precision,recall,accuracy
ratio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.15456,0.089029,0.58617,0.627555
2,0.186116,0.146615,0.265213,0.865026
3,0.183049,0.236403,0.156489,0.919456
4,0.168167,0.299335,0.118511,0.932134
5,0.151507,0.396914,0.095,0.93854


In [73]:
# Average the metrics over the folds for every (ratio, model) pair.
under_ratio_df.groupby(["ratio", "model"]).mean()[['f1', 'precision', 'recall', 'accuracy']]

Unnamed: 0_level_0,Unnamed: 1_level_0,f1,precision,recall,accuracy
ratio,model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,CatBoostClassifier,0.162196,0.093727,0.602553,0.6384
1,LGBMClassifier,0.147335,0.08452,0.574043,0.614502
1,RandomForestClassifier,0.155846,0.089821,0.588511,0.63008
1,XGBClassifier,0.152864,0.088047,0.579574,0.627241
2,CatBoostClassifier,0.192071,0.177,0.211489,0.896707
2,LGBMClassifier,0.176224,0.132206,0.264255,0.856713
2,RandomForestClassifier,0.191207,0.141214,0.29617,0.854565
2,XGBClassifier,0.184963,0.136042,0.288936,0.852121
3,CatBoostClassifier,0.176273,0.321398,0.121702,0.934059
3,LGBMClassifier,0.168089,0.206262,0.142128,0.918308


## 여러 샘플링 기법 적용 및 모델 학습

In [69]:
# stratify k fold 적용
from sklearn.model_selection import StratifiedKFold
from collections import Counter
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.combine import SMOTEENN, SMOTETomek
from collections import Counter

In [72]:
import xgboost as xgb
import lightgbm as lgb
import catboost as cat

# Fresh baseline classifiers for the sampling comparison, all seeded with RANDOM_STATE.
# NOTE: the `xgb` and `cat` assignments rebind the module aliases imported
# above to classifier instances, so any later `xgb.` / `cat.` module access
# needs a fresh import first.
rf = RandomForestClassifier(random_state=RANDOM_STATE)
xgb = xgb.XGBClassifier(random_state=RANDOM_STATE)
lgbm = lgb.LGBMClassifier(random_state=RANDOM_STATE, verbose = -1)
cat = cat.CatBoostClassifier(verbose=0, random_state=RANDOM_STATE)

In [74]:
def _undersample_normal(train, normal_ratio):
    """Undersample the normal class (target == 0) of ``train``.

    Keeps every abnormal row (target == 1) and draws, without replacement,
    ``normal_ratio`` times as many normal rows. Prints the class sizes
    before and after, and returns ``(features, target)`` of the reduced frame.
    """
    df_normal = train[train["target"] == 0]
    df_abnormal = train[train["target"] == 1]
    print(f"Total: Normal: {len(df_normal)}, AbNormal: {len(df_abnormal)}")

    df_normal = df_normal.sample(n=int(len(df_abnormal) * normal_ratio), replace=False, random_state=RANDOM_STATE)
    df_concat = pd.concat([df_normal, df_abnormal], axis=0).reset_index(drop=True)
    X_under, y_under = df_concat.drop(columns=["target"]), df_concat["target"]

    print("언더샘플링 후 train의 클래스 비율:", Counter(y_under))
    return X_under, y_under


skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Every column of train_data except the target is used as a feature.
X = train_data.drop(columns=["target"])
y = train_data["target"]

# One record per (fold, sampling, model); assembled into result_df after the
# loop so the metric columns get numeric dtypes.
result_records = []

# 5-fold stratified cross-validation over the training data.
for i, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"Fold {i + 1}")
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    print(Counter(y_train), Counter(y_val))

    train = pd.concat([X_train, y_train], axis=1)

    # Resampling is applied to the training fold only.
    X_basic, y_basic = X_train, y_train
    print('샘플링 진행')
    ros = RandomOverSampler(random_state=RANDOM_STATE)
    X_ros, y_ros = ros.fit_resample(X_train, y_train)
    print("Random Oversampling 클래스 비율:", Counter(y_ros))

    X_under_2, y_under_2 = _undersample_normal(train, 2.0)
    X_under_3, y_under_3 = _undersample_normal(train, 3.0)

    # Explicit name -> (features, target) mapping instead of looking the
    # variables up through globals().
    sampled_sets = {
        "basic": (X_basic, y_basic),
        "ros": (X_ros, y_ros),
        "under_2": (X_under_2, y_under_2),
        "under_3": (X_under_3, y_under_3),
    }

    for sample_tech, (train_x, train_y) in tqdm(sampled_sets.items()):
        print(sample_tech, '데이터 학습 진행')
        for model in [rf, xgb, lgbm, cat]:
            print(model.__class__.__name__)
            model.fit(train_x, train_y)
            y_pred = model.predict(X_val)
            result_records.append({
                "fold": i + 1,
                "sampling": sample_tech,
                "model": model.__class__.__name__,
                "f1": f1_score(y_val, y_pred),
                "precision": precision_score(y_val, y_pred),
                "recall": recall_score(y_val, y_pred),
                "accuracy": accuracy_score(y_val, y_pred),
            })

result_df = pd.DataFrame(
    result_records,
    columns=["fold", "sampling", "model", "f1", "precision", "recall", "accuracy"],
)

Fold 1
Counter({0: 30524, 1: 1880}) Counter({0: 7632, 1: 470})
샘플링 진행
Random Oversampling 클래스 비율: Counter({0: 30524, 1: 30524})
Total: Normal: 30524, AbNormal: 1880
언더샘플링 후 train의 클래스 비율: Counter({0: 3760, 1: 1880})
Total: Normal: 30524, AbNormal: 1880
언더샘플링 후 train의 클래스 비율: Counter({0: 5640, 1: 1880})


  0%|          | 0/4 [00:00<?, ?it/s]

basic 데이터 학습 진행
RandomForestClassifier
XGBClassifier
LGBMClassifier
CatBoostClassifier


 25%|██▌       | 1/4 [00:40<02:01, 40.40s/it]

ros 데이터 학습 진행
RandomForestClassifier
XGBClassifier
LGBMClassifier
CatBoostClassifier


 50%|█████     | 2/4 [01:40<01:44, 52.25s/it]

under_2 데이터 학습 진행
RandomForestClassifier
XGBClassifier
LGBMClassifier
CatBoostClassifier


 75%|███████▌  | 3/4 [01:52<00:33, 33.78s/it]

under_3 데이터 학습 진행
RandomForestClassifier
XGBClassifier
LGBMClassifier
CatBoostClassifier


100%|██████████| 4/4 [02:06<00:00, 31.72s/it]


Fold 2
Counter({0: 30525, 1: 1880}) Counter({0: 7631, 1: 470})
샘플링 진행
Random Oversampling 클래스 비율: Counter({0: 30525, 1: 30525})
Total: Normal: 30525, AbNormal: 1880
언더샘플링 후 train의 클래스 비율: Counter({0: 3760, 1: 1880})
Total: Normal: 30525, AbNormal: 1880
언더샘플링 후 train의 클래스 비율: Counter({0: 5640, 1: 1880})


  0%|          | 0/4 [00:00<?, ?it/s]

basic 데이터 학습 진행
RandomForestClassifier
XGBClassifier
LGBMClassifier
CatBoostClassifier


 25%|██▌       | 1/4 [00:36<01:50, 36.88s/it]

ros 데이터 학습 진행
RandomForestClassifier
XGBClassifier
LGBMClassifier
CatBoostClassifier


 50%|█████     | 2/4 [01:35<01:39, 49.70s/it]

under_2 데이터 학습 진행
RandomForestClassifier
XGBClassifier
LGBMClassifier
CatBoostClassifier


 75%|███████▌  | 3/4 [01:47<00:32, 32.62s/it]

under_3 데이터 학습 진행
RandomForestClassifier
XGBClassifier
LGBMClassifier
CatBoostClassifier


100%|██████████| 4/4 [02:01<00:00, 30.32s/it]


Fold 3
Counter({0: 30525, 1: 1880}) Counter({0: 7631, 1: 470})
샘플링 진행
Random Oversampling 클래스 비율: Counter({0: 30525, 1: 30525})
Total: Normal: 30525, AbNormal: 1880
언더샘플링 후 train의 클래스 비율: Counter({0: 3760, 1: 1880})
Total: Normal: 30525, AbNormal: 1880
언더샘플링 후 train의 클래스 비율: Counter({0: 5640, 1: 1880})


  0%|          | 0/4 [00:00<?, ?it/s]

basic 데이터 학습 진행
RandomForestClassifier
XGBClassifier
LGBMClassifier
CatBoostClassifier


 25%|██▌       | 1/4 [00:36<01:50, 36.71s/it]

ros 데이터 학습 진행
RandomForestClassifier
XGBClassifier
LGBMClassifier
CatBoostClassifier


 50%|█████     | 2/4 [01:35<01:39, 49.97s/it]

under_2 데이터 학습 진행
RandomForestClassifier
XGBClassifier
LGBMClassifier
CatBoostClassifier


 75%|███████▌  | 3/4 [01:47<00:32, 32.27s/it]

under_3 데이터 학습 진행
RandomForestClassifier
XGBClassifier
LGBMClassifier
CatBoostClassifier


100%|██████████| 4/4 [02:00<00:00, 30.08s/it]


Fold 4
Counter({0: 30525, 1: 1880}) Counter({0: 7631, 1: 470})
샘플링 진행
Random Oversampling 클래스 비율: Counter({0: 30525, 1: 30525})
Total: Normal: 30525, AbNormal: 1880
언더샘플링 후 train의 클래스 비율: Counter({0: 3760, 1: 1880})
Total: Normal: 30525, AbNormal: 1880
언더샘플링 후 train의 클래스 비율: Counter({0: 5640, 1: 1880})


  0%|          | 0/4 [00:00<?, ?it/s]

basic 데이터 학습 진행
RandomForestClassifier
XGBClassifier
LGBMClassifier
CatBoostClassifier


 25%|██▌       | 1/4 [00:37<01:53, 37.80s/it]

ros 데이터 학습 진행
RandomForestClassifier
XGBClassifier
LGBMClassifier
CatBoostClassifier


 50%|█████     | 2/4 [01:36<01:40, 50.37s/it]

under_2 데이터 학습 진행
RandomForestClassifier
XGBClassifier
LGBMClassifier
CatBoostClassifier


 75%|███████▌  | 3/4 [01:48<00:32, 32.61s/it]

under_3 데이터 학습 진행
RandomForestClassifier
XGBClassifier
LGBMClassifier
CatBoostClassifier


100%|██████████| 4/4 [02:01<00:00, 30.32s/it]


Fold 5
Counter({0: 30525, 1: 1880}) Counter({0: 7631, 1: 470})
샘플링 진행
Random Oversampling 클래스 비율: Counter({0: 30525, 1: 30525})
Total: Normal: 30525, AbNormal: 1880
언더샘플링 후 train의 클래스 비율: Counter({0: 3760, 1: 1880})
Total: Normal: 30525, AbNormal: 1880
언더샘플링 후 train의 클래스 비율: Counter({0: 5640, 1: 1880})


  0%|          | 0/4 [00:00<?, ?it/s]

basic 데이터 학습 진행
RandomForestClassifier
XGBClassifier
LGBMClassifier
CatBoostClassifier


 25%|██▌       | 1/4 [00:36<01:49, 36.44s/it]

ros 데이터 학습 진행
RandomForestClassifier
XGBClassifier
LGBMClassifier
CatBoostClassifier


 50%|█████     | 2/4 [01:34<01:37, 48.97s/it]

under_2 데이터 학습 진행
RandomForestClassifier
XGBClassifier
LGBMClassifier
CatBoostClassifier


 75%|███████▌  | 3/4 [01:46<00:32, 32.07s/it]

under_3 데이터 학습 진행
RandomForestClassifier
XGBClassifier
LGBMClassifier
CatBoostClassifier


100%|██████████| 4/4 [01:59<00:00, 29.85s/it]


In [75]:
# Average the metrics over the folds for every (sampling, model) pair.
result_df.groupby(["sampling", "model"]).mean()[['f1', 'precision', 'recall', 'accuracy']]

Unnamed: 0_level_0,Unnamed: 1_level_0,f1,precision,recall,accuracy
sampling,model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
basic,CatBoostClassifier,0.137999,0.887925,0.074894,0.945786
basic,LGBMClassifier,0.09566,0.883638,0.050638,0.944551
basic,RandomForestClassifier,0.10866,0.818624,0.058298,0.944601
basic,XGBClassifier,0.109616,0.832935,0.058723,0.9447
ros,CatBoostClassifier,0.196268,0.134834,0.360851,0.828741
ros,LGBMClassifier,0.186216,0.11421,0.504255,0.744309
ros,RandomForestClassifier,0.133902,0.479276,0.077872,0.941564
ros,XGBClassifier,0.177044,0.109529,0.461702,0.750877
under_2,CatBoostClassifier,0.192071,0.177,0.211489,0.896707
under_2,LGBMClassifier,0.176224,0.132206,0.264255,0.856713


In [76]:
# The five (sampling, model) pairs with the highest fold-averaged F1.
result_df.groupby(["sampling", "model"]).mean()[['f1', 'precision', 'recall', 'accuracy']].sort_values(by='f1', ascending=False).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,f1,precision,recall,accuracy
sampling,model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
under_3,RandomForestClassifier,0.196817,0.209663,0.185532,0.912186
ros,CatBoostClassifier,0.196268,0.134834,0.360851,0.828741
under_2,CatBoostClassifier,0.192071,0.177,0.211489,0.896707
under_2,RandomForestClassifier,0.191207,0.141214,0.29617,0.854565
under_3,XGBClassifier,0.191017,0.208287,0.176596,0.913272


## 앙상블 모델

### Blending

In [77]:
# stratify k fold 적용
from sklearn.model_selection import StratifiedKFold
from collections import Counter
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.combine import SMOTEENN, SMOTETomek
from collections import Counter
import xgboost as xgb
import lightgbm as lgb
import catboost as cat

In [78]:
# An earlier cell rebinds `cat` to a classifier instance, so make sure the
# name refers to the catboost module before the loop uses it.
import catboost as cat


def _undersample_normal(train, normal_ratio):
    """Undersample the normal class (target == 0) of ``train``.

    Keeps every abnormal row (target == 1) and draws, without replacement,
    ``normal_ratio`` times as many normal rows. Prints the class sizes
    before and after, and returns ``(features, target)`` of the reduced frame.
    """
    df_normal = train[train["target"] == 0]
    df_abnormal = train[train["target"] == 1]
    print(f"Total: Normal: {len(df_normal)}, AbNormal: {len(df_abnormal)}")

    df_normal = df_normal.sample(n=int(len(df_abnormal) * normal_ratio), replace=False, random_state=RANDOM_STATE)
    df_concat = pd.concat([df_normal, df_abnormal], axis=0).reset_index(drop=True)
    X_under, y_under = df_concat.drop(columns=["target"]), df_concat["target"]

    print("언더샘플링 후 train의 클래스 비율:", Counter(y_under))
    return X_under, y_under


skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Every column of train_data except the target is used as a feature.
X = train_data.drop(columns=["target"])
y = train_data["target"]

blending_result_df = pd.DataFrame(columns=["fold", "f1", "precision", "recall", "accuracy"])
# Blended positive-class probabilities of each validation fold, in fold order.
y_proba_list = []

# 5-fold stratified cross-validation over the training data.
for i, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"Fold {i + 1}")
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    print(Counter(y_train), Counter(y_val))

    train = pd.concat([X_train, y_train], axis=1)

    # Resampling is applied to the training fold only.
    print('샘플링 진행')
    ros = RandomOverSampler(random_state=RANDOM_STATE)
    X_ros, y_ros = ros.fit_resample(X_train, y_train)

    X_under_2, y_under_2 = _undersample_normal(train, 2.0)
    X_under_3, y_under_3 = _undersample_normal(train, 3.0)

    print('모델 학습 진행')
    ros_cat = cat.CatBoostClassifier(verbose=0, random_state=RANDOM_STATE)
    under2_cat = cat.CatBoostClassifier(verbose=0, random_state=RANDOM_STATE)
    under3_rf = RandomForestClassifier(random_state=RANDOM_STATE)

    ros_cat.fit(X_ros, y_ros)
    under2_cat.fit(X_under_2, y_under_2)
    under3_rf.fit(X_under_3, y_under_3)

    # Positive-class probability of each model on the validation fold.
    final_outputs = {
        'ros_cat': ros_cat.predict_proba(X_val)[:, 1],
        'under2_cat': under2_cat.predict_proba(X_val)[:, 1],
        'under3_rf': under3_rf.predict_proba(X_val)[:, 1],
    }

    # Blending: fixed-weight average of the three probabilities, computed
    # once and reused for both the stored probabilities and the labels.
    blend_proba = (final_outputs['ros_cat'] * 0.5
                   + final_outputs['under2_cat'] * 0.3
                   + final_outputs['under3_rf'] * 0.2)
    y_proba_list.append(blend_proba)
    y_pred = np.where(blend_proba > 0.5, 1, 0)

    f1 = f1_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    accuracy = accuracy_score(y_val, y_pred)
    blending_result_df.loc[len(blending_result_df), :] = [i+1, f1, precision, recall, accuracy]

Fold 1
Counter({0: 30524, 1: 1880}) Counter({0: 7632, 1: 470})
샘플링 진행
Total: Normal: 30524, AbNormal: 1880
언더샘플링 후 train의 클래스 비율: Counter({0: 3760, 1: 1880})
Total: Normal: 30524, AbNormal: 1880
언더샘플링 후 train의 클래스 비율: Counter({0: 5640, 1: 1880})
모델 학습 진행
Fold 2
Counter({0: 30525, 1: 1880}) Counter({0: 7631, 1: 470})
샘플링 진행
Total: Normal: 30525, AbNormal: 1880
언더샘플링 후 train의 클래스 비율: Counter({0: 3760, 1: 1880})
Total: Normal: 30525, AbNormal: 1880
언더샘플링 후 train의 클래스 비율: Counter({0: 5640, 1: 1880})
모델 학습 진행
Fold 3
Counter({0: 30525, 1: 1880}) Counter({0: 7631, 1: 470})
샘플링 진행
Total: Normal: 30525, AbNormal: 1880
언더샘플링 후 train의 클래스 비율: Counter({0: 3760, 1: 1880})
Total: Normal: 30525, AbNormal: 1880
언더샘플링 후 train의 클래스 비율: Counter({0: 5640, 1: 1880})
모델 학습 진행
Fold 4
Counter({0: 30525, 1: 1880}) Counter({0: 7631, 1: 470})
샘플링 진행
Total: Normal: 30525, AbNormal: 1880
언더샘플링 후 train의 클래스 비율: Counter({0: 3760, 1: 1880})
Total: Normal: 30525, AbNormal: 1880
언더샘플링 후 train의 클래스 비율: Counter({0: 5640,

In [79]:
# Per-fold metrics of the blended prediction.
blending_result_df

Unnamed: 0,fold,f1,precision,recall,accuracy
0,1,0.22113,0.17976,0.287234,0.882622
1,2,0.208855,0.171939,0.265957,0.883101
2,3,0.222037,0.182692,0.282979,0.884952
3,4,0.213008,0.172368,0.278723,0.880509
4,5,0.221493,0.18024,0.287234,0.882854


In [80]:
# Mean F1 of the blended prediction across the folds.
blending_result_df['f1'].mean()

0.21730471563521164

### 임계값 최적화

In [81]:
def calculate_f1_threshold(threshold, y_true=None, proba=None):
    """Return the F1 score obtained by labelling ``proba >= threshold`` as 1.

    ``y_true`` and ``proba`` default to the notebook-level ``y_val`` and
    ``y_proba`` so that existing one-argument calls keep working.
    """
    if y_true is None:
        y_true = y_val
    if proba is None:
        proba = y_proba
    y_pred = (proba >= threshold).astype(int)
    return f1_score(y_true, y_pred)


# Candidate thresholds; the grid is the same for every fold, so build it once.
thresholds = np.arange(0.4, 0.6, 0.0001)

f1_scores_list = []
# Re-create the same folds to recover each fold's validation labels; only the
# validation indices are needed here. This relies on `skf` producing the same
# splits as when `y_proba_list` was filled.
for i, (_, val_idx) in enumerate(skf.split(X, y)):
    print(f"Fold {i + 1}")
    y_val = y.iloc[val_idx]
    y_proba = y_proba_list[i]

    # F1 score of this fold at every candidate threshold.
    f1_scores = [calculate_f1_threshold(t, y_val, y_proba) for t in thresholds]
    f1_scores_list.append(f1_scores)

Fold 1
Fold 2
Fold 3
Fold 4
Fold 5


In [82]:
# One row per fold, one column per candidate threshold.
pd.DataFrame(f1_scores_list)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
0,0.186178,0.186353,0.186441,0.186441,0.186528,0.186616,0.186616,0.186616,0.186616,0.185761,...,0.190083,0.187586,0.187586,0.187586,0.187586,0.187845,0.187845,0.187845,0.188366,0.188366
1,0.185149,0.185149,0.185239,0.185328,0.185418,0.185418,0.185507,0.185597,0.185776,0.185956,...,0.180575,0.180822,0.180822,0.180822,0.180822,0.180822,0.180822,0.180822,0.18107,0.18107
2,0.204681,0.204775,0.204869,0.205057,0.205152,0.205246,0.205341,0.20572,0.205814,0.2061,...,0.212121,0.212121,0.212121,0.212121,0.212121,0.212414,0.212414,0.212414,0.212414,0.212414
3,0.191431,0.191518,0.191606,0.191781,0.191868,0.191956,0.191956,0.191217,0.190476,0.190738,...,0.19888,0.19888,0.19888,0.19888,0.19888,0.19888,0.19888,0.19888,0.199158,0.199438
4,0.195622,0.195713,0.195713,0.195713,0.195804,0.195804,0.195896,0.195987,0.195987,0.196078,...,0.199715,0.199715,0.199715,0.199715,0.199715,0.199715,0.199715,0.199715,0.2,0.2


In [83]:
# Column index of the threshold whose F1, averaged over the folds, is highest.
opt_idx = pd.DataFrame(f1_scores_list).mean(axis=0).idxmax()

In [84]:
# Map the winning column index back to its threshold value. The grid built
# here has to match the one used when the F1 scores were computed.
opt_threshold = np.arange(0.4, 0.6, 0.0001)[opt_idx]
opt_threshold

0.5109999999999878

In [85]:
# Fold-averaged F1 at the selected threshold.
opt_threshold_score = pd.DataFrame(f1_scores_list)[opt_idx].mean()
opt_threshold_score

0.2191591736690041

## 최종 예측

In [86]:
# An earlier cell rebinds `cat` to a classifier instance, so make sure the
# name refers to the catboost module before it is used below.
import catboost as cat

# Train on the full training data: every column except the target is a feature.
X = train_data.drop(columns=["target"])
y = train_data["target"]

ros = RandomOverSampler(random_state=RANDOM_STATE)
X_ros, y_ros = ros.fit_resample(X, y)


def _undersample_full_train(normal_ratio):
    """Undersample the normal class (target == 0) of the full ``train_data``.

    Keeps every abnormal row (target == 1) and draws, without replacement,
    ``normal_ratio`` times as many normal rows. Prints the original class
    counts and returns ``(features, target)`` of the reduced frame.
    """
    print(Counter(y))
    df_normal = train_data[train_data["target"] == 0]
    df_abnormal = train_data[train_data["target"] == 1]
    print(f"Total: Normal: {len(df_normal)}, AbNormal: {len(df_abnormal)}")

    df_normal = df_normal.sample(n=int(len(df_abnormal) * normal_ratio), replace=False, random_state=RANDOM_STATE)
    df_concat = pd.concat([df_normal, df_abnormal], axis=0).reset_index(drop=True)
    return df_concat.drop(columns=["target"]), df_concat["target"]


X_under_3, y_under_3 = _undersample_full_train(3.0)
X_under_2, y_under_2 = _undersample_full_train(2.0)

print("언더샘플링 후 train의 클래스 비율:", Counter(y_under_3))

print("언더샘플링 후 train의 클래스 비율:", Counter(y_under_2))

ros_cat = cat.CatBoostClassifier(verbose=0, random_state=RANDOM_STATE)
under2_cat = cat.CatBoostClassifier(verbose=0, random_state=RANDOM_STATE)
under3_rf = RandomForestClassifier(random_state=RANDOM_STATE)

ros_cat.fit(X_ros, y_ros)
under2_cat.fit(X_under_2, y_under_2)
under3_rf.fit(X_under_3, y_under_3)

# Drop the non-feature columns and put the test features in exactly the
# column order the models were trained on.
df_test_x = test_data.drop(columns=["Set ID", 'target'])[X.columns]

# Positive-class probability of each model on the test set.
final_outputs = {
    'ros_cat': ros_cat.predict_proba(df_test_x)[:, 1],
    'under2_cat': under2_cat.predict_proba(df_test_x)[:, 1],
    'under3_rf': under3_rf.predict_proba(df_test_x)[:, 1],
}

# Blending: same fixed weights as in the cross-validated blend.
test_pred = (final_outputs['ros_cat'] * 0.5
             + final_outputs['under2_cat'] * 0.3
             + final_outputs['under3_rf'] * 0.2)

# Use the threshold selected by the threshold search instead of a pasted
# literal, with the same `>=` comparison that the search used.
test_pred = np.where(test_pred >= opt_threshold, 1, 0)

Counter({0: 38156, 1: 2350})
Total: Normal: 38156, AbNormal: 2350
Counter({0: 38156, 1: 2350})
Total: Normal: 38156, AbNormal: 2350
언더샘플링 후 train의 클래스 비율: Counter({0: 7050, 1: 2350})
언더샘플링 후 train의 클래스 비율: Counter({0: 4700, 1: 2350})


### 후처리 진행

In [87]:
# Put the 0/1 predictions (column label 0) next to the test features and keep
# only the three equipment columns for inspection.
temp = pd.concat([df_test_x, pd.DataFrame(test_pred)], axis=1).loc[
    :, ["Equipment_Dam", "Equipment_Fill1", "Equipment_Fill2", 0]
]
# Row-wise total of the three equipment codes.
temp["sum"] = (
    temp["Equipment_Dam"]
    .add(temp["Equipment_Fill1"])
    .add(temp["Equipment_Fill2"])
)
temp

Unnamed: 0,Equipment_Dam,Equipment_Fill1,Equipment_Fill2,0,sum
0,1,1,1,0,3
1,1,1,1,0,3
2,0,0,0,0,0
3,1,1,1,0,3
4,0,0,0,0,0
...,...,...,...,...,...
17356,1,1,1,0,3
17357,1,1,1,0,3
17358,0,0,0,0,0
17359,0,0,0,1,0


In [89]:
# Show the rows whose equipment codes add up to exactly 1.
temp.loc[temp["sum"].eq(1)]

Unnamed: 0,Equipment_Dam,Equipment_Fill1,Equipment_Fill2,0,sum
562,1,0,0,1,1
3457,0,0,1,1,1
7287,0,0,1,1,1
7836,1,0,0,1,1
8253,1,0,0,1,1
8898,1,0,0,1,1
15406,1,0,0,1,1


In [88]:
# Show the rows whose equipment codes add up to exactly 2.
temp.loc[temp["sum"].eq(2)]

Unnamed: 0,Equipment_Dam,Equipment_Fill1,Equipment_Fill2,0,sum
10989,0,1,1,1,2
12439,1,1,0,1,2
15180,1,1,0,1,2
15811,1,1,0,0,2
15964,1,1,0,1,2


In [90]:
# Post-processing: force the prediction to 1 for every row whose three
# equipment codes disagree (sum of 1 or 2), instead of hard-coding the single
# row index found by inspection. In the tables displayed above, row 15811 is
# the only such row still predicted 0, so the outcome is the same there.
# NOTE(review): assumes the Equipment_* columns are 0/1 encoded, as in the
# displayed output - confirm.
_equipment_mismatch = temp["sum"].isin([1, 2]).to_numpy()
test_pred[_equipment_mismatch] = 1

In [91]:
# Display the prediction stored for row 15811 to confirm the override took effect.
test_pred[15811]

1

## 제출 파일 작성

In [92]:
# Read the submission template.
# NOTE(review): the original comment mentioned df_test holding the
# preprocessed data, but this cell does not use df_test.
df_sub = pd.read_csv(os.path.join(ROOT_DIR, "submission.csv"))
# Map the 0/1 predictions to their label strings in a single step, so the
# column is created with string values directly rather than writing strings
# into an integer column (deprecated incompatible-dtype setitem in pandas 2.1+).
df_sub["target"] = np.where(np.asarray(test_pred) == 1, "AbNormal", "Normal")
# Save the submission file.
df_sub.to_csv("/content/submission.csv", index=False)