## 1. 데이터 불러오기


### 필수 라이브러리


In [741]:
import os
from pprint import pprint

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [742]:
def unique_mapping(column):
    """Map each distinct value of ``column`` to a small integer code.

    Codes are handed out in order of first appearance, so the same input
    always produces the same mapping (iterating a ``set`` of strings gives no
    such guarantee from one interpreter run to the next). All float NaN
    values share one code.

    Args:
        column: Iterable of hashable values, for example a pandas Series.

    Returns:
        dict: ``{value: code}`` with codes ``0 .. n_unique - 1``.
    """
    convert = {}
    nan_seen = False
    for value in column:
        # NaN never compares equal to itself, so a plain membership test
        # cannot de-duplicate it; track it with a flag instead.
        if isinstance(value, float) and value != value:
            if nan_seen:
                continue
            nan_seen = True
        elif value in convert:
            continue
        convert[value] = len(convert)
    return convert

In [743]:
def print_info(train_data, max_unique=5):
    """Print and collect the columns with fewer than ``max_unique`` distinct values.

    For every such column one line ``<column> <distinct count>`` is printed.

    Args:
        train_data: DataFrame whose columns are inspected.
        max_unique: Exclusive upper bound on the distinct-value count for a
            column to be reported. Defaults to 5.

    Returns:
        list: Labels of the reported columns, in column order.
    """
    low_cardinality = []
    for col_ in train_data.columns:
        # Build the set once per column; it is needed for both the test and
        # the printed count.
        n_unique = len(set(train_data[col_]))
        if n_unique < max_unique:
            print(col_, n_unique)
            low_cardinality.append(col_)
    return low_cardinality

### 데이터 읽어오기


In [744]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Shared settings: the seed passed as random_state further down, and the data directory.
RANDOM_STATE = 110
ROOT_DIR = "data"

# Read the training table, then echo one column and the table's dimensions.
train_csv = os.path.join(ROOT_DIR, "train.csv")
train_data = pd.read_csv(train_csv)
for preview in (train_data['Wip Line_Dam'], train_data.shape):
    print(preview)

# Column groups used by alter_table. They are defined at module level, under
# their original names, because the low-cardinality scan further down in this
# file reads just_ones, to_trunc_num and to_num directly.

# Columns with only one kind of value; they are simply dropped.
just_ones = [
    'Wip Line_Dam',
    'Process Desc._Dam',
    'Insp. Seq No._Dam',
    'Insp Judge Code_Dam',
    'CURE STANDBY POSITION X Collect Result_Dam',
    'CURE STANDBY POSITION Z Collect Result_Dam',
    'CURE STANDBY POSITION Θ Collect Result_Dam',
    'CURE START POSITION Z Collect Result_Dam',
    'Wip Line_AutoClave',
    'Process Desc._AutoClave',
    'Equipment_AutoClave',
    'Insp. Seq No._AutoClave',
    'Insp Judge Code_AutoClave',
    '1st Pressure Judge Value_AutoClave',
    '2nd Pressure Judge Value_AutoClave',
    '3rd Pressure Judge Value_AutoClave',
    'Wip Line_Fill1',
    'Process Desc._Fill1',
    'Insp. Seq No._Fill1',
    'Insp Judge Code_Fill1',
    'Wip Line_Fill2',
    'Process Desc._Fill2',
    'Insp. Seq No._Fill2',
    'Insp Judge Code_Fill2',
    'CURE END POSITION Θ Collect Result_Fill2',
    'CURE STANDBY POSITION X Collect Result_Fill2',
    'CURE STANDBY POSITION Θ Collect Result_Fill2',
    'CURE START POSITION Θ Collect Result_Fill2',
    'DISCHARGED SPEED OF RESIN Collect Result_Fill2',
    'DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill2',
    'DISCHARGED TIME OF RESIN(Stage2) Collect Result_Fill2',
    'DISCHARGED TIME OF RESIN(Stage3) Collect Result_Fill2',
    'Dispense Volume(Stage1) Collect Result_Fill2',
    'Dispense Volume(Stage2) Collect Result_Fill2',
    'Dispense Volume(Stage3) Collect Result_Fill2',
]

# Columns whose values look like "something #1", "something #2"; only the
# trailing number is kept.
to_trunc_num = ['Equipment_Dam', 'Equipment_Fill1', 'Equipment_Fill2']

# Columns described as containing only NaN and "OK"; re-encoded as integer codes.
to_num = [
    'Chamber Temp. Judge Value_AutoClave',
    'GMES_ORIGIN_INSP_JUDGE_CODE Collect Result_AutoClave',
    'GMES_ORIGIN_INSP_JUDGE_CODE Judge Value_AutoClave',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Dam',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill1',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill2',
]


def alter_table(train_data):
    """Return a cleaned copy of ``train_data``.

    Steps, each applied only to columns that are actually present:
      1. drop the columns listed in ``just_ones``;
      2. replace each ``to_trunc_num`` value by the integer after its last '#';
      3. replace each ``to_num`` value by the integer code that
         ``unique_mapping`` assigns to it.

    Args:
        train_data: Input DataFrame. It is not modified.

    Returns:
        A new DataFrame with the transformations applied.
    """
    # drop() always returns a new frame, so the assignments below never write
    # into the caller's DataFrame; errors="ignore" skips absent columns.
    train_data = train_data.drop(columns=just_ones, errors="ignore")

    for col in to_trunc_num:
        if col in train_data.columns:
            train_data[col] = train_data[col].apply(lambda x: int(x.split('#')[-1]))

    for col in to_num:
        if col in train_data.columns:
            # NOTE(review): the codes are derived from the values present in
            # this frame only -- confirm that separately processed frames
            # (e.g. train and test) end up with matching codes.
            train_data[col] = train_data[col].map(unique_mapping(train_data[col]))

    return train_data

# train_data = alter_table(train_data)


0        IVI-OB6
1        IVI-OB6
2        IVI-OB6
3        IVI-OB6
4        IVI-OB6
          ...   
40501    IVI-OB6
40502    IVI-OB6
40503    IVI-OB6
40504    IVI-OB6
40505    IVI-OB6
Name: Wip Line_Dam, Length: 40506, dtype: object
(40506, 464)


In [745]:
# Collect the labels of the int64/float64 columns and display them.
numeric_dtypes = ['int64', 'float64']
numeric_cols = train_data.select_dtypes(include=numeric_dtypes).columns
print(numeric_cols)

# # MinMaxScaler 생성
# scaler = MinMaxScaler()

# # 수치형 데이터만 정규화
# train_data[numeric_cols] = scaler.fit_transform(train_data[numeric_cols])


Index(['Insp. Seq No._Dam', 'CURE END POSITION X Collect Result_Dam',
       'CURE END POSITION X Unit Time_Dam',
       'CURE END POSITION X Judge Value_Dam',
       'CURE END POSITION Z Collect Result_Dam',
       'CURE END POSITION Z Unit Time_Dam',
       'CURE END POSITION Z Judge Value_Dam',
       'CURE END POSITION Θ Collect Result_Dam',
       'CURE END POSITION Θ Unit Time_Dam',
       'CURE END POSITION Θ Judge Value_Dam',
       ...
       'PalletID Judge Value_Fill2', 'Production Qty Collect Result_Fill2',
       'Production Qty Unit Time_Fill2', 'Production Qty Judge Value_Fill2',
       'Receip No Collect Result_Fill2', 'Receip No Unit Time_Fill2',
       'Receip No Judge Value_Fill2', 'WorkMode Collect Result_Fill2',
       'WorkMode Unit Time_Fill2', 'WorkMode Judge Value_Fill2'],
      dtype='object', length=427)


In [746]:
# Scan for columns with fewer than 5 distinct values that are not already
# listed in just_ones, to_trunc_num or to_num. The report line is commented
# out, so the loop currently produces no output.
# NOTE(review): just_ones, to_trunc_num and to_num must be available as
# module-level names when this cell runs -- confirm they are defined first.
for col in train_data.columns:
    if len(set(train_data[col])) < 5 and col not in just_ones and col not in to_trunc_num and col not in to_num:
        pass
#         print(col, len(set(train_data[col])), set(train_data[col]))


### 언더 샘플링


데이터 불균형을 해결하기 위해 언더 샘플링을 진행합니다.


In [747]:
# Under-sample the "Normal" class to at most normal_ratio times the number of
# "AbNormal" rows; every "AbNormal" row is kept.
normal_ratio = 5.0  # 1.0 means 1:1 ratio

df_normal = train_data[train_data["target"] == "Normal"]
df_abnormal = train_data[train_data["target"] == "AbNormal"]

num_normal = len(df_normal)
num_abnormal = len(df_abnormal)
print(f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal}")

# Sampling without replacement cannot return more rows than exist, so cap the
# request at the number of available "Normal" rows instead of raising when
# normal_ratio is set high.
num_normal_keep = min(num_normal, int(num_abnormal * normal_ratio))
df_normal = df_normal.sample(n=num_normal_keep, replace=False, random_state=RANDOM_STATE)
df_concat = pd.concat([df_normal, df_abnormal], axis=0).reset_index(drop=True)
df_concat.value_counts("target")

  Total: Normal: 38156, AbNormal: 2350


target
Normal      11750
AbNormal     2350
Name: count, dtype: int64

### 데이터 분할


In [748]:
# Stratified split of the under-sampled frame into a training and a validation part.
# NOTE(review): with test_size=0.0001 the recorded run left only 2 rows (both
# "Normal", no "AbNormal") in df_val, so metrics computed on df_val say very
# little -- confirm that this near-empty hold-out is intentional.
df_train, df_val = train_test_split(
    df_concat,
    test_size=0.0001,
    stratify=df_concat["target"],
    random_state=RANDOM_STATE,
)


def print_stats(df: pd.DataFrame):
    """Print the "Normal" / "AbNormal" row counts of ``df`` and their ratio.

    The ratio is AbNormal / Normal. When ``df`` holds no "Normal" rows the
    ratio is undefined and is reported as ``nan`` instead of raising
    ZeroDivisionError.

    Args:
        df: DataFrame with a "target" column.
    """
    num_normal = len(df[df["target"] == "Normal"])
    num_abnormal = len(df[df["target"] == "AbNormal"])

    # Guard the division: a tiny or single-class split can have zero "Normal" rows.
    ratio = num_abnormal / num_normal if num_normal else float("nan")

    print(f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal} ratio: {ratio}")


# Header line, then the class counts of the training and the validation split.
print("  \tAbnormal\tNormal")
for split_frame in (df_train, df_val):
    print_stats(split_frame)

  	Abnormal	Normal
  Total: Normal: 11748, AbNormal: 2350 ratio: 0.2000340483486551
  Total: Normal: 2, AbNormal: 0 ratio: 0.0


## 3. 모델 학습


### 모델 정의


In [749]:
# pip install xgboost

In [750]:
from xgboost import XGBClassifier
# Random-forest classifier fitted in the training cell; keyword arguments are
# grouped by what they control.
model = RandomForestClassifier(
    # Ensemble size and per-tree growth limits.
    n_estimators=400,
    max_depth=100,
    max_features=None,
    min_samples_leaf=1,
    min_samples_split=16,
    # Row sampling, out-of-bag scoring and class re-weighting.
    bootstrap=True,
    oob_score=True,
    class_weight='balanced',
    # Execution settings: parallelism, seed, progress logging.
    n_jobs=-1,
    random_state=RANDOM_STATE,
    verbose=1,
)
# model = XGBClassifier()

### 모델 학습


In [751]:
# Feature selection: keep only the columns whose values can be cast to int.
# Columns that cannot be cast (for example the string "target" column) are
# left out of the feature set.
# NOTE(review): astype(int) truncates fractional values, and columns
# containing NaN fail the cast and are dropped -- confirm this is intended.
df_train = df_train.copy()  # own the data so the assignments below do not write into a slice
features = []

for col in df_train.columns:
    try:
        converted = df_train[col].astype(int)
    except (ValueError, TypeError):
        # Not castable to int. Only conversion errors are caught, so
        # unrelated failures (e.g. KeyboardInterrupt) are no longer hidden.
        continue
    df_train[col] = converted
    features.append(col)

print(np.array(features).shape)

train_x = df_train[features]
train_y = df_train["target"]
# Label encoding: 'AbNormal' -> 0, everything else -> 1.
train_y = [0 if el == 'AbNormal' else 1 for el in train_y]

model.fit(train_x, train_y)

(149,)


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:   16.6s finished


In [752]:
from sklearn.metrics import classification_report, f1_score

df = df_val

# Restrict the validation frame to the columns the model was fitted on.
# No dtype conversion is applied to these columns here.
valid_x = df[features]

# Predictions for the validation rows.
valid_y_pred = model.predict(valid_x)

# Ground-truth labels with the training encoding: 'AbNormal' -> 0, anything else -> 1.
valid_y = [1 if el != 'AbNormal' else 0 for el in df["target"]]

# Shapes of truth and prediction, followed by the metrics.
for labels in (valid_y, valid_y_pred):
    print(np.array(labels).shape)
print("f1_score:", f1_score(valid_y, valid_y_pred))
print(classification_report(valid_y, valid_y_pred))

(2,)
(2,)
f1_score: 1.0
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         2

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2



[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 400 out of 400 | elapsed:    0.0s finished


## 4. 제출하기


### 테스트 데이터 예측


테스트 데이터 불러오기


In [753]:
# Read the test table from the data directory.
test_csv = os.path.join(ROOT_DIR, "test.csv")
test_data = pd.read_csv(test_csv)

In [754]:
print(test_data.shape)
# test_data = alter_table(test_data)

# Select the model's feature columns. The explicit copy makes df_test_x an
# independent frame, so the conversions below do not write into a slice of
# test_data.
df_test_x = test_data[features].copy()

# Cast every feature column to int, best effort: a column that cannot be cast
# is left unchanged, but it is reported instead of being skipped silently.
not_converted = []
for col in df_test_x.columns:
    try:
        df_test_x[col] = df_test_x[col].astype(int)
    except (ValueError, TypeError):
        not_converted.append(col)

if not_converted:
    print(f"Columns left without int conversion: {not_converted}")

(17361, 465)


In [755]:
# Predict the test rows and decode the numeric labels: 0 -> 'AbNormal',
# anything else -> 'Normal'.
raw_pred = model.predict(df_test_x)
test_pred = ['Normal' if label != 0 else 'AbNormal' for label in raw_pred]

# Number of predictions per class, 'AbNormal' first.
for class_name in ('AbNormal', 'Normal'):
    print(test_pred.count(class_name))

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.3s


1748
15613


[Parallel(n_jobs=4)]: Done 400 out of 400 | elapsed:    0.6s finished


### 제출 파일 작성


In [756]:
# Read the existing submission file, replace its "target" column with the
# predicted labels and write the result back to the same path.
submission_path = "submission.csv"
df_sub = pd.read_csv(submission_path)
df_sub["target"] = test_pred
df_sub.to_csv(submission_path, index=False)

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**
