# 제품 이상여부 판별 프로젝트

## 1. 데이터 불러오기

### 필수 라이브러리

In [1]:
import os
from pprint import pprint
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import xgboost as xgb
import shap

pd.set_option('display.max_columns', None)
pd.set_option('display.max_row', None)

In [2]:
# ROOT_DIR = "data"
# RANDOM_STATE = 110

# # CSV 파일 읽기
# X_Dam = pd.read_csv(os.path.join(ROOT_DIR, "Dam dispensing.csv"))
# X_AutoClave = pd.read_csv(os.path.join(ROOT_DIR, "Auto clave.csv"))
# X_Fill1 = pd.read_csv(os.path.join(ROOT_DIR, "Fill1 dispensing.csv"))
# X_Fill2 = pd.read_csv(os.path.join(ROOT_DIR, "Fill2 dispensing.csv"))

# # y 데이터 읽기
# y = pd.read_csv(os.path.join(ROOT_DIR, "train_y.csv"))


### 데이터 병합

x 데이터 병합

In [3]:
# # Rename columns
# X_Dam.columns = [i + " - Dam" for i in X_Dam.columns]
# X_AutoClave.columns = [i + " - AutoClave" for i in X_AutoClave.columns]
# X_Fill1.columns = [i + " - Fill1" for i in X_Fill1.columns]
# X_Fill2.columns = [i + " - Fill2" for i in X_Fill2.columns]
# X_Dam = X_Dam.rename(columns={"Set ID - Dam": "Set ID"})
# X_AutoClave = X_AutoClave.rename(columns={"Set ID - AutoClave": "Set ID"})
# X_Fill1 = X_Fill1.rename(columns={"Set ID - Fill1": "Set ID"})
# X_Fill2 = X_Fill2.rename(columns={"Set ID - Fill2": "Set ID"})

# # Merge X
# X = pd.merge(X_Dam, X_AutoClave, on="Set ID")
# X = pd.merge(X, X_Fill1, on="Set ID")
# X = pd.merge(X, X_Fill2, on="Set ID")
# X = X.drop(X[X.duplicated(subset="Set ID")].index).reset_index(drop=True)

In [4]:
# X.to_csv('X.csv')
# y.to_csv('y.csv')

In [5]:
ROOT_DIR = "data"
RANDOM_STATE = 110

X = pd.read_csv('X.csv', index_col=0).reset_index(drop=True)
y = pd.read_csv('y.csv', index_col=0).reset_index(drop=True)

Columns (63,65,245,247,279,281,410,412) have mixed types. Specify dtype option on import or set low_memory=False.


x 데이터와 y 데이터 병합

In [6]:
# Merge X and y
df_merged = pd.merge(X, y, "inner", on="Set ID")

# Drop columns with more than half of the values missing
drop_cols = []
for column in df_merged.columns:
    if (df_merged[column].notnull().sum() // 2) < df_merged[
        column
    ].isnull().sum():
        drop_cols.append(column)
df_merged = df_merged.drop(drop_cols, axis=1)

# Drop Lot ID
df_merged = df_merged.drop("LOT ID - Dam", axis=1)

In [7]:
# 결측치 피쳐 세 개의 피쳐가 결측치 수가 동일함
na_list = []
for i in df_merged.columns:
    if df_merged[i].isna().sum() > 0:
        na_list.append(i)
df_merged[na_list] = df_merged[na_list].fillna(0)

## 2. 데이터 전처리

### 언더 샘플링 

In [8]:
normal_ratio = 1.0  # 1.0 means 1:1 ratio

df_normal = df_merged[df_merged["target"] == "Normal"]
df_abnormal = df_merged[df_merged["target"] == "AbNormal"]

num_normal = len(df_normal)
num_abnormal = len(df_abnormal)
print(f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal}")

df_normal = df_normal.sample(
    n=int(num_abnormal * normal_ratio), replace=False, random_state=RANDOM_STATE
)
df_concat = pd.concat([df_normal, df_abnormal], axis=0).reset_index(drop=True)
df_concat.value_counts("target")

  Total: Normal: 38156, AbNormal: 2350


target
AbNormal    2350
Normal      2350
dtype: int64

### 전처리 함수

In [39]:
import pandas as pd
import numpy as np

# 파트 1: 전처리 초기 단계
def preprocess_initial(df):
    # 고유값이 하나인 컬럼 찾기
    unique_value_1_columns = [column for column in df.columns if df[column].nunique() == 1]

    # 모든 행의 값이 다른 컬럼 찾기
    row_count = len(df)
    columns_to_check = ['Collect Date - Dam', 'Collect Date - Fill1', 'Collect Date - Fill2']
    matching_row_columns = [column for column in df.columns if df[column].value_counts().size == row_count]
    for col in columns_to_check:
        if col in matching_row_columns:
            matching_row_columns.remove(col)
        else:
            continue
    
    columns_to_convert = ['Collect Result.17 - Dam', 'Collect Result.7 - Fill1', 'Collect Result.17 - Fill2']
    for col in columns_to_convert:
        if col in df_columns:
            df_columns.remove(col)
        else:
            continue
        # 고유값, 혼합값 피쳐 제거
    df.drop(columns=unique_value_1_columns, inplace=True)
    df.drop(columns=matching_row_columns, inplace=True)
    return df

# # 파트 2: "OK" 값을 최빈값으로 대체하는 함수
# def replace_ok_with_mode(df):
#     # 최빈값으로 대체하는 함수
#     def replace_ok_with_mode_inner(df, columns):
#         for column in columns:
#             # 해당 컬럼의 최빈값 계산
#             mode_value = df[df[column] != 'OK'][column].astype(float).mode()[0]
#             # "OK" 값을 최빈값으로 대체
#             df[column] = df[column].replace('OK', mode_value)
#             df[column] = df[column].fillna(mode_value)
#         return df

#     # 대상 컬럼 리스트
#     columns_to_convert = ['Collect Result.17 - Dam', 'Collect Result.7 - Fill1', 'Collect Result.17 - Fill2']

#     # 함수 적용
#     df = replace_ok_with_mode_inner(df, columns_to_convert)
#     return df

# 파트 3: 날짜 데이터로 변환
def convert_to_datetime(df):
    df['Collect Date - Dam'] = pd.to_datetime(df['Collect Date - Dam'])
    df['Collect Date - AutoClave'] = pd.to_datetime(df['Collect Date - AutoClave'])
    df['Collect Date - Fill1'] = pd.to_datetime(df['Collect Date - Fill1'])
    df['Collect Date - Fill2'] = pd.to_datetime(df['Collect Date - Fill2'])
    return df

# 파트 4: 유사한 피쳐 제거
def reduce_dataframe(df):
    def get_value_counts_ratio(series):
        value_counts = series.value_counts(normalize=True)
        return value_counts.sort_values().values

    # 고유값 매핑 확인 함수
    def check_value_mapping(df, col1, col2):
        unique_values_1 = df[col1].unique()
        unique_values_2 = df[col2].unique()

        if len(unique_values_1) != len(unique_values_2):
            return False

        value_mapping = {}
        for val1 in unique_values_1:
            corresponding_values = df[df[col1] == val1][col2].unique()
            if len(corresponding_values) != 1:
                return False
            value_mapping[val1] = corresponding_values[0]

        for val1 in unique_values_1:
            if isinstance(val1, pd.Timestamp):
                ratio_1 = (df[col1].apply(lambda x: x.date() if isinstance(x, pd.Timestamp) else x) == val1.date()).mean()
                ratio_2 = (df[col2].apply(lambda x: x.date() if isinstance(x, pd.Timestamp) else x) == value_mapping[val1].date()).mean()
            else:
                ratio_1 = (df[col1] == val1).mean()
                ratio_2 = (df[col2] == value_mapping[val1]).mean()
            if ratio_1 != ratio_2:
                return False

        return True

    def compare_all_features(df):
        ratios = {column: get_value_counts_ratio(df[column]) for column in df.columns if not pd.api.types.is_datetime64_any_dtype(df[column])}
        similar_columns_dict = {column: [] for column in df.columns if not pd.api.types.is_datetime64_any_dtype(df[column])}

        # 고유값 비율이 같은 열들 찾기
        columns = list(ratios.keys())
        for i in range(len(columns)):
            for j in range(i + 1, len(columns)):
                if np.array_equal(ratios[columns[i]], ratios[columns[j]]):
                    similar_columns_dict[columns[i]].append(columns[j])

        # 고유값 비율이 같고 매핑도 동일한 피쳐들 찾기
        comparisons = []
        for key, values in similar_columns_dict.items():
            for value in values:
                if check_value_mapping(df, key, value):
                    comparisons.append((key, value))

        if comparisons:
            print("다음 피쳐 쌍은 고유값 비율과 양상이 동일합니다:")
            for base, compare in comparisons:
                print(f"'{base}'와(과) '{compare}'")
        else:
            print("모든 피쳐의 고유값 비율과 양상이 동일하지 않습니다.")

        return comparisons

    comparisons = compare_all_features(df)
    columns_to_remove = set()
    for _, col_to_remove in comparisons:
        columns_to_remove.add(col_to_remove)

    df = df.drop(columns=columns_to_remove)
    global df_columns
    df_columns = df.columns
    return df

# 파트 5: 통계 계산 및 추가 전처리
def finalize_preprocessing(df):
    # 날짜 기준 정렬
    df_preprocessed = df.sort_values(by=["Collect Date - Dam"])

    # 장비별 Collect Result 컬럼들 식별
    collect_result_columns_dam = [col for col in df_preprocessed.columns if 'Collect Result' in col and 'Dam' in col]
    collect_result_columns_autoclave = [col for col in df_preprocessed.columns if 'Collect Result' in col and 'AutoClave' in col]
    collect_result_columns_fill1 = [col for col in df_preprocessed.columns if 'Collect Result' in col and 'Fill1' in col]
    collect_result_columns_fill2 = [col for col in df_preprocessed.columns if 'Collect Result' in col and 'Fill2' in col]

    # Dam 장비의 평균, 표준편차, 합계 계산
    df_preprocessed['Collect_Result_Mean_Dam'] = df_preprocessed[collect_result_columns_dam].astype(float).mean(axis=1)
    df_preprocessed['Collect_Result_Std_Dam'] = df_preprocessed[collect_result_columns_dam].astype(float).std(axis=1)
    df_preprocessed['Collect_Result_Sum_Dam'] = df_preprocessed[collect_result_columns_dam].astype(float).sum(axis=1)

    # AutoClave 장비의 평균, 표준편차, 합계 계산
    df_preprocessed['Collect_Result_Mean_AutoClave'] = df_preprocessed[collect_result_columns_autoclave].astype(float).mean(axis=1)
    df_preprocessed['Collect_Result_Std_AutoClave'] = df_preprocessed[collect_result_columns_autoclave].astype(float).std(axis=1)
    df_preprocessed['Collect_Result_Sum_AutoClave'] = df_preprocessed[collect_result_columns_autoclave].astype(float).sum(axis=1)

    # Fill1 장비의 평균, 표준편차, 합계 계산
    df_preprocessed['Collect_Result_Mean_Fill1'] = df_preprocessed[collect_result_columns_fill1].astype(float).mean(axis=1)
    df_preprocessed['Collect_Result_Std_Fill1'] = df_preprocessed[collect_result_columns_fill1].astype(float).std(axis=1)
    df_preprocessed['Collect_Result_Sum_Fill1'] = df_preprocessed[collect_result_columns_fill1].astype(float).sum(axis=1)

    # Fill2 장비의 평균, 표준편차, 합계 계산
    df_preprocessed['Collect_Result_Mean_Fill2'] = df_preprocessed[collect_result_columns_fill2].astype(float).mean(axis=1)
    df_preprocessed['Collect_Result_Std_Fill2'] = df_preprocessed[collect_result_columns_fill2].astype(float).std(axis=1)
    df_preprocessed['Collect_Result_Sum_Fill2'] = df_preprocessed[collect_result_columns_fill2].astype(float).sum(axis=1)

    # 각 장비 간의 시간 차이 계산 (초 단위)
    df_preprocessed['Time_Difference_Dam_Fill1'] = (df_preprocessed['Collect Date - Dam'] - df_preprocessed['Collect Date - Fill1']).dt.total_seconds()
    df_preprocessed['Time_Difference_Dam_Fill2'] = (df_preprocessed['Collect Date - Dam'] - df_preprocessed['Collect Date - Fill2']).dt.total_seconds()
    df_preprocessed['Time_Difference_Dam_AutoClave'] = (df_preprocessed['Collect Date - Dam'] - df_preprocessed['Collect Date - AutoClave']).dt.total_seconds()
    df_preprocessed['Time_Difference_Fill1_Fill2'] = (df_preprocessed['Collect Date - Fill1'] - df_preprocessed['Collect Date - Fill2']).dt.total_seconds()
    df_preprocessed['Time_Difference_Fill1_AutoClave'] = (df_preprocessed['Collect Date - Fill1'] - df_preprocessed['Collect Date - AutoClave']).dt.total_seconds()
    df_preprocessed['Time_Difference_Fill2_AutoClave'] = (df_preprocessed['Collect Date - Fill2'] - df_preprocessed['Collect Date - AutoClave']).dt.total_seconds()

    # 날짜 컬럼을 타임스탬프로 변환
    df_preprocessed['Collect Date - Dam'] = pd.to_datetime(df_preprocessed['Collect Date - Dam']).apply(lambda x: x.timestamp())
    df_preprocessed['Collect Date - Fill1'] = pd.to_datetime(df_preprocessed['Collect Date - Fill1']).apply(lambda x: x.timestamp())
    df_preprocessed['Collect Date - Fill2'] = pd.to_datetime(df_preprocessed['Collect Date - Fill2']).apply(lambda x: x.timestamp())
    df_preprocessed['Collect Date - AutoClave'] = pd.to_datetime(df_preprocessed['Collect Date - AutoClave']).apply(lambda x: x.timestamp())

    return df_preprocessed

# 전체 전처리 함수
def preprocess_train_dataframe(df):
    df = preprocess_initial(df)
    # df = replace_ok_with_mode(df)
    df = convert_to_datetime(df)
    df = reduce_dataframe(df)
    df = finalize_preprocessing(df)
    return df

# 전체 전처리 함수
def preprocess_test_dataframe(df):
    df = preprocess_initial(df)
    # df = replace_ok_with_mode(df)
    df = convert_to_datetime(df)
    df = finalize_preprocessing(df)
    return df

In [72]:
# 데이터프레임을 전처리 함수에 넣어서 실행
df_preprocessed = preprocess_train_dataframe(df_concat)

다음 피쳐 쌍은 고유값 비율과 양상이 동일합니다:
'Equipment - Dam'와(과) 'Collect Result - Dam'
'Equipment - Dam'와(과) 'Collect Result.1 - Dam'
'Equipment - Dam'와(과) 'Collect Result.2 - Dam'
'Equipment - Dam'와(과) 'Collect Result.7 - Dam'
'Equipment - Dam'와(과) 'Collect Result.9 - Dam'
'Model.Suffix - Dam'와(과) 'Model.Suffix - AutoClave'
'Model.Suffix - Dam'와(과) 'Model.Suffix - Fill1'
'Model.Suffix - Dam'와(과) 'Model.Suffix - Fill2'
'Workorder - Dam'와(과) 'Workorder - AutoClave'
'Workorder - Dam'와(과) 'Workorder - Fill1'
'Workorder - Dam'와(과) 'Workorder - Fill2'
'Collect Result - Dam'와(과) 'Collect Result.1 - Dam'
'Collect Result - Dam'와(과) 'Collect Result.2 - Dam'
'Collect Result - Dam'와(과) 'Collect Result.7 - Dam'
'Collect Result - Dam'와(과) 'Collect Result.9 - Dam'
'Collect Result.1 - Dam'와(과) 'Collect Result.2 - Dam'
'Collect Result.1 - Dam'와(과) 'Collect Result.7 - Dam'
'Collect Result.1 - Dam'와(과) 'Collect Result.9 - Dam'
'Collect Result.2 - Dam'와(과) 'Collect Result.7 - Dam'
'Collect Result.2 - Dam'와(과) 'Collect

In [78]:
df_train = df_preprocessed.copy()

In [79]:
df_train.shape

(4700, 115)

In [80]:
y_train = df_train.target
df_train.drop(columns = ['target'],inplace =True)

### SHAP

In [44]:
# # NaN 및 무한대 값 처리
# df_train_shap = df_train.replace([np.inf, -np.inf], np.nan).dropna()
# # df_encoded = df_encoded.applymap(lambda x: x if x > 1e-2 else 0)
# df_train_shap['target'] = df_train_shap['target'].map({'AbNormal': 1, 'Normal': 0})
# df_cleaned = df_train_shap.drop(columns=['Workorder - Dam'])

# # 문자열 데이터를 범주형으로 변환
# categorical_cols = df_cleaned.select_dtypes(include=['object']).columns
# df_cleaned[categorical_cols] = df_cleaned[categorical_cols].astype('category')

# # 범주형 데이터를 원-핫 인코딩
# df_encoded = pd.get_dummies(df_cleaned, drop_first=True)

# # 독립 변수와 종속 변수 분리
# X_shap = df_encoded.drop(columns=['target'])
# y_shap = df_encoded['target']

# # 학습 데이터와 테스트 데이터 분리
# X_train, X_test, y_train, y_test = train_test_split(X_shap, y_shap, test_size=0.3, random_state=42)

In [45]:
# # XGBoost 모델 학습
# model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
# model.fit(X_train, y_train)

# # 예측
# y_pred = model.predict(X_test)

# # F1 score 계산
# f1 = f1_score(y_test, y_pred)
# print(f'F1 Score: {f1}')

In [46]:
# # SHAP 값 계산
# explainer = shap.TreeExplainer(model)
# shap_values = explainer.shap_values(X_test)

# # SHAP 값을 데이터프레임으로 변환
# shap_values_df = pd.DataFrame(shap_values, columns=X_test.columns)

# # 각 피쳐의 평균 절대 SHAP 값 계산
# mean_abs_shap_values = shap_values_df.abs().mean()

# # 상위 N개의 피쳐 선택 (여기서는 상위 10개 피쳐를 예시로 선택)
# top_features = mean_abs_shap_values.sort_values(ascending=False).head(50)
# # 평균 절대 SHAP 값이 0.01 이상인 피처 선택
# selected_features = top_features[top_features >= 0.01]

# # 피처 이름만 추출
# selected_feature_names = selected_features.index.tolist()

# # 결과 출력
# print("평균 절대 SHAP 값이 0.01 이상인 피처들:\n", selected_feature_names)

In [47]:
# # SHAP 값 계산
# explainer = shap.TreeExplainer(model)
# shap_values = explainer.shap_values(X_test)

# # SHAP 값 시각화
# shap.summary_plot(shap_values, X_test, plot_type="bar")
# shap.summary_plot(shap_values, X_test)



In [48]:
# X_train_shap = X_shap[selected_feature_names]

## 3. 모델 학습

### 모델 정의 

In [49]:
from catboost import CatBoostClassifier
from sklearn.metrics import make_scorer, f1_score
from sklearn.model_selection import cross_val_score
import optuna
import warnings
warnings.filterwarnings("ignore")

model = RandomForestClassifier(random_state=RANDOM_STATE)

### 모델 학습

In [81]:
features = []

for col in df_train.columns:
    try:
        df_train[col] = df_train[col].astype(int)
        features.append(col)
    except:
        continue

if "Set ID" in features:
    features.remove("Set ID")

train_x = df_train[features]
train_y = y_train

In [82]:
train_x.shape

(4700, 108)

In [83]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(train_y)

### Catboost

In [52]:
# 하이퍼파라미터 튜닝을 위한 목적 함수 정의 + L2 추가 
def objective_decision(trial):
    cat_depth = trial.suggest_int('depth', 2, 10, step=1)
    cat_learning_rate = trial.suggest_float('learning_rate', 0.001, 0.01, step=0.001)
    cat_iterations = trial.suggest_int('iterations', 100, 300, step=10)
    cat_l2_leaf_reg = trial.suggest_float('l2_leaf_reg', 1e-5, 1e-1, log=True)

    classifier_obj = CatBoostClassifier(
        depth=cat_depth,
        learning_rate=cat_learning_rate,
        iterations=cat_iterations,
        l2_leaf_reg=cat_l2_leaf_reg,
        early_stopping_rounds=50,  # 조기 종료 설정
        verbose=0  # 훈련 중 출력 억제
    )

    # 교차 검증을 통해 F1 스코어 계산
    score = cross_val_score(classifier_obj, train_x, y_encoded, cv=5, n_jobs=-1, scoring='f1', error_score='raise')
    mean_f1 = score.mean()
    return mean_f1

# 최적화 실행
study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=100), direction="maximize")
study.optimize(objective_decision, n_trials=1) 

# 최적화 결과 보기
print("Best score:", study.best_value)
print("Best parameters:", study.best_params)

[I 2024-08-06 14:09:22,861] A new study created in memory with name: no-name-8787059c-78fe-4838-a235-665ad02c3594
[I 2024-08-06 14:09:25,721] Trial 0 finished with value: 0.3307299486385614 and parameters: {'depth': 6, 'learning_rate': 0.003, 'iterations': 180, 'l2_leaf_reg': 0.023938918670661075}. Best is trial 0 with value: 0.3307299486385614.


Best score: 0.3307299486385614
Best parameters: {'depth': 6, 'learning_rate': 0.003, 'iterations': 180, 'l2_leaf_reg': 0.023938918670661075}


In [53]:
cat = CatBoostClassifier(**study.best_params)
cat.fit(train_x, y_encoded)

0:	learn: 0.6926806	total: 4.02ms	remaining: 719ms
1:	learn: 0.6922795	total: 7.98ms	remaining: 710ms
2:	learn: 0.6917140	total: 12.1ms	remaining: 712ms
3:	learn: 0.6913282	total: 16.2ms	remaining: 713ms
4:	learn: 0.6909240	total: 19.7ms	remaining: 690ms
5:	learn: 0.6904873	total: 23ms	remaining: 667ms
6:	learn: 0.6901185	total: 26.5ms	remaining: 655ms
7:	learn: 0.6897283	total: 29.7ms	remaining: 639ms
8:	learn: 0.6893401	total: 33.2ms	remaining: 631ms
9:	learn: 0.6889333	total: 36.3ms	remaining: 617ms
10:	learn: 0.6885180	total: 39.7ms	remaining: 610ms
11:	learn: 0.6881081	total: 43.3ms	remaining: 606ms
12:	learn: 0.6875195	total: 46.5ms	remaining: 597ms
13:	learn: 0.6872291	total: 50.6ms	remaining: 600ms
14:	learn: 0.6868576	total: 53.9ms	remaining: 593ms
15:	learn: 0.6864054	total: 57.4ms	remaining: 589ms
16:	learn: 0.6858945	total: 60.7ms	remaining: 582ms
17:	learn: 0.6853529	total: 64.2ms	remaining: 578ms
18:	learn: 0.6847228	total: 67.6ms	remaining: 573ms
19:	learn: 0.6843492	tot

<catboost.core.CatBoostClassifier at 0x116fdad2890>

### xgboost

In [54]:
from xgboost import XGBClassifier

def objective_decision(trial):
    xgbm_n_estimators = trial.suggest_int('n_estimators', 300, 400, step=50)
    xgbm_learning_rate = trial.suggest_float('learning_rate', 0.02, 0.05, step=0.01)
    xgbm_max_depth = trial.suggest_int('max_depth', 3, 10, step=1)
    xgbm_gamma = trial.suggest_float('gamma', 0.0, 0.5, step=0.1)
    xgbm_min_child_weight = trial.suggest_int('min_child_weight', 1, 10, step=1)
    xgbm_subsample = trial.suggest_float('subsample', 0.5, 1.0, step=0.1)
    xgbm_colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1.0, step=0.1)

    classifier_obj = XGBClassifier(
        n_estimators=xgbm_n_estimators,
        learning_rate=xgbm_learning_rate,
        max_depth=xgbm_max_depth,
        gamma=xgbm_gamma,
        min_child_weight=xgbm_min_child_weight,
        subsample=xgbm_subsample,
        colsample_bytree=xgbm_colsample_bytree,
        use_label_encoder=False,
        eval_metric='logloss'
    )

    score = cross_val_score(classifier_obj, train_x, y_encoded, cv=5, n_jobs=-1, scoring='f1', error_score='raise')
    mean_f1 = score.mean()
    return mean_f1

# 최적화 실행
study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=100), direction="maximize")
study.optimize(objective_decision, n_trials=1) 

# 최적화 결과 보기
print("Best score:", study.best_value)
print("Best parameters:", study.best_params)

[I 2024-08-06 14:09:26,564] A new study created in memory with name: no-name-ee57da04-438a-45a6-b7b4-77381f667041
[I 2024-08-06 14:09:27,885] Trial 0 finished with value: 0.33781804171484087 and parameters: {'n_estimators': 350, 'learning_rate': 0.03, 'max_depth': 6, 'gamma': 0.5, 'min_child_weight': 1, 'subsample': 0.5, 'colsample_bytree': 0.9}. Best is trial 0 with value: 0.33781804171484087.


Best score: 0.33781804171484087
Best parameters: {'n_estimators': 350, 'learning_rate': 0.03, 'max_depth': 6, 'gamma': 0.5, 'min_child_weight': 1, 'subsample': 0.5, 'colsample_bytree': 0.9}


In [55]:
xgbm = XGBClassifier(**study.best_params)
xgbm.fit(train_x, y_encoded)

In [56]:
# y_encoded #0은 AbNormal, 1은 Normal

### RandomForest

In [57]:
def objective(trial):
    # 하이퍼파라미터 설정
    rf_n_estimators = trial.suggest_int('n_estimators', 200, 400, step=20)
    rf_max_depth = trial.suggest_int('max_depth', 2, 15, step=2)
    rf_min_samples_split = trial.suggest_int('min_samples_split', 5, 20, step=2)
    rf_max_leaf_nodes = trial.suggest_int('max_leaf_nodes', 2, 60, step=2)
    rf_min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10, step=1)
    rf_max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2'])

    classifier_obj = RandomForestClassifier(
        n_estimators=rf_n_estimators,
        max_depth=rf_max_depth,
        min_samples_split=rf_min_samples_split,
        max_leaf_nodes=rf_max_leaf_nodes,
        min_samples_leaf=rf_min_samples_leaf,
        max_features=rf_max_features,
        random_state=100
    )

    # 교차 검증을 통해 F1 스코어 계산
    score = cross_val_score(classifier_obj, train_x, y_encoded, cv=5, n_jobs=-1, scoring='f1')
    mean_f1 = score.mean()
    return mean_f1

# 최적화 실행
study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=100), direction="maximize")
study.optimize(objective, n_trials=2)

# 최적화 결과 보기
print("Best score:", study.best_value)
print("Best parameters:", study.best_params)

[I 2024-08-06 14:09:28,450] A new study created in memory with name: no-name-a8d5850e-615f-4055-a36a-d5d0d0fa57ef
[I 2024-08-06 14:09:29,291] Trial 0 finished with value: 0.33737676868004085 and parameters: {'n_estimators': 300, 'max_depth': 4, 'min_samples_split': 11, 'max_leaf_nodes': 52, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 0 with value: 0.33737676868004085.
[I 2024-08-06 14:09:30,361] Trial 1 finished with value: 0.3747475120276536 and parameters: {'n_estimators': 380, 'max_depth': 2, 'min_samples_split': 13, 'max_leaf_nodes': 54, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.3747475120276536.


Best score: 0.3747475120276536
Best parameters: {'n_estimators': 380, 'max_depth': 2, 'min_samples_split': 13, 'max_leaf_nodes': 54, 'min_samples_leaf': 3, 'max_features': 'sqrt'}


In [58]:
rf = RandomForestClassifier(**study.best_params)
rf.fit(train_x, y_encoded)

## 4. 제출하기

### 테스트 데이터 예측

In [84]:
df_test_y = pd.read_csv(os.path.join("submission.csv"))
df_test = pd.merge(X, df_test_y, "inner", on="Set ID")

In [85]:
df_test = df_test[df_columns]

In [86]:
df_test_preprocessed = preprocess_test_dataframe(df_test)
df_test_x = df_test_preprocessed.copy()

In [87]:
features = []

for col in df_test_x.columns:
    try:
        df_test_x.loc[:, col] = df_test_x[col].astype(int)
        features.append(col)
    except:
        continue

df_test_x = df_test_x[features]

In [88]:
train_x.shape

(4700, 108)

In [90]:
df_test_x.shape

(17361, 108)

In [91]:
cat_test_pred = cat.predict(df_test_x)
xgbm_test_pred = xgbm.predict(df_test_x)
rf_test_pred = rf.predict(df_test_x)

### voting 전 평가

In [92]:
# 학습데이터 70%, 평가데이터 30%로 데이터 분할
X_tra, X_dev, y_tra, y_dev = train_test_split(train_x, y_encoded, test_size=0.3, stratify=train_y, random_state=0)

In [93]:
y_dev

array([1, 1, 1, ..., 0, 1, 0])

In [94]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import f1_score

averaging = VotingClassifier(
    estimators = [('xgbm', xgbm), ('cat', cat),  ('rf', rf)], voting='soft', weights=[4, 4, 3])

averaging.fit(X_tra, y_tra)

ensemble_pred = averaging.predict(X_dev)

print(f1_score(y_dev, ensemble_pred))

0:	learn: 0.6926540	total: 3.36ms	remaining: 601ms
1:	learn: 0.6921623	total: 6.69ms	remaining: 595ms
2:	learn: 0.6915102	total: 9.98ms	remaining: 589ms
3:	learn: 0.6909843	total: 13.2ms	remaining: 582ms
4:	learn: 0.6904934	total: 16.3ms	remaining: 569ms
5:	learn: 0.6900438	total: 19.5ms	remaining: 566ms
6:	learn: 0.6896946	total: 22.6ms	remaining: 558ms
7:	learn: 0.6890610	total: 25.6ms	remaining: 550ms
8:	learn: 0.6886578	total: 28.9ms	remaining: 549ms
9:	learn: 0.6882925	total: 32ms	remaining: 545ms
10:	learn: 0.6878473	total: 35.4ms	remaining: 543ms
11:	learn: 0.6874885	total: 38.3ms	remaining: 536ms
12:	learn: 0.6870051	total: 41.6ms	remaining: 535ms
13:	learn: 0.6866883	total: 44.9ms	remaining: 532ms
14:	learn: 0.6864018	total: 46.8ms	remaining: 515ms
15:	learn: 0.6861072	total: 49.8ms	remaining: 511ms
16:	learn: 0.6854984	total: 53ms	remaining: 508ms
17:	learn: 0.6852464	total: 56.6ms	remaining: 510ms
18:	learn: 0.6847668	total: 59.9ms	remaining: 508ms
19:	learn: 0.6843348	total

### voting

In [None]:
from sklearn.ensemble import VotingClassifier

averaging_model = VotingClassifier(
    estimators = [('xgbm', xgbm), ('cat', cat),  ('rf', rf)], voting='soft', weights=[4, 4, 3])

averaging_model.fit(train_x, train_y)

ensemble_pred = averaging_model.predict(df_test_x)

0:	learn: 0.6927189	total: 3.57ms	remaining: 640ms
1:	learn: 0.6923048	total: 7.07ms	remaining: 629ms
2:	learn: 0.6918878	total: 10.2ms	remaining: 603ms
3:	learn: 0.6914771	total: 13.7ms	remaining: 603ms
4:	learn: 0.6909559	total: 17ms	remaining: 595ms
5:	learn: 0.6904760	total: 20.7ms	remaining: 600ms
6:	learn: 0.6900814	total: 23.9ms	remaining: 591ms
7:	learn: 0.6895801	total: 27.2ms	remaining: 585ms
8:	learn: 0.6891483	total: 30.8ms	remaining: 585ms
9:	learn: 0.6885133	total: 34.6ms	remaining: 587ms
10:	learn: 0.6880854	total: 38ms	remaining: 583ms
11:	learn: 0.6875359	total: 41.2ms	remaining: 577ms
12:	learn: 0.6872281	total: 45ms	remaining: 578ms
13:	learn: 0.6867938	total: 48.7ms	remaining: 578ms
14:	learn: 0.6863737	total: 52.6ms	remaining: 578ms
15:	learn: 0.6859325	total: 56.1ms	remaining: 575ms
16:	learn: 0.6854023	total: 60.1ms	remaining: 576ms
17:	learn: 0.6847763	total: 63.6ms	remaining: 573ms
18:	learn: 0.6843894	total: 67.3ms	remaining: 570ms
19:	learn: 0.6840840	total: 

In [None]:
ensemble_pred 

array(['AbNormal', 'AbNormal', 'AbNormal', ..., 'Normal', 'AbNormal',
       'Normal'], dtype=object)

### 제출 파일 작성

In [None]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["target"] = ensemble_pred

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

ValueError: Length of values (4700) does not match length of index (17361)

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**