In [3]:
import os
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from pytorch_tabnet.tab_model import TabNetClassifier

RANDOM_STATE = 110

# 데이터 불러오기
ROOT_DIR = "data"

def read_excel_file(file_path: str, header: int = None) -> pd.DataFrame:
    csv_file = file_path.replace(".xlsx", ".csv")

    if not os.path.exists(csv_file):
        print("Converting excel to csv...")
        if header:
            df = pd.read_excel(file_path, header=header)
        else:
            df = pd.read_excel(file_path)

        df.to_csv(csv_file, index=False)
        print(f"  {file_path} -> {csv_file}")
        return df
    else:
        print(f"  Reading {csv_file}")
        return pd.read_csv(csv_file, low_memory=False)

X_Dam = read_excel_file(os.path.join(ROOT_DIR, "Dam dispensing.xlsx"), header=1)
X_AutoClave = read_excel_file(os.path.join(ROOT_DIR, "Auto clave.xlsx"), header=1)
X_Fill1 = read_excel_file(os.path.join(ROOT_DIR, "Fill1 dispensing.xlsx"), header=1)
X_Fill2 = read_excel_file(os.path.join(ROOT_DIR, "Fill2 dispensing.xlsx"), header=1)
y = pd.read_csv(os.path.join(ROOT_DIR, "train_y.csv"))

# 컬럼 이름 변경
X_Dam.columns = [i + " - Dam" for i in X_Dam.columns]
X_AutoClave.columns = [i + " - AutoClave" for i in X_AutoClave.columns]
X_Fill1.columns = [i + " - Fill1" for i in X_Fill1.columns]
X_Fill2.columns = [i + " - Fill2" for i in X_Fill2.columns]
X_Dam = X_Dam.rename(columns={"Set ID - Dam": "Set ID"})
X_AutoClave = X_AutoClave.rename(columns={"Set ID - AutoClave": "Set ID"})
X_Fill1 = X_Fill1.rename(columns={"Set ID - Fill1": "Set ID"})
X_Fill2 = X_Fill2.rename(columns={"Set ID - Fill2": "Set ID"})

# 데이터 병합
X = pd.merge(X_Dam, X_AutoClave, on="Set ID")
X = pd.merge(X, X_Fill1, on="Set ID")
X = pd.merge(X, X_Fill2, on="Set ID")
X = X.drop(X[X.duplicated(subset="Set ID")].index).reset_index(drop=True)

# 날짜 컬럼 제거
date_columns = [col for col in X.columns if 'Date' in col]
X = X.drop(columns=(date_columns))

# ID 분리
set_id = X[['Set ID']]

# 비수치형 컬럼 인코딩
non_numeric_columns = X.select_dtypes(include=['object']).columns
encoder = LabelEncoder()
encoded_df = pd.DataFrame(index=X.index)
for column in non_numeric_columns:
    encoded_df[column] = encoder.fit_transform(X[column])

X = X.drop(columns=non_numeric_columns)
X = pd.concat([X, encoded_df], axis=1)

# 스케일링
scaler = StandardScaler()
X_columns = X.columns
X = scaler.fit_transform(X)
X = pd.DataFrame(X, columns=X_columns)

# Set ID 추가
X = X.drop(columns=['Set ID'])
X = pd.concat([X, set_id], axis=1)

# X와 y 병합
df_merged = pd.merge(X, y, "inner", on="Set ID")

# 결측치가 절반 이상인 컬럼 제거
drop_cols = [column for column in df_merged.columns if (df_merged[column].notnull().sum() // 2) < df_merged[column].isnull().sum()]
df_merged = df_merged.drop(drop_cols, axis=1)

# Lot ID 컬럼 제거
df_merged = df_merged.drop("LOT ID - Dam", axis=1)

# 데이터 분리
normal_ratio = 1.0  # 1:1 비율
df_normal = df_merged[df_merged["target"] == "Normal"]
df_abnormal = df_merged[df_merged["target"] == "AbNormal"]

num_abnormal = len(df_abnormal)
df_normal = df_normal.sample(n=int(num_abnormal * normal_ratio), replace=False, random_state=RANDOM_STATE)
df_concat = pd.concat([df_normal, df_abnormal], axis=0).reset_index(drop=True)

df_train, df_val = train_test_split(df_concat, test_size=0.3, stratify=df_concat["target"], random_state=RANDOM_STATE)

# 피처 목록 생성
features = [col for col in df_train.columns if col not in ['Set ID', 'target']]

train_x = df_train[features].values
train_y = (df_train["target"] == "AbNormal").astype(int).values
test_x = df_val[features].values
test_y = (df_val["target"] == "AbNormal").astype(int).values


  Reading data\Dam dispensing.csv
  Reading data\Auto clave.csv
  Reading data\Fill1 dispensing.csv
  Reading data\Fill2 dispensing.csv


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [4]:
from sklearn.ensemble import RandomForestClassifier
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

# 특성 선택을 위한 랜덤 포레스트 모델 정의
rf = RandomForestClassifier(n_estimators=50, random_state=RANDOM_STATE, n_jobs=-1)

# Sequential Feature Selector 정의
sfs = SFS(rf,
          k_features='best',
          forward=True,
          floating=False,
          scoring='accuracy',
          cv=3,  # 교차 검증 폴드 수를 줄여서 실행 시간 단축
          n_jobs=-1)  # 병렬 처리 활성화

# 특성 선택 수행
sfs = sfs.fit(train_x, train_y)

# 선택된 특성 목록
selected_features = [features[i] for i in sfs.k_feature_idx_]
print("Selected features:", selected_features)

# 선택된 특성을 사용하여 데이터 재구성
train_x_selected = df_train[selected_features].values
test_x_selected = df_val[selected_features].values

# TabNet 모델 정의 및 학습
tabnet_model = TabNetClassifier(seed=RANDOM_STATE)

tabnet_model.fit(
    X_train=train_x_selected,
    y_train=train_y,
    eval_set=[(test_x_selected, test_y)],
    eval_name=['val'],
    eval_metric=['accuracy'],
    max_epochs=50,
    patience=10,
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False
)

Selected features: ['Insp. Seq No. - Dam', 'Collect Result.3 - Dam', 'Collect Result.4 - Dam', 'Collect Result.5 - Dam', 'Collect Result.10 - Dam', 'Collect Result.13 - Dam', 'Collect Result.19 - Dam', 'Collect Result.31 - Dam', 'Collect Result.32 - Dam', 'Collect Result.34 - Dam', 'Collect Result.35 - Dam', 'Collect Result.36 - Dam', 'Collect Result.37 - Dam', 'Collect Result.38 - Dam', 'Collect Result.42 - Dam', 'Collect Result.46 - Dam', 'Collect Result.50 - Dam', 'Collect Result.54 - Dam', 'Collect Result.55 - Dam', 'Collect Result.56 - Dam', 'Collect Result.57 - Dam', 'Collect Result.59 - Dam', 'Collect Result.60 - Dam', 'Collect Result.61 - Dam', 'Collect Result.62 - Dam', 'Collect Result.63 - Dam', 'Collect Result.64 - Dam', 'Collect Result.65 - Dam', 'Collect Result.67 - Dam', 'Collect Result.68 - Dam', 'Collect Result.69 - Dam', 'Insp. Seq No. - AutoClave', 'Collect Result - AutoClave', 'Collect Result.1 - AutoClave', 'Unit Time.1 - AutoClave', 'Unit Time.2 - AutoClave', 'Coll



epoch 0  | loss: 0.89372 | val_accuracy: 0.47943 |  0:00:01s
epoch 1  | loss: 0.74403 | val_accuracy: 0.52482 |  0:00:01s
epoch 2  | loss: 0.72491 | val_accuracy: 0.52624 |  0:00:01s
epoch 3  | loss: 0.70643 | val_accuracy: 0.54681 |  0:00:01s
epoch 4  | loss: 0.6954  | val_accuracy: 0.5383  |  0:00:01s
epoch 5  | loss: 0.68811 | val_accuracy: 0.53262 |  0:00:01s
epoch 6  | loss: 0.68865 | val_accuracy: 0.53759 |  0:00:01s
epoch 7  | loss: 0.68379 | val_accuracy: 0.54823 |  0:00:02s
epoch 8  | loss: 0.67879 | val_accuracy: 0.5539  |  0:00:02s
epoch 9  | loss: 0.67939 | val_accuracy: 0.55248 |  0:00:02s
epoch 10 | loss: 0.68374 | val_accuracy: 0.54326 |  0:00:02s
epoch 11 | loss: 0.68045 | val_accuracy: 0.53901 |  0:00:02s
epoch 12 | loss: 0.68199 | val_accuracy: 0.53901 |  0:00:02s
epoch 13 | loss: 0.67709 | val_accuracy: 0.54823 |  0:00:02s
epoch 14 | loss: 0.67654 | val_accuracy: 0.55106 |  0:00:02s
epoch 15 | loss: 0.67587 | val_accuracy: 0.55532 |  0:00:02s
epoch 16 | loss: 0.67449



In [6]:
# 검증 데이터 예측
preds_val = tabnet_model.predict(train_x_selected)

# 평가 결과 출력
print("Classification Report:")
print(classification_report(test_y, preds_val, target_names=['Normal', 'AbNormal']))

print("Confusion Matrix:")
print(confusion_matrix(test_y, preds_val))

Classification Report:


ValueError: Found input variables with inconsistent numbers of samples: [1410, 3290]

In [14]:
# 레이블 매핑
label_mapping = {0: 'Normal', 1: 'AbNormal'}

In [16]:
# 테스트 데이터 예측
test_pred = tabnet_model.predict(df_test_x)

# 숫자 레이블을 문자열로 변환
test_pred_labels = [label_mapping[pred] for pred in test_pred]

# 제출 파일 준비
df_sub = pd.read_csv("submission.csv")
df_sub["target"] = test_pred_labels

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)