# 제품 이상여부 판별 프로젝트

## 1. 데이터 불러오기

### 필수 라이브러리

### 엑셀 파일을 읽는 함수

읽어오는 속도가 느린 엑셀 파일을 위해 csv 파일로 변환하여 저장해 두고 사용합니다.

In [2]:
import os
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [3]:
def read_excel_file(file_path: str, header: int = None) -> pd.DataFrame:
    csv_file = file_path.replace(".xlsx", ".csv")

    if not os.path.exists(csv_file):
        print("Converting excel to csv...")
        if header:
            df = pd.read_excel(file_path, header=header)
        else:
            df = pd.read_excel(file_path)

        df.to_csv(csv_file, index=False)
        print(f"  {file_path} -> {csv_file}")
        return df
    else:
        print(f"  Reading {csv_file}")
        return pd.read_csv(csv_file, low_memory=False)

### 엑셀 파일들 읽어오기

In [4]:
ROOT_DIR = "data"
RANDOM_STATE = 110

X_Dam = read_excel_file(os.path.join(ROOT_DIR, "Dam dispensing.xlsx"), header=1)

X_AutoClave = read_excel_file(
    os.path.join(ROOT_DIR, "Auto clave.xlsx"), header=1
)

X_Fill1 = read_excel_file(
    os.path.join(ROOT_DIR, "Fill1 dispensing.xlsx"), header=1
)

X_Fill2 = read_excel_file(
    os.path.join(ROOT_DIR, "Fill2 dispensing.xlsx"), header=1
)

y = pd.read_csv(os.path.join(ROOT_DIR, "train_y.csv"))

  Reading data\Dam dispensing.csv
  Reading data\Auto clave.csv
  Reading data\Fill1 dispensing.csv
  Reading data\Fill2 dispensing.csv


### 데이터 병합

x 데이터 병합

In [5]:
# Rename columns
X_Dam.columns = [i + " - Dam" for i in X_Dam.columns]
X_AutoClave.columns = [i + " - AutoClave" for i in X_AutoClave.columns]
X_Fill1.columns = [i + " - Fill1" for i in X_Fill1.columns]
X_Fill2.columns = [i + " - Fill2" for i in X_Fill2.columns]
X_Dam = X_Dam.rename(columns={"Set ID - Dam": "Set ID"})
X_AutoClave = X_AutoClave.rename(columns={"Set ID - AutoClave": "Set ID"})
X_Fill1 = X_Fill1.rename(columns={"Set ID - Fill1": "Set ID"})
X_Fill2 = X_Fill2.rename(columns={"Set ID - Fill2": "Set ID"})

# Merge X
X = pd.merge(X_Dam, X_AutoClave, on="Set ID")
X = pd.merge(X, X_Fill1, on="Set ID")
X = pd.merge(X, X_Fill2, on="Set ID")
X = X.drop(X[X.duplicated(subset="Set ID")].index).reset_index(drop=True)
X

Unnamed: 0,Wip Line - Dam,Process Desc. - Dam,Equipment - Dam,Model.Suffix - Dam,Workorder - Dam,LOT ID - Dam,Set ID,Box ID - Dam,Collect Date - Dam,Insp. Seq No. - Dam,...,Collect Result.37 - Fill2,Unit Time.37 - Fill2,Judge Value.37 - Fill2,Collect Result.38 - Fill2,Unit Time.38 - Fill2,Judge Value.38 - Fill2,Collect Result.39 - Fill2,Unit Time.39 - Fill2,Judge Value.39 - Fill2,Unnamed: 131 - Fill2
0,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3F1X5847-2,OP753345013050000002,OP753345013050000002,,2023-05-04 08:57:23,1,...,1,,,1,,,0,,,
1,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3F1X5847-2,OP753345013050000003,OP753345013050000003,,2023-05-04 09:11:35,1,...,2,,,1,,,0,,,
2,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3F1X5847-2,OP753345013050000004,OP753345013050000004,,2023-05-04 09:13:19,1,...,3,,,1,,,0,,,
3,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3F1X5847-2,OP753345013050000005,OP753345013050000005,,2023-05-04 09:15:24,1,...,4,,,1,,,0,,,
4,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3F1X5847-2,OP753345013050000006,OP753345013050000006,,2023-05-04 09:17:27,1,...,5,,,1,,,0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57862,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334505,4F1XB738-1,OP753345054040002685,OP753345054040002685,,2024-04-28 18:30:42,1,...,11,,,435,,,1,,,0.0
57863,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,4F1XB738-1,OP753345054040002686,OP753345054040002686,,2024-04-28 18:31:43,1,...,5,,,436,,,1,,,0.0
57864,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334505,4F1XB738-1,OP753345054040002687,OP753345054040002687,,2024-04-28 18:32:49,1,...,12,,,437,,,1,,,0.0
57865,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,4F1XB738-1,OP753345054040002688,OP753345054040002688,,2024-04-28 18:33:51,1,...,6,,,438,,,1,,,0.0


In [6]:
# column_names = X.columns.tolist()
# print(column_names)

In [7]:
import pandas as pd
date_columns = [col for col in X.columns if 'Date' in col]
unnamed_columns = [col for col in X.columns if 'Unnamed' in col]
# wip_line_columns = [col for col in X.columns if 'Wip Line' in col]
# process_desc_columns = [col for col in X.columns if 'Process Desc.' in col]
# equipment_columns = [col for col in X.columns if 'Equipment' in col]
# model_suffix_columns = [col for col in X.columns if 'Model.Suffix' in col]
# workorder_columns = [col for col in X.columns if 'Workorder' in col]
lot_id_columns = [col for col in X.columns if 'LOT ID' in col]
box_id_columns = [col for col in X.columns if 'Box ID' in col]

columns_to_drop = (
    date_columns + 
    unnamed_columns + 
    # wip_line_columns + 
    # process_desc_columns + 
    # equipment_columns + 
    # model_suffix_columns + 
    # workorder_columns + 
    lot_id_columns + 
    box_id_columns
)

columns_to_drop = list(set(columns_to_drop))
X = X.drop(columns=columns_to_drop)
X

Unnamed: 0,Wip Line - Dam,Process Desc. - Dam,Equipment - Dam,Model.Suffix - Dam,Workorder - Dam,Set ID,Insp. Seq No. - Dam,Insp Judge Code - Dam,Collect Result - Dam,Unit Time - Dam,...,Judge Value.36 - Fill2,Collect Result.37 - Fill2,Unit Time.37 - Fill2,Judge Value.37 - Fill2,Collect Result.38 - Fill2,Unit Time.38 - Fill2,Judge Value.38 - Fill2,Collect Result.39 - Fill2,Unit Time.39 - Fill2,Judge Value.39 - Fill2
0,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3F1X5847-2,OP753345013050000002,1,OK,240.0,,...,,1,,,1,,,0,,
1,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3F1X5847-2,OP753345013050000003,1,OK,240.0,,...,,2,,,1,,,0,,
2,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3F1X5847-2,OP753345013050000004,1,OK,240.0,,...,,3,,,1,,,0,,
3,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3F1X5847-2,OP753345013050000005,1,OK,240.0,,...,,4,,,1,,,0,,
4,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3F1X5847-2,OP753345013050000006,1,OK,240.0,,...,,5,,,1,,,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57862,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334505,4F1XB738-1,OP753345054040002685,1,OK,1000.0,,...,,11,,,435,,,1,,
57863,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,4F1XB738-1,OP753345054040002686,1,OK,240.0,,...,,5,,,436,,,1,,
57864,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334505,4F1XB738-1,OP753345054040002687,1,OK,1000.0,,...,,12,,,437,,,1,,
57865,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,4F1XB738-1,OP753345054040002688,1,OK,240.0,,...,,6,,,438,,,1,,


In [8]:
set_id = X[['Set ID']]

In [9]:
from sklearn.preprocessing import LabelEncoder

non_numeric_columns = X.select_dtypes(include=['object']).columns

encoder = LabelEncoder()
encoded_df = pd.DataFrame(index=X.index)
for column in non_numeric_columns:
    encoded_df[column] = encoder.fit_transform(X[column])

X = X.drop(columns=non_numeric_columns)
X = pd.concat([X, encoded_df], axis=1)
X

Unnamed: 0,Insp. Seq No. - Dam,Collect Result - Dam,Unit Time - Dam,Judge Value - Dam,Collect Result.1 - Dam,Unit Time.1 - Dam,Judge Value.1 - Dam,Collect Result.2 - Dam,Unit Time.2 - Dam,Judge Value.2 - Dam,...,Collect Result.7 - Fill1,Judge Value.7 - Fill1,Wip Line - Fill2,Process Desc. - Fill2,Equipment - Fill2,Model.Suffix - Fill2,Workorder - Fill2,Insp Judge Code - Fill2,Collect Result.17 - Fill2,Judge Value.17 - Fill2
0,1,240.0,,,2.5,,,-90,,,...,3,1,0,0,0,0,0,0,1,1
1,1,240.0,,,2.5,,,-90,,,...,3,1,0,0,0,0,0,0,1,1
2,1,240.0,,,2.5,,,-90,,,...,3,1,0,0,0,0,0,0,1,1
3,1,240.0,,,2.5,,,-90,,,...,3,1,0,0,0,0,0,0,1,1
4,1,240.0,,,2.5,,,-90,,,...,3,1,0,0,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57862,1,1000.0,,,12.5,,,90,,,...,6,1,0,0,1,3,661,0,3,1
57863,1,240.0,,,2.5,,,-90,,,...,6,1,0,0,0,3,661,0,3,1
57864,1,1000.0,,,12.5,,,90,,,...,6,1,0,0,1,3,661,0,3,1
57865,1,240.0,,,2.5,,,-90,,,...,6,1,0,0,0,3,661,0,3,1


In [10]:
from sklearn.preprocessing import StandardScaler

X_columns = X.columns

scaler = StandardScaler()
X = scaler.fit_transform(X)
X = pd.DataFrame(X, columns=X_columns)

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [11]:
X = X.drop(columns=['Set ID'])
X = pd.concat([X, set_id], axis=1)
X

Unnamed: 0,Insp. Seq No. - Dam,Collect Result - Dam,Unit Time - Dam,Judge Value - Dam,Collect Result.1 - Dam,Unit Time.1 - Dam,Judge Value.1 - Dam,Collect Result.2 - Dam,Unit Time.2 - Dam,Judge Value.2 - Dam,...,Judge Value.7 - Fill1,Wip Line - Fill2,Process Desc. - Fill2,Equipment - Fill2,Model.Suffix - Fill2,Workorder - Fill2,Insp Judge Code - Fill2,Collect Result.17 - Fill2,Judge Value.17 - Fill2,Set ID
0,0.0,-0.786145,,,-0.786145,,,-0.786145,,,...,0.622841,0.0,0.0,-0.786087,-0.374311,-1.677825,0.0,-0.841222,0.622841,OP753345013050000002
1,0.0,-0.786145,,,-0.786145,,,-0.786145,,,...,0.622841,0.0,0.0,-0.786087,-0.374311,-1.677825,0.0,-0.841222,0.622841,OP753345013050000003
2,0.0,-0.786145,,,-0.786145,,,-0.786145,,,...,0.622841,0.0,0.0,-0.786087,-0.374311,-1.677825,0.0,-0.841222,0.622841,OP753345013050000004
3,0.0,-0.786145,,,-0.786145,,,-0.786145,,,...,0.622841,0.0,0.0,-0.786087,-0.374311,-1.677825,0.0,-0.841222,0.622841,OP753345013050000005
4,0.0,-0.786145,,,-0.786145,,,-0.786145,,,...,0.622841,0.0,0.0,-0.786087,-0.374311,-1.677825,0.0,-0.841222,0.622841,OP753345013050000006
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57862,0.0,1.272030,,,1.272030,,,1.272030,,,...,0.622841,0.0,0.0,1.272123,2.847982,1.916228,0.0,1.206996,0.622841,OP753345054040002685
57863,0.0,-0.786145,,,-0.786145,,,-0.786145,,,...,0.622841,0.0,0.0,-0.786087,2.847982,1.916228,0.0,1.206996,0.622841,OP753345054040002686
57864,0.0,1.272030,,,1.272030,,,1.272030,,,...,0.622841,0.0,0.0,1.272123,2.847982,1.916228,0.0,1.206996,0.622841,OP753345054040002687
57865,0.0,-0.786145,,,-0.786145,,,-0.786145,,,...,0.622841,0.0,0.0,-0.786087,2.847982,1.916228,0.0,1.206996,0.622841,OP753345054040002688


x 데이터와 y 데이터 병합

In [12]:
# Merge X and y
df_merged = pd.merge(X, y, "inner", on="Set ID")

# Drop columns with more than half of the values missing
drop_cols = []
for column in df_merged.columns:
    if (df_merged[column].notnull().sum() // 2) < df_merged[
        column
    ].isnull().sum():
        drop_cols.append(column)
df_merged = df_merged.drop(drop_cols, axis=1)
df_merged

Unnamed: 0,Insp. Seq No. - Dam,Collect Result - Dam,Collect Result.1 - Dam,Collect Result.2 - Dam,Collect Result.3 - Dam,Collect Result.4 - Dam,Collect Result.5 - Dam,Collect Result.6 - Dam,Collect Result.7 - Dam,Collect Result.8 - Dam,...,Wip Line - Fill2,Process Desc. - Fill2,Equipment - Fill2,Model.Suffix - Fill2,Workorder - Fill2,Insp Judge Code - Fill2,Collect Result.17 - Fill2,Judge Value.17 - Fill2,Set ID,target
0,0.0,-0.786145,-0.786145,-0.786145,-0.398000,0.0,0.0,0.0,0.786145,0.0,...,0.0,0.0,-0.786087,-0.374311,-1.677825,0.0,-0.841222,0.622841,OP753345013050000003,AbNormal
1,0.0,-0.786145,-0.786145,-0.786145,-0.398000,0.0,0.0,0.0,0.786145,0.0,...,0.0,0.0,-0.786087,-0.374311,-1.677825,0.0,-0.841222,0.622841,OP753345013050000004,AbNormal
2,0.0,-0.786145,-0.786145,-0.786145,-0.398000,0.0,0.0,0.0,0.786145,0.0,...,0.0,0.0,-0.786087,-0.374311,-1.677825,0.0,-0.841222,0.622841,OP753345013050000007,AbNormal
3,0.0,-0.786145,-0.786145,-0.786145,-0.398000,0.0,0.0,0.0,0.786145,0.0,...,0.0,0.0,-0.786087,-0.374311,-1.677825,0.0,-0.841222,0.622841,OP753345013050000011,AbNormal
4,0.0,-0.786145,-0.786145,-0.786145,-0.398000,0.0,0.0,0.0,0.786145,0.0,...,0.0,0.0,-0.786087,-0.374311,-1.677825,0.0,-0.841222,0.622841,OP753345013050000024,AbNormal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40501,0.0,-0.786145,-0.786145,-0.786145,2.666543,0.0,0.0,0.0,0.786145,0.0,...,0.0,0.0,-0.786087,2.847982,1.916228,0.0,1.206996,0.622841,OP753345054040002680,Normal
40502,0.0,1.272030,1.272030,1.272030,2.666543,0.0,0.0,0.0,-1.272030,0.0,...,0.0,0.0,1.272123,2.847982,1.916228,0.0,1.206996,0.622841,OP753345054040002681,Normal
40503,0.0,-0.786145,-0.786145,-0.786145,2.666543,0.0,0.0,0.0,0.786145,0.0,...,0.0,0.0,-0.786087,2.847982,1.916228,0.0,1.206996,0.622841,OP753345054040002682,Normal
40504,0.0,-0.786145,-0.786145,-0.786145,2.666543,0.0,0.0,0.0,0.786145,0.0,...,0.0,0.0,-0.786087,2.847982,1.916228,0.0,1.206996,0.622841,OP753345054040002684,Normal


In [13]:
filtered_columns = [col for col in df_merged.columns if 'Collect Date' in col]
df_filtered = df_merged[['Set ID'] + ['target'] + filtered_columns]

In [14]:
df_filtered

Unnamed: 0,Set ID,target
0,OP753345013050000003,AbNormal
1,OP753345013050000004,AbNormal
2,OP753345013050000007,AbNormal
3,OP753345013050000011,AbNormal
4,OP753345013050000024,AbNormal
...,...,...
40501,OP753345054040002680,Normal
40502,OP753345054040002681,Normal
40503,OP753345054040002682,Normal
40504,OP753345054040002684,Normal


### 언더 샘플링

데이타 불균형을 해결하기 위해 언더 샘플링을 진행합니다.

In [97]:
# normal_ratio = 1.0  # 1.0 means 1:1 ratio

# df_normal = df_merged[df_merged["target"] == "Normal"]
# df_abnormal = df_merged[df_merged["target"] == "AbNormal"]

# num_normal = len(df_normal)
# num_abnormal = len(df_abnormal)
# print(f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal}")

# df_normal = df_normal.sample(
#     n=int(num_abnormal * normal_ratio), replace=False, random_state=RANDOM_STATE
# )
# df_concat = pd.concat([df_normal, df_abnormal], axis=0).reset_index(drop=True)
# df_concat.value_counts("target")
# df_concat

In [98]:
df_merged = df_concat

### 데이터 분할

In [99]:
# df_concat = df_concat.sort_values(by=["Collect Date - Dam"])
df_train, df_val = train_test_split(
    df_concat,
    test_size=0.3,
    stratify=df_concat["target"],
    random_state=RANDOM_STATE,
)


def print_stats(df: pd.DataFrame):
    num_normal = len(df[df["target"] == "Normal"])
    num_abnormal = len(df[df["target"] == "AbNormal"])

    print(
        f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal}"
        + f" ratio: {num_abnormal/num_normal}"
    )


# Print statistics
print(f"  \tAbnormal\tNormal")
print_stats(df_train)
print_stats(df_val)

  	Abnormal	Normal
  Total: Normal: 1645, AbNormal: 1645 ratio: 1.0
  Total: Normal: 705, AbNormal: 705 ratio: 1.0


## 3. 모델 학습

### 모델 정의 

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier

import xgboost as xgb
import lightgbm as lgb

In [15]:
modellog = LogisticRegression(random_state=42)
modeldt = DecisionTreeClassifier(random_state=42)
modelrf = RandomForestClassifier(n_estimators=100, random_state=42)
modelsvc = SVC(kernel='rbf', C=1, gamma=0.1, random_state=42)

modelab = AdaBoostClassifier(n_estimators=100, learning_rate=1.0, random_state=42)
modelgb = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=3, random_state=42)
modelxgb = xgb.XGBClassifier(n_estimators=100, learning_rate=1.0, max_depth=3, use_label_encoder=False, eval_metric='mlogloss')
modellgbm = lgb.LGBMClassifier(n_estimators=100, learning_rate=1.0, max_depth=3, random_state=42)

modelknn = KNeighborsClassifier(n_neighbors=3)



modelkms = KMeans(n_clusters=3, random_state=42, n_init=20)

### 모델 학습

In [16]:
MODEL = modelrf

features = []

for col in df_train.columns:
    try:
        df_train[col] = df_train[col].astype(int)
        features.append(col)
    except:
        continue

if "Set ID" in features:
    features.remove("Set ID")

train_x = df_train[features]
train_y = df_train["target"]
test_x = df_val[features]
test_y = df_val["target"]

MODEL.fit(train_x, train_y)

val_features = [col for col in features if col in df_val.columns]
df_val_x = df_val[val_features]
df_val_y = df_val["target"]

predictions = MODEL.predict(df_val_x)

accuracy = accuracy_score(df_val_y, predictions)
print(f"Accuracy: {accuracy*100:.2f}%")

Accuracy: 53.26%


---

In [100]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Extract features and labels
X_train = df_train.drop(columns=['Set ID', 'target']).values
y_train = (df_train['target'] == 'AbNormal').astype(int).values

X_val = df_val.drop(columns=['Set ID', 'target']).values
y_val = (df_val['target'] == 'AbNormal').astype(int).values

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)

X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)

# Create TensorDataset and DataLoader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)


In [101]:
class DNN(nn.Module):
    def __init__(self, input_size):
        super(DNN, self).__init__()
        self.fc1 = nn.Linear(input_size, 65)
        self.fc2 = nn.Linear(65, 50)
        self.fc3 = nn.Linear(50, 35)
        self.fc4 = nn.Linear(35, 20)
        
        self.fc5 = nn.Linear(20, 35)
        self.fc6 = nn.Linear(35, 50)
        self.fc7 = nn.Linear(50, 65)
        self.fc8 = nn.Linear(65, 2)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        x = torch.relu(self.fc4(x))
        x = torch.relu(self.fc5(x))
        x = torch.relu(self.fc6(x))
        x = torch.relu(self.fc7(x))
        x = self.fc8(x)
        return x

input_size = X_train.shape[1]
model = DNN(input_size)

In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 100

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    print(f'Epoch {epoch+1}/{num_epochs}, '
          f'Train Loss: {running_loss/len(train_loader)}, '
          f'Validation Loss: {val_loss/len(val_loader)}, '
          f'Validation Accuracy: {correct/total*100:.2f}%')


In [58]:
model.eval()
y_true = []
y_pred = []
with torch.no_grad():
    for inputs, labels in val_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        y_true.extend(labels.cpu().numpy())
        y_pred.extend(predicted.cpu().numpy())

print("Classification Report:")
print(classification_report(y_true, y_pred, target_names=['Normal', 'AbNormal']))

print("Confusion Matrix:")
print(confusion_matrix(y_true, y_pred))

Classification Report:
              precision    recall  f1-score   support

      Normal       0.54      0.69      0.61       705
    AbNormal       0.58      0.42      0.48       705

    accuracy                           0.56      1410
   macro avg       0.56      0.56      0.55      1410
weighted avg       0.56      0.56      0.55      1410

Confusion Matrix:
[[489 216]
 [411 294]]


---

In [103]:
import torch
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

# Extract features and labels
X_train = df_train.drop(columns=['Set ID', 'target']).values
y_train = (df_train['target'] == 'AbNormal').astype(int).values

X_val = df_val.drop(columns=['Set ID', 'target']).values
y_val = (df_val['target'] == 'AbNormal').astype(int).values

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)

X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)

# Create TensorDataset and DataLoader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)


In [104]:
from pytorch_tabnet.tab_model import TabNetClassifier

# TabNet 모델 초기화
tabnet_model = TabNetClassifier()

# 모델 학습
tabnet_model.fit(
    X_train=X_train,
    y_train=y_train,
    eval_set=[(X_val, y_val)],
    eval_name=['val'],
    eval_metric=['accuracy'],
    max_epochs=50,
    patience=10,
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False
)



epoch 0  | loss: 0.88614 | val_accuracy: 0.50567 |  0:00:00s
epoch 1  | loss: 0.77679 | val_accuracy: 0.46525 |  0:00:00s
epoch 2  | loss: 0.71979 | val_accuracy: 0.49645 |  0:00:00s
epoch 3  | loss: 0.70599 | val_accuracy: 0.54255 |  0:00:00s
epoch 4  | loss: 0.69455 | val_accuracy: 0.54681 |  0:00:00s
epoch 5  | loss: 0.69184 | val_accuracy: 0.53972 |  0:00:00s
epoch 6  | loss: 0.68697 | val_accuracy: 0.54397 |  0:00:00s
epoch 7  | loss: 0.68666 | val_accuracy: 0.55035 |  0:00:00s
epoch 8  | loss: 0.68525 | val_accuracy: 0.55674 |  0:00:01s
epoch 9  | loss: 0.68555 | val_accuracy: 0.54894 |  0:00:01s
epoch 10 | loss: 0.68404 | val_accuracy: 0.55035 |  0:00:01s
epoch 11 | loss: 0.68494 | val_accuracy: 0.55816 |  0:00:01s
epoch 12 | loss: 0.68221 | val_accuracy: 0.55035 |  0:00:01s
epoch 13 | loss: 0.68322 | val_accuracy: 0.5461  |  0:00:01s
epoch 14 | loss: 0.68242 | val_accuracy: 0.54184 |  0:00:01s
epoch 15 | loss: 0.68601 | val_accuracy: 0.54184 |  0:00:01s
epoch 16 | loss: 0.68093



In [61]:
# Validation 데이터셋 예측
preds_val = tabnet_model.predict(X_val)

# 평가 결과 출력
from sklearn.metrics import classification_report, confusion_matrix

print("Classification Report:")
print(classification_report(y_val, preds_val, target_names=['Normal', 'AbNormal']))

print("Confusion Matrix:")
print(confusion_matrix(y_val, preds_val))


Classification Report:
              precision    recall  f1-score   support

      Normal       0.53      0.66      0.59       705
    AbNormal       0.55      0.43      0.48       705

    accuracy                           0.54      1410
   macro avg       0.54      0.54      0.54      1410
weighted avg       0.54      0.54      0.54      1410

Confusion Matrix:
[[462 243]
 [403 302]]


## 4. 제출하기

### 테스트 데이터 예측

테스트 데이터 불러오기

In [19]:
df_test_y = pd.read_csv(os.path.join("submission.csv"))

In [20]:
df_test = pd.merge(X, df_test_y, "inner", on="Set ID")
df_test_x = df_test[features]

for col in df_test_x.columns:
    try:
        df_test_x.loc[:, col] = df_test_x[col].astype(int)
    except:
        continue

In [21]:
test_pred = model.predict(df_test_x)
test_pred

array(['AbNormal', 'AbNormal', 'AbNormal', ..., 'Normal', 'AbNormal',
       'Normal'], dtype=object)

### 제출 파일 작성

In [22]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["target"] = test_pred

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**