In [51]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, recall_score, accuracy_score, precision_score
from imblearn.over_sampling import SMOTE, SMOTENC, SMOTEN, BorderlineSMOTE, SVMSMOTE, ADASYN, KMeansSMOTE
import catboost as cb

ROOT_DIR = "data"
RANDOM_STATE = 110

train_data = pd.read_csv(os.path.join(ROOT_DIR, "train_mod.csv"))

def cat2num(X):
    non_numeric_columns = X.select_dtypes(include=['object']).columns
    encoded_columns = {}

    for column in non_numeric_columns:
        encoder = LabelEncoder()
        encoded_columns[column] = encoder.fit_transform(X[column])

    encoded_df = pd.DataFrame(encoded_columns, index=X.index)
    X = X.drop(columns=non_numeric_columns)
    X = pd.concat([X, encoded_df], axis=1)

    return X

def condition(df):
    df = cat2num(df)
    
    df['Condition1'] = df['Head Clean Position Z Collect Result_Dam'] <= 120
    df['Condition1'] = df['Condition1'].astype(int)
    
    df['Condition2'] = (df['HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam'] >= 284) & (df['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam'] >= 1000)
    df['Condition2'] = df['Condition2'].astype(int)

    try:
        df = df[['Receip No Collect Result_Dam', 
                 '1st Pressure 1st Pressure Unit Time_AutoClave', 
                 '3rd Pressure Collect Result_AutoClave', 
                 'Chamber Temp. Unit Time_AutoClave', 
                 'Receip No Collect Result_Fill1', 
                 'Receip No Collect Result_Fill2',
                 'Condition1', 
                 'Condition2',
                 'target']]
    except KeyError:
        df = df[['Receip No Collect Result_Dam', 
                 '1st Pressure 1st Pressure Unit Time_AutoClave', 
                 '3rd Pressure Collect Result_AutoClave', 
                 'Chamber Temp. Unit Time_AutoClave', 
                 'Receip No Collect Result_Fill1', 
                 'Receip No Collect Result_Fill2',
                 'Condition1', 
                 'Condition2']]
    
    return df

train_data = condition(train_data)

print(train_data.dtypes)
train_data

Receip No Collect Result_Dam                       int64
1st Pressure 1st Pressure Unit Time_AutoClave      int64
3rd Pressure Collect Result_AutoClave            float64
Chamber Temp. Unit Time_AutoClave                  int64
Receip No Collect Result_Fill1                     int64
Receip No Collect Result_Fill2                     int64
Condition1                                         int32
Condition2                                         int32
target                                             int32
dtype: object


Unnamed: 0,Receip No Collect Result_Dam,1st Pressure 1st Pressure Unit Time_AutoClave,3rd Pressure Collect Result_AutoClave,Chamber Temp. Unit Time_AutoClave,Receip No Collect Result_Fill1,Receip No Collect Result_Fill2,Condition1,Condition2,target
0,1,240,0.499,361,1,1,0,0,1
1,1,241,0.498,483,1,1,0,0,1
2,1,241,0.498,363,1,1,0,0,1
3,1,241,0.500,483,1,1,0,0,1
4,1,240,0.498,480,1,1,0,0,1
...,...,...,...,...,...,...,...,...,...
40501,1,241,0.500,483,1,1,0,0,1
40502,1,240,0.498,361,1,1,0,0,1
40503,1,241,0.498,363,1,1,0,0,1
40504,1,241,0.499,483,1,1,0,0,1


In [52]:
# Scale the data
scaler = MinMaxScaler()
columns_to_scale = [col for col in train_data.columns if col != 'target']
train_data[columns_to_scale] = scaler.fit_transform(train_data[columns_to_scale])

# SMOTE for oversampling the minority class
X = train_data.drop(columns=['target'])
y = train_data['target']

In [67]:
# 기본 SMOTE
smote = SMOTE(sampling_strategy='auto', random_state=RANDOM_STATE)
X_resampled, y_resampled = smote.fit_resample(X, y)
    # F1 Score: 0.6494
    # Recall: 0.7532
    # Accuracy: 0.5906
    # Precision: 0.5707

# SMOTENC: 연속형 및 범주형 데이터 혼합 시 사용
categorical_features = [0, 4, 5, 6, 7]  # 범주형 특성의 인덱스
smote_nc = SMOTENC(categorical_features=categorical_features, sampling_strategy='auto', random_state=RANDOM_STATE)
X_resampled, y_resampled = smote_nc.fit_resample(X, y)
    # F1 Score: 0.6477
    # Recall: 0.7496
    # Accuracy: 0.5895
    # Precision: 0.5702


# SMOTEN: 범주형 데이터에만 적용
smote_n = SMOTEN(sampling_strategy='auto', random_state=RANDOM_STATE)
X_resampled, y_resampled = smote_n.fit_resample(X, y)
    # F1 Score: 0.6519
    # Recall: 0.7526
    # Accuracy: 0.5954
    # Precision: 0.5749


# BorderlineSMOTE: 경계선 근처의 샘플을 이용한 SMOTE 변형
borderline_smote = BorderlineSMOTE(sampling_strategy='auto', random_state=RANDOM_STATE)
X_resampled, y_resampled = borderline_smote.fit_resample(X, y)
    # F1 Score: 0.7891
    # Recall: 0.6937
    # Accuracy: 0.8133
    # Precision: 0.9149

# SVMSMOTE: SVM을 이용한 SMOTE 변형
svm_smote = SVMSMOTE(sampling_strategy='auto', random_state=RANDOM_STATE)
X_resampled, y_resampled = svm_smote.fit_resample(X, y)
    # F1 Score: 0.8185
    # Recall: 0.8232
    # Accuracy: 0.8162
    # Precision: 0.8139

# ADASYN: 밀도에 기반하여 샘플을 생성하는 방법
adasyn = ADASYN(sampling_strategy='auto', random_state=RANDOM_STATE)
X_resampled, y_resampled = adasyn.fit_resample(X, y)
    # F1 Score: 0.6205
    # Recall: 0.6843
    # Accuracy: 0.5798
    # Precision: 0.5676

# KMeansSMOTE: 클러스터링을 사용한 SMOTE 변형
# cluster_balance_threshold를 낮추어, 더 적은 수의 소수 클래스 샘플이 있는 클러스터에서도 오버샘플링을 허용
kmeans_smote = KMeansSMOTE(sampling_strategy='auto', random_state=RANDOM_STATE, cluster_balance_threshold=0.01)
X_resampled, y_resampled = kmeans_smote.fit_resample(X, y)
    # F1 Score: 0.9529
    # Recall: 0.9618
    # Accuracy: 0.9520
    # Precision: 0.9442
# kmeans_args 매개변수를 통해 클러스터의 수를 늘리거나 줄임
kmeans_smote = KMeansSMOTE(sampling_strategy='auto', random_state=RANDOM_STATE, kmeans_estimator=20)
X_resampled, y_resampled = kmeans_smote.fit_resample(X, y)
    # F1 Score: 0.9726
    # Recall: 0.9994
    # Accuracy: 0.9716
    # Precision: 0.9471
#둘 다 적용
kmeans_smote = KMeansSMOTE(sampling_strategy='auto', random_state=RANDOM_STATE, cluster_balance_threshold=0.01, kmeans_estimator=20)
X_resampled, y_resampled = kmeans_smote.fit_resample(X, y)
    # F1 Score: 0.9693
    # Recall: 0.9954
    # Accuracy: 0.9683
    # Precision: 0.9445

In [70]:
# Split the data into features and target
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled,
    y_resampled,
    test_size=0.3,
    random_state=RANDOM_STATE,
)

# Recursive Feature Elimination (RFE) for backward feature selection
model = cb.CatBoostClassifier(
    depth=6,
    iterations=400,
    l2_leaf_reg=5,
    learning_rate=0.01,
    verbose=0  # No output during training
)

# Train final model
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)

f1 = f1_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)

print(f"F1 Score: {f1:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")

F1 Score: 0.9693
Recall: 0.9954
Accuracy: 0.9683
Precision: 0.9445


In [None]:
# Process test data
test_data = pd.read_csv(os.path.join(ROOT_DIR, "test_mod.csv"))
test_data = condition(test_data)

# Scale the test data
test_data[columns_to_scale] = scaler.transform(test_data[columns_to_scale])

# Select the same features as the training data
test_x_rfe = test_data[X_train.columns]

# Predict on test data
y_pred = model.predict(test_x_rfe)

# Prepare submission
df_sub = pd.read_csv("submission.csv")
df_sub["target"] = y_pred
df_sub['target'] = df_sub['target'].map({1: 'Normal', 0: 'AbNormal'})

# Calculate the ratio of abnormal cases
counts = df_sub['target'].value_counts()
ratio = counts['AbNormal'] / (counts['AbNormal'] + counts['Normal'])
print("The ratio of abnormal is:", ratio)

# Save the submission file
df_sub.to_csv("submission.csv", index=False)

In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, recall_score, accuracy_score, precision_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import catboost as cb

# GAN 모델 정의
class Generator(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(Generator, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Linear(256, output_dim),
            nn.Tanh(),
        )

    def forward(self, x):
        return self.model(x)

class Discriminator(nn.Module):
    def __init__(self, input_dim):
        super(Discriminator, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        return self.model(x)

# 데이터 전처리 및 GAN 학습
ROOT_DIR = "data"
RANDOM_STATE = 110

train_data = pd.read_csv(os.path.join(ROOT_DIR, "train_mod.csv"))

def cat2num(X):
    non_numeric_columns = X.select_dtypes(include=['object']).columns
    encoded_columns = {}

    for column in non_numeric_columns:
        encoder = LabelEncoder()
        encoded_columns[column] = encoder.fit_transform(X[column])

    encoded_df = pd.DataFrame(encoded_columns, index=X.index)
    X = X.drop(columns=non_numeric_columns)
    X = pd.concat([X, encoded_df], axis=1)

    return X

def condition(df):
    df = cat2num(df)
    
    df['Condition1'] = df['Head Clean Position Z Collect Result_Dam'] <= 120
    df['Condition1'] = df['Condition1'].astype(int)
    
    df['Condition2'] = (df['HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam'] >= 284) & (df['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam'] >= 1000)
    df['Condition2'] = df['Condition2'].astype(int)

    try:
        df = df[['Receip No Collect Result_Dam', 
                 '1st Pressure 1st Pressure Unit Time_AutoClave', 
                 '3rd Pressure Collect Result_AutoClave', 
                 'Chamber Temp. Unit Time_AutoClave', 
                 'Receip No Collect Result_Fill1', 
                 'Receip No Collect Result_Fill2',
                 'Condition1', 
                 'Condition2',
                 'target']]
    except KeyError:
        df = df[['Receip No Collect Result_Dam', 
                 '1st Pressure 1st Pressure Unit Time_AutoClave', 
                 '3rd Pressure Collect Result_AutoClave', 
                 'Chamber Temp. Unit Time_AutoClave', 
                 'Receip No Collect Result_Fill1', 
                 'Receip No Collect Result_Fill2',
                 'Condition1', 
                 'Condition2']]
    
    return df

train_data = condition(train_data)

# Scale the data
scaler = MinMaxScaler()
columns_to_scale = [col for col in train_data.columns if col != 'target']
train_data[columns_to_scale] = scaler.fit_transform(train_data[columns_to_scale])

# 소수 클래스 데이터만 사용하여 GAN 학습
df_minority = train_data[train_data['target'] == 0]
X_minority = df_minority.drop(columns=['target']).values
df_majority = train_data[train_data['target'] == 1]

input_dim = X_minority.shape[1]
latent_dim = 16  # 잠재 공간의 차원

generator = Generator(input_dim=latent_dim, output_dim=input_dim)
discriminator = Discriminator(input_dim=input_dim)

criterion = nn.BCELoss()
optimizer_G = optim.Adam(generator.parameters(), lr=0.0002)
optimizer_D = optim.Adam(discriminator.parameters(), lr=0.0002)

num_epochs = 10000
batch_size = 64
X_minority_tensor = torch.tensor(X_minority, dtype=torch.float32)
data_loader = DataLoader(X_minority_tensor, batch_size=batch_size, shuffle=True)

for epoch in range(num_epochs):
    for real_samples in data_loader:
        batch_size = real_samples.size(0)
        
        # 진짜 샘플에 대한 라벨은 1
        real_labels = torch.ones(batch_size, 1)
        # 가짜 샘플에 대한 라벨은 0
        fake_labels = torch.zeros(batch_size, 1)

        # Discriminator 학습
        optimizer_D.zero_grad()
        outputs = discriminator(real_samples)
        d_loss_real = criterion(outputs, real_labels)

        z = torch.randn(batch_size, latent_dim)
        fake_samples = generator(z)
        outputs = discriminator(fake_samples.detach())
        d_loss_fake = criterion(outputs, fake_labels)

        d_loss = d_loss_real + d_loss_fake
        d_loss.backward()
        optimizer_D.step()

        # Generator 학습
        optimizer_G.zero_grad()
        z = torch.randn(batch_size, latent_dim)
        fake_samples = generator(z)
        outputs = discriminator(fake_samples)
        g_loss = criterion(outputs, real_labels)
        g_loss.backward()
        optimizer_G.step()

    if epoch % 1000 == 0:
        print(f"Epoch [{epoch}/{num_epochs}] - D Loss: {d_loss.item():.4f}, G Loss: {g_loss.item():.4f}")

# 다수 클래스 수에 맞춰 소수 클래스 오버샘플링
num_majority = len(df_majority)
num_minority_to_generate = num_majority - len(df_minority)

z = torch.randn(num_minority_to_generate, latent_dim)
generated_samples = generator(z).detach().numpy()
generated_df = pd.DataFrame(generated_samples, columns=columns_to_scale)
generated_df['target'] = 0

# 원래 소수 클래스 데이터와 생성된 데이터를 결합
oversampled_data = pd.concat([df_majority, df_minority, generated_df], axis=0).reset_index(drop=True)

# Split the data into features and target
X = oversampled_data.drop(columns=['target'])
y = oversampled_data['target']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=RANDOM_STATE,
)

# CatBoost 모델 학습
model = cb.CatBoostClassifier(
    depth=6,
    iterations=400,
    l2_leaf_reg=5,
    learning_rate=0.01,
    verbose=0  # No output during training
)

model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)

f1 = f1_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)

print(f"F1 Score: {f1:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")

# Test 데이터에 대한 처리 및 예측
test_data = pd.read_csv(os.path.join(ROOT_DIR, "test_mod.csv"))
test_data = condition(test_data)

test_data[columns_to_scale] = scaler.transform(test_data[columns_to_scale])
test_x = test_data[X_train.columns]

y_pred = model.predict(test_x)

# 제출 파일 준비
df_sub = pd.read_csv("submission.csv")
df_sub["target"] = y_pred
df_sub['target'] = df_sub['target'].map({1: 'Normal', 0: 'AbNormal'})

# 비정상 케이스 비율 계산
counts = df_sub['target'].value_counts()
ratio = counts['AbNormal'] / (counts['AbNormal'] + counts['Normal'])
print("The ratio of abnormal is:", ratio)

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

Epoch [0/10000] - D Loss: 1.2746, G Loss: 0.6387
Epoch [1000/10000] - D Loss: 1.3296, G Loss: 0.6952
Epoch [2000/10000] - D Loss: 1.3418, G Loss: 0.7197
Epoch [3000/10000] - D Loss: 1.3839, G Loss: 0.7617
Epoch [4000/10000] - D Loss: 1.3912, G Loss: 0.7099
Epoch [5000/10000] - D Loss: 1.1336, G Loss: 1.1088
Epoch [6000/10000] - D Loss: 0.9867, G Loss: 1.1017
Epoch [7000/10000] - D Loss: 1.1094, G Loss: 0.5506
Epoch [8000/10000] - D Loss: 1.0654, G Loss: 0.5339
Epoch [9000/10000] - D Loss: 0.7271, G Loss: 2.1135
F1 Score: 0.9713
Recall: 0.9992
Accuracy: 0.9703
Precision: 0.9449
The ratio of abnormal is: 0.0017856114279131386
