In [None]:
# Mount Google Drive at /content/drive. google.colab is only importable inside
# a Colab runtime, so this cell is Colab-specific.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from time import time

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler

from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, roc_curve, auc

import warnings
warnings.filterwarnings(action='ignore')

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [None]:
# Load the pre-filtered, merged and encoded dataset into `df`.
# NOTE(review): the path is relative to the kernel's working directory, not to
# the Drive mount point (/content/drive) -- confirm the CSV is actually there.
file_path = './filtered_merged_encoded_data.csv'
df = pd.read_csv(file_path)

In [None]:
# Show every column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24867 entries, 0 to 24866
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   HY_YN             24867 non-null  int64  
 1   AGE               24867 non-null  float64
 2   BLDS              24867 non-null  float64
 3   TOT_CHOLE         24867 non-null  float64
 4   GAMMA_GTP         24867 non-null  float64
 5   SCR_CNT           24867 non-null  float64
 6   T_IN_LOS          24867 non-null  float64
 7   T_OUT_LOS         24867 non-null  float64
 8   BMI               24867 non-null  float64
 9   Liver_Enzyme_Avg  24867 non-null  float64
 10  MAP               24867 non-null  float64
 11  SEX_2             24867 non-null  bool   
 12  SMK_STAT_1        24867 non-null  bool   
 13  SMK_STAT_2        24867 non-null  bool   
 14  SMK_STAT_3        24867 non-null  bool   
 15  DRNK_HABIT_2      24867 non-null  bool   
 16  DRNK_HABIT_3      24867 non-null  bool  

# *정규화*
Scaler comparison: StandardScaler vs. MinMaxScaler vs. RobustScaler

In [None]:
# Inspect the distribution of every numeric feature (one histogram per column).
# `plt` comes from the notebook's top-level import cell.
columns = ['AGE', 'BLDS', 'TOT_CHOLE', 'GAMMA_GTP', 'SCR_CNT', 'T_IN_LOS', 'T_OUT_LOS', 'BMI', 'Liver_Enzyme_Avg', 'MAP']

# Derive the grid from the number of columns instead of hard-coding 5x2, so
# adding or removing a column cannot overflow the subplot grid.
n_grid_cols = 2
n_grid_rows = max(1, -(-len(columns) // n_grid_cols))  # ceiling division

fig, axes = plt.subplots(n_grid_rows, n_grid_cols, figsize=(15, 12), squeeze=False)
flat_axes = axes.ravel()

for ax, col in zip(flat_axes, columns):
    ax.hist(df[col], bins=30, alpha=0.7, edgecolor='black')
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

# Hide any panel left over when the column count does not fill the grid.
for ax in flat_axes[len(columns):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()


In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

# Columns scaled with MinMaxScaler
minmax_columns = ['AGE', 'SCR_CNT']
# Columns scaled with StandardScaler
standard_columns = ['BLDS', 'TOT_CHOLE', 'BMI', 'MAP']
# Columns scaled with RobustScaler
robust_columns = ['GAMMA_GTP', 'T_IN_LOS', 'T_OUT_LOS', 'Liver_Enzyme_Avg']

# Fit the scalers on the training rows only, then apply them to every row.
# Fitting on the full frame would leak test-set statistics (min/max, mean/std,
# median/IQR) into preprocessing. The split parameters below match the
# train/test split done later in the notebook (test_size=0.2, stratified on
# HY_YN, random_state=42); a stratified split depends only on the labels and
# the seed, so the rows selected here are the same rows that later become
# train_df. Keep the two calls in sync.
_fit_pos, _ = train_test_split(
    np.arange(len(df)),
    test_size=0.2,
    stratify=df['HY_YN'],
    random_state=42,
)
_fit_rows = df.iloc[_fit_pos]

# MinMaxScaler
scaler_minmax = MinMaxScaler()
scaler_minmax.fit(_fit_rows[minmax_columns])
df[minmax_columns] = scaler_minmax.transform(df[minmax_columns])

# StandardScaler
scaler_standard = StandardScaler()
scaler_standard.fit(_fit_rows[standard_columns])
df[standard_columns] = scaler_standard.transform(df[standard_columns])

# RobustScaler
scaler_robust = RobustScaler()
scaler_robust.fit(_fit_rows[robust_columns])
df[robust_columns] = scaler_robust.transform(df[robust_columns])

In [None]:
# Preview the first rows of df after the scaling step above.
df.head()

# GAN

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Stratified 80/20 split so both partitions keep the original HY_YN class ratio.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['HY_YN'],
    random_state=42,
)

# Feature matrix of the positive-class (HY_YN == 1) training rows; the target
# column is dropped and everything is cast to float so it can become a tensor.
is_minority = train_df['HY_YN'] == 1
minority_data = train_df.loc[is_minority].drop(columns=['HY_YN']).values.astype(float)
# Condition label fed to the conditional GAN: a column of ones, one per row.
minority_labels = np.ones((minority_data.shape[0], 1))

minority_data_tensor = torch.tensor(minority_data, dtype=torch.float32)
minority_labels_tensor = torch.tensor(minority_labels, dtype=torch.float32)

# Shuffled mini-batches of 32 (features, label) pairs for GAN training.
minority_dataset = TensorDataset(minority_data_tensor, minority_labels_tensor)
minority_loader = DataLoader(minority_dataset, batch_size=32, shuffle=True)

# 모델 정의
class Generator(nn.Module):
    """Conditional generator: maps a (noise, label) pair to one synthetic feature row.

    Args:
        input_dim: width of the noise vector; the network's first layer takes
            ``input_dim + 1`` inputs because the label is appended as one column.
        output_dim: number of features produced per sample.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        hidden_units = 64
        layers = [
            nn.Linear(input_dim + 1, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, output_dim),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, noise, labels):
        """Concatenate ``noise`` and ``labels`` along dim 1 and run the MLP on the result."""
        conditioned = torch.cat([noise, labels], dim=1)
        return self.model(conditioned)

class Discriminator(nn.Module):
    """Conditional discriminator: scores a (feature row, label) pair with a value in [0, 1].

    Args:
        input_dim: number of features per row; the network's first layer takes
            ``input_dim + 1`` inputs because the label is appended as one column.
    """

    def __init__(self, input_dim):
        super().__init__()
        hidden_units = 64
        layers = [
            nn.Linear(input_dim + 1, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, 1),
            nn.Sigmoid(),  # output is a probability, as required by nn.BCELoss
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, data, labels):
        """Concatenate ``data`` and ``labels`` along dim 1 and return the sigmoid score."""
        conditioned = torch.cat([data, labels], dim=1)
        return self.model(conditioned)

# Seed PyTorch's global RNG so weight initialisation, DataLoader shuffling and
# noise sampling are repeatable across Restart & Run All.
torch.manual_seed(42)

latent_dim = 50
num_features = train_df.shape[1] - 1  # every column except the HY_YN target

# Number of synthetic positives to add so that majority:minority becomes 2:1
# (exactly 2:1 when the majority count is even, because of the floor division).
num_majority = int((train_df['HY_YN'] == 0).sum())
num_minority = int((train_df['HY_YN'] == 1).sum())
num_samples_needed = (num_majority // 2) - num_minority

generator = Generator(latent_dim, num_features)
discriminator = Discriminator(num_features)

# Loss function and optimizers. BCELoss expects probabilities, which the
# discriminator's final Sigmoid provides.
criterion = nn.BCELoss()
optimizer_G = optim.Adam(generator.parameters(), lr=0.0002)
optimizer_D = optim.Adam(discriminator.parameters(), lr=0.0002)

# GAN 학습 루프
# GAN training loop: for each mini-batch of real minority rows, take one
# discriminator step followed by one generator step.
num_epochs = 1000
for epoch in range(num_epochs):
    for real_data, labels in minority_loader:
        # --- Discriminator update ---
        optimizer_D.zero_grad()
        # Target of 1 for real rows.
        real_labels = torch.ones(real_data.size(0), 1)
        real_loss = criterion(discriminator(real_data, labels), real_labels)

        # One noise vector per real row; fakes are generated with the batch's own condition labels.
        noise = torch.randn(real_data.size(0), latent_dim)
        fake_data = generator(noise, labels)
        # Target of 0 for generated rows.
        fake_labels = torch.zeros(real_data.size(0), 1)
        # detach() keeps this loss from backpropagating into the generator.
        fake_loss = criterion(discriminator(fake_data.detach(), labels), fake_labels)

        d_loss = real_loss + fake_loss
        d_loss.backward()
        optimizer_D.step()

        # --- Generator update ---
        optimizer_G.zero_grad()
        # The generator is trained to make the just-updated discriminator score its fakes as real (target 1).
        g_loss = criterion(discriminator(fake_data, labels), real_labels)
        g_loss.backward()
        optimizer_G.step()

    # Log every 100 epochs. The printed values are the losses of the last
    # mini-batch of that epoch, not an epoch average.
    if epoch % 100 == 0:
        print(f"Epoch [{epoch}/{num_epochs}]  D Loss: {d_loss.item():.4f}  G Loss: {g_loss.item():.4f}")


# Generate all synthetic minority rows in one batched forward pass instead of
# one forward pass per sample. Clamping at 0 keeps the cell working when the
# classes already meet the target ratio (np.vstack on an empty list raises
# ValueError).
num_to_generate = max(num_samples_needed, 0)
with torch.no_grad():  # inference only: no autograd graph is needed
    noise = torch.randn(num_to_generate, latent_dim)
    label = torch.ones((num_to_generate, 1))  # condition on the positive class
    generated_samples = generator(noise, label).numpy()

# NOTE(review): the generator's last layer is linear, so every column -- including
# the one-hot/boolean ones such as SEX_2 or SMK_STAT_* -- receives unbounded
# real values here. Confirm downstream models tolerate that, or threshold them.
generated_df = pd.DataFrame(generated_samples, columns=train_df.columns.drop('HY_YN'))
generated_df['HY_YN'] = 1

# Final training set: the original training rows plus the synthetic positives.
train_balanced = pd.concat([train_df, generated_df], ignore_index=True)
print(f"최종 train 데이터셋 클래스 분포: \n{train_balanced['HY_YN'].value_counts()}")


Epoch [0/1000]  D Loss: 1.2355  G Loss: 0.6199
Epoch [100/1000]  D Loss: 1.3142  G Loss: 0.7132
Epoch [200/1000]  D Loss: 1.2942  G Loss: 0.7545
Epoch [300/1000]  D Loss: 1.4305  G Loss: 0.7021
Epoch [400/1000]  D Loss: 1.3217  G Loss: 0.8086
Epoch [500/1000]  D Loss: 1.3829  G Loss: 0.7260
Epoch [600/1000]  D Loss: 1.2608  G Loss: 0.7528
Epoch [700/1000]  D Loss: 1.2389  G Loss: 0.7907
Epoch [800/1000]  D Loss: 0.9296  G Loss: 1.0388
Epoch [900/1000]  D Loss: 1.0207  G Loss: 1.0773
최종 train 데이터셋 클래스 분포: 
HY_YN
0    18364
1     9182
Name: count, dtype: int64
