In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score
import torch
import torch.nn as nn
import torch.optim as optim
import sys
sys.path.append("F:/기계학습특론/saint-main/models")  # SAINT 경로 추가
from model import TabAttention  # model.py에서 TabAttention 클래스 임포트

# 데이터 로드 및 전처리
file_path = "adult.csv"
data = pd.read_csv(file_path)

In [2]:
# '?' 처리 및 결측치 제거
data.replace('?', pd.NA, inplace=True)
data.dropna(inplace=True)

# 범주형 데이터와 연속형 데이터 구분
categorical_columns = ['workclass', 'education', 'marital-status', 'occupation', 
                       'relationship', 'race', 'gender', 'native-country']
continuous_columns = ['age', 'fnlwgt', 'educational-num', 'capital-gain', 'capital-loss', 'hours-per-week']

In [3]:
# 범주형 데이터 인코딩
label_encoders = {col: LabelEncoder().fit(data[col]) for col in categorical_columns}
for col, encoder in label_encoders.items():
    data[col] = encoder.transform(data[col])

# 타겟 데이터 인코딩
data['income'] = (data['income'] == '>50K').astype(int)

# 입력 데이터 분리
X = data.drop("income", axis=1)
y = data["income"]

# 범주형 및 연속형 데이터 분리
X_categ = X[categorical_columns].values
X_cont = X[continuous_columns].values

# 데이터 분할
X_categ_train, X_categ_test, X_cont_train, X_cont_test, y_train, y_test = train_test_split(
    X_categ, X_cont, y, test_size=0.2, random_state=42
)

In [4]:
# 연속형 데이터 정규화
scaler = StandardScaler()
X_cont_train = scaler.fit_transform(X_cont_train)
X_cont_test = scaler.transform(X_cont_test)

# PyTorch 텐서로 변환
X_categ_train_tensor = torch.tensor(X_categ_train, dtype=torch.long)
X_cont_train_tensor = torch.tensor(X_cont_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)

X_categ_test_tensor = torch.tensor(X_categ_test, dtype=torch.long)
X_cont_test_tensor = torch.tensor(X_cont_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)

In [5]:
# 모델 초기화
model = TabAttention(
    categories=[len(data[col].unique()) for col in categorical_columns],
    num_continuous=len(continuous_columns),
    dim=16,  # 작은 임베딩 차원
    depth=3,  # Transformer 레이어 수 축소
    heads=4,  # Multi-head Attention 수 축소
    dim_head=8,
    dim_out=1,
    mlp_hidden_mults=(2, 1),
    mlp_act=nn.ReLU(),
    attentiontype="col"
)

In [6]:
# 인코딩 텐서 생성
def create_encodings(x_categ, x_cont, model):
    device = x_categ.device
    x_categ_enc = model.embeds(x_categ + model.categories_offset.to(device))
    x_cont_enc = torch.cat([
        model.simple_MLP[i](x_cont[:, i:i + 1]) for i in range(x_cont.shape[1])
    ], dim=1)
    return x_categ_enc, x_cont_enc

x_categ_train_enc, x_cont_train_enc = create_encodings(X_categ_train_tensor, X_cont_train_tensor, model)
x_categ_test_enc, x_cont_test_enc = create_encodings(X_categ_test_tensor, X_cont_test_tensor, model)


In [7]:
# 차원 확장 및 크기 맞춤
x_cont_train_enc = x_cont_train_enc.unsqueeze(1).expand(-1, x_categ_train_enc.size(1), -1)
x_cont_test_enc = x_cont_test_enc.unsqueeze(1).expand(-1, x_categ_test_enc.size(1), -1)

# 연속형 임베딩 크기를 카테고리 임베딩 크기와 맞추기
if x_cont_train_enc.size(-1) != x_categ_train_enc.size(-1):
    linear_adjust = nn.Linear(x_cont_train_enc.size(-1), x_categ_train_enc.size(-1))
    x_cont_train_enc = linear_adjust(x_cont_train_enc)
    x_cont_test_enc = linear_adjust(x_cont_test_enc)

# 디버깅: 최종 입력 크기 확인
flat_x_size = (x_categ_train_enc.size(1) * x_categ_train_enc.size(2)) + \
              (x_cont_train_enc.size(1) * x_cont_train_enc.size(2))
print(f"Final flattened input size: {flat_x_size}")

# TabAttention 모델 수정: MLP의 입력 크기 동적으로 설정
model.mlp = nn.Sequential(
    nn.Linear(flat_x_size, 128),  # 첫 번째 레이어의 입력 크기 동적으로 설정
    nn.ReLU(),
    nn.Linear(128, 1)
)

Final flattened input size: 256


In [8]:
# CPU 설정
device = torch.device("cpu")
model.to(device)

# 손실 함수 및 옵티마이저 정의
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 디버깅: 모델 크기 확인
print("Model Parameters:")
print(f"Categorical Features: {X_categ_train_tensor.shape}")
print(f"Continuous Features: {X_cont_train_tensor.shape}")


Model Parameters:
Categorical Features: torch.Size([36177, 8])
Continuous Features: torch.Size([36177, 6])


In [9]:
# 학습
epochs = 200
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()

    # 모델 출력
    outputs = model(X_categ_train_tensor, X_cont_train_tensor, x_categ_train_enc.detach(), x_cont_train_enc.detach()).squeeze()

    # 디버깅: detach()로 그래프에서 분리된 값을 사용
    print(f"Epoch {epoch + 1} - Model Output: Shape = {outputs.shape}, requires_grad = {outputs.requires_grad}")
    
    # 손실 계산
    loss = criterion(outputs, y_train_tensor)
    loss_value = loss.item()  # 손실 값 분리
    print(f"Epoch {epoch + 1} - Loss: {loss_value}, requires_grad = {loss.requires_grad}")

    # 역전파
    loss.backward()  # retain_graph를 사용하지 않도록 기본값 유지
    optimizer.step()

    # 디버깅 출력
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss_value}")


Epoch 1 - Model Output: Shape = torch.Size([36177]), requires_grad = True
Epoch 1 - Loss: 0.7430999875068665, requires_grad = True
Epoch 1/200, Loss: 0.7430999875068665
Epoch 2 - Model Output: Shape = torch.Size([36177]), requires_grad = True
Epoch 2 - Loss: 0.5968636870384216, requires_grad = True
Epoch 2/200, Loss: 0.5968636870384216
Epoch 3 - Model Output: Shape = torch.Size([36177]), requires_grad = True
Epoch 3 - Loss: 0.5411301255226135, requires_grad = True
Epoch 3/200, Loss: 0.5411301255226135
Epoch 4 - Model Output: Shape = torch.Size([36177]), requires_grad = True
Epoch 4 - Loss: 0.5321691036224365, requires_grad = True
Epoch 4/200, Loss: 0.5321691036224365
Epoch 5 - Model Output: Shape = torch.Size([36177]), requires_grad = True
Epoch 5 - Loss: 0.5290831327438354, requires_grad = True
Epoch 5/200, Loss: 0.5290831327438354
Epoch 6 - Model Output: Shape = torch.Size([36177]), requires_grad = True
Epoch 6 - Loss: 0.5131364464759827, requires_grad = True
Epoch 6/200, Loss: 0.513

In [10]:
from sklearn.metrics import confusion_matrix

# 평가
model.eval()
with torch.no_grad():
    predictions = model(X_categ_test_tensor, X_cont_test_tensor, x_categ_test_enc, x_cont_test_enc).squeeze()
    predictions = torch.sigmoid(predictions).cpu().numpy()
    pred_labels = (predictions > 0.5).astype(int)

    # 성능 지표 계산
    accuracy = accuracy_score(y_test, pred_labels)
    auroc = roc_auc_score(y_test, predictions)
    precision = precision_score(y_test, pred_labels)
    recall = recall_score(y_test, pred_labels)
    f1 = f1_score(y_test, pred_labels)

    # 공정성 지표 계산
    sensitive_attr = X_categ_test_tensor[:, categorical_columns.index('gender')].cpu().numpy()  # gender 컬럼
    group_0 = sensitive_attr == 0  # 예: 남성
    group_1 = sensitive_attr == 1  # 예: 여성

    # Demographic Parity
    dp = abs(pred_labels[group_0].mean() - pred_labels[group_1].mean())

    # Equal Opportunity
    eo = abs(pred_labels[group_0 & (y_test == 1)].mean() - pred_labels[group_1 & (y_test == 1)].mean())

    # Equality of Odds
    group_0_tpr = pred_labels[group_0 & (y_test == 1)].mean()
    group_1_tpr = pred_labels[group_1 & (y_test == 1)].mean()
    group_0_fpr = pred_labels[group_0 & (y_test == 0)].mean()
    group_1_fpr = pred_labels[group_1 & (y_test == 0)].mean()
    eop = abs(group_0_tpr - group_1_tpr) + abs(group_0_fpr - group_1_fpr)

# 평가 결과 출력
print("\n모델 평가 결과:")
print(f"정확도 (Accuracy): {accuracy:.4f}")
print(f"AUROC: {auroc:.4f}")
print(f"정밀도 (Precision): {precision:.4f}")
print(f"재현율 (Recall): {recall:.4f}")
print(f"F1 점수 (F1-Score): {f1:.4f}")

# 공정성 지표 출력
print("\n공정성 지표:")
print(f"Demographic Parity (DP): {dp:.4f}")
print(f"Equal Opportunity (EO): {eo:.4f}")
print(f"Equality of Odds (EOP): {eop:.4f}")



모델 평가 결과:
정확도 (Accuracy): 0.8587
AUROC: 0.9142
정밀도 (Precision): 0.7520
재현율 (Recall): 0.6264
F1 점수 (F1-Score): 0.6835

공정성 지표:
Demographic Parity (DP): 0.1815
Equal Opportunity (EO): 0.1082
Equality of Odds (EOP): 0.1805
