In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import classification_report, roc_curve, auc, precision_recall_curve, average_precision_score, confusion_matrix
from sklearn.calibration import calibration_curve
from torch.utils.data import Dataset, DataLoader
from itertools import cycle
# GPU setup: use CUDA when torch reports it as available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device: {}".format(device))


In [None]:
ls

In [None]:
# Read ./data/merged_V2.csv (path relative to the working directory) into a DataFrame
df = pd.read_csv(filepath_or_buffer='./data/merged_V2.csv')

In [None]:
# Label encoding: map every distinct value of the 'label' column to an integer
# class id, stored in a new 'target' column
label_encoder = LabelEncoder()
label_encoder.fit(df['label'])
df['target'] = label_encoder.transform(df['label'])

# Number of distinct classes seen by the encoder
num_classes = label_encoder.classes_.shape[0]


In [None]:
# Feature columns: every column whose name contains '_x' or '_y'
feature_cols = [name for name in df.columns if any(tag in name for tag in ('_x', '_y'))]
print(feature_cols)

In [None]:
import numpy as np
import pandas as pd
import gc

# Feature column selection: every column whose name contains '_x' or '_y'
feature_cols = [name for name in df.columns if any(tag in name for tag in ('_x', '_y'))]
seq_length = 20  # window size (number of consecutive rows per sequence)

# 1. pandas -> NumPy conversion with explicit dtypes (saves RAM)
X_np = df.loc[:, feature_cols].values.astype(np.float32)
Y_np = df['target'].values.astype(np.int64)

# Drop the source DataFrame to release its memory.
# NOTE: after this line `df` no longer exists, so re-running this cell requires
# re-running the cells that load and encode the CSV first.
del df
gc.collect()

# 2. Sliding-window generator: yields one (window, label) pair at a time
def sliding_window_generator(X, Y, seq_length):
    """Lazily yield sliding windows over X paired with the label that follows each window.

    For every start position ``s`` in ``range(len(X) - seq_length)`` the
    generator yields ``(X[s:s + seq_length], Y[s + seq_length])``, i.e. the
    label is taken from the row immediately after the window.

    Args:
        X: Sliceable sequence of feature rows.
        Y: Indexable sequence of labels aligned with ``X``.
        seq_length: Number of consecutive rows in each window.

    Yields:
        Tuples ``(window, label)``; nothing is yielded when
        ``len(X) <= seq_length``.
    """
    num_windows = len(X) - seq_length
    for start in range(num_windows):
        stop = start + seq_length
        yield X[start:stop], Y[stop]

# 3-4. Materialise every window into NumPy arrays with a single pass.
def _collect_windows(window_iter, num_windows, seq_length, num_features):
    """Copy (window, label) pairs from an iterator into preallocated arrays.

    Args:
        window_iter: Iterator yielding ``(window, label)`` pairs where each
            window has shape ``(seq_length, num_features)``.
        num_windows: Number of pairs the iterator will yield.
        seq_length: Rows per window.
        num_features: Columns per window.

    Returns:
        ``(X_out, Y_out)`` with shapes ``(num_windows, seq_length, num_features)``
        (float32) and ``(num_windows,)`` (int64).
    """
    X_out = np.empty((num_windows, seq_length, num_features), dtype=np.float32)
    Y_out = np.empty(num_windows, dtype=np.int64)
    # Loop variables stay local to this helper, so no window view outlives the
    # call and keeps the source array alive after it is deleted below.
    for row, (window, label) in enumerate(window_iter):
        X_out[row] = window
        Y_out[row] = label
    return X_out, Y_out


# The generator is consumed exactly once; the previous version ran it twice
# (once for X, once for Y) and staged every window in a Python list first.
X_gen = sliding_window_generator(X_np, Y_np, seq_length)
X_data, Y_data = _collect_windows(
    X_gen,
    max(len(X_np) - seq_length, 0),  # the generator yields nothing when the input is too short
    seq_length,
    X_np.shape[1],
)

print(f"✅ 최종 데이터 크기: X={X_data.shape}, Y={Y_data.shape}")

# 5. Save as .npy files so later cells / sessions can reload them
np.save("X_data.npy", X_data)
np.save("Y_data.npy", Y_data)

# 6. Drop the large intermediates and collect garbage
del X_np, Y_np, X_data, Y_data, X_gen
gc.collect()



In [None]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split, KFold

# Load the saved arrays as read-only memory maps so rows are paged in on demand
# instead of reading the whole file into RAM.
X = np.load("X_data.npy", mmap_mode="r")
Y = np.load("Y_data.npy", mmap_mode="r")

if len(X) != len(Y):
    raise ValueError(f"X and Y have inconsistent lengths: {len(X)} vs {len(Y)}")

# Hold out the final 20% as the test set (done once, chronological order kept).
# Plain slicing is used instead of train_test_split: that helper selects rows
# with integer index arrays, which copies the selected rows out of the memory
# map into RAM, whereas contiguous slices remain views on the mapped file.
# The sizes match train_test_split(test_size=0.2, shuffle=False):
#   n_test = ceil(0.2 * n), n_train = n - n_test.
test_fraction = 0.2
num_test = int(np.ceil(test_fraction * len(X)))
num_train = len(X) - num_test
if num_train <= 0:
    raise ValueError(
        f"With n_samples={len(X)} and test_size={test_fraction}, "
        "the resulting train set will be empty."
    )

X_train_full, X_test = X[:num_train], X[num_train:]
Y_train_full, Y_test = Y[:num_train], Y[num_train:]

# NOTE(review): if these arrays come from the stride-1 sliding windows built
# earlier in this notebook, windows next to the train/test boundary (and next
# to each fold boundary below) share rows with each other — confirm whether a
# gap of seq_length windows is wanted.
print(f"✅ Full Train Set: {X_train_full.shape}, Test Set: {X_test.shape}")

# K-fold configuration: 5 contiguous, unshuffled folds over the training part
kf = KFold(n_splits=5, shuffle=False)

# PyTorch Dataset definition
class SkatingDataset(Dataset):
    """Index-based view over a pair of aligned arrays.

    Only references to ``X``, ``Y`` and the index list are stored; a sample is
    converted to tensors when it is requested.

    Args:
        X: Indexable container of feature samples.
        Y: Indexable container of labels aligned with ``X``.
        indices: Positions in ``X``/``Y`` that belong to this dataset.
    """

    def __init__(self, X, Y, indices):
        # Keep references only; storing per-fold indices avoids copying data
        self.X, self.Y, self.indices = X, Y, indices

    def __len__(self):
        """Return the number of samples selected by ``indices``."""
        return len(self.indices)

    def __getitem__(self, idx):
        """Return ``(features, label)`` tensors (float32, long) for position ``idx``."""
        source_row = self.indices[idx]  # position in the underlying arrays
        features = torch.tensor(self.X[source_row], dtype=torch.float32)
        target = torch.tensor(self.Y[source_row], dtype=torch.long)
        return features, target

# DataLoader settings shared by every loader. They do not depend on the fold,
# so they are defined once instead of on every iteration.
batch_size = 32
loader_kwargs = dict(
    batch_size=batch_size,
    num_workers=2,
    # Only request pinned host memory when CUDA is present; asking for it
    # on a CPU-only machine has no benefit.
    pin_memory=torch.cuda.is_available(),
)

# The test set is identical for every fold (it uses all test samples), so it
# is built once rather than being re-created inside the loop.
test_dataset = SkatingDataset(X_test, Y_test, np.arange(len(Y_test)))
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

# Per-fold split. Every fold's (train_loader, val_loader) pair is kept in
# `fold_loaders`; previously each iteration overwrote the last one, so only the
# final fold was reachable afterwards. `train_loader` / `val_loader` still refer
# to the last fold after the loop, as before.
fold_loaders = []
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_full)):
    print(f"\n=== Fold {fold+1} ===")

    # Datasets only store the fold's index arrays, not copies of the data
    train_dataset = SkatingDataset(X_train_full, Y_train_full, train_idx)
    val_dataset = SkatingDataset(X_train_full, Y_train_full, val_idx)

    train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
    val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)
    fold_loaders.append((train_loader, val_loader))

    print(f"✅ Train: {len(train_dataset)}, Val: {len(val_dataset)}, Test: {len(test_dataset)}")
    print("✅ DataLoader 생성 완료!")


In [None]:
import torch.nn as nn

class RNNModel(nn.Module):
    """Single ``nn.RNN`` layer followed by a linear layer on the last time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the RNN hidden state.
        output_size: Number of output units of the final linear layer.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # batch_first=True: inputs are laid out as (batch, seq, feature)
        self.rnn = nn.RNN(input_size=input_size, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Run the RNN over ``x`` and project the output of the final time step."""
        sequence_out, _final_hidden = self.rnn(x)
        last_step = sequence_out[:, -1, :]
        return self.fc(last_step)


# class TransformerModel(nn.Module):
#     def __init__(self, input_size, hidden_size, output_size, num_heads=4, num_layers=2):
#         super().__init__()
#         encoder_layer = nn.TransformerEncoderLayer(d_model=input_size, nhead=num_heads)
#         self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
#         self.fc = nn.Linear(input_size, output_size)

#     def forward(self, x):
#         x = self.transformer_encoder(x)
#         return self.fc(x[:, -1, :])

# Network dimensions: feature count comes from the last axis of the 3-D
# training array, the hidden width is fixed, and there is one output per class
input_size, hidden_size, output_size = X_train_full.shape[2], 128, num_classes

# Registry of models to train, keyed by display name
models = dict(
    RNN=RNNModel(input_size, hidden_size, output_size),
    # Transformer=TransformerModel(input_size, hidden_size, output_size),
)

print("모델 정의 완료 ✅")


In [None]:
import torch.optim as optim
import torch.cuda.amp as amp

# Training configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_epochs = 10
lr = 0.001
accumulation_steps = 2  # micro-batches accumulated per optimizer step
# torch.cuda.amp only does anything on CUDA; disable it explicitly elsewhere so
# autocast and GradScaler become transparent pass-throughs on CPU.
use_amp = device.type == "cuda"

# NOTE(review): `train_loader` is whatever the data-loading cell left bound
# last (the final fold); no validation pass is run here.
for name, model in models.items():
    print(f"\n--- Training {name} ---")
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    scaler = amp.GradScaler(enabled=use_amp)
    num_batches = len(train_loader)

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        correct, total = 0, 0
        # Start every epoch from clean gradients
        optimizer.zero_grad()

        for step, (inputs, labels) in enumerate(train_loader):
            inputs, labels = inputs.to(device), labels.to(device)

            with amp.autocast(enabled=use_amp):
                outputs = model(inputs)
                # Divide so the accumulated gradient is an average over the micro-batches
                loss = criterion(outputs, labels) / accumulation_steps

            scaler.scale(loss).backward()

            # Step every `accumulation_steps` micro-batches, and also on the
            # final batch. Without the second condition, an epoch whose batch
            # count is not a multiple of `accumulation_steps` would never apply
            # its last gradients and would leak them into the next epoch.
            is_last_batch = (step + 1) == num_batches
            if (step + 1) % accumulation_steps == 0 or is_last_batch:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()

            # Undo the accumulation scaling so the logged value is the true batch loss
            train_loss += loss.item() * accumulation_steps
            predicted = torch.argmax(outputs, dim=1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

        train_acc = correct / total
        print(f"Epoch {epoch+1}: Train Loss={train_loss / len(train_loader):.4f}, Accuracy={train_acc:.4f}")

    torch.save(model.state_dict(), f"{name}_model.pth")
    print(f"Model {name} saved!\n")

    # Drop the local references; the `models` dict still holds the module itself
    del model, optimizer, criterion, scaler
    torch.cuda.empty_cache()
