In [None]:
# Report the installed PyTorch version and the CUDA version string it exposes.
import torch

print(torch.__version__, torch.version.cuda, sep="\n")

In [None]:
cd BiS495/GNN

In [19]:
import os
import logging
import numpy as np
import pandas as pd
import random
import adabound
import pickle
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import roc_auc_score

import torch

# Env
from utils import *
from model_GAT import *
from options import parse_args
from test_model import test

In [None]:
# Start from the project's parsed options, then apply the notebook overrides.
# `lr` and `optimizer_type` are overwritten again inside train_model() from
# the config dict passed to it.
opt = parse_args()
notebook_overrides = {
    "lin_input_dim": 958,  # alternative kept from the original: 1916
    "act_type": "SM",
    "optimizer_type": "adabound",
    "lr": 0.00001,
    "batch_size": 64,
    "num_epochs": 50,  # alternative kept from the original: 100
}
for option_name, option_value in notebook_overrides.items():
    setattr(opt, option_name, option_value)

# First argument handed to load_csv_data().
k = 5

In [None]:
# Load the train/test tensors and the adjacency matrix shared by all samples.
(
    tr_features,
    tr_labels,
    te_features,
    te_labels,
    adj_matrix,
) = load_csv_data(k, opt)

In [22]:
# Everything runs on the CPU. The adjacency matrix is moved once here and is
# then passed to every model forward call.
device = torch.device("cpu")
adj = adj_matrix.to(device=device)

In [23]:
# Training split; batches are reshuffled on every pass over the loader.
train_dataset = TensorDataset(tr_features, tr_labels)
train_loader = DataLoader(
    train_dataset,
    batch_size=opt.batch_size,
    shuffle=True,
)

In [24]:
def acc_test(out, lb):
    """Count how many thresholded predictions agree with the labels.

    Args:
        out: Scores or hard predictions (numpy array or CPU tensor). After
            flattening, values strictly above 0.5 count as class 1, the rest
            as class 0.
        lb: Ground-truth labels; flattened and compared element-wise.

    Returns:
        The number of matching entries as a numpy scalar (a count, not a
        ratio -- callers divide by the sample count themselves).
    """
    predicted = np.where(out.flatten() > 0.5, 1.0, 0.0)
    expected = np.array(lb.flatten())
    return np.sum(predicted == expected)

In [25]:
def auroc_score(out, lb):
    """Compute the area under the ROC curve for a set of predictions.

    Args:
        out: Predicted scores (or hard 0/1 predictions) as a torch tensor or
            array-like; flattened before scoring.
        lb: Ground-truth labels as a torch tensor or array-like; flattened
            before scoring.

    Returns:
        The value returned by ``roc_auc_score(lb, out)``.

    Raises:
        Whatever ``roc_auc_score`` raises for invalid input (for example when
        ``lb`` contains a single class).
    """
    # Convert tensors before anything else. The original wrapped the labels
    # in np.array() first, so its tensor check for them could never match and
    # detach()/cpu() was never applied to them.
    if isinstance(out, torch.Tensor):
        out = out.detach().cpu().numpy()
    if isinstance(lb, torch.Tensor):
        lb = lb.detach().cpu().numpy()

    out = np.asarray(out).flatten()
    lb = np.asarray(lb).flatten()

    # roc_auc_score takes the true labels first, then the predicted scores.
    return roc_auc_score(lb, out)

In [26]:
def preprocess(labels):
    """Convert binary class labels into one-hot rows.

    Args:
        labels: Labels of shape ``(n,)`` or ``(n, 1)`` (tensor or array-like)
            holding the values 0 and 1.

    Returns:
        A tensor of shape ``(n, 2)`` in the default float dtype, located on
        the same device as ``labels``: ``[1, 0]`` for label 0, ``[0, 1]`` for
        label 1, and ``[0, 0]`` for any other value.
    """
    flat = torch.as_tensor(labels).reshape(-1)
    # Column 0 flags class 0 and column 1 flags class 1; a value that is
    # neither leaves its row all zero, as the element-wise loop did.
    one_hot = torch.stack((flat == 0, flat == 1), dim=1)
    return one_hot.to(torch.get_default_dtype())

In [27]:
def initialize_weights(layer):
    """Re-initialise a linear layer in place; leave every other module alone.

    Written to be passed to ``model.apply(...)``, which calls it once per
    sub-module.

    Args:
        layer: Any module. Only ``nn.Linear`` instances are modified.
    """
    if not isinstance(layer, nn.Linear):
        return

    # Xavier (Glorot) normal initialisation, chosen for use with sigmoid.
    nn.init.xavier_normal_(layer.weight)

    if layer.bias is None:
        return
    nn.init.zeros_(layer.bias)

In [28]:
def train_model(config):
    """Train an ensemble of GAT models, one after another, on ``train_loader``.

    Every ensemble member gets its own random feature mask and a fresh
    initialisation, is trained for ``opt.num_epochs`` epochs, and is restored
    to the weights of its best epoch (the epoch with the most correct
    training predictions) before it joins the ensemble.

    Args:
        config: Hyperparameter dict. Required keys: ``"lr"``,
            ``"optimizer_type"``, ``"final_lr"``, ``"weight_decay"``,
            ``"alpha"``, ``"nhids"``, ``"nheads"``, ``"fc_dim"``. Optional
            key: ``"dropout"`` (defaults to the current ``opt.dropout``).

    Returns:
        ``(models, max_acc)`` where ``models`` is the list of trained models
        and ``max_acc`` is the best-epoch number of correct training
        predictions of the *last* model trained (a count, not a ratio).

    Side effects:
        Overwrites hyperparameter fields on the module-level ``opt`` and
        reads the module-level ``train_loader``, ``adj`` and ``device``.
    """
    # Copy the hyperparameters onto the shared options object that is handed
    # to GAT(...) and define_optimizer(...).
    opt.lr = config["lr"]
    opt.optimizer_type = config["optimizer_type"]
    opt.final_lr = config["final_lr"]
    opt.weight_decay = config["weight_decay"]
    opt.alpha = config["alpha"]
    # "dropout" used to be the only config entry that was silently ignored.
    opt.dropout = config.get("dropout", opt.dropout)

    num_models = 50
    models = []
    max_acc = 0

    for i in range(num_models):
        print(f"Training {i+1}/{num_models}th GNN Model")

        # Random feature sub-sampling for this ensemble member. Entries that
        # are True get zeroed below, i.e. roughly 70% of the 479 features.
        # NOTE(review): confirm that dropping ~70% (rather than ~30%) is
        # intended, and note that ensemble_predict does not re-apply the mask
        # at inference time.
        feature_mask = np.random.rand(479) > 0.3

        # Fresh, randomly initialised model for this ensemble member.
        model = GAT(
            opt=opt,
            input_dim=opt.input_dim,
            omic_dim=opt.omic_dim,
            label_dim=opt.label_dim,
            dropout=opt.dropout,
            alpha=opt.alpha,
            nhids=config["nhids"],
            nheads=config["nheads"],
            fc=config["fc_dim"]
        ).to(device)
        model.apply(initialize_weights)

        # NOTE(review): nn.CrossEntropyLoss applies log-softmax itself. If the
        # model already returns softmax probabilities (opt.act_type = 'SM'
        # and the 0.5 threshold below suggest it might), the outputs are
        # normalised twice -- confirm against model_GAT.
        criterion = nn.CrossEntropyLoss()
        optimizer = define_optimizer(opt, model)

        max_acc = 0
        best_state = None
        for epoch in range(opt.num_epochs):
            model.train()
            running_loss = 0.0
            running_acc = 0.0
            epoch_outs = []
            epoch_labels = []

            for features, labels in train_loader:
                features = features.float().to(device)
                labels = labels.float().to(device)

                optimizer.zero_grad()

                # Apply this member's feature mask to the batch.
                features[:, feature_mask, :] = 0

                outputs = model(features, adj, labels, opt)
                labels_change = preprocess(labels).to(device)

                loss = criterion(outputs, labels_change)
                loss.backward()
                optimizer.step()

                # Column 0 is the class-0 output (see preprocess), so a value
                # below 0.5 is read as a class-1 prediction.
                outs = (outputs[:, 0] < 0.5).long().cpu()
                batch_labels = labels.view(-1).cpu()
                acc = acc_test(outs, batch_labels)

                epoch_outs.append(outs.numpy())
                epoch_labels.append(batch_labels.numpy())

                running_loss += loss.item()
                running_acc += float(acc)

            all_outs = np.concatenate(epoch_outs) if epoch_outs else np.empty(0)
            all_labels = np.concatenate(epoch_labels) if epoch_labels else np.empty(0)

            # NOTE(review): this AUROC is computed on hard 0/1 predictions,
            # not on continuous scores.
            auroc = auroc_score(all_outs, all_labels)

            # running_acc is a count of correct samples, so it is normalised
            # by the number of samples rather than the number of batches.
            print(f'Epoch [{epoch+1}/{opt.num_epochs}], Loss: {running_loss/len(train_loader):.4f}, Acc: {running_acc/len(all_labels):.4f}, Auroc: {auroc:.4f}')

            if best_state is None or running_acc > max_acc:
                # Snapshot the weights. Keeping a reference to `model` would
                # just alias the object that keeps training.
                best_state = {
                    name: tensor.detach().clone()
                    for name, tensor in model.state_dict().items()
                }
                max_acc = running_acc

        if best_state is not None:
            model.load_state_dict(best_state)

        print(f"Completed training {i+1}/{num_models}th model with max_acc: {max_acc:.4f}")
        models.append(model)

    print("Training of all models completed.")

    return models, max_acc

In [29]:
# Held-out split. Shuffling is off, so predictions come back in the same
# order as te_labels.
te_dataset = TensorDataset(te_features, te_labels)
test_loader = DataLoader(dataset=te_dataset, batch_size=opt.batch_size, shuffle=False)

In [30]:
def ensemble_predict(models, test_loader):
    """Majority-vote class predictions of an ensemble over a data loader.

    Each model votes 1 for a sample when its column-0 output is below 0.5 and
    0 otherwise; the votes are averaged across models and rounded.

    Args:
        models: Non-empty iterable of trained models, each callable as
            ``model(features, adj, labels, opt)``.
        test_loader: Loader yielding ``(features, labels)`` batches in a
            fixed order.

    Returns:
        Integer numpy array of 0/1 predictions in loader order. An exact tie
        (mean vote of 0.5) resolves to 0 because ``np.round`` rounds half to
        even.

    Raises:
        ValueError: If ``models`` is empty.

    Reads the module-level ``device``, ``adj`` and ``opt``.
    """
    all_preds = []

    for model in models:
        model.eval()
        preds = []

        with torch.no_grad():
            for features, labels in test_loader:
                features = features.float().to(device)
                # Hand the model labels in the same dtype and on the same
                # device as the training loop does.
                labels = labels.float().to(device)
                outputs = model(features, adj, labels, opt)
                preds.append((outputs[:, 0] < 0.5).long().cpu().numpy())

        all_preds.append(np.concatenate(preds, axis=0))

    if not all_preds:
        raise ValueError("ensemble_predict needs at least one model")

    # Fraction of models voting for class 1, rounded to the majority class.
    vote_share = np.mean(np.stack(all_preds, axis=0), axis=0)
    final_preds = np.round(vote_share).astype(int)

    return final_preds

In [31]:
def train_test_model(config):
    """Train an ensemble with ``config`` and evaluate it on the test split.

    Args:
        config: Hyperparameter dict forwarded unchanged to ``train_model``.

    Returns:
        ``(acc, auroc)``: the number of correct test predictions (a count,
        as returned by ``acc_test``) and the AUROC of the hard ensemble
        predictions.

    Reads the module-level ``test_loader``, ``te_labels`` and ``train_loader``.
    """
    models, max_acc = train_model(config)

    # ensemble_predict already returns hard 0/1 class votes. The former extra
    # `(preds_real < 0.5)` step flipped every prediction, so the reported
    # accuracy and AUROC were those of the inverted classifier.
    preds = np.asarray(ensemble_predict(models, test_loader)).reshape(-1)
    print(preds)
    print(te_labels.view(-1))

    acc = acc_test(preds, te_labels)
    auroc = auroc_score(preds, te_labels)

    print(f'Acc: {acc/len(te_labels):.4f}, Auroc: {auroc:.4f}')
    # max_acc counts correct training samples, so divide by the number of
    # samples rather than the number of batches.
    print(f'Train Acc: {max_acc/len(train_loader.dataset):.4f}')

    return acc, auroc

In [32]:
# search_space = {
#     "optimizer_type": tune.choice(["adam", "adagrad", "adabound"]),
#     "lr": tune.loguniform(1e-5, 1e-2),
#     "final_lr": tune.uniform(1e-3, 1e-1),
#     "weight_decay": tune.uniform(1e-5, 1e-3),
#     "dropout": tune.uniform(0.1, 0.5),
#     "alpha": tune.uniform(0.1, 0.3),
#     "nhids": tune.choice([[8, 16, 32], [8, 16, 16], [8, 8, 8]]),
#     "nheads": tune.choice([[4, 3, 4], [4, 4, 4], [2, 2, 2]]),
#     "fc_dim": tune.choice([[256, 128, 64, 32], [128, 64, 32, 16], [512, 256, 64, 32]]),
# }


# analysis = tune.run(
#     train_test_model,
#     config=search_space,
#     metric="accuracy",
#     mode="max",
#     num_samples = 10,
#     max_concurrent_trials = 1
# )

# print("Best config: ", analysis.best_config)

In [None]:
# Hyperparameters used for the repeated train/evaluate runs below.
config = dict(
    optimizer_type="adabound",
    lr=0.00005,
    final_lr=0.1,
    weight_decay=5e-4,
    dropout=0.0,
    alpha=0.1,
    nhids=[8, 16, 32],
    nheads=[4, 3, 4],
    fc_dim=[256, 128, 64, 16],
)

NUM_RUNS = 30
acc_list = []

# Repeat the full train + test cycle and collect the test score of each run.
# NOTE(review): runs are kept only when their *test* AUROC exceeds 0.5, which
# biases the collected scores upwards -- confirm this filter is intended.
for run_idx in range(NUM_RUNS):
    acc, auroc = train_test_model(config)
    if not auroc > 0.5:
        continue
    acc_list.append(acc)

print(acc_list)

In [None]:
# Standalone evaluation of one freshly trained ensemble.
# The previous version of this cell referenced a top-level `model` that no
# cell defines, passed three arguments to the two-parameter
# ensemble_predict(), and indexed its 1-D result with `[:, 0]`.
# NOTE(review): training here is expensive (a full ensemble); if a trained
# ensemble should be reused instead, confirm where it is meant to come from.
models, final_train_correct = train_model(config)

# ensemble_predict already returns hard 0/1 class predictions.
preds_real = np.array(ensemble_predict(models, test_loader))
preds = preds_real.reshape(-1)

acc = acc_test(preds, te_labels)
auroc = auroc_score(preds, te_labels)