In [1]:
import warnings

warnings.filterwarnings('ignore')

import os
import random
import numpy as np
import pandas as pd
from ResNN import ClassificationModel2 as ResNN, FocalLoss
import scipy.stats as st
from imblearn.over_sampling import SMOTE
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, StandardScaler, OneHotEncoder
import torch
from torch import nn
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader

Couldn't import dot_parser, loading of dot files will not be possible.


In [2]:
def seed_everything(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    torch.backends.cudnn.benchmark = False  # type: ignore

seed_everything()   

In [3]:
df = pd.read_csv('./impute_set/imp3.csv')

In [4]:
class CustomDataset(Dataset):
    def __init__(self, X, y):
        super().__init__()
        self.X = torch.from_numpy(X.values).float()
        self.y = torch.from_numpy(y.values)

    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
    

def preprocessing(df, numeric='minmax', category='label'):
    X = df.drop('BS3_1', axis=1)
    y = df['BS3_1']
    numeric_col = [
        'FEV1', 'FEV1FVC', 'age', 'BS6_3', 'BS6_2_1', 'BD1',
        '건강문해력', 'Total_slp_wk', 'EQ_5D', 'BE3_31', 'BE5_1', '질환유병기간'
    ]
    cat_col = []
    for col in X.columns:
        if col not in numeric_col:
            cat_col.append(col)

    df_num, df_cat = X[numeric_col], X[cat_col]
    if numeric == 'minmax':
        n_pre = MinMaxScaler()
    else:
        n_pre = StandardScaler()
    df_num = pd.DataFrame(n_pre.fit_transform(df_num), columns=df_num.columns)

    if category == 'label':
        c_pre = OrdinalEncoder()
        df_cat = pd.DataFrame(c_pre.fit_transform(df_cat), columns=df_cat.columns)
    else:
        c_pre = OneHotEncoder(sparse_output=False)
        df_cat = pd.DataFrame(c_pre.fit_transform(df_cat))

    X = pd.concat([df_num, df_cat], axis=1)
    # uniques = []
    # for col in cat_col:
    #     uniques.append(len(X[col].unique()))
    return X, y


def test_with_imputations(train_loader, test_loader, test_y, input_dim):
    class_counts = torch.tensor([test_y.value_counts()[0], test_y.value_counts()[1]])
    class_weights = 1.0 / class_counts
    class_weights /= class_weights.sum()
    print(class_weights)

    device = torch.device('cuda')
    model = ResNN(input_dim=input_dim, hidden_dim=64, num_classes=2)
    model = model.to(device)
    optim = Adam(model.parameters(), lr=.0001)
    
    criterion = FocalLoss(weight=class_weights.to(device))
    best_f1 = 0.0
    best_epoch = 0
    epochs=500
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        for xx, yy in train_loader:
            xx, yy = xx.to(device), yy.to(device)
            optim.zero_grad()
            preds = model(xx)
            loss = criterion(preds, yy)
            loss.backward()
            optim.step()
            running_loss += loss.item()
        # print(f'{epoch+1} Epoch | Loss: {running_loss/len(train_loader):.4f}')

        model.eval()
        val_loss = 0
        correct = 0
        val_preds = []
        val_targets = []
        with torch.no_grad():
            for xx, yy in test_loader:
                xx, yy = xx.to(device), yy.to(device)
                preds = model(xx)
                val_loss = criterion(preds, yy).item()
                yy = yy.detach().cpu().numpy().squeeze()
                preds = preds.detach().cpu().numpy().squeeze()
                preds_labels = preds.argmax(axis=1)
                correct += (preds_labels == yy).sum().item()
                val_preds.extend(preds_labels.tolist())
                val_targets.extend(yy.tolist())

        val_loss /= len(test_loader)
        val_f1 = f1_score(val_targets, val_preds, average='macro')
        # print(f'{epoch+1} Epoch | TestLoss: {val_loss:.4f} | TestF1: {val_f1:.4f}')
        if val_f1 > best_f1:
            best_f1 = val_f1
            best_epoch = epoch+1
            torch.save(model.state_dict(), 'bestResNN.pth')

    print(f'Best Epoch: {best_epoch} | Best F1 : {best_f1:.4f}')
    return best_f1


def test_with_5fold(df, numeric, category, shuffle=True):
    f1s = []
    X, y = preprocessing(df, numeric, category)
    if shuffle:
        skf = StratifiedKFold(n_splits=5, shuffle=shuffle, random_state=42)
    else:
        skf = StratifiedKFold(n_splits=5, shuffle=shuffle)
    
    for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
        train_X, train_y = X.iloc[train_idx], y.iloc[train_idx]
        test_X, test_y = X.iloc[test_idx], y.iloc[test_idx]

        train_set = CustomDataset(train_X, train_y)
        test_set = CustomDataset(test_X, test_y)
        train_loader = DataLoader(train_set, batch_size=64, shuffle=True, pin_memory=True)
        test_loader = DataLoader(test_set, batch_size=64, shuffle=True, pin_memory=True)

        f1_value = test_with_imputations(
            train_loader, test_loader, test_y, input_dim=train_X.shape[-1]
        )
        f1s.append(f1_value)

    return f1s


def get_cv_results(f1s:list):
    f1s = np.array(f1s)
    mean_f1 = np.mean(f1s)
    std_f1 = np.std(f1s)
    ci95 = st.t.interval(.95, df=len(f1s)-1, loc=mean_f1, scale=std_f1/np.sqrt(len(f1s)))
    return mean_f1, std_f1, ci95    

In [5]:
# minmax | label
f1s = test_with_5fold(df, numeric='minmax', category='label')
mean_f1, std_f1, ci95 = get_cv_results(f1s)
print(f1s)
print(f'CV Results: Mean {mean_f1:.2f} | Std {std_f1:.2f} | CI95% {ci95[0]:.2f}~{ci95[1]:.2f}')

tensor([0.1429, 0.8571])
Best Epoch: 264 | Best F1 : 0.8833
tensor([0.1429, 0.8571])
Best Epoch: 172 | Best F1 : 0.7479
tensor([0.1429, 0.8571])
Best Epoch: 243 | Best F1 : 0.6825
tensor([0.1429, 0.8571])
Best Epoch: 12 | Best F1 : 0.6618
tensor([0.1571, 0.8429])
Best Epoch: 156 | Best F1 : 0.7989
[0.8833333333333334, 0.7478991596638654, 0.6825396825396824, 0.6618357487922706, 0.7988505747126436]
CV Results: Mean 0.75 | Std 0.08 | CI95% 0.65~0.85


In [6]:
# minmax | one-hot
f1s = test_with_5fold(df, numeric='minmax', category='onehot')
mean_f1, std_f1, ci95 = get_cv_results(f1s)
print(f1s)
print(f'CV Results: Mean {mean_f1:.2f} | Std {std_f1:.2f} | CI95% {ci95[0]:.2f}~{ci95[1]:.2f}')

tensor([0.1429, 0.8571])
Best Epoch: 157 | Best F1 : 0.7843
tensor([0.1429, 0.8571])
Best Epoch: 68 | Best F1 : 0.8039
tensor([0.1429, 0.8571])
Best Epoch: 242 | Best F1 : 0.7450
tensor([0.1429, 0.8571])
Best Epoch: 73 | Best F1 : 0.6694
tensor([0.1571, 0.8429])
Best Epoch: 103 | Best F1 : 0.7486
[0.7842835130970724, 0.803921568627451, 0.7449908925318762, 0.6693657219973009, 0.7485632183908046]
CV Results: Mean 0.75 | Std 0.05 | CI95% 0.69~0.81


In [7]:
# standard | label
f1s = test_with_5fold(df, numeric='standard', category='label')
mean_f1, std_f1, ci95 = get_cv_results(f1s)
print(f1s)
print(f'CV Results: Mean {mean_f1:.2f} | Std {std_f1:.2f} | CI95% {ci95[0]:.2f}~{ci95[1]:.2f}')

tensor([0.1429, 0.8571])
Best Epoch: 79 | Best F1 : 0.8699
tensor([0.1429, 0.8571])
Best Epoch: 78 | Best F1 : 0.8833
tensor([0.1429, 0.8571])
Best Epoch: 296 | Best F1 : 0.8478
tensor([0.1429, 0.8571])
Best Epoch: 411 | Best F1 : 0.6359
tensor([0.1571, 0.8429])
Best Epoch: 137 | Best F1 : 0.7479
[0.8699368264585655, 0.8833333333333334, 0.8477598956067856, 0.6358543417366948, 0.7478991596638654]
CV Results: Mean 0.80 | Std 0.09 | CI95% 0.68~0.91


In [8]:
# standard | one-hot
f1s = test_with_5fold(df, numeric='standard', category='onehot')
mean_f1, std_f1, ci95 = get_cv_results(f1s)
print(f1s)
print(f'CV Results: Mean {mean_f1:.2f} | Std {std_f1:.2f} | CI95% {ci95[0]:.2f}~{ci95[1]:.2f}')

tensor([0.1429, 0.8571])
Best Epoch: 117 | Best F1 : 0.8921
tensor([0.1429, 0.8571])
Best Epoch: 238 | Best F1 : 0.7450
tensor([0.1429, 0.8571])
Best Epoch: 66 | Best F1 : 0.7768
tensor([0.1429, 0.8571])
Best Epoch: 28 | Best F1 : 0.6835
tensor([0.1571, 0.8429])
Best Epoch: 122 | Best F1 : 0.8039
[0.8921417565485362, 0.7449908925318762, 0.7767857142857143, 0.6834782608695651, 0.803921568627451]
CV Results: Mean 0.78 | Std 0.07 | CI95% 0.69~0.87
