In [1]:
import warnings
warnings.filterwarnings(action='ignore')

import os
import gc
import math
import random
import pickle
import pandas as pd
import numpy as np
import multiprocessing
from tqdm.auto import tqdm

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler

from transformers import get_cosine_schedule_with_warmup

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset, sampler

from imblearn.over_sampling import SMOTE

In [2]:
# Use the GPU when one is visible to PyTorch; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [3]:
# Seed value handed to seed_everything() in this cell.
random_seed = 41

def seed_everything(seed):
    """Seed the Python, NumPy and PyTorch RNGs and pin cuDNN to deterministic mode.

    Args:
        seed: Integer seed applied to `random`, `numpy.random`, the PyTorch
            RNG and all CUDA RNGs; it is also exported as the
            PYTHONHASHSEED environment variable.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    # All four seeders take the same single integer argument.
    for apply_seed in (random.seed,
                       np.random.seed,
                       torch.manual_seed,
                       torch.cuda.manual_seed_all):
        apply_seed(seed)

    # Deterministic cuDNN kernels, auto-tuner disabled.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    
def seed_worker(worker_id):
    """DataLoader `worker_init_fn`: seed NumPy and `random` from the PyTorch seed.

    Args:
        worker_id: Worker index supplied by the DataLoader (not used here).
    """
    # Fold torch's initial seed into 32 bits before reusing it for both libraries.
    seed_32bit = torch.initial_seed() % (2 ** 32)
    random.seed(seed_32bit)
    np.random.seed(seed_32bit)

seed_everything(seed=random_seed) # Fix the seed

In [4]:
# Load the train and test CSV tables and display their shapes.
train = pd.read_csv("./data/df_train6.csv")
test = pd.read_csv("./data/df_test6.csv")
train.shape, test.shape

((193, 33), (175, 33))

In [5]:
# Show the column names of the training table.
train.columns

Index(['id', 'SNP_01', 'SNP_02', 'SNP_03', 'SNP_04', 'SNP_05', 'SNP_06',
       'SNP_07', 'SNP_08', 'SNP_09', 'SNP_10', 'SNP_11', 'SNP_12', 'SNP_13',
       'SNP_14', 'SNP_15', 'class', 'SNP_01_ratio', 'SNP_02_ratio',
       'SNP_03_ratio', 'SNP_04_ratio', 'SNP_05_ratio', 'SNP_06_ratio',
       'SNP_07_ratio', 'SNP_08_ratio', 'SNP_09_ratio', 'SNP_10_ratio',
       'SNP_11_ratio', 'SNP_12_ratio', 'SNP_13_ratio', 'SNP_14_ratio',
       'SNP_15_ratio', 'trait_ratio'],
      dtype='object')

In [6]:
# Show the column names of the test table (for comparison with the training table).
test.columns

Index(['id', 'SNP_01', 'SNP_02', 'SNP_03', 'SNP_04', 'SNP_05', 'SNP_06',
       'SNP_07', 'SNP_08', 'SNP_09', 'SNP_10', 'SNP_11', 'SNP_12', 'SNP_13',
       'SNP_14', 'SNP_15', 'class', 'SNP_01_ratio', 'SNP_02_ratio',
       'SNP_03_ratio', 'SNP_04_ratio', 'SNP_05_ratio', 'SNP_06_ratio',
       'SNP_07_ratio', 'SNP_08_ratio', 'SNP_09_ratio', 'SNP_10_ratio',
       'SNP_11_ratio', 'SNP_12_ratio', 'SNP_13_ratio', 'SNP_14_ratio',
       'SNP_15_ratio', 'trait_ratio'],
      dtype='object')

In [7]:
# Labels come from the 'class' column; features are every column except 'id' and 'class'.
y = torch.Tensor(train['class'].to_numpy())
X = train.drop(columns=['id', 'class']).to_numpy()
X_test = test.drop(columns=['id', 'class']).to_numpy()

# Disabled experiment: SMOTE oversampling of the training set.
# strategy = {0:1000, 1:1000}
# strategy = 'auto'
# smote = SMOTE(random_state=random_seed, k_neighbors=4, sampling_strategy=strategy)
# X, y = smote.fit_resample(X, y)

# Move the label tensor to the selected device and display it.
y = y.to(device)
y

tensor([0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 1., 0., 1., 0., 0., 0.,
        1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        1., 1., 1., 0., 0., 1., 1., 0., 1., 1., 1., 1., 0., 1., 1., 1., 0., 0.,
        0., 1., 0., 1., 0., 0., 1., 1., 1., 1., 1., 0., 0., 0., 1., 1., 0., 0.,
        1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0.,
        0., 1., 0., 0., 1., 1., 1., 0., 1., 0., 1., 1., 0., 0., 1., 1., 0., 0.,
        0., 1., 1., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 0., 1., 1., 0., 0.,
        0., 0., 0., 1., 0., 1., 0., 0., 1., 1., 0., 1., 0., 1., 0., 0., 1., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 1., 0., 1., 0., 1., 1., 1.,
        0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0.], device='cuda:0')

In [8]:
# Stack the train and test feature rows into one matrix (train rows first).
# NOTE(review): the test features therefore take part in autoencoder training -- confirm this is intended.
total = np.vstack((X, X_test))
total.shape

(368, 31)

In [9]:
class Encoder(nn.Module):
    """Fully connected encoder: n_features -> latent_dim**2 -> latent_dim*3 -> latent_dim.

    Each stage is Linear -> GELU -> Dropout(0.1). Despite the `lstm*`
    attribute names the stages are plain feed-forward blocks; the names are
    kept because they are the keys used in the module's state_dict.
    """

    def __init__(self, n_features, latent_dim):
        super().__init__()

        widths = (n_features, latent_dim**2, latent_dim*3, latent_dim)
        # Stages are created in order (lstm0, lstm1, lstm2) so that seeded
        # parameter initialisation is the same on every construction.
        for idx, (fan_in, fan_out) in enumerate(zip(widths[:-1], widths[1:])):
            stage = nn.Sequential(nn.Linear(fan_in, fan_out),
                                  nn.GELU(),
                                  nn.Dropout(0.1))
            setattr(self, f"lstm{idx}", stage)

    def forward(self, x):
        """Return the latent code produced by applying the three stages to `x`."""
        for stage in (self.lstm0, self.lstm1, self.lstm2):
            x = stage(x)

        return x


class Decoder(nn.Module):
    """Fully connected decoder mapping a latent code back to `n_features` values.

    Hidden widths: latent_dim -> latent_dim*3 -> latent_dim**2 -> latent_dim*2,
    each stage being Linear -> GELU -> Dropout(0.1), followed by a final
    Linear projection to `n_features` with no activation. The `lstm*`
    attribute names are kept because they are the keys used in the
    module's state_dict.
    """

    def __init__(self, n_features, latent_dim):
        super().__init__()
        self.latent_dim = latent_dim

        widths = (latent_dim, latent_dim*3, latent_dim**2, latent_dim*2)
        # Stages are created in order (lstm0, lstm1, lstm2), then the output
        # projection, so that seeded initialisation is the same on every
        # construction.
        for idx, (fan_in, fan_out) in enumerate(zip(widths[:-1], widths[1:])):
            stage = nn.Sequential(nn.Linear(fan_in, fan_out),
                                  nn.GELU(),
                                  nn.Dropout(0.1))
            setattr(self, f"lstm{idx}", stage)

        self.linear = nn.Linear(in_features=widths[-1], out_features=n_features)

    def forward(self, x):
        """Return the reconstruction computed from latent code `x`."""
        for stage in (self.lstm0, self.lstm1, self.lstm2):
            x = stage(x)

        return self.linear(x)


class AutoEncoder(nn.Module):
    """Encoder/decoder pair whose forward pass yields both the latent code and the reconstruction.

    Args:
        n_features: Width of the input (and reconstructed) feature vector.
        latent_dim: Width of the latent code; also determines the hidden widths.
        device: Passed to `.to(...)` on both sub-modules at construction time.
    """

    def __init__(self, n_features=31, latent_dim=7, device=None):
        super().__init__()

        dims = (n_features, latent_dim)
        self.encoder = Encoder(*dims).to(device)
        self.decoder = Decoder(*dims).to(device)

    def forward(self, x):
        """Return `(latent, reconstruction)` for input `x`."""
        latent = self.encoder(x)
        reconstruction = self.decoder(latent)

        return latent, reconstruction

In [10]:
# Training hyper-parameters. BATCH_SIZE equals the number of rows in `total`,
# i.e. every epoch is a single full-batch step.
CFG = dict(
    EPOCHS=5000,
    LEARNING_RATE=0.017,
    BATCH_SIZE=len(total),
)

In [11]:
def validation(model, criterion, test_loader, device):
    """Return the mean per-batch loss of `model` over `test_loader`.

    The model is switched to eval mode and evaluated without gradient
    tracking. Models whose forward pass returns a tuple or list (such as
    AutoEncoder, which returns `(latent, reconstruction)`) are supported:
    the last element is the one scored against the label.

    Args:
        model: Module to evaluate; left in eval mode on return.
        criterion: Callable `criterion(prediction, label)` returning a scalar tensor.
        test_loader: Iterable of `(x, label)` tensor pairs.
        device: Device the batches are moved to before the forward pass.

    Returns:
        `np.mean` of the per-batch loss values (NaN if the loader is empty).
    """
    model.eval()

    batch_losses = []

    with torch.no_grad():
        for x, label in test_loader:
            x, label = x.to(device), label.to(device)

            output = model(x)
            # A multi-output model cannot be passed to the criterion as-is;
            # score its last output (the reconstruction for AutoEncoder).
            if isinstance(output, (tuple, list)):
                output = output[-1]

            loss = criterion(output, label)

            batch_losses.append(loss.item())

    return np.mean(batch_losses)

In [12]:
# DataLoader definition
# The autoencoder is trained to reconstruct its own input, so a single float32
# tensor serves as both the input and the target of the dataset.
features = torch.from_numpy(total).float()
train_dataset = TensorDataset(features, features)

# A dedicated, explicitly seeded generator makes the shuffle order and the
# per-worker base seeds independent of how much of the global RNG stream has
# been consumed before this cell runs.
loader_rng = torch.Generator()
loader_rng.manual_seed(random_seed)
train_loader = DataLoader(
    train_dataset,
    batch_size=CFG['BATCH_SIZE'],
    shuffle=True,
    num_workers=2,
    worker_init_fn=seed_worker,
    generator=loader_rng,
)

# Model, loss, optimizer and scheduler
model = AutoEncoder().to(device)

criterion1 = nn.CrossEntropyLoss().to(device)  # not used in this cell; kept in case later cells reference it
criterion = nn.MSELoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=CFG['LEARNING_RATE'])  # Adam
# Warm-up covers the first half of all optimizer steps; the cosine schedule
# covers the remainder.
scheduler = get_cosine_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=len(train_loader) * int(CFG['EPOCHS']*0.5),
    num_training_steps=len(train_loader) * CFG['EPOCHS']
)


# The training loop only writes a checkpoint once the epoch training loss
# falls below this value, and then on every further improvement.
best_score = 0.1

# train
# cudnn.benchmark is intentionally NOT switched on here: seed_everything()
# disables it together with cudnn.deterministic = True, and re-enabling the
# auto-tuner would undo that deterministic configuration.

# Make sure the checkpoint directory exists before the first torch.save call.
checkpoint_path = './models/AutoEncoder_total.pt'
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

for epoch in range(1,CFG['EPOCHS']+1):
    model.train()
    train_loss = []
    for x, label in train_loader:
        x, label = x.to(device), label.to(device)

        optimizer.zero_grad()

        # The model returns (latent, reconstruction); only the reconstruction is trained on.
        _, pred_vector = model(x)

        # Loss modules take (input, target): prediction first, label second.
        loss = criterion(pred_vector, label)

        loss.backward()
        optimizer.step()
        scheduler.step()  # the schedule is stepped once per optimizer step

        train_loss.append(loss.item())

    tr_loss = np.mean(train_loss)
    print()
    print(f'Epoch [{epoch}], Train Loss : [{tr_loss:.5f}]', end=" ")

    # Checkpoint whenever the epoch's mean training loss beats the best value so far.
    if best_score > tr_loss:
        print("- Model Saved!")
        torch.save(model.state_dict(), checkpoint_path)
        best_score = tr_loss


Epoch [1], Train Loss : [0.83121] 
Epoch [2], Train Loss : [0.83128] 
Epoch [3], Train Loss : [0.83137] 
Epoch [4], Train Loss : [0.83134] 
Epoch [5], Train Loss : [0.83098] 
Epoch [6], Train Loss : [0.83130] 
Epoch [7], Train Loss : [0.83074] 
Epoch [8], Train Loss : [0.83076] 
Epoch [9], Train Loss : [0.83036] 
Epoch [10], Train Loss : [0.83060] 
Epoch [11], Train Loss : [0.83007] 
Epoch [12], Train Loss : [0.83008] 
Epoch [13], Train Loss : [0.82982] 
Epoch [14], Train Loss : [0.82906] 
Epoch [15], Train Loss : [0.82910] 
Epoch [16], Train Loss : [0.82868] 
Epoch [17], Train Loss : [0.82858] 
Epoch [18], Train Loss : [0.82785] 
Epoch [19], Train Loss : [0.82762] 
Epoch [20], Train Loss : [0.82726] 
Epoch [21], Train Loss : [0.82666] 
Epoch [22], Train Loss : [0.82592] 
Epoch [23], Train Loss : [0.82553] 
Epoch [24], Train Loss : [0.82488] 
Epoch [25], Train Loss : [0.82475] 
Epoch [26], Train Loss : [0.82382] 
Epoch [27], Train Loss : [0.82302] 
Epoch [28], Train Loss : [0.82283] 
