# CONCEPT 

- 데이터를 증강하지 않고, A와 notA를 분류
- A 데이터를 제외하고, B&C 데이터만을 남겨두기
- B&C 데이터에 대한 Label Encoder 변수 추가
- B&C 데이터에 대한 AE 및 VAE Encoding 값 및 error 값 추가
- B&C 데이터에 대한 데이터 증강 유무에 따른 성능 확인

# STEP 01. 데이터를 증강하지 않고 A와 notA를 분류

In [57]:
import warnings
warnings.filterwarnings(action='ignore')

import os
import gc
import math
import random
import pickle
import pandas as pd
import numpy as np
import multiprocessing
from tqdm.auto import tqdm

from sklearn.metrics import f1_score

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset, sampler

from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from imblearn.over_sampling import SMOTE, SMOTENC, SMOTEN

In [58]:
# Pick the compute device. The second GPU (cuda:1) is preferred, but it is only
# selected when the machine really exposes more than one GPU; otherwise fall back
# to the first GPU, and to the CPU when CUDA is unavailable.
if torch.cuda.is_available():
    device = torch.device('cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0')
else:
    device = torch.device('cpu')
device

device(type='cuda', index=1)

In [59]:
# Load the raw train / test splits from ./data and print the train schema.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/train.csv', './data/test.csv')
)
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 262 entries, 0 to 261
Data columns (total 21 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      262 non-null    object
 1   father  262 non-null    int64 
 2   mother  262 non-null    int64 
 3   gender  262 non-null    int64 
 4   trait   262 non-null    int64 
 5   SNP_01  262 non-null    object
 6   SNP_02  262 non-null    object
 7   SNP_03  262 non-null    object
 8   SNP_04  262 non-null    object
 9   SNP_05  262 non-null    object
 10  SNP_06  262 non-null    object
 11  SNP_07  262 non-null    object
 12  SNP_08  262 non-null    object
 13  SNP_09  262 non-null    object
 14  SNP_10  262 non-null    object
 15  SNP_11  262 non-null    object
 16  SNP_12  262 non-null    object
 17  SNP_13  262 non-null    object
 18  SNP_14  262 non-null    object
 19  SNP_15  262 non-null    object
 20  class   262 non-null    object
dtypes: int64(4), object(17)
memory usage: 43.1+ KB


In [60]:
# Number of training rows per value of the `class` label.
df_train['class'].value_counts()

B    114
C     79
A     69
Name: class, dtype: int64

In [61]:
# Column names, non-null counts and dtypes of the test split.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175 entries, 0 to 174
Data columns (total 20 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      175 non-null    object
 1   father  175 non-null    int64 
 2   mother  175 non-null    int64 
 3   gender  175 non-null    int64 
 4   trait   175 non-null    int64 
 5   SNP_01  175 non-null    object
 6   SNP_02  175 non-null    object
 7   SNP_03  175 non-null    object
 8   SNP_04  175 non-null    object
 9   SNP_05  175 non-null    object
 10  SNP_06  175 non-null    object
 11  SNP_07  175 non-null    object
 12  SNP_08  175 non-null    object
 13  SNP_09  175 non-null    object
 14  SNP_10  175 non-null    object
 15  SNP_11  175 non-null    object
 16  SNP_12  175 non-null    object
 17  SNP_13  175 non-null    object
 18  SNP_14  175 non-null    object
 19  SNP_15  175 non-null    object
dtypes: int64(4), object(16)
memory usage: 27.5+ KB


In [62]:
# Summary statistics of the numeric training columns.
df_train.describe()

Unnamed: 0,father,mother,gender,trait
count,262.0,262.0,262.0,262.0
mean,0.0,0.0,0.0,1.736641
std,0.0,0.0,0.0,0.441298
min,0.0,0.0,0.0,1.0
25%,0.0,0.0,0.0,1.0
50%,0.0,0.0,0.0,2.0
75%,0.0,0.0,0.0,2.0
max,0.0,0.0,0.0,2.0


In [63]:
# Summary statistics of the numeric test columns.
df_test.describe()

Unnamed: 0,father,mother,gender,trait
count,175.0,175.0,175.0,175.0
mean,0.0,0.0,0.0,1.708571
std,0.0,0.0,0.0,0.455724
min,0.0,0.0,0.0,1.0
25%,0.0,0.0,0.0,1.0
50%,0.0,0.0,0.0,2.0
75%,0.0,0.0,0.0,2.0
max,0.0,0.0,0.0,2.0


In [64]:
# `trait` has a correlation coefficient of 1 with label A, which is so strong that
# the influence of the other variables would not be learned, so it is excluded here
# (together with the other listed columns).
excluded_cols = ['father', 'mother', 'gender','trait','SNP_06']
df_train2 = df_train.drop(columns=excluded_cols)
df_test2 = df_test.drop(columns=excluded_cols)

# Features are every column between `id` (first) and `class` (last);
# the binary target is 1 for class A and 0 otherwise.
X = df_train2.iloc[:, 1:-1]
y = (df_train2['class'].values == 'A').astype(int)

# Resampled copy of the same data produced by SMOTEN with default settings.
smote = SMOTEN()
X2, y2 = smote.fit_resample(X, y)

In [65]:
def catgbmc(inputX, inputY, validX, validY, params) :
    """Fit a GPU CatBoostClassifier and print its macro-F1 on a validation set.

    Every column of ``inputX`` is declared as a categorical feature.

    Args:
        inputX: Training features (a DataFrame; its column names are used).
        inputY: Training labels.
        validX: Features used only for the printed score.
        validY: Labels used only for the printed score.
        params: Extra keyword arguments forwarded to ``CatBoostClassifier``.

    Returns:
        The fitted ``CatBoostClassifier``.

    Note:
        ``eval_set`` is the training data itself, not ``validX``/``validY``.
    """
    categorical_names = inputX.columns.tolist()
    classifier = CatBoostClassifier(
        cat_features=categorical_names,
        **params,
        task_type='GPU',
        devices='0',
    )
    classifier.fit(inputX, inputY, eval_set=(inputX, inputY))

    macro_f1 = f1_score(validY, classifier.predict(validX), average='macro')
    print(macro_f1)

    return classifier

In [66]:
# CatBoost settings shared by both A-vs-notA classifiers
# (the 'grow_policy': 'Depthwise' option stays disabled).
params = dict(
    iterations=100,
    learning_rate=0.1,
    l2_leaf_reg=0,
    verbose=0,
    random_seed=0,
)

# model1 is fit on the SMOTEN-resampled data, model2 on the original data.
# NOTE: both are scored on (X, y), i.e. the original training rows, so the
# printed F1 is not a held-out estimate.
model1 = catgbmc(X2, y2, X, y, params)
model2 = catgbmc(X, y, X, y, params)

1.0
1.0


In [67]:
# Feature importances of model1, sorted from most to least important.
pd.DataFrame(data=model1.get_feature_importance(), index=model1.feature_names_, columns=['feature_importance']).sort_values('feature_importance', ascending=False)

Unnamed: 0,feature_importance
SNP_10,33.877307
SNP_07,26.358528
SNP_12,18.584903
SNP_08,11.123008
SNP_04,6.514777
SNP_03,1.997152
SNP_01,1.018393
SNP_09,0.525932
SNP_02,0.0
SNP_05,0.0


In [68]:
# Feature importances of model2, sorted from most to least important.
pd.DataFrame(data=model2.get_feature_importance(), index=model2.feature_names_, columns=['feature_importance']).sort_values('feature_importance', ascending=False)

Unnamed: 0,feature_importance
SNP_07,23.641925
SNP_03,18.978285
SNP_12,15.133577
SNP_14,9.732432
SNP_11,9.523872
SNP_01,8.971842
SNP_13,8.68642
SNP_04,2.906656
SNP_15,2.424992
SNP_02,0.0


In [69]:
# Macro-F1 of model1 on the original training rows when a row is called "A"
# only if its predicted class-1 probability exceeds 0.97.
pred = model1.predict_proba(X)[:,1] > 0.97
score = f1_score(y, pred, average='macro')
print(score) 

0.9950583752994211


In [70]:
# Macro-F1 of model2 on the original training rows when a row is called "A"
# only if its predicted class-1 probability exceeds 0.93.
pred = model2.predict_proba(X)[:,1] > 0.93
score = f1_score(y, pred, average='macro')
print(score) 

1.0


- 본래의 성능을 해치지 않는 선에서 threshold를 최대로 높여보기

In [71]:
# Apply model1 with the 0.97 threshold to the test features (all columns after
# `id`) and count how many test rows are flagged as class A.
pred1 = model1.predict_proba(df_test2.iloc[:, 1:])[:,1] > 0.97
sum(pred1)

45

In [72]:
# Apply model2 with the 0.93 threshold to the test features (all columns after
# `id`) and count how many test rows are flagged as class A.
pred2 = model2.predict_proba(df_test2.iloc[:, 1:])[:,1] > 0.93
sum(pred2)

48

In [74]:
# Build the B/C-only working frames: training rows whose label is not A, and
# test rows that model1 (pred1) did NOT flag as A. The four listed columns are
# dropped from both frames and the index is renumbered from 0.
dropped_cols = ['father', 'mother', 'gender','trait']
train_not_a = df_train['class'] != 'A'
test_not_a = ~pred1.astype(bool)

train2 = df_train[train_not_a].copy().reset_index(drop=True)
train2 = train2.drop(columns=dropped_cols)
test2 = df_test[test_not_a].copy().reset_index(drop=True)
test2 = test2.drop(columns=dropped_cols)
test2

Unnamed: 0,id,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,SNP_10,SNP_11,SNP_12,SNP_13,SNP_14,SNP_15
0,TEST_001,G G,A G,C C,G G,C C,A A,A A,A A,A A,G G,A G,A A,A A,A A,A A
1,TEST_002,G G,A G,A A,A A,C A,A G,A A,A A,A A,A G,A A,G A,G G,A A,G G
2,TEST_003,G G,A G,C A,A A,C C,A A,A A,A A,A A,G G,A A,G A,A G,A A,A A
3,TEST_005,G G,A G,C A,A A,C A,A A,A A,G A,A A,G G,G G,A A,A G,A A,A A
4,TEST_006,A A,A G,C A,A A,C A,A G,A A,A A,A A,A G,A G,G A,A G,A A,G A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,TEST_170,A G,G G,C C,A A,C A,A G,A A,G G,A A,G G,G G,A A,A A,A A,G A
126,TEST_171,G G,A A,A A,A A,C A,A G,A A,A A,A A,A G,A A,A A,A G,A A,G A
127,TEST_172,G G,A A,A A,A A,C A,A G,A A,A A,A A,G G,A G,A A,A G,A A,G G
128,TEST_173,A G,G G,C A,G A,C C,G G,A A,G A,A A,G G,A G,A A,A A,A A,A A


### Summary 01
- 이전의 과정들에서 수차례 증명되었듯, A와 notA를 분류하는 것은 아주 쉽다.  
 (기본적으로 trait 변수 하나만으로도 기계적인 분류가 가능하다)
- 때문에 우선적으로 A와 notA를 분류해놓고, B와 C만이 존재하는 데이터를 대상으로 파생변수 생성, 증강 등의 과정을 진행한다.

# STEP 02. 파생변수 추가

## 1) B와 C에 대한 Target Labeling

In [75]:
# Check which one-hot columns of the categorical variables exist only in
# train or only in test.

train3 = pd.get_dummies(train2.iloc[:, 1:-1]).copy()
test3 = pd.get_dummies(test2.iloc[:, 1:]).copy()

target1 = train3.columns.tolist()
target2 = test3.columns.tolist()
only_in_train = [name for name in target1 if name not in target2]
only_in_test = [name for name in target2 if name not in target1]
only_in_train, only_in_test

(['SNP_09_G G'], ['SNP_07_G G'])

In [76]:
# Remove SNP_07 and SNP_09 from both frames, in place.
train2.drop(columns=['SNP_07', 'SNP_09'], inplace=True)
test2.drop(columns=['SNP_07', 'SNP_09'], inplace=True)

In [77]:
# Convert the text-valued categorical SNP columns to integer codes.
# The codes follow the sorted order of the genotypes seen in train:
# first -> 0, second -> 1, anything else -> 2.

for i in tqdm(range(1, 15+1)) :
    if i in (7, 9) :  # SNP_07 and SNP_09 are no longer in the frames
        continue
    target = f"{i:02d}"
    column = f"SNP_{target}"
    cols = sorted(train2[column].unique().tolist())

    def to_code(genotype, levels=cols) :
        if genotype == levels[0] :
            return 0
        if genotype == levels[1] :
            return 1
        return 2

    train2[column] = train2[column].map(to_code)
    test2[column] = test2[column].map(to_code)

train2.info(), test2.info()

  0%|          | 0/15 [00:00<?, ?it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      193 non-null    object
 1   SNP_01  193 non-null    int64 
 2   SNP_02  193 non-null    int64 
 3   SNP_03  193 non-null    int64 
 4   SNP_04  193 non-null    int64 
 5   SNP_05  193 non-null    int64 
 6   SNP_06  193 non-null    int64 
 7   SNP_08  193 non-null    int64 
 8   SNP_10  193 non-null    int64 
 9   SNP_11  193 non-null    int64 
 10  SNP_12  193 non-null    int64 
 11  SNP_13  193 non-null    int64 
 12  SNP_14  193 non-null    int64 
 13  SNP_15  193 non-null    int64 
 14  class   193 non-null    object
dtypes: int64(13), object(2)
memory usage: 22.7+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130 entries, 0 to 129
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      130 non-null    object
 1   SNP_01  130 n

(None, None)

In [78]:
# Target-ratio features: for every remaining SNP column, add `<SNP>_ratio`, the
# share of class B among the training rows that carry the same genotype code.
# NOTE: the ratio is computed on all training rows, including the row it is
# later attached to.
train2['class_B'] = train2['class'].map(lambda x : 1 if x=='B' else 0)
train2['class_C'] = train2['class'].map(lambda x : 1 if x=='C' else 0)

for i in range(1, 15+1) :
    if i in (7, 9) :  # SNP_07 and SNP_09 are no longer in the frames
        continue
    target = f"{i:02d}"
    # A list (not a bare tuple) is required to select several columns of a groupby.
    target2 = train2.groupby(f'SNP_{target}')[['class_B', 'class_C']].sum()
    target2['total'] = target2[['class_B', 'class_C']].sum(axis=1)

    # Genotypes with fewer than 3 samples are not trusted: force them to 50/50.
    # A single .loc assignment replaces the element-wise chained assignment.
    too_few = target2['total'] < 3
    target2.loc[too_few, ['class_B', 'class_C']] = 1

    value = target2['class_B'] / (target2['class_B']+target2['class_C'])
    # Codes 0..2 can occur in the encoded frames; a code never seen in train gets
    # the same neutral 0.5 instead of raising a KeyError on lookup.
    value = value.reindex(range(3), fill_value=0.5)

    train2[f"SNP_{target}_ratio"] = train2[f"SNP_{target}"].map(value)
    test2[f"SNP_{target}_ratio"] = test2[f"SNP_{target}"].map(value)

train2.drop(columns=['class_B', 'class_C'], inplace=True)
train2.info(), test2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 28 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            193 non-null    object 
 1   SNP_01        193 non-null    int64  
 2   SNP_02        193 non-null    int64  
 3   SNP_03        193 non-null    int64  
 4   SNP_04        193 non-null    int64  
 5   SNP_05        193 non-null    int64  
 6   SNP_06        193 non-null    int64  
 7   SNP_08        193 non-null    int64  
 8   SNP_10        193 non-null    int64  
 9   SNP_11        193 non-null    int64  
 10  SNP_12        193 non-null    int64  
 11  SNP_13        193 non-null    int64  
 12  SNP_14        193 non-null    int64  
 13  SNP_15        193 non-null    int64  
 14  class         193 non-null    object 
 15  SNP_01_ratio  193 non-null    float64
 16  SNP_02_ratio  193 non-null    float64
 17  SNP_03_ratio  193 non-null    float64
 18  SNP_04_ratio  193 non-null    

(None, None)

## 2) AE & VAE 정의

In [81]:
class Autoencoder(nn.Module):
    """Fully connected autoencoder: input_dim -> 64 -> 16 -> encoding_dim and back.

    Args:
        encoding_dim: Width of the bottleneck (size of the encoded vector).
        input_dim: Number of input features. Defaults to 26, the width that was
            previously hard-coded, so existing ``Autoencoder(encoding_dim)``
            calls build exactly the same network.
    """

    def __init__(self, encoding_dim, input_dim=26):
        super().__init__()
        self.encoding_dim = encoding_dim
        self.input_dim = input_dim
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.GELU(),
            nn.Linear(64, 16),
            nn.GELU(),
            nn.Linear(16, encoding_dim),
            nn.GELU()  # the code itself is passed through GELU as well
        )
        # The decoder ends with a plain linear layer (no activation on the output).
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, 16),
            nn.GELU(),
            nn.Linear(16, 64),
            nn.GELU(),
            nn.Linear(64, input_dim)
        )

    def forward(self, x):
        """Return ``(encoding, reconstruction)`` for the batch ``x``."""
        x1 = self.encoder(x)
        x2 = self.decoder(x1)
        return x1, x2

def ae_train(model, data_loader, criterion, optimizer, device, epochs=10):
    """Train an autoencoder-style model and report the mean batch loss per epoch.

    Args:
        model: Module whose forward returns ``(encoding, reconstruction)``.
        data_loader: Loader whose batches are sequences; element 0 is the input
            tensor (e.g. a loader over a ``TensorDataset``).
        criterion: Loss called as ``criterion(reconstruction, input)``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        device: Device the model and every batch are moved to.
        epochs: Number of passes over ``data_loader``.

    Returns:
        List with the mean batch loss of every epoch (the values that are printed).
    """
    model.to(device)
    # Make sure train-mode layers (dropout, batch-norm) behave correctly even if
    # the model was switched to eval mode earlier.
    model.train()
    history = []
    for epoch in range(epochs):
        epoch_loss = 0
        for x in data_loader:
            x = x[0].to(device)
            _, x_hat = model(x)
            loss = criterion(x_hat, x)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        mean_loss = epoch_loss / len(data_loader)
        history.append(mean_loss)
        print(f'Epoch {epoch}: loss = {mean_loss:.4f}')
    return history

In [82]:
# Smoke-test the autoencoder on random data that has as many columns as train2
# has once `id` and `class` are excluded.
X = torch.randn(320, train2.shape[1]-2)

# Wrap the tensor in a dataset and a shuffling loader.
dataset = TensorDataset(X)
data_loader = DataLoader(dataset, batch_size=32, shuffle=True)

# Model, loss and optimizer.
encoding_dim = 6
model = Autoencoder(encoding_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters())

# Run the default number of epochs.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
ae_train(model, data_loader, criterion, optimizer, device)

Epoch 0: loss = 0.9920
Epoch 1: loss = 0.9879
Epoch 2: loss = 0.9853
Epoch 3: loss = 0.9831
Epoch 4: loss = 0.9783
Epoch 5: loss = 0.9692
Epoch 6: loss = 0.9536
Epoch 7: loss = 0.9345
Epoch 8: loss = 0.9152
Epoch 9: loss = 0.8981


In [83]:
class VAE(nn.Module):
    """Variational autoencoder built from fully connected ReLU layers.

    The encoder maps input_dim -> 2*input_dim -> input_dim -> input_dim//2; two
    linear heads produce the latent mean and log-variance; the decoder mirrors
    the encoder and ends with a linear layer of width input_dim.

    Args:
        input_dim: Number of input features.
        latent_dim: Size of the latent vector.
    """

    def __init__(self, input_dim, latent_dim):
        super().__init__()
        self.input_dim = input_dim
        self.latent_dim = latent_dim

        wide = input_dim * 2
        narrow = input_dim // 2

        self.encoder = nn.Sequential(
            nn.Linear(input_dim, wide),
            nn.ReLU(),
            nn.Linear(wide, input_dim),
            nn.ReLU(),
            nn.Linear(input_dim, narrow),
            nn.ReLU(),
        )
        # Separate heads for the mean and the log-variance of q(z|x).
        self.fc_mu = nn.Linear(narrow, latent_dim)
        self.fc_logvar = nn.Linear(narrow, latent_dim)
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, narrow),
            nn.ReLU(),
            nn.Linear(narrow, input_dim),
            nn.ReLU(),
            nn.Linear(input_dim, wide),
            nn.ReLU(),
            nn.Linear(wide, input_dim),
        )

    def encode(self, x):
        """Return ``(mu, logvar)`` for the batch ``x``."""
        hidden = self.encoder(x)
        mu = self.fc_mu(hidden)
        logvar = self.fc_logvar(hidden)
        return mu, logvar

    def reparameterize(self, mu, logvar):
        """Sample ``z = mu + eps * std`` with ``eps`` drawn from N(0, 1)."""
        sigma = torch.exp(0.5*logvar)
        noise = torch.randn_like(sigma)
        return mu + noise*sigma

    def decode(self, z):
        """Map latent vectors back to input space."""
        return self.decoder(z)

    def forward(self, x):
        """Return ``(reconstruction, mu, logvar)`` for the batch ``x``."""
        mu, logvar = self.encode(x)
        latent = self.reparameterize(mu, logvar)
        reconstruction = self.decode(latent)
        return reconstruction, mu, logvar

def loss_fn(recon_x, x, mu, logvar):
    """Return the VAE loss: summed squared reconstruction error plus KL term.

    Both terms are summed (not averaged) over every element of the batch.
    """
    recon_term = F.mse_loss(recon_x, x, reduction='sum')
    # KL divergence between N(mu, exp(logvar)) and the standard normal.
    kl_elements = 1 + logvar - mu.pow(2) - logvar.exp()
    kl_term = -0.5 * torch.sum(kl_elements)
    return recon_term + kl_term

def train(model, optimizer, train_loader, device):
    """Run one training pass of the VAE over ``train_loader``.

    Each batch yielded by the loader is moved to ``device`` as a whole, so the
    loader must yield tensors directly.

    Returns:
        The summed ``loss_fn`` values divided by the number of dataset samples.
    """
    model.train()
    running_total = 0
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        reconstruction, mu, logvar = model(batch)
        batch_loss = loss_fn(reconstruction, batch, mu, logvar)
        batch_loss.backward()
        running_total += batch_loss.item()

        optimizer.step()

    sample_count = len(train_loader.dataset)
    return running_total / sample_count

def test(model, test_loader, device):
    """Evaluate the VAE on ``test_loader`` without tracking gradients.

    Returns:
        The summed ``loss_fn`` values divided by the number of dataset samples.
    """
    model.eval()

    def batch_loss(batch):
        batch = batch.to(device)
        reconstruction, mu, logvar = model(batch)
        return loss_fn(reconstruction, batch, mu, logvar).item()

    with torch.no_grad():
        total = sum(batch_loss(batch) for batch in test_loader)
    return total / len(test_loader.dataset)

In [84]:
# Smoke-test the VAE on random data.
# train2 carries two non-feature columns (`id` and `class`), so the feature
# width is shape[1]-2, the same width used for the autoencoder smoke test and
# for the real VAE training data.
input_dim = train2.shape[1]-2
latent_dim = 8
batch_size = 32
num_epochs = 10

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_data = torch.randn(100, input_dim)
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size)
test_data = torch.randn(10, input_dim)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size)

model = VAE(input_dim, latent_dim).to(device)
optimizer = torch.optim.Adam(model.parameters())

for epoch in range(num_epochs):
    train_loss = train(model, optimizer, train_loader, device)
    test_loss = test(model, test_loader, device)
    print(f"Epoch {epoch}: Train loss = {train_loss}, Test loss = {test_loss}")

Epoch 0: Train loss = 26.811466903686522, Test loss = 25.53804168701172
Epoch 1: Train loss = 26.729301300048828, Test loss = 25.714266967773437
Epoch 2: Train loss = 26.62572006225586, Test loss = 25.841311645507812
Epoch 3: Train loss = 26.589081420898438, Test loss = 25.58756866455078
Epoch 4: Train loss = 26.556602325439453, Test loss = 25.871975708007813
Epoch 5: Train loss = 26.488694534301757, Test loss = 25.827731323242187
Epoch 6: Train loss = 26.453925857543947, Test loss = 25.74207458496094
Epoch 7: Train loss = 26.453749160766602, Test loss = 25.704693603515626
Epoch 8: Train loss = 26.398755950927736, Test loss = 25.678756713867188
Epoch 9: Train loss = 26.38263229370117, Test loss = 25.709365844726562


## 3) AE & VAE 학습 및 변수 추가

### AutoEncoder

In [85]:
# Fit the autoencoder on the real B/C training features
# (every train2 column except `id` and `class`).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
feature_frame = train2.drop(columns=['id', 'class'])
X = torch.Tensor(feature_frame.to_numpy())

# Dataset and shuffling loader with small batches.
dataset = TensorDataset(X)
data_loader = DataLoader(dataset, batch_size=8, shuffle=True)

# Model, loss, optimizer and training length.
encoding_dim = 8
model = Autoencoder(encoding_dim)
criterion = nn.MSELoss().to(device)
optimizer = optim.Adam(model.parameters())
epochs=1000

ae_train(model, data_loader, criterion, optimizer, device, epochs)

Epoch 0: loss = 0.7983
Epoch 1: loss = 0.4548
Epoch 2: loss = 0.2218
Epoch 3: loss = 0.1937
Epoch 4: loss = 0.1897
Epoch 5: loss = 0.1883
Epoch 6: loss = 0.1867
Epoch 7: loss = 0.1816
Epoch 8: loss = 0.1810
Epoch 9: loss = 0.1801
Epoch 10: loss = 0.1710
Epoch 11: loss = 0.1609
Epoch 12: loss = 0.1461
Epoch 13: loss = 0.1403
Epoch 14: loss = 0.1362
Epoch 15: loss = 0.1347
Epoch 16: loss = 0.1294
Epoch 17: loss = 0.1295
Epoch 18: loss = 0.1280
Epoch 19: loss = 0.1269
Epoch 20: loss = 0.1234
Epoch 21: loss = 0.1187
Epoch 22: loss = 0.1194
Epoch 23: loss = 0.1123
Epoch 24: loss = 0.1069
Epoch 25: loss = 0.1073
Epoch 26: loss = 0.1064
Epoch 27: loss = 0.1000
Epoch 28: loss = 0.0940
Epoch 29: loss = 0.0940
Epoch 30: loss = 0.0941
Epoch 31: loss = 0.0937
Epoch 32: loss = 0.0897
Epoch 33: loss = 0.0900
Epoch 34: loss = 0.0917
Epoch 35: loss = 0.0896
Epoch 36: loss = 0.0895
Epoch 37: loss = 0.0895
Epoch 38: loss = 0.0836
Epoch 39: loss = 0.0840
Epoch 40: loss = 0.0816
Epoch 41: loss = 0.0799
Ep

In [86]:
# Turn the fitted autoencoder into features: the per-row reconstruction error
# (`ae_loss`) and the encoded vector (`ae_0` .. `ae_k`) for train and test.
X1 = X.to(device)
X2 = torch.Tensor(test2.drop(columns=['id']).to_numpy()).to(device)

# Inference only: no gradient graph is needed for feature extraction.
model.eval()
with torch.no_grad():
    pred_train = model(X1)
    pred_test = model(X2)

    # reconstruction error: mean squared error over the features of each row
    loss_train = F.mse_loss(pred_train[1], X1, reduction='none').mean(dim=1).cpu().numpy()
    loss_test = F.mse_loss(pred_test[1], X2, reduction='none').mean(dim=1).cpu().numpy()

# encoding values
enco_train = pred_train[0].cpu().numpy()
enco_test = pred_test[0].cpu().numpy()

trainLoss = pd.DataFrame(data=loss_train, columns=['ae_loss'])
testLoss = pd.DataFrame(data=loss_test, columns=['ae_loss'])

# The encoding frames get their own names so that they do not overwrite the
# `ae_train` training function, which would break re-running the training cell.
ae_columns = ['ae_'+str(x) for x in range(enco_train.shape[1])]
ae_train_df = pd.DataFrame(data=enco_train, columns=ae_columns)
ae_test_df = pd.DataFrame(data=enco_test, columns=ae_columns)

train3 = pd.concat([train2, trainLoss, ae_train_df], axis=1)
test3 = pd.concat([test2, testLoss, ae_test_df], axis=1)

train3.info(), test3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 37 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            193 non-null    object 
 1   SNP_01        193 non-null    int64  
 2   SNP_02        193 non-null    int64  
 3   SNP_03        193 non-null    int64  
 4   SNP_04        193 non-null    int64  
 5   SNP_05        193 non-null    int64  
 6   SNP_06        193 non-null    int64  
 7   SNP_08        193 non-null    int64  
 8   SNP_10        193 non-null    int64  
 9   SNP_11        193 non-null    int64  
 10  SNP_12        193 non-null    int64  
 11  SNP_13        193 non-null    int64  
 12  SNP_14        193 non-null    int64  
 13  SNP_15        193 non-null    int64  
 14  class         193 non-null    object 
 15  SNP_01_ratio  193 non-null    float64
 16  SNP_02_ratio  193 non-null    float64
 17  SNP_03_ratio  193 non-null    float64
 18  SNP_04_ratio  193 non-null    

(None, None)

### Variational AutoEncoder

In [87]:
# Fit the VAE on the real B/C training features
# (every train2 column except `id` and `class`).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
X = torch.Tensor(train2.drop(columns=['id', 'class']).to_numpy())

input_dim = X.shape[1]
latent_dim = 8
batch_size = 32
num_epochs = 1000

# NOTE: the "test" loader iterates over the same tensor as the train loader
# (unshuffled), so the reported test loss is measured on the training data.
train_data = X
test_data = X
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

model = VAE(input_dim, latent_dim).to(device)
optimizer = torch.optim.Adam(model.parameters())

for epoch in range(num_epochs):
    train_loss = train(model, optimizer, train_loader, device)
    test_loss = test(model, test_loader, device)
    print(f"Epoch {epoch}: Train loss = {train_loss}, Test loss = {test_loss}")

Epoch 0: Train loss = 23.727877241342178, Test loss = 22.580428978322082
Epoch 1: Train loss = 21.924934426737572, Test loss = 20.800747569978547
Epoch 2: Train loss = 20.12178102305516, Test loss = 18.702853800719264
Epoch 3: Train loss = 17.56872311527865, Test loss = 15.712776999399452
Epoch 4: Train loss = 14.462522862488742, Test loss = 12.564648939537879
Epoch 5: Train loss = 11.421667983494892, Test loss = 10.033523984523635
Epoch 6: Train loss = 9.656345917153235, Test loss = 8.732481744005272
Epoch 7: Train loss = 8.501931331318277, Test loss = 7.944884749891845
Epoch 8: Train loss = 7.832404487491272, Test loss = 6.967996528111591
Epoch 9: Train loss = 7.018926067055816, Test loss = 7.021194781664122
Epoch 10: Train loss = 6.841889574120081, Test loss = 6.661000785432331
Epoch 11: Train loss = 6.446276911799772, Test loss = 6.420134371426439
Epoch 12: Train loss = 6.34838374785191, Test loss = 6.18799270867066
Epoch 13: Train loss = 6.227492269456695, Test loss = 6.2480466995

In [88]:
# Turn the fitted VAE into features: the per-row reconstruction error
# (`vae_loss`) and the latent mean vector (`vae_0` .. `vae_k`) for train and test.
X1 = X.to(device)
X2 = torch.Tensor(test2.drop(columns=['id']).to_numpy()).to(device)

# Inference only: no gradient graph is needed for feature extraction.
# NOTE(review): the forward pass samples z randomly, so the reconstruction
# error differs between runs even with fixed weights.
model.eval()
with torch.no_grad():
    pred_train = model(X1)
    pred_test = model(X2)

    # reconstruction error: mean squared error over the features of each row,
    # computed directly instead of through a `criterion` left over from another cell
    loss_train = F.mse_loss(pred_train[0], X1, reduction='none').mean(dim=1).cpu().numpy()
    loss_test = F.mse_loss(pred_test[0], X2, reduction='none').mean(dim=1).cpu().numpy()

# encoding values (element 1 of the VAE output is the latent mean)
enco_train = pred_train[1].cpu().numpy()
enco_test = pred_test[1].cpu().numpy()

trainLoss = pd.DataFrame(data=loss_train, columns=['vae_loss'])
testLoss = pd.DataFrame(data=loss_test, columns=['vae_loss'])

# The encoding frames get their own names so that they do not overwrite the
# `ae_train` training function.
vae_columns = ['vae_'+str(x) for x in range(enco_train.shape[1])]
vae_train_df = pd.DataFrame(data=enco_train, columns=vae_columns)
vae_test_df = pd.DataFrame(data=enco_test, columns=vae_columns)

train4 = pd.concat([train3, trainLoss, vae_train_df], axis=1)
test4 = pd.concat([test3, testLoss, vae_test_df], axis=1)

train4.info(), test4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 46 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            193 non-null    object 
 1   SNP_01        193 non-null    int64  
 2   SNP_02        193 non-null    int64  
 3   SNP_03        193 non-null    int64  
 4   SNP_04        193 non-null    int64  
 5   SNP_05        193 non-null    int64  
 6   SNP_06        193 non-null    int64  
 7   SNP_08        193 non-null    int64  
 8   SNP_10        193 non-null    int64  
 9   SNP_11        193 non-null    int64  
 10  SNP_12        193 non-null    int64  
 11  SNP_13        193 non-null    int64  
 12  SNP_14        193 non-null    int64  
 13  SNP_15        193 non-null    int64  
 14  class         193 non-null    object 
 15  SNP_01_ratio  193 non-null    float64
 16  SNP_02_ratio  193 non-null    float64
 17  SNP_03_ratio  193 non-null    float64
 18  SNP_04_ratio  193 non-null    

(None, None)

### Summary 02
- Target Labeling을 통해 13개의 파생변수를 생성 (SNP_07, SNP_09는 제외되어 SNP 변수는 13개)
- AE와 VAE를 통해 encoding한 값 8개와 reconstruction error 1개를 각기 추가해, 18개의 파생변수를 생성
- 총 31개의 파생변수가 추가되어 44개의 변수(SNP 13개 + 파생변수 31개)를 이용해 B와 C를 구분

In [89]:
# Write the feature-augmented train / test frames to ./data without the index column.
train4.to_csv("./data/train44.csv", index=False)
test4.to_csv("./data/test44.csv", index=False)

## Categorical Features
- SNP 변수들(1~13번째 칼럼)을 CATEGORY 타입으로 변환

In [92]:
# Reload the saved feature frames and mark the SNP code columns as categorical.
train = pd.read_csv("./data/train44.csv")
test = pd.read_csv("./data/test44.csv")

# Columns 1..13 are the SNP codes in both frames (column 0 is `id`).
# The cast is done by column name through astype: assigning through .iloc
# writes the values into the existing columns and can leave the dtype unchanged.
snp_columns = train.columns[1:14]
train = train.astype({column: 'category' for column in snp_columns})
test = test.astype({column: 'category' for column in snp_columns})
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 46 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   id            193 non-null    object  
 1   SNP_01        193 non-null    category
 2   SNP_02        193 non-null    category
 3   SNP_03        193 non-null    category
 4   SNP_04        193 non-null    category
 5   SNP_05        193 non-null    category
 6   SNP_06        193 non-null    category
 7   SNP_08        193 non-null    category
 8   SNP_10        193 non-null    category
 9   SNP_11        193 non-null    category
 10  SNP_12        193 non-null    category
 11  SNP_13        193 non-null    category
 12  SNP_14        193 non-null    category
 13  SNP_15        193 non-null    category
 14  class         193 non-null    object  
 15  SNP_01_ratio  193 non-null    float64 
 16  SNP_02_ratio  193 non-null    float64 
 17  SNP_03_ratio  193 non-null    float64 
 18  SNP_04_rat

In [93]:
# Column names, non-null counts and dtypes of the reloaded test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130 entries, 0 to 129
Data columns (total 45 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   id            130 non-null    object  
 1   SNP_01        130 non-null    category
 2   SNP_02        130 non-null    category
 3   SNP_03        130 non-null    category
 4   SNP_04        130 non-null    category
 5   SNP_05        130 non-null    category
 6   SNP_06        130 non-null    category
 7   SNP_08        130 non-null    category
 8   SNP_10        130 non-null    category
 9   SNP_11        130 non-null    category
 10  SNP_12        130 non-null    category
 11  SNP_13        130 non-null    category
 12  SNP_14        130 non-null    category
 13  SNP_15        130 non-null    category
 14  SNP_01_ratio  130 non-null    float64 
 15  SNP_02_ratio  130 non-null    float64 
 16  SNP_03_ratio  130 non-null    float64 
 17  SNP_04_ratio  130 non-null    float64 
 18  SNP_05_rat

# STEP 03. MODELING & VALIDATION
- 데이터 증강여부에 따른 성능향상 유무를 확인
- Classifier와 Regressor를 동시에 사용해 자체적인 ensemble 효과 추가 고려

## without Aug

In [94]:
import warnings
warnings.filterwarnings(action='ignore')

import os
import gc
import math
import random
import pickle
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold

from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from imblearn.over_sampling import SMOTE, SMOTENC, SMOTEN

In [95]:
# B-vs-C data: features are every column except `id` and `class`;
# the binary target is 1 for class B and 0 otherwise.
X, y = train.drop(columns=['id', 'class']), (train['class'].values == 'B').astype(int)

# Only the raw SNP code columns are categorical. The `SNP_xx_ratio` columns
# that follow them are continuous and must not be passed to SMOTENC as
# categorical, so the positions are derived from the column names.
categorical_idx = [
    position
    for position, column in enumerate(X.columns)
    if column.startswith('SNP_') and not column.endswith('_ratio')
]

# Oversample both classes to 1000 rows each.
strategy = {0:1000, 1:1000}
smote = SMOTENC(categorical_features=categorical_idx, sampling_strategy=strategy)
X2, y2 = smote.fit_resample(X, y)
X_test = test.drop(columns=['id'])
X2

Unnamed: 0,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_08,SNP_10,SNP_11,SNP_12,...,ae_7,vae_loss,vae_0,vae_1,vae_2,vae_3,vae_4,vae_5,vae_6,vae_7
0,2,1,0,1,1,0,2,2,1,0,...,5.486553,0.159097,0.026900,0.013049,0.013923,0.033346,-0.003520,0.019434,0.021099,-0.485469
1,1,1,1,0,0,1,1,1,0,1,...,2.338903,0.116361,-0.019236,-0.014687,-0.008318,0.005216,-0.012849,0.009623,0.006137,0.787507
2,2,2,0,1,2,2,1,1,0,0,...,1.768928,0.247609,0.009370,-0.001504,-0.000892,-0.003537,0.012318,-0.038352,0.048411,-0.931604
3,2,2,2,0,2,0,0,2,0,0,...,0.339235,0.252251,0.017040,0.011551,-0.009094,0.002984,0.004547,-0.016042,0.015548,-0.531253
4,2,2,1,0,2,0,1,2,0,0,...,2.536202,0.152769,0.010032,0.001370,0.002682,-0.001705,0.008241,-0.018646,0.002630,-0.752838
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,2,1,1,0,2,1,2,2,1,1,...,6.495145,0.111825,-0.003409,-0.016987,0.009645,0.002217,0.000824,0.000554,-0.012569,-0.053378
1996,2,1,1,1,2,1,1,2,1,0,...,4.159630,0.075199,0.006012,-0.001055,0.006392,-0.002621,0.010449,-0.022174,0.008716,-0.915039
1997,2,2,1,1,2,1,2,2,0,0,...,4.411622,0.100652,0.006155,-0.004872,0.012770,-0.000462,0.011297,-0.015483,-0.001564,-0.901721
1998,2,1,1,0,2,1,0,2,1,0,...,5.485061,0.153012,0.019608,0.007032,0.002611,0.011005,0.009639,-0.004662,0.017610,-0.533441


In [96]:
# Labels returned by the SMOTENC resampling.
y2

array([1, 0, 1, ..., 1, 1, 1])

In [97]:
# Test features (the reloaded test frame without `id`).
X_test

Unnamed: 0,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_08,SNP_10,SNP_11,SNP_12,...,ae_7,vae_loss,vae_0,vae_1,vae_2,vae_3,vae_4,vae_5,vae_6,vae_7
0,2,1,2,2,2,0,0,2,1,0,...,4.125077,0.259566,0.007233,0.000595,0.004058,0.001307,0.006137,-0.016166,0.006052,-0.829781
1,2,1,0,0,1,1,0,1,0,1,...,0.291936,0.179799,0.005518,0.003180,-0.014739,-0.000426,-0.009431,0.000003,0.008960,1.387121
2,2,1,1,0,2,0,0,2,0,1,...,2.690188,0.193094,0.023213,0.009246,0.000727,0.014192,0.000707,0.001943,0.010248,-0.156940
3,2,1,1,0,1,0,1,2,2,0,...,7.017147,0.169372,0.013033,-0.000339,-0.001256,0.008897,0.001023,-0.001493,0.005045,-0.240011
4,0,1,1,0,1,1,0,1,1,1,...,3.536815,0.297773,0.028503,0.033139,-0.017028,-0.007630,0.010288,0.000265,-0.009831,1.565095
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,1,2,2,0,1,1,2,2,2,0,...,7.183743,0.310557,0.005143,0.002828,-0.001291,-0.006450,0.003664,-0.021391,-0.010427,-0.808265
126,2,0,0,0,1,1,0,1,0,0,...,0.695514,0.083908,0.005572,-0.004586,-0.011555,-0.000225,-0.008561,-0.004601,0.020345,1.381729
127,2,0,0,0,1,1,0,2,1,0,...,3.456507,0.257170,0.011699,0.001440,-0.015009,-0.006651,0.002078,-0.006705,-0.005611,0.888649
128,1,2,1,1,2,2,1,2,1,0,...,4.759387,0.149449,-0.001189,-0.005570,0.003619,-0.012381,0.009763,-0.039792,0.015342,-1.128698


In [102]:
def catgbmc(inputX, inputY, validX, validY, params) :
    """Fit a GPU CatBoostClassifier and print its macro-F1 on a validation set.

    Only the first 13 columns of ``inputX`` are declared as categorical features.

    Args:
        inputX: Training features (a DataFrame; its column names are used).
        inputY: Training labels.
        validX: Features used only for the printed score.
        validY: Labels used only for the printed score.
        params: Extra keyword arguments forwarded to ``CatBoostClassifier``
            (``l2_leaf_reg`` and ``random_state`` are not set here).

    Returns:
        The fitted ``CatBoostClassifier``.

    Note:
        ``eval_set`` is the training data itself, not ``validX``/``validY``.
    """
    categorical_names = inputX.columns.tolist()[:13]
    classifier = CatBoostClassifier(
        cat_features=categorical_names,
        **params,
        task_type='GPU',
        devices='0',
    )
    classifier.fit(inputX, inputY, eval_set=(inputX, inputY))

    macro_f1 = f1_score(validY, classifier.predict(validX), average='macro')
    print(macro_f1)

    return classifier

In [103]:
def catgbmr(inputX, inputY, validX, validY, params, n_categ=13):
    """Fit a CatBoostRegressor and print macro-F1 of its rounded predictions.

    Args:
        inputX: Training features (DataFrame). Its first ``n_categ`` columns
            are passed to CatBoost as ``cat_features``.
        inputY: Training targets.
        validX: Features used only to compute the printed score.
        validY: Labels matching ``validX``.
        params: Keyword arguments forwarded to ``CatBoostRegressor``. They
            may override the GPU defaults (``task_type``, ``devices``).
        n_categ: Number of leading columns of ``inputX`` treated as
            categorical. Defaults to 13, the value previously hard-coded.

    Returns:
        The fitted ``CatBoostRegressor``.
    """
    var_categ = inputX.columns.tolist()[:n_categ]

    # Defaults go first so the caller's params win. Passing these as explicit
    # keywords next to **params raised TypeError whenever params already
    # contained 'task_type' or 'devices'.
    settings = {'task_type': 'GPU', 'devices': '0', **params}
    model = CatBoostRegressor(cat_features=var_categ, **settings)

    # NOTE(review): eval_set is the training data itself, so validX/validY do
    # not influence fitting; they are only used for the score printed below.
    # Confirm this is intentional before switching to (validX, validY).
    model.fit(
        inputX, inputY,
        eval_set=(inputX, inputY),
    )

    pred = model.predict(validX)
    # Continuous outputs are rounded to the nearest integer to act as labels.
    # NOTE(review): rounding is not clipped, so a prediction far outside the
    # label range would appear as an extra class in the macro average.
    score = f1_score(validY, np.round(pred), average='macro')
    print(score)

    return model

In [105]:
# CatBoost settings shared by both classifiers fitted below.
# Disabled alternatives kept for reference:
#   l2_leaf_reg=10, auto_class_weights='Balanced', grow_policy='Depthwise'
params = dict(iterations=300, learning_rate=0.03, verbose=0)

# model3 is fitted on (X2, y2) and scored on (X, y).
# model4 is fitted and scored on the same (X, y), so its printed F1 is a
# training score rather than a held-out estimate.
model3 = catgbmc(X2, y2, X, y, params)
model4 = catgbmc(X, y, X, y, params)

1.0
1.0


In [108]:
# Rank model3's features by CatBoost importance, highest first.
(
    pd.DataFrame(
        data=model3.get_feature_importance(),
        index=model3.feature_names_,
        columns=['feature_importance'],
    )
    .sort_values('feature_importance', ascending=False)
)

Unnamed: 0,feature_importance
vae_7,37.267535
ae_2,9.938607
vae_2,9.817353
ae_5,3.28963
SNP_04,3.067442
ae_6,2.802314
SNP_15,2.773697
SNP_02_ratio,2.49593
SNP_04_ratio,2.244201
SNP_10_ratio,2.190313


In [109]:
# Rank model4's features by CatBoost importance, highest first.
(
    pd.DataFrame(
        data=model4.get_feature_importance(),
        index=model4.feature_names_,
        columns=['feature_importance'],
    )
    .sort_values('feature_importance', ascending=False)
)

Unnamed: 0,feature_importance
vae_7,42.077305
vae_2,7.498149
ae_5,6.934725
ae_2,3.908171
SNP_15_ratio,2.420094
SNP_05_ratio,2.357751
ae_1,2.082757
SNP_04_ratio,2.023048
SNP_08_ratio,1.969687
ae_3,1.70032


In [110]:
# Empty frame that the following cells fill with per-stage predictions
# and the final class answers.
total = pd.DataFrame()

# Reload the raw test file and drop the id/metadata columns plus SNP_06,
# leaving the feature frame that model1 scores.
test00 = pd.read_csv("./data/test.csv")
X_test1 = test00.drop(
    ['id', 'father', 'mother', 'gender', 'trait', 'SNP_06'],
    axis=1,
)
X_test1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175 entries, 0 to 174
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   SNP_01  175 non-null    object
 1   SNP_02  175 non-null    object
 2   SNP_03  175 non-null    object
 3   SNP_04  175 non-null    object
 4   SNP_05  175 non-null    object
 5   SNP_07  175 non-null    object
 6   SNP_08  175 non-null    object
 7   SNP_09  175 non-null    object
 8   SNP_10  175 non-null    object
 9   SNP_11  175 non-null    object
 10  SNP_12  175 non-null    object
 11  SNP_13  175 non-null    object
 12  SNP_14  175 non-null    object
 13  SNP_15  175 non-null    object
dtypes: object(14)
memory usage: 19.3+ KB


In [113]:
# Stage 1: a class-1 probability above 0.97 from model1 sets pred1 to 1
# (pred1 == 1 is mapped to class 'A' later in the notebook).
total['pred1'] = (model1.predict_proba(X_test1)[:, 1] > 0.97).astype(int)

# Stage 2: class-1 probabilities from both CatBoost models for X_test.
pred2 = model3.predict_proba(X_test)[:, 1]
pred3 = model4.predict_proba(X_test)[:, 1]

# Use float placeholders. Initialising with the int 0 and then writing float
# probabilities through .loc relies on an implicit int -> float upcast that
# pandas has deprecated and that will become an error.
total['pred2'] = 0.0
total['pred3'] = 0.0

# NOTE(review): this assignment is positional. It assumes X_test contains
# exactly the pred1 == 0 rows of the test set, in the same order; confirm
# against the cell that builds X_test.
total.loc[total.pred1 == 0, 'pred2'] = pred2
total.loc[total.pred1 == 0, 'pred3'] = pred3
total

Unnamed: 0,pred1,pred2,pred3
0,1,0.000000,0.000000
1,0,0.999828,0.996950
2,0,0.000463,0.007232
3,0,0.995427,0.967965
4,1,0.000000,0.000000
...,...,...,...
170,0,0.997832,0.994602
171,0,0.000279,0.003781
172,0,0.003883,0.017411
173,0,0.999792,0.998398


In [114]:
# Rows where model3's probability is at least 0.7 but model4's is below it.
total.loc[(total['pred2'] >= 0.7) & (total['pred3'] < 0.7)]

Unnamed: 0,pred1,pred2,pred3


In [115]:
# Rows where model4's probability is at least 0.7 but model3's is below it.
total.loc[(total['pred2'] < 0.7) & (total['pred3'] >= 0.7)]

Unnamed: 0,pred1,pred2,pred3
162,0,0.262244,0.936088


In [121]:
# Build three candidate answer columns from the two-stage predictions:
#   answer1 uses model3 (pred2), answer2 uses model4 (pred3), and
#   answer3 uses the element-wise mean of both probabilities.
# Rows with pred1 == 1 are class 'A'. Every other row becomes 'B' when its
# stage-2 probability exceeds 0.9 and 'C' otherwise.
#
# The labels are written directly as strings. The previous version stored
# a 'D' placeholder, overwrote it with ints 0/1 and mapped those back to
# letters, which mixes ints into a string column and is rejected by pandas'
# dedicated string dtype.
is_a = total['pred1'] == 1
stage2_probs = {
    'answer1': pred2,
    'answer2': pred3,
    'answer3': np.mean([pred2, pred3], axis=0),
}
for answer_col, prob in stage2_probs.items():
    total[answer_col] = 'A'
    # Positional assignment: prob must hold one value per non-A row.
    total.loc[~is_a, answer_col] = np.where(prob > 0.9, 'B', 'C')

total.answer1.value_counts()

B    84
C    46
A    45
Name: answer1, dtype: int64

In [122]:
# Class distribution when stage 2 uses model4 only.
total['answer2'].value_counts()

B    85
A    45
C    45
Name: answer2, dtype: int64

In [123]:
# Class distribution when stage 2 averages model3 and model4.
total['answer3'].value_counts()

B    83
C    47
A    45
Name: answer3, dtype: int64

In [124]:
# Spot-check a hand-picked set of rows (selected by position) across all
# prediction and answer columns.
# NOTE(review): why these particular rows were chosen is not visible here.
total.iloc[[3,5,12,119,126,162,168]]

Unnamed: 0,pred1,pred2,pred3,answer1,answer2,answer3
3,0,0.995427,0.967965,B,B,B
5,0,0.99414,0.976174,B,B,B
12,0,0.134592,0.228385,C,C,C
119,0,0.159007,0.501302,C,C,C
126,0,0.987937,0.989209,B,B,B
162,0,0.262244,0.936088,C,B,C
168,0,0.990872,0.987864,B,B,B


In [120]:
# Write the final submission using the averaged-model answers (answer3).
submit = pd.read_csv("./data/sample_submission.csv")

# Series assignment aligns on the index. If the two frames do not share the
# same index, the unmatched rows silently become NaN, so fail loudly instead
# of writing an incomplete submission.
submit['class'] = total['answer3']
if submit['class'].isna().any():
    raise ValueError(
        "submit and total are misaligned: some rows received no class"
    )

# A bare expression in the middle of a cell is evaluated and discarded.
# Print it so the class distribution is actually shown before saving.
print(submit['class'].value_counts())
submit.to_csv("./submit.csv", index=False)