# CONCEPT 

- Simple하게 하나의 모델로 전체 학습하도록
- 변수를 제거해가며 진행

# STEP 01

In [2]:
import warnings
warnings.filterwarnings(action='ignore')

import os
import gc
import math
import random
import pickle
import pandas as pd
import numpy as np
import multiprocessing
from tqdm.auto import tqdm

from sklearn.metrics import f1_score

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset, sampler

from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from imblearn.over_sampling import SMOTE, SMOTENC, SMOTEN

import shap
import catboost

In [3]:
device = torch.device('cuda:1') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda', index=1)

In [5]:
df_train = pd.read_csv('./data/train.csv').drop(columns=['id', 'father', 'mother', 'gender','trait']).drop_duplicates().reset_index(drop=True)
df_test = pd.read_csv('./data/test.csv').drop(columns=['id', 'father', 'mother', 'gender','trait'])              
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256 entries, 0 to 255
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   SNP_01  256 non-null    object
 1   SNP_02  256 non-null    object
 2   SNP_03  256 non-null    object
 3   SNP_04  256 non-null    object
 4   SNP_05  256 non-null    object
 5   SNP_06  256 non-null    object
 6   SNP_07  256 non-null    object
 7   SNP_08  256 non-null    object
 8   SNP_09  256 non-null    object
 9   SNP_10  256 non-null    object
 10  SNP_11  256 non-null    object
 11  SNP_12  256 non-null    object
 12  SNP_13  256 non-null    object
 13  SNP_14  256 non-null    object
 14  SNP_15  256 non-null    object
 15  class   256 non-null    object
dtypes: object(16)
memory usage: 32.1+ KB


In [6]:
# text 형태의 categorical 변수들을 숫자형태로 변경

for i in tqdm(range(1, 15+1)) :
    target = str(i) if i >= 10 else "0"+str(i)
    cols = sorted(df_train[f"SNP_{target}"].unique().tolist())
    df_train[f"SNP_{target}"] = df_train[f"SNP_{target}"].map(lambda x : 0 if x==cols[0] else (1 if x==cols[1] else 2))
    df_test[f"SNP_{target}"] = df_test[f"SNP_{target}"].map(lambda x : 0 if x==cols[0] else (1 if x==cols[1] else 2))

df_train.info(), df_test.info()

  0%|          | 0/15 [00:00<?, ?it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256 entries, 0 to 255
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   SNP_01  256 non-null    int64 
 1   SNP_02  256 non-null    int64 
 2   SNP_03  256 non-null    int64 
 3   SNP_04  256 non-null    int64 
 4   SNP_05  256 non-null    int64 
 5   SNP_06  256 non-null    int64 
 6   SNP_07  256 non-null    int64 
 7   SNP_08  256 non-null    int64 
 8   SNP_09  256 non-null    int64 
 9   SNP_10  256 non-null    int64 
 10  SNP_11  256 non-null    int64 
 11  SNP_12  256 non-null    int64 
 12  SNP_13  256 non-null    int64 
 13  SNP_14  256 non-null    int64 
 14  SNP_15  256 non-null    int64 
 15  class   256 non-null    object
dtypes: int64(15), object(1)
memory usage: 32.1+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175 entries, 0 to 174
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   SNP_01  175 non

(None, None)

In [7]:
df_train['class_B'] = df_train['class'].map(lambda x : 1 if x=='B' else 0)
df_train['class_C'] = df_train['class'].map(lambda x : 1 if x=='C' else 0)

for i in range(1, 15+1) :
    target = str(i) if i >= 10 else "0"+str(i)
    target2 = df_train.groupby(f'SNP_{target}')['class_B', 'class_C'].sum()
    target2['total'] = target2[['class_B', 'class_C']].sum(axis=1)
    for j in range(len(target2)) :
        if (target2['total'][j] < 3) : # 전체 표본이 너무 적은 경우 제외(확률 반반으로)
            target2['class_B'][j] = 1
            target2['class_C'][j] = 1
            
    value = target2['class_B'] / (target2['class_B']+target2['class_C'])    

    df_train[f"SNP_{target}_ratio"] = df_train[f"SNP_{target}"].map(lambda x : value[0] if x==0 else (value[1] if x==1 else value[2]))
    df_test[f"SNP_{target}_ratio"] = df_test[f"SNP_{target}"].map(lambda x : value[0] if x==0 else (value[1] if x==1 else value[2]))    

df_train.drop(columns=['class_B', 'class_C'], inplace=True)
df_train.info(), df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256 entries, 0 to 255
Data columns (total 31 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   SNP_01        256 non-null    int64  
 1   SNP_02        256 non-null    int64  
 2   SNP_03        256 non-null    int64  
 3   SNP_04        256 non-null    int64  
 4   SNP_05        256 non-null    int64  
 5   SNP_06        256 non-null    int64  
 6   SNP_07        256 non-null    int64  
 7   SNP_08        256 non-null    int64  
 8   SNP_09        256 non-null    int64  
 9   SNP_10        256 non-null    int64  
 10  SNP_11        256 non-null    int64  
 11  SNP_12        256 non-null    int64  
 12  SNP_13        256 non-null    int64  
 13  SNP_14        256 non-null    int64  
 14  SNP_15        256 non-null    int64  
 15  class         256 non-null    object 
 16  SNP_01_ratio  256 non-null    float64
 17  SNP_02_ratio  256 non-null    float64
 18  SNP_03_ratio  256 non-null    

(None, None)

In [8]:
class Autoencoder(nn.Module):
    def __init__(self, encoding_dim):
        super().__init__()
        self.encoding_dim = encoding_dim
        self.encoder = nn.Sequential(
            nn.Linear(30, 32),
            nn.GELU(),
            nn.Linear(32, 8),
            nn.GELU(),
            nn.Linear(8, encoding_dim),
            nn.GELU(),
        )
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, 8),
            nn.GELU(),
            nn.Linear(8, 32),
            nn.GELU(),
            nn.Linear(32, 30)
        )

    def forward(self, x):
        x1 = self.encoder(x)
        x2 = self.decoder(x1)
        return x1, x2

def ae_train(model, data_loader, criterion, optimizer, device, epochs=10):
    model.to(device)
    for epoch in range(epochs):
        epoch_loss = 0
        for x in data_loader:
            x = x[0].to(device)
            _, x_hat = model(x)
            loss = criterion(x_hat, x)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        print(f'Epoch {epoch}: loss = {epoch_loss / len(data_loader):.4f}')

In [9]:
class VAE(nn.Module):
    def __init__(self, input_dim, latent_dim):
        super(VAE, self).__init__()
        self.input_dim = input_dim
        self.latent_dim = latent_dim
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, input_dim*2),
            nn.ReLU(),
            nn.Linear(input_dim*2, input_dim),
            nn.ReLU(),
            nn.Linear(input_dim, input_dim//2),
            nn.ReLU()
            )
        self.fc_mu = nn.Linear(input_dim//2, latent_dim)
        self.fc_logvar = nn.Linear(input_dim//2, latent_dim)
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, input_dim//2),
            nn.ReLU(),
            nn.Linear(input_dim//2, input_dim),
            nn.ReLU(),
            nn.Linear(input_dim, input_dim*2),
            nn.ReLU(),
            nn.Linear(input_dim*2, input_dim)
        )

    def encode(self, x):
        h = self.encoder(x)
        return self.fc_mu(h), self.fc_logvar(h)

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5*logvar)
        eps = torch.randn_like(std)
        return mu + eps*std

    def decode(self, z):
        return self.decoder(z)

    def forward(self, x):
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar

def loss_fn(recon_x, x, mu, logvar):
    reconstruction_loss = F.mse_loss(recon_x, x, reduction='sum')
    kl_loss = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return reconstruction_loss + kl_loss

def train(model, optimizer, train_loader, device):
    model.train()
    train_loss = 0
    for data in train_loader:
        data = data.to(device)
        optimizer.zero_grad()
        recon_batch, mu, logvar = model(data)
        loss = loss_fn(recon_batch, data, mu, logvar)
        loss.backward()
        train_loss += loss.item()
        
        optimizer.step()
        
    return train_loss / len(train_loader.dataset)

def test(model, test_loader, device):
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for data in test_loader:
            data = data.to(device)
            recon_batch, mu, logvar = model(data)
            test_loss += loss_fn(recon_batch, data, mu, logvar).item()
    return test_loss / len(test_loader.dataset)

In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
X = torch.Tensor(df_train.drop(columns=['class']).to_numpy())

# Create a dataset and data loader
dataset = torch.utils.data.TensorDataset(X)
data_loader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=True)

# Initialize the model, criterion, and optimizer
encoding_dim = 5
model = Autoencoder(encoding_dim)
criterion = nn.MSELoss().to(device)
optimizer = optim.Adam(model.parameters())
epochs=1000

# Train the model
ae_train(model, data_loader, criterion, optimizer, device, epochs)

Epoch 0: loss = 0.7962
Epoch 1: loss = 0.4513
Epoch 2: loss = 0.2957
Epoch 3: loss = 0.2886
Epoch 4: loss = 0.2842
Epoch 5: loss = 0.2660
Epoch 6: loss = 0.2078
Epoch 7: loss = 0.1661
Epoch 8: loss = 0.1561
Epoch 9: loss = 0.1537
Epoch 10: loss = 0.1515
Epoch 11: loss = 0.1495
Epoch 12: loss = 0.1466
Epoch 13: loss = 0.1437
Epoch 14: loss = 0.1396
Epoch 15: loss = 0.1345
Epoch 16: loss = 0.1314
Epoch 17: loss = 0.1285
Epoch 18: loss = 0.1265
Epoch 19: loss = 0.1252
Epoch 20: loss = 0.1244
Epoch 21: loss = 0.1233
Epoch 22: loss = 0.1230
Epoch 23: loss = 0.1221
Epoch 24: loss = 0.1216
Epoch 25: loss = 0.1211
Epoch 26: loss = 0.1199
Epoch 27: loss = 0.1194
Epoch 28: loss = 0.1188
Epoch 29: loss = 0.1184
Epoch 30: loss = 0.1178
Epoch 31: loss = 0.1175
Epoch 32: loss = 0.1172
Epoch 33: loss = 0.1163
Epoch 34: loss = 0.1162
Epoch 35: loss = 0.1157
Epoch 36: loss = 0.1150
Epoch 37: loss = 0.1152
Epoch 38: loss = 0.1140
Epoch 39: loss = 0.1136
Epoch 40: loss = 0.1136
Epoch 41: loss = 0.1127
Ep

In [12]:
X1 = X.to(device)
X2 = torch.Tensor(df_test.to_numpy()).to(device)

pred_train = model(X1)
pred_test = model(X2)

# reconstruction error
loss_train = []
loss_test = []

for i in range(len(X1)) :
    loss = criterion(pred_train[1][i], X1[i]).detach().cpu().numpy()
    loss_train.append(loss)
for j in range(len(X2)) :
    loss = criterion(pred_test[1][j], X2[j]).detach().cpu().numpy()
    loss_test.append(loss)

# encoding values
enco_train = pred_train[0].detach().cpu().numpy()
enco_test = pred_test[0].detach().cpu().numpy()

trainLoss = pd.DataFrame(data=loss_train, columns=['ae_loss'])
testLoss = pd.DataFrame(data=loss_test, columns=['ae_loss'])

ae_train = pd.DataFrame(data=enco_train, columns=['ae_'+str(x) for x in range(enco_train.shape[1])])
ae_test = pd.DataFrame(data=enco_test, columns=['ae_'+str(x) for x in range(enco_train.shape[1])])

train2 = pd.concat([df_train, trainLoss, ae_train], axis=1)
test2 = pd.concat([df_test, testLoss, ae_test], axis=1)

train2.info(), test2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256 entries, 0 to 255
Data columns (total 37 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   SNP_01        256 non-null    int64  
 1   SNP_02        256 non-null    int64  
 2   SNP_03        256 non-null    int64  
 3   SNP_04        256 non-null    int64  
 4   SNP_05        256 non-null    int64  
 5   SNP_06        256 non-null    int64  
 6   SNP_07        256 non-null    int64  
 7   SNP_08        256 non-null    int64  
 8   SNP_09        256 non-null    int64  
 9   SNP_10        256 non-null    int64  
 10  SNP_11        256 non-null    int64  
 11  SNP_12        256 non-null    int64  
 12  SNP_13        256 non-null    int64  
 13  SNP_14        256 non-null    int64  
 14  SNP_15        256 non-null    int64  
 15  class         256 non-null    object 
 16  SNP_01_ratio  256 non-null    float64
 17  SNP_02_ratio  256 non-null    float64
 18  SNP_03_ratio  256 non-null    

(None, None)

In [21]:
torch.Tensor(df_train.drop(columns=['class']).to_numpy())

tensor([[2.0000, 1.0000, 0.0000,  ..., 0.8235, 0.6532, 0.6893],
        [1.0000, 1.0000, 1.0000,  ..., 0.3673, 0.6532, 0.6893],
        [2.0000, 2.0000, 0.0000,  ..., 0.8235, 0.6532, 0.6893],
        ...,
        [1.0000, 2.0000, 0.0000,  ..., 0.3673, 0.0000, 0.2727],
        [0.0000, 2.0000, 0.0000,  ..., 0.3673, 0.0000, 0.2727],
        [2.0000, 1.0000, 1.0000,  ..., 0.3673, 0.6532, 0.5000]])

In [23]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
X = torch.Tensor(df_train.drop(columns=['class']).to_numpy())
X[:, :15] -= 1

input_dim = X.shape[1]
latent_dim = 5
batch_size = 64
num_epochs = 1000

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_data = X
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_data = X
test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, shuffle=False)

model = VAE(input_dim, latent_dim).to(device)
optimizer = torch.optim.Adam(model.parameters())

for epoch in range(num_epochs):
    train_loss = train(model, optimizer, train_loader, device)
    test_loss = test(model, test_loader, device)
    print(f"Epoch {epoch}: Train loss = {train_loss}, Test loss = {test_loss}")

Epoch 0: Train loss = 15.179901599884033, Test loss = 14.786930084228516
Epoch 1: Train loss = 14.556907176971436, Test loss = 14.200539350509644
Epoch 2: Train loss = 13.963820934295654, Test loss = 13.56462812423706
Epoch 3: Train loss = 13.336028099060059, Test loss = 12.858952522277832
Epoch 4: Train loss = 12.586609363555908, Test loss = 12.138041973114014
Epoch 5: Train loss = 11.806891441345215, Test loss = 11.258839845657349
Epoch 6: Train loss = 10.916547536849976, Test loss = 10.530199527740479
Epoch 7: Train loss = 10.196338415145874, Test loss = 9.91169285774231
Epoch 8: Train loss = 9.948222160339355, Test loss = 9.76811146736145
Epoch 9: Train loss = 9.61073350906372, Test loss = 9.448060274124146
Epoch 10: Train loss = 9.419636487960815, Test loss = 9.467551946640015
Epoch 11: Train loss = 9.48453402519226, Test loss = 9.244006395339966
Epoch 12: Train loss = 9.252277135848999, Test loss = 9.286682605743408
Epoch 13: Train loss = 9.307894468307495, Test loss = 9.20409250

In [14]:
X1 = X.to(device)
X2 = torch.Tensor(df_test.to_numpy()).to(device)

pred_train = model(X1)
pred_test = model(X2)

# reconstruction error
loss_train = []
loss_test = []

for i in range(len(X1)) :
    loss = criterion(pred_train[0][i], X1[i]).detach().cpu().numpy()
    loss_train.append(loss)
for j in range(len(X2)) :
    loss = criterion(pred_test[0][j], X2[j]).detach().cpu().numpy()
    loss_test.append(loss)

# encoding values
enco_train = pred_train[1].detach().cpu().numpy()
enco_test = pred_test[1].detach().cpu().numpy()

trainLoss = pd.DataFrame(data=loss_train, columns=['vae_loss'])
testLoss = pd.DataFrame(data=loss_test, columns=['vae_loss'])

ae_train = pd.DataFrame(data=enco_train, columns=['vae_'+str(x) for x in range(enco_train.shape[1])])
ae_test = pd.DataFrame(data=enco_test, columns=['vae_'+str(x) for x in range(enco_train.shape[1])])

train3 = pd.concat([train2, trainLoss, ae_train], axis=1)
test3 = pd.concat([test2, testLoss, ae_test], axis=1)

train3.info(), test3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256 entries, 0 to 255
Data columns (total 43 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   SNP_01        256 non-null    int64  
 1   SNP_02        256 non-null    int64  
 2   SNP_03        256 non-null    int64  
 3   SNP_04        256 non-null    int64  
 4   SNP_05        256 non-null    int64  
 5   SNP_06        256 non-null    int64  
 6   SNP_07        256 non-null    int64  
 7   SNP_08        256 non-null    int64  
 8   SNP_09        256 non-null    int64  
 9   SNP_10        256 non-null    int64  
 10  SNP_11        256 non-null    int64  
 11  SNP_12        256 non-null    int64  
 12  SNP_13        256 non-null    int64  
 13  SNP_14        256 non-null    int64  
 14  SNP_15        256 non-null    int64  
 15  class         256 non-null    object 
 16  SNP_01_ratio  256 non-null    float64
 17  SNP_02_ratio  256 non-null    float64
 18  SNP_03_ratio  256 non-null    

(None, None)

In [15]:
train3.to_csv("train3.csv", index=False)
test3.to_csv("test3.csv", index=False)

In [16]:
train3

Unnamed: 0,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,SNP_10,...,ae_1,ae_2,ae_3,ae_4,vae_loss,vae_0,vae_1,vae_2,vae_3,vae_4
0,2,1,0,1,1,0,0,2,0,2,...,3.297604,1.756508,6.255674,3.642646,0.123947,0.019073,-0.022390,0.067796,0.021449,-0.673882
1,1,1,1,0,0,1,0,1,0,1,...,3.667223,4.182801,3.555612,2.159922,0.106692,0.012919,-0.008308,0.025512,-0.012970,0.276029
2,2,2,0,1,2,2,0,1,1,1,...,5.224575,-0.127179,3.630103,-0.128106,0.244127,-0.016602,0.021827,-0.010464,0.018495,-1.012079
3,0,2,0,1,0,2,2,0,2,1,...,9.187062,4.597771,2.563821,9.889058,0.125367,-0.000099,0.004065,0.004493,-0.004459,1.609574
4,2,2,2,0,2,0,0,0,0,2,...,5.601933,3.169086,4.073993,0.971149,0.228222,-0.018394,0.000236,-0.021743,0.006987,-0.674172
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251,1,1,0,1,2,1,0,1,0,2,...,3.913119,0.969482,5.489906,1.954625,0.239453,-0.005728,0.016870,-0.007003,0.007237,-1.032371
252,2,0,1,0,0,1,1,1,0,1,...,4.139625,6.434848,4.100220,4.945539,0.094505,0.018995,0.003443,-0.004700,-0.011192,0.217623
253,1,2,0,1,0,1,2,1,1,0,...,9.227410,6.817913,3.710675,9.640284,0.102900,-0.005802,-0.008215,-0.001354,0.007013,1.289399
254,0,2,0,1,0,2,2,0,1,1,...,9.110574,5.149833,3.243103,9.255105,0.079742,-0.013642,-0.014688,-0.003743,-0.008681,1.270026


In [18]:
target_cols = train3.columns.tolist()  + ['class']
check = train3.drop_duplicates(subset=target_cols)
check['class'].value_counts() / len(check) * 175

B    77.246094
C    54.003906
A    43.750000
Name: class, dtype: float64

In [19]:
check

Unnamed: 0,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,SNP_10,...,ae_1,ae_2,ae_3,ae_4,vae_loss,vae_0,vae_1,vae_2,vae_3,vae_4
0,2,1,0,1,1,0,0,2,0,2,...,3.297604,1.756508,6.255674,3.642646,0.123947,0.019073,-0.022390,0.067796,0.021449,-0.673882
1,1,1,1,0,0,1,0,1,0,1,...,3.667223,4.182801,3.555612,2.159922,0.106692,0.012919,-0.008308,0.025512,-0.012970,0.276029
2,2,2,0,1,2,2,0,1,1,1,...,5.224575,-0.127179,3.630103,-0.128106,0.244127,-0.016602,0.021827,-0.010464,0.018495,-1.012079
3,0,2,0,1,0,2,2,0,2,1,...,9.187062,4.597771,2.563821,9.889058,0.125367,-0.000099,0.004065,0.004493,-0.004459,1.609574
4,2,2,2,0,2,0,0,0,0,2,...,5.601933,3.169086,4.073993,0.971149,0.228222,-0.018394,0.000236,-0.021743,0.006987,-0.674172
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251,1,1,0,1,2,1,0,1,0,2,...,3.913119,0.969482,5.489906,1.954625,0.239453,-0.005728,0.016870,-0.007003,0.007237,-1.032371
252,2,0,1,0,0,1,1,1,0,1,...,4.139625,6.434848,4.100220,4.945539,0.094505,0.018995,0.003443,-0.004700,-0.011192,0.217623
253,1,2,0,1,0,1,2,1,1,0,...,9.227410,6.817913,3.710675,9.640284,0.102900,-0.005802,-0.008215,-0.001354,0.007013,1.289399
254,0,2,0,1,0,2,2,0,1,1,...,9.110574,5.149833,3.243103,9.255105,0.079742,-0.013642,-0.014688,-0.003743,-0.008681,1.270026


In [206]:
target_cols.remove('class')
test3.drop_duplicates(subset=target_cols)

Unnamed: 0,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,SNP_10,...,SNP_14_ratio,SNP_15_ratio,ae_loss,ae_0,ae_1,ae_2,vae_loss,vae_0,vae_1,vae_2
0,1,2,0,1,0,1,2,1,1,1,...,0.0,0.506329,0.166831,5.642057,-0.023668,3.152251,0.144563,-1.038103,0.013879,0.004878
1,2,1,2,2,2,0,0,0,0,2,...,0.655172,0.68932,0.216685,5.760644,-0.100615,1.679931,0.214011,1.359224,0.024534,3.9e-05
4,0,2,0,2,0,2,2,0,2,1,...,0.655172,0.272727,0.080097,7.72122,-0.093797,4.167721,0.096322,-1.497357,0.009801,0.009291
6,0,1,1,0,1,1,0,0,0,1,...,0.655172,0.506329,0.380005,2.020229,-0.046703,3.967607,0.137545,-0.479254,-0.003346,-0.002836
7,2,0,1,2,2,0,0,0,0,2,...,0.655172,0.68932,0.233976,3.30576,-0.087047,1.554819,0.236233,0.853061,0.00969,0.003606
11,2,2,0,2,2,1,1,1,0,2,...,0.655172,0.68932,0.172975,4.857063,-0.137661,1.077439,0.189684,1.536607,0.020073,0.007905
15,1,0,2,0,2,1,0,2,0,2,...,0.655172,0.68932,0.270085,1.916645,-0.137938,0.78688,0.200615,0.77253,-0.01012,0.02719
18,1,1,1,1,2,2,0,2,0,2,...,0.655172,0.506329,0.371069,2.073329,-0.139352,0.608346,0.16457,1.199935,0.010247,0.020097
134,0,0,1,1,2,0,0,2,0,2,...,0.655172,0.68932,0.577003,2.806244,-0.081024,0.228382,0.257584,0.547322,-0.005735,0.024441


In [183]:
# trait와 Label A의 상관계수가 1로, 다른 변수에 대한 영향력이 학습되지 않을 정도로 높으므로 배제해보고 진행
df_train2 = train3.copy()
df_test2 = test3.copy()

# G1 : A & notA / G2 : B & notB / G3 : C & notC => Regressor
X, y = df_train2.drop(columns=['class']), (df_train2['class'].values ) # original
X.iloc[:, :15] = X.iloc[:, :15].astype('category')
X_test = df_test2.copy()
X_test.iloc[:, :15] = X_test.iloc[:, :15].astype('category')

X1 = X.copy()
X2 = X.copy()
X3 = X.copy()

X_test1 = X_test.copy()
X_test2 = X_test.copy()
X_test3 = X_test.copy()

y1 = (y=='A').astype(int)
y2 = (y=='B').astype(int)
y3 = (y=='C').astype(int)

# strategy1 = {0:300, 1:300}
# smote1 = SMOTEN(sampling_strategy=strategy1, k_neighbors=50)
# strategy2 = {0:300, 1:300}
# smote2 = SMOTEN(sampling_strategy=strategy2, k_neighbors=50)
# strategy3 = {0:300, 1:600}
# smote3 = SMOTEN(sampling_strategy=strategy3, k_neighbors=50)

from imblearn.under_sampling import RandomUnderSampler

strategy1 = {0:10, 1:10}
strategy2 = {0:10, 1:10}
strategy3 = {0:10, 1:10}
smote1 = RandomUnderSampler(sampling_strategy=strategy1)
smote2 = RandomUnderSampler(sampling_strategy=strategy2)
smote3 = RandomUnderSampler(sampling_strategy=strategy3)

X11, y11 = smote1.fit_resample(X1, y1)
X22, y22 = smote2.fit_resample(X2, y2)
X33, y33 = smote3.fit_resample(X3, y3)

In [184]:
def catgbmc(inputX, inputY, validX, validY, params) :  
    var_categ = inputX.columns.tolist()[:15]
    model = CatBoostClassifier(
        cat_features=var_categ,
        **params,
        task_type='GPU',
        devices='0',
        )
  
    model.fit(
        inputX, inputY,
        eval_set=(inputX, inputY)
        );     

    pred = model.predict(validX)
    score = f1_score(validY, pred, average='macro')
    print(score)
    
    return model

In [185]:
def catgbmR(inputX, inputY, validX, validY, params) :  
    var_categ = inputX.columns.tolist()[:15]
    model = CatBoostRegressor(
        cat_features=var_categ,
        **params,
        task_type='GPU',
        devices='0',
        )
  
    model.fit(
        inputX, inputY,
        eval_set=(inputX, inputY)
        );     

    pred = model.predict(validX) > 0.5
    score = f1_score(validY, pred)
    print(score)
    
    return model

In [104]:
params = {'iterations':100,
          'learning_rate':0.1,
          # 'l2_leaf_reg' : 10,
          # 'grow_policy' : 'Lossguide',
          'auto_class_weights' : 'SqrtBalanced',
          'verbose':0,
          'random_seed':0}

model0 = catgbmc(X, y, X, y, params)

1.0


In [191]:
params = {'iterations':100,
          'learning_rate':0.3,
        #   'l2_leaf_reg' : 10,
        #   'grow_policy' : 'Depthwise',
          'verbose':0,
          'random_seed':0}

model1 = catgbmR(X11, y11, X1, y1, params)
model2 = catgbmR(X22, y22, X2, y2, params)
model3 = catgbmR(X33, y33, X3, y3, params)

0.9465648854961832
0.9083333333333333
0.7674418604651164


In [75]:
pd.DataFrame(data=model1.get_feature_importance(), index=model1.feature_names_, columns=['feature_importance']).sort_values('feature_importance', ascending=False)

Unnamed: 0,feature_importance
vae_0,95.688703
SNP_02,1.081806
ae_loss,0.716829
SNP_11,0.428488
SNP_12,0.378614
SNP_15,0.242201
SNP_13,0.185444
ae_1,0.149302
vae_loss,0.126855
SNP_01,0.124136


In [76]:
pd.DataFrame(data=model2.get_feature_importance(), index=model2.feature_names_, columns=['feature_importance']).sort_values('feature_importance', ascending=False)

Unnamed: 0,feature_importance
vae_0,91.081253
vae_2,2.858485
ae_1,1.889064
ae_2,0.601157
SNP_14,0.592063
SNP_12,0.447988
ae_loss,0.348154
SNP_09,0.216123
SNP_08,0.201903
SNP_02_ratio,0.176827


In [130]:
df = pd.read_csv("submit_high1.csv")
df['class'] = 'B'
df.to_csv("submit.csv", index=False)
df

Unnamed: 0,id,class
0,TEST_000,B
1,TEST_001,B
2,TEST_002,B
3,TEST_003,B
4,TEST_004,B
...,...,...
170,TEST_170,B
171,TEST_171,B
172,TEST_172,B
173,TEST_173,B


In [77]:
pd.DataFrame(data=model3.get_feature_importance(), index=model3.feature_names_, columns=['feature_importance']).sort_values('feature_importance', ascending=False)

Unnamed: 0,feature_importance
ae_0,42.157871
vae_0,25.615507
SNP_04_ratio,10.513975
SNP_04,2.833174
vae_2,2.478058
SNP_08,2.37765
vae_1,1.791499
SNP_03,1.308589
ae_loss,1.200529
ae_1,1.074583


In [107]:
pred1 = model1.predict(X_test1)
pred2 = model2.predict(X_test2)
pred3 = model3.predict(X_test3)

pred0 = model0.predict(X_test)

In [68]:
# pred3 = model1.predict_proba(df_test2.iloc[:, 1:])
# pred4 = model2.predict_proba(df_test2.iloc[:, 1:])

# pred5 = np.argmax((pred3+pred4) / 2, axis=1)
# pred5

In [108]:
df = pd.DataFrame()
df['pred_A'] = pd.Series(pred1.flatten())
df['pred_B'] = pd.Series(pred2.flatten())
df['pred_C'] = pd.Series(pred3.flatten())
df['high1'] = pd.read_csv('submit_high1.csv')['class']
df['high2'] = pd.read_csv('submit_high2.csv')['class']
df['pred_0'] = pd.Series(pred0.flatten())
df.head()

Unnamed: 0,pred_A,pred_B,pred_C,high1,high2,pred_0
0,0.961513,0.025238,0.100184,A,A,A
1,0.074034,1.030943,0.083862,B,B,B
2,0.100351,0.036408,1.016329,C,C,C
3,0.018103,0.727769,0.502003,C,C,B
4,0.965949,0.03681,0.050484,A,A,A


In [121]:
target = (model0.predict_proba(X_test) > 0.95).astype('int')

In [109]:
df.iloc[[3,5,12,119,126,162,168]]

Unnamed: 0,pred_A,pred_B,pred_C,high1,high2,pred_0
3,0.018103,0.727769,0.502003,C,C,B
5,0.027274,0.936984,0.469657,C,C,B
12,0.072923,0.690503,0.530483,B,C,B
119,0.090604,0.473318,0.634692,C,C,B
126,0.073298,0.747567,0.4278,B,C,C
162,0.012365,0.516311,0.557993,C,C,C
168,0.04651,0.953579,0.684606,B,B,B


In [122]:
np.sum(target)

86

In [123]:
target[[3,5,12,119,126,162,168]]

array([[0, 0, 0],
       [0, 1, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0]])

In [82]:
df['new'] = 'D'
df.loc[df.pred_A > 0.9, 'new'] = 'A'
df.loc[df.pred_B > 0.9, 'new'] = 'B'
df.loc[df.pred_C > 0.5, 'new'] = 'C'

# df.loc[((df.pred_B > 0.5) & (df.pred_C < 0.4)), 'new'] = 'B'
df.new.value_counts()

B    78
A    50
C    42
D     5
Name: new, dtype: int64

In [83]:
df[df.new == 'D']

Unnamed: 0,pred_A,pred_B,pred_C,high1,high2,new
14,0.07585,0.80412,0.459858,B,B,D
60,0.15587,0.033594,0.113205,A,A,D
97,0.102144,0.830827,0.350064,B,B,D
122,0.00867,0.896284,0.161875,B,B,D
126,0.073298,0.747567,0.4278,B,C,D


In [None]:
len(df[df.pred_A > 0.9]), len(df[df.pred_B > 0.8]), len(df[df.pred_C > 0.9])

In [None]:
df[((df.pred_C > 0.1) & (df.pred_B < 0.9)) & ((df.high1 != 'C') | (df.high2 != 'C'))]

In [None]:
df[df.pred1 != df.high1]

In [None]:
df.pred1.value_counts()

In [None]:
df.pred2.value_counts()

In [None]:
df.high1.value_counts()

In [None]:
df.high2.value_counts()

In [None]:
df_train['class'].value_counts() / len(df_train) * len(df_test)

In [251]:
# A : 45, B : 78, C: 42
b_len = 83
answer = np.zeros(175)
answer[45:45+b_len] = 1
answer[45+b_len:] = 2

submit = np.ones(175)

# submit = answer.copy()
# submit[-1:]=0
# submit[45:45+1]=0

f1_score(answer, submit, average='macro'), f1_score(answer, submit, average='micro')

(0.2144702842377261, 0.4742857142857143)

In [248]:
answer

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2.])

In [246]:
# A : 45, B : 78, C: 42
b_len = 61
answer = np.ones(130)
answer[b_len:] = 0
# answer[45+b_len:] = 2

submit = np.ones(130)
# submit[b_len:] = 0

f1_score(answer, submit) / 3

0.21291448516579403