# CONCEPT 

- 데이터를 증강하지 않고, A와 notA를 분류
- A 데이터를 제외하고, B&C 데이터만을 남겨두기
- B&C 데이터에 대한 Label Encoder 변수 추가
- B&C 데이터에 대한 AE 및 VAE Encoding 값 및 error 값 추가
- B&C 데이터에 대한 데이터 증강 유무에 따른 성능 확인

# STEP 01. 데이터를 증강하지 않고 A와 notA를 분류

In [34]:
import warnings
warnings.filterwarnings(action='ignore')

import os
import gc
import math
import random
import pickle
import pandas as pd
import numpy as np
import multiprocessing
from tqdm.auto import tqdm

from sklearn.metrics import f1_score

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset, sampler

from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from imblearn.over_sampling import SMOTE, SMOTENC, SMOTEN

In [35]:
device = torch.device('cuda:1') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda', index=1)

In [36]:
df_train = pd.read_csv('./data/train.csv')
df_test = pd.read_csv('./data/test.csv')              
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 262 entries, 0 to 261
Data columns (total 21 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      262 non-null    object
 1   father  262 non-null    int64 
 2   mother  262 non-null    int64 
 3   gender  262 non-null    int64 
 4   trait   262 non-null    int64 
 5   SNP_01  262 non-null    object
 6   SNP_02  262 non-null    object
 7   SNP_03  262 non-null    object
 8   SNP_04  262 non-null    object
 9   SNP_05  262 non-null    object
 10  SNP_06  262 non-null    object
 11  SNP_07  262 non-null    object
 12  SNP_08  262 non-null    object
 13  SNP_09  262 non-null    object
 14  SNP_10  262 non-null    object
 15  SNP_11  262 non-null    object
 16  SNP_12  262 non-null    object
 17  SNP_13  262 non-null    object
 18  SNP_14  262 non-null    object
 19  SNP_15  262 non-null    object
 20  class   262 non-null    object
dtypes: int64(4), object(17)
memory usage: 43.1+ KB


In [37]:
df_train['class'].value_counts()

B    114
C     79
A     69
Name: class, dtype: int64

In [38]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175 entries, 0 to 174
Data columns (total 20 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      175 non-null    object
 1   father  175 non-null    int64 
 2   mother  175 non-null    int64 
 3   gender  175 non-null    int64 
 4   trait   175 non-null    int64 
 5   SNP_01  175 non-null    object
 6   SNP_02  175 non-null    object
 7   SNP_03  175 non-null    object
 8   SNP_04  175 non-null    object
 9   SNP_05  175 non-null    object
 10  SNP_06  175 non-null    object
 11  SNP_07  175 non-null    object
 12  SNP_08  175 non-null    object
 13  SNP_09  175 non-null    object
 14  SNP_10  175 non-null    object
 15  SNP_11  175 non-null    object
 16  SNP_12  175 non-null    object
 17  SNP_13  175 non-null    object
 18  SNP_14  175 non-null    object
 19  SNP_15  175 non-null    object
dtypes: int64(4), object(16)
memory usage: 27.5+ KB


In [39]:
df_train.describe()

Unnamed: 0,father,mother,gender,trait
count,262.0,262.0,262.0,262.0
mean,0.0,0.0,0.0,1.736641
std,0.0,0.0,0.0,0.441298
min,0.0,0.0,0.0,1.0
25%,0.0,0.0,0.0,1.0
50%,0.0,0.0,0.0,2.0
75%,0.0,0.0,0.0,2.0
max,0.0,0.0,0.0,2.0


In [40]:
df_test.describe()

Unnamed: 0,father,mother,gender,trait
count,175.0,175.0,175.0,175.0
mean,0.0,0.0,0.0,1.708571
std,0.0,0.0,0.0,0.455724
min,0.0,0.0,0.0,1.0
25%,0.0,0.0,0.0,1.0
50%,0.0,0.0,0.0,2.0
75%,0.0,0.0,0.0,2.0
max,0.0,0.0,0.0,2.0


In [41]:
# trait와 Label A의 상관계수가 1로, 다른 변수에 대한 영향력이 학습되지 않을 정도로 높으므로 배제해보고 진행
df_train2 = df_train.drop(columns=['father', 'mother', 'gender','trait','SNP_06'])
df_test2 = df_test.drop(columns=['father', 'mother', 'gender', 'trait','SNP_06'])

smote = SMOTEN()
X, y = df_train2.iloc[:, 1:-1], (df_train2['class'].values == 'A').astype(int)
X2, y2 = smote.fit_resample(X, y)

In [42]:
def catgbmc(inputX, inputY, validX, validY, params) :  
    var_categ = inputX.columns.tolist()
    model = CatBoostClassifier(
        cat_features=var_categ,
        **params,
        task_type='GPU',
        devices='0',
        )
  
    model.fit(
        inputX, inputY,
        eval_set=(inputX, inputY)
        );     

    pred = model.predict(validX)
    score = f1_score(validY, pred, average='macro')
    print(score)
    
    return model

In [43]:
params = {'iterations':200,
          'learning_rate':0.3,
          'l2_leaf_reg' : 5,
          'grow_policy' : 'Depthwise',
          'verbose':0,
          'random_seed':0}

model1 = catgbmc(X2, y2, X, y, params)
model2 = catgbmc(X, y, X, y, params)

1.0
1.0


In [44]:
pd.DataFrame(data=model1.get_feature_importance(), index=model1.feature_names_, columns=['feature_importance']).sort_values('feature_importance', ascending=False)

Unnamed: 0,feature_importance
SNP_07,69.341983
SNP_12,9.248442
SNP_10,7.383244
SNP_08,3.365065
SNP_13,2.657255
SNP_02,1.636241
SNP_09,1.463275
SNP_01,1.435952
SNP_03,1.376407
SNP_14,1.021598


In [45]:
pd.DataFrame(data=model2.get_feature_importance(), index=model2.feature_names_, columns=['feature_importance']).sort_values('feature_importance', ascending=False)

Unnamed: 0,feature_importance
SNP_07,47.008902
SNP_12,13.108403
SNP_14,12.01219
SNP_01,6.475343
SNP_09,5.34829
SNP_11,4.324104
SNP_10,3.518602
SNP_08,3.277603
SNP_04,1.796355
SNP_05,0.826109


In [46]:
pred = model1.predict_proba(X)[:,1] > 0.99
score = f1_score(y, pred, average='macro')
print(score) 

0.9850328477577834


In [47]:
pred = model2.predict_proba(X)[:,1] > 0.98
score = f1_score(y, pred, average='macro')
print(score) 

1.0


- 본래의 성능을 해치지 않는 선에서 threshold를 최대로 높여보기

In [48]:
pred1 = model1.predict_proba(df_test2.iloc[:, 1:])[:,1] > 0.99
sum(pred1)

45

In [49]:
pred2 = model2.predict_proba(df_test2.iloc[:, 1:])[:,1] > 0.98
sum(pred2)

48

In [50]:
train2 = df_train.copy().reset_index(drop=True).drop(columns=['father', 'mother', 'gender','trait'])
test2 = df_test.copy().reset_index(drop=True).drop(columns=['father', 'mother', 'gender','trait'])
test2

Unnamed: 0,id,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,SNP_10,SNP_11,SNP_12,SNP_13,SNP_14,SNP_15
0,TEST_000,A G,G G,A A,G A,A A,A G,G G,G A,G A,A G,A G,G A,G G,C A,G A
1,TEST_001,G G,A G,C C,G G,C C,A A,A A,A A,A A,G G,A G,A A,A A,A A,A A
2,TEST_002,G G,A G,A A,A A,C A,A G,A A,A A,A A,A G,A A,G A,G G,A A,G G
3,TEST_003,G G,A G,C A,A A,C C,A A,A A,A A,A A,G G,A A,G A,A G,A A,A A
4,TEST_004,A A,G G,A A,G G,A A,G G,G G,A A,G G,A G,G G,G A,G G,A A,G G
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170,TEST_170,A G,G G,C C,A A,C A,A G,A A,G G,A A,G G,G G,A A,A A,A A,G A
171,TEST_171,G G,A A,A A,A A,C A,A G,A A,A A,A A,A G,A A,A A,A G,A A,G A
172,TEST_172,G G,A A,A A,A A,C A,A G,A A,A A,A A,G G,A G,A A,A G,A A,G G
173,TEST_173,A G,G G,C A,G A,C C,G G,A A,G A,A A,G G,A G,A A,A A,A A,A A


### Summay 01
- 이전의 과정들에서 수차례 증명되었듯, A와 notA를 분류하는 것은 아주 쉽다.  
 (기본적으로 trait 변수 하나만으로도 기계적인 분류가 가능하다)
- 때문에 우선적으로 A와 notA를 분류해놓고, B와 C만이 존재하는 데이터를 대상으로 파생변수 생성, 증강 등의 과정을 진행한다.

# STEP 02. 파생변수 추가

## 1) B와 C에 대한 Target Labeling

In [51]:
# categorical 변수 중 train에만 있거나, test에만 있는 칼럼 확인

train3 = pd.get_dummies(train2.iloc[:, 1:-1]).copy()
test3 = pd.get_dummies(test2.iloc[:, 1:]).copy()

target1 = train3.columns.tolist()
target2 = test3.columns.tolist()
[x for x in target1 if x not in target2], [y for y in target2 if y not in target1]

([], [])

In [52]:
# text 형태의 categorical 변수들을 숫자형태로 변경

for i in tqdm(range(1, 15+1)) :
    target = str(i) if i >= 10 else "0"+str(i)
    cols = sorted(train2[f"SNP_{target}"].unique().tolist())
    train2[f"SNP_{target}"] = train2[f"SNP_{target}"].map(lambda x : 0 if x==cols[0] else (1 if x==cols[1] else 2))
    test2[f"SNP_{target}"] = test2[f"SNP_{target}"].map(lambda x : 0 if x==cols[0] else (1 if x==cols[1] else 2))

train2.info(), test2.info()

  0%|          | 0/15 [00:00<?, ?it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 262 entries, 0 to 261
Data columns (total 17 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      262 non-null    object
 1   SNP_01  262 non-null    int64 
 2   SNP_02  262 non-null    int64 
 3   SNP_03  262 non-null    int64 
 4   SNP_04  262 non-null    int64 
 5   SNP_05  262 non-null    int64 
 6   SNP_06  262 non-null    int64 
 7   SNP_07  262 non-null    int64 
 8   SNP_08  262 non-null    int64 
 9   SNP_09  262 non-null    int64 
 10  SNP_10  262 non-null    int64 
 11  SNP_11  262 non-null    int64 
 12  SNP_12  262 non-null    int64 
 13  SNP_13  262 non-null    int64 
 14  SNP_14  262 non-null    int64 
 15  SNP_15  262 non-null    int64 
 16  class   262 non-null    object
dtypes: int64(15), object(2)
memory usage: 34.9+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175 entries, 0 to 174
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  -----

(None, None)

In [53]:
train2['class_B'] = train2['class'].map(lambda x : 1 if x=='B' else 0)
train2['class_C'] = train2['class'].map(lambda x : 1 if x=='C' else 0)

for i in range(1, 15+1) :
    target = str(i) if i >= 10 else "0"+str(i)
    target2 = train2.groupby(f'SNP_{target}')['class_B', 'class_C'].sum()
    target2['total'] = target2[['class_B', 'class_C']].sum(axis=1)
    for j in range(len(target2)) :
        if (target2['total'][j] < 3) : # 전체 표본이 너무 적은 경우 제외(확률 반반으로)
            target2['class_B'][j] = 1
            target2['class_C'][j] = 1
            
    value = target2['class_B'] / (target2['class_B']+target2['class_C'])    

    train2[f"SNP_{target}_ratio"] = train2[f"SNP_{target}"].map(lambda x : value[0] if x==0 else (value[1] if x==1 else value[2]))
    test2[f"SNP_{target}_ratio"] = test2[f"SNP_{target}"].map(lambda x : value[0] if x==0 else (value[1] if x==1 else value[2]))    

train2.drop(columns=['class_B', 'class_C'], inplace=True)
train2.info(), test2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 262 entries, 0 to 261
Data columns (total 32 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            262 non-null    object 
 1   SNP_01        262 non-null    int64  
 2   SNP_02        262 non-null    int64  
 3   SNP_03        262 non-null    int64  
 4   SNP_04        262 non-null    int64  
 5   SNP_05        262 non-null    int64  
 6   SNP_06        262 non-null    int64  
 7   SNP_07        262 non-null    int64  
 8   SNP_08        262 non-null    int64  
 9   SNP_09        262 non-null    int64  
 10  SNP_10        262 non-null    int64  
 11  SNP_11        262 non-null    int64  
 12  SNP_12        262 non-null    int64  
 13  SNP_13        262 non-null    int64  
 14  SNP_14        262 non-null    int64  
 15  SNP_15        262 non-null    int64  
 16  class         262 non-null    object 
 17  SNP_01_ratio  262 non-null    float64
 18  SNP_02_ratio  262 non-null    

(None, None)

## 2) AE & VAE 정의

In [54]:
class Autoencoder(nn.Module):
    def __init__(self, encoding_dim):
        super().__init__()
        self.encoding_dim = encoding_dim
        self.encoder = nn.Sequential(
            nn.Linear(30, 64),
            nn.GELU(),
            nn.Linear(64, 16),
            nn.GELU(),
            nn.Linear(16, encoding_dim),
            nn.GELU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, 16),
            nn.GELU(),
            nn.Linear(16, 64),
            nn.GELU(),
            nn.Linear(64, 30)
        )

    def forward(self, x):
        x1 = self.encoder(x)
        x2 = self.decoder(x1)
        return x1, x2

def ae_train(model, data_loader, criterion, optimizer, device, epochs=10):
    model.to(device)
    for epoch in range(epochs):
        epoch_loss = 0
        for x in data_loader:
            x = x[0].to(device)
            _, x_hat = model(x)
            loss = criterion(x_hat, x)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        print(f'Epoch {epoch}: loss = {epoch_loss / len(data_loader):.4f}')

In [55]:
X = torch.randn(320, train2.shape[1]-2)  # generate some example data

# Create a dataset and data loader
dataset = torch.utils.data.TensorDataset(X)
data_loader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True)

# Initialize the model, criterion, and optimizer
encoding_dim = 6
model = Autoencoder(encoding_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters())

# Train the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
ae_train(model, data_loader, criterion, optimizer, device)

Epoch 0: loss = 0.9728
Epoch 1: loss = 0.9670
Epoch 2: loss = 0.9630
Epoch 3: loss = 0.9605
Epoch 4: loss = 0.9563
Epoch 5: loss = 0.9486
Epoch 6: loss = 0.9361
Epoch 7: loss = 0.9264
Epoch 8: loss = 0.9172
Epoch 9: loss = 0.9067


In [56]:
class VAE(nn.Module):
    def __init__(self, input_dim, latent_dim):
        super(VAE, self).__init__()
        self.input_dim = input_dim
        self.latent_dim = latent_dim
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, input_dim*2),
            nn.ReLU(),
            nn.Linear(input_dim*2, input_dim),
            nn.ReLU(),
            nn.Linear(input_dim, input_dim//2),
            nn.ReLU()
            )
        self.fc_mu = nn.Linear(input_dim//2, latent_dim)
        self.fc_logvar = nn.Linear(input_dim//2, latent_dim)
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, input_dim//2),
            nn.ReLU(),
            nn.Linear(input_dim//2, input_dim),
            nn.ReLU(),
            nn.Linear(input_dim, input_dim*2),
            nn.ReLU(),
            nn.Linear(input_dim*2, input_dim)
        )

    def encode(self, x):
        h = self.encoder(x)
        return self.fc_mu(h), self.fc_logvar(h)

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5*logvar)
        eps = torch.randn_like(std)
        return mu + eps*std

    def decode(self, z):
        return self.decoder(z)

    def forward(self, x):
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar

def loss_fn(recon_x, x, mu, logvar):
    reconstruction_loss = F.mse_loss(recon_x, x, reduction='sum')
    kl_loss = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return reconstruction_loss + kl_loss

def train(model, optimizer, train_loader, device):
    model.train()
    train_loss = 0
    for data in train_loader:
        data = data.to(device)
        optimizer.zero_grad()
        recon_batch, mu, logvar = model(data)
        loss = loss_fn(recon_batch, data, mu, logvar)
        loss.backward()
        train_loss += loss.item()
        
        optimizer.step()
        
    return train_loss / len(train_loader.dataset)

def test(model, test_loader, device):
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for data in test_loader:
            data = data.to(device)
            recon_batch, mu, logvar = model(data)
            test_loss += loss_fn(recon_batch, data, mu, logvar).item()
    return test_loss / len(test_loader.dataset)

In [57]:
input_dim = train2.shape[1]-1
latent_dim = 8
batch_size = 32
num_epochs = 10

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_data = torch.randn(100, input_dim)
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size)
test_data = torch.randn(10, input_dim)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size)

model = VAE(input_dim, latent_dim).to(device)
optimizer = torch.optim.Adam(model.parameters())

for epoch in range(num_epochs):
    train_loss = train(model, optimizer, train_loader, device)
    test_loss = test(model, test_loader, device)
    print(f"Epoch {epoch}: Train loss = {train_loss}, Test loss = {test_loss}")

Epoch 0: Train loss = 29.776117401123045, Test loss = 30.58157958984375
Epoch 1: Train loss = 29.700023345947265, Test loss = 30.68614501953125
Epoch 2: Train loss = 29.608728713989258, Test loss = 30.722091674804688
Epoch 3: Train loss = 29.503772354125978, Test loss = 30.60218200683594
Epoch 4: Train loss = 29.52825012207031, Test loss = 30.718594360351563
Epoch 5: Train loss = 29.438090591430665, Test loss = 30.825531005859375
Epoch 6: Train loss = 29.361272735595705, Test loss = 30.85445556640625
Epoch 7: Train loss = 29.348167572021485, Test loss = 30.89232177734375
Epoch 8: Train loss = 29.389032897949217, Test loss = 31.08165588378906
Epoch 9: Train loss = 29.348308029174806, Test loss = 30.99210205078125


## 3) AE & VAE 학습 및 변수 추가

### AutoEncoder

In [58]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
X = torch.Tensor(train2.drop(columns=['id', 'class']).to_numpy())

# Create a dataset and data loader
dataset = torch.utils.data.TensorDataset(X)
data_loader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True)

# Initialize the model, criterion, and optimizer
encoding_dim = 6
model = Autoencoder(encoding_dim)
criterion = nn.MSELoss().to(device)
optimizer = optim.Adam(model.parameters())
epochs=1000

# Train the model
ae_train(model, data_loader, criterion, optimizer, device, epochs)

Epoch 0: loss = 0.8945
Epoch 1: loss = 0.8055
Epoch 2: loss = 0.7271
Epoch 3: loss = 0.6173
Epoch 4: loss = 0.4595
Epoch 5: loss = 0.3355
Epoch 6: loss = 0.2789
Epoch 7: loss = 0.2299
Epoch 8: loss = 0.1987
Epoch 9: loss = 0.1768
Epoch 10: loss = 0.1638
Epoch 11: loss = 0.1560
Epoch 12: loss = 0.1532
Epoch 13: loss = 0.1556
Epoch 14: loss = 0.1510
Epoch 15: loss = 0.1470
Epoch 16: loss = 0.1536
Epoch 17: loss = 0.1530
Epoch 18: loss = 0.1483
Epoch 19: loss = 0.1504
Epoch 20: loss = 0.1477
Epoch 21: loss = 0.1478
Epoch 22: loss = 0.1484
Epoch 23: loss = 0.1463
Epoch 24: loss = 0.1470
Epoch 25: loss = 0.1460
Epoch 26: loss = 0.1460
Epoch 27: loss = 0.1449
Epoch 28: loss = 0.1477
Epoch 29: loss = 0.1417
Epoch 30: loss = 0.1422
Epoch 31: loss = 0.1441
Epoch 32: loss = 0.1433
Epoch 33: loss = 0.1414
Epoch 34: loss = 0.1387
Epoch 35: loss = 0.1394
Epoch 36: loss = 0.1421
Epoch 37: loss = 0.1393
Epoch 38: loss = 0.1380
Epoch 39: loss = 0.1390
Epoch 40: loss = 0.1326
Epoch 41: loss = 0.1353
Ep

In [59]:
X1 = X.to(device)
X2 = torch.Tensor(test2.drop(columns=['id']).to_numpy()).to(device)

pred_train = model(X1)
pred_test = model(X2)

# reconstruction error
loss_train = []
loss_test = []

for i in range(len(X1)) :
    loss = criterion(pred_train[1][i], X1[i]).detach().cpu().numpy()
    loss_train.append(loss)
for j in range(len(X2)) :
    loss = criterion(pred_test[1][j], X2[j]).detach().cpu().numpy()
    loss_test.append(loss)

# encoding values
enco_train = pred_train[0].detach().cpu().numpy()
enco_test = pred_test[0].detach().cpu().numpy()

trainLoss = pd.DataFrame(data=loss_train, columns=['ae_loss'])
testLoss = pd.DataFrame(data=loss_test, columns=['ae_loss'])

ae_train = pd.DataFrame(data=enco_train, columns=['ae_'+str(x) for x in range(enco_train.shape[1])])
ae_test = pd.DataFrame(data=enco_test, columns=['ae_'+str(x) for x in range(enco_train.shape[1])])

train3 = pd.concat([train2, trainLoss, ae_train], axis=1)
test3 = pd.concat([test2, testLoss, ae_test], axis=1)

train3.info(), test3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 262 entries, 0 to 261
Data columns (total 39 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            262 non-null    object 
 1   SNP_01        262 non-null    int64  
 2   SNP_02        262 non-null    int64  
 3   SNP_03        262 non-null    int64  
 4   SNP_04        262 non-null    int64  
 5   SNP_05        262 non-null    int64  
 6   SNP_06        262 non-null    int64  
 7   SNP_07        262 non-null    int64  
 8   SNP_08        262 non-null    int64  
 9   SNP_09        262 non-null    int64  
 10  SNP_10        262 non-null    int64  
 11  SNP_11        262 non-null    int64  
 12  SNP_12        262 non-null    int64  
 13  SNP_13        262 non-null    int64  
 14  SNP_14        262 non-null    int64  
 15  SNP_15        262 non-null    int64  
 16  class         262 non-null    object 
 17  SNP_01_ratio  262 non-null    float64
 18  SNP_02_ratio  262 non-null    

(None, None)

### Variational AutoEncoder

In [60]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
X = torch.Tensor(train2.drop(columns=['id', 'class']).to_numpy())

input_dim = X.shape[1]
latent_dim = 6
batch_size = 32
num_epochs = 1000

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_data = X
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_data = X
test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, shuffle=False)

model = VAE(input_dim, latent_dim).to(device)
optimizer = torch.optim.Adam(model.parameters())

for epoch in range(num_epochs):
    train_loss = train(model, optimizer, train_loader, device)
    test_loss = test(model, test_loader, device)
    print(f"Epoch {epoch}: Train loss = {train_loss}, Test loss = {test_loss}")

Epoch 0: Train loss = 27.79488768832374, Test loss = 26.269544004483986
Epoch 1: Train loss = 25.17690148972373, Test loss = 23.268580662385197
Epoch 2: Train loss = 21.706235055705065, Test loss = 19.172696674143086
Epoch 3: Train loss = 17.121665459552794, Test loss = 14.10542102260444
Epoch 4: Train loss = 13.11016615656496, Test loss = 12.198922703284344
Epoch 5: Train loss = 11.41193587907398, Test loss = 10.686762147277365
Epoch 6: Train loss = 10.270906841481915, Test loss = 10.071188599098729
Epoch 7: Train loss = 10.094798168153254, Test loss = 9.759428694048001
Epoch 8: Train loss = 9.873311865420742, Test loss = 9.91933661016799
Epoch 9: Train loss = 9.79064900638493, Test loss = 9.798935518919967
Epoch 10: Train loss = 9.714552435256143, Test loss = 9.591578665580458
Epoch 11: Train loss = 9.684063190722284, Test loss = 9.630793447712906
Epoch 12: Train loss = 9.590430616422465, Test loss = 9.620445207785105
Epoch 13: Train loss = 9.63061022576485, Test loss = 9.47651972297

In [61]:
X1 = X.to(device)
X2 = torch.Tensor(test2.drop(columns=['id']).to_numpy()).to(device)

pred_train = model(X1)
pred_test = model(X2)

# reconstruction error
loss_train = []
loss_test = []

for i in range(len(X1)) :
    loss = criterion(pred_train[0][i], X1[i]).detach().cpu().numpy()
    loss_train.append(loss)
for j in range(len(X2)) :
    loss = criterion(pred_test[0][j], X2[j]).detach().cpu().numpy()
    loss_test.append(loss)

# encoding values
enco_train = pred_train[1].detach().cpu().numpy()
enco_test = pred_test[1].detach().cpu().numpy()

trainLoss = pd.DataFrame(data=loss_train, columns=['vae_loss'])
testLoss = pd.DataFrame(data=loss_test, columns=['vae_loss'])

ae_train = pd.DataFrame(data=enco_train, columns=['vae_'+str(x) for x in range(enco_train.shape[1])])
ae_test = pd.DataFrame(data=enco_test, columns=['vae_'+str(x) for x in range(enco_train.shape[1])])

train4 = pd.concat([train3, trainLoss, ae_train], axis=1)
test4 = pd.concat([test3, testLoss, ae_test], axis=1)

train4.info(), test4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 262 entries, 0 to 261
Data columns (total 46 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            262 non-null    object 
 1   SNP_01        262 non-null    int64  
 2   SNP_02        262 non-null    int64  
 3   SNP_03        262 non-null    int64  
 4   SNP_04        262 non-null    int64  
 5   SNP_05        262 non-null    int64  
 6   SNP_06        262 non-null    int64  
 7   SNP_07        262 non-null    int64  
 8   SNP_08        262 non-null    int64  
 9   SNP_09        262 non-null    int64  
 10  SNP_10        262 non-null    int64  
 11  SNP_11        262 non-null    int64  
 12  SNP_12        262 non-null    int64  
 13  SNP_13        262 non-null    int64  
 14  SNP_14        262 non-null    int64  
 15  SNP_15        262 non-null    int64  
 16  class         262 non-null    object 
 17  SNP_01_ratio  262 non-null    float64
 18  SNP_02_ratio  262 non-null    

(None, None)

### Summary 02
- Target Labeling을 통해 15개의 파생변수를 생성
- AE와 VAE를 통해 encoding한 값 8개와 reconstruction error 1개를 각기 추가해, 18개의 파생변수를 생성
- 총 33개의 파생변수가 추가되어 49개의 변수를 이용해 B와 C를 구분

In [62]:
train4.to_csv("./data/train44.csv", index=False)
test4.to_csv("./data/test44.csv", index=False)

## Categorical Features
- 전체 변수들을 CATEGORY 타입으로 변환

In [63]:
train = pd.read_csv("./data/train44.csv")
test = pd.read_csv("./data/test44.csv")
train.iloc[:, 1:16] = train.iloc[:, 1:16].astype('category')
test.iloc[:, 1:16] = test.iloc[:, 1:16].astype('category')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 262 entries, 0 to 261
Data columns (total 46 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   id            262 non-null    object  
 1   SNP_01        262 non-null    category
 2   SNP_02        262 non-null    category
 3   SNP_03        262 non-null    category
 4   SNP_04        262 non-null    category
 5   SNP_05        262 non-null    category
 6   SNP_06        262 non-null    category
 7   SNP_07        262 non-null    category
 8   SNP_08        262 non-null    category
 9   SNP_09        262 non-null    category
 10  SNP_10        262 non-null    category
 11  SNP_11        262 non-null    category
 12  SNP_12        262 non-null    category
 13  SNP_13        262 non-null    category
 14  SNP_14        262 non-null    category
 15  SNP_15        262 non-null    category
 16  class         262 non-null    object  
 17  SNP_01_ratio  262 non-null    float64 
 18  SNP_02_rat

In [64]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175 entries, 0 to 174
Data columns (total 45 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   id            175 non-null    object  
 1   SNP_01        175 non-null    category
 2   SNP_02        175 non-null    category
 3   SNP_03        175 non-null    category
 4   SNP_04        175 non-null    category
 5   SNP_05        175 non-null    category
 6   SNP_06        175 non-null    category
 7   SNP_07        175 non-null    category
 8   SNP_08        175 non-null    category
 9   SNP_09        175 non-null    category
 10  SNP_10        175 non-null    category
 11  SNP_11        175 non-null    category
 12  SNP_12        175 non-null    category
 13  SNP_13        175 non-null    category
 14  SNP_14        175 non-null    category
 15  SNP_15        175 non-null    category
 16  SNP_01_ratio  175 non-null    float64 
 17  SNP_02_ratio  175 non-null    float64 
 18  SNP_03_rat

# STEP 02. MODELING & VALIDATION
- 데이터 증강여부에 따른 성능향상 유무를 확인
- Classifier와 Regressor를 동시에 사용해 자체적인 ensemble 효과 추가 고려

## w/o & w/ Aug

In [65]:
import warnings
warnings.filterwarnings(action='ignore')

import os
import gc
import math
import random
import pickle
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold

from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from imblearn.over_sampling import SMOTE, SMOTENC, SMOTEN

In [66]:
X, y = train.drop(columns=['id', 'class']), (train['class'].values == 'B').astype(int)

strategy = {0:1200, 1:1000}
smote = SMOTENC(categorical_features=[x for x in range(16)], sampling_strategy=strategy)
X2, y2 = smote.fit_resample(X, y)
X_test = test.drop(columns=['id'])
X2

Unnamed: 0,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,SNP_10,...,ae_3,ae_4,ae_5,vae_loss,vae_0,vae_1,vae_2,vae_3,vae_4,vae_5
0,2,1,0,1,1,0,0,2,0,2,...,2.431611,-0.106581,2.498325,0.139278,-0.007006,0.013411,-0.008253,-0.025471,0.830296,0.007900
1,1,1,1,0,0,1,0,1,0,1,...,4.618769,4.234283,2.760611,0.104779,-0.016590,0.007498,-0.015945,0.013792,-0.162930,-0.005958
2,2,2,0,1,2,2,0,1,1,1,...,-0.169834,1.822250,2.224755,0.238899,-0.017366,0.000105,0.019572,-0.052061,1.125260,0.032062
3,0,2,0,1,0,2,2,0,2,1,...,1.564791,5.829734,9.283328,0.111061,0.002522,0.000835,0.015297,-0.006023,-1.473958,-0.008611
4,2,2,2,0,2,0,0,0,0,2,...,4.100493,4.037419,4.762250,0.247876,-0.014172,-0.010907,0.018917,-0.032634,0.956545,0.011679
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2195,2,1,2,0,2,1,0,2,0,2,...,6.193446,4.430719,3.516715,0.076388,-0.012697,-0.000690,-0.014861,-0.023553,1.272223,-0.003556
2196,2,1,1,0,1,1,0,2,0,2,...,6.031495,1.379756,3.059469,0.118252,-0.008956,0.001718,0.000456,-0.026663,0.924957,0.006810
2197,2,2,1,0,2,1,0,1,0,2,...,5.409315,2.348264,4.920689,0.128451,-0.013745,-0.011743,0.008431,-0.033586,1.134699,0.012184
2198,2,2,1,0,2,1,0,1,0,2,...,5.223872,4.361554,4.529073,0.103785,-0.011226,-0.008477,0.012838,-0.029268,0.930458,0.011763


In [67]:
y2

array([1, 0, 1, ..., 1, 1, 1])

In [68]:
X_test

Unnamed: 0,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,SNP_10,...,ae_3,ae_4,ae_5,vae_loss,vae_0,vae_1,vae_2,vae_3,vae_4,vae_5
0,1,2,0,1,0,1,2,1,1,1,...,2.653941,5.249811,6.814631,0.160050,-0.011247,0.022909,-0.009519,0.021676,-0.951097,-0.003032
1,2,1,2,2,2,0,0,0,0,2,...,2.812864,3.352915,4.802450,0.245462,0.001066,-0.015345,0.025305,-0.059915,1.145095,0.025660
2,2,1,0,0,1,1,0,0,0,1,...,2.858739,4.201324,2.976211,0.154172,-0.017299,-0.004565,0.003285,0.018605,-0.389941,-0.014876
3,2,1,1,0,2,0,0,0,0,2,...,4.027157,1.268331,4.658266,0.181836,-0.014104,-0.005064,0.020519,-0.026758,0.636461,0.020019
4,0,2,0,2,0,2,2,0,2,1,...,0.441846,6.453301,8.018075,0.097868,0.003082,-0.006587,0.021166,0.000250,-1.647759,-0.007184
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170,1,2,2,0,1,1,0,2,0,2,...,7.672681,4.967480,2.829824,0.315421,-0.014762,0.010382,-0.032822,-0.013026,1.286132,-0.013451
171,2,0,0,0,1,1,0,0,0,1,...,0.740753,2.645060,0.636700,0.130903,-0.014275,-0.013699,0.015779,-0.020205,-0.338218,-0.020329
172,2,0,0,0,1,1,0,0,0,2,...,3.201596,4.524136,1.336088,0.179338,-0.022477,-0.006342,0.006630,0.006097,-0.127112,-0.004815
173,1,2,1,1,2,2,0,1,0,2,...,2.244185,3.786952,3.082878,0.136226,-0.006691,-0.014442,0.006349,-0.062342,1.541715,0.015360


In [69]:
def catgbmc(inputX, inputY, validX, validY, params) :  
    var_categ = inputX.columns.tolist()[:15]
    model = CatBoostClassifier(
        cat_features=var_categ,
        **params,
        task_type='GPU',
        devices='0',
        # l2_leaf_reg=10
        # random_state=random_seed
        )
  
    model.fit(
        inputX, inputY,
        eval_set=(inputX, inputY),
        );     

    pred = model.predict(validX)
    score = f1_score(validY, pred, average='macro')
    print(score)
    
    return model

In [70]:
def catgbmr(inputX, inputY, validX, validY, params) :  
    var_categ = inputX.columns.tolist()[:15]
    model = CatBoostRegressor(
        cat_features=var_categ,
        **params,
        task_type='GPU',
        devices='0',
        # l2_leaf_reg=10
        # random_state=random_seed
        )
  
    model.fit(
        inputX, inputY,
        eval_set=(inputX, inputY),
        );     

    pred = model.predict(validX)
    score = f1_score(validY, np.round(pred), average='macro')
    print(score)
    
    return model

In [71]:
params = {'iterations':300,
          'learning_rate':0.03,
          # 'l2_leaf_reg' : 10,
        #   'auto_class_weights' : 'Balanced',
          # 'grow_policy' : 'Depthwise',
          'verbose':0}

model3 = catgbmc(X2, y2, X, y, params)
model4 = catgbmc(X, y, X, y,params)

1.0
1.0


In [72]:
pd.DataFrame(data=model3.get_feature_importance(), index=model3.feature_names_, columns=['feature_importance']).sort_values('feature_importance', ascending=False)

Unnamed: 0,feature_importance
vae_4,24.206769
ae_2,10.992504
SNP_07_ratio,7.925403
vae_0,7.703719
SNP_08_ratio,3.477555
ae_0,3.403196
ae_1,3.066846
SNP_04_ratio,2.998596
SNP_08,2.846366
SNP_15_ratio,2.810441


In [73]:
pd.DataFrame(data=model4.get_feature_importance(), index=model4.feature_names_, columns=['feature_importance']).sort_values('feature_importance', ascending=False)

Unnamed: 0,feature_importance
vae_4,32.483924
ae_2,11.756721
vae_0,6.745395
SNP_08_ratio,3.987327
SNP_07_ratio,3.242883
ae_0,3.022093
vae_3,2.856083
SNP_04_ratio,2.762283
ae_1,2.647861
SNP_15_ratio,2.204487


In [74]:
total = pd.DataFrame()

test00 = pd.read_csv("./data/test.csv")
X_test1 = test00.drop(columns=['id', 'father', 'mother', 'gender', 'trait','SNP_06'])
X_test1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175 entries, 0 to 174
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   SNP_01  175 non-null    object
 1   SNP_02  175 non-null    object
 2   SNP_03  175 non-null    object
 3   SNP_04  175 non-null    object
 4   SNP_05  175 non-null    object
 5   SNP_07  175 non-null    object
 6   SNP_08  175 non-null    object
 7   SNP_09  175 non-null    object
 8   SNP_10  175 non-null    object
 9   SNP_11  175 non-null    object
 10  SNP_12  175 non-null    object
 11  SNP_13  175 non-null    object
 12  SNP_14  175 non-null    object
 13  SNP_15  175 non-null    object
dtypes: object(14)
memory usage: 19.3+ KB


In [75]:
total['pred1'] = (model1.predict_proba(X_test1)[:,1] > 0.99).astype(int)
pred2 = model3.predict_proba(X_test)[:,1]
pred3 = model4.predict_proba(X_test)[:,1]
total['pred2'] = 0
total['pred3'] = 0

total['pred2'] = pred2
total['pred3'] = pred3
total

Unnamed: 0,pred1,pred2,pred3
0,1,0.000486,0.002643
1,0,0.998101,0.995679
2,0,0.001163,0.008184
3,0,0.949294,0.963217
4,1,0.000224,0.000858
...,...,...,...
170,0,0.996510,0.992983
171,0,0.001527,0.005082
172,0,0.001674,0.007258
173,0,0.999769,0.998082


In [76]:
c = 0.001
total[(total.pred1==1) & ((total.pred2 >= c) | (total.pred3 >= c))]

Unnamed: 0,pred1,pred2,pred3
0,1,0.000486,0.002643
8,1,0.000546,0.0027
23,1,0.000109,0.001
27,1,0.000137,0.00121
30,1,0.000454,0.001882
31,1,0.000305,0.001888
50,1,0.000154,0.001007
57,1,0.000258,0.001203
68,1,0.000225,0.001205
72,1,0.000276,0.001354


In [77]:
X, y = train.drop(columns=['id', 'class']), (train['class'].values == 'C').astype(int)

strategy = {0:1200, 1:1000}
smote = SMOTENC(categorical_features=[x for x in range(16)], sampling_strategy=strategy)
X2, y2 = smote.fit_resample(X, y)
X_test = test.drop(columns=['id'])
X2

Unnamed: 0,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,SNP_10,...,ae_3,ae_4,ae_5,vae_loss,vae_0,vae_1,vae_2,vae_3,vae_4,vae_5
0,2,1,0,1,1,0,0,2,0,2,...,2.431611,-0.106581,2.498325,0.139278,-0.007006,0.013411,-0.008253,-0.025471,0.830296,0.007900
1,1,1,1,0,0,1,0,1,0,1,...,4.618769,4.234283,2.760611,0.104779,-0.016590,0.007498,-0.015945,0.013792,-0.162930,-0.005958
2,2,2,0,1,2,2,0,1,1,1,...,-0.169834,1.822250,2.224755,0.238899,-0.017366,0.000105,0.019572,-0.052061,1.125260,0.032062
3,0,2,0,1,0,2,2,0,2,1,...,1.564791,5.829734,9.283328,0.111061,0.002522,0.000835,0.015297,-0.006023,-1.473958,-0.008611
4,2,2,2,0,2,0,0,0,0,2,...,4.100493,4.037419,4.762250,0.247876,-0.014172,-0.010907,0.018917,-0.032634,0.956545,0.011679
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2195,1,0,1,0,0,1,1,1,0,1,...,4.297341,3.944401,2.267574,0.108212,-0.015770,0.018952,-0.031275,0.019907,-0.101995,-0.008247
2196,1,1,2,0,0,1,0,1,0,2,...,1.971491,6.581552,3.127164,0.211497,-0.013969,0.003499,-0.009417,-0.006060,0.045091,0.004553
2197,2,0,1,0,0,1,1,1,0,1,...,5.319653,4.841182,0.882598,0.091431,-0.016162,0.017073,-0.033028,0.012988,0.002787,-0.011979
2198,1,0,1,0,1,1,0,0,0,0,...,3.238293,1.826958,1.154573,0.190592,-0.007301,-0.014165,0.019102,-0.035576,-0.384451,-0.018629


In [78]:
params = {'iterations':300,
          'learning_rate':0.03,
          # 'l2_leaf_reg' : 10,
        #   'auto_class_weights' : 'Balanced',
          # 'grow_policy' : 'Depthwise',
          'verbose':0}

model5 = catgbmc(X2, y2, X, y, params)
model6 = catgbmc(X, y, X, y,params)

1.0
1.0


In [79]:
pd.DataFrame(data=model5.get_feature_importance(), index=model5.feature_names_, columns=['feature_importance']).sort_values('feature_importance', ascending=False)

Unnamed: 0,feature_importance
vae_4,23.568329
ae_5,10.884362
vae_0,9.516976
ae_2,5.899879
SNP_07_ratio,5.886284
SNP_04_ratio,5.111822
SNP_04,4.334857
SNP_02_ratio,3.011637
ae_1,2.994843
SNP_15_ratio,1.985191


In [80]:
pd.DataFrame(data=model6.get_feature_importance(), index=model6.feature_names_, columns=['feature_importance']).sort_values('feature_importance', ascending=False)

Unnamed: 0,feature_importance
vae_4,23.07705
vae_0,9.69746
ae_2,7.0744
SNP_04_ratio,6.798037
ae_5,5.083924
SNP_02_ratio,4.392988
vae_5,4.198222
SNP_07_ratio,3.890975
ae_1,2.944723
SNP_08_ratio,2.903652


In [81]:
X_test = test.drop(columns=['id'])

pred5 = model5.predict_proba(X_test)[:,1]
pred6 = model6.predict_proba(X_test)[:,1]

total['pred4'] = pred5
total['pred5'] = pred6
total

Unnamed: 0,pred1,pred2,pred3,pred4,pred5
0,1,0.000486,0.002643,0.002076,0.005869
1,0,0.998101,0.995679,0.000536,0.002186
2,0,0.001163,0.008184,0.999399,0.989565
3,0,0.949294,0.963217,0.031323,0.051287
4,1,0.000224,0.000858,0.000480,0.001431
...,...,...,...,...,...
170,0,0.996510,0.992983,0.006701,0.019758
171,0,0.001527,0.005082,0.998842,0.996032
172,0,0.001674,0.007258,0.999230,0.993411
173,0,0.999769,0.998082,0.000285,0.001075


In [82]:
c = 0.01
total[(total.pred1==1) & ((total.pred4 >= c) | (total.pred5 >= c))]

Unnamed: 0,pred1,pred2,pred3,pred4,pred5
8,1,0.000546,0.0027,0.096738,0.038694
30,1,0.000454,0.001882,0.028207,0.004331


In [87]:
total[((total.pred2 > 0.9) | (total.pred3 > 0.9)) & ((total.pred4 > 0.9) | (total.pred5 > 0.9))]

Unnamed: 0,pred1,pred2,pred3,pred4,pred5,answer1,answer2,answer3


In [88]:
a_index = total[total.pred1 == 1].index.tolist()
b_index = total[(total.pred2 > 0.9) & (total.pred3 > 0.9)].index.tolist()
c_index = total[(total.pred4 > 0.9) & (total.pred5 > 0.9)].index.tolist()
d_index = [x for x in range(175) if (x not in a_index) & (x not in b_index) & (x not in c_index)]
d_index

[12, 28, 35, 52, 60, 65, 97, 110, 117, 119, 139, 140, 162]

In [89]:
total.iloc[d_index]

Unnamed: 0,pred1,pred2,pred3,pred4,pred5,answer1,answer2,answer3
12,0,0.655926,0.582591,0.316187,0.213491,D,D,D
28,0,0.000833,0.002905,0.007768,0.016967,D,D,D
35,0,0.867736,0.919653,0.024591,0.059552,D,D,D
52,0,0.000633,0.002736,0.003273,0.006416,D,D,D
60,0,0.003651,0.003169,0.653559,0.492133,D,D,D
65,0,0.000349,0.001374,0.002719,0.015073,D,D,D
97,0,0.666616,0.594607,0.502797,0.267322,D,D,D
110,0,0.001022,0.005714,0.942461,0.890608,D,D,D
117,0,0.099584,0.410347,0.927017,0.444166,D,D,D
119,0,0.880787,0.857759,0.474684,0.175483,D,D,D


In [84]:
total[(total.pred2 >= 0.7) & (total.pred3 < 0.7)]

Unnamed: 0,pred1,pred2,pred3,pred4,pred5


In [85]:
total[(total.pred2 < 0.7) & (total.pred3 >= 0.7)]

Unnamed: 0,pred1,pred2,pred3,pred4,pred5


In [86]:
total['answer1'] = total['pred1'].map(lambda x : 'A' if x==1 else 'D')
total['answer2'] = total['pred1'].map(lambda x : 'A' if x==1 else 'D')
total['answer3'] = total['pred1'].map(lambda x : 'A' if x==1 else 'D')

total.loc[total.answer1=='D', 'answer1'] = (pred2 > 0.9).astype(int)
total.loc[total.answer2=='D', 'answer2'] = (pred3 > 0.9).astype(int)
total.loc[total.answer3=='D', 'answer3'] = (np.mean([pred2, pred3], axis=0) > 0.9).astype(int)

total['answer1'] = total['answer1'].map(lambda x : 'A' if x=='A' else ('B' if x==1 else ('C' if x==0 else 'D')))
total['answer2'] = total['answer2'].map(lambda x : 'A' if x=='A' else ('B' if x==1 else ('C' if x==0 else 'D')))
total['answer3'] = total['answer3'].map(lambda x : 'A' if x=='A' else ('B' if x==1 else ('C' if x==0 else 'D')))

total.answer1.value_counts()

ValueError: Must have equal len keys and value when setting with an iterable

In [None]:
total.answer2.value_counts()

In [None]:
total.answer3.value_counts()

In [None]:
total.iloc[[3,5,12,119,126,162,168]]

In [None]:
submit = pd.read_csv("./data/sample_submission.csv")
submit['class'] = total['answer3']
submit['class'].value_counts()
submit.to_csv("./submit.csv", index=False)