# CONCEPT 

- 기초 데이터부터 각 단계별 성능을 구체적으로 확인
- 제공받은 데이터만 썼을 경우, 파생변수 추가, AE 변수 추가, 데이터 증강의 효과 중 AE 변수를 추가했을 경우

# STEP 01. EDA

In [1]:
import warnings
warnings.filterwarnings(action='ignore')

import os
import gc
import math
import random
import pickle
import pandas as pd
import numpy as np
import multiprocessing
from tqdm.auto import tqdm

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler

from transformers import get_cosine_schedule_with_warmup

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset, sampler

from imblearn.over_sampling import SMOTE

In [2]:
device = torch.device('cuda:1') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda', index=1)

In [3]:
df_train = pd.read_csv('./data/train.csv')
df_test = pd.read_csv('./data/test.csv')              
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 262 entries, 0 to 261
Data columns (total 21 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      262 non-null    object
 1   father  262 non-null    int64 
 2   mother  262 non-null    int64 
 3   gender  262 non-null    int64 
 4   trait   262 non-null    int64 
 5   SNP_01  262 non-null    object
 6   SNP_02  262 non-null    object
 7   SNP_03  262 non-null    object
 8   SNP_04  262 non-null    object
 9   SNP_05  262 non-null    object
 10  SNP_06  262 non-null    object
 11  SNP_07  262 non-null    object
 12  SNP_08  262 non-null    object
 13  SNP_09  262 non-null    object
 14  SNP_10  262 non-null    object
 15  SNP_11  262 non-null    object
 16  SNP_12  262 non-null    object
 17  SNP_13  262 non-null    object
 18  SNP_14  262 non-null    object
 19  SNP_15  262 non-null    object
 20  class   262 non-null    object
dtypes: int64(4), object(17)
memory usage: 43.1+ KB


In [4]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175 entries, 0 to 174
Data columns (total 20 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      175 non-null    object
 1   father  175 non-null    int64 
 2   mother  175 non-null    int64 
 3   gender  175 non-null    int64 
 4   trait   175 non-null    int64 
 5   SNP_01  175 non-null    object
 6   SNP_02  175 non-null    object
 7   SNP_03  175 non-null    object
 8   SNP_04  175 non-null    object
 9   SNP_05  175 non-null    object
 10  SNP_06  175 non-null    object
 11  SNP_07  175 non-null    object
 12  SNP_08  175 non-null    object
 13  SNP_09  175 non-null    object
 14  SNP_10  175 non-null    object
 15  SNP_11  175 non-null    object
 16  SNP_12  175 non-null    object
 17  SNP_13  175 non-null    object
 18  SNP_14  175 non-null    object
 19  SNP_15  175 non-null    object
dtypes: int64(4), object(16)
memory usage: 27.5+ KB


In [5]:
df_train.describe()

Unnamed: 0,father,mother,gender,trait
count,262.0,262.0,262.0,262.0
mean,0.0,0.0,0.0,1.736641
std,0.0,0.0,0.0,0.441298
min,0.0,0.0,0.0,1.0
25%,0.0,0.0,0.0,1.0
50%,0.0,0.0,0.0,2.0
75%,0.0,0.0,0.0,2.0
max,0.0,0.0,0.0,2.0


In [6]:
df_test.describe()

Unnamed: 0,father,mother,gender,trait
count,175.0,175.0,175.0,175.0
mean,0.0,0.0,0.0,1.708571
std,0.0,0.0,0.0,0.455724
min,0.0,0.0,0.0,1.0
25%,0.0,0.0,0.0,1.0
50%,0.0,0.0,0.0,2.0
75%,0.0,0.0,0.0,2.0
max,0.0,0.0,0.0,2.0


In [7]:
df_train.drop(columns=['father', 'mother', 'gender'], inplace=True)
df_test.drop(columns=['father', 'mother', 'gender'], inplace=True)

### Summary 01
- father, mother, gender column은 무의미하므로, 삭제
- trait은 1 혹은 2밖에 존재하지 않음

## AutoEncoder 변수 추가

### Define Variational-AutoEncoder & Test

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class VAE(nn.Module):
    def __init__(self, input_dim, latent_dim):
        super(VAE, self).__init__()
        self.input_dim = input_dim
        self.latent_dim = latent_dim
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, input_dim*2),
            nn.ReLU(),
            nn.Linear(input_dim*2, input_dim),
            nn.ReLU(),
            nn.Linear(input_dim, input_dim//2),
            nn.ReLU()
            )
        self.fc_mu = nn.Linear(input_dim//2, latent_dim)
        self.fc_logvar = nn.Linear(input_dim//2, latent_dim)
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, input_dim//2),
            nn.ReLU(),
            nn.Linear(input_dim//2, input_dim),
            nn.ReLU(),
            nn.Linear(input_dim, input_dim*2),
            nn.ReLU(),
            nn.Linear(input_dim*2, input_dim)
        )

    def encode(self, x):
        h = self.encoder(x)
        return self.fc_mu(h), self.fc_logvar(h)

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5*logvar)
        eps = torch.randn_like(std)
        return mu + eps*std

    def decode(self, z):
        return self.decoder(z)

    def forward(self, x):
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar

def loss_fn(recon_x, x, mu, logvar):
    reconstruction_loss = F.mse_loss(recon_x, x, reduction='sum')
    kl_loss = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return reconstruction_loss + kl_loss

def train(model, optimizer, train_loader, device):
    model.train()
    train_loss = 0
    for data in train_loader:
        data = data.to(device)
        optimizer.zero_grad()
        recon_batch, mu, logvar = model(data)
        loss = loss_fn(recon_batch, data, mu, logvar)
        loss.backward()
        train_loss += loss.item()
        
        optimizer.step()
        
    return train_loss / len(train_loader.dataset)

def test(model, test_loader, device):
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for data in test_loader:
            data = data.to(device)
            recon_batch, mu, logvar = model(data)
            test_loss += loss_fn(recon_batch, data, mu, logvar).item()
    return test_loss / len(test_loader.dataset)

In [9]:
input_dim = 46
latent_dim = 4
batch_size = 128
num_epochs = 100

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_data = torch.randn(100, input_dim)
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size)
test_data = torch.randn(10, input_dim)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size)

model = VAE(input_dim, latent_dim).to(device)
optimizer = torch.optim.Adam(model.parameters())

for epoch in range(num_epochs):
    train_loss = train(model, optimizer, train_loader, device)
    test_loss = test(model, test_loader, device)
    print(f"Epoch {epoch}: Train loss = {train_loss}, Test loss = {test_loss}")

Epoch 0: Train loss = 46.691142578125, Test loss = 49.779885864257814
Epoch 1: Train loss = 46.539208984375, Test loss = 49.52662353515625
Epoch 2: Train loss = 46.4253173828125, Test loss = 49.635250854492185
Epoch 3: Train loss = 46.3461376953125, Test loss = 49.67650146484375
Epoch 4: Train loss = 46.2175732421875, Test loss = 49.58491516113281
Epoch 5: Train loss = 46.1952490234375, Test loss = 49.4861328125
Epoch 6: Train loss = 46.04615234375, Test loss = 49.77446594238281
Epoch 7: Train loss = 46.026533203125, Test loss = 49.776361083984376
Epoch 8: Train loss = 45.8463525390625, Test loss = 49.85007019042969
Epoch 9: Train loss = 45.9129736328125, Test loss = 49.734014892578124
Epoch 10: Train loss = 45.8065771484375, Test loss = 49.82383728027344
Epoch 11: Train loss = 45.77349609375, Test loss = 49.85379638671875
Epoch 12: Train loss = 45.6452685546875, Test loss = 49.8962890625
Epoch 13: Train loss = 45.68373046875, Test loss = 49.853659057617186
Epoch 14: Train loss = 45.64

### categorical to numerical

In [10]:
df_train.copy()

Unnamed: 0,id,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,SNP_10,SNP_11,SNP_12,SNP_13,SNP_14,SNP_15,class
0,TRAIN_000,2,G G,A G,A A,G A,C A,A A,A A,G G,A A,G G,A G,A A,A A,A A,A A,B
1,TRAIN_001,2,A G,A G,C A,A A,A A,A G,A A,G A,A A,A G,A A,G A,G G,A A,A A,C
2,TRAIN_002,2,G G,G G,A A,G A,C C,G G,A A,G A,G A,A G,A A,A A,A A,A A,A A,B
3,TRAIN_003,1,A A,G G,A A,G A,A A,G G,G G,A A,G G,A G,G G,G G,G G,A A,G G,A
4,TRAIN_004,2,G G,G G,C C,A A,C C,A A,A A,A A,A A,G G,A A,A A,A G,A A,G A,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257,TRAIN_257,2,A G,A G,A A,G A,C C,A G,A A,G A,A A,G G,A G,G A,A A,A A,A A,B
258,TRAIN_258,2,G G,A A,C A,A A,A A,A G,G A,G A,A A,A G,A G,A A,A G,A A,G A,C
259,TRAIN_259,1,A G,G G,A A,G A,A A,A G,G G,G A,G A,A A,G G,G G,G G,C A,G G,A
260,TRAIN_260,1,A A,G G,A A,G A,A A,G G,G G,A A,G A,A G,A G,G A,G G,C A,G G,A


In [11]:
train2 = pd.get_dummies(df_train.iloc[:, 1:-1]).copy()
test2 = pd.get_dummies(df_test.iloc[:, 1:]).copy()
train2.head()

Unnamed: 0,trait,SNP_01_A A,SNP_01_A G,SNP_01_G G,SNP_02_A A,SNP_02_A G,SNP_02_G G,SNP_03_A A,SNP_03_C A,SNP_03_C C,...,SNP_12_G G,SNP_13_A A,SNP_13_A G,SNP_13_G G,SNP_14_A A,SNP_14_C A,SNP_14_C C,SNP_15_A A,SNP_15_G A,SNP_15_G G
0,2,0,0,1,0,1,0,1,0,0,...,0,1,0,0,1,0,0,1,0,0
1,2,0,1,0,0,1,0,0,1,0,...,0,0,0,1,1,0,0,1,0,0
2,2,0,0,1,0,0,1,1,0,0,...,0,1,0,0,1,0,0,1,0,0
3,1,1,0,0,0,0,1,1,0,0,...,1,0,0,1,1,0,0,0,0,1
4,2,0,0,1,0,0,1,0,0,1,...,0,0,1,0,1,0,0,0,1,0


In [12]:
# train에만 있거나, test에만 있는 칼럼 확인
target1 = train2.columns.tolist()
target2 = test2.columns.tolist()
[x for x in target1 if x not in target2], [y for y in target2 if y not in target2]

([], [])

In [13]:
train2 = df_train.copy()
test2 = df_test.copy()
for i in tqdm(range(1, 15+1)) :
    target = str(i) if i >= 10 else "0"+str(i)
    cols = sorted(train2[f"SNP_{target}"].unique().tolist())
    train2[f"SNP_{target}"] = train2[f"SNP_{target}"].map(lambda x : 0 if x==cols[0] else (1 if x==cols[1] else 2))
    test2[f"SNP_{target}"] = test2[f"SNP_{target}"].map(lambda x : 0 if x==cols[0] else (1 if x==cols[1] else 2))

train2.info(), test2.info()

  0%|          | 0/15 [00:00<?, ?it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 262 entries, 0 to 261
Data columns (total 18 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      262 non-null    object
 1   trait   262 non-null    int64 
 2   SNP_01  262 non-null    int64 
 3   SNP_02  262 non-null    int64 
 4   SNP_03  262 non-null    int64 
 5   SNP_04  262 non-null    int64 
 6   SNP_05  262 non-null    int64 
 7   SNP_06  262 non-null    int64 
 8   SNP_07  262 non-null    int64 
 9   SNP_08  262 non-null    int64 
 10  SNP_09  262 non-null    int64 
 11  SNP_10  262 non-null    int64 
 12  SNP_11  262 non-null    int64 
 13  SNP_12  262 non-null    int64 
 14  SNP_13  262 non-null    int64 
 15  SNP_14  262 non-null    int64 
 16  SNP_15  262 non-null    int64 
 17  class   262 non-null    object
dtypes: int64(16), object(2)
memory usage: 37.0+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175 entries, 0 to 174
Data columns (total 17 columns):
 #   Column  Non-N

(None, None)

In [14]:
train2['class_B'] = train2['class'].map(lambda x : 1 if x=='B' else 0)
train2['class_C'] = train2['class'].map(lambda x : 1 if x=='C' else 0)

for i in range(1, 15+1) :
    target = str(i) if i >= 10 else "0"+str(i)
    target2 = train2.groupby(f'SNP_{target}')['class_B', 'class_C'].sum()
    
    for j in range(3) :
        value = target2['class_B'] / (target2['class_B']+target2['class_C'])
        train2[f"SNP_{target}_ratio"] = train2[f"SNP_{target}"].map(lambda x : value[0] if x==0 else (value[1] if x==1 else value[2])).fillna(-1)    
        test2[f"SNP_{target}_ratio"] = test2[f"SNP_{target}"].map(lambda x : value[0] if x==0 else (value[1] if x==1 else value[2])).fillna(-1)    

train2.drop(columns=['class_B', 'class_C'], inplace=True)
train2.info(), test2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 262 entries, 0 to 261
Data columns (total 33 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            262 non-null    object 
 1   trait         262 non-null    int64  
 2   SNP_01        262 non-null    int64  
 3   SNP_02        262 non-null    int64  
 4   SNP_03        262 non-null    int64  
 5   SNP_04        262 non-null    int64  
 6   SNP_05        262 non-null    int64  
 7   SNP_06        262 non-null    int64  
 8   SNP_07        262 non-null    int64  
 9   SNP_08        262 non-null    int64  
 10  SNP_09        262 non-null    int64  
 11  SNP_10        262 non-null    int64  
 12  SNP_11        262 non-null    int64  
 13  SNP_12        262 non-null    int64  
 14  SNP_13        262 non-null    int64  
 15  SNP_14        262 non-null    int64  
 16  SNP_15        262 non-null    int64  
 17  class         262 non-null    object 
 18  SNP_01_ratio  262 non-null    

(None, None)

### Training VAE

In [15]:
train2['class'].value_counts()

B    114
C     79
A     69
Name: class, dtype: int64

In [16]:
from imblearn.over_sampling import SMOTENC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

categ_var = [x for x in range(16)]
strategy = {'A':1000,'B':1100, 'C':1000}
smotenc = SMOTENC(categorical_features=categ_var, sampling_strategy=strategy, random_state=2023)

train3 = train2.drop(columns=['id','class'])
train3.iloc[:, categ_var] = train3.iloc[:, categ_var].astype('category')
train3[:] = scaler.fit_transform(train3[:])

test3 = test2.drop(columns=['id'])
test3.iloc[:, categ_var] = test3.iloc[:, categ_var].astype('category')
test3[:] = scaler.transform(test3[:])

X, y = smotenc.fit_resample(train3, train2['class']) 

print(X.shape, y.shape)

X_train, X_valid, _, _ = train_test_split(X, y, stratify=y, random_state=2023)
print(X_train.shape, X_valid.shape)

(3100, 31) (3100,)
(2325, 31) (775, 31)


In [17]:
# MODEL 01 : 4개의 VECTOR로 출력
input_dim = X_train.shape[1]
latent_dim = 4
batch_size = X_train.shape[0]
num_epochs = 10000

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_data = torch.Tensor(X_train.to_numpy())
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size)
test_data = torch.Tensor(X_valid.to_numpy())
test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size)

model1 = VAE(input_dim, latent_dim).to(device)
optimizer = torch.optim.Adam(model1.parameters())

for epoch in range(num_epochs):
    train_loss = train(model1, optimizer, train_loader, device)
    test_loss = test(model1, test_loader, device)
    print(f"Epoch {epoch}: Train loss = {train_loss}, Test loss = {test_loss}")

Epoch 0: Train loss = 32.28847446236559, Test loss = 32.1818220766129
Epoch 1: Train loss = 32.21147177419355, Test loss = 32.09960685483871
Epoch 2: Train loss = 32.14242271505376, Test loss = 32.00030997983871
Epoch 3: Train loss = 32.10640793010753, Test loss = 31.967273185483872
Epoch 4: Train loss = 32.01118279569892, Test loss = 31.9303125
Epoch 5: Train loss = 31.98872311827957, Test loss = 31.880672883064516
Epoch 6: Train loss = 31.913645833333334, Test loss = 31.839944556451613
Epoch 7: Train loss = 31.851528897849462, Test loss = 31.754254032258064
Epoch 8: Train loss = 31.76776545698925, Test loss = 31.701990927419356
Epoch 9: Train loss = 31.771629704301077, Test loss = 31.63149445564516
Epoch 10: Train loss = 31.64459677419355, Test loss = 31.515362903225807
Epoch 11: Train loss = 31.57652553763441, Test loss = 31.633067036290324
Epoch 12: Train loss = 31.541038306451615, Test loss = 31.381118951612905
Epoch 13: Train loss = 31.429227150537635, Test loss = 31.344606854838

In [18]:
# MODEL 02 : 16개의 VECTOR로 출력
input_dim = X_train.shape[1]
latent_dim = 16
batch_size = X_train.shape[0]
num_epochs = 10000

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_data = torch.Tensor(X_train.to_numpy())
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size)
test_data = torch.Tensor(X_valid.to_numpy())
test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size)

model2 = VAE(input_dim, latent_dim).to(device)
optimizer = torch.optim.Adam(model2.parameters())

for epoch in range(num_epochs):
    train_loss = train(model2, optimizer, train_loader, device)
    test_loss = test(model2, test_loader, device)
    print(f"Epoch {epoch}: Train loss = {train_loss}, Test loss = {test_loss}")

Epoch 0: Train loss = 32.245443548387094, Test loss = 32.151381048387094
Epoch 1: Train loss = 32.18653561827957, Test loss = 32.11686491935484
Epoch 2: Train loss = 32.143649193548384, Test loss = 32.086335685483874
Epoch 3: Train loss = 32.11990591397849, Test loss = 32.05056955645161
Epoch 4: Train loss = 32.07338037634408, Test loss = 32.00637096774194
Epoch 5: Train loss = 32.04236223118279, Test loss = 31.966824596774195
Epoch 6: Train loss = 32.00645161290323, Test loss = 31.953296370967742
Epoch 7: Train loss = 31.981014784946236, Test loss = 31.927381552419355
Epoch 8: Train loss = 31.951959005376345, Test loss = 31.89609627016129
Epoch 9: Train loss = 31.92078629032258, Test loss = 31.86015625
Epoch 10: Train loss = 31.896868279569894, Test loss = 31.84360635080645
Epoch 11: Train loss = 31.873151881720432, Test loss = 31.816675907258066
Epoch 12: Train loss = 31.844452284946236, Test loss = 31.792837701612903
Epoch 13: Train loss = 31.82720430107527, Test loss = 31.757681451

### ADD VAE VARIABLES(ENCODER VALUES & RECONSTRUCTION ERRORS)

In [19]:
# vae 변수 생성
## model01의 encoding 값 및 데이터별 reconstruction error

### train data
data = torch.Tensor(train3.copy().to_numpy()).to(device)
target_01 = model1(data)
target_02 = model2(data)
encodings_01 = target_01[1].cpu().detach().numpy()
encodings_02 = target_02[1].cpu().detach().numpy()

errors_01 = []
errors_02 = []
for i in tqdm(range(len(data))) :
    value = data[i]
    recon_01 = model1(value)[0]
    recon_02 = model2(value)[0]
    error_01 = F.mse_loss(recon_01, value).cpu().detach().numpy()
    error_02 = F.mse_loss(recon_02, value).cpu().detach().numpy()
    
    errors_01.append(error_01)
    errors_02.append(error_02)

df_encoding1 = pd.DataFrame(data=encodings_01, columns=["enco01_"+str(x) for x in range(encodings_01.shape[1])])
df_encoding1['enco01_error'] = errors_01
df_encoding2 = pd.DataFrame(data=encodings_02, columns=["enco02_"+str(x) for x in range(encodings_02.shape[1])])
df_encoding2['enco02_error'] = errors_02
train3 = pd.concat([train2, df_encoding1, df_encoding2], axis=1)                    
    
### test data
data = torch.Tensor(test3.copy().to_numpy()).to(device)
target_01 = model1(data)
target_02 = model2(data)
encodings_01 = target_01[1].cpu().detach().numpy()
encodings_02 = target_02[1].cpu().detach().numpy()

errors_01 = []
errors_02 = []
for i in tqdm(range(len(data))) :
    value = data[i]
    recon_01 = model1(value)[0]
    recon_02 = model2(value)[0]
    error_01 = F.mse_loss(recon_01, value).cpu().detach().numpy()
    error_02 = F.mse_loss(recon_02, value).cpu().detach().numpy()
    
    errors_01.append(error_01)
    errors_02.append(error_02)

df_encoding1 = pd.DataFrame(data=encodings_01, columns=["enco01_"+str(x) for x in range(encodings_01.shape[1])])
df_encoding1['enco01_error'] = errors_01
df_encoding2 = pd.DataFrame(data=encodings_02, columns=["enco02_"+str(x) for x in range(encodings_02.shape[1])])
df_encoding2['enco02_error'] = errors_02
test3 = pd.concat([test2, df_encoding1, df_encoding2], axis=1)  

train3.info(), test2.info()

  0%|          | 0/262 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 262 entries, 0 to 261
Data columns (total 55 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            262 non-null    object 
 1   trait         262 non-null    int64  
 2   SNP_01        262 non-null    int64  
 3   SNP_02        262 non-null    int64  
 4   SNP_03        262 non-null    int64  
 5   SNP_04        262 non-null    int64  
 6   SNP_05        262 non-null    int64  
 7   SNP_06        262 non-null    int64  
 8   SNP_07        262 non-null    int64  
 9   SNP_08        262 non-null    int64  
 10  SNP_09        262 non-null    int64  
 11  SNP_10        262 non-null    int64  
 12  SNP_11        262 non-null    int64  
 13  SNP_12        262 non-null    int64  
 14  SNP_13        262 non-null    int64  
 15  SNP_14        262 non-null    int64  
 16  SNP_15        262 non-null    int64  
 17  class         262 non-null    object 
 18  SNP_01_ratio  262 non-null    

(None, None)

In [20]:
train3.to_csv("./data/train3.csv", index=False)
test3.to_csv("./data/test3.csv", index=False)

## Categorical Features
- 전체 변수들을 CATEGORY 타입으로 변환

In [21]:
train = pd.read_csv("./data/train3.csv")
test = pd.read_csv("./data/test3.csv")
train.iloc[:, 1:17] = train.iloc[:, 1:17].astype('category')
test.iloc[:, 1:17] = test.iloc[:, 1:17].astype('category')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 262 entries, 0 to 261
Data columns (total 55 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   id            262 non-null    object  
 1   trait         262 non-null    category
 2   SNP_01        262 non-null    category
 3   SNP_02        262 non-null    category
 4   SNP_03        262 non-null    category
 5   SNP_04        262 non-null    category
 6   SNP_05        262 non-null    category
 7   SNP_06        262 non-null    category
 8   SNP_07        262 non-null    category
 9   SNP_08        262 non-null    category
 10  SNP_09        262 non-null    category
 11  SNP_10        262 non-null    category
 12  SNP_11        262 non-null    category
 13  SNP_12        262 non-null    category
 14  SNP_13        262 non-null    category
 15  SNP_14        262 non-null    category
 16  SNP_15        262 non-null    category
 17  class         262 non-null    object  
 18  SNP_01_rat

In [22]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175 entries, 0 to 174
Data columns (total 54 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   id            175 non-null    object  
 1   trait         175 non-null    category
 2   SNP_01        175 non-null    category
 3   SNP_02        175 non-null    category
 4   SNP_03        175 non-null    category
 5   SNP_04        175 non-null    category
 6   SNP_05        175 non-null    category
 7   SNP_06        175 non-null    category
 8   SNP_07        175 non-null    category
 9   SNP_08        175 non-null    category
 10  SNP_09        175 non-null    category
 11  SNP_10        175 non-null    category
 12  SNP_11        175 non-null    category
 13  SNP_12        175 non-null    category
 14  SNP_13        175 non-null    category
 15  SNP_14        175 non-null    category
 16  SNP_15        175 non-null    category
 17  SNP_01_ratio  175 non-null    float64 
 18  SNP_02_rat

# STEP 02. MODELING & VALIDATION
- 데이터 증강여부에 따른 성능향상 유무를 확인
- Classifier와 Regressor를 동시에 사용해 자체적인 ensemble 효과 추가

0. All
1. A & notA
2. B & notB
3. C & notC

## 0. All
### without Aug

In [23]:
import warnings
warnings.filterwarnings(action='ignore')

import os
import gc
import math
import random
import pickle
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold

from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from imblearn.over_sampling import SMOTE, SMOTENC, SMOTEN

In [24]:
X, y = train.drop(columns=['id', 'class']), train['class'].map(lambda x : 0 if x=='A' else (1 if x=='B' else 2))
X_test = test.drop(columns=['id'])
X

Unnamed: 0,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,...,enco02_7,enco02_8,enco02_9,enco02_10,enco02_11,enco02_12,enco02_13,enco02_14,enco02_15,enco02_error
0,2,2,1,0,1,1,0,0,2,0,...,-0.004032,-0.003087,0.688055,-0.000842,0.002033,-0.002185,0.000576,-1.105726,0.000281,0.280199
1,2,1,1,1,0,0,1,0,1,0,...,0.001067,0.000942,-0.080874,-0.003213,-0.000995,-0.000384,0.001431,-0.178386,0.001243,0.188479
2,2,2,2,0,1,2,2,0,1,1,...,-0.008200,-0.008561,1.603863,0.001029,-0.002891,0.000040,0.000836,-2.192967,0.000403,0.375980
3,1,0,2,0,1,0,2,2,0,2,...,0.002672,0.002804,-1.202829,0.000851,-0.000689,0.000146,-0.004244,1.688365,0.001678,0.042934
4,2,2,2,2,0,2,0,0,0,0,...,-0.003805,-0.002670,1.267542,0.000700,0.001103,-0.002307,0.001045,-0.142882,0.000001,0.522556
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257,2,1,1,0,1,2,1,0,1,0,...,-0.004082,-0.004647,0.949844,-0.001329,0.003377,-0.001602,0.001909,-1.310202,-0.000624,0.118198
258,2,2,0,1,0,0,1,1,1,0,...,0.000448,0.001591,0.040117,-0.002339,-0.001808,-0.000087,0.001032,0.268166,0.001013,0.250635
259,1,1,2,0,1,0,1,2,1,1,...,0.004926,0.003631,-1.695253,-0.003528,0.008569,-0.001143,-0.001729,1.039266,-0.000917,0.342894
260,1,0,2,0,1,0,2,2,0,1,...,0.001471,-0.000723,-0.892943,-0.001610,0.000077,0.001162,-0.000772,-0.453117,0.000795,0.170510


In [25]:
y

0      1
1      2
2      1
3      0
4      2
      ..
257    1
258    2
259    0
260    0
261    1
Name: class, Length: 262, dtype: int64

In [26]:
X_test

Unnamed: 0,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,...,enco02_7,enco02_8,enco02_9,enco02_10,enco02_11,enco02_12,enco02_13,enco02_14,enco02_15,enco02_error
0,1,1,2,0,1,0,1,2,1,1,...,0.003211,-0.002199,-1.810056,-0.001662,0.005942,0.002844,-0.002069,-0.526608,-0.001598,0.544260
1,2,2,1,2,2,2,0,0,0,0,...,-0.001040,-0.000589,0.622486,-0.001828,0.004384,-0.002652,0.002314,-0.316670,-0.000206,0.364772
2,2,2,1,0,0,1,1,0,0,0,...,0.001004,0.002181,-0.211656,-0.002833,0.000770,-0.000301,0.001795,0.046846,0.001147,0.414162
3,2,2,1,1,0,2,0,0,0,0,...,-0.001710,0.000016,0.781748,-0.002488,0.004112,-0.003927,0.003004,-0.586991,0.000432,0.590315
4,1,0,2,0,2,0,2,2,0,2,...,-0.000353,-0.003142,-1.024187,0.002060,-0.004403,0.003571,-0.002743,-0.658122,0.002255,0.236359
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170,2,1,2,2,0,1,1,0,2,0,...,-0.007381,-0.007090,1.559830,0.005731,-0.003834,0.001004,-0.003342,0.194221,-0.000521,0.412815
171,2,2,0,0,0,1,1,0,0,0,...,-0.000391,-0.001029,-0.226889,-0.003115,-0.000190,0.000667,0.003065,-1.628244,0.001181,0.325912
172,2,2,0,0,0,1,1,0,0,0,...,0.000769,-0.000274,-0.261439,-0.003768,0.001948,0.000288,0.003070,-1.034385,0.000553,0.500954
173,2,1,2,1,1,2,2,0,1,0,...,-0.005194,-0.004537,1.102669,0.001604,-0.001404,-0.000886,-0.000364,-0.729035,-0.000446,0.435690


In [27]:
def catgbmc(inputX, inputY, validX, validY, params) :  
    var_categ = inputX.columns.tolist()[:16]
    model = CatBoostClassifier(
        cat_features=var_categ,
        **params,
        task_type='GPU',
        devices='0',
        # l2_leaf_reg=10
        # random_state=random_seed
        )
  
    model.fit(
        inputX, inputY,
        eval_set=(inputX, inputY),
        verbose=0
        );     

    pred = model.predict(validX)
    score = f1_score(validY, pred, average='macro')
    print(score)
    
    return model

In [28]:
def catgbmr(inputX, inputY, validX, validY, params) :  
    var_categ = inputX.columns.tolist()[:16]
    model = CatBoostRegressor(
        cat_features=var_categ,
        **params,
        task_type='GPU',
        devices='0',
        # l2_leaf_reg=10
        # random_state=random_seed
        )
  
    model.fit(
        inputX, inputY,
        eval_set=(inputX, inputY),
        verbose=0
        );     

    pred = model.predict(validX)
    score = f1_score(validY, np.round(pred), average='macro')
    print(score)
    
    return model

In [29]:
params = {'iterations':1000,
          'learning_rate':0.5}

model_cls = catgbmc(X, y, X, y, params)
model_reg = catgbmr(X, y, X, y,params)

1.0
1.0


In [30]:
params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 10,
          'auto_class_weights' : 'Balanced'}

model_cls = catgbmc(X, y, X, y, params)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 10}

model_reg = catgbmr(X, y, X, y, params)

1.0
1.0


In [31]:
params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 10,
          'auto_class_weights' : 'SqrtBalanced'}

model_cls = catgbmc(X, y, X, y, params)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100}

model_reg = catgbmr(X, y, X, y, params)

1.0
0.9964351351600956


In [32]:
params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100,
          'auto_class_weights' : 'Balanced'}

model_cls = catgbmc(X, y, X, y, params)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100}

model_reg = catgbmr(X, y, X, y, params)

1.0
0.9964351351600956


In [33]:
pred = model_cls.predict(X_test)

submit = pd.read_csv("./data/sample_submission.csv")
submit['class'] = pred
submit['class'] = submit['class'].map(lambda x : 'A' if x==0 else ('B' if x==1 else 'C'))
submit.to_csv("./submit.csv", index=False)
submit['class'].value_counts()

B    86
A    51
C    38
Name: class, dtype: int64

### Summary 
- 파생변수를 추가함으로써 기존에는 데이터 증강을 해야만 달성했던 성능인 1.0을 달성완료
- 하지만 이를 제출한 결과에서는 0.9622...

## 01. A & notA
### without Aug

In [34]:
X, y = train.drop(columns=['id', 'class']), (train['class'].values == 'A').astype(int)
X_test = test.drop(columns=['id'])
display(train, y)

Unnamed: 0,id,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,...,enco02_7,enco02_8,enco02_9,enco02_10,enco02_11,enco02_12,enco02_13,enco02_14,enco02_15,enco02_error
0,TRAIN_000,2,2,1,0,1,1,0,0,2,...,-0.004032,-0.003087,0.688055,-0.000842,0.002033,-0.002185,0.000576,-1.105726,0.000281,0.280199
1,TRAIN_001,2,1,1,1,0,0,1,0,1,...,0.001067,0.000942,-0.080874,-0.003213,-0.000995,-0.000384,0.001431,-0.178386,0.001243,0.188479
2,TRAIN_002,2,2,2,0,1,2,2,0,1,...,-0.008200,-0.008561,1.603863,0.001029,-0.002891,0.000040,0.000836,-2.192967,0.000403,0.375980
3,TRAIN_003,1,0,2,0,1,0,2,2,0,...,0.002672,0.002804,-1.202829,0.000851,-0.000689,0.000146,-0.004244,1.688365,0.001678,0.042934
4,TRAIN_004,2,2,2,2,0,2,0,0,0,...,-0.003805,-0.002670,1.267542,0.000700,0.001103,-0.002307,0.001045,-0.142882,0.000001,0.522556
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257,TRAIN_257,2,1,1,0,1,2,1,0,1,...,-0.004082,-0.004647,0.949844,-0.001329,0.003377,-0.001602,0.001909,-1.310202,-0.000624,0.118198
258,TRAIN_258,2,2,0,1,0,0,1,1,1,...,0.000448,0.001591,0.040117,-0.002339,-0.001808,-0.000087,0.001032,0.268166,0.001013,0.250635
259,TRAIN_259,1,1,2,0,1,0,1,2,1,...,0.004926,0.003631,-1.695253,-0.003528,0.008569,-0.001143,-0.001729,1.039266,-0.000917,0.342894
260,TRAIN_260,1,0,2,0,1,0,2,2,0,...,0.001471,-0.000723,-0.892943,-0.001610,0.000077,0.001162,-0.000772,-0.453117,0.000795,0.170510


array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0])

In [35]:
def catgbmc(inputX, inputY, validX, validY, params) :  
    var_categ = inputX.columns.tolist()[:16]
    model = CatBoostClassifier(
        cat_features=var_categ,
        **params,
        task_type='GPU',
        devices='0',
        )
  
    model.fit(
        inputX, inputY,
        eval_set=(inputX, inputY),
        verbose=0
        );     

    pred = model.predict(validX)
    score = f1_score(validY, pred, average='macro')
    print(score)
    
    return model

In [36]:
def catgbmr(inputX, inputY, validX, validY, params) :  
    var_categ = inputX.columns.tolist()[:16]
    model = CatBoostRegressor(
        cat_features=var_categ,
        **params,
        task_type='GPU',
        devices='0',
        )
  
    model.fit(
        inputX, inputY,
        eval_set=(inputX, inputY),
        verbose=0
        );     

    pred = model.predict(validX)
    score = f1_score(validY, np.round(pred), average='macro')
    print(score)
    
    return model

In [37]:
params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100,
          'auto_class_weights' : 'SqrtBalanced'}

model_cls = catgbmc(X, y, X, y, params)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100}

model_reg = catgbmr(X, y, X, y, params)

1.0
1.0


In [38]:
params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100,
          'auto_class_weights' : 'Balanced'}

model_cls = catgbmc(X, y, X, y, params)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 120}

model_reg = catgbmr(X, y, X, y, params)

1.0
1.0


### Summary
- A & notA 를 분류하는 것은 증강없이 1의 성능을 확인하였으므로, 증강을 진행하지 않음
- 이 사항은 파생변수를 추가하기 전후가 동일

## 02. B & notB
### without Aug

In [68]:
X, y = train.drop(columns=['id', 'class']), (train['class'].values == 'B').astype(int)
X_test = test.drop(columns=['id'])
display(train, y)

Unnamed: 0,id,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,...,enco02_7,enco02_8,enco02_9,enco02_10,enco02_11,enco02_12,enco02_13,enco02_14,enco02_15,enco02_error
0,TRAIN_000,2,2,1,0,1,1,0,0,2,...,-0.004032,-0.003087,0.688055,-0.000842,0.002033,-0.002185,0.000576,-1.105726,0.000281,0.280199
1,TRAIN_001,2,1,1,1,0,0,1,0,1,...,0.001067,0.000942,-0.080874,-0.003213,-0.000995,-0.000384,0.001431,-0.178386,0.001243,0.188479
2,TRAIN_002,2,2,2,0,1,2,2,0,1,...,-0.008200,-0.008561,1.603863,0.001029,-0.002891,0.000040,0.000836,-2.192967,0.000403,0.375980
3,TRAIN_003,1,0,2,0,1,0,2,2,0,...,0.002672,0.002804,-1.202829,0.000851,-0.000689,0.000146,-0.004244,1.688365,0.001678,0.042934
4,TRAIN_004,2,2,2,2,0,2,0,0,0,...,-0.003805,-0.002670,1.267542,0.000700,0.001103,-0.002307,0.001045,-0.142882,0.000001,0.522556
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257,TRAIN_257,2,1,1,0,1,2,1,0,1,...,-0.004082,-0.004647,0.949844,-0.001329,0.003377,-0.001602,0.001909,-1.310202,-0.000624,0.118198
258,TRAIN_258,2,2,0,1,0,0,1,1,1,...,0.000448,0.001591,0.040117,-0.002339,-0.001808,-0.000087,0.001032,0.268166,0.001013,0.250635
259,TRAIN_259,1,1,2,0,1,0,1,2,1,...,0.004926,0.003631,-1.695253,-0.003528,0.008569,-0.001143,-0.001729,1.039266,-0.000917,0.342894
260,TRAIN_260,1,0,2,0,1,0,2,2,0,...,0.001471,-0.000723,-0.892943,-0.001610,0.000077,0.001162,-0.000772,-0.453117,0.000795,0.170510


array([1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1])

In [40]:
params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 120,
          'auto_class_weights' : 'Balanced'}

model_cls = catgbmc(X, y, X, y, params)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 120}

model_reg = catgbmr(X, y, X, y, params)

1.0
1.0


In [41]:
params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100,
          'auto_class_weights' : 'SqrtBalanced'}

model_cls = catgbmc(X, y, X, y, params)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100}

model_reg = catgbmr(X, y, X, y, params)

1.0
1.0


### with Aug

In [42]:
pd.DataFrame(train['class'].values=='B').astype(int).value_counts()

0    148
1    114
dtype: int64

In [43]:
strategy1 = {0:1000, 1:1000}
strategy2 = {0:1000, 1:1200}
strategy3 = {0:1480, 1:1140}
categ_var = [x for x in range(16)]

smote1 = SMOTENC(categorical_features=categ_var, sampling_strategy=strategy1)
smote2 = SMOTENC(categorical_features=categ_var, sampling_strategy=strategy2)
smote3 = SMOTENC(categorical_features=categ_var, sampling_strategy=strategy3)

X1, y1 = smote1.fit_resample(X, y)
X2, y2 = smote2.fit_resample(X, y)
X3, y3 = smote3.fit_resample(X, y)

In [44]:
params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100,
          'auto_class_weights' : 'Balanced'}

model_cls = catgbmc(X1, y1, X, y, params)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100}

model_reg = catgbmr(X1, y1, X, y, params)

1.0
1.0


In [45]:
params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100,
          'auto_class_weights' : 'Balanced'}

model_cls = catgbmc(X2, y2, X, y, params)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100}

model_reg = catgbmr(X2, y2, X, y, params)

1.0
1.0


In [46]:
params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100,
          'auto_class_weights' : 'Balanced'}

model_cls = catgbmc(X3, y3, X, y, params)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100}

model_reg = catgbmr(X3, y3, X, y, params)

1.0
1.0


### Summary 
- B & notB 는 파생변수를 추가한 뒤 성능이 향상된 것으로 보이며, 증강여부에 따른 Test의 결과 차이를 확인해야할 것으로 보임

In [70]:
def catgbmc(inputX, inputY, validX, validY, params) :  
    var_categ = inputX.columns.tolist()[:16]
    model = CatBoostClassifier(
        cat_features=var_categ,
        **params,
        task_type='GPU',
        devices='0',
        )
  
    model.fit(
        inputX, inputY,
        eval_set=(inputX, inputY),
        verbose=0
        );     

    pred = model.predict(validX)
    score = f1_score(validY, pred, average='macro')
    print(score)
    
    return model

In [71]:
params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 120,
          'auto_class_weights' : 'Balanced'}

model1 = catgbmc(X, y, X, y, params)

strategy3 = {0:1480, 1:1140}
categ_var = [x for x in range(16)]
smote3 = SMOTENC(categorical_features=categ_var, sampling_strategy=strategy3)
X3, y3 = smote3.fit_resample(X, y)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 120,
          'auto_class_weights' : 'Balanced'}

model2 = catgbmc(X3, y3, X, y, params)

1.0
1.0


In [73]:
pred1 = model1.predict(X)
pred2 = model2.predict(X)

print(np.sum(pred1 == pred2) / len(pred1))

pred1 = model1.predict(X_test)
pred2 = model2.predict(X_test)

print(np.sum(pred1 == pred2) / len(pred1))

1.0
0.9942857142857143


- augmentation 유무에 따라 학습 데이터에 대해서는 동일하나, 테스트 데이터에 대해서는 다른 결과를 제시

## 03. C & notC
### without Aug

In [47]:
X, y = train.drop(columns=['id', 'class']), (train['class'].values == 'C').astype(int)
X_test = test.drop(columns=['id'])
display(train, y)

Unnamed: 0,id,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,...,enco02_7,enco02_8,enco02_9,enco02_10,enco02_11,enco02_12,enco02_13,enco02_14,enco02_15,enco02_error
0,TRAIN_000,2,2,1,0,1,1,0,0,2,...,-0.004032,-0.003087,0.688055,-0.000842,0.002033,-0.002185,0.000576,-1.105726,0.000281,0.280199
1,TRAIN_001,2,1,1,1,0,0,1,0,1,...,0.001067,0.000942,-0.080874,-0.003213,-0.000995,-0.000384,0.001431,-0.178386,0.001243,0.188479
2,TRAIN_002,2,2,2,0,1,2,2,0,1,...,-0.008200,-0.008561,1.603863,0.001029,-0.002891,0.000040,0.000836,-2.192967,0.000403,0.375980
3,TRAIN_003,1,0,2,0,1,0,2,2,0,...,0.002672,0.002804,-1.202829,0.000851,-0.000689,0.000146,-0.004244,1.688365,0.001678,0.042934
4,TRAIN_004,2,2,2,2,0,2,0,0,0,...,-0.003805,-0.002670,1.267542,0.000700,0.001103,-0.002307,0.001045,-0.142882,0.000001,0.522556
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257,TRAIN_257,2,1,1,0,1,2,1,0,1,...,-0.004082,-0.004647,0.949844,-0.001329,0.003377,-0.001602,0.001909,-1.310202,-0.000624,0.118198
258,TRAIN_258,2,2,0,1,0,0,1,1,1,...,0.000448,0.001591,0.040117,-0.002339,-0.001808,-0.000087,0.001032,0.268166,0.001013,0.250635
259,TRAIN_259,1,1,2,0,1,0,1,2,1,...,0.004926,0.003631,-1.695253,-0.003528,0.008569,-0.001143,-0.001729,1.039266,-0.000917,0.342894
260,TRAIN_260,1,0,2,0,1,0,2,2,0,...,0.001471,-0.000723,-0.892943,-0.001610,0.000077,0.001162,-0.000772,-0.453117,0.000795,0.170510


array([0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0])

In [48]:
params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 120,
          'auto_class_weights' : 'Balanced'}

model_cls = catgbmc(X, y, X, y, params)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 120}

model_reg = catgbmr(X, y, X, y, params)

1.0
1.0


In [49]:
params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100,
          'auto_class_weights' : 'SqrtBalanced'}

model_cls = catgbmc(X, y, X, y, params)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100}

model_reg = catgbmr(X, y, X, y, params)

1.0
1.0


### with Aug

In [50]:
pd.DataFrame(train['class'].values=='C').astype(int).value_counts()

0    183
1     79
dtype: int64

In [51]:
strategy1 = {0:1000, 1:1000}
strategy2 = {0:1000, 1:1200}
strategy3 = {0:1830, 1:7900}

smote1 = SMOTENC(categorical_features=categ_var, sampling_strategy=strategy1)
smote2 = SMOTENC(categorical_features=categ_var, sampling_strategy=strategy2)
smote3 = SMOTENC(categorical_features=categ_var, sampling_strategy=strategy3)

X1, y1 = smote1.fit_resample(X, y)
X2, y2 = smote2.fit_resample(X, y)
X3, y3 = smote3.fit_resample(X, y)

In [52]:
params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100,
          'auto_class_weights' : 'Balanced'}

model_cls = catgbmc(X1, y1, X, y, params)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100}

model_reg = catgbmr(X1, y1, X, y, params)

1.0
1.0


In [53]:
params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100,
          'auto_class_weights' : 'Balanced'}

model_cls = catgbmc(X2, y2, X, y, params)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100}

model_reg = catgbmr(X2, y2, X, y, params)

1.0
1.0


In [54]:
params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100,
          'auto_class_weights' : 'Balanced'}

model_cls = catgbmc(X3, y3, X, y, params)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100}

model_reg = catgbmr(X3, y3, X, y, params)

1.0
0.9954854828982511


### Summary
- C & notC case의 경우, 파생변수 생성 전에는 A보단 어렵지만 B보단 쉬운 정도의 문제로 보여짐
- B & notB case와 마찬가지로 데이터 증강 여부에 따른 성능 차이를 확인해보자

In [75]:
X, y = train.drop(columns=['id', 'class']), (train['class'].values == 'C').astype(int)
X_test = test.drop(columns=['id'])

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 120,
          'auto_class_weights' : 'Balanced'}

model1 = catgbmc(X, y, X, y, params)

strategy2 = {0:1000, 1:1200}
smote2 = SMOTENC(categorical_features=categ_var, sampling_strategy=strategy2)
X2, y2 = smote2.fit_resample(X, y)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 120,
          'auto_class_weights' : 'Balanced'}

model2 = catgbmc(X2, y2, X, y, params)

1.0
1.0


In [76]:
pred1 = model1.predict(X)
pred2 = model2.predict(X)

print(np.sum(pred1 == pred2) / len(pred1))

pred1 = model1.predict(X_test)
pred2 = model2.predict(X_test)

print(np.sum(pred1 == pred2) / len(pred1))

1.0
0.9828571428571429


- B&notB와 마찬가지로 증강 여부에 따라 학습 데이터셋에 대해선 동일하나 테스트셋에 대해서는 차이를 보임

## 04. B & C
### without Aug

In [55]:
X2, y2 = train[train['class']!='A'].drop(columns=['id', 'class']).reset_index(drop=True), (train[train['class']!='A']['class'].values == 'C').astype(int)
X_test = test.drop(columns=['id'])

display(X2, y)

Unnamed: 0,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,...,enco02_7,enco02_8,enco02_9,enco02_10,enco02_11,enco02_12,enco02_13,enco02_14,enco02_15,enco02_error
0,2,2,1,0,1,1,0,0,2,0,...,-0.004032,-0.003087,0.688055,-0.000842,0.002033,-0.002185,0.000576,-1.105726,0.000281,0.280199
1,2,1,1,1,0,0,1,0,1,0,...,0.001067,0.000942,-0.080874,-0.003213,-0.000995,-0.000384,0.001431,-0.178386,0.001243,0.188479
2,2,2,2,0,1,2,2,0,1,1,...,-0.008200,-0.008561,1.603863,0.001029,-0.002891,0.000040,0.000836,-2.192967,0.000403,0.375980
3,2,2,2,2,0,2,0,0,0,0,...,-0.003805,-0.002670,1.267542,0.000700,0.001103,-0.002307,0.001045,-0.142882,0.000001,0.522556
4,2,2,2,1,0,2,0,0,1,0,...,-0.004613,-0.003328,1.275776,0.000839,0.003242,-0.002615,0.001263,-0.383656,-0.001248,0.298391
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188,2,1,2,1,1,2,1,1,2,0,...,-0.005405,-0.003753,1.096697,0.002143,-0.002091,-0.001070,-0.000967,-0.347719,-0.000640,0.278815
189,2,2,2,1,1,2,1,0,2,0,...,-0.002952,-0.000803,0.881422,-0.000556,0.001010,-0.002509,0.001322,-0.125309,-0.001001,0.119747
190,2,1,1,0,1,2,1,0,1,0,...,-0.004082,-0.004647,0.949844,-0.001329,0.003377,-0.001602,0.001909,-1.310202,-0.000624,0.118198
191,2,2,0,1,0,0,1,1,1,0,...,0.000448,0.001591,0.040117,-0.002339,-0.001808,-0.000087,0.001032,0.268166,0.001013,0.250635


array([0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0])

In [56]:
X2.describe()

Unnamed: 0,SNP_01_ratio,SNP_02_ratio,SNP_03_ratio,SNP_04_ratio,SNP_05_ratio,SNP_06_ratio,SNP_07_ratio,SNP_08_ratio,SNP_09_ratio,SNP_10_ratio,...,enco02_7,enco02_8,enco02_9,enco02_10,enco02_11,enco02_12,enco02_13,enco02_14,enco02_15,enco02_error
count,193.0,193.0,193.0,193.0,193.0,193.0,193.0,193.0,193.0,193.0,...,193.0,193.0,193.0,193.0,193.0,193.0,193.0,193.0,193.0,193.0
mean,0.590674,0.590674,0.590674,0.590674,0.590674,0.590674,0.590674,0.590674,0.590674,0.590674,...,-0.002194,-0.001012,0.624524,-0.00042,-0.000814,-0.001001,0.000128,0.008688,0.000792,0.295798
std,0.11551,0.182372,0.085053,0.233799,0.271503,0.04142,0.125026,0.205745,0.138945,0.278487,...,0.002944,0.004388,0.683555,0.002733,0.003367,0.002746,0.002096,1.124863,0.001442,0.167111
min,0.0,0.333333,0.45283,0.40678,0.179487,0.540984,0.3,0.275862,0.538462,0.0,...,-0.01412,-0.012184,-0.483116,-0.007286,-0.015151,-0.008368,-0.010632,-2.504512,-0.002261,0.030562
25%,0.5,0.333333,0.45283,0.40678,0.486111,0.540984,0.644172,0.482353,0.538462,0.743243,...,-0.004032,-0.003709,0.117466,-0.002396,-0.003242,-0.00221,-0.001099,-0.692447,-0.000303,0.188479
50%,0.644928,0.637363,0.641304,0.40678,0.486111,0.598039,0.644172,0.482353,0.538462,0.743243,...,-0.002329,-0.001236,0.556738,-0.000412,-0.000562,-0.000915,0.000407,-0.137437,0.000829,0.256419
75%,0.644928,0.637363,0.641304,0.854839,0.878049,0.598039,0.644172,0.822785,0.538462,0.743243,...,-0.000291,0.0009,1.08759,0.00155,0.00122,0.000327,0.001527,0.834024,0.001779,0.365979
max,0.644928,0.822222,0.645833,1.0,0.878049,0.666667,0.644172,0.822785,1.0,0.743243,...,0.004323,0.028956,2.579753,0.006697,0.009032,0.025296,0.00486,2.931975,0.006727,1.104527


- trait 변수의 경우 변별성이 없으므로 제외

In [57]:
X2.drop(columns=['trait'], inplace=True)
X_test.drop(columns=['trait'], inplace=True)

In [58]:
def catgbmc(inputX, inputY, validX, validY, params) :  
    var_categ = inputX.columns.tolist()[:15]
    model = CatBoostClassifier(
        cat_features=var_categ,
        **params,
        task_type='GPU',
        devices='0',
        )
  
    model.fit(
        inputX, inputY,
        eval_set=(inputX, inputY),
        verbose=0
        );     

    pred = model.predict(validX)
    score = f1_score(validY, pred, average='macro')
    print(score)
    
    return model

In [59]:
def catgbmr(inputX, inputY, validX, validY, params) :  
    var_categ = inputX.columns.tolist()[:15]
    model = CatBoostRegressor(
        cat_features=var_categ,
        **params,
        task_type='GPU',
        devices='0',
        )
  
    model.fit(
        inputX, inputY,
        eval_set=(inputX, inputY),
        verbose=0
        );     

    pred = model.predict(validX)
    score = f1_score(validY, np.round(pred), average='macro')
    print(score)
    
    return model

In [60]:
params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 120,
          'auto_class_weights' : 'Balanced'}

model_cls = catgbmc(X2, y2, X2, y2, params)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 120}

model_reg = catgbmr(X2, y2, X2, y2, params)

1.0
1.0


In [61]:
params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100,
          'auto_class_weights' : 'SqrtBalanced'}

model_cls = catgbmc(X2, y2, X2, y2, params)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100}

model_reg = catgbmr(X2, y2, X2, y2, params)

1.0
1.0


### with aug

In [62]:
pd.DataFrame(y2).value_counts()

0    114
1     79
dtype: int64

In [63]:
strategy1 = {0:1000, 1:1000}
strategy2 = {0:1000, 1:1200}
strategy3 = {0:1140, 1:790}

categ_var = [x for x in range(15)]

smote1 = SMOTENC(categorical_features=categ_var, sampling_strategy=strategy1)
smote2 = SMOTENC(categorical_features=categ_var, sampling_strategy=strategy2)
smote3 = SMOTENC(categorical_features=categ_var, sampling_strategy=strategy3)

X01, y01 = smote1.fit_resample(X2, y2)
X02, y02 = smote2.fit_resample(X2, y2)
X03, y03 = smote3.fit_resample(X2, y2)

In [64]:
params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100,
          'auto_class_weights' : 'Balanced'}

model_cls = catgbmc(X01, y01, X2, y2, params)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100}

model_reg = catgbmr(X01, y01, X2, y2, params)

1.0
0.9946527027401435


In [65]:
params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100,
          'auto_class_weights' : 'Balanced'}

model_cls = catgbmc(X02, y02, X2, y2, params)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100}

model_reg = catgbmr(X02, y02, X2, y2, params)

1.0
1.0


In [66]:
params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100,
          'auto_class_weights' : 'Balanced'}

model_cls = catgbmc(X03, y03, X2, y2, params)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100}

model_reg = catgbmr(X03, y03, X2, y2, params)

1.0
1.0


### Summary
- C에 대한 비중을 늘려주거나, 똑같게 할 경우 가장 높은 성능을 보임

In [77]:
def catgbmc(inputX, inputY, validX, validY, params) :  
    var_categ = inputX.columns.tolist()[:15]
    model = CatBoostClassifier(
        cat_features=var_categ,
        **params,
        task_type='GPU',
        devices='0',
        )
  
    model.fit(
        inputX, inputY,
        eval_set=(inputX, inputY),
        verbose=0
        );     

    pred = model.predict(validX)
    score = f1_score(validY, pred, average='macro')
    print(score)
    
    return model

In [79]:
X2, y2 = train[train['class']!='A'].drop(columns=['id', 'class']).reset_index(drop=True), (train[train['class']!='A']['class'].values == 'C').astype(int)
X_test = test.drop(columns=['id'])
X2.drop(columns=['trait'], inplace=True)
X_test.drop(columns=['trait'], inplace=True)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 120,
          'auto_class_weights' : 'Balanced'}

model1 = catgbmc(X2, y2, X2, y2, params)

strategy3 = {0:1140, 1:790}

categ_var = [x for x in range(15)]
smote3 = SMOTENC(categorical_features=categ_var, sampling_strategy=strategy3)
X03, y03 = smote3.fit_resample(X2, y2)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 120,
          'auto_class_weights' : 'Balanced'}

model2 = catgbmc(X03, y03, X2, y2, params)

1.0
1.0


In [81]:
pred1 = model1.predict(X2)
pred2 = model2.predict(X2)

print(np.sum(pred1 == pred2) / len(pred1))

pred1 = model1.predict(X_test)
pred2 = model2.predict(X_test)

print(np.sum(pred1 == pred2) / len(pred1))

1.0
0.96


# STEP 03. ENSEMBLE
- 각 타겟 데이터들 별로 가장 성능이 좋았던 조합 구현
- 만약 cls의 성능이 동일하면 reg를 기준으로 가장 성능이 좋은 조합 구현
- 본래의 성능을 하락시키지 않는 선에서 규제 등을 추가

## 1) MODEL for ALL(w/o aug + w/ aug)

In [300]:
def catgbmc(inputX, inputY, validX, validY, params) :  
    var_categ = inputX.columns.tolist()[:16]
    model = CatBoostClassifier(
        cat_features=var_categ,
        **params,
        task_type='GPU',
        devices='0',
        )
  
    model.fit(
        inputX, inputY,
        eval_set=(validX, validY),
        verbose=0
        );     

    pred = model.predict(validX)
    score = f1_score(validY, pred, average='macro')
    print(score)
    
    return model

In [302]:
X, y = train.drop(columns=['id', 'class']), train['class'].map(lambda x : 0 if x=='A' else (1 if x=='B' else 2))
X_test = test.drop(columns=['id'])

categ_var = [x for x in range(16)]
strategy = {0:10000, 1:10000, 2:10000}
smote = SMOTENC(categorical_features=categ_var, sampling_strategy=strategy)
X1, y1 = smote.fit_resample(X, y)

params1 = {'iterations':3000,
          'learning_rate':0.05,
          'l2_leaf_reg' : 100}

params2 = {'iterations':3000,
          'learning_rate':0.03,
          'l2_leaf_reg' : 100}

model1 = catgbmc(X, y, X, y, params1)
model2 = catgbmc(X1, y1, X, y, params2)

pred1 = model1.predict_proba(X_test)
pred2 = model2.predict_proba(X_test)

0.9928566141091125
0.9784838350055741


## 2) MODEL for A&notA

In [303]:
X, y = train.drop(columns=['id', 'class']), (train['class'].values == 'A').astype(int)
X_test = test.drop(columns=['id'])

categ_var = [x for x in range(16)]
strategy = {0:10000, 1:10000}
smote = SMOTENC(categorical_features=categ_var, sampling_strategy=strategy)
X1, y1 = smote.fit_resample(X, y)

params1 = {'iterations':3000,
          'learning_rate':0.05,
          'l2_leaf_reg' : 100}

params2 = {'iterations':3000,
          'learning_rate':0.03,
          'l2_leaf_reg' : 100}

model3 = catgbmc(X, y, X, y, params1)
model4 = catgbmc(X1, y1, X, y, params2)

pred3 = model3.predict_proba(X_test)
pred4 = model4.predict_proba(X_test)

1.0
1.0


## 3) MODEL for B&notB

In [304]:
X, y = train.drop(columns=['id', 'class']), (train['class'].values == 'B').astype(int)
X_test = test.drop(columns=['id'])

categ_var = [x for x in range(16)]
strategy = {0:10000, 1:10000}
smote = SMOTENC(categorical_features=categ_var, sampling_strategy=strategy)
X1, y1 = smote.fit_resample(X, y)

params1 = {'iterations':3000,
          'learning_rate':0.05,
          'l2_leaf_reg' : 100}

params2 = {'iterations':3000,
          'learning_rate':0.03,
          'l2_leaf_reg' : 100}

model5 = catgbmc(X, y, X, y, params1)
model6 = catgbmc(X1, y1, X, y, params2)

pred5 = model5.predict_proba(X_test)
pred6 = model6.predict_proba(X_test)

0.9961138551446922
1.0


## 4) MODEL for C&notC

In [305]:
X, y = train.drop(columns=['id', 'class']), (train['class'].values == 'C').astype(int)
X_test = test.drop(columns=['id'])

categ_var = [x for x in range(16)]
strategy = {0:10000, 1:10000}
smote = SMOTENC(categorical_features=categ_var, sampling_strategy=strategy)
X1, y1 = smote.fit_resample(X, y)

params1 = {'iterations':3000,
          'learning_rate':0.05,
          'l2_leaf_reg' : 100}

params2 = {'iterations':3000,
          'learning_rate':0.03,
          'l2_leaf_reg' : 100}

model7 = catgbmc(X, y, X, y, params1)
model8 = catgbmc(X1, y1, X, y, params2)

pred7 = model7.predict_proba(X_test)
pred8 = model8.predict_proba(X_test)

1.0
1.0


## 5) MODEL for B&C

In [306]:
def catgbmc2(inputX, inputY, validX, validY, params) :  
    var_categ = inputX.columns.tolist()[:15]
    model = CatBoostClassifier(
        cat_features=var_categ,
        **params,
        task_type='GPU',
        devices='0',
        )
  
    model.fit(
        inputX, inputY,
        eval_set=(inputX, inputY),
        verbose=0
        );     

    pred = model.predict(validX)
    score = f1_score(validY, pred, average='macro')
    print(score)
    
    return model

In [307]:
X2, y2 = train[train['class']!='A'].drop(columns=['id', 'class']).reset_index(drop=True), (train[train['class']!='A']['class'].values == 'C').astype(int)
X_test2 = test.drop(columns=['id'])
X2.drop(columns=['trait'], inplace=True)
X_test2.drop(columns=['trait'], inplace=True)

categ_var = [x for x in range(15)]
strategy = {0:10000, 1:10000}
smote = SMOTENC(categorical_features=categ_var, sampling_strategy=strategy)
X3, y3 = smote.fit_resample(X2, y2)

params1 = {'iterations':3000,
          'learning_rate':0.05,
          'l2_leaf_reg' : 100}

params2 = {'iterations':3000,
          'learning_rate':0.03,
          'l2_leaf_reg' : 100}

model9 = catgbmc2(X2, y2, X2, y2, params1)
model10 = catgbmc2(X3, y3, X2, y2, params2)

pred9 = model9.predict_proba(X_test2)
pred10 = model10.predict_proba(X_test2)

1.0
0.9946527027401435


## 6) Make Preds

In [308]:
total = pd.DataFrame()
total['all_a'] = pred1[:,0]
total['all_b'] = pred1[:,1]
total['all_c'] = pred1[:,2]
total['all2_a'] = pred2[:,0]
total['all2_b'] = pred2[:,1]
total['all2_c'] = pred2[:,2]

total['a1'] = pred3[:,1]
total['a2'] = pred4[:,1]
total['b1'] = pred5[:,1]
total['b2'] = pred6[:,1]
total['c1'] = pred7[:,1]
total['c2'] = pred8[:,1]

total['bc_b'] = pred9[:,0]
total['bc2_b'] = pred10[:,0]

total

Unnamed: 0,all_a,all_b,all_c,all2_a,all2_b,all2_c,a1,a2,b1,b2,c1,c2,bc_b,bc2_b
0,0.912317,0.042927,0.044756,0.996212,0.001823,0.001965,0.997326,0.999988,0.008441,0.000461,0.011397,0.000215,0.211519,0.319131
1,0.018978,0.954898,0.026124,0.000397,0.998899,0.000704,0.002205,0.000016,0.981467,0.999790,0.004770,0.000123,0.994331,0.999918
2,0.033434,0.041921,0.924645,0.002772,0.004139,0.993089,0.002733,0.000013,0.005279,0.001455,0.978392,0.999500,0.032108,0.000526
3,0.028977,0.918092,0.052932,0.001185,0.988932,0.009883,0.002071,0.000013,0.984803,0.996480,0.115246,0.273497,0.992047,0.957978
4,0.957891,0.020984,0.021125,0.999823,0.000058,0.000119,0.997052,0.999987,0.005604,0.000039,0.003816,0.000047,0.078544,0.095311
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170,0.016235,0.955701,0.028063,0.000281,0.995770,0.003949,0.001869,0.000017,0.991280,0.999600,0.012867,0.000581,0.986111,0.999676
171,0.013198,0.016363,0.970439,0.000186,0.000516,0.999298,0.001995,0.000011,0.004691,0.000225,0.995960,0.999945,0.004939,0.000050
172,0.057594,0.080272,0.862134,0.009846,0.013227,0.976926,0.002698,0.000017,0.007443,0.002122,0.853735,0.995263,0.073563,0.002972
173,0.019223,0.944366,0.036411,0.000058,0.999824,0.000118,0.001977,0.000011,0.989378,0.999970,0.008130,0.000058,0.985139,0.999980


In [309]:
target_cols = ['all_a', 'all2_a', 'a1', 'a2']

highs, lows = [], []
for col in target_cols :
    high_idx = total[total[col] >= 0.5].index.tolist()
    low_idx = total[total[col] < 0.5].index.tolist()
    highs.append(high_idx)
    lows.append(low_idx)
    
for i in range(len(highs)-1) :
    high = highs[i]
    low = lows[i]
    for j in range(i, len(highs)) :
        high2 = highs[j]
        low2 = lows[j]
        if (high != high2) or (low != low2) :
            print("Not Same!", i, j)    

print("finish!")

finish!


In [310]:
# A는 확정
a_index = total[total['all_a'] > 0.5].index.tolist()
len(a_index)

51

In [311]:
total[(total.all_c >= 0.5) & (np.min(total[['all2_c', 'c1', 'c2']], axis=1) < 0.5)]

Unnamed: 0,all_a,all_b,all_c,all2_a,all2_b,all2_c,a1,a2,b1,b2,c1,c2,bc_b,bc2_b
12,0.051123,0.171888,0.776989,0.064657,0.264296,0.671047,0.001641,1.1e-05,0.153026,0.277208,0.874171,0.144946,0.115395,0.60997
126,0.053532,0.159094,0.787374,0.133399,0.232675,0.633926,0.004321,1.5e-05,0.154348,0.119593,0.652413,0.383019,0.189224,0.238155


In [312]:
total2 = pd.DataFrame()
total2['a'] = np.zeros(len(total))

total2['c_prob'] = (np.sum(total[['all_c', 'c1']], axis=1) + (1-total['bc_b'])) / 3
total2['c2_prob'] = (np.sum(total[['all2_c', 'c2']], axis=1) + (1-total['bc2_b'])) / 3
total2['c3_prob'] = (1-total['bc_b'])
total2['c4_prob'] = (1-total['bc2_b'])

total2['b_prob'] = np.mean(total[['all_b', 'b1', 'bc_b']], axis=1)
total2['b2_prob'] = np.mean(total[['all2_b', 'b2', 'bc2_b']], axis=1)
total2['b3_prob'] = total['bc_b']
total2['b4_prob'] = total['bc2_b']

total2.loc[a_index, 'a'] = 1
total2.iloc[a_index, 1:] = 0
total2

Unnamed: 0,a,c_prob,c2_prob,c3_prob,c4_prob,b_prob,b2_prob,b3_prob,b4_prob
0,1.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.0,0.012188,0.000303,0.005669,0.000082,0.976898,0.999536,0.994331,0.999918
2,0.0,0.956976,0.997354,0.967892,0.999474,0.026436,0.002040,0.032108,0.000526
3,0.0,0.058710,0.108467,0.007953,0.042022,0.964980,0.981130,0.992047,0.957978
4,1.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...
170,0.0,0.018273,0.001618,0.013889,0.000324,0.977698,0.998349,0.986111,0.999676
171,0.0,0.987153,0.999731,0.995061,0.999950,0.008665,0.000264,0.004939,0.000050
172,0.0,0.880769,0.989739,0.926437,0.997028,0.053759,0.006107,0.073563,0.002972
173,0.0,0.019801,0.000065,0.014861,0.000020,0.972961,0.999925,0.985139,0.999980


In [313]:
total2[(total2.c_prob >= 0.5) & ((total2.c2_prob < 0.5) | (total2.b_prob >= 0.5) | (total2.b2_prob >= 0.5)| (total2.b3_prob >= 0.5) | (total2.b4_prob >= 0.5))]

Unnamed: 0,a,c_prob,c2_prob,c3_prob,c4_prob,b_prob,b2_prob,b3_prob,b4_prob
12,0.0,0.845255,0.402008,0.884605,0.39003,0.146769,0.383825,0.115395,0.60997


In [314]:
c1_index = total2[total2.c_prob >= 0.5].index.tolist()
c2_index = total2[total2.c2_prob >= 0.5].index.tolist()
c3_index = total2[total2.c3_prob >= 0.5].index.tolist()
c4_index = total2[total2.c4_prob >= 0.5].index.tolist()

b1_index = total2[total2.b_prob >= 0.5].index.tolist()
b2_index = total2[total2.b2_prob >= 0.5].index.tolist()
b3_index = total2[total2.b3_prob >= 0.5].index.tolist()
b4_index = total2[total2.b4_prob >= 0.5].index.tolist()

In [315]:
total2.iloc[[117,119,162]]

Unnamed: 0,a,c_prob,c2_prob,c3_prob,c4_prob,b_prob,b2_prob,b3_prob,b4_prob
117,0.0,0.319208,0.719448,0.265688,0.923201,0.765201,0.489734,0.734312,0.076799
119,0.0,0.140675,0.500017,0.086005,0.655579,0.868845,0.616016,0.913995,0.344421
162,0.0,0.397922,0.947131,0.470636,0.963903,0.444691,0.168237,0.529364,0.036097


In [322]:
total_idx = [x for x in range(len(X_test))]
a_index = total[total['all_a'] >= 0.5].index.tolist()
b_index = total2[total2.b3_prob >= 0.5].index.tolist()
c_index = total2[total2.c3_prob >= 0.5].index.tolist()

print(len(a_index), len(b_index), len(c_index), len(a_index)+len(b_index)+len(c_index))                              

51 87 37 175


In [323]:
except_idx = [x for x in total_idx if (x not in a_index) & (x not in b_index) & (x not in c_index)]
except_idx

[]

In [324]:
total2.iloc[except_idx]

Unnamed: 0,a,c_prob,c2_prob,c3_prob,c4_prob,b_prob,b2_prob,b3_prob,b4_prob


In [325]:
submit = pd.read_csv("./data/sample_submission.csv")
submit['class'] = -1
submit.loc[a_index,'class'] = 0
submit.loc[b_index,'class'] = 1
submit.loc[c_index,'class'] = 2
submit['class'] = submit['class'].map(lambda x : 'A' if x==0 else ('B' if x==1 else 'C'))

submit.to_csv("./ensemble.csv", index=False)
submit['class'].value_counts()

B    87
A    51
C    37
Name: class, dtype: int64

In [326]:
submit

Unnamed: 0,id,class
0,TEST_000,A
1,TEST_001,B
2,TEST_002,C
3,TEST_003,B
4,TEST_004,A
...,...,...
170,TEST_170,B
171,TEST_171,C
172,TEST_172,C
173,TEST_173,B


In [327]:
submit.loc[[3,5,12,119,126,162,168], 'class']

3      B
5      B
12     C
119    B
126    C
162    B
168    B
Name: class, dtype: object

In [233]:
pd.read_csv("submit_0.9622_2.csv")

Unnamed: 0,id,class
0,TEST_000,A
1,TEST_001,B
2,TEST_002,C
3,TEST_003,B
4,TEST_004,A
...,...,...
170,TEST_170,B
171,TEST_171,C
172,TEST_172,C
173,TEST_173,B


In [205]:
target_cols = ['all_b', 'all2_b', 'b1', 'b2', 'bc_b', 'bc2_b']

highs, lows = [], []
for col in target_cols :
    high_idx = total[total[col] >= 0.5].index.tolist()
    low_idx = total[total[col] < 0.5].index.tolist()
    highs.append(high_idx)
    lows.append(low_idx)
    
for i in range(len(highs)-1) :
    high = highs[i]
    low = lows[i]
    for j in range(i, len(highs)) :
        high2 = highs[j]
        low2 = lows[j]
        if (high != high2) or (low != low2) :
            print("Not Same!", i, j)    

print("finish!")

Not Same! 0 1
Not Same! 0 3
Not Same! 0 4
Not Same! 0 5
Not Same! 1 2
Not Same! 1 3
Not Same! 1 4
Not Same! 2 3
Not Same! 2 4
Not Same! 2 5
Not Same! 3 4
Not Same! 3 5
Not Same! 4 5
finish!


In [206]:
total[(total.all_b >= 0.5) & (np.min(total[['all2_b', 'b1', 'b2', 'bc_b', 'bc2_b']], axis=1) < 0.5)]

Unnamed: 0,all_a,all_b,all_c,all2_a,all2_b,all2_c,a1,a2,b1,b2,c1,c2,bc_b,bc2_b,a_prob,b_prob,c_prob,pred
117,0.056267,0.728934,0.214799,0.019899,0.389816,0.590285,0.003081,6.8e-05,0.741767,0.756817,0.308518,0.929196,0.60774,0.027416,0.019829,0.317578,0.682422,2
119,0.040368,0.853287,0.106346,0.028666,0.396911,0.574424,0.003465,6.8e-05,0.741292,0.354281,0.120786,0.59446,0.845614,0.353159,0.018141,0.599386,0.400614,1


In [152]:
total['a_prob'] = (total['all_a']+total['all2_a']+total['a1']+total['a2']) / 4
total['b_prob'] = (total['all_b']+total['all2_b']+total['b1']+total['b2']+total['bc_b']+total['bc2_b']) / 6
total['c_prob'] = (total['all_c']+total['all2_c']+total['c1']+total['c2']+(1-total['bc_b'])+(1-total['bc2_b'])) / 6
total

Unnamed: 0,all_a,all_b,all_c,all2_a,all2_b,all2_c,a1,a2,b1,b2,c1,c2,bc_b,bc2_b,a_prob,b_prob,c_prob
0,0.960000,0.020477,0.019522,0.993239,0.002987,0.003775,0.999018,0.999975,0.002322,0.007068,0.005261,0.001262,0.102803,0.198903,0.988058,0.055760,0.288019
1,0.005849,0.983399,0.010752,0.000687,0.998075,0.001238,0.000830,0.000087,0.970961,0.999181,0.002340,0.000359,0.996485,0.999282,0.001863,0.991231,0.003154
2,0.016931,0.025062,0.958007,0.021292,0.036474,0.942235,0.000794,0.000024,0.001708,0.014502,0.991169,0.993658,0.028963,0.004733,0.009760,0.018574,0.975229
3,0.013692,0.946063,0.040245,0.011274,0.849615,0.139112,0.000573,0.000023,0.994444,0.973251,0.045093,0.425166,0.991711,0.974004,0.006390,0.954848,0.113984
4,0.981872,0.008518,0.009610,0.999210,0.000350,0.000440,0.999262,0.999911,0.001849,0.000449,0.001190,0.000292,0.054494,0.043463,0.995064,0.018187,0.318929
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170,0.004802,0.983683,0.011515,0.000915,0.996697,0.002389,0.000478,0.000048,0.997320,0.999280,0.007689,0.001007,0.996600,0.998835,0.001561,0.995402,0.004528
171,0.004125,0.005577,0.990297,0.000794,0.001113,0.998093,0.000599,0.000022,0.001496,0.000593,0.998823,0.999886,0.001442,0.000117,0.001385,0.001723,0.997590
172,0.037071,0.053709,0.909220,0.049914,0.039605,0.910480,0.000905,0.000029,0.003246,0.017738,0.834084,0.935085,0.053997,0.008178,0.021980,0.029412,0.921116
173,0.005738,0.977631,0.016631,0.000288,0.998911,0.000801,0.000790,0.000023,0.993248,0.999753,0.002913,0.000310,0.995922,0.999816,0.001710,0.994213,0.004153


In [177]:
total['a_prob'] = (total['all_a']+total['all2_a']+total['a1']+total['a2']) / 4
total['b_prob'] = (total['bc_b']+total['bc2_b']) / 2
total['c_prob'] = ((1-total['bc_b'])+(1-total['bc2_b'])) / 2
total

Unnamed: 0,all_a,all_b,all_c,all2_a,all2_b,all2_c,a1,a2,b1,b2,c1,c2,bc_b,bc2_b,a_prob,b_prob,c_prob,pred
0,0.960000,0.020477,0.019522,0.993239,0.002987,0.003775,0.999018,0.999975,0.002322,0.007068,0.005261,0.001262,0.102803,0.198903,0.988058,0.150853,0.849147,0
1,0.005849,0.983399,0.010752,0.000687,0.998075,0.001238,0.000830,0.000087,0.970961,0.999181,0.002340,0.000359,0.996485,0.999282,0.001863,0.997884,0.002116,1
2,0.016931,0.025062,0.958007,0.021292,0.036474,0.942235,0.000794,0.000024,0.001708,0.014502,0.991169,0.993658,0.028963,0.004733,0.009760,0.016848,0.983152,2
3,0.013692,0.946063,0.040245,0.011274,0.849615,0.139112,0.000573,0.000023,0.994444,0.973251,0.045093,0.425166,0.991711,0.974004,0.006390,0.982857,0.017143,1
4,0.981872,0.008518,0.009610,0.999210,0.000350,0.000440,0.999262,0.999911,0.001849,0.000449,0.001190,0.000292,0.054494,0.043463,0.995064,0.048978,0.951022,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170,0.004802,0.983683,0.011515,0.000915,0.996697,0.002389,0.000478,0.000048,0.997320,0.999280,0.007689,0.001007,0.996600,0.998835,0.001561,0.997717,0.002283,1
171,0.004125,0.005577,0.990297,0.000794,0.001113,0.998093,0.000599,0.000022,0.001496,0.000593,0.998823,0.999886,0.001442,0.000117,0.001385,0.000779,0.999221,2
172,0.037071,0.053709,0.909220,0.049914,0.039605,0.910480,0.000905,0.000029,0.003246,0.017738,0.834084,0.935085,0.053997,0.008178,0.021980,0.031087,0.968913,2
173,0.005738,0.977631,0.016631,0.000288,0.998911,0.000801,0.000790,0.000023,0.993248,0.999753,0.002913,0.000310,0.995922,0.999816,0.001710,0.997869,0.002131,1


In [178]:
np.sum(np.sum(total[['a_prob', 'b_prob', 'c_prob']] > 0.5, axis=1) == 0)

0

In [179]:
total['pred'] = np.argmax(total[['a_prob', 'b_prob', 'c_prob']].values, axis=1)
total['pred']

0      0
1      1
2      2
3      1
4      0
      ..
170    1
171    2
172    2
173    1
174    1
Name: pred, Length: 175, dtype: int64

In [155]:
df1 = pd.read_csv("./data/submit_0.99078.csv")
df2 = pd.read_csv("./data/submit_0.99078_2.csv")
df3 = pd.read_csv("./data/submit_0.99078_3.csv")
df4 = pd.read_csv("./data/submit_0.99078_4.csv")

In [180]:
df = pd.DataFrame()
df['pred'] = total['pred']
df['answer'] = df['pred'].map(lambda x : 'A' if x==0 else('B' if x==1 else 'C'))
df['submit1'] = df1['class']
df['submit2'] = df2['class']
df['submit3'] = df3['class']
df['submit4'] = df4['class']
df

Unnamed: 0,pred,answer,submit1,submit2,submit3,submit4
0,0,A,A,A,A,A
1,1,B,B,B,B,B
2,2,C,C,C,C,C
3,1,B,C,C,C,C
4,0,A,A,A,A,A
...,...,...,...,...,...,...
170,1,B,B,B,B,B
171,2,C,C,C,C,C
172,2,C,C,C,C,C
173,1,B,B,B,B,B


In [181]:
submit = pd.read_csv("./data/sample_submission.csv")
submit['class'] = df['answer']
submit.to_csv("./ensemble.csv", index=False)
submit['class'].value_counts()

B    85
A    48
C    42
Name: class, dtype: int64

In [182]:
display(df[df.answer != df.submit1],df[df.answer != df.submit2],df[df.answer != df.submit3],df[df.answer != df.submit4])

Unnamed: 0,pred,answer,submit1,submit2,submit3,submit4
3,1,B,C,C,C,C
5,1,B,C,C,C,C
8,2,C,A,A,A,A
12,2,C,B,C,C,C
60,2,C,A,A,A,A
119,1,B,C,C,C,C
126,2,C,B,C,B,B
140,2,C,A,A,A,A


Unnamed: 0,pred,answer,submit1,submit2,submit3,submit4
3,1,B,C,C,C,C
5,1,B,C,C,C,C
8,2,C,A,A,A,A
60,2,C,A,A,A,A
119,1,B,C,C,C,C
140,2,C,A,A,A,A


Unnamed: 0,pred,answer,submit1,submit2,submit3,submit4
3,1,B,C,C,C,C
5,1,B,C,C,C,C
8,2,C,A,A,A,A
60,2,C,A,A,A,A
119,1,B,C,C,C,C
126,2,C,B,C,B,B
140,2,C,A,A,A,A


Unnamed: 0,pred,answer,submit1,submit2,submit3,submit4
3,1,B,C,C,C,C
5,1,B,C,C,C,C
8,2,C,A,A,A,A
60,2,C,A,A,A,A
119,1,B,C,C,C,C
126,2,C,B,C,B,B
140,2,C,A,A,A,A
168,1,B,B,B,B,C


In [183]:
df.iloc[[3,5,12,119,126,162,168]]

Unnamed: 0,pred,answer,submit1,submit2,submit3,submit4
3,1,B,C,C,C,C
5,1,B,C,C,C,C
12,2,C,B,C,C,C
119,1,B,C,C,C,C
126,2,C,B,C,B,B
162,2,C,C,C,C,C
168,1,B,B,B,B,C


In [160]:
total

Unnamed: 0,all_a,all_b,all_c,all2_a,all2_b,all2_c,a1,a2,b1,b2,c1,c2,bc_b,bc2_b,a_prob,b_prob,c_prob,pred
0,0.960000,0.020477,0.019522,0.993239,0.002987,0.003775,0.999018,0.999975,0.002322,0.007068,0.005261,0.001262,0.102803,0.198903,0.988058,0.055760,0.288019,0
1,0.005849,0.983399,0.010752,0.000687,0.998075,0.001238,0.000830,0.000087,0.970961,0.999181,0.002340,0.000359,0.996485,0.999282,0.001863,0.991231,0.003154,1
2,0.016931,0.025062,0.958007,0.021292,0.036474,0.942235,0.000794,0.000024,0.001708,0.014502,0.991169,0.993658,0.028963,0.004733,0.009760,0.018574,0.975229,2
3,0.013692,0.946063,0.040245,0.011274,0.849615,0.139112,0.000573,0.000023,0.994444,0.973251,0.045093,0.425166,0.991711,0.974004,0.006390,0.954848,0.113984,1
4,0.981872,0.008518,0.009610,0.999210,0.000350,0.000440,0.999262,0.999911,0.001849,0.000449,0.001190,0.000292,0.054494,0.043463,0.995064,0.018187,0.318929,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170,0.004802,0.983683,0.011515,0.000915,0.996697,0.002389,0.000478,0.000048,0.997320,0.999280,0.007689,0.001007,0.996600,0.998835,0.001561,0.995402,0.004528,1
171,0.004125,0.005577,0.990297,0.000794,0.001113,0.998093,0.000599,0.000022,0.001496,0.000593,0.998823,0.999886,0.001442,0.000117,0.001385,0.001723,0.997590,2
172,0.037071,0.053709,0.909220,0.049914,0.039605,0.910480,0.000905,0.000029,0.003246,0.017738,0.834084,0.935085,0.053997,0.008178,0.021980,0.029412,0.921116,2
173,0.005738,0.977631,0.016631,0.000288,0.998911,0.000801,0.000790,0.000023,0.993248,0.999753,0.002913,0.000310,0.995922,0.999816,0.001710,0.994213,0.004153,1


In [184]:
df['pred1'] = np.argmax(total[['all_a', 'all_b', 'all_c']].values, axis=1)
df['pred2'] = np.argmax(total[['all2_a', 'all2_b', 'all2_c']].values, axis=1)
df.iloc[[3,5,12,119,126,162,168]]

Unnamed: 0,pred,answer,submit1,submit2,submit3,submit4,pred1,pred2
3,1,B,C,C,C,C,1,1
5,1,B,C,C,C,C,1,1
12,2,C,B,C,C,C,2,2
119,1,B,C,C,C,C,1,2
126,2,C,B,C,B,B,2,2
162,2,C,C,C,C,C,2,2
168,1,B,B,B,B,C,1,1


In [127]:
total['all'].value_counts()

KeyError: 'all'

In [None]:
total['argmax'].value_counts()

In [None]:
total[total['all'] != total['argmax']]

In [None]:
target_index = total[total['all'] != total['argmax']].index.tolist()
total['answer'] = total['all']

for target in target_index :
    score = total['bc'][target]
    if score <= 0.5 :
        total['answer'][target] = 1
    else :
        total['answer'][target] = 2
        
total

In [None]:
submit = pd.read_csv("./data/sample_submission.csv")
submit['class'] = total['answer'].map(lambda x : 'A' if x==0 else ('B' if x==1 else 'C'))
submit['class'].value_counts()

In [None]:
submit.to_csv("./ensemble.csv", index=False)

# STEP 04. STACKING

In [None]:
X, X_test = train.drop(columns=['id', 'class']), test.drop(columns=['id'])
train[['all_a', 'all_b', 'all_c']] = model_all.predict_proba(X)
test[['all_a', 'all_b', 'all_c']] = model_all.predict_proba(X_test)

train['prob_a'] = model_a.predict_proba(X)[:,1]
test['prob_a'] = model_a.predict_proba(X_test)[:,1]
train['prob_b'] = model_b.predict_proba(X)[:,1]
test['prob_b'] = model_b.predict_proba(X_test)[:,1]
train['prob_c'] = model_c.predict_proba(X)[:,1]
test['prob_c'] = model_c.predict_proba(X_test)[:,1]

train['prob_bc'] = model_bc.predict_proba(X.drop(columns=['trait']))[:,1]
test['prob_bc'] = model_bc.predict_proba(X_test.drop(columns=['trait']))[:,1]

train.info()

In [None]:
test.info()

In [None]:
X_new, y_new = train.drop(columns=['id', 'class']), train['class'].map(lambda x : 0 if x=='A' else (1 if x=='B' else 2))
X_test_new = test.drop(columns=['id'])

In [None]:
def catgbmc(inputX, inputY, validX, validY, params) :  
    var_categ = inputX.columns.tolist()[:16]
    model = CatBoostClassifier(
        cat_features=var_categ,
        **params,
        task_type='GPU',
        devices='0',
        # l2_leaf_reg=10
        # random_state=random_seed
        )
  
    model.fit(
        inputX, inputY,
        eval_set=(inputX, inputY),
        verbose=0
        );     

    pred = model.predict(validX)
    score = f1_score(validY, pred, average='macro')
    print(score)
    
    return model

In [None]:
def catgbmr(inputX, inputY, validX, validY, params) :  
    var_categ = inputX.columns.tolist()[:16]
    model = CatBoostRegressor(
        cat_features=var_categ,
        **params,
        task_type='GPU',
        devices='0',
        # l2_leaf_reg=10
        # random_state=random_seed
        )
  
    model.fit(
        inputX, inputY,
        eval_set=(inputX, inputY),
        verbose=0
        );     

    pred = model.predict(validX)
    score = f1_score(validY, np.round(pred), average='macro')
    print(score)
    
    return model

In [None]:
params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100,
          'auto_class_weights' : 'Balanced'}

model_cls = catgbmc(X_new, y_new, X_new, y_new, params)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100}

model_reg = catgbmr(X_new, y_new, X_new, y_new, params)

In [None]:
total['new_cls'] = model_cls.predict(X_test_new)
total['new_reg'] = model_reg.predict(X_test_new)
total['new_reg2'] = np.round(model_reg.predict(X_test_new)).astype(int)
total

In [None]:
total['new_reg2'].value_counts()

In [None]:
total[total.answer != total.new_cls]

In [None]:
total[total.answer != total.new_reg2]

In [None]:
total.new_cls.value_counts()

In [None]:
submit = pd.read_csv("./data/sample_submission.csv")
submit['class'] = total['new_cls'].map(lambda x : 'A' if x==0 else ('B' if x==1 else 'C'))
submit['class'].value_counts()

In [None]:
submit.loc[[3,5,12,119,126,162,168], 'class']

In [None]:
submit.to_csv("./stacking.csv", index=False)

In [None]:
submit = pd.read_csv("./data/sample_submission.csv")
submit['class'] = total['new_reg2'].map(lambda x : 'A' if x==0 else ('B' if x==1 else 'C'))
submit['class'].value_counts()
submit.to_csv("./stacking2.csv", index=False)

In [None]:
submit.loc[[3,5,12,119,126,162,168], 'class']

In [None]:
pd.DataFrame(data=model1.get_feature_importance(), index=model1.feature_names_, columns=['feature_importance']).sort_values('feature_importance', ascending=False)

In [None]:
pd.DataFrame(data=model2.get_feature_importance(), index=model2.feature_names_, columns=['feature_importance']).sort_values('feature_importance', ascending=False)

In [None]:
model2.predict(X_test).flatten().shape

In [None]:
pred = model2.predict(X_test).flatten()
high1 = pd.read_csv("./data/submit_0.99078.csv")
high2 = pd.read_csv("./data/submit_0.99078_2.csv")
high3 = pd.read_csv("./data/submit_0.99078_3.csv")
high4 = pd.read_csv("./data/submit_0.99078_4.csv")

total = pd.DataFrame()
total['pred'] = pd.Series(pred)
total['high1'] = high1['class']
total['high2'] = high2['class']
total['high3'] = high3['class']
total['high4'] = high4['class']

total

In [None]:
total[total.pred != total.high1]

In [None]:
total[total.pred != total.high2]

In [None]:
total[total.pred != total.high3]

In [None]:
high1 = pd.read_csv("./data/submit_0.99078.csv")
high2 = pd.read_csv("./data/submit_0.99078_2.csv")
high3 = pd.read_csv("./data/submit_0.99078_3.csv")
high4 = pd.read_csv("./data/submit_0.99078_4.csv")

pred_A = model1.predict_proba(X_test)[:,0]
pred_B = model1.predict_proba(X_test)[:,1]
pred_C = model1.predict_proba(X_test)[:,2]

pred_AA = model2.predict_proba(X_test)[:,0]
pred_BB = model2.predict_proba(X_test)[:,1]
pred_CC = model2.predict_proba(X_test)[:,2]

total = pd.DataFrame()
total['high1'] = high1['class']
total['high2'] = high2['class']
total['high3'] = high3['class']
total['high4'] = high4['class']

total['a_prob'] = pred_A
total['b_prob'] = pred_B
total['c_prob'] = pred_C
total['aa_prob'] = pred_AA
total['bb_prob'] = pred_BB
total['cc_prob'] = pred_CC

total['answer1'] = np.argmax(total[['a_prob', 'b_prob', 'c_prob']].values, axis=1)
total['answer2'] = np.argmax(total[['aa_prob', 'bb_prob', 'cc_prob']].values, axis=1)
total['target1'] = total.answer1.map(lambda x : 'A' if x==0 else ('B' if x==1 else 'C'))
total['target2'] = total.answer2.map(lambda x : 'A' if x==0 else ('B' if x==1 else 'C'))

total

In [None]:
total[total.target1 != total.target2]