In [1]:
import warnings
warnings.filterwarnings(action='ignore')

import os
import gc
import math
import random
import pickle
import pandas as pd
import numpy as np
import multiprocessing
from tqdm.auto import tqdm

from sklearn.metrics import f1_score

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset, sampler

from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from imblearn.over_sampling import SMOTE, SMOTENC, SMOTEN, RandomOverSampler, ADASYN
from imblearn.under_sampling import RandomUnderSampler, EditedNearestNeighbours, RepeatedEditedNearestNeighbours, AllKNN, CondensedNearestNeighbour, OneSidedSelection, NeighbourhoodCleaningRule

import shap
import catboost
from catboost import Pool, cv

pd.set_option('display.max_columns', 500)

In [2]:
train = pd.read_csv("./data/train.csv").drop(columns=['father', 'mother', 'gender'])
train.drop_duplicates(subset=train.columns.tolist()[5:20], inplace=True, ignore_index=True)
test = pd.read_csv("./data/test.csv").drop(columns=['father', 'mother', 'gender'])

train.iloc[:, 1:-1] = train.iloc[:, 1:-1].astype('category')
test.iloc[:, 1:] = test.iloc[:, 1:].astype('category')

answer = np.zeros(len(test)) - 1

train.info(), test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 248 entries, 0 to 247
Data columns (total 18 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   id      248 non-null    object  
 1   trait   248 non-null    category
 2   SNP_01  248 non-null    category
 3   SNP_02  248 non-null    category
 4   SNP_03  248 non-null    category
 5   SNP_04  248 non-null    category
 6   SNP_05  248 non-null    category
 7   SNP_06  248 non-null    category
 8   SNP_07  248 non-null    category
 9   SNP_08  248 non-null    category
 10  SNP_09  248 non-null    category
 11  SNP_10  248 non-null    category
 12  SNP_11  248 non-null    category
 13  SNP_12  248 non-null    category
 14  SNP_13  248 non-null    category
 15  SNP_14  248 non-null    category
 16  SNP_15  248 non-null    category
 17  class   248 non-null    object  
dtypes: category(16), object(2)
memory usage: 9.9+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175 entries, 0 to 174
Data colu

(None, None)

In [3]:
def catgbmc(inputX, inputY, validX, validY, params) :  
    var_categ = inputX.columns.tolist()
    model = CatBoostClassifier(
        cat_features=var_categ,
        **params,
        task_type='GPU',
        devices='0',
        )
  
    model.fit(
        inputX, inputY,
        eval_set=(inputX, inputY)
        );     

    pred = model.predict(validX)
    score = f1_score(validY, pred, average='macro')
    print(score)
    
    return model

In [4]:
X, y = train.drop(columns=['id', 'class']), (train['class'].values == 'A').astype('int')
X_test = test.drop(columns=['id']).copy()

params = {'iterations':100,
          'learning_rate':0.05,
          'l2_leaf_reg' : 10,
          'loss_function' : 'CrossEntropy',
          'eval_metric' : 'F1',
          'verbose':0,
          'random_seed':0}

modelA = catgbmc(X, y, X, y, params)
predA = modelA.predict(X_test)
answer += predA

0.9944720593807814


In [5]:
train['class'].value_counts()

B    111
C     77
A     60
Name: class, dtype: int64

In [6]:
# text 형태의 categorical 변수들을 숫자형태로 변경

for i in tqdm(range(1, 15+1)) :
    target = str(i) if i >= 10 else "0"+str(i)
    cols = sorted(train[f"SNP_{target}"].unique().tolist())
    train[f"SNP_{target}"] = train[f"SNP_{target}"].map(lambda x : 0 if x==cols[0] else (1 if x==cols[1] else 2))
    test[f"SNP_{target}"] = test[f"SNP_{target}"].map(lambda x : 0 if x==cols[0] else (1 if x==cols[1] else 2))

train.info(), test.info()

  0%|          | 0/15 [00:00<?, ?it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 248 entries, 0 to 247
Data columns (total 18 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   id      248 non-null    object  
 1   trait   248 non-null    category
 2   SNP_01  248 non-null    category
 3   SNP_02  248 non-null    category
 4   SNP_03  248 non-null    category
 5   SNP_04  248 non-null    category
 6   SNP_05  248 non-null    category
 7   SNP_06  248 non-null    category
 8   SNP_07  248 non-null    category
 9   SNP_08  248 non-null    category
 10  SNP_09  248 non-null    category
 11  SNP_10  248 non-null    category
 12  SNP_11  248 non-null    category
 13  SNP_12  248 non-null    category
 14  SNP_13  248 non-null    category
 15  SNP_14  248 non-null    category
 16  SNP_15  248 non-null    category
 17  class   248 non-null    object  
dtypes: category(16), object(2)
memory usage: 9.9+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175 entries, 0 to 174
Data colu

(None, None)

In [7]:
random_seed=0
strategy1 = {0 : 40, 1 : 70, 2 : 50}

under1 = RandomUnderSampler(sampling_strategy=strategy1, random_state=random_seed)
under2 = EditedNearestNeighbours()
under3 = RepeatedEditedNearestNeighbours()
under4 = AllKNN()
under5 = CondensedNearestNeighbour(random_state=random_seed)
under6 = OneSidedSelection(random_state=random_seed)
under7 = NeighbourhoodCleaningRule()

X, y = train.iloc[:, 1:-1], train['class'].map(lambda x : 0 if x=='A' else (1 if x=='B' else 2)).values

X1, y1 = under1.fit_resample(X, y)
X2, y2 = under2.fit_resample(X, y)
X3, y3 = under3.fit_resample(X, y)
X4, y4 = under4.fit_resample(X, y)
X5, y5 = under5.fit_resample(X, y)
X6, y6 = under6.fit_resample(X, y)
X7, y7 = under7.fit_resample(X, y)

In [8]:
random_seed=0
strategy2 = {0 : 120, 1 : 220, 2 : 150}

over1 = SMOTEN(sampling_strategy=strategy2, random_state=random_seed)
over2 = SMOTE(sampling_strategy=strategy2, random_state=random_seed)
over3 = RandomOverSampler(sampling_strategy=strategy2, random_state=random_seed)

X8, y8 = over1.fit_resample(X, y)
X9, y9 = over2.fit_resample(X, y)
X10, y10 = over3.fit_resample(X, y)

In [9]:
len(list(y1)+list(y2))

356

In [10]:
df_train = pd.concat([X1, X2, X3, X4, X5, X6, X7, X8, X9, X10], ignore_index=True)

df_train['class'] = list(y1)+list(y2)+list(y3)+list(y4)+list(y5)+list(y6)+list(y7)+list(y8)+list(y9)+list(y10)

df_train

Unnamed: 0,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,SNP_10,SNP_11,SNP_12,SNP_13,SNP_14,SNP_15,class
0,1,1,2,0,2,0,2,2,0,2,0,2,1,2,1,1,0
1,1,1,2,0,1,1,1,1,0,1,0,2,1,2,1,0,0
2,1,0,2,0,1,0,2,2,0,1,1,1,1,2,1,2,0
3,1,0,2,0,2,0,2,2,0,1,1,2,1,2,1,2,0
4,1,0,2,0,1,1,2,1,0,0,0,2,2,2,2,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2605,2,2,0,1,0,2,1,0,1,0,1,0,0,1,0,2,2
2606,2,2,1,0,0,0,0,1,2,0,2,1,1,1,0,0,2
2607,2,2,0,1,0,1,1,0,1,0,2,1,1,1,0,0,2
2608,2,1,1,1,0,1,1,0,0,0,1,1,0,2,1,1,2


In [11]:
df_train.drop_duplicates(inplace=True, ignore_index=True)
df_train

Unnamed: 0,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,SNP_10,SNP_11,SNP_12,SNP_13,SNP_14,SNP_15,class
0,1,1,2,0,2,0,2,2,0,2,0,2,1,2,1,1,0
1,1,1,2,0,1,1,1,1,0,1,0,2,1,2,1,0,0
2,1,0,2,0,1,0,2,2,0,1,1,1,1,2,1,2,0
3,1,0,2,0,2,0,2,2,0,1,1,2,1,2,1,2,0
4,1,0,2,0,1,1,2,1,0,0,0,2,2,2,2,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,2,1,0,0,0,0,0,1,1,0,2,1,0,1,0,0,2
502,2,2,1,1,0,0,1,1,1,0,1,1,0,0,0,0,2
503,2,1,0,0,0,1,1,0,0,0,0,0,0,1,0,0,2
504,2,1,0,1,0,1,1,0,1,0,1,0,0,0,0,1,2


In [12]:
df_train['class'].value_counts()

1    230
2    165
0    111
Name: class, dtype: int64

In [13]:
class Autoencoder(nn.Module):
    def __init__(self, encoding_dim):
        super().__init__()
        self.encoding_dim = encoding_dim
        self.encoder = nn.Sequential(
            nn.Linear(16, 24),
            nn.GELU(),
            nn.Linear(24, 8),
            nn.GELU(),
            nn.Linear(8, encoding_dim),
            nn.GELU(),
        )
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, 8),
            nn.GELU(),
            nn.Linear(8, 24),
            nn.GELU(),
            nn.Linear(24, 16)
        )

    def forward(self, x):
        x1 = self.encoder(x)
        x2 = self.decoder(x1)
        return x1, x2

def ae_train(model, data_loader, criterion, optimizer, device, epochs=10):
    model.to(device)
    for epoch in range(epochs):
        epoch_loss = 0
        for x in data_loader:
            x = x[0].to(device)
            _, x_hat = model(x)
            loss = criterion(x_hat, x)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        print(f'Epoch {epoch}: loss = {epoch_loss / len(data_loader):.4f}')

In [14]:
class VAE(nn.Module):
    def __init__(self, input_dim, latent_dim):
        super(VAE, self).__init__()
        self.input_dim = input_dim
        self.latent_dim = latent_dim
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, input_dim*2),
            nn.ReLU(),
            nn.Linear(input_dim*2, input_dim),
            nn.ReLU(),
            nn.Linear(input_dim, input_dim//2),
            nn.ReLU()
            )
        self.fc_mu = nn.Linear(input_dim//2, latent_dim)
        self.fc_logvar = nn.Linear(input_dim//2, latent_dim)
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, input_dim//2),
            nn.ReLU(),
            nn.Linear(input_dim//2, input_dim),
            nn.ReLU(),
            nn.Linear(input_dim, input_dim*2),
            nn.ReLU(),
            nn.Linear(input_dim*2, input_dim)
        )

    def encode(self, x):
        h = self.encoder(x)
        return self.fc_mu(h), self.fc_logvar(h)

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5*logvar)
        eps = torch.randn_like(std)
        return mu + eps*std

    def decode(self, z):
        return self.decoder(z)

    def forward(self, x):
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar

def loss_fn(recon_x, x, mu, logvar):
    reconstruction_loss = F.mse_loss(recon_x, x, reduction='sum')
    kl_loss = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return reconstruction_loss + kl_loss

def vae_train(model, optimizer, train_loader, device):
    model.train()
    train_loss = 0
    for data in train_loader:
        data = data.to(device)
        optimizer.zero_grad()
        recon_batch, mu, logvar = model(data)
        loss = loss_fn(recon_batch, data, mu, logvar)
        loss.backward()
        train_loss += loss.item()
        
        optimizer.step()
        
    return train_loss / len(train_loader.dataset)

def vae_test(model, test_loader, device):
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for data in test_loader:
            data = data.to(device)
            recon_batch, mu, logvar = model(data)
            test_loss += loss_fn(recon_batch, data, mu, logvar).item()
    return test_loss / len(test_loader.dataset)

In [15]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
X = torch.Tensor(df_train.drop(columns=['class']).to_numpy())

# Create a dataset and data loader
dataset = torch.utils.data.TensorDataset(X)
data_loader = torch.utils.data.DataLoader(dataset, batch_size=len(X), shuffle=True)

# Initialize the model, criterion, and optimizer
encoding_dim = 3
model = Autoencoder(encoding_dim)
criterion = nn.MSELoss().to(device)
optimizer = optim.Adam(model.parameters())
epochs=3000

# Train the model
ae_train(model, data_loader, criterion, optimizer, device, epochs)

Epoch 0: loss = 1.3970
Epoch 1: loss = 1.3904
Epoch 2: loss = 1.3838
Epoch 3: loss = 1.3772
Epoch 4: loss = 1.3707
Epoch 5: loss = 1.3641
Epoch 6: loss = 1.3576
Epoch 7: loss = 1.3510
Epoch 8: loss = 1.3444
Epoch 9: loss = 1.3379
Epoch 10: loss = 1.3313
Epoch 11: loss = 1.3247
Epoch 12: loss = 1.3181
Epoch 13: loss = 1.3115
Epoch 14: loss = 1.3048
Epoch 15: loss = 1.2981
Epoch 16: loss = 1.2913
Epoch 17: loss = 1.2846
Epoch 18: loss = 1.2777
Epoch 19: loss = 1.2708
Epoch 20: loss = 1.2639
Epoch 21: loss = 1.2568
Epoch 22: loss = 1.2497
Epoch 23: loss = 1.2425
Epoch 24: loss = 1.2352
Epoch 25: loss = 1.2278
Epoch 26: loss = 1.2203
Epoch 27: loss = 1.2126
Epoch 28: loss = 1.2048
Epoch 29: loss = 1.1968
Epoch 30: loss = 1.1887
Epoch 31: loss = 1.1804
Epoch 32: loss = 1.1718
Epoch 33: loss = 1.1630
Epoch 34: loss = 1.1540
Epoch 35: loss = 1.1446
Epoch 36: loss = 1.1349
Epoch 37: loss = 1.1249
Epoch 38: loss = 1.1144
Epoch 39: loss = 1.1035
Epoch 40: loss = 1.0921
Epoch 41: loss = 1.0802
Ep

In [16]:
X1 = X.to(device)
X2 = torch.Tensor(test.drop(columns=['id']).to_numpy()).to(device)

pred_train = model(X1)
pred_test = model(X2)

# reconstruction error
loss_train = []
loss_test = []

for i in range(len(X1)) :
    loss = criterion(pred_train[1][i], X1[i]).detach().cpu().numpy()
    loss_train.append(loss)
for j in range(len(X2)) :
    loss = criterion(pred_test[1][j], X2[j]).detach().cpu().numpy()
    loss_test.append(loss)

# encoding values
enco_train = pred_train[0].detach().cpu().numpy()
enco_test = pred_test[0].detach().cpu().numpy()

trainLoss = pd.DataFrame(data=loss_train, columns=['ae_loss'])
testLoss = pd.DataFrame(data=loss_test, columns=['ae_loss'])

ae_train = pd.DataFrame(data=enco_train, columns=['ae_'+str(x) for x in range(enco_train.shape[1])])
ae_test = pd.DataFrame(data=enco_test, columns=['ae_'+str(x) for x in range(enco_train.shape[1])])

train2 = pd.concat([df_train, trainLoss, ae_train], axis=1)
test2 = pd.concat([test, testLoss, ae_test], axis=1)

train2.info(), test2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 21 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   trait    506 non-null    category
 1   SNP_01   506 non-null    category
 2   SNP_02   506 non-null    category
 3   SNP_03   506 non-null    category
 4   SNP_04   506 non-null    category
 5   SNP_05   506 non-null    category
 6   SNP_06   506 non-null    category
 7   SNP_07   506 non-null    category
 8   SNP_08   506 non-null    category
 9   SNP_09   506 non-null    category
 10  SNP_10   506 non-null    category
 11  SNP_11   506 non-null    category
 12  SNP_12   506 non-null    category
 13  SNP_13   506 non-null    category
 14  SNP_14   506 non-null    category
 15  SNP_15   506 non-null    category
 16  class    506 non-null    int64   
 17  ae_loss  506 non-null    float32 
 18  ae_0     506 non-null    float32 
 19  ae_1     506 non-null    float32 
 20  ae_2     506 non-null    float32

(None, None)

In [17]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
X = torch.Tensor(df_train.drop(columns=['class']).to_numpy())

input_dim = X.shape[1]
latent_dim = 3
batch_size = len(X)
num_epochs = 1000

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_data = X
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_data = X
test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, shuffle=False)

model = VAE(input_dim, latent_dim).to(device)
optimizer = torch.optim.Adam(model.parameters())

for epoch in range(num_epochs):
    train_loss = vae_train(model, optimizer, train_loader, device)
    test_loss = vae_test(model, test_loader, device)
    print(f"Epoch {epoch}: Train loss = {train_loss}, Test loss = {test_loss}")

Epoch 0: Train loss = 24.694609992588934, Test loss = 24.540944216279645
Epoch 1: Train loss = 24.548598459115613, Test loss = 24.41209393527668
Epoch 2: Train loss = 24.3978720201334, Test loss = 24.279954993206523
Epoch 3: Train loss = 24.2948446763834, Test loss = 24.145706598937746
Epoch 4: Train loss = 24.14190842700099, Test loss = 24.020828186758894
Epoch 5: Train loss = 24.018896291378457, Test loss = 23.906869518898223
Epoch 6: Train loss = 23.894652837821145, Test loss = 23.76666718132411
Epoch 7: Train loss = 23.77350697875494, Test loss = 23.652000216156125
Epoch 8: Train loss = 23.665405755928855, Test loss = 23.52179895936265
Epoch 9: Train loss = 23.547257133152176, Test loss = 23.409048449851777
Epoch 10: Train loss = 23.385610949851777, Test loss = 23.280478013833992
Epoch 11: Train loss = 23.31599902729743, Test loss = 23.167125355113637
Epoch 12: Train loss = 23.181611675518774, Test loss = 23.046606734807312
Epoch 13: Train loss = 23.026454035943676, Test loss = 22.

In [18]:
X1 = X.to(device)
X2 = torch.Tensor(test.drop(columns='id').to_numpy()).to(device)

pred_train = model(X1)
pred_test = model(X2)

# reconstruction error
loss_train = []
loss_test = []

for i in range(len(X1)) :
    loss = criterion(pred_train[0][i], X1[i]).detach().cpu().numpy()
    loss_train.append(loss)
for j in range(len(X2)) :
    loss = criterion(pred_test[0][j], X2[j]).detach().cpu().numpy()
    loss_test.append(loss)

# encoding values
enco_train = pred_train[1].detach().cpu().numpy()
enco_test = pred_test[1].detach().cpu().numpy()

trainLoss = pd.DataFrame(data=loss_train, columns=['vae_loss'])
testLoss = pd.DataFrame(data=loss_test, columns=['vae_loss'])

ae_train = pd.DataFrame(data=enco_train, columns=['vae_'+str(x) for x in range(enco_train.shape[1])])
ae_test = pd.DataFrame(data=enco_test, columns=['vae_'+str(x) for x in range(enco_train.shape[1])])

train3 = pd.concat([train2, trainLoss, ae_train], axis=1)
test3 = pd.concat([test2, testLoss, ae_test], axis=1)

train3.info(), test3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 25 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   trait     506 non-null    category
 1   SNP_01    506 non-null    category
 2   SNP_02    506 non-null    category
 3   SNP_03    506 non-null    category
 4   SNP_04    506 non-null    category
 5   SNP_05    506 non-null    category
 6   SNP_06    506 non-null    category
 7   SNP_07    506 non-null    category
 8   SNP_08    506 non-null    category
 9   SNP_09    506 non-null    category
 10  SNP_10    506 non-null    category
 11  SNP_11    506 non-null    category
 12  SNP_12    506 non-null    category
 13  SNP_13    506 non-null    category
 14  SNP_14    506 non-null    category
 15  SNP_15    506 non-null    category
 16  class     506 non-null    int64   
 17  ae_loss   506 non-null    float32 
 18  ae_0      506 non-null    float32 
 19  ae_1      506 non-null    float32 
 20  ae_2      

(None, None)

In [19]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

train3.iloc[:, -8:] = scaler.fit_transform(train3.iloc[:, -8:])
test3.iloc[:, -8:] = scaler.transform(test3.iloc[:, -8:])

train3.describe()

Unnamed: 0,class,ae_loss,ae_0,ae_1,ae_2,vae_loss,vae_0,vae_1,vae_2
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,1.106719,-1.306866e-08,-1.137695e-08,5.902441e-10,-6.842276e-09,-1.24285e-08,4.991106e-09,-9.102942e-09,-1.583634e-08
std,0.731521,1.00099,1.00099,1.00099,1.00099,1.00099,1.00099,1.00099,1.00099
min,0.0,-1.541427,-2.061707,-2.098706,-1.591097,-2.057842,-1.941203,-2.961914,-3.125247
25%,1.0,-0.6999638,-0.7345761,-0.7583473,-0.8161051,-0.7049741,-0.3789696,-0.7914944,-0.652023
50%,1.0,-0.2438254,0.1227246,0.2293458,-0.1480021,-0.1564034,0.1701317,0.3588254,0.1222881
75%,2.0,0.5111353,0.8297671,0.7899615,0.6280074,0.5840181,0.8214899,0.6723969,0.7277414
max,2.0,4.986321,1.830701,2.052963,2.560057,3.681433,1.558451,1.847691,2.044302


In [20]:
test3.describe()

Unnamed: 0,ae_loss,ae_0,ae_1,ae_2,vae_loss,vae_0,vae_1,vae_2
count,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0
mean,0.617588,-0.072671,-0.034021,0.335675,0.195886,-0.03245,0.324966,-0.172102
std,1.344474,0.911218,1.10711,1.020785,1.284293,1.143805,0.904472,1.005201
min,-1.541427,-2.052616,-2.073251,-1.463447,-1.903642,-1.94109,-2.410738,-2.844079
25%,-0.276571,-0.905018,-1.196023,-0.444357,-0.631277,-1.470836,0.172117,-0.841766
50%,0.422668,0.123366,0.297094,0.241644,0.029142,0.355035,0.509682,-0.10316
75%,1.466722,0.643106,0.838541,1.076888,0.754867,0.908448,0.837351,0.601743
max,7.566397,1.578763,1.896568,2.709289,8.076214,1.595866,1.957817,1.758747


In [21]:
train3['class'].value_counts()

1    230
2    165
0    111
Name: class, dtype: int64

In [22]:
def catcv(inputX, inputY, params, cv_count) :  
    
    var_categ = inputX.columns.tolist()[:-8]
    
    cv_dataset = Pool(data=inputX,
                      label=inputY,
                      cat_features=var_categ)
    
    scores = cv(cv_dataset,
                params,
                fold_count=cv_count,
                stratified=True,
                plot=True)

In [23]:
params = {'iterations':100,
          'learning_rate':0.03,
          'loss_function' : 'CrossEntropy',
          'eval_metric' : 'F1',
          'verbose':0,
          'random_seed':2023}

In [24]:
X

tensor([[1., 1., 2.,  ..., 2., 1., 1.],
        [1., 1., 2.,  ..., 2., 1., 0.],
        [1., 0., 2.,  ..., 2., 1., 2.],
        ...,
        [2., 1., 0.,  ..., 1., 0., 0.],
        [2., 1., 0.,  ..., 0., 0., 1.],
        [2., 0., 0.,  ..., 2., 0., 0.]])

In [25]:
# B & notB 파생변수 없는 버전 성능 확인
X, y = train3[train3['class'] != 'A'].drop(columns=['class','trait']), (train3[train3['class'] != 'A']['class']==1).values.astype('int')
X_test = test3.drop(columns=['id', 'trait']).copy()

catcv(X, y, params, cv_count=5)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/5]

bestTest = 0.9247311828
bestIteration = 0

Training on fold [1/5]

bestTest = 0.967032967
bestIteration = 45

Training on fold [2/5]

bestTest = 0.9574468085
bestIteration = 75

Training on fold [3/5]

bestTest = 0.9677419355
bestIteration = 56

Training on fold [4/5]

bestTest = 0.9247311828
bestIteration = 45



In [26]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 506 entries, 0 to 505
Data columns (total 23 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   SNP_01    506 non-null    category
 1   SNP_02    506 non-null    category
 2   SNP_03    506 non-null    category
 3   SNP_04    506 non-null    category
 4   SNP_05    506 non-null    category
 5   SNP_06    506 non-null    category
 6   SNP_07    506 non-null    category
 7   SNP_08    506 non-null    category
 8   SNP_09    506 non-null    category
 9   SNP_10    506 non-null    category
 10  SNP_11    506 non-null    category
 11  SNP_12    506 non-null    category
 12  SNP_13    506 non-null    category
 13  SNP_14    506 non-null    category
 14  SNP_15    506 non-null    category
 15  ae_loss   506 non-null    float64 
 16  ae_0      506 non-null    float64 
 17  ae_1      506 non-null    float64 
 18  ae_2      506 non-null    float64 
 19  vae_loss  506 non-null    float64 
 20  vae_0     

In [36]:
def catgbmc(inputX, inputY, validX, validY, params) :  
    var_categ = inputX.columns.tolist()[:-8]
    model = CatBoostClassifier(
        cat_features=var_categ,
        **params,
        task_type='GPU',
        devices='0',
        )
  
    model.fit(
        inputX, inputY,
        eval_set=(inputX, inputY)
        );     

    pred = model.predict(validX)
    score = f1_score(validY, pred, average='macro')
    
    return score

In [37]:
from sklearn.model_selection import train_test_split

high = 0.9
for i in tqdm(range(2000)) :
    params = {'iterations':100,
          'learning_rate':0.3,
          'loss_function' : 'CrossEntropy',
          'eval_metric' : 'F1',
          'verbose':0,
          'random_seed':i}
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.7, random_state=i, shuffle=True)
    score = catgbmc(X_train, y_train, X_valid, y_valid, params)   
    
    if score > high :
        high = score
        print(f"Random Seed : {i}, Score is {score}")

  0%|          | 0/2000 [00:00<?, ?it/s]

Random Seed : 0, Score is 0.927250576513075
Random Seed : 1, Score is 0.9533640149024765
Random Seed : 6, Score is 0.9595170454545454
Random Seed : 20, Score is 0.9602787456445993
Random Seed : 37, Score is 0.9730113636363636
Random Seed : 39, Score is 0.9735191637630662
Random Seed : 95, Score is 0.9736796536796537
Random Seed : 140, Score is 0.9800131492439186


In [29]:
# from sklearn.model_selection import train_test_split

# high = 0.9
# for i in tqdm(range(2000)) :
#     params = {'iterations':100,
#           'learning_rate':0.3,
#           'loss_function' : 'CrossEntropy',
#           'eval_metric' : 'F1',
#           'verbose':0,
#           'random_seed':i}
#     X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.7, stratify=y, random_state=i, shuffle=True)
#     score = catgbmc(X_train, y_train, X_valid, y_valid, params)   
    
#     if score > high :
#         high = score
#         print(f"Random Seed : {i}, Score is {score}")

In [30]:
def catgbmc(inputX, inputY, validX, validY, params) :  
    var_categ = inputX.columns.tolist()[:-8]
    model = CatBoostClassifier(
        cat_features=var_categ,
        **params,
        task_type='GPU',
        devices='0',
        )
  
    model.fit(
        inputX, inputY,
        eval_set=(inputX, inputY)
        );     
    
    return model

In [31]:
from sklearn.model_selection import train_test_split

i = 333

params = {'iterations':300,
        'learning_rate':0.03,
        'loss_function' : 'CrossEntropy',
        'eval_metric' : 'F1',
        'verbose':0,
        'random_seed':i}
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.7, random_state=i, shuffle=True)
model = catgbmc(X_train, y_train, X_valid, y_valid, params)   

pred1 = model.predict(X_train)
score1 = f1_score(y_train, pred1, average='macro')
pred2 = model.predict(X_valid)
score2 = f1_score(y_valid, pred2, average='macro')

score1, score2

(1.0, 0.9736842105263157)

In [32]:
df = pd.DataFrame()
df['predA'] = answer
df['predB'] = model.predict_proba(X_test)[:,1]
df['pred'] = df['predA']
df.loc[df.predB >= 0.5, 'pred'] = 'B'
df.loc[df.predB < 0.5, 'pred'] = 'C'
df.loc[df.predA == 0, 'pred'] = 'A'
df

Unnamed: 0,predA,predB,pred
0,0.0,0.004361,A
1,-1.0,0.945724,B
2,-1.0,0.011626,C
3,-1.0,0.715592,B
4,0.0,0.004119,A
...,...,...,...
170,-1.0,0.986468,B
171,-1.0,0.004784,C
172,-1.0,0.016473,C
173,-1.0,0.992898,B


In [33]:
df.pred.value_counts()

B    88
A    48
C    39
Name: pred, dtype: int64

In [34]:
submit = pd.read_csv("submit_high1.csv")
submit['class'] = df['pred']
submit.to_csv("submit_last1.csv", index=False)
submit['class'].value_counts()

B    88
A    48
C    39
Name: class, dtype: int64