# CONCEPT 

- 데이터를 증강하지 않고, A와 notA를 분류
- A 데이터를 제외하고, B&C 데이터만을 남겨두기
- B&C 데이터에 대한 Label Encoder 변수 추가
- B&C 데이터에 대한 AE 및 VAE Encoding 값 및 error 값 추가
- B&C 데이터에 대한 데이터 증강 유무에 따른 성능 확인

# STEP 01. 데이터를 증강하지 않고 A와 notA를 분류

In [None]:
import warnings
warnings.filterwarnings(action='ignore')

import os
import gc
import math
import random
import pickle
import pandas as pd
import numpy as np
import multiprocessing
from tqdm.auto import tqdm

from sklearn.metrics import f1_score

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset, sampler

from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from imblearn.over_sampling import SMOTE, SMOTENC, SMOTEN

In [None]:
device = torch.device('cuda:1') if torch.cuda.is_available() else torch.device('cpu')
device

In [None]:
df_train = pd.read_csv('./data/train.csv')
df_test = pd.read_csv('./data/test.csv')              
df_train.info()

In [None]:
df_test.info()

In [None]:
df_train.describe()

In [None]:
df_test.describe()

In [None]:
df_train.drop(columns=['father', 'mother', 'gender'], inplace=True)
df_test.drop(columns=['father', 'mother', 'gender'], inplace=True)

In [None]:
df_train.info(), df_test.info()

In [None]:
def catgbmc(inputX, inputY, validX, validY, params) :  
    var_categ = inputX.columns.tolist()[:16]
    model = CatBoostClassifier(
        cat_features=var_categ,
        **params,
        task_type='GPU',
        devices='0',
        )
  
    model.fit(
        inputX, inputY,
        eval_set=(inputX, inputY)
        );     

    pred = model.predict(validX)
    score = f1_score(validY, pred, average='macro')
    print(score)
    
    return model

In [None]:
X, y = df_train.iloc[:, 1:-1], (df_train['class'].values == 'A').astype(int)
params = {'iterations':1,
          'learning_rate':0.0001,
          'l2_leaf_reg' : 100,
          'verbose':0}

model1 = catgbmc(X, y, X, y, params)

In [None]:
pred = model1.predict(df_test.iloc[:, 1:])
len(pred)

In [None]:
train2 = df_train[df_train['class'] != 'A'].copy().reset_index(drop=True)
test2 = df_test[~pred.astype(bool)].copy().reset_index(drop=True)
test2

### Summay 01
- 이전의 과정들에서 수차례 증명되었듯, A와 notA를 분류하는 것은 아주 쉽다.  
 (기본적으로 trait 변수 하나만으로도 기계적인 분류가 가능하다)
- 때문에 우선적으로 A와 notA를 분류해놓고, B와 C만이 존재하는 데이터를 대상으로 파생변수 생성, 증강 등의 과정을 진행한다.

# STEP 02. 파생변수 추가

## 1) B와 C에 대한 Target Labeling

In [None]:
# categorical 변수 중 train에만 있거나, test에만 있는 칼럼 확인

train3 = pd.get_dummies(train2.iloc[:, 1:-1]).copy()
test3 = pd.get_dummies(test2.iloc[:, 1:]).copy()

target1 = train3.columns.tolist()
target2 = test3.columns.tolist()
[x for x in target1 if x not in target2], [y for y in target2 if y not in target2]

In [None]:
train2[train2.SNP_09 == 'G G']

In [None]:
train2[train2.SNP_12 == 'G G']

In [None]:
# text 형태의 categorical 변수들을 숫자형태로 변경

for i in tqdm(range(1, 15+1)) :
    target = str(i) if i >= 10 else "0"+str(i)
    cols = sorted(train2[f"SNP_{target}"].unique().tolist())
    train2[f"SNP_{target}"] = train2[f"SNP_{target}"].map(lambda x : 0 if x==cols[0] else (1 if x==cols[1] else 2))
    test2[f"SNP_{target}"] = test2[f"SNP_{target}"].map(lambda x : 0 if x==cols[0] else (1 if x==cols[1] else 2))

train2.info(), test2.info()

In [None]:
train2['class_B'] = train2['class'].map(lambda x : 1 if x=='B' else 0)
train2['class_C'] = train2['class'].map(lambda x : 1 if x=='C' else 0)

for i in range(1, 15+1) :
    target = str(i) if i >= 10 else "0"+str(i)
    target2 = train2.groupby(f'SNP_{target}')['class_B', 'class_C'].sum()
    target2['total'] = target2[['class_B', 'class_C']].sum(axis=1)
    for j in range(len(target2)) :
        if (target2['total'][j] < 3) : # 전체 표본이 너무 적은 경우 제외(확률 반반으로)
            target2['class_B'][j] = 1
            target2['class_C'][j] = 1
            
    value = target2['class_B'] / (target2['class_B']+target2['class_C'])    

    train2[f"SNP_{target}_ratio"] = train2[f"SNP_{target}"].map(lambda x : value[0] if x==0 else (value[1] if x==1 else value[2]))
    test2[f"SNP_{target}_ratio"] = test2[f"SNP_{target}"].map(lambda x : value[0] if x==0 else (value[1] if x==1 else value[2]))    

train2.drop(columns=['class_B', 'class_C'], inplace=True)
train2.info(), test2.info()

## 2) AE & VAE 정의

In [None]:
class Autoencoder(nn.Module):
    def __init__(self, encoding_dim):
        super().__init__()
        self.encoding_dim = encoding_dim
        self.encoder = nn.Sequential(
            nn.Linear(31, 64),
            nn.GELU(),
            nn.Linear(64, 16),
            nn.GELU(),
            nn.Linear(16, encoding_dim),
            nn.GELU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, 16),
            nn.GELU(),
            nn.Linear(16, 64),
            nn.GELU(),
            nn.Linear(64, 31)
        )

    def forward(self, x):
        x1 = self.encoder(x)
        x2 = self.decoder(x1)
        return x1, x2

def ae_train(model, data_loader, criterion, optimizer, device, epochs=10):
    model.to(device)
    for epoch in range(epochs):
        epoch_loss = 0
        for x in data_loader:
            x = x[0].to(device)
            _, x_hat = model(x)
            loss = criterion(x_hat, x)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        print(f'Epoch {epoch}: loss = {epoch_loss / len(data_loader):.4f}')

In [None]:
X = torch.randn(320, train2.shape[1]-2)  # generate some example data

# Create a dataset and data loader
dataset = torch.utils.data.TensorDataset(X)
data_loader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True)

# Initialize the model, criterion, and optimizer
encoding_dim = 6
model = Autoencoder(encoding_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters())

# Train the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
ae_train(model, data_loader, criterion, optimizer, device)

In [None]:
class VAE(nn.Module):
    def __init__(self, input_dim, latent_dim):
        super(VAE, self).__init__()
        self.input_dim = input_dim
        self.latent_dim = latent_dim
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, input_dim*2),
            nn.ReLU(),
            nn.Linear(input_dim*2, input_dim),
            nn.ReLU(),
            nn.Linear(input_dim, input_dim//2),
            nn.ReLU()
            )
        self.fc_mu = nn.Linear(input_dim//2, latent_dim)
        self.fc_logvar = nn.Linear(input_dim//2, latent_dim)
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, input_dim//2),
            nn.ReLU(),
            nn.Linear(input_dim//2, input_dim),
            nn.ReLU(),
            nn.Linear(input_dim, input_dim*2),
            nn.ReLU(),
            nn.Linear(input_dim*2, input_dim)
        )

    def encode(self, x):
        h = self.encoder(x)
        return self.fc_mu(h), self.fc_logvar(h)

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5*logvar)
        eps = torch.randn_like(std)
        return mu + eps*std

    def decode(self, z):
        return self.decoder(z)

    def forward(self, x):
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar

def loss_fn(recon_x, x, mu, logvar):
    reconstruction_loss = F.mse_loss(recon_x, x, reduction='sum')
    kl_loss = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return reconstruction_loss + kl_loss

def train(model, optimizer, train_loader, device):
    model.train()
    train_loss = 0
    for data in train_loader:
        data = data.to(device)
        optimizer.zero_grad()
        recon_batch, mu, logvar = model(data)
        loss = loss_fn(recon_batch, data, mu, logvar)
        loss.backward()
        train_loss += loss.item()
        
        optimizer.step()
        
    return train_loss / len(train_loader.dataset)

def test(model, test_loader, device):
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for data in test_loader:
            data = data.to(device)
            recon_batch, mu, logvar = model(data)
            test_loss += loss_fn(recon_batch, data, mu, logvar).item()
    return test_loss / len(test_loader.dataset)

In [None]:
input_dim = train2.shape[1]-1
latent_dim = 8
batch_size = 32
num_epochs = 10

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_data = torch.randn(100, input_dim)
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size)
test_data = torch.randn(10, input_dim)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size)

model = VAE(input_dim, latent_dim).to(device)
optimizer = torch.optim.Adam(model.parameters())

for epoch in range(num_epochs):
    train_loss = train(model, optimizer, train_loader, device)
    test_loss = test(model, test_loader, device)
    print(f"Epoch {epoch}: Train loss = {train_loss}, Test loss = {test_loss}")

## 3) AE & VAE 학습 및 변수 추가

### AutoEncoder

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
X = torch.Tensor(train2.drop(columns=['id', 'class']).to_numpy())

# Create a dataset and data loader
dataset = torch.utils.data.TensorDataset(X)
data_loader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=True)

# Initialize the model, criterion, and optimizer
encoding_dim = 8
model = Autoencoder(encoding_dim)
criterion = nn.MSELoss().to(device)
optimizer = optim.Adam(model.parameters())
epochs=1000

# Train the model
ae_train(model, data_loader, criterion, optimizer, device, epochs)

In [None]:
X1 = X.to(device)
X2 = torch.Tensor(test2.drop(columns=['id']).to_numpy()).to(device)

pred_train = model(X1)
pred_test = model(X2)

# reconstruction error
loss_train = []
loss_test = []

for i in range(len(X1)) :
    loss = criterion(pred_train[1][i], X1[i]).detach().cpu().numpy()
    loss_train.append(loss)
for j in range(len(X2)) :
    loss = criterion(pred_test[1][j], X2[j]).detach().cpu().numpy()
    loss_test.append(loss)

# encoding values
enco_train = pred_train[0].detach().cpu().numpy()
enco_test = pred_test[0].detach().cpu().numpy()

trainLoss = pd.DataFrame(data=loss_train, columns=['ae_loss'])
testLoss = pd.DataFrame(data=loss_test, columns=['ae_loss'])

ae_train = pd.DataFrame(data=enco_train, columns=['ae_'+str(x) for x in range(enco_train.shape[1])])
ae_test = pd.DataFrame(data=enco_test, columns=['ae_'+str(x) for x in range(enco_train.shape[1])])

train3 = pd.concat([train2, trainLoss, ae_train], axis=1)
test3 = pd.concat([test2, testLoss, ae_test], axis=1)

train3.info(), test3.info()

### Variational AutoEncoder

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
X = torch.Tensor(train2.drop(columns=['id', 'class']).to_numpy())

input_dim = X.shape[1]
latent_dim = 8
batch_size = 32
num_epochs = 1000

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_data = X
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_data = X
test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, shuffle=False)

model = VAE(input_dim, latent_dim).to(device)
optimizer = torch.optim.Adam(model.parameters())

for epoch in range(num_epochs):
    train_loss = train(model, optimizer, train_loader, device)
    test_loss = test(model, test_loader, device)
    print(f"Epoch {epoch}: Train loss = {train_loss}, Test loss = {test_loss}")

In [None]:
X1 = X.to(device)
X2 = torch.Tensor(test2.drop(columns=['id']).to_numpy()).to(device)

pred_train = model(X1)
pred_test = model(X2)

# reconstruction error
loss_train = []
loss_test = []

for i in range(len(X1)) :
    loss = criterion(pred_train[0][i], X1[i]).detach().cpu().numpy()
    loss_train.append(loss)
for j in range(len(X2)) :
    loss = criterion(pred_test[0][j], X2[j]).detach().cpu().numpy()
    loss_test.append(loss)

# encoding values
enco_train = pred_train[1].detach().cpu().numpy()
enco_test = pred_test[1].detach().cpu().numpy()

trainLoss = pd.DataFrame(data=loss_train, columns=['vae_loss'])
testLoss = pd.DataFrame(data=loss_test, columns=['vae_loss'])

ae_train = pd.DataFrame(data=enco_train, columns=['vae_'+str(x) for x in range(enco_train.shape[1])])
ae_test = pd.DataFrame(data=enco_test, columns=['vae_'+str(x) for x in range(enco_train.shape[1])])

train4 = pd.concat([train3, trainLoss, ae_train], axis=1)
test4 = pd.concat([test3, testLoss, ae_test], axis=1)

train4.info(), test4.info()

### Summary 02
- Target Labeling을 통해 15개의 파생변수를 생성
- AE와 VAE를 통해 encoding한 값 8개와 reconstruction error 1개를 각기 추가해, 18개의 파생변수를 생성
- 총 33개의 파생변수가 추가되어 49개의 변수를 이용해 B와 C를 구분

In [None]:
train4.to_csv("./data/train4.csv", index=False)
test4.to_csv("./data/test4.csv", index=False)

## Categorical Features
- 전체 변수들을 CATEGORY 타입으로 변환

In [None]:
train = pd.read_csv("./data/train4.csv")
test = pd.read_csv("./data/test4.csv")
train.iloc[:, 1:17] = train.iloc[:, 1:17].astype('category')
test.iloc[:, 1:17] = test.iloc[:, 1:17].astype('category')
train.info()

In [None]:
test.info()

# STEP 02. MODELING & VALIDATION
- 데이터 증강여부에 따른 성능향상 유무를 확인
- Classifier와 Regressor를 동시에 사용해 자체적인 ensemble 효과 추가 고려

## without Aug

In [None]:
import warnings
warnings.filterwarnings(action='ignore')

import os
import gc
import math
import random
import pickle
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold

from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from imblearn.over_sampling import SMOTE, SMOTENC, SMOTEN

In [165]:
X, y = train.drop(columns=['id', 'class']), (train['class'].values == 'B').astype(int)

strategy = {0:1000, 1:1000}
smote = SMOTENC(categorical_features=[x for x in range(16)], sampling_strategy=strategy)
X2, y2 = smote.fit_resample(X, y)
X_test = test.drop(columns=['id'])
X2

Unnamed: 0,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,...,ae_7,vae_loss,vae_0,vae_1,vae_2,vae_3,vae_4,vae_5,vae_6,vae_7
0,2,2,1,0,1,1,0,0,2,0,...,1.501133,0.128519,0.022841,0.005060,0.004752,0.000739,0.456544,0.002598,0.005025,0.003332
1,2,1,1,1,0,0,1,0,1,0,...,4.403416,0.118182,0.012891,-0.017528,0.010271,-0.011619,-0.922436,0.012849,-0.012928,-0.026791
2,2,2,2,0,1,2,2,0,1,1,...,3.964971,0.230729,-0.006556,-0.021662,-0.006980,-0.031515,1.069892,-0.023945,-0.046694,0.009493
3,2,2,2,2,0,2,0,0,0,0,...,1.937539,0.215517,0.003178,-0.011598,0.000311,-0.002459,0.606825,0.002101,0.014176,0.017142
4,2,2,2,1,0,2,0,0,1,0,...,1.792581,0.138570,0.008836,-0.008576,-0.002378,-0.012743,0.899890,-0.011360,-0.031863,0.011499
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,2,2,1,1,0,1,0,0,2,0,...,2.060444,0.172017,-0.010439,-0.017717,-0.003090,-0.007358,0.067714,-0.000099,0.019257,-0.000938
1996,2,2,2,2,1,2,1,0,2,0,...,1.135492,0.224902,0.026450,-0.018654,-0.007217,-0.017505,1.252944,-0.021561,-0.086939,0.014846
1997,2,2,1,1,0,1,1,0,1,0,...,6.743381,0.203564,0.003375,0.015454,0.013643,-0.000784,-0.029273,-0.001264,0.013179,0.004345
1998,2,2,2,1,1,2,1,0,1,0,...,4.173852,0.097439,0.014568,0.000579,0.004785,0.001115,0.294866,0.003786,0.013583,0.004069


In [166]:
y2

array([1, 0, 1, ..., 1, 1, 1])

In [167]:
X_test

Unnamed: 0,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,...,ae_7,vae_loss,vae_0,vae_1,vae_2,vae_3,vae_4,vae_5,vae_6,vae_7
0,2,2,1,2,2,2,0,0,0,0,...,2.790059,0.217307,0.000218,0.006877,-0.009565,-0.009279,0.851794,-0.005484,-0.016489,0.014645
1,2,2,1,0,0,1,1,0,0,0,...,1.108588,0.154863,0.000644,-0.016445,0.002960,-0.020076,-1.629932,0.004768,0.005041,-0.031017
2,2,2,1,1,0,2,0,0,0,0,...,3.829773,0.179709,-0.000699,-0.020951,-0.005646,-0.002832,0.286670,0.004100,0.020569,0.004149
3,2,2,1,1,0,1,0,0,1,0,...,1.019381,0.113938,0.000593,-0.013546,-0.006326,-0.001122,0.240137,0.004211,0.012031,0.005734
4,2,0,1,1,0,1,1,0,0,0,...,3.418614,0.142387,-0.038420,-0.020248,0.005399,-0.022067,-1.584396,0.003951,0.027431,-0.023177
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119,2,1,2,2,0,1,1,0,2,0,...,2.745098,0.224985,-0.000355,0.001457,-0.014952,-0.006648,0.805532,-0.003689,-0.013833,0.015952
120,2,2,0,0,0,1,1,0,0,0,...,0.403352,0.073524,-0.014105,-0.027721,-0.002757,-0.022914,-1.534130,0.007219,0.002148,-0.029621
121,2,2,0,0,0,1,1,0,0,0,...,-0.165106,0.187115,-0.037338,-0.015497,0.020117,-0.018566,-1.049608,0.009251,0.020054,-0.023991
122,2,1,2,1,1,2,2,0,1,0,...,4.361863,0.241954,0.000484,-0.010334,-0.009681,-0.026039,1.176126,-0.023566,-0.068861,0.012713


In [168]:
def catgbmc(inputX, inputY, validX, validY, params) :  
    var_categ = inputX.columns.tolist()[:16]
    model = CatBoostClassifier(
        cat_features=var_categ,
        **params,
        task_type='GPU',
        devices='0',
        # l2_leaf_reg=10
        # random_state=random_seed
        )
  
    model.fit(
        inputX, inputY,
        eval_set=(inputX, inputY),
        );     

    pred = model.predict(validX)
    score = f1_score(validY, pred, average='macro')
    print(score)
    
    return model

In [169]:
def catgbmr(inputX, inputY, validX, validY, params) :  
    var_categ = inputX.columns.tolist()[:16]
    model = CatBoostRegressor(
        cat_features=var_categ,
        **params,
        task_type='GPU',
        devices='0',
        # l2_leaf_reg=10
        # random_state=random_seed
        )
  
    model.fit(
        inputX, inputY,
        eval_set=(inputX, inputY),
        );     

    pred = model.predict(validX)
    score = f1_score(validY, np.round(pred), average='macro')
    print(score)
    
    return model

In [193]:
high1 = pd.read_csv("./submit_high1.csv")
high2 = pd.read_csv("./submit_high2.csv")
high1['class'].value_counts()

B    84
A    51
C    40
Name: class, dtype: int64

In [194]:
high2['class'].value_counts()

B    82
A    51
C    42
Name: class, dtype: int64

In [368]:
b_len = 88
target = np.zeros(175)
target[45:45+b_len] = 1
target[45+b_len:] = 2

tmp = target.copy()
tmp[-6:] = 1

f1_score(target, tmp, average='macro')

0.9633699633699634

In [207]:
target

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.])

In [243]:
total[total.high1 != total.high2]

Unnamed: 0,pred1,pred2,pred3,answer1,answer2,answer3,high1,high2
12,0,0.319024,0.342033,C,C,C,B,C
126,0,0.515816,0.456792,B,C,C,B,C


In [249]:
correct = total[(total.answer1 == total.answer2) & (total.answer2 == total.answer3) & (total.answer3 == total.high1) & (total.high1 == total.high2)].index.tolist()
prob = [x for x in range(175) if x not in correct]
prob

[3, 5, 12, 119, 126]

In [252]:
answer = np.zeros(175)
answer[correct] = total.loc[correct, 'answer1'].map(lambda x : 0 if x=='A' else (1 if x=='B' else 2))
answer

array([0., 1., 2., 0., 0., 0., 2., 1., 0., 0., 2., 1., 0., 0., 1., 1., 0.,
       1., 1., 2., 1., 1., 1., 0., 1., 1., 1., 0., 0., 1., 0., 0., 1., 2.,
       0., 1., 2., 1., 1., 2., 0., 1., 2., 1., 1., 1., 1., 2., 1., 2., 0.,
       1., 0., 1., 1., 1., 2., 0., 1., 2., 0., 1., 2., 2., 2., 0., 1., 0.,
       0., 1., 1., 1., 0., 0., 2., 1., 2., 1., 1., 1., 2., 1., 0., 1., 1.,
       1., 1., 1., 2., 0., 1., 1., 2., 1., 1., 2., 0., 1., 0., 2., 0., 1.,
       1., 2., 0., 0., 2., 1., 0., 1., 2., 1., 1., 1., 1., 0., 0., 2., 1.,
       0., 0., 1., 1., 2., 2., 1., 0., 1., 0., 1., 0., 0., 1., 1., 1., 2.,
       0., 0., 1., 0., 0., 0., 2., 1., 1., 1., 0., 1., 2., 0., 0., 1., 0.,
       1., 1., 0., 0., 1., 2., 0., 1., 2., 2., 1., 0., 0., 2., 1., 1., 0.,
       1., 2., 2., 1., 1.])

In [256]:
from itertools import product

groups = product('12', repeat=5)
for g in groups :
    print(g)

AttributeError: 'tuple' object has no attribute 'astype'

In [261]:
correct = total[total.answer1 == total.answer2) & (total.answer2 == total.answer3) & (total.answer3 == total.high1) & (total.high1 == total.high2)].index.tolist()
prob = [x for x in range(175) if x not in correct]
answer = np.zeros(175)
answer[correct] = total.loc[correct, 'answer1'].map(lambda x : 0 if x=='A' else (1 if x=='B' else 2))

In [275]:
correct = total[total.pred1==1].index.tolist()
prob = [x for x in range(175) if x not in correct]
answer = np.zeros(175)
answer[correct] = total.loc[correct, 'answer1'].map(lambda x : 0 if x=='A' else 1)

In [270]:
prob = [3,5,12,119,126,162,168]
correct = [x for x in range(175) if x not in prob]
answer = np.zeros(175)
answer[correct] = total.loc[correct, 'answer1'].map(lambda x : 0 if x=='A' else (1 if x=='B' else 2))

In [267]:
len(prob)

7

In [364]:
a_len = 45
b_len = 175-55
answer = np.zeros(175)
answer[a_len:a_len+b_len] = 1
answer[a_len+b_len:] = 2
result = np.zeros(175)
# result = answer.copy()
# result[a_len:51] = 0
# result[-2:] = 0

# result[a_len:a_len+18]=2

f1_score(answer, result, average='macro')

0.13636363636363635

In [285]:
answer

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2.])

In [300]:
2**175

47890485652059026823698344598447161988085597568237568

In [353]:
from itertools import product

target1 = total['high1'].map(lambda x : 0 if x=='A' else (1 if x=='B' else 2))
target2 = total['high2'].map(lambda x : 0 if x=='A' else (1 if x=='B' else 2))
groups = product('012', repeat=175)
all_range = 2**175
now = 0

for group in groups :
    now += 1
    if now < 3**175//3 :
        continue
    answer = [int(x) for x in group]
    if answer.count(0) != 45 :
        continue
    score1 = f1_score(answer, target1, average='macro')
    score2 = f1_score(answer, target2, average='macro')
    if (score1 == score2) & (score1 > 0.9) :
        print(answer)    
        print(score1)
    elif now % 10000 == 0 :
        print(f"{now / all_range * 100:.4f}%")

KeyboardInterrupt: 

In [350]:
answer.count(0)

160

In [276]:
from itertools import product

target1 = total['high1'].map(lambda x : 0 if x=='A' else (1 if x=='B' else 2))
target2 = total['high2'].map(lambda x : 0 if x=='A' else (1 if x=='B' else 2))
groups = product('012', repeat=175)
for group in groups :
    answer2 = answer.copy()
    for i in range(len(group)) :
        g = int(group[i])
        t = prob[i]
        answer2[t] = g
        
    score = f1_score(answer2, target, average='macro')
    if score > 0.99 :    
        print(group)
        print(score)

KeyboardInterrupt: 

In [195]:
high1.iloc[[3,5,12,119,126,162,168]]

Unnamed: 0,id,class
3,TEST_003,C
5,TEST_005,C
12,TEST_012,B
119,TEST_119,C
126,TEST_126,B
162,TEST_162,C
168,TEST_168,B


In [196]:
high2.iloc[[3,5,12,119,126,162,168]]

Unnamed: 0,id,class
3,TEST_003,C
5,TEST_005,C
12,TEST_012,C
119,TEST_119,C
126,TEST_126,C
162,TEST_162,C
168,TEST_168,B


In [199]:
total['high1'] = high1['class']
total['high2'] = high2['class']
total[total.answer1 != total.high1]

Unnamed: 0,pred1,pred2,pred3,answer1,answer2,answer3,high1,high2
3,0,0.892971,0.742926,B,B,B,C,C
5,0,0.911946,0.769649,B,B,B,C,C
12,0,0.319024,0.342033,C,C,C,B,C
119,0,0.610629,0.571767,B,B,B,C,C


In [200]:
total[total.answer1 != total.high2]

Unnamed: 0,pred1,pred2,pred3,answer1,answer2,answer3,high1,high2
3,0,0.892971,0.742926,B,B,B,C,C
5,0,0.911946,0.769649,B,B,B,C,C
119,0,0.610629,0.571767,B,B,B,C,C
126,0,0.515816,0.456792,B,C,C,B,C


In [184]:
params = {'iterations':300,
          'learning_rate':0.003,
          # 'l2_leaf_reg' : 10,
        #   'auto_class_weights' : 'Balanced',
          # 'grow_policy' : 'Depthwise',
          'verbose':0}

model2 = catgbmc(X2, y2, X2, y2, params)
model3 = catgbmr(X2, y2, X2, y2,params)

0.9904999406246289
0.9874999968749993


In [185]:
total = pd.DataFrame()

test00 = pd.read_csv("./data/df_test.csv")
X_test1 = test00.drop(columns=['id'])
X_test1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175 entries, 0 to 174
Data columns (total 46 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   trait       175 non-null    int64
 1   SNP_01_A A  175 non-null    int64
 2   SNP_01_A G  175 non-null    int64
 3   SNP_01_G G  175 non-null    int64
 4   SNP_02_A A  175 non-null    int64
 5   SNP_02_A G  175 non-null    int64
 6   SNP_02_G G  175 non-null    int64
 7   SNP_03_A A  175 non-null    int64
 8   SNP_03_C A  175 non-null    int64
 9   SNP_03_C C  175 non-null    int64
 10  SNP_04_A A  175 non-null    int64
 11  SNP_04_G A  175 non-null    int64
 12  SNP_04_G G  175 non-null    int64
 13  SNP_05_A A  175 non-null    int64
 14  SNP_05_C A  175 non-null    int64
 15  SNP_05_C C  175 non-null    int64
 16  SNP_06_A A  175 non-null    int64
 17  SNP_06_A G  175 non-null    int64
 18  SNP_06_G G  175 non-null    int64
 19  SNP_07_A A  175 non-null    int64
 20  SNP_07_G A  175 non-null    int6

In [186]:
total['pred1'] = model1.predict(X_test1)
pred2 = model2.predict_proba(X_test)[:,1]
pred3 = model3.predict(X_test)
total['pred2'] = 0
total['pred3'] = 0

total.loc[total.pred1==0, 'pred2'] = pred2
total.loc[total.pred1==0, 'pred3'] = pred3
total

Unnamed: 0,pred1,pred2,pred3
0,1,0.000000,0.000000
1,0,0.940014,0.769139
2,0,0.073975,0.239314
3,0,0.892971,0.742926
4,1,0.000000,0.000000
...,...,...,...
170,0,0.942043,0.774462
171,0,0.049483,0.222387
172,0,0.108955,0.245246
173,0,0.947252,0.784804


In [187]:
total[(total.pred2 >= 0.5) & (total.pred3 < 0.5)]

Unnamed: 0,pred1,pred2,pred3
126,0,0.515816,0.456792


In [188]:
total[(total.pred2 < 0.5) & (total.pred3 >= 0.5)]

Unnamed: 0,pred1,pred2,pred3


In [189]:
total['answer1'] = total['pred1'].map(lambda x : 'A' if x==1 else 'D')
total['answer2'] = total['pred1'].map(lambda x : 'A' if x==1 else 'D')
total['answer3'] = total['pred1'].map(lambda x : 'A' if x==1 else 'D')

total.loc[total.answer1=='D', 'answer1'] = (pred2 > 0.5).astype(int)
total.loc[total.answer2=='D', 'answer2'] = (pred3 > 0.5).astype(int)
total.loc[total.answer3=='D', 'answer3'] = (np.mean([pred2, pred3], axis=0) > 0.5).astype(int)

total['answer1'] = total['answer1'].map(lambda x : 'A' if x=='A' else ('B' if x==1 else 'C'))
total['answer2'] = total['answer2'].map(lambda x : 'A' if x=='A' else ('B' if x==1 else 'C'))
total['answer3'] = total['answer3'].map(lambda x : 'A' if x=='A' else ('B' if x==1 else 'C'))

In [190]:
total.iloc[[3,5,12,119,126,162,168]]

Unnamed: 0,pred1,pred2,pred3,answer1,answer2,answer3
3,0,0.892971,0.742926,B,B,B
5,0,0.911946,0.769649,B,B,B
12,0,0.319024,0.342033,C,C,C
119,0,0.610629,0.571767,B,B,B
126,0,0.515816,0.456792,B,C,C
162,0,0.137762,0.237675,C,C,C
168,0,0.874658,0.738388,B,B,B
