In [1]:
import warnings
warnings.filterwarnings(action='ignore')

import os
import gc
import math
import random
import pickle
import pandas as pd
import numpy as np
import multiprocessing
from tqdm import tqdm

from sklearn.metrics import f1_score

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset, sampler
import torch.optim.lr_scheduler as lr_scheduler

from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from imblearn.over_sampling import SMOTE, SMOTENC, SMOTEN, RandomOverSampler, ADASYN
from imblearn.under_sampling import RandomUnderSampler, EditedNearestNeighbours, RepeatedEditedNearestNeighbours, AllKNN, CondensedNearestNeighbour, OneSidedSelection, NeighbourhoodCleaningRule

import shap
import catboost
from catboost import Pool, cv

pd.set_option('display.max_columns', 500)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
train = pd.read_csv("./data/train.csv").drop(columns=['father', 'mother', 'gender'])
train.drop_duplicates(subset=train.columns.tolist()[5:20], inplace=True, ignore_index=True)
test = pd.read_csv("./data/test.csv").drop(columns=['father', 'mother', 'gender'])

train.iloc[:, 1:-1] = train.iloc[:, 1:-1].astype('category')
test.iloc[:, 1:] = test.iloc[:, 1:].astype('category')

answer = np.zeros(len(test)) - 1

train.info(), test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 248 entries, 0 to 247
Data columns (total 18 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   id      248 non-null    object  
 1   trait   248 non-null    category
 2   SNP_01  248 non-null    category
 3   SNP_02  248 non-null    category
 4   SNP_03  248 non-null    category
 5   SNP_04  248 non-null    category
 6   SNP_05  248 non-null    category
 7   SNP_06  248 non-null    category
 8   SNP_07  248 non-null    category
 9   SNP_08  248 non-null    category
 10  SNP_09  248 non-null    category
 11  SNP_10  248 non-null    category
 12  SNP_11  248 non-null    category
 13  SNP_12  248 non-null    category
 14  SNP_13  248 non-null    category
 15  SNP_14  248 non-null    category
 16  SNP_15  248 non-null    category
 17  class   248 non-null    object  
dtypes: category(16), object(2)
memory usage: 9.9+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175 entries, 0 to 174
Data colu

(None, None)

In [3]:
# text 형태의 categorical 변수들을 숫자형태로 변경

for i in tqdm(range(1, 15+1)) :
    target = str(i) if i >= 10 else "0"+str(i)
    try :   
        cols = sorted(train[f"SNP_{target}"].unique().tolist())  
        train[f"SNP_{target}"] = train[f"SNP_{target}"].map(lambda x : 0 if x==cols[0] else (1 if x==cols[1] else 2))
        test[f"SNP_{target}"] = test[f"SNP_{target}"].map(lambda x : 0 if x==cols[0] else (1 if x==cols[1] else 2))
    except :
        continue

train.info(), test.info()

100%|██████████| 15/15 [00:00<00:00, 1920.82it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 248 entries, 0 to 247
Data columns (total 18 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   id      248 non-null    object  
 1   trait   248 non-null    category
 2   SNP_01  248 non-null    category
 3   SNP_02  248 non-null    category
 4   SNP_03  248 non-null    category
 5   SNP_04  248 non-null    category
 6   SNP_05  248 non-null    category
 7   SNP_06  248 non-null    category
 8   SNP_07  248 non-null    category
 9   SNP_08  248 non-null    category
 10  SNP_09  248 non-null    category
 11  SNP_10  248 non-null    category
 12  SNP_11  248 non-null    category
 13  SNP_12  248 non-null    category
 14  SNP_13  248 non-null    category
 15  SNP_14  248 non-null    category
 16  SNP_15  248 non-null    category
 17  class   248 non-null    object  
dtypes: category(16), object(2)
memory usage: 9.9+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175 entries, 0 to 174
Data colu




(None, None)

In [4]:
X, y = train.iloc[:, 1:-1].to_numpy(), train['class'].map(lambda x : 0 if x=='A' else(1 if x=='B' else 2)).values
X_test = test.iloc[:,1:].to_numpy()

# Load the data
train_data = torch.from_numpy(X)
test_data = torch.from_numpy(X_test)

# Preprocess the data by one-hot encoding the categories
one_hot_data_01 = F.one_hot(train_data[:,:1]-1, num_classes=2).view(len(X), 2).float()
one_hot_data_02 = F.one_hot(train_data[:,1:], num_classes=3).view(len(X), 3*train_data[:,1:].size(1)).float()

one_hot_train = torch.concat([one_hot_data_01, one_hot_data_02], axis=1)

one_hot_data_01 = F.one_hot(test_data[:,:1]-1, num_classes=2).view(len(X_test), 2).float()
one_hot_data_02 = F.one_hot(test_data[:,1:], num_classes=3).view(len(X_test), 3*test_data[:,1:].size(1)).float()

one_hot_test = torch.concat([one_hot_data_01, one_hot_data_02], axis=1)

one_hot_train.shape, one_hot_test.shape

(torch.Size([248, 47]), torch.Size([175, 47]))

In [5]:
random_seed=0
strategy1 = {0 : 40, 1 : 70, 2 : 50}

under1 = RandomUnderSampler(sampling_strategy=strategy1, random_state=random_seed)
under2 = EditedNearestNeighbours()
under3 = RepeatedEditedNearestNeighbours()
under4 = AllKNN()
under5 = CondensedNearestNeighbour(random_state=random_seed)
under6 = OneSidedSelection(random_state=random_seed)
under7 = NeighbourhoodCleaningRule()

X, y = one_hot_train, train['class'].map(lambda x : 0 if x=='A' else (1 if x=='B' else 2)).values

X1, y1 = under1.fit_resample(X, y)
X2, y2 = under2.fit_resample(X, y)
X3, y3 = under3.fit_resample(X, y)
X4, y4 = under4.fit_resample(X, y)
X5, y5 = under5.fit_resample(X, y)
X6, y6 = under6.fit_resample(X, y)
X7, y7 = under7.fit_resample(X, y)

strategy2 = {0 : 100, 1 : 120, 2 : 110}

over1 = SMOTEN(sampling_strategy=strategy2, random_state=random_seed)
over2 = SMOTE(sampling_strategy=strategy2, random_state=random_seed)
over3 = RandomOverSampler(sampling_strategy=strategy2, random_state=random_seed)

X8, y8 = over1.fit_resample(X, y)
X9, y9 = over2.fit_resample(X, y)
X10, y10 = over3.fit_resample(X, y)

data = np.concatenate([one_hot_train, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10])
label = np.concatenate([y, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10])

data.shape, label.shape

((2362, 47), (2362,))

In [6]:
random_seed = 10
strategy1 = {0 : 40, 1 : 70, 2 : 50}

under1 = RandomUnderSampler(sampling_strategy=strategy1, random_state=random_seed)
under2 = EditedNearestNeighbours()
under3 = RepeatedEditedNearestNeighbours()
under4 = AllKNN()
under5 = CondensedNearestNeighbour(random_state=random_seed)
under6 = OneSidedSelection(random_state=random_seed)
under7 = NeighbourhoodCleaningRule()

X, y = one_hot_train, train['class'].map(lambda x : 0 if x=='A' else (1 if x=='B' else 2)).values

X1, y1 = under1.fit_resample(X, y)
X2, y2 = under2.fit_resample(X, y)
X3, y3 = under3.fit_resample(X, y)
X4, y4 = under4.fit_resample(X, y)
X5, y5 = under5.fit_resample(X, y)
X6, y6 = under6.fit_resample(X, y)
X7, y7 = under7.fit_resample(X, y)

strategy2 = {0 : 100, 1 : 120, 2 : 110}

over1 = SMOTEN(sampling_strategy=strategy2, random_state=random_seed)
over2 = SMOTE(sampling_strategy=strategy2, random_state=random_seed)
over3 = RandomOverSampler(sampling_strategy=strategy2, random_state=random_seed)

X8, y8 = over1.fit_resample(X, y)
X9, y9 = over2.fit_resample(X, y)
X10, y10 = over3.fit_resample(X, y)

data = np.concatenate([data, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10])
label = np.concatenate([label, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10])

data.shape, label.shape

((4475, 47), (4475,))

In [7]:
random_seed = 100
strategy1 = {0 : 40, 1 : 70, 2 : 50}

under1 = RandomUnderSampler(sampling_strategy=strategy1, random_state=random_seed)
under2 = EditedNearestNeighbours()
under3 = RepeatedEditedNearestNeighbours()
under4 = AllKNN()
under5 = CondensedNearestNeighbour(random_state=random_seed)
under6 = OneSidedSelection(random_state=random_seed)
under7 = NeighbourhoodCleaningRule()

X, y = one_hot_train, train['class'].map(lambda x : 0 if x=='A' else (1 if x=='B' else 2)).values

X1, y1 = under1.fit_resample(X, y)
X2, y2 = under2.fit_resample(X, y)
X3, y3 = under3.fit_resample(X, y)
X4, y4 = under4.fit_resample(X, y)
X5, y5 = under5.fit_resample(X, y)
X6, y6 = under6.fit_resample(X, y)
X7, y7 = under7.fit_resample(X, y)

strategy2 = {0 : 100, 1 : 120, 2 : 110}

over1 = SMOTEN(sampling_strategy=strategy2, random_state=random_seed)
over2 = SMOTE(sampling_strategy=strategy2, random_state=random_seed)
over3 = RandomOverSampler(sampling_strategy=strategy2, random_state=random_seed)

X8, y8 = over1.fit_resample(X, y)
X9, y9 = over2.fit_resample(X, y)
X10, y10 = over3.fit_resample(X, y)

data = np.concatenate([data, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10])
label = np.concatenate([label, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10])

data.shape, label.shape

((6616, 47), (6616,))

In [8]:
random_seed = 1000
strategy1 = {0 : 40, 1 : 70, 2 : 50}

under1 = RandomUnderSampler(sampling_strategy=strategy1, random_state=random_seed)
under2 = EditedNearestNeighbours()
under3 = RepeatedEditedNearestNeighbours()
under4 = AllKNN()
under5 = CondensedNearestNeighbour(random_state=random_seed)
under6 = OneSidedSelection(random_state=random_seed)
under7 = NeighbourhoodCleaningRule()

X, y = one_hot_train, train['class'].map(lambda x : 0 if x=='A' else (1 if x=='B' else 2)).values

X1, y1 = under1.fit_resample(X, y)
X2, y2 = under2.fit_resample(X, y)
X3, y3 = under3.fit_resample(X, y)
X4, y4 = under4.fit_resample(X, y)
X5, y5 = under5.fit_resample(X, y)
X6, y6 = under6.fit_resample(X, y)
X7, y7 = under7.fit_resample(X, y)

strategy2 = {0 : 100, 1 : 120, 2 : 110}

over1 = SMOTEN(sampling_strategy=strategy2, random_state=random_seed)
over2 = SMOTE(sampling_strategy=strategy2, random_state=random_seed)
over3 = RandomOverSampler(sampling_strategy=strategy2, random_state=random_seed)

X8, y8 = over1.fit_resample(X, y)
X9, y9 = over2.fit_resample(X, y)
X10, y10 = over3.fit_resample(X, y)

data = np.concatenate([data, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10])
label = np.concatenate([label, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10])

data.shape, label.shape

((8737, 47), (8737,))

In [9]:
random_seed = 10000
strategy1 = {0 : 40, 1 : 70, 2 : 50}

under1 = RandomUnderSampler(sampling_strategy=strategy1, random_state=random_seed)
under2 = EditedNearestNeighbours()
under3 = RepeatedEditedNearestNeighbours()
under4 = AllKNN()
under5 = CondensedNearestNeighbour(random_state=random_seed)
under6 = OneSidedSelection(random_state=random_seed)
under7 = NeighbourhoodCleaningRule()

X, y = one_hot_train, train['class'].map(lambda x : 0 if x=='A' else (1 if x=='B' else 2)).values

X1, y1 = under1.fit_resample(X, y)
X2, y2 = under2.fit_resample(X, y)
X3, y3 = under3.fit_resample(X, y)
X4, y4 = under4.fit_resample(X, y)
X5, y5 = under5.fit_resample(X, y)
X6, y6 = under6.fit_resample(X, y)
X7, y7 = under7.fit_resample(X, y)

strategy2 = {0 : 100, 1 : 120, 2 : 110}

over1 = SMOTEN(sampling_strategy=strategy2, random_state=random_seed)
over2 = SMOTE(sampling_strategy=strategy2, random_state=random_seed)
over3 = RandomOverSampler(sampling_strategy=strategy2, random_state=random_seed)

X8, y8 = over1.fit_resample(X, y)
X9, y9 = over2.fit_resample(X, y)
X10, y10 = over3.fit_resample(X, y)

data = np.concatenate([data, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10])
label = np.concatenate([label, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10])

data.shape, label.shape

((10841, 47), (10841,))

In [10]:
df = pd.DataFrame(data=data, columns=[f"var_{x}" for x in range(data.shape[1])])
df[df >= 0.5] = 1
df[df < 0.5] = 0
df['class'] = label
df.drop_duplicates(inplace=True, ignore_index=True)
df

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,var_10,var_11,var_12,var_13,var_14,var_15,var_16,var_17,var_18,var_19,var_20,var_21,var_22,var_23,var_24,var_25,var_26,var_27,var_28,var_29,var_30,var_31,var_32,var_33,var_34,var_35,var_36,var_37,var_38,var_39,var_40,var_41,var_42,var_43,var_44,var_45,var_46,class
0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1
1,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,2
2,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1
3,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
393,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1
394,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2
395,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2
396,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,2


In [11]:
class Transformer(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_heads, num_classes):
        super(Transformer, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.num_heads = num_heads
        self.num_classes = num_classes

        self.input_layer = nn.Linear(input_dim, hidden_dim)
        self.attention = nn.MultiheadAttention(hidden_dim, num_heads)
        self.output_layer = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        x = self.input_layer(x)
        x = self.attention(x, x, x)[0]
        x = self.output_layer(x)
        return x


In [12]:
class SparseDataset(Dataset):
    def __init__(self, data, labels, transform=None):
        self.data = data
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        label = self.labels[idx]
        if self.transform:
            sample = self.transform(sample)
        return sample, label


In [13]:
def train(model, data, num_epochs, batch_size, learning_rate):
    model = model.to(device)

    # Define the loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    
    step_size = 5
    gamma=0.9

    # define the scheduler
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)

    # Split the data into training and validation sets
    train_data, val_data = data, data

    # Create DataLoaders for the training and validation sets
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

    # Loop over the number of epochs
    for epoch in range(num_epochs):
        # Set the model to training mode
        model.train()

        # Loop over the training data
        for x, y in train_loader:
            # Move the data to the correct device
            x, y = x.to(device), torch.Tensor(y).to(device)

            # Forward pass
            logits = model(x)
            loss = criterion(logits, y)

            # Backward pass and optimization step
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
        # update the learning rate
        scheduler.step()

        # Set the model to evaluation mode
        model.eval()

        # Initialize the validation loss and accuracy
        val_loss = 0.0
        val_acc = 0.0

        # Loop over the validation data
        for x, y in val_loader:
            # Move the data to the correct device
            x, y = x.to(device), y.to(device)

            # Forward pass
            logits = model(x)
            loss = criterion(logits, y)

            # Update the validation loss and accuracy
            val_loss += loss.item()
            val_acc += (logits.argmax(1) == y).float().mean().item()

        # Calculate the average validation loss and accuracy
        val_loss /= len(val_loader)
        val_acc /= len(val_loader)

        # Print the epoch, loss, and accuracy
        print(f"Epoch {epoch+1}: loss = {loss:.4f}, val_loss = {val_loss:.4f}, val_accuracy = {val_acc:.4f}")

    return model


In [14]:
# Define the model
modelA = Transformer(input_dim=47, hidden_dim=256, num_heads=4, num_classes=2)
modelB = Transformer(input_dim=47, hidden_dim=256, num_heads=8, num_classes=2)
modelC = Transformer(input_dim=47, hidden_dim=256, num_heads=8, num_classes=2)
modelBC = Transformer(input_dim=47, hidden_dim=128, num_heads=4, num_classes=2)

# Create the dataset and data loader
dataset_A = SparseDataset(df.iloc[:,:-1].values, (df['class']==0).values.astype(int), transform=None)
dataset_B = SparseDataset(df.iloc[:,:-1].values, (df['class']==1).values.astype(int), transform=None)
dataset_C = SparseDataset(df.iloc[:,:-1].values, (df['class']==2).values.astype(int), transform=None)
dataset_BC = SparseDataset(df[df['class']!=0].iloc[:,:-1].values, (df[df['class']!=0]['class']==2).values.astype(int), transform=None)

# Train the model
model_A = train(modelA, dataset_A, num_epochs=3, batch_size=64, learning_rate=0.001)

Epoch 1: loss = 0.4505, val_loss = 0.4753, val_accuracy = 0.7328
Epoch 2: loss = 0.0022, val_loss = 0.0017, val_accuracy = 1.0000
Epoch 3: loss = 0.0000, val_loss = 0.0000, val_accuracy = 1.0000


In [15]:
# Train the model
model_B = train(modelB, dataset_B, num_epochs=100, batch_size=16, learning_rate=0.001)

Epoch 1: loss = 0.2036, val_loss = 0.3317, val_accuracy = 0.8068
Epoch 2: loss = 0.0436, val_loss = 0.1900, val_accuracy = 0.8975
Epoch 3: loss = 0.0036, val_loss = 0.1226, val_accuracy = 0.9375
Epoch 4: loss = 0.0059, val_loss = 0.0944, val_accuracy = 0.9700
Epoch 5: loss = 0.0097, val_loss = 0.1368, val_accuracy = 0.9525
Epoch 6: loss = 0.0120, val_loss = 0.0865, val_accuracy = 0.9700
Epoch 7: loss = 0.0086, val_loss = 0.0711, val_accuracy = 0.9775
Epoch 8: loss = 0.0014, val_loss = 0.0545, val_accuracy = 0.9825
Epoch 9: loss = 0.0264, val_loss = 0.1443, val_accuracy = 0.9400
Epoch 10: loss = 0.0022, val_loss = 0.0984, val_accuracy = 0.9700
Epoch 11: loss = 0.0127, val_loss = 0.0542, val_accuracy = 0.9825
Epoch 12: loss = 0.0017, val_loss = 0.0376, val_accuracy = 0.9900
Epoch 13: loss = 0.0056, val_loss = 0.0460, val_accuracy = 0.9850
Epoch 14: loss = 0.0023, val_loss = 0.0667, val_accuracy = 0.9750
Epoch 15: loss = 0.0028, val_loss = 0.0676, val_accuracy = 0.9700
Epoch 16: loss = 0.

In [16]:
# Train the model
model_C = train(modelC, dataset_C, num_epochs=100, batch_size=4, learning_rate=0.001)

Epoch 1: loss = 0.7143, val_loss = 0.4587, val_accuracy = 0.7275
Epoch 2: loss = 0.2147, val_loss = 0.1490, val_accuracy = 0.9600
Epoch 3: loss = 0.1007, val_loss = 0.1112, val_accuracy = 0.9675
Epoch 4: loss = 0.1922, val_loss = 0.1148, val_accuracy = 0.9675
Epoch 5: loss = 0.4223, val_loss = 0.1505, val_accuracy = 0.9525
Epoch 6: loss = 0.2151, val_loss = 0.1726, val_accuracy = 0.9525
Epoch 7: loss = 0.2747, val_loss = 0.1384, val_accuracy = 0.9700
Epoch 8: loss = 0.0110, val_loss = 0.1521, val_accuracy = 0.9500
Epoch 9: loss = 0.0222, val_loss = 0.0750, val_accuracy = 0.9750
Epoch 10: loss = 0.0244, val_loss = 0.0719, val_accuracy = 0.9800
Epoch 11: loss = 0.0070, val_loss = 0.0566, val_accuracy = 0.9775
Epoch 12: loss = 0.1274, val_loss = 0.0720, val_accuracy = 0.9825
Epoch 13: loss = 0.0208, val_loss = 0.0682, val_accuracy = 0.9725
Epoch 14: loss = 0.0046, val_loss = 0.0920, val_accuracy = 0.9675
Epoch 15: loss = 0.0441, val_loss = 0.2158, val_accuracy = 0.9100
Epoch 16: loss = 0.

In [17]:
# Train the model
model_BC = train(modelBC, dataset_BC, num_epochs=125, batch_size=4, learning_rate=0.001)

Epoch 1: loss = 0.1437, val_loss = 0.3105, val_accuracy = 0.9247
Epoch 2: loss = 0.0539, val_loss = 0.1204, val_accuracy = 0.9760
Epoch 3: loss = 0.0809, val_loss = 0.0824, val_accuracy = 0.9863
Epoch 4: loss = 0.1185, val_loss = 0.1059, val_accuracy = 0.9692
Epoch 5: loss = 0.0410, val_loss = 0.0683, val_accuracy = 0.9795
Epoch 6: loss = 0.0789, val_loss = 0.0951, val_accuracy = 0.9726
Epoch 7: loss = 0.0485, val_loss = 0.0854, val_accuracy = 0.9795
Epoch 8: loss = 0.0997, val_loss = 0.1000, val_accuracy = 0.9692
Epoch 9: loss = 0.0639, val_loss = 0.0539, val_accuracy = 0.9863
Epoch 10: loss = 0.0205, val_loss = 0.0494, val_accuracy = 0.9829
Epoch 11: loss = 0.0612, val_loss = 0.0602, val_accuracy = 0.9897
Epoch 12: loss = 0.0870, val_loss = 0.0764, val_accuracy = 0.9760
Epoch 13: loss = 0.1332, val_loss = 0.0854, val_accuracy = 0.9658
Epoch 14: loss = 0.0949, val_loss = 0.0557, val_accuracy = 0.9795
Epoch 15: loss = 0.1116, val_loss = 0.0881, val_accuracy = 0.9829
Epoch 16: loss = 0.

In [18]:
X_test = one_hot_test.to(device)

pred_A = torch.softmax(model_A(X_test), axis=1)[:,1].detach().cpu().numpy()
pred_B = torch.softmax(model_B(X_test), axis=1)[:,1].detach().cpu().numpy()
pred_C = torch.softmax(model_C(X_test), axis=1)[:,1].detach().cpu().numpy()
pred_BC = torch.softmax(model_BC(X_test), axis=1)[:,1].detach().cpu().numpy()

In [19]:
submit = pd.read_csv("submit_high1.csv")
submit['A_prob'] = np.round(pred_A, 4)
submit['B_prob'] = np.round(pred_B, 4)
submit['C_prob'] = np.round(pred_C, 4)
submit['BC_prob'] = np.round(pred_BC, 4)
# submit['class'] = submit['class'].map(lambda x : 'A' if x==0 else ('B' if x==1 else 'C'))
submit['total'] = np.argmax(submit[['A_prob', 'B_prob', 'C_prob']].values, axis=1)
submit['total'] = submit['total'].map(lambda x : 'A' if x==0 else ('B' if x==1 else 'C'))
submit

Unnamed: 0,id,class,A_prob,B_prob,C_prob,BC_prob,total
0,TEST_000,A,1.0,0.0000,0.0,1.0,A
1,TEST_001,B,0.0,1.0000,0.0,0.0,B
2,TEST_002,C,0.0,0.0000,1.0,1.0,C
3,TEST_003,C,0.0,0.9997,1.0,1.0,C
4,TEST_004,A,1.0,0.0000,0.0,1.0,A
...,...,...,...,...,...,...,...
170,TEST_170,B,0.0,1.0000,0.0,0.0,B
171,TEST_171,C,0.0,0.0000,1.0,1.0,C
172,TEST_172,C,0.0,0.0000,1.0,1.0,C
173,TEST_173,B,0.0,1.0000,0.0,0.0,B


In [20]:
submit[submit['class'] != submit.total]

Unnamed: 0,id,class,A_prob,B_prob,C_prob,BC_prob,total
5,TEST_005,C,0.0,1.0,0.0,0.0,B
12,TEST_012,B,0.0,0.0029,1.0,1.0,C
162,TEST_162,C,0.0,0.9865,0.0001,1.0,B


In [29]:
submit['total2'] = 1
submit.loc[submit.BC_prob >= 0.6, 'total2'] = 2
submit.loc[submit.A_prob >= 0.5, 'total2'] = 0
submit['total2'] = submit['total2'].map(lambda x : 'A' if x==0 else ('B' if x==1 else 'C'))
submit[submit['class'] != submit.total2]

Unnamed: 0,id,class,A_prob,B_prob,C_prob,BC_prob,total,total2,last1,last2,last3
5,TEST_005,C,0.0,1.0,0.0,0.0,B,B,B,B,B
12,TEST_012,B,0.0,0.0029,1.0,1.0,C,C,C,C,C
111,TEST_111,B,0.0,1.0,0.0,0.9833,B,C,B,B,B
126,TEST_126,B,0.0,0.2664,0.0003,1.0,B,C,B,B,B
127,TEST_127,B,0.0,0.1326,0.0,1.0,B,C,C,B,B


In [30]:
submit['total'].value_counts()

B    85
A    51
C    39
Name: total, dtype: int64

In [31]:
submit['last1'] = pd.read_csv("submit_last1.csv")['class']
submit['last2'] = pd.read_csv("submit_last2.csv")['class']

submit[submit.total != submit.last2]

Unnamed: 0,id,class,A_prob,B_prob,C_prob,BC_prob,total,total2,last1,last2,last3


In [32]:
submit[submit.total2 != submit.last3]

Unnamed: 0,id,class,A_prob,B_prob,C_prob,BC_prob,total,total2,last1,last2,last3
111,TEST_111,B,0.0,1.0,0.0,0.9833,B,C,B,B,B
126,TEST_126,B,0.0,0.2664,0.0003,1.0,B,C,B,B,B
127,TEST_127,B,0.0,0.1326,0.0,1.0,B,C,C,B,B
162,TEST_162,C,0.0,0.9865,0.0001,1.0,B,C,B,B,B


In [33]:
submit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175 entries, 0 to 174
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   id       175 non-null    object 
 1   class    175 non-null    object 
 2   A_prob   175 non-null    float32
 3   B_prob   175 non-null    float32
 4   C_prob   175 non-null    float32
 5   BC_prob  175 non-null    float32
 6   total    175 non-null    object 
 7   total2   175 non-null    object 
 8   last1    175 non-null    object 
 9   last2    175 non-null    object 
 10  last3    175 non-null    object 
dtypes: float32(4), object(7)
memory usage: 12.4+ KB


In [34]:
df = pd.read_csv("submit_high1.csv")
df['class'] = submit['total']
df.to_csv("submit_last01.csv", index=False)
df

Unnamed: 0,id,class
0,TEST_000,A
1,TEST_001,B
2,TEST_002,C
3,TEST_003,C
4,TEST_004,A
...,...,...
170,TEST_170,B
171,TEST_171,C
172,TEST_172,C
173,TEST_173,B


In [35]:
df['class'].value_counts()

B    85
A    51
C    39
Name: class, dtype: int64

In [36]:
df = pd.read_csv("submit_high1.csv")
df['class'] = submit['total2']
df.to_csv("submit_last02.csv", index=False)
df

Unnamed: 0,id,class
0,TEST_000,A
1,TEST_001,B
2,TEST_002,C
3,TEST_003,C
4,TEST_004,A
...,...,...
170,TEST_170,B
171,TEST_171,C
172,TEST_172,C
173,TEST_173,B


In [37]:
df['class'].value_counts()

B    81
A    51
C    43
Name: class, dtype: int64