In [1]:
import warnings
warnings.filterwarnings(action='ignore')

import os
import gc
import math
import random
import pickle
import pandas as pd
import numpy as np
import multiprocessing
from tqdm import tqdm

from sklearn.metrics import f1_score

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset, sampler
import torch.optim.lr_scheduler as lr_scheduler

from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from imblearn.over_sampling import SMOTE, SMOTENC, SMOTEN, RandomOverSampler, ADASYN
from imblearn.under_sampling import RandomUnderSampler, EditedNearestNeighbours, RepeatedEditedNearestNeighbours, AllKNN, CondensedNearestNeighbour, OneSidedSelection, NeighbourhoodCleaningRule

import shap
import catboost
from catboost import Pool, cv

pd.set_option('display.max_columns', 500)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
train = pd.read_csv("./data/train.csv").drop(columns=['father', 'mother', 'gender'])
train.drop_duplicates(subset=train.columns.tolist()[5:20], inplace=True, ignore_index=True)
test = pd.read_csv("./data/test.csv").drop(columns=['father', 'mother', 'gender'])

train.iloc[:, 1:-1] = train.iloc[:, 1:-1].astype('category')
test.iloc[:, 1:] = test.iloc[:, 1:].astype('category')

answer = np.zeros(len(test)) - 1

train.info(), test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 248 entries, 0 to 247
Data columns (total 18 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   id      248 non-null    object  
 1   trait   248 non-null    category
 2   SNP_01  248 non-null    category
 3   SNP_02  248 non-null    category
 4   SNP_03  248 non-null    category
 5   SNP_04  248 non-null    category
 6   SNP_05  248 non-null    category
 7   SNP_06  248 non-null    category
 8   SNP_07  248 non-null    category
 9   SNP_08  248 non-null    category
 10  SNP_09  248 non-null    category
 11  SNP_10  248 non-null    category
 12  SNP_11  248 non-null    category
 13  SNP_12  248 non-null    category
 14  SNP_13  248 non-null    category
 15  SNP_14  248 non-null    category
 16  SNP_15  248 non-null    category
 17  class   248 non-null    object  
dtypes: category(16), object(2)
memory usage: 9.9+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175 entries, 0 to 174
Data colu

(None, None)

In [3]:
# text 형태의 categorical 변수들을 숫자형태로 변경

for i in tqdm(range(1, 15+1)) :
    target = str(i) if i >= 10 else "0"+str(i)
    try :   
        cols = sorted(train[f"SNP_{target}"].unique().tolist())  
        train[f"SNP_{target}"] = train[f"SNP_{target}"].map(lambda x : 0 if x==cols[0] else (1 if x==cols[1] else 2))
        test[f"SNP_{target}"] = test[f"SNP_{target}"].map(lambda x : 0 if x==cols[0] else (1 if x==cols[1] else 2))
    except :
        continue

train.info(), test.info()

100%|██████████| 15/15 [00:00<00:00, 1862.92it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 248 entries, 0 to 247
Data columns (total 18 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   id      248 non-null    object  
 1   trait   248 non-null    category
 2   SNP_01  248 non-null    category
 3   SNP_02  248 non-null    category
 4   SNP_03  248 non-null    category
 5   SNP_04  248 non-null    category
 6   SNP_05  248 non-null    category
 7   SNP_06  248 non-null    category
 8   SNP_07  248 non-null    category
 9   SNP_08  248 non-null    category
 10  SNP_09  248 non-null    category
 11  SNP_10  248 non-null    category
 12  SNP_11  248 non-null    category
 13  SNP_12  248 non-null    category
 14  SNP_13  248 non-null    category
 15  SNP_14  248 non-null    category
 16  SNP_15  248 non-null    category
 17  class   248 non-null    object  
dtypes: category(16), object(2)
memory usage: 9.9+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175 entries, 0 to 174
Data colu




(None, None)

In [4]:
X, y = train.iloc[:, 1:-1].to_numpy(), train['class'].map(lambda x : 0 if x=='A' else(1 if x=='B' else 2)).values
X_test = test.iloc[:,1:].to_numpy()

# Load the data
train_data = torch.from_numpy(X)
test_data = torch.from_numpy(X_test)

# Preprocess the data by one-hot encoding the categories
one_hot_data_01 = F.one_hot(train_data[:,:1]-1, num_classes=2).view(len(X), 2).float()
one_hot_data_02 = F.one_hot(train_data[:,1:], num_classes=3).view(len(X), 3*train_data[:,1:].size(1)).float()

one_hot_train = torch.concat([one_hot_data_01, one_hot_data_02], axis=1)

one_hot_data_01 = F.one_hot(test_data[:,:1]-1, num_classes=2).view(len(X_test), 2).float()
one_hot_data_02 = F.one_hot(test_data[:,1:], num_classes=3).view(len(X_test), 3*test_data[:,1:].size(1)).float()

one_hot_test = torch.concat([one_hot_data_01, one_hot_data_02], axis=1)

one_hot_train.shape, one_hot_test.shape

(torch.Size([248, 47]), torch.Size([175, 47]))

In [5]:
class Transformer(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_heads, num_classes):
        super(Transformer, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.num_heads = num_heads
        self.num_classes = num_classes

        self.input_layer = nn.Linear(input_dim, hidden_dim)
        self.attention = nn.MultiheadAttention(hidden_dim, num_heads)
        self.output_layer = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        x = self.input_layer(x)
        x = self.attention(x, x, x)[0]
        x = self.output_layer(x)
        return x


In [6]:
class SparseDataset(Dataset):
    def __init__(self, data, labels, transform=None):
        self.data = data
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        label = self.labels[idx]
        if self.transform:
            sample = self.transform(sample)
        return sample, label


In [18]:
def train(model, data, num_epochs, batch_size, learning_rate):
    model = model.to(device)

    # Define the loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    
    step_size = 20
    gamma=1

    # define the scheduler
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)

    # Split the data into training and validation sets
    train_data, val_data = data, data

    # Create DataLoaders for the training and validation sets
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

    # Loop over the number of epochs
    for epoch in range(num_epochs):
        # Set the model to training mode
        model.train()

        # Loop over the training data
        for x, y in train_loader:
            # Move the data to the correct device
            x, y = x.to(device), torch.Tensor(y).to(device)

            # Forward pass
            logits = model(x)
            loss = criterion(logits, y)

            # Backward pass and optimization step
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
        # update the learning rate
        scheduler.step()

        # Set the model to evaluation mode
        model.eval()

        # Initialize the validation loss and accuracy
        val_loss = 0.0
        val_acc = 0.0

        # Loop over the validation data
        for x, y in val_loader:
            # Move the data to the correct device
            x, y = x.to(device), y.to(device)

            # Forward pass
            logits = model(x)
            loss = criterion(logits, y)

            # Update the validation loss and accuracy
            val_loss += loss.item()
            val_acc += (logits.argmax(1) == y).float().mean().item()

        # Calculate the average validation loss and accuracy
        val_loss /= len(val_loader)
        val_acc /= len(val_loader)

        # Print the epoch, loss, and accuracy
        print(f"Epoch {epoch+1}: loss = {loss:.4f}, val_loss = {val_loss:.4f}, val_accuracy = {val_acc:.4f}")

    return model


In [24]:
# Define the model
model = Transformer(input_dim=47, hidden_dim=64, num_heads=8, num_classes=3)

# Create the dataset and data loader
dataset = SparseDataset(one_hot_train, y, transform=None)

# Train the model
model = train(model, dataset, num_epochs=200, batch_size=64, learning_rate=0.001)

Epoch 1: loss = 1.0543, val_loss = 1.0676, val_accuracy = 0.4492
Epoch 2: loss = 1.0411, val_loss = 1.0659, val_accuracy = 0.4492
Epoch 3: loss = 1.0414, val_loss = 1.0634, val_accuracy = 0.4492
Epoch 4: loss = 1.0422, val_loss = 1.0580, val_accuracy = 0.4492
Epoch 5: loss = 1.0378, val_loss = 1.0494, val_accuracy = 0.4492
Epoch 6: loss = 1.0126, val_loss = 1.0235, val_accuracy = 0.4492
Epoch 7: loss = 0.9324, val_loss = 0.9463, val_accuracy = 0.4967
Epoch 8: loss = 0.7326, val_loss = 0.7602, val_accuracy = 0.6791
Epoch 9: loss = 0.5124, val_loss = 0.5557, val_accuracy = 0.6914
Epoch 10: loss = 0.4235, val_loss = 0.4728, val_accuracy = 0.7031
Epoch 11: loss = 0.3786, val_loss = 0.4040, val_accuracy = 0.8644
Epoch 12: loss = 0.2910, val_loss = 0.3209, val_accuracy = 0.9124
Epoch 13: loss = 0.1850, val_loss = 0.2279, val_accuracy = 0.9280
Epoch 14: loss = 0.1008, val_loss = 0.1496, val_accuracy = 0.9526
Epoch 15: loss = 0.0537, val_loss = 0.1097, val_accuracy = 0.9565
Epoch 16: loss = 0.

In [25]:
pred = torch.argmax(model(one_hot_test.to(device)), axis=1).detach().cpu().numpy()
pred

array([0, 1, 2, 2, 0, 1, 2, 1, 0, 0, 2, 1, 2, 0, 1, 1, 0, 1, 1, 2, 1, 1,
       1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 2, 0, 1, 2, 1, 1, 2, 0, 1, 2, 1,
       1, 1, 1, 2, 1, 2, 0, 1, 0, 1, 1, 1, 2, 0, 1, 2, 0, 1, 2, 2, 2, 0,
       1, 0, 0, 1, 1, 1, 0, 0, 2, 1, 2, 1, 1, 1, 2, 1, 0, 1, 1, 1, 1, 1,
       2, 0, 1, 1, 2, 1, 1, 2, 0, 1, 0, 2, 0, 1, 1, 2, 0, 0, 2, 1, 0, 1,
       2, 1, 1, 1, 1, 0, 0, 2, 1, 2, 0, 1, 1, 2, 2, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 2, 0, 0, 1, 0, 0, 0, 2, 1, 1, 1, 0, 1, 2, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 2, 0, 1, 2, 1, 1, 0, 0, 2, 1, 1, 0, 1, 2, 2, 1, 1])

In [26]:
df = pd.read_csv("submit_high1.csv")
df['class'] = pred
df['class'] = df['class'].map(lambda x : 'A' if x==0 else ('B' if x==1 else 'C'))
df.to_csv("submit_last3.csv", index=False)
df

Unnamed: 0,id,class
0,TEST_000,A
1,TEST_001,B
2,TEST_002,C
3,TEST_003,C
4,TEST_004,A
...,...,...
170,TEST_170,B
171,TEST_171,C
172,TEST_172,C
173,TEST_173,B


In [27]:
df['class'].value_counts()

B    85
A    51
C    39
Name: class, dtype: int64