In [1]:
import warnings
warnings.filterwarnings(action='ignore')

import os
import gc
import math
import random
import pickle
import pandas as pd
import numpy as np
import multiprocessing
from tqdm.auto import tqdm

from sklearn.metrics import f1_score

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset, sampler

from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from imblearn.over_sampling import SMOTE, SMOTENC, SMOTEN, RandomOverSampler, ADASYN
from imblearn.under_sampling import RandomUnderSampler, EditedNearestNeighbours, RepeatedEditedNearestNeighbours, AllKNN, CondensedNearestNeighbour, OneSidedSelection, NeighbourhoodCleaningRule

import shap
import catboost
from catboost import Pool, cv

pd.set_option('display.max_columns', 500)

In [2]:
train = pd.read_csv("./data/train.csv").drop(columns=['father', 'mother', 'gender'])
train.drop_duplicates(subset=train.columns.tolist()[5:20], inplace=True, ignore_index=True)
test = pd.read_csv("./data/test.csv").drop(columns=['father', 'mother', 'gender'])

train.iloc[:, 1:-1] = train.iloc[:, 1:-1].astype('category')
test.iloc[:, 1:] = test.iloc[:, 1:].astype('category')

answer = np.zeros(len(test)) - 1

train.info(), test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 248 entries, 0 to 247
Data columns (total 18 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   id      248 non-null    object  
 1   trait   248 non-null    category
 2   SNP_01  248 non-null    category
 3   SNP_02  248 non-null    category
 4   SNP_03  248 non-null    category
 5   SNP_04  248 non-null    category
 6   SNP_05  248 non-null    category
 7   SNP_06  248 non-null    category
 8   SNP_07  248 non-null    category
 9   SNP_08  248 non-null    category
 10  SNP_09  248 non-null    category
 11  SNP_10  248 non-null    category
 12  SNP_11  248 non-null    category
 13  SNP_12  248 non-null    category
 14  SNP_13  248 non-null    category
 15  SNP_14  248 non-null    category
 16  SNP_15  248 non-null    category
 17  class   248 non-null    object  
dtypes: category(16), object(2)
memory usage: 9.9+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175 entries, 0 to 174
Data colu

(None, None)

In [3]:
# text 형태의 categorical 변수들을 숫자형태로 변경

for i in tqdm(range(1, 15+1)) :
    target = str(i) if i >= 10 else "0"+str(i)
    try :   
        cols = sorted(train[f"SNP_{target}"].unique().tolist())  
        train[f"SNP_{target}"] = train[f"SNP_{target}"].map(lambda x : 0 if x==cols[0] else (1 if x==cols[1] else 2))
        test[f"SNP_{target}"] = test[f"SNP_{target}"].map(lambda x : 0 if x==cols[0] else (1 if x==cols[1] else 2))
    except :
        continue

train.info(), test.info()

  0%|          | 0/15 [00:00<?, ?it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 248 entries, 0 to 247
Data columns (total 18 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   id      248 non-null    object  
 1   trait   248 non-null    category
 2   SNP_01  248 non-null    category
 3   SNP_02  248 non-null    category
 4   SNP_03  248 non-null    category
 5   SNP_04  248 non-null    category
 6   SNP_05  248 non-null    category
 7   SNP_06  248 non-null    category
 8   SNP_07  248 non-null    category
 9   SNP_08  248 non-null    category
 10  SNP_09  248 non-null    category
 11  SNP_10  248 non-null    category
 12  SNP_11  248 non-null    category
 13  SNP_12  248 non-null    category
 14  SNP_13  248 non-null    category
 15  SNP_14  248 non-null    category
 16  SNP_15  248 non-null    category
 17  class   248 non-null    object  
dtypes: category(16), object(2)
memory usage: 9.9+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175 entries, 0 to 174
Data colu

(None, None)

In [4]:
random_seed=0
strategy1 = {0 : 40, 1 : 70, 2 : 50}

under1 = RandomUnderSampler(sampling_strategy=strategy1, random_state=random_seed)
under2 = EditedNearestNeighbours()
under3 = RepeatedEditedNearestNeighbours()
under4 = AllKNN()
under5 = CondensedNearestNeighbour(random_state=random_seed)
under6 = OneSidedSelection(random_state=random_seed)
under7 = NeighbourhoodCleaningRule()

X, y = train.iloc[:, 1:-1], train['class'].map(lambda x : 0 if x=='A' else (1 if x=='B' else 2)).values

X1, y1 = under1.fit_resample(X, y)
X2, y2 = under2.fit_resample(X, y)
X3, y3 = under3.fit_resample(X, y)
X4, y4 = under4.fit_resample(X, y)
X5, y5 = under5.fit_resample(X, y)
X6, y6 = under6.fit_resample(X, y)
X7, y7 = under7.fit_resample(X, y)

strategy2 = {0 : 100, 1 : 120, 2 : 110}

over1 = SMOTEN(sampling_strategy=strategy2, random_state=random_seed)
over2 = SMOTE(sampling_strategy=strategy2, random_state=random_seed)
over3 = RandomOverSampler(sampling_strategy=strategy2, random_state=random_seed)

X8, y8 = over1.fit_resample(X, y)
X9, y9 = over2.fit_resample(X, y)
X10, y10 = over3.fit_resample(X, y)

df_train = pd.concat([X1, X2, X3, X4, X5, X6, X7, X8, X9, X10], ignore_index=True)
df_train['class'] = list(y1)+list(y2)+list(y3)+list(y4)+list(y5)+list(y6)+list(y7)+list(y8)+list(y9)+list(y10)

df = df_train.copy()

In [5]:
random_seed= 10
strategy1 = {0 : 40, 1 : 70, 2 : 50}

under1 = RandomUnderSampler(sampling_strategy=strategy1, random_state=random_seed)
under2 = EditedNearestNeighbours()
under3 = RepeatedEditedNearestNeighbours()
under4 = AllKNN()
under5 = CondensedNearestNeighbour(random_state=random_seed)
under6 = OneSidedSelection(random_state=random_seed)
under7 = NeighbourhoodCleaningRule()

X, y = train.iloc[:, 1:-1], train['class'].map(lambda x : 0 if x=='A' else (1 if x=='B' else 2)).values

X1, y1 = under1.fit_resample(X, y)
X2, y2 = under2.fit_resample(X, y)
X3, y3 = under3.fit_resample(X, y)
X4, y4 = under4.fit_resample(X, y)
X5, y5 = under5.fit_resample(X, y)
X6, y6 = under6.fit_resample(X, y)
X7, y7 = under7.fit_resample(X, y)

strategy2 = {0 : 100, 1 : 120, 2 : 110}

over1 = SMOTEN(sampling_strategy=strategy2, random_state=random_seed)
over2 = SMOTE(sampling_strategy=strategy2, random_state=random_seed)
over3 = RandomOverSampler(sampling_strategy=strategy2, random_state=random_seed)

X8, y8 = over1.fit_resample(X, y)
X9, y9 = over2.fit_resample(X, y)
X10, y10 = over3.fit_resample(X, y)

df_train = pd.concat([X1, X2, X3, X4, X5, X6, X7, X8, X9, X10], ignore_index=True)
df_train['class'] = list(y1)+list(y2)+list(y3)+list(y4)+list(y5)+list(y6)+list(y7)+list(y8)+list(y9)+list(y10)

df = pd.concat([df, df_train], ignore_index=True)

In [6]:
random_seed= 100
strategy1 = {0 : 40, 1 : 70, 2 : 50}

under1 = RandomUnderSampler(sampling_strategy=strategy1, random_state=random_seed)
under2 = EditedNearestNeighbours()
under3 = RepeatedEditedNearestNeighbours()
under4 = AllKNN()
under5 = CondensedNearestNeighbour(random_state=random_seed)
under6 = OneSidedSelection(random_state=random_seed)
under7 = NeighbourhoodCleaningRule()

X, y = train.iloc[:, 1:-1], train['class'].map(lambda x : 0 if x=='A' else (1 if x=='B' else 2)).values

X1, y1 = under1.fit_resample(X, y)
X2, y2 = under2.fit_resample(X, y)
X3, y3 = under3.fit_resample(X, y)
X4, y4 = under4.fit_resample(X, y)
X5, y5 = under5.fit_resample(X, y)
X6, y6 = under6.fit_resample(X, y)
X7, y7 = under7.fit_resample(X, y)

strategy2 = {0 : 100, 1 : 120, 2 : 110}

over1 = SMOTEN(sampling_strategy=strategy2, random_state=random_seed)
over2 = SMOTE(sampling_strategy=strategy2, random_state=random_seed)
over3 = RandomOverSampler(sampling_strategy=strategy2, random_state=random_seed)

X8, y8 = over1.fit_resample(X, y)
X9, y9 = over2.fit_resample(X, y)
X10, y10 = over3.fit_resample(X, y)

df_train = pd.concat([X1, X2, X3, X4, X5, X6, X7, X8, X9, X10], ignore_index=True)
df_train['class'] = list(y1)+list(y2)+list(y3)+list(y4)+list(y5)+list(y6)+list(y7)+list(y8)+list(y9)+list(y10)

df = pd.concat([df, df_train], ignore_index=True)

In [9]:
df.drop_duplicates(ignore_index=True, inplace=True)
df

Unnamed: 0,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,SNP_10,SNP_11,SNP_12,SNP_13,SNP_14,SNP_15,class
0,1,1,2,0,2,0,2,2,0,2,0,2,1,2,1,1,0
1,1,1,2,0,1,1,1,1,0,1,0,2,1,2,1,0,0
2,1,0,2,0,1,0,2,2,0,1,1,1,1,2,1,2,0
3,1,0,2,0,2,0,2,2,0,1,1,2,1,2,1,2,0
4,1,0,2,0,1,1,2,1,0,0,0,2,2,2,2,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
472,2,1,1,0,0,1,0,0,0,0,1,0,0,1,0,1,2
473,2,1,0,0,0,1,0,0,1,0,0,0,0,1,0,1,2
474,2,1,1,1,0,1,0,0,0,0,1,1,0,2,1,1,2
475,2,1,1,1,0,0,1,0,1,0,1,0,0,2,0,0,2


In [15]:
one_hot_data.shape

torch.Size([477, 3])

In [18]:
torch.arange(data.size(0))

tensor([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
         14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,
         28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,
         42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,
         56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,
         70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,
         84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,
         98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
        112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
        126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
        140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153,
        154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167,
        168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 1

In [19]:
data

tensor([[1, 1, 2,  ..., 2, 1, 1],
        [1, 1, 2,  ..., 2, 1, 0],
        [1, 0, 2,  ..., 2, 1, 2],
        ...,
        [2, 1, 1,  ..., 2, 1, 1],
        [2, 1, 1,  ..., 2, 0, 0],
        [2, 2, 1,  ..., 0, 0, 0]])

In [16]:
one_hot_data[torch.arange(data.size(0)), data] = 1

IndexError: shape mismatch: indexing tensors could not be broadcast together with shapes [477], [477, 16]

In [13]:
X, y = df.iloc[:, :-1].to_numpy(), df['class']

# Load the data
data = torch.from_numpy(X)

# Preprocess the data by one-hot encoding the categories
one_hot_data = torch.zeros(data.size(0), data.max()+1)
one_hot_data[torch.arange(data.size(0)), data] = 1

# Define the model architecture
class Autoencoder(nn.Module):
    def __init__(self, input_dim, encoding_dim):
        super(Autoencoder, self).__init__()

        self.input_dim = input_dim
        self.encoding_dim = encoding_dim

        # Encoder
        self.fc1 = nn.Linear(input_dim, 32)
        self.fc2 = nn.Linear(32, encoding_dim)

        # Decoder
        self.fc3 = nn.Linear(encoding_dim, 32)
        self.fc4 = nn.Linear(32, input_dim)

    def encode(self, x):
        h1 = torch.relu(self.fc1(x))
        return self.fc2(h1)

    def decode(self, z):
        h3 = torch.relu(self.fc3(z))
        return self.fc4(h3)

    def forward(self, x):
        z = self.encode(x)
        return self.decode(z)

# Instantiate the model
model = Autoencoder(input_dim=one_hot_data.size(1), encoding_dim=16)

# Define the loss function and optimizer
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Train the model
for epoch in range(10):
    model.train()
    train_loss = 0
    for i, x in enumerate(one_hot_data):
        optimizer.zero_grad()
        recon_x = model(x)
        loss = F.binary_cross_entropy(recon_x, x, reduction='sum')
        loss.backward()
        train_loss += loss.item()
        optimizer.step()

# Use the encoder to generate the encoded representation of the data
encoder = nn.Sequential(*list(model.children())[:2])
encoded_data = encoder(one_hot_data)

# Use the decoder to generate new variables from the encoded representation
decoder = nn.Sequential(*list(model.children())[2:])
new_variables = decoder(encoded_data)

IndexError: shape mismatch: indexing tensors could not be broadcast together with shapes [477], [477, 16]

In [11]:
import torch
import torch.nn as nn
import torch.optim as optim

# Load the data
data = torch.from_numpy(X)

# Preprocess the data by one-hot encoding the categories
one_hot_data = torch.zeros(data.size(0), data.max()+1)
one_hot_data[torch.arange(data.size(0)), data] = 1

# Define the generator network
class Generator(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(Generator, self).__init__()

        self.input_dim = input_dim
        self.output_dim = output_dim

        self.fc1 = nn.Linear(input_dim, 32)
        self.fc2 = nn.Linear(32, output_dim)

    def forward(self, z):
        h1 = torch.relu(self.fc1(z))
        return self.fc2(h1)

# Define the discriminator network
class Discriminator(nn.Module):
    def __init__(self, input_dim):
        super(Discriminator, self).__init__()

        self.input_dim = input_dim

        self.fc1 = nn.Linear(input_dim, 32)
        self.fc2 = nn.Linear(32, 1)

    def forward(self, x):
        h1 = torch.relu(self.fc1(x))
        return self.fc2(h1)

# Instantiate the generator and discriminator networks
generator = Generator(input_dim=100, output_dim=one_hot_data.size(1))
discriminator = Discriminator(input_dim=one_hot_data.size(1))

# Define the loss function and optimizers
optimizer_G = optim.Adam(generator.parameters(), lr=1e-3)
optimizer_D = optim.Adam(discriminator.parameters(), lr=1e-3)

# Train the GAN
for epoch in range(10):
    for i, real_data in enumerate(one_hot_data):
        # Generate fake data
        noise = torch.randn(100)
        fake_data = generator(noise)

        # Train the discriminator
        optimizer_D.zero_grad()
        loss_real = F.binary_cross_entropy(discriminator(real_data), torch.ones_like(discriminator(real_data)))
        loss_fake = F.binary_cross_entropy(discriminator(fake_data), torch.zeros_like(discriminator(fake_data)))
        loss_D = (loss_real + loss_fake) / 2
        loss_D.backward()
        optimizer_D.step()

        # Train the generator
        optimizer_G.zero_grad()
        loss_G = F.binary_cross_entropy(discriminator(fake_data), torch.ones_like(discriminator(fake_data)))
        loss_G.backward()
        optimizer_G.step()

# Use the generator to generate new variables
new_variables = generator(torch.randn(100))


IndexError: shape mismatch: indexing tensors could not be broadcast together with shapes [477], [477, 16]

In [None]:
import numpy as np
from sklearn.manifold import TSNE

# Load the data
data = np.load('categorical_data.npy')

# Preprocess the data by one-hot encoding the categories
one_hot_data = np.zeros((data.size, data.max()+1))
one_hot_data[np.arange(data.size), data] = 1

# Use t-SNE to reduce the dimensionality of the data
tsne = TSNE(n_components=2)
new_variables = tsne.fit_transform(one_hot_data)
