In [1]:
import warnings
# Suppress every warning for the rest of the session. No category or module filter is
# given, so deprecation / future-behaviour warnings raised by libraries are hidden too.
warnings.filterwarnings(action='ignore')

import os
import gc
import math
import random
import pickle
import pandas as pd
import numpy as np
import multiprocessing
from tqdm import tqdm

from sklearn.metrics import f1_score

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset, sampler
import torch.optim.lr_scheduler as lr_scheduler

from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from imblearn.over_sampling import SMOTE, SMOTENC, SMOTEN, RandomOverSampler, ADASYN
from imblearn.under_sampling import RandomUnderSampler, EditedNearestNeighbours, RepeatedEditedNearestNeighbours, AllKNN, CondensedNearestNeighbour, OneSidedSelection, NeighbourhoodCleaningRule

import shap
import catboost
from catboost import Pool, cv

# Let pandas show up to 500 columns before it truncates the display of wide frames.
pd.set_option('display.max_columns', 500)

In [2]:
# Columns removed from both CSV files right after loading.
unused_columns = ['father', 'mother', 'gender']

train = pd.read_csv("./data/train.csv").drop(columns=unused_columns)
# De-duplicate on the columns at positions 5..19 of the frame *after* the drop above
# (the slice simply stops at the last column, so here it covers SNP_04 .. class).
# NOTE(review): presumably the positions were chosen against the raw file, where the
# three dropped columns shift everything — confirm which subset is intended.
train = train.drop_duplicates(subset=train.columns.tolist()[5:20], ignore_index=True)
test = pd.read_csv("./data/test.csv").drop(columns=unused_columns)

# Convert the feature columns (everything after `id`; for train everything before
# `class` as well) to the pandas `category` dtype.
# `astype` with a per-column mapping returns a frame that really has the requested
# dtypes. Assigning a converted slice back through `.iloc[...] = ...` only changes the
# dtypes on pandas versions where that assignment replaces the columns instead of
# writing the values into the existing ones.
train = train.astype({col: 'category' for col in train.columns[1:-1]})
test = test.astype({col: 'category' for col in test.columns[1:]})

# Placeholder predictions: one -1.0 per test row.
answer = np.full(len(test), -1.0)

# `DataFrame.info()` prints its report and returns None, so call it as a statement per
# frame instead of leaving a `(None, None)` tuple as the cell result.
train.info()
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 248 entries, 0 to 247
Data columns (total 18 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   id      248 non-null    object  
 1   trait   248 non-null    category
 2   SNP_01  248 non-null    category
 3   SNP_02  248 non-null    category
 4   SNP_03  248 non-null    category
 5   SNP_04  248 non-null    category
 6   SNP_05  248 non-null    category
 7   SNP_06  248 non-null    category
 8   SNP_07  248 non-null    category
 9   SNP_08  248 non-null    category
 10  SNP_09  248 non-null    category
 11  SNP_10  248 non-null    category
 12  SNP_11  248 non-null    category
 13  SNP_12  248 non-null    category
 14  SNP_13  248 non-null    category
 15  SNP_14  248 non-null    category
 16  SNP_15  248 non-null    category
 17  class   248 non-null    object  
dtypes: category(16), object(2)
memory usage: 9.9+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175 entries, 0 to 174
Data colu

(None, None)

In [3]:
# Convert the text-valued categorical SNP columns to integer codes.
#
# For each column SNP_01 .. SNP_15 the distinct training values are sorted: the first
# one becomes 0, the second 1, and every other value (including values that only occur
# in `test`) becomes 2.

for i in tqdm(range(1, 15+1)) :
    column = f"SNP_{i:02d}"
    try :
        levels = sorted(train[column].unique().tolist())
        # Code table built once per column; ranks beyond 1 all share bucket 2.
        codes = {level : min(rank, 2) for rank, level in enumerate(levels)}
        # Encode both frames before assigning either one, so a failure can never leave
        # `train` converted while `test` still holds the original values.
        train_codes = train[column].map(lambda value : codes.get(value, 2))
        test_codes = test[column].map(lambda value : codes.get(value, 2))
    except (KeyError, TypeError) as err :
        # Still best-effort (a missing column or unsortable values skip the column),
        # but the skip is reported instead of being swallowed silently.
        tqdm.write(f"{column} skipped: {err!r}")
        continue
    train[column], test[column] = train_codes, test_codes

# `info()` prints and returns None; call it per frame so the cell has no tuple result.
train.info()
test.info()

100%|██████████| 15/15 [00:00<00:00, 1869.51it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 248 entries, 0 to 247
Data columns (total 18 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   id      248 non-null    object  
 1   trait   248 non-null    category
 2   SNP_01  248 non-null    category
 3   SNP_02  248 non-null    category
 4   SNP_03  248 non-null    category
 5   SNP_04  248 non-null    category
 6   SNP_05  248 non-null    category
 7   SNP_06  248 non-null    category
 8   SNP_07  248 non-null    category
 9   SNP_08  248 non-null    category
 10  SNP_09  248 non-null    category
 11  SNP_10  248 non-null    category
 12  SNP_11  248 non-null    category
 13  SNP_12  248 non-null    category
 14  SNP_13  248 non-null    category
 15  SNP_14  248 non-null    category
 16  SNP_15  248 non-null    category
 17  class   248 non-null    object  
dtypes: category(16), object(2)
memory usage: 9.9+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175 entries, 0 to 174
Data colu




(None, None)

In [4]:
# Resampling round with seed 0: run seven under-samplers and three over-samplers on the
# training data and stack all of their outputs into one frame.
random_seed = 0
strategy1 = {0 : 40, 1 : 70, 2 : 50}       # sampling_strategy for RandomUnderSampler
strategy2 = {0 : 100, 1 : 120, 2 : 110}    # sampling_strategy for the over-samplers

# Features: every column between the first and the last one. Labels: A -> 0, B -> 1,
# anything else -> 2.
X = train.iloc[:, 1:-1]
y = train['class'].map(lambda label : {'A' : 0, 'B' : 1}.get(label, 2)).values

samplers = [
    # under-samplers
    RandomUnderSampler(sampling_strategy=strategy1, random_state=random_seed),
    EditedNearestNeighbours(),
    RepeatedEditedNearestNeighbours(),
    AllKNN(),
    CondensedNearestNeighbour(random_state=random_seed),
    OneSidedSelection(random_state=random_seed),
    NeighbourhoodCleaningRule(),
    # over-samplers
    SMOTEN(sampling_strategy=strategy2, random_state=random_seed),
    SMOTE(sampling_strategy=strategy2, random_state=random_seed),
    RandomOverSampler(sampling_strategy=strategy2, random_state=random_seed),
]

# Each sampler sees the same (X, y); results are kept in sampler order.
resampled = [sampler.fit_resample(X, y) for sampler in samplers]

df_train = pd.concat([features for features, _ in resampled], ignore_index=True)
df_train['class'] = [label for _, labels in resampled for label in labels]

# First round: start the accumulated frame from a copy of this round's rows.
df = df_train.copy()

In [5]:
# Resampling round with seed 10: same samplers as the seed-0 round, results appended to
# the accumulated frame `df`.
random_seed = 10
strategy1 = {0 : 40, 1 : 70, 2 : 50}       # sampling_strategy for RandomUnderSampler
strategy2 = {0 : 100, 1 : 120, 2 : 110}    # sampling_strategy for the over-samplers

# Features: every column between the first and the last one. Labels: A -> 0, B -> 1,
# anything else -> 2.
X = train.iloc[:, 1:-1]
y = train['class'].map(lambda label : {'A' : 0, 'B' : 1}.get(label, 2)).values

samplers = [
    # under-samplers
    RandomUnderSampler(sampling_strategy=strategy1, random_state=random_seed),
    EditedNearestNeighbours(),
    RepeatedEditedNearestNeighbours(),
    AllKNN(),
    CondensedNearestNeighbour(random_state=random_seed),
    OneSidedSelection(random_state=random_seed),
    NeighbourhoodCleaningRule(),
    # over-samplers
    SMOTEN(sampling_strategy=strategy2, random_state=random_seed),
    SMOTE(sampling_strategy=strategy2, random_state=random_seed),
    RandomOverSampler(sampling_strategy=strategy2, random_state=random_seed),
]

# Each sampler sees the same (X, y); results are kept in sampler order.
resampled = [sampler.fit_resample(X, y) for sampler in samplers]

df_train = pd.concat([features for features, _ in resampled], ignore_index=True)
df_train['class'] = [label for _, labels in resampled for label in labels]

# Append this round's rows below the rows collected so far.
df = pd.concat([df, df_train], ignore_index=True)

In [6]:
# Resampling round with seed 100: same samplers as the earlier rounds, results appended
# to the accumulated frame `df`.
random_seed = 100
strategy1 = {0 : 40, 1 : 70, 2 : 50}       # sampling_strategy for RandomUnderSampler
strategy2 = {0 : 100, 1 : 120, 2 : 110}    # sampling_strategy for the over-samplers

# Features: every column between the first and the last one. Labels: A -> 0, B -> 1,
# anything else -> 2.
X = train.iloc[:, 1:-1]
y = train['class'].map(lambda label : {'A' : 0, 'B' : 1}.get(label, 2)).values

samplers = [
    # under-samplers
    RandomUnderSampler(sampling_strategy=strategy1, random_state=random_seed),
    EditedNearestNeighbours(),
    RepeatedEditedNearestNeighbours(),
    AllKNN(),
    CondensedNearestNeighbour(random_state=random_seed),
    OneSidedSelection(random_state=random_seed),
    NeighbourhoodCleaningRule(),
    # over-samplers
    SMOTEN(sampling_strategy=strategy2, random_state=random_seed),
    SMOTE(sampling_strategy=strategy2, random_state=random_seed),
    RandomOverSampler(sampling_strategy=strategy2, random_state=random_seed),
]

# Each sampler sees the same (X, y); results are kept in sampler order.
resampled = [sampler.fit_resample(X, y) for sampler in samplers]

df_train = pd.concat([features for features, _ in resampled], ignore_index=True)
df_train['class'] = [label for _, labels in resampled for label in labels]

# Append this round's rows below the rows collected so far.
df = pd.concat([df, df_train], ignore_index=True)

In [7]:
# Drop exact duplicate rows (the first occurrence is kept) and renumber the index.
df = df.drop_duplicates(ignore_index=True)
df

Unnamed: 0,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,SNP_10,SNP_11,SNP_12,SNP_13,SNP_14,SNP_15,class
0,1,1,2,0,2,0,2,2,0,2,0,2,1,2,1,1,0
1,1,1,2,0,1,1,1,1,0,1,0,2,1,2,1,0,0
2,1,0,2,0,1,0,2,2,0,1,1,1,1,2,1,2,0
3,1,0,2,0,2,0,2,2,0,1,1,2,1,2,1,2,0
4,1,0,2,0,1,1,2,1,0,0,0,2,2,2,2,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
472,2,1,1,0,0,1,0,0,0,0,1,0,0,1,0,1,2
473,2,1,0,0,0,1,0,0,1,0,0,0,0,1,0,1,2
474,2,1,1,1,0,1,0,0,0,0,1,1,0,2,1,1,2
475,2,1,1,1,0,0,1,0,1,0,1,0,0,2,0,0,2


In [8]:
# Feature matrix = every column but the last one; labels = the `class` column.
X = df.iloc[:, :-1].to_numpy()
y = df['class']

# Load the data
data = torch.from_numpy(X)
n_rows = len(X)

# Preprocess the data by one-hot encoding the categories:
#   * column 0, shifted down by one, is expanded into 2 indicator slots;
#   * every remaining column is expanded into 3 indicator slots,
# and each result is laid out as one flat row per sample.
first_column_codes = data[:, :1] - 1
other_column_codes = data[:, 1:]
one_hot_data_01 = F.one_hot(first_column_codes, num_classes=2).view(n_rows, 2).float()
one_hot_data_02 = (
    F.one_hot(other_column_codes, num_classes=3)
    .view(n_rows, 3 * other_column_codes.size(1))
    .float()
)

In [9]:
# Shapes of the two one-hot blocks, in order.
tuple(block.shape for block in (one_hot_data_01, one_hot_data_02))

(torch.Size([477, 2]), torch.Size([477, 45]))

In [10]:
# Join the two one-hot blocks side by side along the feature axis.
one_hot_data = torch.cat((one_hot_data_01, one_hot_data_02), dim=1)
one_hot_data.shape

torch.Size([477, 47])

In [56]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Device configuration: CUDA when available, CPU otherwise
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Hyperparameters
latent_size, hidden_size = 3, 64       # generator noise width, hidden-layer width
num_epochs = 100_000
batch_size = len(one_hot_data)         # one batch holds the whole data set
sample_size = 100                      # not referenced by any visible cell

# Custom dataset wrapper
class CustomDataset(torch.utils.data.Dataset):
    """Expose a sized, indexable container (`data`) through the Dataset protocol."""

    def __init__(self, data):
        self.data = data

    def __len__(self):
        """Number of samples, i.e. ``len(data)``."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the sample stored at position `index`."""
        sample = self.data[index]
        return sample

# Wrap the one-hot matrix in a Dataset and serve it in shuffled batches.
data = one_hot_data  # Replace this with your own data
dataset = CustomDataset(data)
dataloader = torch.utils.data.DataLoader(
    dataset=dataset,
    batch_size=batch_size,
    shuffle=True,
)

# Discriminator
class Discriminator(nn.Module):
    """Three-layer MLP that scores a sample with a single raw logit (no sigmoid).

    Args:
        input_dim: Width of each input vector. Defaults to 47, the value that used to
            be hard-coded here.
        hidden_dim: Width of both hidden layers. ``None`` (default) falls back to the
            module-level ``hidden_size``, which is what the layers always used before.
    """

    def __init__(self, input_dim=47, hidden_dim=None):
        super().__init__()
        if hidden_dim is None:
            # Looked up at construction time, exactly like the original global read.
            hidden_dim = hidden_size
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Map inputs of shape ``(..., input_dim)`` to logits of shape ``(..., 1)``."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # No activation on the last layer: the output is a logit.
        x = self.fc3(x)
        return x

# Generator
class Generator(nn.Module):
    """Three-layer MLP that maps latent noise to an unnormalised output vector.

    Args:
        input_dim: Width of the latent vector. ``None`` (default) falls back to the
            module-level ``latent_size``, which is what the first layer always used.
        output_dim: Width of each generated vector. Defaults to 47, the value that used
            to be hard-coded here.
        hidden_dim: Width of both hidden layers. ``None`` (default) falls back to the
            module-level ``hidden_size``.
    """

    def __init__(self, input_dim=None, output_dim=47, hidden_dim=None):
        super().__init__()
        # Globals are read at construction time, exactly like the original code did.
        if input_dim is None:
            input_dim = latent_size
        if hidden_dim is None:
            hidden_dim = hidden_size
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Map latents of shape ``(..., input_dim)`` to outputs ``(..., output_dim)``."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # No activation on the last layer: the output values are unbounded.
        x = self.fc3(x)
        return x

# Build both networks (discriminator first, then generator) on the selected device
D, G = Discriminator().to(device), Generator().to(device)

# Loss on raw logits, plus one Adam optimizer per network
criterion = nn.BCEWithLogitsLoss()
d_optimizer = torch.optim.Adam(params=D.parameters(), lr=2e-4)
g_optimizer = torch.optim.Adam(params=G.parameters(), lr=1.999e-4)

# Schedulers: lower a network's learning rate once its monitored value stops decreasing
d_scheduler = lr_scheduler.ReduceLROnPlateau(d_optimizer, mode='min')
g_scheduler = lr_scheduler.ReduceLROnPlateau(g_optimizer, mode='min')

# Training loop: alternate one discriminator update and one generator update per batch.
#
# NOTE(review): both ReduceLROnPlateau schedulers are stepped on adversarial losses,
# which are not expected to decrease steadily; the log saved with this cell shows both
# losses flat from roughly epoch 400 onward. Consider whether the schedulers should be
# removed or replaced — behaviour is left unchanged here.
for epoch in range(num_epochs):
    # `real_batch` (not `data`) so the loop does not rebind the module-level `data`.
    for real_batch in dataloader:
        real_data = real_batch.to(device)
        # Size the noise from the batch actually received, so a short final batch
        # (when batch_size does not divide the data set) is matched one-to-one.
        n_samples = real_data.size(0)

        # 1. Train the discriminator
        D.zero_grad()
        real_score = D(real_data)
        real_loss = criterion(real_score, torch.ones_like(real_score))

        # Generate fake data. It is detached for this step: only D is updated here, so
        # back-propagating into G would be wasted work (G's gradients are cleared
        # before its own step anyway).
        latent = torch.randn(n_samples, latent_size, device=device)
        fake_data = G(latent).detach()
        fake_score = D(fake_data)
        fake_loss = criterion(fake_score, torch.zeros_like(fake_score))

        # Compute loss and backpropagate
        d_loss = real_loss + fake_loss
        d_loss.backward()
        d_optimizer.step()
        d_scheduler.step(d_loss.item())  # Update learning rate

        # 2. Train the generator: fresh noise, scored by the just-updated discriminator
        G.zero_grad()
        latent = torch.randn(n_samples, latent_size, device=device)
        fake_data = G(latent)
        fake_score = D(fake_data)

        # The generator is rewarded when D labels its samples as real (target = 1).
        g_loss = criterion(fake_score, torch.ones_like(fake_score))
        g_loss.backward()
        g_optimizer.step()
        g_scheduler.step(g_loss.item())  # Update learning rate

    # Print the most recent losses every 200 epochs
    if (epoch+1) % 200 == 0:
        print('Epoch [{}/{}], D_loss: {:.4f}, G_loss: {:.4f}'.format(epoch+1, num_epochs, d_loss.item(), g_loss.item()))


Epoch [200/100000], D_loss: 3.8957, G_loss: 0.0431
Epoch [400/100000], D_loss: 6.1195, G_loss: 0.0075
Epoch [600/100000], D_loss: 6.0832, G_loss: 0.0074
Epoch [800/100000], D_loss: 6.1410, G_loss: 0.0071
Epoch [1000/100000], D_loss: 6.2362, G_loss: 0.0083
Epoch [1200/100000], D_loss: 6.1514, G_loss: 0.0065
Epoch [1400/100000], D_loss: 6.0868, G_loss: 0.0068
Epoch [1600/100000], D_loss: 6.1437, G_loss: 0.0073
Epoch [1800/100000], D_loss: 6.1423, G_loss: 0.0068
Epoch [2000/100000], D_loss: 6.2221, G_loss: 0.0074
Epoch [2200/100000], D_loss: 6.2058, G_loss: 0.0074
Epoch [2400/100000], D_loss: 6.1394, G_loss: 0.0068
Epoch [2600/100000], D_loss: 6.2495, G_loss: 0.0072
Epoch [2800/100000], D_loss: 6.2265, G_loss: 0.0072
Epoch [3000/100000], D_loss: 6.1758, G_loss: 0.0075
Epoch [3200/100000], D_loss: 6.1714, G_loss: 0.0073
Epoch [3400/100000], D_loss: 6.1211, G_loss: 0.0076
Epoch [3600/100000], D_loss: 6.2072, G_loss: 0.0065
Epoch [3800/100000], D_loss: 6.1053, G_loss: 0.0078
Epoch [4000/1000

KeyboardInterrupt: 

In [None]:
# Scratch cell: view `recon_x` as 16 rows of 3 values and take the arg-max of each row.
# NOTE(review): `recon_x` is not assigned in any cell visible in this notebook.
torch.argmax(recon_x.view(16,3), axis=1)

In [None]:
# Scratch cell: view `x` as 16 rows of 3 values and take the arg-max of each row.
# NOTE(review): `x` is not assigned in any cell visible in this notebook.
torch.argmax(x.view(16,3), axis=1)

In [None]:
# Scratch cell: inspect the type of `x`.
# NOTE(review): `x` is not assigned in any cell visible in this notebook.
type(x)

In [None]:
# Scratch cell: call `model` on `x`.
# NOTE(review): neither `model` nor `x` is assigned in any cell visible in this notebook.
model(x)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# Load the data
data = torch.from_numpy(X)

# Preprocess the data by one-hot encoding the categories.
# Every entry is expanded into `data.max() + 1` indicator slots. For 1-D input this is
# the same (N, max+1) matrix the index-assignment version produced; for 2-D input
# (one row per sample, one column per variable) the per-column blocks are laid side by
# side in each row. The index-assignment version could not handle that case, because a
# length-N row index does not broadcast against an (N, columns) index.
# Note: this rebinds `one_hot_data`, which the earlier GAN cell also reads.
num_categories = int(data.max()) + 1
one_hot_data = F.one_hot(data.long(), num_classes=num_categories).flatten(start_dim=1).float()

# Define the generator network
# (this rebinds the name `Generator` that the earlier GAN cell also defines)
class Generator(nn.Module):
    """Two-layer MLP: `input_dim` -> 32 (ReLU) -> `output_dim`, no final activation."""

    def __init__(self, input_dim, output_dim):
        super().__init__()

        # Remember the sizes so callers can query them later.
        self.input_dim, self.output_dim = input_dim, output_dim

        self.fc1 = nn.Linear(in_features=input_dim, out_features=32)
        self.fc2 = nn.Linear(in_features=32, out_features=output_dim)

    def forward(self, z):
        """Return `fc2(relu(fc1(z)))`."""
        hidden = self.fc1(z).relu()
        return self.fc2(hidden)

# Define the discriminator network
# (this rebinds the name `Discriminator` that the earlier GAN cell also defines)
class Discriminator(nn.Module):
    """Two-layer MLP: `input_dim` -> 32 (ReLU) -> 1, returning a raw score (no sigmoid)."""

    def __init__(self, input_dim):
        super().__init__()

        # Remember the input width so callers can query it later.
        self.input_dim = input_dim

        self.fc1 = nn.Linear(in_features=input_dim, out_features=32)
        self.fc2 = nn.Linear(in_features=32, out_features=1)

    def forward(self, x):
        """Return `fc2(relu(fc1(x)))`."""
        hidden = self.fc1(x).relu()
        return self.fc2(hidden)

# Instantiate the generator and discriminator networks; both are sized from the width
# of the one-hot matrix, and the generator takes 100-dimensional noise.
feature_dim = one_hot_data.size(1)
generator = Generator(100, feature_dim)
discriminator = Discriminator(feature_dim)

# Define one Adam optimizer per network (the loss is computed inline during training)
optimizer_G = optim.Adam(params=generator.parameters(), lr=0.001)
optimizer_D = optim.Adam(params=discriminator.parameters(), lr=0.001)

# Train the GAN: 10 passes over the data, one real sample (one row) per update.
noise_dim = generator.input_dim  # noise width the generator was built with

for epoch in range(10):
    for real_data in one_hot_data:
        # Generate fake data
        noise = torch.randn(noise_dim)
        fake_data = generator(noise)

        # Train the discriminator.
        # The discriminator returns raw scores (its last layer has no sigmoid), so the
        # logits form of binary cross-entropy is required; plain `binary_cross_entropy`
        # only accepts probabilities in [0, 1].
        # The fake sample is detached here so this backward pass stays inside the
        # discriminator and leaves the generator's graph intact for the step below.
        optimizer_D.zero_grad()
        real_logit = discriminator(real_data)
        fake_logit = discriminator(fake_data.detach())
        loss_real = F.binary_cross_entropy_with_logits(real_logit, torch.ones_like(real_logit))
        loss_fake = F.binary_cross_entropy_with_logits(fake_logit, torch.zeros_like(fake_logit))
        loss_D = (loss_real + loss_fake) / 2
        loss_D.backward()
        optimizer_D.step()

        # Train the generator: it is rewarded when the (just updated) discriminator
        # labels its sample as real (target = 1).
        optimizer_G.zero_grad()
        generated_logit = discriminator(fake_data)
        loss_G = F.binary_cross_entropy_with_logits(generated_logit, torch.ones_like(generated_logit))
        loss_G.backward()
        optimizer_G.step()

# Use the generator to generate new variables
new_variables = generator(torch.randn(noise_dim))


In [None]:
import numpy as np
from sklearn.manifold import TSNE

# Load the data
data = np.load('categorical_data.npy')

# Preprocess the data by one-hot encoding the categories:
# one row per element of `data`, one column per value in 0 .. data.max().
n_elements = data.size
n_categories = data.max() + 1
one_hot_data = np.zeros((n_elements, n_categories))
one_hot_data[np.arange(n_elements), data] = 1

# Use t-SNE to reduce the dimensionality of the data to two components
tsne = TSNE(n_components=2)
new_variables = tsne.fit_transform(one_hot_data)
