# 01. EDA

In [15]:
import warnings
warnings.filterwarnings(action='ignore')

import os
import gc
import math
import random
import pickle
import pandas as pd
import numpy as np
import multiprocessing
from tqdm.auto import tqdm

from sklearn.metrics import f1_score

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset, sampler

from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from imblearn.over_sampling import SMOTE, SMOTENC, SMOTEN

import shap
import catboost

pd.set_option('display.max_columns', 500)

In [16]:
# Columns 5..19 of both CSVs are one-hot encoded below (the SNP_01..SNP_15 genotype columns).
SNP_SLICE = slice(5, 20)

# The raw frames are named *_raw so they do not collide with the `train` / `test`
# functions defined further down in this notebook.
train_raw = pd.read_csv("./data/train.csv")
test_raw = pd.read_csv("./data/test.csv")

# Drop rows whose SNP genotypes are all identical to an earlier row (first occurrence kept).
snp_cols = train_raw.columns.tolist()[5:20]
train_dedup = train_raw.drop_duplicates(subset=snp_cols, ignore_index=True)

# Fixed uint8 dtype keeps the encoding identical across pandas versions
# (newer releases default to bool dummies).
train_dummies = pd.get_dummies(train_dedup.iloc[:, SNP_SLICE], dtype='uint8')
train2 = pd.concat([train_dummies, train_dedup['class']], axis=1)

# Align the test matrix to the training columns: a genotype missing from test becomes an
# all-zero column, a genotype unseen in training is dropped, and column order always matches.
test2 = (
    pd.get_dummies(test_raw.iloc[:, SNP_SLICE], dtype='uint8')
    .reindex(columns=train_dummies.columns, fill_value=0)
    .astype('uint8')
)

train2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256 entries, 0 to 255
Data columns (total 46 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   SNP_01_A A  256 non-null    uint8 
 1   SNP_01_A G  256 non-null    uint8 
 2   SNP_01_G G  256 non-null    uint8 
 3   SNP_02_A A  256 non-null    uint8 
 4   SNP_02_A G  256 non-null    uint8 
 5   SNP_02_G G  256 non-null    uint8 
 6   SNP_03_A A  256 non-null    uint8 
 7   SNP_03_C A  256 non-null    uint8 
 8   SNP_03_C C  256 non-null    uint8 
 9   SNP_04_A A  256 non-null    uint8 
 10  SNP_04_G A  256 non-null    uint8 
 11  SNP_04_G G  256 non-null    uint8 
 12  SNP_05_A A  256 non-null    uint8 
 13  SNP_05_C A  256 non-null    uint8 
 14  SNP_05_C C  256 non-null    uint8 
 15  SNP_06_A A  256 non-null    uint8 
 16  SNP_06_A G  256 non-null    uint8 
 17  SNP_06_G G  256 non-null    uint8 
 18  SNP_07_A A  256 non-null    uint8 
 19  SNP_07_G A  256 non-null    uint8 
 20  SNP_07_G G

In [17]:
# Column / dtype overview of the one-hot encoded test features, for comparison with train2.info().
test2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175 entries, 0 to 174
Data columns (total 45 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   SNP_01_A A  175 non-null    uint8
 1   SNP_01_A G  175 non-null    uint8
 2   SNP_01_G G  175 non-null    uint8
 3   SNP_02_A A  175 non-null    uint8
 4   SNP_02_A G  175 non-null    uint8
 5   SNP_02_G G  175 non-null    uint8
 6   SNP_03_A A  175 non-null    uint8
 7   SNP_03_C A  175 non-null    uint8
 8   SNP_03_C C  175 non-null    uint8
 9   SNP_04_A A  175 non-null    uint8
 10  SNP_04_G A  175 non-null    uint8
 11  SNP_04_G G  175 non-null    uint8
 12  SNP_05_A A  175 non-null    uint8
 13  SNP_05_C A  175 non-null    uint8
 14  SNP_05_C C  175 non-null    uint8
 15  SNP_06_A A  175 non-null    uint8
 16  SNP_06_A G  175 non-null    uint8
 17  SNP_06_G G  175 non-null    uint8
 18  SNP_07_A A  175 non-null    uint8
 19  SNP_07_G A  175 non-null    uint8
 20  SNP_07_G G  175 non-null    uint

In [18]:
# Summary statistics of the 0/1 dummy columns; each column mean is the share of rows with that genotype.
train2.describe()

Unnamed: 0,SNP_01_A A,SNP_01_A G,SNP_01_G G,SNP_02_A A,SNP_02_A G,SNP_02_G G,SNP_03_A A,SNP_03_C A,SNP_03_C C,SNP_04_A A,SNP_04_G A,SNP_04_G G,SNP_05_A A,SNP_05_C A,SNP_05_C C,SNP_06_A A,SNP_06_A G,SNP_06_G G,SNP_07_A A,SNP_07_G A,SNP_07_G G,SNP_08_A A,SNP_08_G A,SNP_08_G G,SNP_09_A A,SNP_09_G A,SNP_09_G G,SNP_10_A A,SNP_10_A G,SNP_10_G G,SNP_11_A A,SNP_11_A G,SNP_11_G G,SNP_12_A A,SNP_12_G A,SNP_12_G G,SNP_13_A A,SNP_13_A G,SNP_13_G G,SNP_14_A A,SNP_14_C A,SNP_14_C C,SNP_15_A A,SNP_15_G A,SNP_15_G G
count,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0
mean,0.175781,0.277344,0.546875,0.222656,0.378906,0.398438,0.457031,0.355469,0.1875,0.46875,0.355469,0.175781,0.347656,0.335938,0.316406,0.234375,0.476562,0.289062,0.632812,0.1875,0.179688,0.324219,0.371094,0.304688,0.707031,0.210938,0.082031,0.164062,0.25,0.585938,0.320312,0.371094,0.308594,0.527344,0.28125,0.191406,0.199219,0.371094,0.429688,0.714844,0.199219,0.085938,0.417969,0.378906,0.203125
std,0.38138,0.448565,0.498773,0.416844,0.486065,0.490535,0.499126,0.479593,0.391077,0.5,0.479593,0.38138,0.477159,0.473242,0.465984,0.424437,0.500429,0.454215,0.482982,0.391077,0.384679,0.468999,0.484044,0.461177,0.456016,0.408773,0.27495,0.371058,0.433861,0.493524,0.467511,0.484044,0.462818,0.50023,0.45049,0.394179,0.400195,0.484044,0.496001,0.452373,0.400195,0.280821,0.494191,0.486065,0.403113
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.25,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [19]:
# Class distribution among rows whose SNP_03 genotype is not "C C".
train2.loc[train2['SNP_03_C C'].eq(0), 'class'].value_counts()

B    82
A    64
C    62
Name: class, dtype: int64

In [20]:
# Genotype frequencies restricted to class A rows.
train2.loc[train2['class'].eq('A')].describe()

Unnamed: 0,SNP_01_A A,SNP_01_A G,SNP_01_G G,SNP_02_A A,SNP_02_A G,SNP_02_G G,SNP_03_A A,SNP_03_C A,SNP_03_C C,SNP_04_A A,SNP_04_G A,SNP_04_G G,SNP_05_A A,SNP_05_C A,SNP_05_C C,SNP_06_A A,SNP_06_A G,SNP_06_G G,SNP_07_A A,SNP_07_G A,SNP_07_G G,SNP_08_A A,SNP_08_G A,SNP_08_G G,SNP_09_A A,SNP_09_G A,SNP_09_G G,SNP_10_A A,SNP_10_A G,SNP_10_G G,SNP_11_A A,SNP_11_A G,SNP_11_G G,SNP_12_A A,SNP_12_G A,SNP_12_G G,SNP_13_A A,SNP_13_A G,SNP_13_G G,SNP_14_A A,SNP_14_C A,SNP_14_C C,SNP_15_A A,SNP_15_G A,SNP_15_G G
count,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0
mean,0.625,0.328125,0.046875,0.0,0.09375,0.90625,1.0,0.0,0.0,0.03125,0.46875,0.5,0.78125,0.21875,0.0,0.0,0.3125,0.6875,0.0,0.28125,0.71875,0.84375,0.15625,0.0,0.203125,0.484375,0.3125,0.484375,0.46875,0.046875,0.0,0.1875,0.8125,0.015625,0.296875,0.6875,0.0,0.046875,0.953125,0.15625,0.53125,0.3125,0.0625,0.296875,0.640625
std,0.48795,0.473242,0.213042,0.0,0.293785,0.293785,0.0,0.0,0.0,0.175368,0.502967,0.503953,0.416667,0.416667,0.0,0.0,0.467177,0.467177,0.0,0.453163,0.453163,0.365963,0.365963,0.0,0.405505,0.503706,0.467177,0.503706,0.502967,0.213042,0.0,0.393398,0.393398,0.125,0.460493,0.467177,0.0,0.213042,0.213042,0.365963,0.502967,0.467177,0.243975,0.460493,0.48361
min,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.5,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
75%,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0
max,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [21]:
# Genotype frequencies restricted to class B rows.
train2.loc[train2['class'].eq('B')].describe()

Unnamed: 0,SNP_01_A A,SNP_01_A G,SNP_01_G G,SNP_02_A A,SNP_02_A G,SNP_02_G G,SNP_03_A A,SNP_03_C A,SNP_03_C C,SNP_04_A A,SNP_04_G A,SNP_04_G G,SNP_05_A A,SNP_05_C A,SNP_05_C C,SNP_06_A A,SNP_06_A G,SNP_06_G G,SNP_07_A A,SNP_07_G A,SNP_07_G G,SNP_08_A A,SNP_08_G A,SNP_08_G G,SNP_09_A A,SNP_09_G A,SNP_09_G G,SNP_10_A A,SNP_10_A G,SNP_10_G G,SNP_11_A A,SNP_11_A G,SNP_11_G G,SNP_12_A A,SNP_12_G A,SNP_12_G G,SNP_13_A A,SNP_13_A G,SNP_13_G G,SNP_14_A A,SNP_14_C A,SNP_14_C C,SNP_15_A A,SNP_15_G A,SNP_15_G G
count,113.0,113.0,113.0,113.0,113.0,113.0,113.0,113.0,113.0,113.0,113.0,113.0,113.0,113.0,113.0,113.0,113.0,113.0,113.0,113.0,113.0,113.0,113.0,113.0,113.0,113.0,113.0,113.0,113.0,113.0,113.0,113.0,113.0,113.0,113.0,113.0,113.0,113.0,113.0,113.0,113.0,113.0,113.0,113.0,113.0
mean,0.0,0.221239,0.778761,0.168142,0.513274,0.318584,0.212389,0.513274,0.274336,0.424779,0.460177,0.115044,0.061947,0.309735,0.628319,0.283186,0.539823,0.176991,0.920354,0.079646,0.0,0.070796,0.362832,0.566372,0.79646,0.19469,0.00885,0.0,0.035398,0.964602,0.300885,0.486726,0.212389,0.681416,0.283186,0.035398,0.371681,0.469027,0.159292,1.0,0.0,0.0,0.628319,0.345133,0.026549
std,0.0,0.41693,0.41693,0.375658,0.50205,0.468003,0.410821,0.50205,0.448167,0.496511,0.500632,0.320497,0.242133,0.464444,0.485406,0.452553,0.500632,0.383361,0.27195,0.27195,0.0,0.257627,0.482959,0.497783,0.404424,0.397726,0.094072,0.0,0.185607,0.185607,0.460686,0.50205,0.410821,0.468003,0.452553,0.185607,0.485406,0.501263,0.367578,0.0,0.0,0.0,0.485406,0.477529,0.161476
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
75%,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
max,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0


In [22]:
# Genotype frequencies restricted to class C rows.
train2.loc[train2['class'].eq('C')].describe()

Unnamed: 0,SNP_01_A A,SNP_01_A G,SNP_01_G G,SNP_02_A A,SNP_02_A G,SNP_02_G G,SNP_03_A A,SNP_03_C A,SNP_03_C C,SNP_04_A A,SNP_04_G A,SNP_04_G G,SNP_05_A A,SNP_05_C A,SNP_05_C C,SNP_06_A A,SNP_06_A G,SNP_06_G G,SNP_07_A A,SNP_07_G A,SNP_07_G G,SNP_08_A A,SNP_08_G A,SNP_08_G G,SNP_09_A A,SNP_09_G A,SNP_09_G G,SNP_10_A A,SNP_10_A G,SNP_10_G G,SNP_11_A A,SNP_11_A G,SNP_11_G G,SNP_12_A A,SNP_12_G A,SNP_12_G G,SNP_13_A A,SNP_13_A G,SNP_13_G G,SNP_14_A A,SNP_14_C A,SNP_14_C C,SNP_15_A A,SNP_15_G A,SNP_15_G G
count,79.0,79.0,79.0,79.0,79.0,79.0,79.0,79.0,79.0,79.0,79.0,79.0,79.0,79.0,79.0,79.0,79.0,79.0,79.0,79.0,79.0,79.0,79.0,79.0,79.0,79.0,79.0,79.0,79.0,79.0,79.0,79.0,79.0,79.0,79.0,79.0,79.0,79.0,79.0,79.0,79.0,79.0,79.0,79.0,79.0
mean,0.063291,0.316456,0.620253,0.481013,0.417722,0.101266,0.367089,0.417722,0.21519,0.886076,0.113924,0.0,0.405063,0.468354,0.126582,0.35443,0.518987,0.126582,0.734177,0.265823,0.0,0.265823,0.556962,0.177215,0.987342,0.012658,0.0,0.139241,0.379747,0.481013,0.607595,0.35443,0.037975,0.721519,0.265823,0.012658,0.113924,0.493671,0.392405,0.759494,0.21519,0.025316,0.405063,0.493671,0.101266
std,0.245042,0.468065,0.488425,0.502832,0.496335,0.303608,0.485091,0.496335,0.41358,0.319749,0.319749,0.0,0.494041,0.502186,0.334629,0.481397,0.502832,0.334629,0.444593,0.444593,0.0,0.444593,0.499919,0.384291,0.112509,0.112509,0.0,0.348409,0.488425,0.502832,0.491406,0.481397,0.192356,0.451116,0.444593,0.112509,0.319749,0.503155,0.491406,0.430122,0.41358,0.158088,0.494041,0.503155,0.303608
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [23]:
# Same summary for the test features, to compare genotype frequencies against the training set.
test2.describe()

Unnamed: 0,SNP_01_A A,SNP_01_A G,SNP_01_G G,SNP_02_A A,SNP_02_A G,SNP_02_G G,SNP_03_A A,SNP_03_C A,SNP_03_C C,SNP_04_A A,SNP_04_G A,SNP_04_G G,SNP_05_A A,SNP_05_C A,SNP_05_C C,SNP_06_A A,SNP_06_A G,SNP_06_G G,SNP_07_A A,SNP_07_G A,SNP_07_G G,SNP_08_A A,SNP_08_G A,SNP_08_G G,SNP_09_A A,SNP_09_G A,SNP_09_G G,SNP_10_A A,SNP_10_A G,SNP_10_G G,SNP_11_A A,SNP_11_A G,SNP_11_G G,SNP_12_A A,SNP_12_G A,SNP_12_G G,SNP_13_A A,SNP_13_A G,SNP_13_G G,SNP_14_A A,SNP_14_C A,SNP_14_C C,SNP_15_A A,SNP_15_G A,SNP_15_G G
count,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0
mean,0.222857,0.297143,0.48,0.211429,0.325714,0.462857,0.468571,0.388571,0.142857,0.365714,0.434286,0.2,0.297143,0.348571,0.354286,0.274286,0.457143,0.268571,0.577143,0.228571,0.194286,0.365714,0.297143,0.337143,0.685714,0.245714,0.068571,0.177143,0.268571,0.554286,0.234286,0.388571,0.377143,0.531429,0.274286,0.194286,0.234286,0.331429,0.434286,0.685714,0.188571,0.125714,0.388571,0.388571,0.222857
std,0.417357,0.458311,0.501033,0.409493,0.469986,0.500049,0.500443,0.488824,0.350931,0.483012,0.497085,0.401148,0.458311,0.477885,0.479669,0.447434,0.499589,0.444488,0.495431,0.421117,0.396785,0.483012,0.458311,0.474091,0.465563,0.431745,0.253449,0.382885,0.444488,0.498471,0.424767,0.488824,0.486062,0.500443,0.447434,0.396785,0.424767,0.472077,0.497085,0.465563,0.39229,0.332478,0.488824,0.488824,0.417357
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [24]:
class Autoencoder(nn.Module):
    """Fully connected autoencoder with GELU activations.

    Layer widths: input_dim -> 32 -> 8 -> encoding_dim -> 8 -> 32 -> input_dim.
    The bottleneck is followed by a GELU; the reconstruction layer has no activation.

    Args:
        encoding_dim: width of the bottleneck (the learned encoding).
        input_dim: number of input features. Defaults to 45, the width that was
            previously hard-coded, so existing ``Autoencoder(encoding_dim)`` calls
            behave exactly as before.
    """

    def __init__(self, encoding_dim, input_dim=45):
        super().__init__()
        self.encoding_dim = encoding_dim
        self.input_dim = input_dim
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 32),
            nn.GELU(),
            nn.Linear(32, 8),
            nn.GELU(),
            nn.Linear(8, encoding_dim),
            nn.GELU(),
        )
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, 8),
            nn.GELU(),
            nn.Linear(8, 32),
            nn.GELU(),
            nn.Linear(32, input_dim)
        )

    def forward(self, x):
        """Return ``(encoding, reconstruction)`` for a batch ``x`` with ``input_dim`` features."""
        x1 = self.encoder(x)
        x2 = self.decoder(x1)
        return x1, x2

def ae_train(model, data_loader, criterion, optimizer, device, epochs=10):
    """Train an autoencoder in place and print the mean batch loss after every epoch.

    Args:
        model: module whose forward returns ``(encoding, reconstruction)``.
        data_loader: iterable of batches; the input tensor is taken from index 0 of
            each batch (the layout produced by a ``TensorDataset`` loader).
        criterion: callable ``criterion(reconstruction, input)`` returning a scalar loss.
        optimizer: optimizer bound to ``model``'s parameters.
        device: device the model and every batch are moved to.
        epochs: number of passes over ``data_loader``.

    Returns:
        None. The model is updated in place.
    """
    model.to(device)
    # Make sure layers with train/eval-specific behaviour are in training mode,
    # even if the caller evaluated the model beforehand.
    model.train()
    for epoch in range(epochs):
        epoch_loss = 0.0
        for batch in data_loader:
            x = batch[0].to(device)
            _, x_hat = model(x)
            loss = criterion(x_hat, x)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        print(f'Epoch {epoch}: loss = {epoch_loss / len(data_loader):.4f}')

In [25]:
class VAE(nn.Module):
    """Variational autoencoder made of fully connected ReLU layers.

    Encoder widths: input_dim -> 2*input_dim -> input_dim -> input_dim//2, followed by
    two linear heads (mean and log-variance) of size ``latent_dim``. The decoder mirrors
    the encoder and ends in a linear layer of width ``input_dim``.
    """

    def __init__(self, input_dim, latent_dim):
        super().__init__()
        self.input_dim = input_dim
        self.latent_dim = latent_dim

        wide = input_dim * 2
        half = input_dim // 2

        encoder_layers = [
            nn.Linear(input_dim, wide), nn.ReLU(),
            nn.Linear(wide, input_dim), nn.ReLU(),
            nn.Linear(input_dim, half), nn.ReLU(),
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        # Two heads on top of the shared encoder trunk.
        self.fc_mu = nn.Linear(half, latent_dim)
        self.fc_logvar = nn.Linear(half, latent_dim)

        decoder_layers = [
            nn.Linear(latent_dim, half), nn.ReLU(),
            nn.Linear(half, input_dim), nn.ReLU(),
            nn.Linear(input_dim, wide), nn.ReLU(),
            nn.Linear(wide, input_dim),
        ]
        self.decoder = nn.Sequential(*decoder_layers)

    def encode(self, x):
        """Return ``(mu, logvar)`` of the approximate posterior for batch ``x``."""
        hidden = self.encoder(x)
        mu = self.fc_mu(hidden)
        logvar = self.fc_logvar(hidden)
        return mu, logvar

    def reparameterize(self, mu, logvar):
        """Draw ``z = mu + eps * std`` with ``eps`` standard normal and ``std = exp(0.5 * logvar)``."""
        sigma = torch.exp(0.5*logvar)
        noise = torch.randn_like(sigma)
        return mu + noise*sigma

    def decode(self, z):
        """Map latent codes ``z`` back to input space."""
        return self.decoder(z)

    def forward(self, x):
        """Return ``(reconstruction, mu, logvar)``; the reconstruction is decoded from a sampled ``z``."""
        mu, logvar = self.encode(x)
        latent = self.reparameterize(mu, logvar)
        return self.decode(latent), mu, logvar

def loss_fn(recon_x, x, mu, logvar):
    """VAE objective: summed squared reconstruction error plus the KL term.

    Both terms are sums over every element of the batch (no averaging), where the KL
    term is ``-0.5 * sum(1 + logvar - mu^2 - exp(logvar))``.
    """
    recon_term = F.mse_loss(input=recon_x, target=x, reduction='sum')
    kl_inner = 1 + logvar - mu.pow(2) - logvar.exp()
    kl_term = -0.5 * torch.sum(kl_inner)
    return recon_term + kl_term

def train(model, optimizer, train_loader, device):
    """Run one optimisation epoch of a VAE and return the mean loss per sample.

    Args:
        model: module whose forward returns ``(reconstruction, mu, logvar)``.
        optimizer: optimizer bound to ``model``'s parameters.
        train_loader: DataLoader yielding either raw tensors or list/tuple batches
            (e.g. from a ``TensorDataset``), in which case element 0 is the input.
        device: device each batch is moved to.

    Returns:
        Sum of the batch losses divided by ``len(train_loader.dataset)``; ``loss_fn``
        sums over the batch, so this is a per-sample average.
    """
    model.train()
    train_loss = 0.0
    for data in train_loader:
        # Accept both loader styles used in this notebook (raw tensor vs. TensorDataset).
        if isinstance(data, (list, tuple)):
            data = data[0]
        data = data.to(device)
        optimizer.zero_grad()
        recon_batch, mu, logvar = model(data)
        loss = loss_fn(recon_batch, data, mu, logvar)
        loss.backward()
        train_loss += loss.item()

        optimizer.step()

    return train_loss / len(train_loader.dataset)

def test(model, test_loader, device):
    """Evaluate a VAE without gradient tracking and return the mean loss per sample.

    Args:
        model: module whose forward returns ``(reconstruction, mu, logvar)``.
        test_loader: DataLoader yielding either raw tensors or list/tuple batches
            (e.g. from a ``TensorDataset``), in which case element 0 is the input.
        device: device each batch is moved to.

    Returns:
        Sum of the batch losses divided by ``len(test_loader.dataset)``.
    """
    model.eval()
    test_loss = 0.0
    with torch.no_grad():
        for data in test_loader:
            # Accept both loader styles used in this notebook (raw tensor vs. TensorDataset).
            if isinstance(data, (list, tuple)):
                data = data[0]
            data = data.to(device)
            recon_batch, mu, logvar = model(data)
            test_loss += loss_fn(recon_batch, data, mu, logvar).item()
    return test_loss / len(test_loader.dataset)

In [26]:
# Seed every RNG involved so the weight init and batch shuffling repeat on a fresh run.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
X = torch.Tensor(train2.drop(columns=['class']).to_numpy())

# Create a dataset and data loader
dataset = torch.utils.data.TensorDataset(X)
data_loader = torch.utils.data.DataLoader(dataset, batch_size=256, shuffle=True)

# Initialize the model, criterion, and optimizer
encoding_dim = 12
model = Autoencoder(encoding_dim)
criterion = nn.MSELoss().to(device)
optimizer = optim.Adam(model.parameters())
epochs = 1000

# Train the model
ae_train(model, data_loader, criterion, optimizer, device, epochs)

Epoch 0: loss = 0.3332
Epoch 1: loss = 0.3306
Epoch 2: loss = 0.3280
Epoch 3: loss = 0.3255
Epoch 4: loss = 0.3230
Epoch 5: loss = 0.3206
Epoch 6: loss = 0.3182
Epoch 7: loss = 0.3158
Epoch 8: loss = 0.3134
Epoch 9: loss = 0.3110
Epoch 10: loss = 0.3087
Epoch 11: loss = 0.3064
Epoch 12: loss = 0.3042
Epoch 13: loss = 0.3019
Epoch 14: loss = 0.2997
Epoch 15: loss = 0.2975
Epoch 16: loss = 0.2953
Epoch 17: loss = 0.2931
Epoch 18: loss = 0.2909
Epoch 19: loss = 0.2887
Epoch 20: loss = 0.2865
Epoch 21: loss = 0.2844
Epoch 22: loss = 0.2822
Epoch 23: loss = 0.2800
Epoch 24: loss = 0.2778
Epoch 25: loss = 0.2756
Epoch 26: loss = 0.2734
Epoch 27: loss = 0.2711
Epoch 28: loss = 0.2688
Epoch 29: loss = 0.2665
Epoch 30: loss = 0.2642
Epoch 31: loss = 0.2618
Epoch 32: loss = 0.2593
Epoch 33: loss = 0.2568
Epoch 34: loss = 0.2542
Epoch 35: loss = 0.2516
Epoch 36: loss = 0.2489
Epoch 37: loss = 0.2462
Epoch 38: loss = 0.2434
Epoch 39: loss = 0.2406
Epoch 40: loss = 0.2379
Epoch 41: loss = 0.2351
Ep

In [27]:
X1 = X.to(device)
X2 = torch.Tensor(test2.to_numpy()).to(device)

# Inference only: eval mode and no autograd graph.
model.eval()
with torch.no_grad():
    pred_train = model(X1)  # (encoding, reconstruction)
    pred_test = model(X2)

    # reconstruction error: mean squared error per row, computed in one vectorised call
    loss_train = F.mse_loss(pred_train[1], X1, reduction='none').mean(dim=1).cpu().numpy()
    loss_test = F.mse_loss(pred_test[1], X2, reduction='none').mean(dim=1).cpu().numpy()

# encoding values
enco_train = pred_train[0].cpu().numpy()
enco_test = pred_test[0].cpu().numpy()

trainLoss = pd.DataFrame(data=loss_train, columns=['ae_loss'])
testLoss = pd.DataFrame(data=loss_test, columns=['ae_loss'])

# Named *_df so the frames do not overwrite the `ae_train` training function.
ae_cols = ['ae_'+str(x) for x in range(enco_train.shape[1])]
ae_train_df = pd.DataFrame(data=enco_train, columns=ae_cols)
ae_test_df = pd.DataFrame(data=enco_test, columns=ae_cols)

train3 = pd.concat([train2, trainLoss, ae_train_df], axis=1)
test3 = pd.concat([test2, testLoss, ae_test_df], axis=1)

# .info() prints and returns None, so call each on its own line instead of building a tuple.
train3.info()
test3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256 entries, 0 to 255
Data columns (total 59 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   SNP_01_A A  256 non-null    uint8  
 1   SNP_01_A G  256 non-null    uint8  
 2   SNP_01_G G  256 non-null    uint8  
 3   SNP_02_A A  256 non-null    uint8  
 4   SNP_02_A G  256 non-null    uint8  
 5   SNP_02_G G  256 non-null    uint8  
 6   SNP_03_A A  256 non-null    uint8  
 7   SNP_03_C A  256 non-null    uint8  
 8   SNP_03_C C  256 non-null    uint8  
 9   SNP_04_A A  256 non-null    uint8  
 10  SNP_04_G A  256 non-null    uint8  
 11  SNP_04_G G  256 non-null    uint8  
 12  SNP_05_A A  256 non-null    uint8  
 13  SNP_05_C A  256 non-null    uint8  
 14  SNP_05_C C  256 non-null    uint8  
 15  SNP_06_A A  256 non-null    uint8  
 16  SNP_06_A G  256 non-null    uint8  
 17  SNP_06_G G  256 non-null    uint8  
 18  SNP_07_A A  256 non-null    uint8  
 19  SNP_07_G A  256 non-null    u

(None, None)

In [28]:
# Seed the RNGs so weight init, batch shuffling and latent sampling repeat on a fresh run.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
X = torch.Tensor(train2.drop(columns=['class']).to_numpy())

input_dim = X.shape[1]
latent_dim = 12
batch_size = 64
num_epochs = 1000

train_data = X
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)
# NOTE: the "test" loader iterates the same rows as the train loader, so the reported
# "Test loss" is the training-set loss in eval mode, not a held-out estimate.
test_data = X
test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, shuffle=False)

# `model` and `optimizer` are rebound here: from this cell on they refer to the VAE.
model = VAE(input_dim, latent_dim).to(device)
optimizer = torch.optim.Adam(model.parameters())

for epoch in range(num_epochs):
    train_loss = train(model, optimizer, train_loader, device)
    test_loss = test(model, test_loader, device)
    print(f"Epoch {epoch}: Train loss = {train_loss}, Test loss = {test_loss}")

Epoch 0: Train loss = 14.535635948181152, Test loss = 13.893090724945068
Epoch 1: Train loss = 13.59974980354309, Test loss = 13.023449182510376
Epoch 2: Train loss = 12.665371179580688, Test loss = 12.062449932098389
Epoch 3: Train loss = 11.742890357971191, Test loss = 11.09129023551941
Epoch 4: Train loss = 10.851985931396484, Test loss = 10.396318197250366
Epoch 5: Train loss = 10.167058229446411, Test loss = 9.96699571609497
Epoch 6: Train loss = 9.89912724494934, Test loss = 9.783190727233887
Epoch 7: Train loss = 9.767432928085327, Test loss = 9.489997625350952
Epoch 8: Train loss = 9.491695880889893, Test loss = 9.329458475112915
Epoch 9: Train loss = 9.305176496505737, Test loss = 9.349501132965088
Epoch 10: Train loss = 9.331107139587402, Test loss = 9.255607604980469
Epoch 11: Train loss = 9.283725500106812, Test loss = 9.219717741012573
Epoch 12: Train loss = 9.22048544883728, Test loss = 9.22669005393982
Epoch 13: Train loss = 9.204759120941162, Test loss = 9.2065560817718

In [29]:
X1 = X.to(device)
X2 = torch.Tensor(test2.to_numpy()).to(device)

# The VAE forward pass decodes a randomly sampled z, so seed before computing
# the reconstruction-error feature to make it repeatable.
torch.manual_seed(0)

# Inference only: eval mode and no autograd graph.
model.eval()
with torch.no_grad():
    pred_train = model(X1)  # (reconstruction, mu, logvar)
    pred_test = model(X2)

    # reconstruction error: mean squared error per row, computed in one vectorised call
    loss_train = F.mse_loss(pred_train[0], X1, reduction='none').mean(dim=1).cpu().numpy()
    loss_test = F.mse_loss(pred_test[0], X2, reduction='none').mean(dim=1).cpu().numpy()

# encoding values (posterior means)
enco_train = pred_train[1].cpu().numpy()
enco_test = pred_test[1].cpu().numpy()

trainLoss = pd.DataFrame(data=loss_train, columns=['vae_loss'])
testLoss = pd.DataFrame(data=loss_test, columns=['vae_loss'])

# Named *_df so the frames do not overwrite the `ae_train` training function.
vae_cols = ['vae_'+str(x) for x in range(enco_train.shape[1])]
vae_train_df = pd.DataFrame(data=enco_train, columns=vae_cols)
vae_test_df = pd.DataFrame(data=enco_test, columns=vae_cols)

train4 = pd.concat([train3, trainLoss, vae_train_df], axis=1)
test4 = pd.concat([test3, testLoss, vae_test_df], axis=1)

# .info() prints and returns None, so call each on its own line instead of building a tuple.
train4.info()
test4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256 entries, 0 to 255
Data columns (total 72 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   SNP_01_A A  256 non-null    uint8  
 1   SNP_01_A G  256 non-null    uint8  
 2   SNP_01_G G  256 non-null    uint8  
 3   SNP_02_A A  256 non-null    uint8  
 4   SNP_02_A G  256 non-null    uint8  
 5   SNP_02_G G  256 non-null    uint8  
 6   SNP_03_A A  256 non-null    uint8  
 7   SNP_03_C A  256 non-null    uint8  
 8   SNP_03_C C  256 non-null    uint8  
 9   SNP_04_A A  256 non-null    uint8  
 10  SNP_04_G A  256 non-null    uint8  
 11  SNP_04_G G  256 non-null    uint8  
 12  SNP_05_A A  256 non-null    uint8  
 13  SNP_05_C A  256 non-null    uint8  
 14  SNP_05_C C  256 non-null    uint8  
 15  SNP_06_A A  256 non-null    uint8  
 16  SNP_06_A G  256 non-null    uint8  
 17  SNP_06_G G  256 non-null    uint8  
 18  SNP_07_A A  256 non-null    uint8  
 19  SNP_07_G A  256 non-null    u

(None, None)

# Base Modeling

In [30]:
# Cast the first 45 columns (the one-hot SNP dummies) to category.
# Assigning through `.iloc[:, :45] = ...` writes the values into the existing columns on
# recent pandas versions and can leave the dtype unchanged, so rebuild the frames with astype.
# The cast is idempotent, so re-running this cell is safe.
train4 = train4.astype({col: 'category' for col in train4.columns[:45]})
test4 = test4.astype({col: 'category' for col in test4.columns[:45]})

train4.info()
test4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256 entries, 0 to 255
Data columns (total 72 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   SNP_01_A A  256 non-null    category
 1   SNP_01_A G  256 non-null    category
 2   SNP_01_G G  256 non-null    category
 3   SNP_02_A A  256 non-null    category
 4   SNP_02_A G  256 non-null    category
 5   SNP_02_G G  256 non-null    category
 6   SNP_03_A A  256 non-null    category
 7   SNP_03_C A  256 non-null    category
 8   SNP_03_C C  256 non-null    category
 9   SNP_04_A A  256 non-null    category
 10  SNP_04_G A  256 non-null    category
 11  SNP_04_G G  256 non-null    category
 12  SNP_05_A A  256 non-null    category
 13  SNP_05_C A  256 non-null    category
 14  SNP_05_C C  256 non-null    category
 15  SNP_06_A A  256 non-null    category
 16  SNP_06_A G  256 non-null    category
 17  SNP_06_G G  256 non-null    category
 18  SNP_07_A A  256 non-null    category
 19  SNP_07_G

(None, None)

In [31]:
from catboost import Pool, cv

def catcv(inputX, inputY, validX, validY, params, cv_count):
    """Run stratified CatBoost cross-validation on ``(inputX, inputY)``.

    Args:
        inputX: feature DataFrame; its first 45 columns are declared categorical.
        inputY: labels aligned with ``inputX``.
        validX, validY: unused; kept so existing call sites keep working.
        params: CatBoost parameter dict passed to ``catboost.cv``.
        cv_count: number of folds.

    Returns:
        Whatever ``catboost.cv`` returns (previously computed and thrown away).
        Presumably the per-iteration metric table — confirm against the CatBoost docs.
    """
    var_categ = inputX.columns.tolist()[:45]

    cv_dataset = Pool(data=inputX,
                      label=inputY,
                      cat_features=var_categ)

    scores = cv(cv_dataset,
                params,
                fold_count=cv_count,
                stratified=True,
                plot=True)

    return scores

In [32]:
def catgbmc(inputX, inputY, validX, validY, params):
    """Fit a CatBoostClassifier and print its macro-F1 on ``(validX, validY)``.

    Args:
        inputX: training feature DataFrame; its first 45 columns are declared categorical.
        inputY: training labels.
        validX, validY: data the printed macro-F1 score is computed on.
        params: CatBoost parameters. ``task_type`` / ``devices`` given here take
            precedence over the defaults chosen below.

    Returns:
        The fitted CatBoostClassifier.
    """
    var_categ = inputX.columns.tolist()[:45]

    # Default to GPU device 0 when CUDA is visible, otherwise fall back to CPU.
    # Merging into one dict also avoids a duplicate-keyword error when the caller
    # already put `task_type` in params.
    model_params = dict(params)
    if 'task_type' not in model_params:
        use_gpu = torch.cuda.is_available()
        model_params['task_type'] = 'GPU' if use_gpu else 'CPU'
        if use_gpu:
            model_params.setdefault('devices', '0')

    model = CatBoostClassifier(
        cat_features=var_categ,
        **model_params,
        )

    # NOTE: eval_set is the training data itself, so the eval metric tracks training fit.
    model.fit(
        inputX, inputY,
        eval_set=(inputX, inputY)
        )

    pred = model.predict(validX)
    score = f1_score(validY, pred, average='macro')
    print(score)

    return model

In [36]:
from imblearn.under_sampling import RandomUnderSampler

# Binary target: class A (1) vs. everything else (0).
X, y = train2.drop(columns='class'), (train2['class']=='A').astype(int)
X_test = test2.copy()

strategy1 = {0:50, 1:50}      # rows kept per class by the under-sampler
strategy2 = {0:1000, 1:1000}  # rows per class after SMOTEN over-sampling

# Fixed random_state so the sampled subsets (and every score derived from them) are repeatable.
# Note: despite its name, `smote1` is a random under-sampler.
smote1 = RandomUnderSampler(sampling_strategy=strategy1, random_state=0)
smote2 = SMOTEN(sampling_strategy=strategy2, k_neighbors=20, random_state=0)

X1, y1 = smote1.fit_resample(X, y)
X2, y2 = smote2.fit_resample(X, y)

In [37]:
# CatBoost settings used by the cross-validation cells that follow.
# Options left disabled in the original dict: grow_policy='Lossguide',
# auto_class_weights='SqrtBalanced', early_stopping_rounds=50.
params = dict(
    iterations=300,
    learning_rate=0.3,
    l2_leaf_reg=10,
    loss_function='CrossEntropy',
    eval_metric='F1',
    verbose=0,
    random_seed=0,
)

In [38]:
# A-vs-rest: 5-fold stratified CV on the original, un-resampled training rows.
# The second X, y pair fills catcv's validX/validY slots, which catcv does not use.
catcv(X, y, X, y, params, cv_count=5)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/5]

bestTest = 1
bestIteration = 1

Training on fold [1/5]

bestTest = 1
bestIteration = 2

Training on fold [2/5]

bestTest = 1
bestIteration = 2

Training on fold [3/5]

bestTest = 1
bestIteration = 3

Training on fold [4/5]

bestTest = 0.9565217391
bestIteration = 1



In [39]:
# A-vs-rest: 3-fold stratified CV on the randomly under-sampled set (50 rows per class).
catcv(X1, y1, X, y, params, cv_count=3)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/3]

bestTest = 1
bestIteration = 70

Training on fold [1/3]

bestTest = 1
bestIteration = 4

Training on fold [2/3]

bestTest = 1
bestIteration = 12



In [40]:
# A-vs-rest: 10-fold stratified CV on the SMOTEN over-sampled set (1000 rows per class).
# NOTE(review): the folds are split after over-sampling, so synthetic rows generated from
# the same originals can fall into both the train and test side of a fold — these scores
# are likely optimistic; confirm with resampling applied inside each training fold only.
catcv(X2, y2, X, y, params, cv_count=10)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/10]

bestTest = 1
bestIteration = 1

Training on fold [1/10]

bestTest = 1
bestIteration = 1

Training on fold [2/10]

bestTest = 1
bestIteration = 2

Training on fold [3/10]

bestTest = 0.9949748744
bestIteration = 0

Training on fold [4/10]

bestTest = 1
bestIteration = 0

Training on fold [5/10]

bestTest = 1
bestIteration = 0

Training on fold [6/10]

bestTest = 1
bestIteration = 3

Training on fold [7/10]

bestTest = 1
bestIteration = 2

Training on fold [8/10]

bestTest = 1
bestIteration = 0

Training on fold [9/10]

bestTest = 1
bestIteration = 0



In [57]:
# CatBoost settings for the final A-vs-rest fit below (more iterations, lower learning rate).
# Options left disabled in the original dict: grow_policy='Lossguide',
# auto_class_weights='SqrtBalanced', early_stopping_rounds=50.
params = dict(
    iterations=500,
    learning_rate=0.03,
    l2_leaf_reg=3,
    loss_function='CrossEntropy',
    eval_metric='F1',
    verbose=0,
    random_seed=0,
)

In [58]:
# Fit the A-vs-rest model on the under-sampled rows and print macro-F1 on the full training set.
# NOTE(review): X1 is drawn from X, so the printed score is not a held-out estimate.
modelA = catgbmc(X1, y1, X, y, params)

1.0


In [59]:
# Number of test rows predicted as class A (label 1).
modelA.predict(X_test).sum()

49

## B & not B

In [60]:
from imblearn.under_sampling import RandomUnderSampler

# One-vs-rest target: 1 where the row's class is 'B', 0 otherwise.
X, y = train2.drop(columns='class'), (train2['class']=='B').astype(int)
X_test = test2.copy()

# Requested number of rows per label for each resampler.
strategy1 = {0:50, 1:50}
strategy2 = {0:1000, 1:1000}

# Both resamplers draw at random. Pin random_state (0, matching 'random_seed' in params)
# so that X1/X2 — and every score and model built from them — are the same on each re-run.
# Note that smote1, despite its name, is a RandomUnderSampler; only smote2 is a SMOTE variant.
smote1 = RandomUnderSampler(sampling_strategy=strategy1, random_state=0)
smote2 = SMOTEN(sampling_strategy=strategy2, k_neighbors=20, random_state=0)

X1, y1 = smote1.fit_resample(X, y)  # under-sampled set
X2, y2 = smote2.fit_resample(X, y)  # over-sampled set

In [61]:
# Settings handed to catcv in the three 'B vs. rest' runs that follow.
params = dict(
    iterations=500,
    learning_rate=0.03,
    l2_leaf_reg=3,
    loss_function='CrossEntropy',
    eval_metric='F1',
    verbose=0,
    random_seed=0,
)
# Left disabled: grow_policy='Lossguide', auto_class_weights='SqrtBalanced', early_stopping_rounds=50

In [62]:
# Baseline: 5 folds with the original (un-resampled) 'B vs. rest' X, y from the cell above.
catcv(X, y, X, y, params, cv_count=5)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/5]

bestTest = 0.92
bestIteration = 9

Training on fold [1/5]

bestTest = 0.9787234043
bestIteration = 0

Training on fold [2/5]

bestTest = 0.9387755102
bestIteration = 52

Training on fold [3/5]

bestTest = 0.9090909091
bestIteration = 0

Training on fold [4/5]

bestTest = 0.9777777778
bestIteration = 16



In [63]:
# Same run with the RandomUnderSampler output (X1, y1: 50 rows requested per label), 3 folds.
catcv(X1, y1, X, y, params, cv_count=3)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/3]

bestTest = 0.9189189189
bestIteration = 0

Training on fold [1/3]

bestTest = 0.9411764706
bestIteration = 1

Training on fold [2/3]

bestTest = 0.9411764706
bestIteration = 20



In [65]:
# Same run with the SMOTEN output (X2, y2: 1000 rows requested per label), 5 folds.
# NOTE(review): if catcv builds its folds from X2/y2, synthetic rows and the rows they were
# generated from can fall on opposite sides of a split, which would inflate these scores —
# confirm how catcv uses its first two arguments before comparing with the baseline run.
catcv(X2, y2, X, y, params, cv_count=5)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/5]

bestTest = 0.9974937343
bestIteration = 75

Training on fold [1/5]

bestTest = 0.992481203
bestIteration = 199

Training on fold [2/5]

bestTest = 0.9975062344
bestIteration = 23

Training on fold [3/5]

bestTest = 0.9900497512
bestIteration = 32

Training on fold [4/5]

bestTest = 0.995
bestIteration = 56



In [72]:
# Settings handed to catgbmc in the next cell when fitting modelB.
params = dict(
    iterations=500,
    learning_rate=0.03,
    l2_leaf_reg=3,
    loss_function='CrossEntropy',
    eval_metric='F1',
    verbose=0,
    random_seed=0,
)
# Left disabled: grow_policy='Lossguide', auto_class_weights='SqrtBalanced', early_stopping_rounds=50

In [73]:
# modelB later supplies the class_B probabilities; it is given the SMOTEN-resampled X2, y2
# together with the original X, y.
# NOTE(review): catgbmc is defined earlier in the notebook; the printed 1.0 is presumably a score
# on X, y — if so it is optimistic, because X2 was resampled from that same X. Confirm.
modelB = catgbmc(X2, y2, X, y, params)

1.0


In [74]:
# Sum of modelB's predictions over the test set — the number of rows predicted as class B,
# assuming predict returns 0/1 labels (TODO confirm).
modelB.predict(X_test).sum()

85

## C & not C

In [75]:
from imblearn.under_sampling import RandomUnderSampler

# One-vs-rest target: 1 where the row's class is 'C', 0 otherwise.
X, y = train2.drop(columns='class'), (train2['class']=='C').astype(int)
X_test = test2.copy()

# Requested number of rows per label for each resampler.
strategy1 = {0:50, 1:50}
strategy2 = {0:1000, 1:1000}

# Both resamplers draw at random. Pin random_state (0, matching 'random_seed' in params)
# so that X1/X2 — and every score and model built from them — are the same on each re-run.
# Note that smote1, despite its name, is a RandomUnderSampler; only smote2 is a SMOTE variant.
smote1 = RandomUnderSampler(sampling_strategy=strategy1, random_state=0)
smote2 = SMOTEN(sampling_strategy=strategy2, k_neighbors=20, random_state=0)

X1, y1 = smote1.fit_resample(X, y)  # under-sampled set
X2, y2 = smote2.fit_resample(X, y)  # over-sampled set

In [76]:
# Settings handed to catcv in the three 'C vs. rest' runs that follow.
params = dict(
    iterations=500,
    learning_rate=0.03,
    l2_leaf_reg=3,
    loss_function='CrossEntropy',
    eval_metric='F1',
    verbose=0,
    random_seed=0,
)
# Left disabled: grow_policy='Lossguide', auto_class_weights='SqrtBalanced', early_stopping_rounds=50

In [77]:
# Baseline: 5 folds with the original (un-resampled) 'C vs. rest' X, y from the cell above.
catcv(X, y, X, y, params, cv_count=5)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/5]

bestTest = 0.8387096774
bestIteration = 60

Training on fold [1/5]

bestTest = 0.9375
bestIteration = 94

Training on fold [2/5]

bestTest = 0.8965517241
bestIteration = 122

Training on fold [3/5]

bestTest = 0.9375
bestIteration = 111

Training on fold [4/5]

bestTest = 0.9677419355
bestIteration = 46



In [78]:
# Same run with the RandomUnderSampler output (X1, y1: 50 rows requested per label), 5 folds.
catcv(X1, y1, X, y, params, cv_count=5)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/5]

bestTest = 1
bestIteration = 8

Training on fold [1/5]

bestTest = 0.8421052632
bestIteration = 4

Training on fold [2/5]

bestTest = 1
bestIteration = 166

Training on fold [3/5]

bestTest = 1
bestIteration = 10

Training on fold [4/5]

bestTest = 0.9
bestIteration = 84



In [79]:
# Same run with the SMOTEN output (X2, y2: 1000 rows requested per label), 5 folds.
# NOTE(review): if catcv builds its folds from X2/y2, synthetic rows and the rows they were
# generated from can fall on opposite sides of a split, which would inflate these scores —
# confirm how catcv uses its first two arguments before comparing with the baseline run.
catcv(X2, y2, X, y, params, cv_count=5)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/5]

bestTest = 0.9975062344
bestIteration = 213

Training on fold [1/5]

bestTest = 0.9875311721
bestIteration = 114

Training on fold [2/5]

bestTest = 0.9924433249
bestIteration = 1

Training on fold [3/5]

bestTest = 0.9898989899
bestIteration = 3

Training on fold [4/5]

bestTest = 0.9925187032
bestIteration = 163



In [80]:
# Settings handed to catgbmc in the next cell when fitting modelC.
params = dict(
    iterations=500,
    learning_rate=0.03,
    l2_leaf_reg=3,
    loss_function='CrossEntropy',
    eval_metric='F1',
    verbose=0,
    random_seed=0,
)
# Left disabled: grow_policy='Lossguide', auto_class_weights='SqrtBalanced', early_stopping_rounds=50

In [87]:
# modelC later supplies the class_C probabilities; it is given the SMOTEN-resampled X2, y2
# together with the original X, y.
# NOTE(review): catgbmc is defined earlier in the notebook; the printed 1.0 is presumably a score
# on X, y — if so it is optimistic, because X2 was resampled from that same X. Confirm.
modelC = catgbmc(X2, y2, X, y, params)

1.0


In [88]:
# Sum of modelC's predictions over the test set — the number of rows predicted as class C,
# assuming predict returns 0/1 labels (TODO confirm).
modelC.predict(X_test).sum()

39

In [92]:
# For every test id, store column 1 of each one-vs-rest model's predict_proba output
# (the positive-label probability, assuming the usual [negative, positive] column order),
# then record in 'total' the position of the largest one: 0 -> class_A, 1 -> class_B, 2 -> class_C.
prob_cols = ['class_A', 'class_B', 'class_C']
sample = pd.read_csv("./data/test.csv")[['id']].copy()
for prob_col, clf in zip(prob_cols, (modelA, modelB, modelC)):
    sample[prob_col] = clf.predict_proba(X_test)[:, 1]
sample['total'] = sample[prob_cols].to_numpy().argmax(axis=1)
sample

Unnamed: 0,id,class_A,class_B,class_C,total
0,TEST_000,0.618000,0.003390,0.009222,0
1,TEST_001,0.334375,0.995504,0.010459,1
2,TEST_002,0.470204,0.001677,0.996923,2
3,TEST_003,0.309502,0.870278,0.369231,1
4,TEST_004,0.651448,0.001689,0.001710,0
...,...,...,...,...,...
170,TEST_170,0.352781,0.983192,0.007799,1
171,TEST_171,0.365122,0.000684,0.999645,2
172,TEST_172,0.388396,0.024013,0.993668,2
173,TEST_173,0.309240,0.999085,0.001330,1


In [93]:
# Rows that the A model accepts (> 0.5) while the B or C model also gives more than 0.1.
sample.query("class_A > 0.5 and (class_B > 0.1 or class_C > 0.1)")

Unnamed: 0,id,class_A,class_B,class_C,total


In [94]:
# Conflicts: rows that the B model accepts (> 0.5) and that the A or C model accepts as well.
sample.query("class_B > 0.5 and (class_A > 0.5 or class_C > 0.5)")

Unnamed: 0,id,class_A,class_B,class_C,total
119,TEST_119,0.355428,0.569391,0.776953,2


In [95]:
# Unclaimed rows: none of the three models gives a probability of 0.5 or more.
sample[(sample[['class_A', 'class_B', 'class_C']] < 0.5).all(axis=1)]

Unnamed: 0,id,class_A,class_B,class_C,total
28,TEST_028,0.495003,0.003697,0.0072,0
60,TEST_060,0.413473,0.013395,0.125204,0
66,TEST_066,0.410908,0.384954,0.019929,0


In [96]:
# Distribution of the argmax result; the submission cell maps 0 -> 'A', 1 -> 'B', anything else -> 'C'.
sample['total'].value_counts()

1    84
0    52
2    39
Name: total, dtype: int64

In [97]:
# Build the submission from the ids already held in `sample`, so the labels are attached to
# the exact rows they were computed for. The previous version loaded "submit_high1.csv" and
# relied on its rows sharing index order with `sample`.
class_names = {0: 'A', 1: 'B', 2: 'C'}  # argmax position -> label (class_A, class_B, class_C order)
df = sample[['id']].copy()
df['class'] = sample['total'].map(class_names)
df.to_csv("submit.csv", index=False)
df

Unnamed: 0,id,class
0,TEST_000,A
1,TEST_001,B
2,TEST_002,C
3,TEST_003,B
4,TEST_004,A
...,...,...
170,TEST_170,B
171,TEST_171,C
172,TEST_172,C
173,TEST_173,B


In [98]:
# Label distribution of the written submission; it should match the counts of sample['total'] above.
df['class'].value_counts()

B    84
A    52
C    39
Name: class, dtype: int64

# 0.9719