In [1]:
import warnings
warnings.filterwarnings(action='ignore')

import os
import gc
import math
import random
import pickle
import pandas as pd
import numpy as np
import multiprocessing
from tqdm.auto import tqdm

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler

from transformers import get_cosine_schedule_with_warmup

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset, sampler

from imblearn.over_sampling import SMOTE

In [2]:
device = torch.device('cuda:1') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda', index=1)

In [3]:
train = pd.read_csv('./data/df_train00.csv')
test = pd.read_csv('./data/df_test00.csv')              

In [4]:
import warnings
warnings.filterwarnings(action='ignore')

import os
import gc
import math
import random
import pickle
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold

from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from imblearn.over_sampling import SMOTE, SMOTENC

In [5]:
random_seed = 6327
strategy = {0:1000, 1:1000, 2:1000}

In [6]:
train['class'].value_counts()

1    114
2     79
0     69
Name: class, dtype: int64

In [7]:
ae = pd.read_csv("./data/ae_values.csv")
# target_idx = train[train['class'] != 0].index.tolist()
# train2 = train.iloc[target_idx].copy().reset_index(drop=True)
# train2['class'] = train2['class'] -1
train2 = train.copy()

train2 = pd.concat([train2, ae[:len(train2)]], axis=1)
test2 = pd.concat([test, ae[len(train2):].reset_index(drop=True)], axis=1)
train2['class'].value_counts()

1    114
2     79
0     69
Name: class, dtype: int64

In [8]:
y = (train2['class'].values).astype(int)
X = train2.drop(['id', 'class'], axis=1)
X.iloc[:,:16] = X.iloc[:,:16].astype('category')
X_test = test2.drop(['id', 'class'], axis=1)
X_test.iloc[:,:16] = X_test.iloc[:,:16].astype('category')

smote = SMOTENC(categorical_features=[x for x in range(16)], random_state=random_seed, sampling_strategy=strategy)
X_train, y_train = smote.fit_resample(X, y)
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 40 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   trait         3000 non-null   category
 1   SNP_01        3000 non-null   category
 2   SNP_02        3000 non-null   category
 3   SNP_03        3000 non-null   category
 4   SNP_04        3000 non-null   category
 5   SNP_05        3000 non-null   category
 6   SNP_06        3000 non-null   category
 7   SNP_07        3000 non-null   category
 8   SNP_08        3000 non-null   category
 9   SNP_09        3000 non-null   category
 10  SNP_10        3000 non-null   category
 11  SNP_11        3000 non-null   category
 12  SNP_12        3000 non-null   category
 13  SNP_13        3000 non-null   category
 14  SNP_14        3000 non-null   category
 15  SNP_15        3000 non-null   category
 16  SNP_01_ratio  3000 non-null   float64 
 17  SNP_02_ratio  3000 non-null   float64 
 18  SNP_03_r

In [9]:
def train_on_gpu() :  
    var_categ = X_train.columns.tolist()[:16]
    model = CatBoostClassifier(
        cat_features=var_categ,
        iterations=100,
        learning_rate=0.03,
        task_type='GPU',
        devices='0:5',
        # random_state=random_seed
        )
  
    model.fit(
        X_train, y_train,
        eval_set=(X_train, y_train),
        verbose=10
        );     

    pred = model.predict(X)
    score = f1_score(y, pred, average='macro')
    print(score)
    
    return model

In [10]:
model = train_on_gpu()
model

0:	learn: 1.0477963	test: 1.0477964	best: 1.0477964 (0)	total: 16.3ms	remaining: 1.61s
10:	learn: 0.6990332	test: 0.6990331	best: 0.6990331 (10)	total: 151ms	remaining: 1.22s
20:	learn: 0.5008181	test: 0.5008181	best: 0.5008181 (20)	total: 257ms	remaining: 968ms
30:	learn: 0.3744777	test: 0.3744777	best: 0.3744777 (30)	total: 360ms	remaining: 802ms
40:	learn: 0.2876780	test: 0.2876780	best: 0.2876780 (40)	total: 459ms	remaining: 661ms
50:	learn: 0.2260976	test: 0.2260976	best: 0.2260976 (50)	total: 565ms	remaining: 543ms
60:	learn: 0.1810976	test: 0.1810976	best: 0.1810976 (60)	total: 667ms	remaining: 426ms
70:	learn: 0.1474605	test: 0.1474605	best: 0.1474605 (70)	total: 769ms	remaining: 314ms
80:	learn: 0.1218365	test: 0.1218365	best: 0.1218365 (80)	total: 872ms	remaining: 205ms
90:	learn: 0.1017928	test: 0.1017928	best: 0.1017928 (90)	total: 973ms	remaining: 96.2ms
99:	learn: 0.0878397	test: 0.0878397	best: 0.0878397 (99)	total: 1.06s	remaining: 0us
bestTest = 0.08783967082
bestItera

<catboost.core.CatBoostClassifier at 0x7f9a25cf2c40>

In [11]:
answer = model.predict(X_test)
answer

array([[0],
       [1],
       [2],
       [1],
       [0],
       [2],
       [2],
       [1],
       [0],
       [0],
       [2],
       [1],
       [1],
       [0],
       [1],
       [1],
       [0],
       [1],
       [1],
       [2],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [2],
       [0],
       [1],
       [2],
       [1],
       [1],
       [2],
       [0],
       [1],
       [2],
       [1],
       [1],
       [1],
       [1],
       [2],
       [1],
       [2],
       [0],
       [1],
       [0],
       [1],
       [1],
       [1],
       [2],
       [0],
       [1],
       [2],
       [0],
       [1],
       [2],
       [2],
       [2],
       [0],
       [1],
       [0],
       [0],
       [1],
       [1],
       [1],
       [0],
       [0],
       [2],
       [1],
       [2],
       [1],
       [1],
       [1],
       [2],
       [1],
       [0],
    

In [13]:
high_one = pd.read_csv("./data/submit_0.99078.csv")
high_two = pd.read_csv("./data/submit_0.99078_2.csv")
high_one

Unnamed: 0,id,class
0,TEST_000,A
1,TEST_001,B
2,TEST_002,C
3,TEST_003,C
4,TEST_004,A
...,...,...
170,TEST_170,B
171,TEST_171,C
172,TEST_172,C
173,TEST_173,B


In [None]:
# A_index = high_one[high_one['class']=='A'].index.tolist()
# answer[A_index] = 0
# answer

In [14]:
submit = high_one.copy()
submit['class'] = answer
submit['class'] = submit['class'].map(lambda x : 'A' if x==0 else ('B' if x==1 else 'C'))
submit

Unnamed: 0,id,class
0,TEST_000,A
1,TEST_001,B
2,TEST_002,C
3,TEST_003,B
4,TEST_004,A
...,...,...
170,TEST_170,B
171,TEST_171,C
172,TEST_172,C
173,TEST_173,B


In [15]:
sum(high_one['class'] == submit['class'])

172

In [16]:
submit['class'].value_counts()

B    85
A    51
C    39
Name: class, dtype: int64

In [17]:
high_one['class'].value_counts()

B    84
A    51
C    40
Name: class, dtype: int64

In [18]:
high_two['class'].value_counts()

B    82
A    51
C    42
Name: class, dtype: int64

In [19]:
high_one[high_one['class'] != high_two['class']]

Unnamed: 0,id,class
12,TEST_012,B
126,TEST_126,B


In [20]:
high_two[high_one['class'] != high_two['class']]

Unnamed: 0,id,class
12,TEST_012,C
126,TEST_126,C


In [21]:
display(submit[submit['class'] != high_one['class']], high_one[submit['class'] != high_one['class']])

Unnamed: 0,id,class
3,TEST_003,B
119,TEST_119,B
126,TEST_126,C


Unnamed: 0,id,class
3,TEST_003,C
119,TEST_119,C
126,TEST_126,B


In [23]:
display(submit[submit['class'] != high_two['class']], high_two[submit['class'] != high_two['class']])

Unnamed: 0,id,class
3,TEST_003,B
12,TEST_012,B
119,TEST_119,B


Unnamed: 0,id,class
3,TEST_003,C
12,TEST_012,C
119,TEST_119,C


In [24]:
display(submit.iloc[[3,12,119,126]],high_one.iloc[[3,12,119,126]],high_two.iloc[[3,12,119,126]])

Unnamed: 0,id,class
3,TEST_003,B
12,TEST_012,B
119,TEST_119,B
126,TEST_126,C


Unnamed: 0,id,class
3,TEST_003,C
12,TEST_012,B
119,TEST_119,C
126,TEST_126,B


Unnamed: 0,id,class
3,TEST_003,C
12,TEST_012,C
119,TEST_119,C
126,TEST_126,C


In [44]:
check_y = np.zeros(187)
check_y[50:50+87] = 1
check_y[50+87:] = 2
check_y

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.])

In [45]:
check_two = np.zeros(187)
check_two[51:51+84] = 1
check_two[51+84:] = 2

f1_score(check_y, check_one, average='macro')

0.9895233709417454

In [32]:
check_one = np.zeros(187)
check_one[51:51+85] = 1
check_one[51+85:] = 2

f1_score(check_y, check_one, average='macro')

0.9844881665240947

In [22]:
submit[submit['class'] != high_two['class']]

Unnamed: 0,id,class
3,TEST_003,B
12,TEST_012,B
119,TEST_119,B


In [None]:
submit.to_csv("submit.csv", index=False)