In [1]:
import warnings
warnings.filterwarnings(action='ignore')

import os
import gc
import math
import random
import pickle
import pandas as pd
import numpy as np
import multiprocessing
from tqdm.auto import tqdm

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler

from transformers import get_cosine_schedule_with_warmup

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset, sampler

from imblearn.over_sampling import SMOTE

In [2]:
# Pick the torch device for this session.
# GPU 1 is preferred (the recorded output of this cell shows cuda:1), but a host
# with a single GPU only exposes cuda:0, so fall back to it instead of handing
# out an invalid device ordinal. Without CUDA everything runs on the CPU.
if torch.cuda.is_available():
    gpu_index = 1 if torch.cuda.device_count() > 1 else 0
    device = torch.device(f'cuda:{gpu_index}')
else:
    device = torch.device('cpu')
device

device(type='cuda', index=1)

In [3]:
# Read the train and test tables from the local data directory in one pass.
train, test = map(
    pd.read_csv,
    ('./data/df_train00.csv', './data/df_test00.csv'),
)

In [4]:
import warnings
warnings.filterwarnings(action='ignore')

import os
import gc
import math
import random
import pickle
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold

from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from imblearn.over_sampling import SMOTE, SMOTENC

In [5]:
# Hyper-parameter mapping named for XGBoost (presumably meant for the
# XGBClassifier / XGBRegressor imported earlier -- no cell visible here uses it).
# Options tried before and currently disabled: n_estimators=25,
# colsample_bytree=0.9, rate_drop=0.3.
xgb_params = dict(
    booster='gbtree',
    grow_policy='lossguide',
    max_depth=0,
    learning_rate=0.4,
    reg_lambda=100,
    reg_alpha=10,
    subsample=0.9,
    num_parallel_tree=1,
    colsample_bynode=0.9,
)

In [6]:
# Seed handed to the resampler, and the number of rows requested per label
# (both labels, 0 and 1, are resampled to the same size).
random_seed = 6327
strategy = {label: 300 for label in (0, 1)}

In [7]:
# Row count per label in the raw training table (three labels: 0, 1, 2).
train['class'].value_counts()

1    114
2     79
0     69
Name: class, dtype: int64

In [8]:
# Extra feature columns stored separately from the train/test tables.
ae = pd.read_csv("./data/ae_values.csv")

# Drop every row labelled 0 and shift the remaining labels (1, 2) down to (0, 1),
# turning the task into a binary problem.
target_idx = train.index[(train['class'] != 0).to_numpy()].tolist()
train2 = train.iloc[target_idx].copy().reset_index(drop=True)
train2['class'] = train2['class'].sub(1)

# The first len(train2) rows of `ae` are attached to the filtered training rows
# and everything after that to the test rows.
# NOTE(review): this assumes ae_values.csv is ordered as "filtered train rows,
# then test rows" -- confirm against the code that produced the file.
n_kept = len(train2)
train2 = pd.concat([train2, ae.iloc[:n_kept]], axis=1)
test2 = pd.concat([test, ae.iloc[n_kept:].reset_index(drop=True)], axis=1)
train2['class'].value_counts()

0    114
1     79
Name: class, dtype: int64

In [9]:
# Build the model inputs: integer target, feature frames with the first 16
# columns (trait, SNP_01 .. SNP_15) typed as categorical, then oversample.
y = (train2['class'].values).astype(int)

# Cast through astype() with a column mapping rather than assigning through
# .iloc: since pandas 2.0 `df.iloc[:, :k] = ...` writes values into the existing
# columns and keeps their old dtype, so the category cast would silently be lost.
X = train2.drop(['id', 'class'], axis=1)
X = X.astype({col: 'category' for col in X.columns[:16]})
X_test = test2.drop(['id', 'class'], axis=1)
X_test = X_test.astype({col: 'category' for col in X_test.columns[:16]})

# Resample both labels to the sizes requested in `strategy`; the categorical
# columns are identified by position (0..15).
smote = SMOTENC(categorical_features=list(range(16)), random_state=random_seed, sampling_strategy=strategy)
X_train, y_train = smote.fit_resample(X, y)
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 40 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   trait         600 non-null    category
 1   SNP_01        600 non-null    category
 2   SNP_02        600 non-null    category
 3   SNP_03        600 non-null    category
 4   SNP_04        600 non-null    category
 5   SNP_05        600 non-null    category
 6   SNP_06        600 non-null    category
 7   SNP_07        600 non-null    category
 8   SNP_08        600 non-null    category
 9   SNP_09        600 non-null    category
 10  SNP_10        600 non-null    category
 11  SNP_11        600 non-null    category
 12  SNP_12        600 non-null    category
 13  SNP_13        600 non-null    category
 14  SNP_14        600 non-null    category
 15  SNP_15        600 non-null    category
 16  SNP_01_ratio  600 non-null    float64 
 17  SNP_02_ratio  600 non-null    float64 
 18  SNP_03_rat

In [39]:
def lgbmc(X_fit=None, y_fit=None, X_eval=None, y_eval=None,
          task_type='GPU', devices='0', random_state=None):
    """Fit a CatBoostClassifier and print its macro-F1 score.

    Despite the ``lgbm`` prefix in the name, the model built here is a
    CatBoost classifier (100 iterations, learning rate 0.3) that treats the
    first 16 columns of the fitting frame as categorical features.

    Args:
        X_fit, y_fit: Data to fit on. Default to the notebook-level
            ``X_train`` / ``y_train`` so that ``lgbmc()`` behaves as before.
        X_eval, y_eval: Data the printed F1 score is computed on. Default to
            the notebook-level ``X`` / ``y``.
        task_type: CatBoost processing unit, ``'GPU'`` (default) or ``'CPU'``.
        devices: GPU id string passed to CatBoost; only used for ``'GPU'``.
        random_state: Optional seed; when ``None`` no seed is passed.

    Returns:
        The fitted CatBoostClassifier.

    Note:
        The ``eval_set`` given to ``fit`` is the fitting data itself, so the
        "test" column in the training log is not a held-out metric.
        NOTE(review): with the defaults, ``X_train`` is resampled from ``X``,
        so the printed score is presumably in-sample as well -- confirm.
    """
    # Fall back to the notebook globals so the zero-argument call keeps working.
    X_fit = X_train if X_fit is None else X_fit
    y_fit = y_train if y_fit is None else y_fit
    X_eval = X if X_eval is None else X_eval
    y_eval = y if y_eval is None else y_eval

    var_categ = X_fit.columns.tolist()[:16]
    params = {
        'cat_features': var_categ,
        'iterations': 100,
        'learning_rate': 0.3,
        'task_type': task_type,
    }
    if task_type == 'GPU':
        # A device id is only meaningful when training on GPU.
        params['devices'] = devices
    if random_state is not None:
        params['random_state'] = random_state
    model = CatBoostClassifier(**params)

    model.fit(
        X_fit, y_fit,
        eval_set=(X_fit, y_fit),
        verbose=10
    )

    pred = model.predict(X_eval)
    score = f1_score(y_eval, pred, average='macro')
    print(score)

    return model

In [40]:
def lgbmr(X_fit=None, y_fit=None, X_eval=None, y_eval=None,
          task_type='GPU', devices='5', random_state=None):
    """Fit a CatBoostRegressor on the labels and print a macro-F1 score.

    Despite the ``lgbm`` prefix in the name, the model built here is a
    CatBoost regressor (100 iterations, learning rate 0.3) that treats the
    first 16 columns of the fitting frame as categorical features. Its
    continuous predictions are rounded to the nearest integer before scoring.

    Args:
        X_fit, y_fit: Data to fit on. Default to the notebook-level
            ``X_train`` / ``y_train`` so that ``lgbmr()`` behaves as before.
        X_eval, y_eval: Data the printed F1 score is computed on. Default to
            the notebook-level ``X`` / ``y``.
        task_type: CatBoost processing unit, ``'GPU'`` (default) or ``'CPU'``.
        devices: GPU id string passed to CatBoost; only used for ``'GPU'``.
            The default ``'5'`` is kept for backward compatibility; pass
            another id on hosts that do not have that GPU.
        random_state: Optional seed; when ``None`` no seed is passed.

    Returns:
        The fitted CatBoostRegressor.

    Note:
        The ``eval_set`` given to ``fit`` is the fitting data itself, so the
        "test" column in the training log is not a held-out metric.
        NOTE(review): with the defaults, ``X_train`` is resampled from ``X``,
        so the printed score is presumably in-sample as well -- confirm.
    """
    # Fall back to the notebook globals so the zero-argument call keeps working.
    X_fit = X_train if X_fit is None else X_fit
    y_fit = y_train if y_fit is None else y_fit
    X_eval = X if X_eval is None else X_eval
    y_eval = y if y_eval is None else y_eval

    var_categ = X_fit.columns.tolist()[:16]
    params = {
        'cat_features': var_categ,
        'iterations': 100,
        'learning_rate': 0.3,
        'task_type': task_type,
    }
    if task_type == 'GPU':
        # A device id is only meaningful when training on GPU.
        params['devices'] = devices
    if random_state is not None:
        params['random_state'] = random_state
    model = CatBoostRegressor(**params)

    model.fit(
        X_fit, y_fit,
        eval_set=(X_fit, y_fit),
        verbose=10
    )

    pred = model.predict(X_eval)
    # Round the regression output to the nearest integer label before scoring.
    score = f1_score(y_eval, np.round(pred), average='macro')
    print(score)

    return model

In [41]:
# Fit the CatBoost classifier (lgbmc prints its training log and F1 score)
# and echo the fitted object.
model1 = lgbmc()
model1

0:	learn: 0.3342471	test: 0.3342471	best: 0.3342471 (0)	total: 10.1ms	remaining: 1000ms
10:	learn: 0.0224078	test: 0.0224078	best: 0.0224078 (10)	total: 104ms	remaining: 843ms
20:	learn: 0.0099625	test: 0.0101762	best: 0.0101762 (20)	total: 198ms	remaining: 744ms
30:	learn: 0.0058263	test: 0.0060054	best: 0.0060054 (30)	total: 291ms	remaining: 649ms
40:	learn: 0.0040324	test: 0.0042058	best: 0.0042058 (40)	total: 384ms	remaining: 552ms
50:	learn: 0.0032999	test: 0.0034556	best: 0.0034556 (50)	total: 476ms	remaining: 458ms
60:	learn: 0.0026529	test: 0.0027940	best: 0.0027940 (60)	total: 568ms	remaining: 363ms
70:	learn: 0.0021009	test: 0.0022247	best: 0.0022247 (70)	total: 659ms	remaining: 269ms
80:	learn: 0.0018088	test: 0.0019187	best: 0.0019187 (80)	total: 750ms	remaining: 176ms
90:	learn: 0.0015369	test: 0.0016415	best: 0.0016415 (90)	total: 840ms	remaining: 83.1ms
99:	learn: 0.0013475	test: 0.0014394	best: 0.0014394 (99)	total: 921ms	remaining: 0us
bestTest = 0.001439448694
bestIte

<catboost.core.CatBoostClassifier at 0x7f0a8858ce50>

In [42]:
# Fit the CatBoost regressor (lgbmr prints its training log and F1 score)
# and echo the fitted object.
model2 = lgbmr()
model2

0:	learn: 0.3850517	test: 0.3850517	best: 0.3850517 (0)	total: 5.69ms	remaining: 564ms
10:	learn: 0.0968951	test: 0.0968951	best: 0.0968951 (10)	total: 54.1ms	remaining: 438ms
20:	learn: 0.0736448	test: 0.0740544	best: 0.0740544 (20)	total: 103ms	remaining: 386ms
30:	learn: 0.0596070	test: 0.0604077	best: 0.0604077 (30)	total: 150ms	remaining: 335ms
40:	learn: 0.0480319	test: 0.0490553	best: 0.0490553 (40)	total: 199ms	remaining: 286ms
50:	learn: 0.0403749	test: 0.0430205	best: 0.0430205 (50)	total: 248ms	remaining: 238ms
60:	learn: 0.0344469	test: 0.0381584	best: 0.0381584 (60)	total: 296ms	remaining: 189ms
70:	learn: 0.0289789	test: 0.0330376	best: 0.0330376 (70)	total: 344ms	remaining: 140ms
80:	learn: 0.0253158	test: 0.0295110	best: 0.0295110 (80)	total: 393ms	remaining: 92.1ms
90:	learn: 0.0224728	test: 0.0273989	best: 0.0273989 (90)	total: 441ms	remaining: 43.7ms
99:	learn: 0.0202137	test: 0.0253462	best: 0.0253462 (99)	total: 486ms	remaining: 0us
bestTest = 0.02534617127
bestIte

<catboost.core.CatBoostRegressor at 0x7f0a8865c430>

In [43]:
# Second column of predict_proba: the classifier's probability for label 1
# of the binary target, one value per test row.
answer1 = model1.predict_proba(X_test)[:,1] 
answer1

array([5.32214590e-01, 3.36959983e-04, 9.99342093e-01, 8.45514919e-01,
       9.64563394e-02, 3.44750575e-01, 9.98668513e-01, 3.00119891e-03,
       4.65769145e-01, 1.63417517e-01, 9.99910271e-01, 1.32873650e-03,
       8.87084509e-01, 2.29268646e-01, 2.70190490e-03, 2.99137585e-04,
       7.78808415e-01, 2.94446390e-03, 1.41293006e-04, 9.89472647e-01,
       5.17173204e-04, 2.33785327e-05, 8.43256913e-03, 1.57699943e-01,
       2.00956578e-04, 2.10520459e-02, 9.53294774e-04, 6.79600803e-01,
       5.19303645e-02, 2.22772358e-03, 9.27065746e-02, 1.55250506e-01,
       3.61282824e-04, 9.95898436e-01, 3.58986356e-01, 1.46136900e-02,
       9.99557697e-01, 4.28693831e-04, 2.54382884e-03, 9.99808087e-01,
       2.98968954e-01, 3.04087389e-04, 9.99810639e-01, 1.09894944e-03,
       2.75057466e-04, 7.51006655e-03, 6.19928171e-02, 9.99245371e-01,
       8.88969443e-05, 9.96754779e-01, 6.58763927e-01, 7.54872530e-04,
       1.13600517e-01, 8.97833531e-05, 9.60526319e-05, 6.13454843e-04,
      

In [44]:
# Raw regressor output per test row; it is not clipped, so values can fall
# slightly below 0 or above 1 (visible in the printed array).
answer2 = model2.predict(X_test)
answer2

array([ 0.71164287,  0.04906433,  1.04504479,  0.77110856,  0.17761294,
        0.76863142,  1.03041334,  0.15961657,  0.34765884,  0.36167575,
        0.98926041,  0.0842239 ,  0.62647615,  0.36407528,  0.06303468,
        0.02005826,  0.67041652,  0.05547938, -0.02697338,  0.84186277,
       -0.01701689, -0.05874553,  0.28098357,  0.34816336, -0.0303449 ,
        0.16147037,  0.13206059,  0.46142912,  0.13269689,  0.21404571,
        0.28484474,  0.41392559,  0.00825517,  1.01240031,  0.64415963,
        0.25503852,  1.01380161, -0.01566375,  0.02485078,  1.01013843,
        0.51378803,  0.00261512,  0.84745586,  0.04352867,  0.05280579,
        0.02572753,  0.29258742,  0.99147607,  0.0089165 ,  0.96081131,
        0.66172076,  0.01254039,  0.26117863, -0.05490835,  0.0271442 ,
       -0.03601745,  0.96490559,  0.61973212,  0.06729711,  0.98409768,
        0.92812734, -0.02165885,  0.94373734,  0.9941257 ,  1.07015717,
        0.60738607,  0.23931593,  0.64997857,  0.66544803, -0.02

In [45]:
# Blend the two models by adding the classifier probability and the regressor
# prediction element-wise; the result is a new array, roughly in the 0..2 range.
answer3 = answer1+answer2
answer3

array([ 1.24385746e+00,  4.94012910e-02,  2.04438688e+00,  1.61662348e+00,
        2.74069275e-01,  1.11338200e+00,  2.02908186e+00,  1.62617767e-01,
        8.13427985e-01,  5.25093267e-01,  1.98917068e+00,  8.55526338e-02,
        1.51356066e+00,  5.93343928e-01,  6.57365879e-02,  2.03574008e-02,
        1.44922494e+00,  5.84238464e-02, -2.68320911e-02,  1.83133541e+00,
       -1.64997138e-02, -5.87221492e-02,  2.89416139e-01,  5.05863302e-01,
       -3.01439434e-02,  1.82522418e-01,  1.33013889e-01,  1.14102992e+00,
        1.84627259e-01,  2.16273437e-01,  3.77551315e-01,  5.69176091e-01,
        8.61645060e-03,  2.00829874e+00,  1.00314599e+00,  2.69652208e-01,
        2.01335930e+00, -1.52350602e-02,  2.73946072e-02,  2.00994652e+00,
        8.12756980e-01,  2.91920382e-03,  1.84726650e+00,  4.46276204e-02,
        5.30808475e-02,  3.32376015e-02,  3.54580241e-01,  1.99072144e+00,
        9.00539640e-03,  1.95756609e+00,  1.32048468e+00,  1.32952625e-02,
        3.74779145e-01, -

In [46]:
# Load a previously saved submission to use as a reference
# (presumably one that scored 0.99078 -- inferred from the file name only).
high_one = pd.read_csv("./data/submit_0.99078.csv")
high_one

Unnamed: 0,id,class
0,TEST_000,A
1,TEST_001,B
2,TEST_002,C
3,TEST_003,C
4,TEST_004,A
...,...,...
170,TEST_170,B
171,TEST_171,C
172,TEST_172,C
173,TEST_173,B


In [47]:
# Rows that the reference submission labels 'A' are forced to a blended score
# of exactly 0, overwriting answer3 in place.
# The index labels double as array positions here, which relies on high_one
# keeping the default 0..n-1 index it gets from read_csv.
A_index = high_one.index[(high_one['class'] == 'A').to_numpy()].tolist()
answer3[A_index] = 0
answer3

array([ 0.00000000e+00,  4.94012910e-02,  2.04438688e+00,  1.61662348e+00,
        0.00000000e+00,  1.11338200e+00,  2.02908186e+00,  1.62617767e-01,
        0.00000000e+00,  0.00000000e+00,  1.98917068e+00,  8.55526338e-02,
        1.51356066e+00,  0.00000000e+00,  6.57365879e-02,  2.03574008e-02,
        0.00000000e+00,  5.84238464e-02, -2.68320911e-02,  1.83133541e+00,
       -1.64997138e-02, -5.87221492e-02,  2.89416139e-01,  0.00000000e+00,
       -3.01439434e-02,  1.82522418e-01,  1.33013889e-01,  0.00000000e+00,
        0.00000000e+00,  2.16273437e-01,  0.00000000e+00,  0.00000000e+00,
        8.61645060e-03,  2.00829874e+00,  0.00000000e+00,  2.69652208e-01,
        2.01335930e+00, -1.52350602e-02,  2.73946072e-02,  2.00994652e+00,
        0.00000000e+00,  2.91920382e-03,  1.84726650e+00,  4.46276204e-02,
        5.30808475e-02,  3.32376015e-02,  3.54580241e-01,  1.99072144e+00,
        9.00539640e-03,  1.95756609e+00,  0.00000000e+00,  1.32952625e-02,
        0.00000000e+00, -

In [64]:
# How many blended scores reach the 0.9 cut-off.
(answer3 >= 0.9).sum()

42

In [68]:
# Turn the blended scores into letter labels on a copy of the reference frame:
#   score == 0  -> 'A'  (the rows zeroed out via the reference submission)
#   score < 0.9 -> 'B'
#   otherwise   -> 'C'
submit = high_one.copy()
submit['class'] = np.where(
    answer3 == 0,
    'A',
    np.where(answer3 < 0.9, 'B', 'C'),
)
submit

Unnamed: 0,id,class
0,TEST_000,A
1,TEST_001,B
2,TEST_002,C
3,TEST_003,C
4,TEST_004,A
...,...,...
170,TEST_170,B
171,TEST_171,C
172,TEST_172,C
173,TEST_173,B


In [69]:
# Number of rows on which the new labels agree with the reference submission.
int((high_one['class'] == submit['class']).sum())

173

In [70]:
# Label distribution of the new submission.
submit['class'].value_counts()

B    82
A    51
C    42
Name: class, dtype: int64

In [57]:
# Label distribution of the reference submission, for comparison.
high_one['class'].value_counts()

B    84
A    51
C    40
Name: class, dtype: int64

In [71]:
# Write the final labels to submit.csv in the working directory, without the DataFrame index.
submit.to_csv("submit.csv", index=False)

In [61]:
# Load three saved submissions so their labels can be compared row by row.
high1, high2, high3 = (
    pd.read_csv(path)
    for path in (
        "./data/submit_0.99078.csv",
        "./data/submit_0.99078_2.csv",
        "./data/submit_0.99078_3.csv",
    )
)

In [62]:
# Show the rows where high3 and high1 disagree, first as labelled by high3, then by high1.
differs_from_high1 = high3['class'] != high1['class']
display(high3[differs_from_high1], high1[differs_from_high1])

Unnamed: 0,id,class
12,TEST_012,C


Unnamed: 0,id,class
12,TEST_012,B


In [63]:
# Show the rows where high3 and high2 disagree, first as labelled by high3, then by high2.
differs_from_high2 = high3['class'] != high2['class']
display(high3[differs_from_high2], high2[differs_from_high2])

Unnamed: 0,id,class
126,TEST_126,B


Unnamed: 0,id,class
126,TEST_126,C
