In [1]:
import warnings
warnings.filterwarnings(action='ignore')

import os
import gc
import math
import random
import pickle
import pandas as pd
import numpy as np
import multiprocessing
from tqdm.auto import tqdm

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler

from transformers import get_cosine_schedule_with_warmup

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset, sampler

from imblearn.over_sampling import SMOTE

In [2]:
# Select the torch device. The second GPU (cuda:1) is preferred, as before,
# but a host exposing a single GPU falls back to cuda:0 instead of yielding a
# device that cannot be used; without CUDA the CPU is selected.
if torch.cuda.is_available():
    _gpu_index = 1 if torch.cuda.device_count() > 1 else 0
    device = torch.device(f'cuda:{_gpu_index}')
else:
    device = torch.device('cpu')
device

device(type='cuda', index=1)

In [3]:
# Load the pre-processed train / test tables (paths are relative to the
# notebook's working directory).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/df_train00.csv', './data/df_test00.csv')
)

In [4]:
import warnings
warnings.filterwarnings(action='ignore')

import os
import gc
import math
import random
import pickle
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold

from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from imblearn.over_sampling import SMOTE, SMOTENC

In [5]:
# XGBoost hyper-parameter set.
# Options that were left disabled in the original cell: n_estimators=25,
# colsample_bytree=0.9, rate_drop=0.3.
# NOTE(review): max_depth=0 together with grow_policy='lossguide' presumably
# means "no depth limit" — confirm against the XGBoost parameter docs.
xgb_params = dict(
    booster='gbtree',
    grow_policy='lossguide',
    max_depth=0,
    learning_rate=0.4,
    reg_lambda=100,
    reg_alpha=10,
    subsample=0.9,
    num_parallel_tree=1,
    colsample_bynode=0.9,
)

In [6]:
# Seed used as `random_state` by the resampling step further down.
random_seed = 6327

# Maps class label -> sample count (300 for each of labels 0 and 1).
# NOTE(review): presumably meant as the per-class target counts for the
# oversampler's `sampling_strategy` — confirm against the resampling cell.
strategy = dict.fromkeys((0, 1), 300)

In [7]:
# Class distribution of the raw training labels.
train['class'].value_counts()

1    114
2     79
0     69
Name: class, dtype: int64

In [8]:
# Extra feature table that is appended column-wise to both train and test.
ae = pd.read_csv("./data/ae_values.csv")

# Keep only the training rows whose class is not 0, then shift the remaining
# labels down by one so they start at 0.
target_idx = train.index[train['class'] != 0].tolist()
train2 = train.iloc[target_idx].copy().reset_index(drop=True)
train2['class'] = train2['class'].sub(1)

# The first len(train2) rows of `ae` are attached to the filtered training
# rows; every remaining row is attached to the test rows.
# NOTE(review): this assumes `ae` is ordered as "filtered train rows, then
# test rows" — confirm against how ae_values.csv was produced.
_n_train_rows = len(train2)
train2 = pd.concat([train2, ae.iloc[:_n_train_rows]], axis=1)
test2 = pd.concat([test, ae.iloc[_n_train_rows:].reset_index(drop=True)], axis=1)
train2['class'].value_counts()

0    114
1     79
Name: class, dtype: int64

In [29]:
# Target vector and feature frames for the two-class problem.
y = (train2['class'].values).astype(int)
X = train2.drop(['id', 'class'], axis=1)
X_test = test2.drop(['id', 'class'], axis=1)

# The first 16 feature columns are categorical. Cast them with an explicit
# column -> dtype mapping: assigning through `.iloc[:, :16] = ...` may write
# the values in place and keep the old dtype, depending on the pandas version.
X = X.astype({col: 'category' for col in X.columns[:16]})
X_test = X_test.astype({col: 'category' for col in X_test.columns[:16]})

# Oversample with SMOTENC (SMOTE variant that handles categorical columns).
# `strategy` is passed explicitly so each class is resampled to the configured
# count; the recorded 600-row output of this cell matches 300 rows per class,
# which the default sampling strategy would not produce.
smote = SMOTENC(
    categorical_features=list(range(16)),
    sampling_strategy=strategy,
    random_state=random_seed,
)
X_train, y_train = smote.fit_resample(X, y)
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 40 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   trait         600 non-null    category
 1   SNP_01        600 non-null    category
 2   SNP_02        600 non-null    category
 3   SNP_03        600 non-null    category
 4   SNP_04        600 non-null    category
 5   SNP_05        600 non-null    category
 6   SNP_06        600 non-null    category
 7   SNP_07        600 non-null    category
 8   SNP_08        600 non-null    category
 9   SNP_09        600 non-null    category
 10  SNP_10        600 non-null    category
 11  SNP_11        600 non-null    category
 12  SNP_12        600 non-null    category
 13  SNP_13        600 non-null    category
 14  SNP_14        600 non-null    category
 15  SNP_15        600 non-null    category
 16  SNP_01_ratio  600 non-null    float64 
 17  SNP_02_ratio  600 non-null    float64 
 18  SNP_03_rat

In [30]:
def train_on_gpu(iterations=100, learning_rate=0.03, devices='0:5',
                 random_state=None, verbose=10):
    """Fit a CatBoost classifier on the resampled training data and print a macro-F1.

    The function reads the notebook-level variables ``X_train`` / ``y_train``
    (data the model is fitted on) and ``X`` / ``y`` (data the F1 score is
    computed on). Calling it without arguments behaves exactly as before.

    Caveats:
        * ``eval_set`` is the training data itself, so the "test" column of
          the training log follows the training loss; it is not a held-out
          estimate.
        * The printed macro-F1 is computed on ``X`` / ``y``, the data the
          resampled training set was built from, so it is optimistic.

    Args:
        iterations: Number of boosting iterations.
        learning_rate: CatBoost learning rate.
        devices: GPU device spec forwarded to CatBoost's ``devices`` option.
            NOTE(review): per the CatBoost docs ':' separates individual
            device ids, so the default '0:5' presumably selects GPUs 0 and 5
            — confirm this matches the host.
        random_state: Optional seed forwarded to CatBoost. ``None`` (default)
            leaves the seed unset, as in the original code.
        verbose: Logging period (in iterations) passed to ``fit``.

    Returns:
        The fitted ``CatBoostClassifier``.
    """
    # The first 16 columns of the training frame are the categorical features.
    var_categ = X_train.columns.tolist()[:16]

    params = dict(
        cat_features=var_categ,
        iterations=iterations,
        learning_rate=learning_rate,
        task_type='GPU',
        devices=devices,
    )
    # Only forward a seed when the caller asked for one.
    if random_state is not None:
        params['random_state'] = random_state

    model = CatBoostClassifier(**params)
    model.fit(
        X_train, y_train,
        eval_set=(X_train, y_train),
        verbose=verbose,
    )

    pred = model.predict(X)
    score = f1_score(y, pred, average='macro')
    print(score)

    return model

In [31]:
# Fit the classifier with the default settings; the bare `model` expression
# below only echoes the fitted object's repr.
model = train_on_gpu()
model

0:	learn: 0.6385110	test: 0.6385110	best: 0.6385110 (0)	total: 12.3ms	remaining: 1.22s
10:	learn: 0.3237788	test: 0.3237788	best: 0.3237788 (10)	total: 116ms	remaining: 942ms
20:	learn: 0.1878876	test: 0.1878876	best: 0.1878876 (20)	total: 222ms	remaining: 835ms
30:	learn: 0.1212826	test: 0.1212826	best: 0.1212826 (30)	total: 326ms	remaining: 725ms
40:	learn: 0.0880284	test: 0.0880284	best: 0.0880284 (40)	total: 434ms	remaining: 624ms
50:	learn: 0.0673039	test: 0.0673039	best: 0.0673039 (50)	total: 542ms	remaining: 521ms
60:	learn: 0.0544920	test: 0.0544053	best: 0.0544053 (60)	total: 650ms	remaining: 416ms
70:	learn: 0.0453372	test: 0.0453951	best: 0.0453951 (70)	total: 760ms	remaining: 311ms
80:	learn: 0.0388985	test: 0.0390147	best: 0.0390147 (80)	total: 877ms	remaining: 206ms
90:	learn: 0.0339114	test: 0.0341780	best: 0.0341780 (90)	total: 995ms	remaining: 98.4ms
99:	learn: 0.0304334	test: 0.0306610	best: 0.0306610 (99)	total: 1.1s	remaining: 0us
bestTest = 0.03066099803
bestIterat

<catboost.core.CatBoostClassifier at 0x7fd0e8eccf10>

In [15]:
# Predict the test rows and add 1 to undo the earlier `class - 1` shift, so
# the values use the original label coding again.
# NOTE(review): this cell's execution count (15) is lower than the training
# cell's (31) — re-run the notebook top to bottom to confirm the recorded
# output comes from the model defined above.
answer = model.predict(X_test) + 1 
answer

array([2, 1, 2, 2, 1, 2, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1,
       1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 2, 1,
       1, 1, 1, 2, 1, 2, 2, 1, 1, 1, 1, 1, 2, 2, 1, 2, 2, 1, 2, 2, 2, 1,
       1, 2, 2, 1, 1, 1, 2, 1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1,
       2, 2, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 2, 1, 1, 2, 1, 2, 2, 1, 1, 1,
       2, 1, 1, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1, 2, 2, 1, 1, 1, 2, 1, 2, 1,
       1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1,
       1, 2, 1, 1, 2, 1, 1, 2, 2, 1, 1, 1, 2, 1, 1, 2, 1, 2, 2, 1, 1])

In [16]:
# Load an earlier submission file. The cells below use it to pick the rows
# labelled 'A' and as the template for the new submission frame.
high_one = pd.read_csv("submit_0.98142.csv")
high_one

Unnamed: 0,id,class
0,TEST_000,A
1,TEST_001,B
2,TEST_002,C
3,TEST_003,C
4,TEST_004,A
...,...,...
170,TEST_170,B
171,TEST_171,C
172,TEST_172,C
173,TEST_173,B


In [19]:
# Rows that the earlier submission labelled 'A' are set to class code 0 in
# the prediction array (in place); all other predictions are left as they are.
A_index = high_one.index[high_one['class'] == 'A'].tolist()
answer[A_index] = 0
answer

array([0, 1, 2, 2, 0, 2, 2, 1, 0, 0, 2, 1, 1, 0, 1, 1, 0, 1, 1, 2, 1, 1,
       1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 2, 0, 1, 2, 1, 1, 2, 0, 1, 2, 1,
       1, 1, 1, 2, 1, 2, 0, 1, 0, 1, 1, 1, 2, 0, 1, 2, 0, 1, 2, 2, 2, 0,
       1, 0, 0, 1, 1, 1, 0, 0, 2, 1, 2, 1, 1, 1, 2, 1, 0, 1, 1, 1, 1, 1,
       2, 0, 1, 1, 2, 1, 1, 2, 0, 1, 0, 2, 0, 1, 1, 2, 0, 0, 2, 1, 0, 1,
       2, 1, 1, 1, 1, 0, 0, 2, 1, 2, 0, 1, 1, 2, 2, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 2, 0, 0, 1, 0, 0, 0, 2, 1, 1, 1, 0, 1, 2, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 2, 0, 1, 2, 2, 1, 0, 0, 2, 1, 1, 0, 1, 2, 2, 1, 1])

In [22]:
# Start from a copy of the earlier submission and replace its labels with the
# combined predictions, translated from class codes to letters:
# 0 -> 'A', 1 -> 'B', anything else -> 'C'.
_CODE_TO_LETTER = {0: 'A', 1: 'B'}

submit = high_one.copy()
submit['class'] = answer
submit['class'] = submit['class'].map(lambda code: _CODE_TO_LETTER.get(code, 'C'))
submit

Unnamed: 0,id,class
0,TEST_000,A
1,TEST_001,B
2,TEST_002,C
3,TEST_003,C
4,TEST_004,A
...,...,...
170,TEST_170,B
171,TEST_171,C
172,TEST_172,C
173,TEST_173,B


In [23]:
# Number of rows on which the new submission agrees with the earlier one.
sum(high_one['class'] == submit['class'])

173

In [24]:
# Label distribution of the new submission.
submit['class'].value_counts()

B    84
A    51
C    40
Name: class, dtype: int64

In [25]:
# Label distribution of the earlier submission, for comparison.
high_one['class'].value_counts()

B    84
A    51
C    40
Name: class, dtype: int64

In [27]:
# Write the new submission to disk without the DataFrame index column.
submit.to_csv("submit.csv", index=False)