In [1]:
import warnings
warnings.filterwarnings(action='ignore')

import os
import gc
import math
import random
import pickle
import pandas as pd
import numpy as np
import multiprocessing
from tqdm.auto import tqdm

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler

from transformers import get_cosine_schedule_with_warmup

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset, sampler

from imblearn.over_sampling import SMOTE

In [2]:
# Expose GPUs 0, 1 and 2 to CUDA, with devices enumerated by PCI bus id.
gpu_id = "0,1,2"
os.environ.update(
    CUDA_DEVICE_ORDER="PCI_BUS_ID",
    CUDA_VISIBLE_DEVICES=gpu_id,
)

In [3]:
# Use the GPU at index 1 when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:1')
else:
    device = torch.device('cpu')
device

device(type='cuda', index=1)

In [4]:
# Load the training and test tables from the local data directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/df_train00.csv', './data/df_test00.csv')
)

In [5]:
import warnings
warnings.filterwarnings(action='ignore')

import os
import gc
import math
import random
import pickle
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold

from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from imblearn.over_sampling import SMOTE, SMOTENC

In [6]:
# XGBoost hyper-parameters. Not referenced by the other cells shown in this
# notebook, which train CatBoost models instead.
# Currently disabled options: n_estimators=25, colsample_bytree=0.9, rate_drop=0.3
xgb_params = dict(
    booster='gbtree',
    grow_policy='lossguide',
    max_depth=0,
    learning_rate=0.4,
    reg_lambda=100,
    reg_alpha=10,
    subsample=0.9,
    num_parallel_tree=1,
    colsample_bynode=0.9,
)

In [7]:
# Initial seed and per-class SMOTE target counts. Both names are reassigned
# by the later training cells before they are used.
random_seed, strategy = 6327, {0: 1200, 1: 1000}

In [8]:
# Show how many training rows fall into each value of `class`.
train['class'].value_counts()

1    114
2     79
0     69
Name: class, dtype: int64

In [9]:
# Build the two-class training table (`train2`) and the matching test table
# (`test2`), each extended column-wise with the values from ae_values.csv.
ae = pd.read_csv("./data/ae_values.csv")

# Keep only the rows whose class is not 0. A boolean mask is used for the
# selection so the result does not depend on `train` having a default
# 0..n-1 index (index labels were previously fed to positional `.iloc`).
keep_mask = train['class'] != 0
target_idx = train.index[keep_mask].tolist()  # retained for cells that still read it
train2 = train.loc[keep_mask].copy().reset_index(drop=True)

# Shift the remaining labels down by one (1 -> 0, 2 -> 1).
train2['class'] = train2['class'] - 1

# Capture the row count once, before `train2` is rebound by the concat.
# NOTE(review): assumes ae_values.csv holds one row per kept training row
# followed by one row per test row, in the same order — TODO confirm.
n_kept = len(train2)
train2 = pd.concat([train2, ae.iloc[:n_kept]], axis=1)
test2 = pd.concat([test, ae.iloc[n_kept:].reset_index(drop=True)], axis=1)
train2['class'].value_counts()

0    114
1     79
Name: class, dtype: int64

In [10]:
# Inspect column names, dtypes and non-null counts of the merged training table.
train2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 42 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            193 non-null    object 
 1   trait         193 non-null    int64  
 2   SNP_01        193 non-null    int64  
 3   SNP_02        193 non-null    int64  
 4   SNP_03        193 non-null    int64  
 5   SNP_04        193 non-null    int64  
 6   SNP_05        193 non-null    int64  
 7   SNP_06        193 non-null    int64  
 8   SNP_07        193 non-null    int64  
 9   SNP_08        193 non-null    int64  
 10  SNP_09        193 non-null    int64  
 11  SNP_10        193 non-null    int64  
 12  SNP_11        193 non-null    int64  
 13  SNP_12        193 non-null    int64  
 14  SNP_13        193 non-null    int64  
 15  SNP_14        193 non-null    int64  
 16  SNP_15        193 non-null    int64  
 17  class         193 non-null    int64  
 18  SNP_01_ratio  193 non-null    

In [12]:
# Out-of-fold (OOF) training of a CatBoost classifier with SMOTENC
# oversampling applied inside each fold's training split.
random_seed = 5833

# Labels and features: the identifier and target columns are dropped, and the
# first 16 remaining columns are treated as categorical everywhere below.
# NOTE(review): assigning through `.iloc` may leave the original dtype in
# place depending on the pandas version — confirm the columns really become
# `category`.
y = train2['class'].values
X = train2.drop(columns=['id', 'class'])
X.iloc[:, :16] = X.iloc[:, :16].astype('category')
X_test = test2.drop(columns=['id', 'class'])
X_test.iloc[:, :16] = X_test.iloc[:, :16].astype('category')

skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=random_seed)

# One probability column per class; validation rows are filled fold by fold,
# test rows accumulate the mean over folds.
oof_val_preds = np.zeros((len(X), 2))
oof_test_preds = np.zeros((len(X_test), 2))
strategy = {0: 300, 1: 300}

for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):

    # Split this fold into training and validation parts.
    X_train, y_train = X.iloc[train_idx], y[train_idx]
    X_valid, y_valid = X.iloc[valid_idx], y[valid_idx]

    # Oversample the training part only, up to the counts in `strategy`.
    smote = SMOTENC(
        categorical_features=list(range(16)),
        random_state=random_seed,
        sampling_strategy=strategy,
    )
    X_train, y_train = smote.fit_resample(X_train, y_train)

    # Class-imbalance ratio: count of label 0 / count of label 1.
    # It is computed here but not passed to the model.
    _, counts = np.unique(np.asarray(y_train), return_counts=True)
    scale_weight = counts[0] / counts[1]

    # No seed is forwarded to CatBoost; it runs with its own default.
    var_categ = list(X_train.columns[:16])
    model = CatBoostClassifier(
        cat_features=var_categ,
        iterations=1000,
        learning_rate=0.3,
        task_type='GPU',
        devices='0',
    )
    model.fit(X_train, y_train, eval_set=(X_valid, y_valid), verbose=10)

    oof_test_preds += model.predict_proba(X_test) / skf.n_splits
    oof_val_preds[valid_idx] += model.predict_proba(X_valid)

    # Release the per-fold objects before the next iteration.
    del X_train, y_train, X_valid, y_valid, model
    gc.collect()

# Score the out-of-fold predictions: most probable class per row, macro F1.
preds = oof_val_preds.argmax(axis=1)
print(f1_score(y, preds, average="macro"))

0:	learn: 0.2629063	test: 0.4242608	best: 0.4242608 (0)	total: 35.5ms	remaining: 35.4s
10:	learn: 0.0197517	test: 0.3251447	best: 0.2997539 (2)	total: 341ms	remaining: 30.7s
20:	learn: 0.0081838	test: 0.2983886	best: 0.2983886 (20)	total: 688ms	remaining: 32.1s
30:	learn: 0.0050346	test: 0.2987223	best: 0.2878101 (22)	total: 1.03s	remaining: 32.2s
40:	learn: 0.0037586	test: 0.2883975	best: 0.2878101 (22)	total: 1.37s	remaining: 32.1s
50:	learn: 0.0030016	test: 0.2843752	best: 0.2843752 (50)	total: 1.71s	remaining: 31.8s
60:	learn: 0.0021736	test: 0.2877661	best: 0.2843752 (50)	total: 2.05s	remaining: 31.5s
70:	learn: 0.0019054	test: 0.2845503	best: 0.2826143 (66)	total: 2.39s	remaining: 31.3s
80:	learn: 0.0015409	test: 0.2875962	best: 0.2826143 (66)	total: 2.73s	remaining: 31s
90:	learn: 0.0014396	test: 0.2851300	best: 0.2826143 (66)	total: 3.07s	remaining: 30.6s
100:	learn: 0.0013288	test: 0.2839347	best: 0.2826143 (66)	total: 3.41s	remaining: 30.3s
110:	learn: 0.0011779	test: 0.28548

In [20]:
# Out-of-fold (OOF) training of a CatBoost *regressor* on the 0/1 labels, with
# SMOTENC oversampling applied inside each fold's training split. The
# continuous outputs are converted back to class labels for scoring.
random_seed = 5833

# Labels and features: the identifier and target columns are dropped, and the
# first 16 remaining columns are treated as categorical everywhere below.
# NOTE(review): assigning through `.iloc` may leave the original dtype in
# place depending on the pandas version — confirm the columns really become
# `category`.
y = train2['class'].values
X = train2.drop(['id', 'class'], axis=1)
X.iloc[:,:16] = X.iloc[:,:16].astype('category')
X_test = test2.drop(['id', 'class'], axis=1)
X_test.iloc[:,:16] = X_test.iloc[:,:16].astype('category')

skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=random_seed)

# 1-D buffers: one regression value per row.
oof_val_preds = np.zeros((X.shape[0]))
oof_test_preds = np.zeros((X_test.shape[0]))
strategy = {0:300, 1:300}

for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):

    # Split this fold into training and validation parts.
    X_train, y_train = X.iloc[train_idx], y[train_idx]
    X_valid, y_valid = X.iloc[valid_idx], y[valid_idx]

    # Oversample the training part only, up to the counts in `strategy`.
    smote = SMOTENC(
        categorical_features=list(range(16)),
        random_state=random_seed,
        sampling_strategy=strategy,
    )
    X_train, y_train = smote.fit_resample(X_train, y_train)

    # Class-imbalance ratio: count of label 0 / count of label 1.
    # It is computed here but not passed to the model.
    _, counts = np.unique(np.array(y_train), return_counts=True)
    scale_weight = counts[0] / counts[1]

    # No seed is forwarded to CatBoost; it runs with its own default.
    var_categ = X_train.columns.tolist()[:16]
    model = CatBoostRegressor(
        cat_features=var_categ,
        iterations=1000,
        learning_rate=0.3,
        task_type='GPU',
        devices='1',
    )
    model.fit(
        X_train, y_train,
        eval_set=(X_valid, y_valid),
        verbose=10,
    )

    # Test predictions are averaged over folds; validation predictions fill
    # only the rows held out in this fold.
    oof_test_preds += model.predict(X_test) / skf.n_splits
    oof_val_preds[valid_idx] += model.predict(X_valid)

    # Release the per-fold objects before the next iteration.
    del X_train, y_train, X_valid, y_valid, model
    gc.collect()

# Turn the regression outputs into class labels before scoring. The outputs
# are not bounded to [0, 1], so plain rounding could emit labels such as -1
# or 2; those would be counted as extra classes and distort the macro F1.
# Rounding is therefore followed by a clip to the valid label range.
preds = np.clip(np.rint(oof_val_preds), 0, 1).astype(int)
print(f1_score(y, preds, average="macro"))

0:	learn: 0.3725845	test: 0.4168920	best: 0.4168920 (0)	total: 24.1ms	remaining: 24s
10:	learn: 0.1038881	test: 0.2806606	best: 0.2806606 (10)	total: 234ms	remaining: 21s
20:	learn: 0.0699196	test: 0.2804476	best: 0.2780575 (16)	total: 454ms	remaining: 21.1s
30:	learn: 0.0609026	test: 0.2809278	best: 0.2780575 (16)	total: 667ms	remaining: 20.9s
40:	learn: 0.0528208	test: 0.2788610	best: 0.2780575 (16)	total: 878ms	remaining: 20.5s
50:	learn: 0.0468848	test: 0.2778379	best: 0.2776868 (49)	total: 1.09s	remaining: 20.3s
60:	learn: 0.0431683	test: 0.2775842	best: 0.2768607 (51)	total: 1.32s	remaining: 20.4s
70:	learn: 0.0404170	test: 0.2758356	best: 0.2756298 (66)	total: 1.55s	remaining: 20.3s
80:	learn: 0.0384035	test: 0.2762220	best: 0.2756298 (66)	total: 1.77s	remaining: 20.1s
90:	learn: 0.0373753	test: 0.2765068	best: 0.2756298 (66)	total: 1.97s	remaining: 19.7s
100:	learn: 0.0363767	test: 0.2768822	best: 0.2756298 (66)	total: 2.19s	remaining: 19.5s
110:	learn: 0.0350635	test: 0.276628

In [19]:
# Shape check on the accumulated test predictions.
# NOTE(review): the output recorded for this cell does not match the 1-D
# buffer allocated by the regressor cell (`np.zeros((X_test.shape[0]))`);
# presumably it comes from an earlier run — re-execute to confirm.
oof_test_preds.shape

(175, 1)

In [17]:
# Shape check on a single model's test predictions.
# NOTE(review): both OOF training cells `del` the name `model` at the end of
# every fold, so on a fresh top-to-bottom run this line would presumably
# raise NameError — confirm, then remove or relocate this cell.
model.predict(X_test).shape

(175,)

In [None]:
# Prepare the full (non-fold) training set for the final model: rebuild the
# feature/label arrays and oversample them with SMOTENC. `random_seed` and
# `strategy` are whatever the most recently executed cell assigned.
# NOTE(review): assigning through `.iloc` may leave the original dtype in
# place depending on the pandas version — confirm the columns really become
# `category`.
y = train2['class'].values.astype(int)
X = train2.drop(columns=['id', 'class'])
X.iloc[:, :16] = X.iloc[:, :16].astype('category')
X_test = test2.drop(columns=['id', 'class'])
X_test.iloc[:, :16] = X_test.iloc[:, :16].astype('category')

# The first 16 feature columns are declared categorical to the sampler.
smote = SMOTENC(
    categorical_features=list(range(16)),
    random_state=random_seed,
    sampling_strategy=strategy,
)
X_train, y_train = smote.fit_resample(X, y)
X_train.info()

In [None]:
def train_on_gpu(iterations=100, learning_rate=0.03, devices='0:1:2'):
    """Fit a CatBoost classifier on the resampled training data and return it.

    The function reads notebook-level variables rather than taking data as
    arguments: it fits on ``X_train`` / ``y_train`` and then scores on
    ``X`` / ``y``. The first 16 columns of ``X_train`` are passed as
    categorical features.

    The printed macro F1 is computed on ``X`` / ``y``, the data the resampled
    training set was derived from, so it is a training-set score and not an
    estimate of generalisation. Likewise ``eval_set`` is the training data
    itself.

    Parameters
    ----------
    iterations : int, default 100
        Number of boosting iterations.
    learning_rate : float, default 0.03
        Boosting learning rate.
    devices : str, default '0:1:2'
        CatBoost GPU device specification. The previous hard-coded value
        '0:5' names device ids 0 and 5, but this notebook exposes only three
        GPUs through ``CUDA_VISIBLE_DEVICES="0,1,2"``, so the default now
        lists the three visible ids.

    Returns
    -------
    CatBoostClassifier
        The fitted model.
    """
    var_categ = X_train.columns.tolist()[:16]
    model = CatBoostClassifier(
        cat_features=var_categ,
        iterations=iterations,
        learning_rate=learning_rate,
        task_type='GPU',
        devices=devices,
    )

    model.fit(
        X_train, y_train,
        eval_set=(X_train, y_train),
        verbose=10,
    )

    # Training-set macro F1, printed as a quick sanity check.
    pred = model.predict(X)
    score = f1_score(y, pred, average='macro')
    print(score)

    return model

In [None]:
# Train the final model with the function's defaults and display it.
model = train_on_gpu()
model

In [None]:
# Predict labels for the test set. The +1 presumably undoes the `class - 1`
# shift applied when `train2` was built, mapping 0/1 back to 1/2.
answer = model.predict(X_test) + 1 
answer

In [None]:
# Load a reference prediction file (presumably an earlier submission, judging
# by the file name) and display it.
high_one = pd.read_csv("submit_0.99078.csv")
high_one

In [None]:
# Rows that the reference file labels 'A' are forced to class 0 in `answer`.
# The index labels are used as positions in `answer`, which relies on
# `high_one` keeping the default 0..n-1 index it was read with.
is_class_a = (high_one['class'] == 'A').to_numpy()
A_index = high_one.index[is_class_a].tolist()
answer[A_index] = 0
answer

In [None]:
# Build the submission frame: start from a copy of the reference file,
# overwrite `class` with the numeric predictions, then translate the codes to
# letters (0 -> 'A', 1 -> 'B', anything else -> 'C').
submit = high_one.copy()
submit['class'] = answer
submit['class'] = submit['class'].map(
    lambda code: {0: 'A', 1: 'B'}.get(code, 'C')
)
submit

In [None]:
# Number of rows where the new submission agrees with the reference file.
sum(high_one['class'] == submit['class'])

In [None]:
# Label distribution of the new submission.
submit['class'].value_counts()

In [None]:
# Label distribution of the reference file, for comparison.
high_one['class'].value_counts()

In [None]:
# Write the submission to disk without the index column.
submit.to_csv("submit.csv", index=False)