# CONCEPT 

- 기초 데이터부터 각 단계별 성능을 구체적으로 확인
- 제공받은 데이터만 썼을 경우, 파생변수 추가, AE 변수 추가, 데이터 증강의 효과 중 첫 번째 것

# STEP 01. EDA

In [1]:
import warnings
warnings.filterwarnings(action='ignore')

import os
import gc
import math
import random
import pickle
import pandas as pd
import numpy as np
import multiprocessing
from tqdm.auto import tqdm

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler

from transformers import get_cosine_schedule_with_warmup

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset, sampler

from imblearn.over_sampling import SMOTE

In [2]:
device = torch.device('cuda:1') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda', index=1)

In [3]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')              
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 262 entries, 0 to 261
Data columns (total 21 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      262 non-null    object
 1   father  262 non-null    int64 
 2   mother  262 non-null    int64 
 3   gender  262 non-null    int64 
 4   trait   262 non-null    int64 
 5   SNP_01  262 non-null    object
 6   SNP_02  262 non-null    object
 7   SNP_03  262 non-null    object
 8   SNP_04  262 non-null    object
 9   SNP_05  262 non-null    object
 10  SNP_06  262 non-null    object
 11  SNP_07  262 non-null    object
 12  SNP_08  262 non-null    object
 13  SNP_09  262 non-null    object
 14  SNP_10  262 non-null    object
 15  SNP_11  262 non-null    object
 16  SNP_12  262 non-null    object
 17  SNP_13  262 non-null    object
 18  SNP_14  262 non-null    object
 19  SNP_15  262 non-null    object
 20  class   262 non-null    object
dtypes: int64(4), object(17)
memory usage: 43.1+ KB


In [4]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175 entries, 0 to 174
Data columns (total 20 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      175 non-null    object
 1   father  175 non-null    int64 
 2   mother  175 non-null    int64 
 3   gender  175 non-null    int64 
 4   trait   175 non-null    int64 
 5   SNP_01  175 non-null    object
 6   SNP_02  175 non-null    object
 7   SNP_03  175 non-null    object
 8   SNP_04  175 non-null    object
 9   SNP_05  175 non-null    object
 10  SNP_06  175 non-null    object
 11  SNP_07  175 non-null    object
 12  SNP_08  175 non-null    object
 13  SNP_09  175 non-null    object
 14  SNP_10  175 non-null    object
 15  SNP_11  175 non-null    object
 16  SNP_12  175 non-null    object
 17  SNP_13  175 non-null    object
 18  SNP_14  175 non-null    object
 19  SNP_15  175 non-null    object
dtypes: int64(4), object(16)
memory usage: 27.5+ KB


In [5]:
train.describe()

Unnamed: 0,father,mother,gender,trait
count,262.0,262.0,262.0,262.0
mean,0.0,0.0,0.0,1.736641
std,0.0,0.0,0.0,0.441298
min,0.0,0.0,0.0,1.0
25%,0.0,0.0,0.0,1.0
50%,0.0,0.0,0.0,2.0
75%,0.0,0.0,0.0,2.0
max,0.0,0.0,0.0,2.0


In [6]:
test.describe()

Unnamed: 0,father,mother,gender,trait
count,175.0,175.0,175.0,175.0
mean,0.0,0.0,0.0,1.708571
std,0.0,0.0,0.0,0.455724
min,0.0,0.0,0.0,1.0
25%,0.0,0.0,0.0,1.0
50%,0.0,0.0,0.0,2.0
75%,0.0,0.0,0.0,2.0
max,0.0,0.0,0.0,2.0


In [7]:
train.drop(columns=['father', 'mother', 'gender'], inplace=True)
test.drop(columns=['father', 'mother', 'gender'], inplace=True)

### Summary 01
- father, mother, gender column은 무의미하므로, 삭제
- trait은 1 혹은 2밖에 존재하지 않음

## Categorical Features
- 전체 변수들을 CATEGORY 타입으로 변환

In [8]:
train.iloc[:, :-1] = train.iloc[:, :-1].astype('category')
test = test.astype('category')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 262 entries, 0 to 261
Data columns (total 18 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   id      262 non-null    category
 1   trait   262 non-null    category
 2   SNP_01  262 non-null    category
 3   SNP_02  262 non-null    category
 4   SNP_03  262 non-null    category
 5   SNP_04  262 non-null    category
 6   SNP_05  262 non-null    category
 7   SNP_06  262 non-null    category
 8   SNP_07  262 non-null    category
 9   SNP_08  262 non-null    category
 10  SNP_09  262 non-null    category
 11  SNP_10  262 non-null    category
 12  SNP_11  262 non-null    category
 13  SNP_12  262 non-null    category
 14  SNP_13  262 non-null    category
 15  SNP_14  262 non-null    category
 16  SNP_15  262 non-null    category
 17  class   262 non-null    object  
dtypes: category(17), object(1)
memory usage: 19.0+ KB


In [9]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175 entries, 0 to 174
Data columns (total 17 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   id      175 non-null    category
 1   trait   175 non-null    category
 2   SNP_01  175 non-null    category
 3   SNP_02  175 non-null    category
 4   SNP_03  175 non-null    category
 5   SNP_04  175 non-null    category
 6   SNP_05  175 non-null    category
 7   SNP_06  175 non-null    category
 8   SNP_07  175 non-null    category
 9   SNP_08  175 non-null    category
 10  SNP_09  175 non-null    category
 11  SNP_10  175 non-null    category
 12  SNP_11  175 non-null    category
 13  SNP_12  175 non-null    category
 14  SNP_13  175 non-null    category
 15  SNP_14  175 non-null    category
 16  SNP_15  175 non-null    category
dtypes: category(17)
memory usage: 10.7 KB


# STEP 02. MODELING & VALIDATION
- 데이터 증강여부에 따른 성능향상 유무를 확인
- Classifier와 Regressor를 동시에 사용해 자체적인 ensemble 효과 추가

0. All
1. A & notA
2. B & notB
3. C & notC

## 0. All
### without Aug

In [10]:
import warnings
warnings.filterwarnings(action='ignore')

import os
import gc
import math
import random
import pickle
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold

from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from imblearn.over_sampling import SMOTE, SMOTENC, SMOTEN

In [11]:
X, y = train.drop(columns=['id', 'class']), train['class'].map(lambda x : 0 if x=='A' else (1 if x=='B' else 2))
X_test = test.drop(columns=['id'])
X

Unnamed: 0,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,SNP_10,SNP_11,SNP_12,SNP_13,SNP_14,SNP_15
0,2,G G,A G,A A,G A,C A,A A,A A,G G,A A,G G,A G,A A,A A,A A,A A
1,2,A G,A G,C A,A A,A A,A G,A A,G A,A A,A G,A A,G A,G G,A A,A A
2,2,G G,G G,A A,G A,C C,G G,A A,G A,G A,A G,A A,A A,A A,A A,A A
3,1,A A,G G,A A,G A,A A,G G,G G,A A,G G,A G,G G,G G,G G,A A,G G
4,2,G G,G G,C C,A A,C C,A A,A A,A A,A A,G G,A A,A A,A G,A A,G A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257,2,A G,A G,A A,G A,C C,A G,A A,G A,A A,G G,A G,G A,A A,A A,A A
258,2,G G,A A,C A,A A,A A,A G,G A,G A,A A,A G,A G,A A,A G,A A,G A
259,1,A G,G G,A A,G A,A A,A G,G G,G A,G A,A A,G G,G G,G G,C A,G G
260,1,A A,G G,A A,G A,A A,G G,G G,A A,G A,A G,A G,G A,G G,C A,G G


In [12]:
y

0      1
1      2
2      1
3      0
4      2
      ..
257    1
258    2
259    0
260    0
261    1
Name: class, Length: 262, dtype: int64

In [13]:
X_test

Unnamed: 0,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,SNP_10,SNP_11,SNP_12,SNP_13,SNP_14,SNP_15
0,1,A G,G G,A A,G A,A A,A G,G G,G A,G A,A G,A G,G A,G G,C A,G A
1,2,G G,A G,C C,G G,C C,A A,A A,A A,A A,G G,A G,A A,A A,A A,A A
2,2,G G,A G,A A,A A,C A,A G,A A,A A,A A,A G,A A,G A,G G,A A,G G
3,2,G G,A G,C A,A A,C C,A A,A A,A A,A A,G G,A A,G A,A G,A A,A A
4,1,A A,G G,A A,G G,A A,G G,G G,A A,G G,A G,G G,G A,G G,A A,G G
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170,2,A G,G G,C C,A A,C A,A G,A A,G G,A A,G G,G G,A A,A A,A A,G A
171,2,G G,A A,A A,A A,C A,A G,A A,A A,A A,A G,A A,A A,A G,A A,G A
172,2,G G,A A,A A,A A,C A,A G,A A,A A,A A,G G,A G,A A,A G,A A,G G
173,2,A G,G G,C A,G A,C C,G G,A A,G A,A A,G G,A G,A A,A A,A A,A A


In [45]:
def catgbmc(inputX, inputY, validX, validY, params) :  
    var_categ = inputX.columns.tolist()[:16]
    model = CatBoostClassifier(
        cat_features=var_categ,
        **params,
        task_type='GPU',
        devices='0',
        # l2_leaf_reg=10
        # random_state=random_seed
        )
  
    model.fit(
        inputX, inputY,
        eval_set=(inputX, inputY),
        verbose=0
        );     

    pred = model.predict(validX)
    score = f1_score(validY, pred, average='macro')
    print(score)
    
    return model

In [46]:
def catgbmr(inputX, inputY, validX, validY, params) :  
    var_categ = inputX.columns.tolist()[:16]
    model = CatBoostRegressor(
        cat_features=var_categ,
        **params,
        task_type='GPU',
        devices='0',
        # l2_leaf_reg=10
        # random_state=random_seed
        )
  
    model.fit(
        inputX, inputY,
        eval_set=(inputX, inputY),
        verbose=0
        );     

    pred = model.predict(validX)
    score = f1_score(validY, np.round(pred), average='macro')
    print(score)
    
    return model

In [33]:
params = {'iterations':1000,
          'learning_rate':0.5}

model_cls = catgbmc(X, y, X, y, params)
model_reg = catgbmr(X, y, X, y,params)

0.9820323046129498
0.9785698423273373


In [35]:
params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 10,
          'auto_class_weights' : 'Balanced'}

model_cls = catgbmc(X, y, X, y, params)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 10}

model_reg = catgbmr(X, y, X, y, params)

0.989305405480287
0.9820323046129498


In [36]:
params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 10,
          'auto_class_weights' : 'SqrtBalanced'}

model_cls = catgbmc(X, y, X, y, params)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100}

model_reg = catgbmr(X, y, X, y, params)

0.9857132282182248
0.989305405480287


In [37]:
params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100,
          'auto_class_weights' : 'Balanced'}

model_cls = catgbmc(X, y, X, y, params)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 1000}

model_reg = catgbmr(X, y, X, y, params)

0.9893443754313319
0.9567845947156292


### with Aug

In [38]:
train['class'].value_counts()

B    114
C     79
A     69
Name: class, dtype: int64

In [41]:
strategy1 = {0:1000, 1:1000, 2:1000}
strategy2 = {0:1000, 1:1200, 2:1100}
strategy3 = {0:690, 1:1140, 2:790}

smote1 = SMOTEN(sampling_strategy=strategy1)
smote2 = SMOTEN(sampling_strategy=strategy2)
smote3 = SMOTEN(sampling_strategy=strategy3)

X1, y1 = smote1.fit_resample(X, y)
X2, y2 = smote2.fit_resample(X, y)
X3, y3 = smote3.fit_resample(X, y)

In [47]:
params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100,
          'auto_class_weights' : 'Balanced'}

model_cls = catgbmc(X1, y1, X, y, params)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100}

model_reg = catgbmr(X1, y1, X, y, params)

1.0
0.9964351351600956


In [48]:
params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100,
          'auto_class_weights' : 'Balanced'}

model_cls = catgbmc(X2, y2, X, y, params)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100}

model_reg = catgbmr(X2, y2, X, y, params)

0.9964351351600956
0.9964351351600956


In [49]:
params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100,
          'auto_class_weights' : 'Balanced'}

model_cls = catgbmc(X3, y3, X, y, params)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100}

model_reg = catgbmr(X3, y3, X, y, params)

0.9964351351600956
0.9964351351600956


### Summary
- 단일 모델로 모든 클래스를 분류하는 것은 학습 데이터에 대해서는 충분히 최적화가 가능하나, 이 성능이 평가셋으로 그대로 전이되지 않을 수 있음
- 현재 각 클래스 별로 동일한 사이즈(1,000)로 증강한 뒤 분류 모델로 학습 및 평가한 결과가 가장 높은 성능을 보임

## 01. A & notA
### without Aug

In [53]:
X, y = train.drop(columns=['id', 'class']), (train['class'].values == 'A').astype(int)
X_test = test.drop(columns=['id'])
display(train, y)

Unnamed: 0,id,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,SNP_10,SNP_11,SNP_12,SNP_13,SNP_14,SNP_15,class
0,TRAIN_000,2,G G,A G,A A,G A,C A,A A,A A,G G,A A,G G,A G,A A,A A,A A,A A,B
1,TRAIN_001,2,A G,A G,C A,A A,A A,A G,A A,G A,A A,A G,A A,G A,G G,A A,A A,C
2,TRAIN_002,2,G G,G G,A A,G A,C C,G G,A A,G A,G A,A G,A A,A A,A A,A A,A A,B
3,TRAIN_003,1,A A,G G,A A,G A,A A,G G,G G,A A,G G,A G,G G,G G,G G,A A,G G,A
4,TRAIN_004,2,G G,G G,C C,A A,C C,A A,A A,A A,A A,G G,A A,A A,A G,A A,G A,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257,TRAIN_257,2,A G,A G,A A,G A,C C,A G,A A,G A,A A,G G,A G,G A,A A,A A,A A,B
258,TRAIN_258,2,G G,A A,C A,A A,A A,A G,G A,G A,A A,A G,A G,A A,A G,A A,G A,C
259,TRAIN_259,1,A G,G G,A A,G A,A A,A G,G G,G A,G A,A A,G G,G G,G G,C A,G G,A
260,TRAIN_260,1,A A,G G,A A,G A,A A,G G,G G,A A,G A,A G,A G,G A,G G,C A,G G,A


array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0])

In [54]:
def catgbmc(inputX, inputY, validX, validY, params) :  
    var_categ = inputX.columns.tolist()
    model = CatBoostClassifier(
        cat_features=var_categ,
        **params,
        task_type='GPU',
        devices='0',
        )
  
    model.fit(
        inputX, inputY,
        eval_set=(inputX, inputY),
        verbose=0
        );     

    pred = model.predict(validX)
    score = f1_score(validY, pred, average='macro')
    print(score)
    
    return model

In [55]:
def catgbmr(inputX, inputY, validX, validY, params) :  
    var_categ = inputX.columns.tolist()
    model = CatBoostRegressor(
        cat_features=var_categ,
        **params,
        task_type='GPU',
        devices='0',
        )
  
    model.fit(
        inputX, inputY,
        eval_set=(inputX, inputY),
        verbose=0
        );     

    pred = model.predict(validX)
    score = f1_score(validY, np.round(pred), average='macro')
    print(score)
    
    return model

In [56]:
params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100,
          'auto_class_weights' : 'SqrtBalanced'}

model_cls = catgbmc(X, y, X, y, params)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100}

model_reg = catgbmr(X, y, X, y, params)

1.0
1.0


In [57]:
params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100,
          'auto_class_weights' : 'Balanced'}

model_cls = catgbmc(X, y, X, y, params)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 120}

model_reg = catgbmr(X, y, X, y, params)

1.0
1.0


### Summary
- A & notA 를 분류하는 것은 증강없이 1의 성능을 확인하였으므로, 증강을 진행하지 않음

## 02. B & notB
### without Aug

In [58]:
X, y = train.drop(columns=['id', 'class']), (train['class'].values == 'B').astype(int)
X_test = test.drop(columns=['id'])
display(train, y)

Unnamed: 0,id,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,SNP_10,SNP_11,SNP_12,SNP_13,SNP_14,SNP_15,class
0,TRAIN_000,2,G G,A G,A A,G A,C A,A A,A A,G G,A A,G G,A G,A A,A A,A A,A A,B
1,TRAIN_001,2,A G,A G,C A,A A,A A,A G,A A,G A,A A,A G,A A,G A,G G,A A,A A,C
2,TRAIN_002,2,G G,G G,A A,G A,C C,G G,A A,G A,G A,A G,A A,A A,A A,A A,A A,B
3,TRAIN_003,1,A A,G G,A A,G A,A A,G G,G G,A A,G G,A G,G G,G G,G G,A A,G G,A
4,TRAIN_004,2,G G,G G,C C,A A,C C,A A,A A,A A,A A,G G,A A,A A,A G,A A,G A,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257,TRAIN_257,2,A G,A G,A A,G A,C C,A G,A A,G A,A A,G G,A G,G A,A A,A A,A A,B
258,TRAIN_258,2,G G,A A,C A,A A,A A,A G,G A,G A,A A,A G,A G,A A,A G,A A,G A,C
259,TRAIN_259,1,A G,G G,A A,G A,A A,A G,G G,G A,G A,A A,G G,G G,G G,C A,G G,A
260,TRAIN_260,1,A A,G G,A A,G A,A A,G G,G G,A A,G A,A G,A G,G A,G G,C A,G G,A


array([1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1])

In [59]:
params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 120,
          'auto_class_weights' : 'Balanced'}

model_cls = catgbmc(X, y, X, y, params)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 120}

model_reg = catgbmr(X, y, X, y, params)

0.9844713134186818
0.9766585496228544


In [60]:
params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100,
          'auto_class_weights' : 'SqrtBalanced'}

model_cls = catgbmc(X, y, X, y, params)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100}

model_reg = catgbmr(X, y, X, y, params)

0.9844713134186818
0.9766585496228544


### with Aug

In [62]:
pd.DataFrame(train['class'].values=='B').astype(int).value_counts()

0    148
1    114
dtype: int64

In [63]:
strategy1 = {0:1000, 1:1000}
strategy2 = {0:1000, 1:1200}
strategy3 = {0:1480, 1:1140}

smote1 = SMOTEN(sampling_strategy=strategy1)
smote2 = SMOTEN(sampling_strategy=strategy2)
smote3 = SMOTEN(sampling_strategy=strategy3)

X1, y1 = smote1.fit_resample(X, y)
X2, y2 = smote2.fit_resample(X, y)
X3, y3 = smote3.fit_resample(X, y)

In [64]:
params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100,
          'auto_class_weights' : 'Balanced'}

model_cls = catgbmc(X1, y1, X, y, params)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100}

model_reg = catgbmr(X1, y1, X, y, params)

0.9961138551446922
0.9883415654340764


In [65]:
params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100,
          'auto_class_weights' : 'Balanced'}

model_cls = catgbmc(X2, y2, X, y, params)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100}

model_reg = catgbmr(X2, y2, X, y, params)

1.0
0.9767069701280229


In [66]:
params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100,
          'auto_class_weights' : 'Balanced'}

model_cls = catgbmc(X3, y3, X, y, params)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100}

model_reg = catgbmr(X3, y3, X, y, params)

1.0
0.9922356567093409


### Summary 
- B & notB 는 오히려 데이터 양을 불균등하게 할 경우, 성능이 최고치

## 03. C & notC
### without Aug

In [67]:
X, y = train.drop(columns=['id', 'class']), (train['class'].values == 'C').astype(int)
X_test = test.drop(columns=['id'])
display(train, y)

Unnamed: 0,id,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,SNP_10,SNP_11,SNP_12,SNP_13,SNP_14,SNP_15,class
0,TRAIN_000,2,G G,A G,A A,G A,C A,A A,A A,G G,A A,G G,A G,A A,A A,A A,A A,B
1,TRAIN_001,2,A G,A G,C A,A A,A A,A G,A A,G A,A A,A G,A A,G A,G G,A A,A A,C
2,TRAIN_002,2,G G,G G,A A,G A,C C,G G,A A,G A,G A,A G,A A,A A,A A,A A,A A,B
3,TRAIN_003,1,A A,G G,A A,G A,A A,G G,G G,A A,G G,A G,G G,G G,G G,A A,G G,A
4,TRAIN_004,2,G G,G G,C C,A A,C C,A A,A A,A A,A A,G G,A A,A A,A G,A A,G A,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257,TRAIN_257,2,A G,A G,A A,G A,C C,A G,A A,G A,A A,G G,A G,G A,A A,A A,A A,B
258,TRAIN_258,2,G G,A A,C A,A A,A A,A G,G A,G A,A A,A G,A G,A A,A G,A A,G A,C
259,TRAIN_259,1,A G,G G,A A,G A,A A,A G,G G,G A,G A,A A,G G,G G,G G,C A,G G,A
260,TRAIN_260,1,A A,G G,A A,G A,A A,G G,G G,A A,G A,A G,A G,G A,G G,C A,G G,A


array([0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0])

In [68]:
params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 120,
          'auto_class_weights' : 'Balanced'}

model_cls = catgbmc(X, y, X, y, params)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 120}

model_reg = catgbmr(X, y, X, y, params)

0.9864564486947531
0.9818772912775817


In [69]:
params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100,
          'auto_class_weights' : 'SqrtBalanced'}

model_cls = catgbmc(X, y, X, y, params)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100}

model_reg = catgbmr(X, y, X, y, params)

0.990938645638791
0.9772644440202016


### with Aug

In [71]:
pd.DataFrame(train['class'].values=='C').astype(int).value_counts()

0    183
1     79
dtype: int64

In [72]:
strategy1 = {0:1000, 1:1000}
strategy2 = {0:1000, 1:1200}
strategy3 = {0:1830, 1:7900}

smote1 = SMOTEN(sampling_strategy=strategy1)
smote2 = SMOTEN(sampling_strategy=strategy2)
smote3 = SMOTEN(sampling_strategy=strategy3)

X1, y1 = smote1.fit_resample(X, y)
X2, y2 = smote2.fit_resample(X, y)
X3, y3 = smote3.fit_resample(X, y)

In [73]:
params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100,
          'auto_class_weights' : 'Balanced'}

model_cls = catgbmc(X1, y1, X, y, params)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100}

model_reg = catgbmr(X1, y1, X, y, params)

1.0
0.9910027472527473


In [74]:
params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100,
          'auto_class_weights' : 'Balanced'}

model_cls = catgbmc(X2, y2, X, y, params)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100}

model_reg = catgbmr(X2, y2, X, y, params)

1.0
0.9954854828982511


In [75]:
params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100,
          'auto_class_weights' : 'Balanced'}

model_cls = catgbmc(X3, y3, X, y, params)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100}

model_reg = catgbmr(X3, y3, X, y, params)

1.0
0.9910027472527473


### Summary
- C & notC case의 경우, A보단 어렵지만 B보단 쉬운 정도의 문제로 보여짐
- 이 경우, A와 C는 성능을 떨어뜨리지 않는 수준에서 규제를 추가하고
- 가장 쉬운 문제인 A를 제외하고, B&C case를 추가로 학습

## 04. B & C
### without Aug

In [78]:
X2, y2 = train[train['class']!='A'].drop(columns=['id', 'class']).reset_index(drop=True), (train[train['class']!='A']['class'].values == 'C').astype(int)
X_test = test.drop(columns=['id'])

display(X2, y)

Unnamed: 0,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,SNP_10,SNP_11,SNP_12,SNP_13,SNP_14,SNP_15
0,2,G G,A G,A A,G A,C A,A A,A A,G G,A A,G G,A G,A A,A A,A A,A A
1,2,A G,A G,C A,A A,A A,A G,A A,G A,A A,A G,A A,G A,G G,A A,A A
2,2,G G,G G,A A,G A,C C,G G,A A,G A,G A,A G,A A,A A,A A,A A,A A
3,2,G G,G G,C C,A A,C C,A A,A A,A A,A A,G G,A A,A A,A G,A A,G A
4,2,G G,G G,C A,A A,C C,A A,A A,G A,A A,G G,A A,A A,A A,A A,A A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188,2,A G,G G,C A,G A,C C,A G,G A,G G,A A,G G,G G,A A,A A,A A,G A
189,2,G G,G G,C A,G A,C C,A G,A A,G G,A A,G G,A G,G A,A A,A A,A A
190,2,A G,A G,A A,G A,C C,A G,A A,G A,A A,G G,A G,G A,A A,A A,A A
191,2,G G,A A,C A,A A,A A,A G,G A,G A,A A,A G,A G,A A,A G,A A,G A


array([0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0])

In [79]:
X2.describe()

Unnamed: 0,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,SNP_10,SNP_11,SNP_12,SNP_13,SNP_14,SNP_15
count,193,193,193,193,193,193,193,193,193,193,193,193,193,193,193,193
unique,1,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3
top,2,G G,A G,C A,A A,C C,A G,A A,G A,A A,G G,A A,A A,A G,A A,A A
freq,193,138,91,92,118,82,102,163,85,169,148,83,135,92,174,103


- trait 변수의 경우 변별성이 없으므로 제외

In [80]:
X2.drop(columns=['trait'], inplace=True)
X_test.drop(columns=['trait'], inplace=True)

In [81]:
params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 120,
          'auto_class_weights' : 'Balanced'}

model_cls = catgbmc(X2, y2, X2, y2, params)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 120}

model_reg = catgbmr(X2, y2, X2, y2, params)

0.9785698423273372
0.9675884460367219


In [82]:
params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100,
          'auto_class_weights' : 'SqrtBalanced'}

model_cls = catgbmc(X2, y2, X2, y2, params)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100}

model_reg = catgbmr(X2, y2, X2, y2, params)

0.9729305169850486
0.978483835005574


### with aug

In [83]:
pd.DataFrame(y2).value_counts()

0    114
1     79
dtype: int64

In [84]:
strategy1 = {0:1000, 1:1000}
strategy2 = {0:1000, 1:1200}
strategy3 = {0:1140, 1:790}

smote1 = SMOTEN(sampling_strategy=strategy1)
smote2 = SMOTEN(sampling_strategy=strategy2)
smote3 = SMOTEN(sampling_strategy=strategy3)

X01, y01 = smote1.fit_resample(X2, y2)
X02, y02 = smote2.fit_resample(X2, y2)
X03, y03 = smote3.fit_resample(X2, y2)

In [85]:
params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100,
          'auto_class_weights' : 'Balanced'}

model_cls = catgbmc(X01, y01, X2, y2, params)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100}

model_reg = catgbmr(X01, y01, X2, y2, params)

1.0
0.9893252212389381


In [86]:
params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100,
          'auto_class_weights' : 'Balanced'}

model_cls = catgbmc(X02, y02, X2, y2, params)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100}

model_reg = catgbmr(X02, y02, X2, y2, params)

1.0
0.9839581082204305


In [87]:
params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100,
          'auto_class_weights' : 'Balanced'}

model_cls = catgbmc(X03, y03, X2, y2, params)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100}

model_reg = catgbmr(X03, y03, X2, y2, params)

0.9946527027401435
0.9839581082204305


### Summary
- C에 대한 비중을 늘려주거나, 똑같게 할 경우 가장 높은 성능을 보임

# STEP 03. ENSEMBLE
- 각 타겟 데이터들 별로 가장 성능이 좋았던 조합 구현
- 만약 cls의 성능이 동일하면 reg를 기준으로 가장 성능이 좋은 조합 구현
- 본래의 성능을 하락시키지 않는 선에서 규제 등을 추가

## 1) MODEL for ALL

In [88]:
X, y = train.drop(columns=['id', 'class']), train['class'].map(lambda x : 0 if x=='A' else (1 if x=='B' else 2))
X_test = test.drop(columns=['id'])

strategy1 = {0:1000, 1:1000, 2:1000}
X1, y1 = smote1.fit_resample(X, y)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100,
          'auto_class_weights' : 'Balanced'}

model_all = catgbmc(X1, y1, X, y, params)

1.0


## 2) MODEL for A&notA

In [91]:
X, y = train.drop(columns=['id', 'class']), (train['class'].values == 'A').astype(int)
X_test = test.drop(columns=['id'])

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 1000,
          'auto_class_weights' : 'SqrtBalanced'}

model_a = catgbmc(X, y, X, y, params)

1.0


## 3) MODEL for B&notB

In [94]:
X, y = train.drop(columns=['id', 'class']), (train['class'].values == 'B').astype(int)
X_test = test.drop(columns=['id'])

strategy3 = {0:1480, 1:1140}
smote3 = SMOTEN(sampling_strategy=strategy3)
X3, y3 = smote3.fit_resample(X, y)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 200,
          'auto_class_weights' : 'Balanced'}

model_b = catgbmc(X3, y3, X, y, params)

1.0


## 4) MODEL for C&notC

In [97]:
X, y = train.drop(columns=['id', 'class']), (train['class'].values == 'C').astype(int)
X_test = test.drop(columns=['id'])

strategy2 = {0:1000, 1:1200}
smote2 = SMOTEN(sampling_strategy=strategy2)
X2, y2 = smote2.fit_resample(X, y)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100,
          'auto_class_weights' : 'Balanced'}

model_c = catgbmc(X2, y2, X, y, params)

1.0


## 5) MODEL for B&C

In [99]:
X2, y2 = train[train['class']!='A'].drop(columns=['id', 'class']).reset_index(drop=True), (train[train['class']!='A']['class'].values == 'C').astype(int)
X_test2 = test.drop(columns=['id'])
X2.drop(columns=['trait'], inplace=True)
X_test2.drop(columns=['trait'], inplace=True)

strategy1 = {0:1000, 1:1000}
smote1 = SMOTEN(sampling_strategy=strategy1)
X01, y01 = smote1.fit_resample(X2, y2)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 300,
          'auto_class_weights' : 'Balanced'}

model_bc = catgbmc(X01, y01, X2, y2, params)

1.0


## 6) Make Preds

In [104]:
pred_all = model_all.predict(X_test).flatten()
pred_a = model_a.predict_proba(X_test)[:,1]
pred_b = model_b.predict_proba(X_test)[:,1]
pred_c = model_c.predict_proba(X_test)[:,1]
pred_bc = model_bc.predict_proba(X_test2)[:,1]

total = pd.DataFrame()
total['all'] = pred_all
total['a'] = pred_a
total['b'] = pred_b
total['c'] = pred_c
total['bc'] = pred_bc

total['argmax'] = np.argmax(total[['a', 'b', 'c']].values, axis=1)

total

Unnamed: 0,all,a,b,c,bc,argmax
0,0,0.998133,0.000226,0.000992,0.831450,0
1,1,0.001031,0.999806,0.005248,0.000360,1
2,2,0.001180,0.000109,0.999968,0.999955,2
3,2,0.001012,0.993338,0.618692,0.073046,1
4,0,0.998263,0.000145,0.000131,0.791162,0
...,...,...,...,...,...,...
170,1,0.001013,0.998804,0.003576,0.008260,1
171,2,0.001012,0.000043,0.999965,0.999957,2
172,2,0.001031,0.000353,0.999579,0.999413,2
173,1,0.001031,0.999923,0.000111,0.000462,1


In [108]:
total['all'].value_counts()

1    85
0    51
2    39
Name: all, dtype: int64

In [109]:
total['argmax'].value_counts()

1    87
0    51
2    37
Name: argmax, dtype: int64

In [107]:
total[total['all'] != total['argmax']]

Unnamed: 0,all,a,b,c,bc,argmax
3,2,0.001012,0.993338,0.618692,0.073046,1
12,2,0.001013,0.67392,0.039512,0.759016,1


In [114]:
target_index = total[total['all'] != total['argmax']].index.tolist()
total['answer'] = total['all']

for target in target_index :
    score = total['bc'][target]
    if score <= 0.5 :
        total['answer'][target] = 1
    else :
        total['answer'][target] = 2
        
total

Unnamed: 0,all,a,b,c,bc,argmax,answer
0,0,0.998133,0.000226,0.000992,0.831450,0,0
1,1,0.001031,0.999806,0.005248,0.000360,1,1
2,2,0.001180,0.000109,0.999968,0.999955,2,2
3,2,0.001012,0.993338,0.618692,0.073046,1,1
4,0,0.998263,0.000145,0.000131,0.791162,0,0
...,...,...,...,...,...,...,...
170,1,0.001013,0.998804,0.003576,0.008260,1,1
171,2,0.001012,0.000043,0.999965,0.999957,2,2
172,2,0.001031,0.000353,0.999579,0.999413,2,2
173,1,0.001031,0.999923,0.000111,0.000462,1,1


In [116]:
submit = pd.read_csv("./data/sample_submission.csv")
submit['class'] = total['answer'].map(lambda x : 'A' if x==0 else ('B' if x==1 else 'C'))
submit['class'].value_counts()

B    86
A    51
C    38
Name: class, dtype: int64

In [119]:
submit.to_csv("./ensemble.csv", index=False)

# STEP 04. STACKING

In [126]:
X, X_test = train.drop(columns=['id', 'class']), test.drop(columns=['id'])
train[['all_a', 'all_b', 'all_c']] = model_all.predict_proba(X)
test[['all_a', 'all_b', 'all_c']] = model_all.predict_proba(X_test)

train['prob_a'] = model_a.predict_proba(X)[:,1]
test['prob_a'] = model_a.predict_proba(X_test)[:,1]
train['prob_b'] = model_b.predict_proba(X)[:,1]
test['prob_b'] = model_b.predict_proba(X_test)[:,1]
train['prob_c'] = model_c.predict_proba(X)[:,1]
test['prob_c'] = model_c.predict_proba(X_test)[:,1]

train['prob_bc'] = model_bc.predict_proba(X.drop(columns=['trait']))[:,1]
test['prob_bc'] = model_bc.predict_proba(X_test.drop(columns=['trait']))[:,1]

train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 262 entries, 0 to 261
Data columns (total 25 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   id       262 non-null    category
 1   trait    262 non-null    category
 2   SNP_01   262 non-null    category
 3   SNP_02   262 non-null    category
 4   SNP_03   262 non-null    category
 5   SNP_04   262 non-null    category
 6   SNP_05   262 non-null    category
 7   SNP_06   262 non-null    category
 8   SNP_07   262 non-null    category
 9   SNP_08   262 non-null    category
 10  SNP_09   262 non-null    category
 11  SNP_10   262 non-null    category
 12  SNP_11   262 non-null    category
 13  SNP_12   262 non-null    category
 14  SNP_13   262 non-null    category
 15  SNP_14   262 non-null    category
 16  SNP_15   262 non-null    category
 17  class    262 non-null    object  
 18  all_a    262 non-null    float64 
 19  all_b    262 non-null    float64 
 20  all_c    262 non-null    float64

In [127]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175 entries, 0 to 174
Data columns (total 24 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   id       175 non-null    category
 1   trait    175 non-null    category
 2   SNP_01   175 non-null    category
 3   SNP_02   175 non-null    category
 4   SNP_03   175 non-null    category
 5   SNP_04   175 non-null    category
 6   SNP_05   175 non-null    category
 7   SNP_06   175 non-null    category
 8   SNP_07   175 non-null    category
 9   SNP_08   175 non-null    category
 10  SNP_09   175 non-null    category
 11  SNP_10   175 non-null    category
 12  SNP_11   175 non-null    category
 13  SNP_12   175 non-null    category
 14  SNP_13   175 non-null    category
 15  SNP_14   175 non-null    category
 16  SNP_15   175 non-null    category
 17  all_a    175 non-null    float64 
 18  all_b    175 non-null    float64 
 19  all_c    175 non-null    float64 
 20  prob_a   175 non-null    float64

In [128]:
X_new, y_new = train.drop(columns=['id', 'class']), train['class'].map(lambda x : 0 if x=='A' else (1 if x=='B' else 2))
X_test_new = test.drop(columns=['id'])

In [129]:
def catgbmc(inputX, inputY, validX, validY, params) :  
    var_categ = inputX.columns.tolist()[:16]
    model = CatBoostClassifier(
        cat_features=var_categ,
        **params,
        task_type='GPU',
        devices='0',
        # l2_leaf_reg=10
        # random_state=random_seed
        )
  
    model.fit(
        inputX, inputY,
        eval_set=(inputX, inputY),
        verbose=0
        );     

    pred = model.predict(validX)
    score = f1_score(validY, pred, average='macro')
    print(score)
    
    return model

In [130]:
def catgbmr(inputX, inputY, validX, validY, params) :  
    var_categ = inputX.columns.tolist()[:16]
    model = CatBoostRegressor(
        cat_features=var_categ,
        **params,
        task_type='GPU',
        devices='0',
        # l2_leaf_reg=10
        # random_state=random_seed
        )
  
    model.fit(
        inputX, inputY,
        eval_set=(inputX, inputY),
        verbose=0
        );     

    pred = model.predict(validX)
    score = f1_score(validY, np.round(pred), average='macro')
    print(score)
    
    return model

In [132]:
params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100,
          'auto_class_weights' : 'Balanced'}

model_cls = catgbmc(X_new, y_new, X_new, y_new, params)

params = {'iterations':1000,
          'learning_rate':0.5,
          'l2_leaf_reg' : 100}

model_reg = catgbmr(X_new, y_new, X_new, y_new, params)

1.0
1.0


In [141]:
total['new_cls'] = model_cls.predict(X_test_new)
total['new_reg'] = model_reg.predict(X_test_new)
total['new_reg2'] = np.round(model_reg.predict(X_test_new)).astype(int)
total

Unnamed: 0,all,a,b,c,bc,argmax,answer,new_cls,new_reg,new_reg2
0,0,0.998133,0.000226,0.000992,0.831450,0,0,0,-0.008822,0
1,1,0.001031,0.999806,0.005248,0.000360,1,1,1,0.995491,1
2,2,0.001180,0.000109,0.999968,0.999955,2,2,2,2.001305,2
3,2,0.001012,0.993338,0.618692,0.073046,1,1,2,1.534375,2
4,0,0.998263,0.000145,0.000131,0.791162,0,0,0,-0.042572,0
...,...,...,...,...,...,...,...,...,...,...
170,1,0.001013,0.998804,0.003576,0.008260,1,1,1,1.004010,1
171,2,0.001012,0.000043,0.999965,0.999957,2,2,2,2.057799,2
172,2,0.001031,0.000353,0.999579,0.999413,2,2,2,2.040068,2
173,1,0.001031,0.999923,0.000111,0.000462,1,1,1,1.003939,1


In [142]:
total['new_reg2'].value_counts()

1    86
0    51
2    38
Name: new_reg2, dtype: int64

In [143]:
total[total.answer != total.new_cls]

Unnamed: 0,all,a,b,c,bc,argmax,answer,new_cls,new_reg,new_reg2
3,2,0.001012,0.993338,0.618692,0.073046,1,1,2,1.534375,2


In [144]:
total[total.answer != total.new_reg2]

Unnamed: 0,all,a,b,c,bc,argmax,answer,new_cls,new_reg,new_reg2
3,2,0.001012,0.993338,0.618692,0.073046,1,1,2,1.534375,2
12,2,0.001013,0.67392,0.039512,0.759016,1,2,2,1.36939,1


In [135]:
total.new_cls.value_counts()

1    85
0    51
2    39
Name: new_cls, dtype: int64

In [146]:
submit = pd.read_csv("./data/sample_submission.csv")
submit['class'] = total['new_cls'].map(lambda x : 'A' if x==0 else ('B' if x==1 else 'C'))
submit['class'].value_counts()

B    85
A    51
C    39
Name: class, dtype: int64

In [137]:
submit.loc[[3,5,12,119,126,162,168], 'class']

3      C
5      B
12     C
119    C
126    B
162    B
168    B
Name: class, dtype: object

In [147]:
submit.to_csv("./stacking.csv", index=False)

In [149]:
submit = pd.read_csv("./data/sample_submission.csv")
submit['class'] = total['new_reg2'].map(lambda x : 'A' if x==0 else ('B' if x==1 else 'C'))
submit['class'].value_counts()
submit.to_csv("./stacking2.csv", index=False)

In [150]:
submit.loc[[3,5,12,119,126,162,168], 'class']

3      C
5      B
12     B
119    C
126    B
162    B
168    B
Name: class, dtype: object

In [29]:
pd.DataFrame(data=model1.get_feature_importance(), index=model1.feature_names_, columns=['feature_importance']).sort_values('feature_importance', ascending=False)

Unnamed: 0,feature_importance
SNP_05,24.860814
SNP_08,17.622925
trait,16.090092
SNP_04,11.521258
SNP_02,8.052254
SNP_15,5.756705
SNP_10,4.96612
SNP_11,3.54647
SNP_14,2.405219
SNP_13,1.564936


In [30]:
pd.DataFrame(data=model2.get_feature_importance(), index=model2.feature_names_, columns=['feature_importance']).sort_values('feature_importance', ascending=False)

Unnamed: 0,feature_importance
SNP_04,28.62412
SNP_05,20.690495
SNP_08,14.510042
SNP_02,7.96804
SNP_15,6.747978
SNP_10,6.083997
SNP_06,3.289543
SNP_11,2.779722
SNP_14,2.651923
trait,2.390422


In [36]:
model2.predict(X_test).flatten().shape

(175,)

In [37]:
pred = model2.predict(X_test).flatten()
high1 = pd.read_csv("./data/submit_0.99078.csv")
high2 = pd.read_csv("./data/submit_0.99078_2.csv")
high3 = pd.read_csv("./data/submit_0.99078_3.csv")
high4 = pd.read_csv("./data/submit_0.99078_4.csv")

total = pd.DataFrame()
total['pred'] = pd.Series(pred)
total['high1'] = high1['class']
total['high2'] = high2['class']
total['high3'] = high3['class']
total['high4'] = high4['class']

total

Unnamed: 0,pred,high1,high2,high3,high4
0,A,A,A,A,A
1,B,B,B,B,B
2,C,C,C,C,C
3,B,C,C,C,C
4,A,A,A,A,A
...,...,...,...,...,...
170,B,B,B,B,B
171,C,C,C,C,C
172,C,C,C,C,C
173,B,B,B,B,B


In [38]:
total[total.pred != total.high1]

Unnamed: 0,pred,high1,high2,high3,high4
3,B,C,C,C,C
5,B,C,C,C,C
19,B,C,C,C,C
97,C,B,B,B,B
119,B,C,C,C,C
162,B,C,C,C,C


In [39]:
total[total.pred != total.high2]

Unnamed: 0,pred,high1,high2,high3,high4
3,B,C,C,C,C
5,B,C,C,C,C
12,B,B,C,C,C
19,B,C,C,C,C
97,C,B,B,B,B
119,B,C,C,C,C
126,B,B,C,B,B
162,B,C,C,C,C


In [40]:
total[total.pred != total.high3]

Unnamed: 0,pred,high1,high2,high3,high4
3,B,C,C,C,C
5,B,C,C,C,C
12,B,B,C,C,C
19,B,C,C,C,C
97,C,B,B,B,B
119,B,C,C,C,C
162,B,C,C,C,C


In [None]:
high1 = pd.read_csv("./data/submit_0.99078.csv")
high2 = pd.read_csv("./data/submit_0.99078_2.csv")
high3 = pd.read_csv("./data/submit_0.99078_3.csv")
high4 = pd.read_csv("./data/submit_0.99078_4.csv")

pred_A = model1.predict_proba(X_test)[:,0]
pred_B = model1.predict_proba(X_test)[:,1]
pred_C = model1.predict_proba(X_test)[:,2]

pred_AA = model2.predict_proba(X_test)[:,0]
pred_BB = model2.predict_proba(X_test)[:,1]
pred_CC = model2.predict_proba(X_test)[:,2]

total = pd.DataFrame()
total['high1'] = high1['class']
total['high2'] = high2['class']
total['high3'] = high3['class']
total['high4'] = high4['class']

total['a_prob'] = pred_A
total['b_prob'] = pred_B
total['c_prob'] = pred_C
total['aa_prob'] = pred_AA
total['bb_prob'] = pred_BB
total['cc_prob'] = pred_CC

total['answer1'] = np.argmax(total[['a_prob', 'b_prob', 'c_prob']].values, axis=1)
total['answer2'] = np.argmax(total[['aa_prob', 'bb_prob', 'cc_prob']].values, axis=1)
total['target1'] = total.answer1.map(lambda x : 'A' if x==0 else ('B' if x==1 else 'C'))
total['target2'] = total.answer2.map(lambda x : 'A' if x==0 else ('B' if x==1 else 'C'))

total

In [None]:
total[total.target1 != total.target2]