---
# Setting
---

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

import matplotlib.pyplot as plt
import seaborn as sns

<br></br>

---
# Data Load
---

In [2]:
# Read the train and test tables from the local data directory.
train_df, test_df = map(pd.read_csv, ('./data/train.csv', './data/test.csv'))

In [3]:
# (rows, columns) of the train and test frames, in that order.
tuple(frame.shape for frame in (train_df, test_df))

((262, 21), (175, 20))

In [4]:
# Preview the first five training rows.
train_df.head(n=5)

Unnamed: 0,id,father,mother,gender,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,...,SNP_07,SNP_08,SNP_09,SNP_10,SNP_11,SNP_12,SNP_13,SNP_14,SNP_15,class
0,TRAIN_000,0,0,0,2,G G,A G,A A,G A,C A,...,A A,G G,A A,G G,A G,A A,A A,A A,A A,B
1,TRAIN_001,0,0,0,2,A G,A G,C A,A A,A A,...,A A,G A,A A,A G,A A,G A,G G,A A,A A,C
2,TRAIN_002,0,0,0,2,G G,G G,A A,G A,C C,...,A A,G A,G A,A G,A A,A A,A A,A A,A A,B
3,TRAIN_003,0,0,0,1,A A,G G,A A,G A,A A,...,G G,A A,G G,A G,G G,G G,G G,A A,G G,A
4,TRAIN_004,0,0,0,2,G G,G G,C C,A A,C C,...,A A,A A,A A,G G,A A,A A,A G,A A,G A,C


<br></br>

---
# Data Preprocess
---

## Delete Constant Features

In [5]:
# father, mother and gender each hold a single distinct value in both splits.
print('> nunique info')
for heading, frame in (('\n(1) train', train_df), ('\n(2) test', test_df)):
    print(heading)
    # Number of distinct values per column.
    display(frame.nunique())

> nunique info

(1) train


id        262
father      1
mother      1
gender      1
trait       2
SNP_01      3
SNP_02      3
SNP_03      3
SNP_04      3
SNP_05      3
SNP_06      3
SNP_07      3
SNP_08      3
SNP_09      3
SNP_10      3
SNP_11      3
SNP_12      3
SNP_13      3
SNP_14      3
SNP_15      3
class       3
dtype: int64


(2) test


id        175
father      1
mother      1
gender      1
trait       2
SNP_01      3
SNP_02      3
SNP_03      3
SNP_04      3
SNP_05      3
SNP_06      3
SNP_07      3
SNP_08      3
SNP_09      3
SNP_10      3
SNP_11      3
SNP_12      3
SNP_13      3
SNP_14      3
SNP_15      3
dtype: int64

In [6]:
# Columns that carry a single value in both splits and therefore no signal.
unique_features = ['father','mother','gender']

# Rebind instead of dropping in place, and ignore already-missing columns, so the
# cell can be re-run without raising KeyError once the columns are gone.
train_df = train_df.drop(columns=unique_features, errors='ignore')
test_df  = test_df.drop(columns=unique_features, errors='ignore')

In [7]:
# Preview the training rows after the constant columns were removed.
train_df.head(n=5)

Unnamed: 0,id,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,SNP_10,SNP_11,SNP_12,SNP_13,SNP_14,SNP_15,class
0,TRAIN_000,2,G G,A G,A A,G A,C A,A A,A A,G G,A A,G G,A G,A A,A A,A A,A A,B
1,TRAIN_001,2,A G,A G,C A,A A,A A,A G,A A,G A,A A,A G,A A,G A,G G,A A,A A,C
2,TRAIN_002,2,G G,G G,A A,G A,C C,G G,A A,G A,G A,A G,A A,A A,A A,A A,A A,B
3,TRAIN_003,1,A A,G G,A A,G A,A A,G G,G G,A A,G G,A G,G G,G G,G G,A A,G G,A
4,TRAIN_004,2,G G,G G,C C,A A,C C,A A,A A,A A,A A,G G,A A,A A,A G,A A,G A,C


<br>

## train/test 한쪽에만 값이 있는지 확인

In [8]:
# Every test column except the row identifier.
check_features = [col for col in test_df.columns if col != 'id']

# For each feature, count the values that occur in only one of the two splits.
check_df = []
for feature in check_features:
    train_values = set(train_df[feature].unique())
    test_values = set(test_df[feature].unique())
    check_df.append([feature, len(train_values - test_values), len(test_values - train_values)])

pd.DataFrame(check_df, columns=['feature','only_train','only_test'])

Unnamed: 0,feature,only_train,only_test
0,trait,0,0
1,SNP_01,0,0
2,SNP_02,0,0
3,SNP_03,0,0
4,SNP_04,0,0
5,SNP_05,0,0
6,SNP_06,0,0
7,SNP_07,0,0
8,SNP_08,0,0
9,SNP_09,0,0


<br>

<br>

## New Features

In [9]:
# Work on copies so the frames loaded above stay untouched.
train_df2, test_df2 = train_df.copy(), test_df.copy()

In [10]:
# Genotype columns of the un-split frame (values look like 'A G').
snp_features = [col for col in train_df.columns if col.startswith('SNP_')]


def split_snp_alleles(frame, snp_columns):
    """Return a copy of ``frame`` with each SNP column split into two allele columns.

    Each column ``c`` in ``snp_columns`` holds two space-separated alleles. It is
    replaced by ``c_0`` (first allele) and ``c_1`` (second allele), appended after
    the remaining columns in the order of ``snp_columns``.
    """
    out = frame.copy()
    for col in snp_columns:
        alleles = out[col].str.split(' ', expand=True)
        out[f'{col}_0'] = alleles[0]
        out[f'{col}_1'] = alleles[1]
    return out.drop(columns=snp_columns)


# Rebuild from train_df / test_df instead of mutating train_df2 / test_df2 in place.
# The derived columns (e.g. 'SNP_01_0') also contain 'SNP_', so selecting columns from
# the already-split frames made a second run of this cell fail with IndexError.
train_df2 = split_snp_alleles(train_df, snp_features)
test_df2  = split_snp_alleles(test_df , snp_features)

In [11]:
# Shape and first rows of the training frame after the allele split.
print(f'{train_df2.shape}')
train_df2.head(n=5)

(262, 33)


Unnamed: 0,id,trait,class,SNP_01_0,SNP_01_1,SNP_02_0,SNP_02_1,SNP_03_0,SNP_03_1,SNP_04_0,...,SNP_11_0,SNP_11_1,SNP_12_0,SNP_12_1,SNP_13_0,SNP_13_1,SNP_14_0,SNP_14_1,SNP_15_0,SNP_15_1
0,TRAIN_000,2,B,G,G,A,G,A,A,G,...,A,G,A,A,A,A,A,A,A,A
1,TRAIN_001,2,C,A,G,A,G,C,A,A,...,A,A,G,A,G,G,A,A,A,A
2,TRAIN_002,2,B,G,G,G,G,A,A,G,...,A,A,A,A,A,A,A,A,A,A
3,TRAIN_003,1,A,A,A,G,G,A,A,G,...,G,G,G,G,G,G,A,A,G,G
4,TRAIN_004,2,C,G,G,G,G,C,C,A,...,A,A,A,A,A,G,A,A,G,A


<br></br>

---
# Modeling
---

In [12]:
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split, KFold

In [None]:
# %%time

# X = train_df2.drop(columns=['id','class']).astype(str)
# y = train_df2['class']

# X_train, X_valid, y_train, y_valid = train_test_split(X,y,test_size=0.2,random_state=42)

# train_dataset = Pool(data=X_train,label=y_train,cat_features=X.columns.tolist())
# valid_dataset = Pool(data=X_valid,label=y_valid,cat_features=X.columns.tolist())

# model = CatBoostClassifier(iterations=2000,eval_metric='TotalF1')
# model.fit(train_dataset,eval_set=valid_dataset,use_best_model=True,verbose=False)#,metric_period=10,early_stopping_rounds=100)

In [157]:
# model = CatBoostClassifier(iterations=1)
# model.fit([1,2],[1,3])
# model.get_all_params()

In [161]:
import time
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import f1_score

def kf_catboost_model(
    n_splits,iterations,early_stopping_rounds,
    metric_period,depth,grow_policy,learning_rate,l2_leaf_reg,random_seed,verbose
):
    """Train one CatBoost model per K-fold split and average their test probabilities.

    Reads the notebook-level frames ``train_df2`` and ``test_df2``, one-hot encodes
    every feature, fits a ``CatBoostClassifier`` on each fold (the held-out fold is
    the eval set), prints per-fold macro-F1 and timing, writes a submission CSV under
    ``./submit/`` and returns the fold-averaged class probabilities for the test set.

    Parameters
    ----------
    n_splits : int
        Number of shuffled K-fold splits.
    iterations, early_stopping_rounds, metric_period, depth, grow_policy,
    learning_rate, l2_leaf_reg :
        Forwarded unchanged to ``CatBoostClassifier``.
    random_seed : int
        Seed for the fold shuffle; fold ``i`` (1-based) trains with ``random_seed + i``.
    verbose :
        Forwarded to ``CatBoostClassifier.fit``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(test_df2), n_classes)``. Columns follow the sorted
        unique labels of ``train_df2['class']``.

    Raises
    ------
    ValueError
        If a fold's model did not learn every class label, so its probability
        columns cannot be aligned with the other folds.
    """
    import os

    # get_dummies defaults to bool columns on pandas >= 2.0; pin an integer dtype so
    # the encoded columns look the same to CatBoost on every pandas version.
    X = train_df2.drop(columns=['id','class']).astype(str)
    X = pd.get_dummies(X,columns=X.columns.tolist(),dtype='uint8')
    y = train_df2['class']

    X_test = test_df2.drop(columns=['id']).astype(str)
    X_test = pd.get_dummies(X_test,columns=X_test.columns.tolist(),dtype='uint8')
    # The two frames are encoded independently: force the test matrix onto the
    # training columns (same set, same order) so a value seen in only one split
    # cannot shift or drop features.
    X_test = X_test.reindex(columns=X.columns,fill_value=0)

    # Label order used for the probability columns and for decoding the argmax.
    class_labels = sorted(y.unique())

    kf = KFold(n_splits=n_splits,shuffle=True,random_state=random_seed)
    models = []
    test_pred = np.zeros([X_test.shape[0],len(class_labels)])

    for i,(tr_idx,va_idx) in enumerate(kf.split(X),start=1):
        kf_start_time = time.time()

        X_train, X_valid = X.iloc[tr_idx,:], X.iloc[va_idx,:]
        y_train, y_valid = y.iloc[tr_idx  ], y.iloc[va_idx  ]

        # NOTE(review): the one-hot columns are additionally declared categorical,
        # as in the original code; kept so the trained models are unchanged.
        train_dataset = Pool(data=X_train,label=y_train,cat_features=X_train.columns.tolist())
        valid_dataset = Pool(data=X_valid,label=y_valid,cat_features=X_valid.columns.tolist())

        model = CatBoostClassifier(
            eval_metric = 'MultiClass',
            iterations = iterations,
            metric_period = metric_period,
            early_stopping_rounds = early_stopping_rounds,
            grow_policy = grow_policy,
            # Tree growing strategy:
            #   1) Depthwise (split level by level until the given depth is reached)
            #   2) Lossguide (split the node with the largest loss change first)
            depth = depth,
            learning_rate = learning_rate,
            l2_leaf_reg = l2_leaf_reg,        # L2 regularization
            random_seed = random_seed+i,      # fixed, fold-specific seed
        )
        model.fit(
            train_dataset,
            eval_set=valid_dataset,
            use_best_model=True,
            verbose=verbose,
        )
        models.append(model)

        # Map this model's probability columns onto class_labels before averaging;
        # list.index raises ValueError if the fold never saw one of the labels.
        fold_classes = list(model.classes_)
        col_order = [fold_classes.index(label) for label in class_labels]
        test_pred += model.predict_proba(X_test)[:,col_order] / n_splits

        kf_end_time = time.time()
        kf_run_time = float(kf_end_time-kf_start_time)
        # Rough estimate: assumes the remaining folds take as long as this one.
        remaining = (n_splits-i)*kf_run_time

        train_score = f1_score(y_pred=np.ravel(model.predict(X_train)),y_true=y_train,average='macro')
        valid_score = f1_score(y_pred=np.ravel(model.predict(X_valid)),y_true=y_valid,average='macro')
        text = '({}/{}) train: {:.3f}, validation: {:.3f}, elapsed: {:.3f}s, remaining: {:.3f}s'\
            .format(str(i).zfill(2),n_splits,train_score,valid_score,kf_run_time,remaining)
        print(text)

    # save
    submission = pd.read_csv('./data/sample_submission.csv')
    submission['class'] = [class_labels[np.argmax(x)] for x in test_pred]
    os.makedirs('./submit',exist_ok=True)
    save_path = './submit/CatBoost_{}_iterations{}_depth{}_lr{}_l2leafreg{}.csv'.format(
        grow_policy,iterations,depth,learning_rate,l2_leaf_reg)
    submission.to_csv(save_path,index=False)

    return test_pred

In [171]:
# param_grid = {
#     'iterations' : [10000,20000,30000],
#     'depth' : [4,6,10],
#     'grow_policy' : ['Depthwise','Lossguide'],
#     'learning_rate' : [0.1,0.01,0.001],
#     'l2_leaf_reg' : [3,20,50,80]
# }

# tot = 1
# for key,value in param_grid.items():
#     tot*=len(value)
# print(tot)

# from sklearn.model_selection import ParameterGrid
# ParameterGrid(param_grid)

In [173]:
# Ensemble member 1: Depthwise growth with l2_leaf_reg=20.
pred1 = kf_catboost_model(
    n_splits=10, iterations=25000, early_stopping_rounds=500, metric_period=None,
    depth=6, grow_policy='Depthwise', learning_rate=0.01, l2_leaf_reg=20,
    random_seed=2023, verbose=False,
)

(01/10) train: 1.000, validation: 0.911, elapsed: 23.260s, remaining: 209.338s
(02/10) train: 1.000, validation: 1.000, elapsed: 149.034s, remaining: 1192.270s
(03/10) train: 1.000, validation: 0.914, elapsed: 145.949s, remaining: 1021.642s
(04/10) train: 1.000, validation: 1.000, elapsed: 149.373s, remaining: 896.239s
(05/10) train: 1.000, validation: 0.958, elapsed: 103.969s, remaining: 519.843s
(06/10) train: 1.000, validation: 0.963, elapsed: 150.211s, remaining: 600.844s
(07/10) train: 0.996, validation: 0.826, elapsed: 15.869s, remaining: 47.607s
(08/10) train: 1.000, validation: 0.925, elapsed: 24.384s, remaining: 48.767s
(09/10) train: 1.000, validation: 0.960, elapsed: 25.852s, remaining: 25.852s
(10/10) train: 1.000, validation: 1.000, elapsed: 147.745s, remaining: 0.000s


In [174]:
# Ensemble member 2: Depthwise growth with l2_leaf_reg=50.
pred2 = kf_catboost_model(
    n_splits=10, iterations=25000, early_stopping_rounds=500, metric_period=None,
    depth=6, grow_policy='Depthwise', learning_rate=0.01, l2_leaf_reg=50,
    random_seed=2023, verbose=False,
)

(01/10) train: 1.000, validation: 0.911, elapsed: 42.930s, remaining: 386.371s
(02/10) train: 1.000, validation: 1.000, elapsed: 144.803s, remaining: 1158.422s
(03/10) train: 1.000, validation: 0.914, elapsed: 145.678s, remaining: 1019.745s
(04/10) train: 1.000, validation: 1.000, elapsed: 144.422s, remaining: 866.532s
(05/10) train: 1.000, validation: 0.958, elapsed: 147.069s, remaining: 735.347s
(06/10) train: 1.000, validation: 1.000, elapsed: 143.047s, remaining: 572.189s
(07/10) train: 1.000, validation: 0.826, elapsed: 31.741s, remaining: 95.224s
(08/10) train: 1.000, validation: 0.925, elapsed: 53.550s, remaining: 107.100s
(09/10) train: 1.000, validation: 0.960, elapsed: 57.625s, remaining: 57.625s
(10/10) train: 1.000, validation: 1.000, elapsed: 143.637s, remaining: 0.000s


In [175]:
# Ensemble member 3: Depthwise growth with l2_leaf_reg=80.
pred3 = kf_catboost_model(
    n_splits=10, iterations=25000, early_stopping_rounds=500, metric_period=None,
    depth=6, grow_policy='Depthwise', learning_rate=0.01, l2_leaf_reg=80,
    random_seed=2023, verbose=False,
)

(01/10) train: 1.000, validation: 0.911, elapsed: 62.309s, remaining: 560.785s
(02/10) train: 1.000, validation: 1.000, elapsed: 143.578s, remaining: 1148.625s
(03/10) train: 1.000, validation: 0.866, elapsed: 146.167s, remaining: 1023.168s
(04/10) train: 1.000, validation: 1.000, elapsed: 144.209s, remaining: 865.255s
(05/10) train: 1.000, validation: 0.958, elapsed: 144.561s, remaining: 722.803s
(06/10) train: 1.000, validation: 1.000, elapsed: 141.567s, remaining: 566.268s
(07/10) train: 1.000, validation: 0.855, elapsed: 46.087s, remaining: 138.262s
(08/10) train: 1.000, validation: 0.925, elapsed: 79.251s, remaining: 158.503s
(09/10) train: 1.000, validation: 0.960, elapsed: 78.393s, remaining: 78.393s
(10/10) train: 1.000, validation: 1.000, elapsed: 141.869s, remaining: 0.000s


In [176]:
# Ensemble member 4: Lossguide growth with l2_leaf_reg=20.
pred4 = kf_catboost_model(
    n_splits=10, iterations=25000, early_stopping_rounds=500, metric_period=None,
    depth=6, grow_policy='Lossguide', learning_rate=0.01, l2_leaf_reg=20,
    random_seed=2023, verbose=False,
)

(01/10) train: 1.000, validation: 0.911, elapsed: 19.618s, remaining: 176.563s
(02/10) train: 1.000, validation: 1.000, elapsed: 119.508s, remaining: 956.066s
(03/10) train: 1.000, validation: 0.955, elapsed: 108.858s, remaining: 762.007s
(04/10) train: 1.000, validation: 1.000, elapsed: 118.640s, remaining: 711.838s
(05/10) train: 1.000, validation: 0.958, elapsed: 42.739s, remaining: 213.694s
(06/10) train: 1.000, validation: 1.000, elapsed: 120.557s, remaining: 482.230s
(07/10) train: 0.996, validation: 0.826, elapsed: 12.809s, remaining: 38.427s
(08/10) train: 1.000, validation: 0.925, elapsed: 17.350s, remaining: 34.699s
(09/10) train: 1.000, validation: 0.960, elapsed: 19.442s, remaining: 19.442s
(10/10) train: 1.000, validation: 1.000, elapsed: 118.565s, remaining: 0.000s


In [177]:
# Ensemble member 5: Lossguide growth with l2_leaf_reg=50.
pred5 = kf_catboost_model(
    n_splits=10, iterations=25000, early_stopping_rounds=500, metric_period=None,
    depth=6, grow_policy='Lossguide', learning_rate=0.01, l2_leaf_reg=50,
    random_seed=2023, verbose=False,
)

(01/10) train: 1.000, validation: 0.911, elapsed: 40.783s, remaining: 367.043s
(02/10) train: 1.000, validation: 1.000, elapsed: 118.203s, remaining: 945.623s
(03/10) train: 1.000, validation: 0.906, elapsed: 118.922s, remaining: 832.452s
(04/10) train: 1.000, validation: 1.000, elapsed: 119.674s, remaining: 718.042s
(05/10) train: 1.000, validation: 0.958, elapsed: 89.286s, remaining: 446.429s
(06/10) train: 1.000, validation: 1.000, elapsed: 117.980s, remaining: 471.920s
(07/10) train: 1.000, validation: 0.855, elapsed: 35.331s, remaining: 105.993s
(08/10) train: 1.000, validation: 0.925, elapsed: 34.868s, remaining: 69.736s
(09/10) train: 1.000, validation: 0.960, elapsed: 52.090s, remaining: 52.090s
(10/10) train: 1.000, validation: 1.000, elapsed: 117.998s, remaining: 0.000s


In [178]:
# Ensemble member 6: Lossguide growth with l2_leaf_reg=80.
pred6 = kf_catboost_model(
    n_splits=10, iterations=25000, early_stopping_rounds=500, metric_period=None,
    depth=6, grow_policy='Lossguide', learning_rate=0.01, l2_leaf_reg=80,
    random_seed=2023, verbose=False,
)

(01/10) train: 1.000, validation: 0.911, elapsed: 58.794s, remaining: 529.145s
(02/10) train: 1.000, validation: 1.000, elapsed: 117.584s, remaining: 940.671s
(03/10) train: 1.000, validation: 0.906, elapsed: 118.283s, remaining: 827.981s
(04/10) train: 1.000, validation: 1.000, elapsed: 117.439s, remaining: 704.635s
(05/10) train: 1.000, validation: 0.958, elapsed: 117.586s, remaining: 587.928s
(06/10) train: 1.000, validation: 1.000, elapsed: 116.995s, remaining: 467.978s
(07/10) train: 1.000, validation: 0.855, elapsed: 51.275s, remaining: 153.825s
(08/10) train: 1.000, validation: 0.925, elapsed: 55.114s, remaining: 110.228s
(09/10) train: 1.000, validation: 0.960, elapsed: 75.886s, remaining: 75.886s
(10/10) train: 1.000, validation: 1.000, elapsed: 116.921s, remaining: 0.000s


In [191]:
# Soft-voting ensemble: unweighted mean of the six fold-averaged probability matrices.
member_preds = [pred1, pred2, pred3, pred4, pred5, pred6]
final_pred = sum(member_preds) / 6

# Optional sanity checks while developing:
#   np.sum(final_pred, axis=1)   -> row sums of the averaged matrix
#   print the winning label and its averaged probability for each row

# Decode the arg-max column of every row into its class letter and save.
class_names = ['A','B','C']
submission = pd.read_csv('./data/sample_submission.csv')
submission['class'] = [class_names[idx] for idx in np.argmax(final_pred, axis=1)]
submission.to_csv('./submit/CatBoost_Ensemble.csv', index=False)

<br></br><br></br>

In [None]:
# %%time

# n_splits=10

# X = train_df2.drop(columns=['id','class']).astype(str)
# y = train_df2['class']

# kf = KFold(n_splits=n_splits)

# models = []
# for tr_idx,va_idx in tqdm(kf.split(X),total=n_splits):
#     X_train, X_valid = X.iloc[tr_idx,:], X.iloc[va_idx,:]
#     y_train, y_valid = y.iloc[tr_idx  ], y.iloc[va_idx  ]
    
#     train_dataset = Pool(data=X_train,label=y_train,cat_features=X.columns.tolist())
#     valid_dataset = Pool(data=X_valid,label=y_valid,cat_features=X.columns.tolist())

#     model = CatBoostClassifier(iterations=5000,eval_metric='TotalF1')
#     model.fit(train_dataset,eval_set=valid_dataset,use_best_model=True,verbose=False)#,early_stopping_rounds=100)
#     models.append(model)

In [None]:
# String-typed test features without the identifier column.
X_test = test_df2.drop('id', axis=1).astype(str)

In [None]:
# NOTE(review): `models` and `X` are not bound at notebook scope by any visible,
# un-commented cell -- this relies on state left in the kernel; confirm before re-running.

def _most_common_label(votes):
    """Return the most frequent value among one sample's per-model votes."""
    return pd.Series(votes).value_counts(ascending=False).index.tolist()[0]


# Rows index the models, columns index the samples.
tr_pred_mat = np.array([fold_model.predict(X) for fold_model in models])[:, :, -1]
tr_preds = [_most_common_label(tr_pred_mat[:, col]) for col in range(tr_pred_mat.shape[1])]

te_pred_mat = np.array([fold_model.predict(X_test) for fold_model in models])[:, :, -1]
te_preds = [_most_common_label(te_pred_mat[:, col]) for col in range(te_pred_mat.shape[1])]

In [None]:
# Put the voted labels next to the labels currently stored in `submission`
# and show only the rows where the two disagree.
tmp = pd.DataFrame({'pred_new': np.array(te_preds), 'pred_org': submission['class']})
tmp.loc[tmp['pred_new'] != tmp['pred_org']]

In [None]:
from sklearn.metrics import f1_score



# Macro-F1 of `model` on the train and validation partitions.
# NOTE(review): `model`, `X_train`, `y_train`, `X_valid`, `y_valid` are not bound at
# notebook scope by any visible, un-commented cell -- confirm they exist in the kernel.
train_f1, valid_f1 = (
    f1_score(y_true=labels, y_pred=model.predict(features), average='macro')
    for features, labels in ((X_train, y_train), (X_valid, y_valid))
)

print(train_f1, valid_f1)

In [None]:
from sklearn.metrics import f1_score

# Same macro-F1 report as the previous cell (train first, then validation).
f1_by_split = [
    f1_score(y_true=labels, y_pred=model.predict(features), average='macro')
    for features, labels in ((X_train, y_train), (X_valid, y_valid))
]
train_f1, valid_f1 = f1_by_split

print(train_f1, valid_f1)

In [None]:
# NOTE(review): this builds features from `test_df` (SNP columns not split) rather than
# `test_df2`, and `model` is not bound at notebook scope in the visible cells -- confirm
# which model this cell is meant to score.
X_test = test_df.drop('id', axis=1).astype(str)
pred = model.predict(X_test)

In [None]:
# Write the single-model predictions into the sample submission template.
# NOTE(review): `pred` comes from the previous cell; its shape is whatever
# model.predict returned -- confirm it lines up row-for-row with the template.
submission = pd.read_csv('./data/sample_submission.csv')
submission['class'] = pred
submission.to_csv('./submit/catboost.csv',index=False)

<br></br>

# EDA

<br>

[Case1]
- trait=1인 경우, A 100%
- SNP_07='G G'인 경우, A 100%

In [None]:
# One heatmap per feature: row-normalised class distribution (in percent) for each value.
features = train_df.drop(columns=['id','class']).columns.tolist()

for position, feature in enumerate(features, start=1):
    ct = pd.crosstab(train_df[feature], train_df['class'], normalize='index') * 100

    fig, ax = plt.subplots(figsize=(8,4))
    sns.heatmap(ct, annot=True, fmt='.2f', ax=ax)
    ax.set_title(f'({position}/{len(features)})', size=15, color='blue')
    plt.show()

In [None]:
# Attach the voted test predictions (te_preds, built in the majority-vote cell) and
# check the two Case 1 rules on the test set.
tmp = test_df.copy()
tmp['class'] = te_preds

# display() is needed for each table: a cell only auto-renders its last expression,
# so the first crosstab was previously computed and thrown away.
display(pd.crosstab(tmp['class'],tmp['trait']))  # (1)
display(pd.crosstab(tmp['class'],tmp['SNP_07'])) # (2)

<br>

[Case2] : Case1을 제외
- SNP_01='A A'인 경우, C 100%
- SNP_04='G G'인 경우, B 100%
- SNP_09='G G'인 경우, B 100%
- SNP_10='A A'인 경우, C 100%
- SNP_14='C A','C C'인 경우, C 100%

In [None]:
# Case 2 only looks at the rows left after excluding trait == 1 and SNP_07 == 'G G'.
# Use a dedicated name: `train_df2` is the split-allele frame that kf_catboost_model
# reads as a global, so rebinding it here would break any later re-run of the models.
train_case2 = train_df[(train_df.trait!=1) & (train_df.SNP_07!='G G')]

features = train_case2.drop(columns=['id','class','trait']).columns.tolist()

for i,feature in enumerate(features):
    # Row-normalised class distribution (in percent) for each value of the feature.
    ct = 100*pd.crosstab(train_case2[feature],train_case2['class'],normalize='index')

    plt.figure(figsize=(8,4))
    sns.heatmap(ct,annot=True,fmt='.2f')
    plt.title(f'({i+1}/{len(features)})',size=15,color='blue')
    plt.show()

In [None]:
# Attach the voted test predictions (te_preds, built in the majority-vote cell).
tmp = test_df.copy()
tmp['class'] = te_preds

# Apply the same row filter as the Case 2 training view.
tmp = tmp[(tmp['trait']!=1) & (tmp['SNP_07']!='G G')]

# One table per Case 2 rule, numbered in the order the rules are listed above.
# display() is needed for each table: a cell only auto-renders its last expression,
# so four of the five crosstabs were previously computed and thrown away.
case2_snps = ['SNP_01','SNP_04','SNP_09','SNP_10','SNP_14']
for rule_no, snp in enumerate(case2_snps, start=1):
    print(f'({rule_no}) {snp}')
    display(pd.crosstab(tmp['class'],tmp[snp]))