---
# Setting
---

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

import matplotlib.pyplot as plt
import seaborn as sns

<br></br>

---
# Data Load
---

In [2]:
train_df = pd.read_csv('./data/train.csv')
test_df  = pd.read_csv('./data/test.csv')

In [3]:
train_df.shape, test_df.shape

((262, 21), (175, 20))

In [4]:
train_df.head()

Unnamed: 0,id,father,mother,gender,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,...,SNP_07,SNP_08,SNP_09,SNP_10,SNP_11,SNP_12,SNP_13,SNP_14,SNP_15,class
0,TRAIN_000,0,0,0,2,G G,A G,A A,G A,C A,...,A A,G G,A A,G G,A G,A A,A A,A A,A A,B
1,TRAIN_001,0,0,0,2,A G,A G,C A,A A,A A,...,A A,G A,A A,A G,A A,G A,G G,A A,A A,C
2,TRAIN_002,0,0,0,2,G G,G G,A A,G A,C C,...,A A,G A,G A,A G,A A,A A,A A,A A,A A,B
3,TRAIN_003,0,0,0,1,A A,G G,A A,G A,A A,...,G G,A A,G G,A G,G G,G G,G G,A A,G G,A
4,TRAIN_004,0,0,0,2,G G,G G,C C,A A,C C,...,A A,A A,A A,G G,A A,A A,A G,A A,G A,C


<br></br>

---
# Data Preprocess
---

## Delete Unique Features

In [5]:
# father,mother,gender는 모두 1개의 값
print('> nunique info')
print('\n(1) train')
display(train_df.apply(lambda x: x.nunique()))
print('\n(2) test')
display(test_df.apply(lambda x: x.nunique()))

> nunique info

(1) train


id        262
father      1
mother      1
gender      1
trait       2
SNP_01      3
SNP_02      3
SNP_03      3
SNP_04      3
SNP_05      3
SNP_06      3
SNP_07      3
SNP_08      3
SNP_09      3
SNP_10      3
SNP_11      3
SNP_12      3
SNP_13      3
SNP_14      3
SNP_15      3
class       3
dtype: int64


(2) test


id        175
father      1
mother      1
gender      1
trait       2
SNP_01      3
SNP_02      3
SNP_03      3
SNP_04      3
SNP_05      3
SNP_06      3
SNP_07      3
SNP_08      3
SNP_09      3
SNP_10      3
SNP_11      3
SNP_12      3
SNP_13      3
SNP_14      3
SNP_15      3
dtype: int64

In [6]:
unique_features = ['father','mother','gender']

train_df.drop(columns=unique_features,inplace=True)
test_df .drop(columns=unique_features,inplace=True)

In [7]:
train_df.head()

Unnamed: 0,id,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,SNP_10,SNP_11,SNP_12,SNP_13,SNP_14,SNP_15,class
0,TRAIN_000,2,G G,A G,A A,G A,C A,A A,A A,G G,A A,G G,A G,A A,A A,A A,A A,B
1,TRAIN_001,2,A G,A G,C A,A A,A A,A G,A A,G A,A A,A G,A A,G A,G G,A A,A A,C
2,TRAIN_002,2,G G,G G,A A,G A,C C,G G,A A,G A,G A,A G,A A,A A,A A,A A,A A,B
3,TRAIN_003,1,A A,G G,A A,G A,A A,G G,G G,A A,G G,A G,G G,G G,G G,A A,G G,A
4,TRAIN_004,2,G G,G G,C C,A A,C C,A A,A A,A A,A A,G G,A A,A A,A G,A A,G A,C


<br>

## train/test 한쪽에만 값이 있는지 확인

In [8]:
check_features = [col for col in test_df.columns if col not in ['id']]

check_df = []
for feature in check_features:
    n1 = len(list(set(train_df[feature].unique())-set(test_df [feature].unique())))
    n2 = len(list(set(test_df [feature].unique())-set(train_df[feature].unique())))
    check_df.append([feature,n1,n2])
    
pd.DataFrame(check_df,columns=['feature','only_train','only_test'])

Unnamed: 0,feature,only_train,only_test
0,trait,0,0
1,SNP_01,0,0
2,SNP_02,0,0
3,SNP_03,0,0
4,SNP_04,0,0
5,SNP_05,0,0
6,SNP_06,0,0
7,SNP_07,0,0
8,SNP_08,0,0
9,SNP_09,0,0


<br>

<br>

## New Features

In [9]:
train_df2 = train_df.copy()
test_df2  = test_df .copy()

In [10]:
snp_features = [col for col in train_df2.columns if col.find('SNP_')>=0]

for snp_feature in snp_features:
    train_df2[f'{snp_feature}_0'] = [x.split(' ')[0] for x in train_df2[snp_feature]]
    train_df2[f'{snp_feature}_1'] = [x.split(' ')[1] for x in train_df2[snp_feature]]
    
    test_df2 [f'{snp_feature}_0'] = [x.split(' ')[0] for x in test_df2 [snp_feature]]
    test_df2 [f'{snp_feature}_1'] = [x.split(' ')[1] for x in test_df2 [snp_feature]]
    
    train_df2.drop(snp_feature,axis=1,inplace=True)
    test_df2 .drop(snp_feature,axis=1,inplace=True)

In [11]:
print(train_df2.shape)
train_df2.head()

(262, 33)


Unnamed: 0,id,trait,class,SNP_01_0,SNP_01_1,SNP_02_0,SNP_02_1,SNP_03_0,SNP_03_1,SNP_04_0,...,SNP_11_0,SNP_11_1,SNP_12_0,SNP_12_1,SNP_13_0,SNP_13_1,SNP_14_0,SNP_14_1,SNP_15_0,SNP_15_1
0,TRAIN_000,2,B,G,G,A,G,A,A,G,...,A,G,A,A,A,A,A,A,A,A
1,TRAIN_001,2,C,A,G,A,G,C,A,A,...,A,A,G,A,G,G,A,A,A,A
2,TRAIN_002,2,B,G,G,G,G,A,A,G,...,A,A,A,A,A,A,A,A,A,A
3,TRAIN_003,1,A,A,A,G,G,A,A,G,...,G,G,G,G,G,G,A,A,G,G
4,TRAIN_004,2,C,G,G,G,G,C,C,A,...,A,A,A,A,A,G,A,A,G,A


<br></br>

---
# Modeling
---

In [12]:
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split, KFold

In [13]:
# %%time

# X = train_df2.drop(columns=['id','class']).astype(str)
# y = train_df2['class']

# X_train, X_valid, y_train, y_valid = train_test_split(X,y,test_size=0.2,random_state=42)

# train_dataset = Pool(data=X_train,label=y_train,cat_features=X.columns.tolist())
# valid_dataset = Pool(data=X_valid,label=y_valid,cat_features=X.columns.tolist())

# model = CatBoostClassifier(iterations=2000,eval_metric='TotalF1')
# model.fit(train_dataset,eval_set=valid_dataset,use_best_model=True,verbose=False)#,metric_period=10,early_stopping_rounds=100)

In [56]:
n_splits=5
iterations=25000
random_state=0
verbose=True

In [58]:
# overall start time
total_start_time = time.time()

# prepare the training dataset
X = train_df2.drop(columns=['id','class']).astype(str)
X = pd.get_dummies(X,columns=X.columns.tolist())
y = train_df2['class']

# prepare the submission dataset
X_test = test_df2.drop(columns=['id']).astype(str)
X_test = pd.get_dummies(X_test,columns=X_test.columns.tolist())

# prepare the prediction probabilities
train_pred = np.zeros([X     .shape[0],3])
test_pred  = np.zeros([X_test.shape[0],3])

# set-up the k-fold
kf = KFold(n_splits=n_splits,shuffle=True,random_state=random_state)

# training by k-fold
models = []
i=0
for tr_idx,va_idx in kf.split(X):
    i+=1

    # k-fold start time
    kf_start_time = time.time()

    # prepare the train/validation dataset
    X_train, X_valid = X.iloc[tr_idx,:], X.iloc[va_idx,:]
    y_train, y_valid = y.iloc[tr_idx  ], y.iloc[va_idx  ]

    # model define
    model = LogisticRegression(
        multi_class='multinomial',
        penalty='elasticnet',
        solver='saga',
        max_iter=iterations,
        l1_ratio=0.5,
    )
    model.fit(X_train,y_train)

    # save the model result
    models.append(model)
    train_pred += model.predict_proba(X)      / n_splits
    test_pred  += model.predict_proba(X_test) / n_splits

    # k-fold run time
    kf_end_time = time.time()
    kf_run_time = float(kf_end_time-kf_start_time)
    remaining = (n_splits-i)*kf_run_time

    # print the train/validation score
    if verbose:
        str_i = str(i).zfill(len(str(n_splits)))
        train_score = f1_score(y_pred=model.predict(X_train),y_true=y_train,average='macro')
        valid_score = f1_score(y_pred=model.predict(X_valid),y_true=y_valid,average='macro')
        text = '({}/{}) train: {:.3f}, validation: {:.3f}, elapsed: {:.3f}s, remaining: {:.3f}s'\
            .format(str_i,n_splits,train_score,valid_score,kf_run_time,remaining)
        print(text)

# overall run time
total_end_time = time.time()
total_run_time = float(total_end_time-total_start_time)

# print the overall score
if verbose:
    final_train_pred = [['A','B','C'][np.argmax(x)] for x in train_pred]
    score = f1_score(y_pred=final_train_pred,y_true=y,average='macro')
    text = '(Total) f1_score: {:.3f}, total: {:.3f}s'\
        .format(score,total_run_time)
    print(text)

# save the test prediction
submission = pd.read_csv('./data/sample_submission.csv')
submission['class'] = [['A','B','C'][np.argmax(x)] for x in test_pred]
save_path = './submit/LR_kf{}_iterations{}_seed{}.csv'\
    .format(n_splits,iterations,random_state)
submission.to_csv(save_path,index=False)

(1/5) train: 0.982, validation: 0.947, elapsed: 0.047s, remaining: 0.190s
(2/5) train: 0.986, validation: 0.882, elapsed: 0.042s, remaining: 0.126s
(3/5) train: 0.978, validation: 0.955, elapsed: 0.042s, remaining: 0.083s
(4/5) train: 0.982, validation: 0.982, elapsed: 0.043s, remaining: 0.043s
(5/5) train: 0.982, validation: 0.933, elapsed: 0.041s, remaining: 0.000s
(Total) f1_score: 0.978, total: 0.255s


In [48]:
import time
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import f1_score

def kf_voting_model(n_splits,iterations,random_state,verbose):

    # overall start time
    total_start_time = time.time()

    # prepare the training dataset
    X = train_df2.drop(columns=['id','class']).astype(str)
    X = pd.get_dummies(X,columns=X.columns.tolist())
    y = train_df2['class']

    # prepare the submission dataset
    X_test = test_df2.drop(columns=['id']).astype(str)
    X_test = pd.get_dummies(X_test,columns=X_test.columns.tolist())

    # prepare the prediction probabilities
    train_pred = np.zeros([X     .shape[0],3])
    test_pred  = np.zeros([X_test.shape[0],3])

    # set-up the k-fold
    kf = KFold(n_splits=n_splits,shuffle=True,random_state=random_state)

    # training by k-fold
    models = []
    i=0
    for tr_idx,va_idx in kf.split(X):
        i+=1

        # k-fold start time
        kf_start_time = time.time()

        # prepare the train/validation dataset
        X_train, X_valid = X.iloc[tr_idx,:], X.iloc[va_idx,:]
        y_train, y_valid = y.iloc[tr_idx  ], y.iloc[va_idx  ]

        # model define
        kf_random_state = random_state+i
        LR   = LogisticRegression(multi_class='multinomial')
        LGBM = LGBMClassifier(random_state=kf_random_state,n_estimators=iterations)
        XGB  = XGBClassifier(random_state=kf_random_state,n_estimators=iterations)
        RF   = RandomForestClassifier(verbose=False,random_state=kf_random_state,n_estimators=iterations)
        CAT  = CatBoostClassifier(verbose=False,random_state=kf_random_state,iterations=iterations)

        estimators = [
            ('lr',LR),
            ('rf',RF),
            ('xgb',XGB),
            ('lgbm',LGBM),
            ('cat',CAT)
        ]

        model = VotingClassifier(estimators=estimators,voting='soft',verbose=False,n_jobs=-1)
        model.fit(X_train,y_train)

        # save the model result
        models.append(model)
        train_pred += model.predict_proba(X)      / n_splits
        test_pred  += model.predict_proba(X_test) / n_splits

        # k-fold run time
        kf_end_time = time.time()
        kf_run_time = float(kf_end_time-kf_start_time)
        remaining = (n_splits-i)*kf_run_time

        # print the train/validation score
        if verbose:
            str_i = str(i).zfill(len(str(n_splits)))
            train_score = f1_score(y_pred=model.predict(X_train),y_true=y_train,average='macro')
            valid_score = f1_score(y_pred=model.predict(X_valid),y_true=y_valid,average='macro')
            text = '({}/{}) train: {:.3f}, validation: {:.3f}, elapsed: {:.3f}s, remaining: {:.3f}s'\
                .format(str_i,n_splits,train_score,valid_score,kf_run_time,remaining)
            print(text)
    
    # overall run time
    total_end_time = time.time()
    total_run_time = float(total_end_time-total_start_time)
    
    # print the overall score
    if verbose:
        final_train_pred = [['A','B','C'][np.argmax(x)] for x in train_pred]
        score = f1_score(y_pred=final_train_pred,y_true=y,average='macro')
        text = '(Total) f1_score: {:.3f}, total: {:.3f}s'\
            .format(score,total_run_time)
        print(text)
    
    # save the test prediction
    submission = pd.read_csv('./data/sample_submission.csv')
    submission['class'] = [['A','B','C'][np.argmax(x)] for x in test_pred]
    save_path = './submit/voting_kf{}_iterations{}_seed{}.csv'\
        .format(n_splits,iterations,random_state)
    submission.to_csv(save_path,index=False)
    
    return [['A','B','C'][np.argmax(x)] for x in test_pred]

In [49]:
test_preds = kf_voting_model(
    n_splits=5,
    iterations=25000,
    random_state=0,
    verbose=True,
)

(1/5) train: 1.000, validation: 0.982, elapsed: 83.370s, remaining: 333.482s
(2/5) train: 1.000, validation: 0.900, elapsed: 82.441s, remaining: 247.323s
(3/5) train: 1.000, validation: 0.978, elapsed: 85.711s, remaining: 171.423s
(4/5) train: 1.000, validation: 0.982, elapsed: 82.741s, remaining: 82.741s
(5/5) train: 1.000, validation: 0.933, elapsed: 82.968s, remaining: 0.000s
(Total) f1_score: 1.000, total: 434.434s


<br></br><br></br>

In [None]:
X_test = test_df2.drop(columns=['id']).astype(str)
X_test = pd.get_dummies(X_test,columns=X_test.columns.tolist())

In [None]:
tr_pred_mat = np.array([model.predict(X) for model in models])
tr_preds = []
for i in range(tr_pred_mat.shape[1]):
    tr_pred = pd.Series(tr_pred_mat[:,i]).value_counts(ascending=False).index.tolist()[0]
    tr_preds.append(tr_pred)
    
te_pred_mat = np.array([model.predict(X_test) for model in models])
te_preds = []
for i in range(te_pred_mat.shape[1]):
    te_pred = pd.Series(te_pred_mat[:,i]).value_counts(ascending=False).index.tolist()[0]
    te_preds.append(te_pred)

In [None]:
f1_score(y_true=y,y_pred=tr_preds,average='macro')

In [None]:
submission = pd.read_csv('./data/sample_submission.csv')
submission['class'] = te_preds
submission.to_csv('./submit/votingclassifier.csv',index=False)

In [None]:
# %%time

# n_splits=10

# X = train_df2.drop(columns=['id','class']).astype(str)
# y = train_df2['class']

# kf = KFold(n_splits=n_splits)

# models = []
# for tr_idx,va_idx in tqdm(kf.split(X),total=n_splits):
#     X_train, X_valid = X.iloc[tr_idx,:], X.iloc[va_idx,:]
#     y_train, y_valid = y.iloc[tr_idx  ], y.iloc[va_idx  ]
    
#     train_dataset = Pool(data=X_train,label=y_train,cat_features=X.columns.tolist())
#     valid_dataset = Pool(data=X_valid,label=y_valid,cat_features=X.columns.tolist())

#     model = CatBoostClassifier(iterations=5000,eval_metric='TotalF1')
#     model.fit(train_dataset,eval_set=valid_dataset,use_best_model=True,verbose=False)#,early_stopping_rounds=100)
#     models.append(model)

In [None]:
X_test = test_df2.drop(columns=['id']).astype(str)

In [None]:
tr_pred_mat = np.array([model.predict(X) for model in models])[:,:,-1]
tr_preds = []
for i in range(tr_pred_mat.shape[1]):
    tr_pred = pd.Series(tr_pred_mat[:,i]).value_counts(ascending=False).index.tolist()[0]
    tr_preds.append(tr_pred)
    
te_pred_mat = np.array([model.predict(X_test) for model in models])[:,:,-1]
te_preds = []
for i in range(te_pred_mat.shape[1]):
    te_pred = pd.Series(te_pred_mat[:,i]).value_counts(ascending=False).index.tolist()[0]
    te_preds.append(te_pred)

In [None]:
tmp = pd.DataFrame({
    'pred_new':np.array(te_preds),
    'pred_org':submission['class']
})
tmp[tmp.pred_new!=tmp.pred_org]

In [None]:
from sklearn.metrics import f1_score



train_f1 = f1_score(y_true=y_train,y_pred=model.predict(X_train),average='macro')
valid_f1 = f1_score(y_true=y_valid,y_pred=model.predict(X_valid),average='macro')

print(train_f1,valid_f1)

In [None]:
from sklearn.metrics import f1_score

train_f1 = f1_score(y_true=y_train,y_pred=model.predict(X_train),average='macro')
valid_f1 = f1_score(y_true=y_valid,y_pred=model.predict(X_valid),average='macro')

print(train_f1,valid_f1)

In [None]:
X_test = test_df.drop(columns=['id']).astype(str)
pred = model.predict(X_test)

In [None]:
submission = pd.read_csv('./data/sample_submission.csv')
submission['class'] = pred
submission.to_csv('./submit/catboost.csv',index=False)

<br></br>

# EDA

<br>

[Case1]
- trait=1인 경우, A 100%
- SNP_07='G G'인 경우, A 100%

In [None]:
features = train_df.drop(columns=['id','class']).columns.tolist()

for i,feature in enumerate(features):
    ct = 100*pd.crosstab(train_df[feature],train_df['class'],normalize='index')
    
    plt.figure(figsize=(8,4))
    sns.heatmap(ct,annot=True,fmt='.2f')
    plt.title(f'({i+1}/{len(features)})',size=15,color='blue')
    plt.show()

In [None]:
tmp = test_df.copy()
tmp['class'] = te_preds

pd.crosstab(tmp['class'],tmp['trait'])  # (1)
pd.crosstab(tmp['class'],tmp['SNP_07']) # (2)

<br>

[Case2] : Case1을 제외
- SNP_01='A A'인 경우, C 100%
- SNP_04='G G'인 경우, B 100%
- SNP_09='G G'인 경우, B 100%
- SNP_10='A A'인 경우, C 100%
- SNP_14='C A','C C'인 경우, C 100%

In [None]:
train_df2 = train_df[(train_df.trait!=1) & (train_df.SNP_07!='G G')]

features = train_df2.drop(columns=['id','class','trait']).columns.tolist()

for i,feature in enumerate(features):
    ct = 100*pd.crosstab(train_df2[feature],train_df2['class'],normalize='index')
    
    plt.figure(figsize=(8,4))
    sns.heatmap(ct,annot=True,fmt='.2f')
    plt.title(f'({i+1}/{len(features)})',size=15,color='blue')
    plt.show()

In [None]:
tmp = test_df.copy()
tmp['class'] = te_preds

tmp = tmp[(tmp['trait']!=1) & (tmp['SNP_07']!='G G')]

pd.crosstab(tmp['class'],tmp['SNP_01']) # (1)
pd.crosstab(tmp['class'],tmp['SNP_04']) # (2)
pd.crosstab(tmp['class'],tmp['SNP_09']) # (3)
pd.crosstab(tmp['class'],tmp['SNP_10']) # (4)
pd.crosstab(tmp['class'],tmp['SNP_14']) # (4)