## 필요 패키지 불러오기

In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.datasets import load_digits
from matplotlib import font_manager
from matplotlib import gridspec
from math import factorial
import sklearn
import pprint
import re
from sklearn.preprocessing import LabelEncoder


import warnings
# Silence every warning for the rest of the notebook (kept from the original cell).
warnings.filterwarnings('ignore')

# Korean-capable font (Malgun Gothic) for matplotlib labels. The path only exists
# on Windows, so a missing or unreadable file falls back to matplotlib's default
# font instead of aborting the very first cell of the notebook.
font_fname = 'C:/Windows/Fonts/malgun.ttf'
try:
    font_family = font_manager.FontProperties(fname=font_fname).get_name()
except (OSError, RuntimeError):
    # NOTE(review): the exact exception type depends on the matplotlib version;
    # both the file-system error and the FreeType error are handled here.
    font_family = None
    print(f'Could not load font {font_fname}; using the default matplotlib font.')

if font_family is not None:
    plt.rcParams["font.family"] = font_family
# Draw negative tick labels with the ASCII hyphen rather than the Unicode minus
# sign, which some fonts lack.
plt.rcParams["axes.unicode_minus"] = False


%matplotlib inline

## 데이터 불러오기

In [2]:
# Read the training CSV (path is relative to the notebook's working directory).
train_df = pd.read_csv('./train_transform_weight.csv')

In [83]:
# Summarise the frame: row range, column count, dtypes and memory usage.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123993 entries, 0 to 123992
Columns: 149 entries, A0T0G0C10_and_A0T0G10C0 to sample_weight
dtypes: int64(148), object(1)
memory usage: 141.0+ MB


In [84]:
# Preview the first five rows.
train_df.head()

Unnamed: 0,A0T0G0C10_and_A0T0G10C0,A0T0G1C9_and_A0T0G9C1,A0T0G2C8_and_A0T0G8C2,A0T0G3C7_and_A0T0G7C3,A0T0G4C6_and_A0T0G6C4,A0T0G5C5,A0T1G0C9_and_A1T0G9C0,A0T1G1C8_and_A1T0G8C1,A0T1G2C7_and_A1T0G7C2,A0T1G3C6_and_A1T0G6C3,...,A4T4G0C2_and_A4T4G2C0,A4T4G1C1,A4T5G0C1_and_A5T4G1C0,A4T5G1C0_and_A5T4G0C1,A4T6G0C0_and_A6T4G0C0,A5T5G0C0,gcd,decamers,target,sample_weight
0,0,0,0,0,0,0,0,0,0,0,...,20000,0,0,10000,0,0,10000,100,Escherichia_coli,18
1,0,0,0,2000,0,1000,0,0,0,5000,...,4000,2000,3000,7000,1000,0,1000,1000,Salmonella_enterica,17
2,0,0,0,0,0,0,0,0,0,0,...,28000,28000,20000,22000,5000,6000,1000,1000,Staphylococcus_aureus,17
3,0,0,0,0,0,0,0,0,0,0,...,10000,30000,0,10000,0,0,10000,100,Bacteroides_fragilis,16
4,0,0,0,0,0,0,0,0,0,0,...,25000,17000,21000,20000,15000,5000,1000,1000,Campylobacter_jejuni,16


In [85]:
# Distinct values of the target column, in order of first appearance.
target_list = train_df['target'].unique()

target의 인코딩 진행

각 타깃 사이에는 순서가 없으므로, LabelEncoder로 인코딩을 진행

In [86]:
# Learn the label -> integer mapping first, then overwrite the target column
# with the integer codes. The fitted encoder is kept so the codes can be mapped
# back to the original labels later.
label_encoder = LabelEncoder().fit(train_df['target'])
train_df['target'] = label_encoder.transform(train_df['target'])

Decision Tree

여러가지의 Decision Tree를 사용하여 판별력이 높은 트리를 선정.

RandomForest, ExtraTrees, xgboost, lightgbm을 사용.

In [3]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from tqdm import tqdm

StratifiedKFold를 통해 데이터를 10개의 폴드로 나눠 교차검증을 시행

In [4]:
def early_model(df, model, time=True, splits=10, weight=True, fit_data=False,
                decamer_levels=(100, 1000, 100000, 1000000)):
    """Cross-validate ``model`` with stratified K-fold and print accuracy reports.

    The columns ``gcd``, ``target`` and ``sample_weight`` are removed from ``df``
    to build the feature matrix; ``target`` is the label and ``sample_weight``
    is used both for (optional) weighted fitting and for weighted accuracy.
    The ``decamers`` column stays in the feature matrix and is additionally used
    to report accuracy separately for each value in ``decamer_levels``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``gcd``, ``decamers``, ``target`` and
        ``sample_weight`` next to the feature columns.
    model : estimator
        Object exposing ``fit(X, y[, sample_weight=...])`` and ``predict(X)``.
        It is re-fitted in place on every fold.
    time : bool, default True
        When truthy, wrap the fold iterator in ``tqdm`` and print the fold index.
    splits : int, default 10
        Number of stratified folds.
    weight : bool, default True
        When truthy, pass the training sample weights to ``model.fit``.
    fit_data : bool, default False
        When truthy, return the fitted model of every fold.
    decamer_levels : iterable of int, default (100, 1000, 100000, 1000000)
        Values of the ``decamers`` column for which a separate accuracy is
        reported.

    Returns
    -------
    dict or None
        ``{model class name: [fitted model per fold]}`` when ``fit_data`` is
        truthy, otherwise ``None``. Each list entry is an independent deep copy
        taken right after that fold's fit.
    """
    import copy  # local import: only used to snapshot the fitted estimators

    X = df.drop(['gcd', 'target', 'sample_weight'], axis=1)
    y = df['target']
    sample_weight = df['sample_weight']
    model_name = type(model).__name__
    decamer_levels = tuple(decamer_levels)

    folds = StratifiedKFold(n_splits=splits, shuffle=True, random_state=0)
    fold_iter = folds.split(X, y)
    if time:
        fold_iter = tqdm(fold_iter, total=splits)

    scores = []
    decamer_scores = {level: [] for level in decamer_levels}
    fit_list = []

    for fold, (train_id, valid_id) in enumerate(fold_iter):
        if time:
            print('####### Fold: ', fold)

        # Split features, labels and weights with the same positional indices.
        X_train, y_train, w_train = X.iloc[train_id], y.iloc[train_id], sample_weight.iloc[train_id]
        X_valid, y_valid, w_valid = X.iloc[valid_id], y.iloc[valid_id], sample_weight.iloc[valid_id]

        # Fit. sample_weight is passed by keyword because some estimators do
        # not accept it as the third positional argument.
        if weight:
            model.fit(X_train, y_train, sample_weight=w_train)
        else:
            model.fit(X_train, y_train)

        if fit_data:
            # `model` is re-fitted on the next fold, so storing the object
            # itself would leave every list entry pointing at the last fit.
            fit_list.append(copy.deepcopy(model))

        # Validation on the whole held-out fold.
        valid_pred = model.predict(X_valid)
        scores.append(accuracy_score(y_valid, valid_pred, sample_weight=w_valid))

        # Validation per decamer level.
        for level in decamer_levels:
            level_mask = X_valid['decamers'] == level
            if not level_mask.any():
                # No rows of this level in the fold: record NaN instead of
                # scoring an empty prediction.
                decamer_scores[level].append(np.nan)
                continue
            level_pred = model.predict(X_valid[level_mask])
            decamer_scores[level].append(
                accuracy_score(y_valid[level_mask], level_pred,
                               sample_weight=w_valid[level_mask])
            )

    # Report
    print('---------------', model_name, '---------------', '\n')

    formatted_scores = [f'{score:.5f}' for score in scores]
    avg_score = np.mean(scores)
    print(f'Accuracy scores: {formatted_scores}')
    print(f'Mean accuracy score: {avg_score:.6f}\n')

    for level in decamer_levels:
        level_scores = decamer_scores[level]
        formatted_level = [f'{score:.5f}' for score in level_scores]
        # nanmean ignores folds that had no rows of this level.
        level_mean = np.nanmean(level_scores)
        print(f'decamer {level} Accuracy score: {formatted_level}')
        print(f'Mean accuracy decamer {level} score: {level_mean:.6f}\n')

    if fit_data:
        return {model_name: fit_list}

In [31]:
# RandomForestClassifier
model= RandomForestClassifier(n_estimators=300, n_jobs=-1)

early_model(train_df, model)

  0%|          | 0/10 [00:00<?, ?it/s]

####### Fold:  0


 10%|█         | 1/10 [00:21<03:09, 21.05s/it]


####### Fold:  1


 20%|██        | 2/10 [00:42<02:51, 21.49s/it]


####### Fold:  2


 30%|███       | 3/10 [01:04<02:32, 21.78s/it]


####### Fold:  3


 40%|████      | 4/10 [01:26<02:10, 21.78s/it]


####### Fold:  4


 50%|█████     | 5/10 [01:49<01:49, 21.97s/it]


####### Fold:  5


 60%|██████    | 6/10 [02:12<01:29, 22.35s/it]


####### Fold:  6


 70%|███████   | 7/10 [02:34<01:07, 22.34s/it]


####### Fold:  7


 80%|████████  | 8/10 [02:56<00:44, 22.21s/it]


####### Fold:  8


 90%|█████████ | 9/10 [03:19<00:22, 22.41s/it]


####### Fold:  9


100%|██████████| 10/10 [03:41<00:00, 22.19s/it]


Accuracy scores: ['0.94579', '0.94264', '0.95071', '0.94577', '0.95018', '0.94381', '0.94892', '0.94577', '0.94758', '0.94861']
Mean accuracy score: 0.946978

decamer 100 Accuracy score: ['0.86615', '0.86972', '0.88311', '0.87220', '0.87112', '0.86042', '0.87608', '0.87097', '0.87554', '0.88466']
Mean accuracy decamer 100 score: 0.872998

decamer 100 Accuracy score: ['0.91778', '0.90311', '0.92098', '0.90669', '0.92988', '0.91830', '0.92019', '0.90977', '0.91435', '0.90847']
Mean accuracy decamer 1000 score: 0.914951

decamer 100 Accuracy score: ['0.99980', '1.00000', '1.00000', '1.00000', '0.99980', '1.00000', '1.00000', '1.00000', '1.00000', '1.00000']
Mean accuracy decamer 100000 score: 0.999960

decamer 100 Accuracy score: ['1.00000', '1.00000', '1.00000', '1.00000', '1.00000', '1.00000', '1.00000', '1.00000', '1.00000', '1.00000']
Mean accuracy decamer 1000000 score: 1.000000






- 정확도(평균 accuracy): ExtraTreesClassifier 0.948574 > RandomForestClassifier 0.946893 ≈ LGBMClassifier 0.946596 ≈ XGBClassifier 0.946367
- 속도(fold당 소요 시간, 초 / 빠른 순): ExtraTreesClassifier 11 > LGBMClassifier 13 > RandomForestClassifier 21 >> XGBClassifier 180

RandomForestClassifier와 ExtraTreesClassifier은 매우 비슷한 알고리즘이지만

RandomForestClassifier는 최적의 분할과 gini가 낮아지는 최적의 노드를 사용 -> 경계선 분할

ExtraTreesClassifier는 무작위 분할과 무작위 노드 분할을 사용

하지만 데이터에 노이즈(실험오차, 돌연변이)가 있기때문에 ExtraTrees의 무작위 분할이 더 높은 정확도와 성능을 보임

XGBClassifier는 너무 오래걸려서 제외

In [40]:
# 4가지의 Decision Tree
models = [RandomForestClassifier(n_estimators=300, n_jobs=-1), 
          ExtraTreesClassifier(n_estimators=300, n_jobs=-1),
          XGBClassifier(n_estimators=300, n_jobs=-1),
          LGBMClassifier(n_estimators=300, n_jobs=-1)
          ]

for model in models:
    early_model(train_df, model, time=False)

RandomForestClassifier
Accuracy scores: ['0.94628', '0.94649', '0.94501', '0.94874', '0.94729', '0.93958', '0.95198', '0.94502', '0.94819', '0.95035']
Mean accuracy score: 0.946893

decamer 100 Accuracy score: ['0.86798', '0.86478', '0.85343', '0.87282', '0.87061', '0.85414', '0.88251', '0.87177', '0.87485', '0.88151']
Mean accuracy decamer 100 score: 0.869439

decamer 100 Accuracy score: ['0.91538', '0.91686', '0.92813', '0.92252', '0.92055', '0.90572', '0.92599', '0.91055', '0.91629', '0.91874']
Mean accuracy decamer 1000 score: 0.918072

decamer 100 Accuracy score: ['1.00000', '1.00000', '1.00000', '0.99980', '1.00000', '1.00000', '1.00000', '1.00000', '1.00000', '0.99980']
Mean accuracy decamer 100000 score: 0.999960

decamer 100 Accuracy score: ['1.00000', '1.00000', '1.00000', '1.00000', '1.00000', '1.00000', '1.00000', '1.00000', '1.00000', '1.00000']
Mean accuracy decamer 1000000 score: 1.000000

ExtraTreesClassifier
Accuracy scores: ['0.95111', '0.94717', '0.94406', '0.94585',

### ExtraTreesClassifier
- Mean accuracy score: 0.948574
- Mean accuracy decamer 100 score: 0.878315
- Mean accuracy decamer 1000 score: 0.915937
- Mean accuracy decamer 100000 score: 1.000000
- Mean accuracy decamer 1000000 score: 1.000000



### LGBMClassifier
- Mean accuracy score: 0.946596
- Mean accuracy decamer 100 score: 0.865578
- Mean accuracy decamer 1000 score: 0.920868
- Mean accuracy decamer 100000 score: 0.999901
- Mean accuracy decamer 1000000 score: 1.000000

## clustering

유사도를 기반으로 구분 -> KNN

Manhattan (L1): 각 좌표 차이의 절대값을 합산한 거리, $\sum_i |x_i - y_i|$
Euclidean (L2): 두 점 사이의 직선 거리(피타고라스), $\sqrt{\sum_i (x_i - y_i)^2}$

차원의 수가 많고, 노이즈가 많으면 L2보다 계산이 간단한 L1이 더 성능이 좋다.

In [5]:
# 2-nearest-neighbour classifier using Euclidean (L2) distance, with neighbours
# weighted by distance.
model = KNeighborsClassifier(
    n_neighbors=2,
    weights='distance',
    metric='euclidean',
    n_jobs=-1,
)

# weight=False: fit is called without sample weights for this model.
early_model(train_df, model, weight=False)

  0%|          | 0/10 [00:00<?, ?it/s]

####### Fold:  0


 10%|█         | 1/10 [00:58<08:48, 58.74s/it]

####### Fold:  1


 20%|██        | 2/10 [01:55<07:41, 57.73s/it]

####### Fold:  2


 30%|███       | 3/10 [02:52<06:41, 57.37s/it]

####### Fold:  3


 40%|████      | 4/10 [03:48<05:40, 56.77s/it]

####### Fold:  4


 50%|█████     | 5/10 [04:45<04:44, 56.96s/it]

####### Fold:  5


 60%|██████    | 6/10 [05:42<03:47, 56.79s/it]

####### Fold:  6


 70%|███████   | 7/10 [06:38<02:50, 56.71s/it]

####### Fold:  7


 80%|████████  | 8/10 [07:35<01:53, 56.78s/it]

####### Fold:  8


 90%|█████████ | 9/10 [08:31<00:56, 56.35s/it]

####### Fold:  9


100%|██████████| 10/10 [09:27<00:00, 56.74s/it]

--------------- KNeighborsClassifier --------------- 

Accuracy scores: ['0.91685', '0.91617', '0.91977', '0.91784', '0.91426', '0.91606', '0.91226', '0.91758', '0.91518', '0.92336']
Mean accuracy score: 0.916933

decamer 100 Accuracy score: ['0.79043', '0.77609', '0.79202', '0.78734', '0.78613', '0.79759', '0.79532', '0.78877', '0.77378', '0.80016']
Mean accuracy decamer 100 score: 0.788763

decamer 1000 Accuracy score: ['0.87787', '0.88510', '0.88756', '0.88335', '0.87160', '0.88065', '0.85000', '0.87439', '0.88873', '0.89094']
Mean accuracy decamer 1000 score: 0.879019

decamer 100000 Accuracy score: ['1.00000', '1.00000', '0.99980', '0.99980', '0.99980', '1.00000', '0.99980', '0.99961', '0.99980', '0.99980']
Mean accuracy decamer 100000 score: 0.999841

decamer 1000000 Accuracy score: ['1.00000', '1.00000', '1.00000', '1.00000', '1.00000', '1.00000', '1.00000', '1.00000', '1.00000', '1.00000']
Mean accuracy decamer 1000000 score: 1.000000






In [90]:
# Same 2-nearest-neighbour setup as the Euclidean run, but with Manhattan (L1)
# distance.
model = KNeighborsClassifier(
    n_neighbors=2,
    weights='distance',
    metric='manhattan',
    n_jobs=-1,
)

# weight=False: fit is called without sample weights for this model.
early_model(train_df, model, weight=False)

  0%|          | 0/10 [00:00<?, ?it/s]

####### Fold:  0


 10%|█         | 1/10 [02:02<18:21, 122.40s/it]

####### Fold:  1


 20%|██        | 2/10 [04:08<16:36, 124.51s/it]

####### Fold:  2


 30%|███       | 3/10 [06:15<14:39, 125.66s/it]

####### Fold:  3


 40%|████      | 4/10 [08:19<12:29, 125.00s/it]

####### Fold:  4


 50%|█████     | 5/10 [10:24<10:25, 125.05s/it]

####### Fold:  5


 60%|██████    | 6/10 [12:29<08:20, 125.03s/it]

####### Fold:  6


 70%|███████   | 7/10 [14:35<06:15, 125.33s/it]

####### Fold:  7


 80%|████████  | 8/10 [16:43<04:12, 126.05s/it]

####### Fold:  8


 90%|█████████ | 9/10 [18:49<02:06, 126.06s/it]

####### Fold:  9


100%|██████████| 10/10 [20:58<00:00, 125.81s/it]

--------------- KNeighborsClassifier --------------- 

Accuracy scores: ['0.96040', '0.95806', '0.96266', '0.96443', '0.96217', '0.95967', '0.95975', '0.96153', '0.96322', '0.96137']
Mean accuracy score: 0.961328

decamer 100 Accuracy score: ['0.80693', '0.78743', '0.80473', '0.81572', '0.81052', '0.79127', '0.80458', '0.81455', '0.80909', '0.80496']
Mean accuracy decamer 100 score: 0.804978

decamer 1000 Accuracy score: ['0.86376', '0.87299', '0.89109', '0.88814', '0.87736', '0.88361', '0.86322', '0.87319', '0.89166', '0.88200']
Mean accuracy decamer 1000 score: 0.878703

decamer 100000 Accuracy score: ['0.99979', '0.99958', '0.99979', '0.99958', '1.00000', '0.99979', '1.00000', '0.99978', '0.99979', '0.99979']
Mean accuracy decamer 100000 score: 0.999788

decamer 1000000 Accuracy score: ['1.00000', '1.00000', '1.00000', '1.00000', '1.00000', '1.00000', '1.00000', '1.00000', '1.00000', '1.00000']
Mean accuracy decamer 1000000 score: 1.000000






모든 모델에서 100000, 1000000에서 매우 높은 정확도를 보여주나

100, 1000에서는 성능이 낮은것을 확인

특히 클러스터링이 좀 더 좋은 성능을 보여줄것으로 예측했으나 특성공학과 적절한 알고리즘을 적용하지 못하여 점수가 낮음

## 적용하지 못한 특성공학들.

유전자의 돌연변이는 쉽게 일어날 수 있는 돌연변이와 어려운 돌연변이가 있음

A에서 T가 되는건 쉬운데 A에서 C가되는건 어려움

이러한 특성을 이용해서 거리를 측정할 때 A9T0G0C1 은 A9T0G1C0과는 가깝다 볼 수 있지만 A9T1G0C0은 가깝다 보기 어려움

Earth Mover's Distance(EMD)는 두 확률 분포 간의 차이를 측정하는 메트릭

EMD를 이용하여 변수들 간의 거리를 만들어 줄 수 있음

--------------------------------------------------------------

박테리아의 플라스미드 크기는 1kbp ~ 1000kbp 등으로 다양

하지만 100 decamers는 1kbp 분량이므로 플라스미드의 모든 정보를 가지고 있다고 보기 힘듦

즉 100, 1000 decamers 데이터는 업스케일링된 데이터일 가능성이 매우 높음

In [74]:
# Work on a copy so train_df itself is left untouched.
copied_df = train_df.copy()

# Rows whose gcd is 10000 or 1000.
gcd_mask = copied_df['gcd'].isin([10000, 1000])

# Halve every column except the last four and truncate back to integers.
# Presumably the last four are gcd, decamers, target and sample_weight, as in
# the head() preview -- verify if the column order changes.
selected_data = (copied_df.loc[gcd_mask].iloc[:, :-4] / 2).astype(int)

# Write the halved values back into the same rows and columns of the copy.
copied_df.loc[selected_data.index, selected_data.columns] = selected_data

In [77]:
# ExtraTrees on the halved copy. random_state is pinned so the scores can be
# reproduced; without it the random splits differ between runs.
model = ExtraTreesClassifier(n_estimators=300, n_jobs=-1, random_state=0)

early_model(copied_df, model)

  0%|          | 0/10 [00:00<?, ?it/s]

####### Fold:  0


 10%|█         | 1/10 [00:10<01:33, 10.43s/it]

####### Fold:  1


 20%|██        | 2/10 [00:20<01:20, 10.04s/it]

####### Fold:  2


 30%|███       | 3/10 [00:30<01:09,  9.96s/it]

####### Fold:  3


 40%|████      | 4/10 [00:39<00:59,  9.95s/it]

####### Fold:  4


 50%|█████     | 5/10 [00:50<00:50, 10.00s/it]

####### Fold:  5


 60%|██████    | 6/10 [01:00<00:39,  9.98s/it]

####### Fold:  6


 70%|███████   | 7/10 [01:09<00:29,  9.96s/it]

####### Fold:  7


 80%|████████  | 8/10 [01:19<00:19,  9.90s/it]

####### Fold:  8


 90%|█████████ | 9/10 [01:29<00:10, 10.00s/it]

####### Fold:  9


100%|██████████| 10/10 [01:39<00:00,  9.96s/it]

--------------- ExtraTreesClassifier --------------- 

Accuracy scores: ['0.95078', '0.95047', '0.95140', '0.95485', '0.95189', '0.94925', '0.94667', '0.94873', '0.94661', '0.94310']
Mean accuracy score: 0.949376

decamer 100 Accuracy score: ['0.88153', '0.88678', '0.88348', '0.90068', '0.87913', '0.86968', '0.88410', '0.87267', '0.87893', '0.86772']
Mean accuracy decamer 100 score: 0.880469

decamer 1000 Accuracy score: ['0.92128', '0.91343', '0.92076', '0.91660', '0.92922', '0.92428', '0.90402', '0.92243', '0.91035', '0.90945']
Mean accuracy decamer 1000 score: 0.917181

decamer 100000 Accuracy score: ['1.00000', '1.00000', '1.00000', '1.00000', '1.00000', '1.00000', '1.00000', '1.00000', '1.00000', '1.00000']
Mean accuracy decamer 100000 score: 1.000000

decamer 1000000 Accuracy score: ['1.00000', '1.00000', '1.00000', '1.00000', '1.00000', '1.00000', '1.00000', '1.00000', '1.00000', '1.00000']
Mean accuracy decamer 1000000 score: 1.000000




