# Task3. Enzyme Commission Number Assignment

> author: Shizhenkun   
> email: zhenkun.shi@tib.cas.cn   
> date: 2021-12-09  


## 1. Import packages

In [3]:
import numpy as np
import pandas as pd
import time
import datetime
import sys
import os
from tqdm import tqdm
from functools import reduce
import joblib

sys.path.append("../tools/")
import funclib

sys.path.append("../")
import benchmark_train as btrain
import benchmark_test as btest
import config as cfg
import benchmark_evaluation as eva

from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

from pandarallel import pandarallel # adds the .parallel_apply accessor used on pandas Series further down
pandarallel.initialize() # must run before the first parallel_apply call; prints the worker/transfer settings it picked

%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 80 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 2. Load data

In [4]:
# Read the train and test splits for task 3 from feather files under cfg.DATADIR.
train_path = cfg.DATADIR + 'task3/train.feather'
test_path = cfg.DATADIR + 'task3/test.feather'
train = pd.read_feather(train_path)
test = pd.read_feather(test_path)

# Report how many rows each split contains.
n_train, n_test = len(train), len(test)
print('train size: {0}\ntest size: {1}'.format(n_train, n_test))

train size: 222567
test size: 3304


## 3. Make label

In [6]:
# Re-shape both splits with funclib.split_ecdf_to_single_lines
# (presumably one EC number per row afterwards -- confirm in funclib).
train_set = funclib.split_ecdf_to_single_lines(train)
test_set = funclib.split_ecdf_to_single_lines(test)

# Load the EC-number -> label dictionary from disk when a cached copy exists;
# otherwise build it from the EC numbers of the train and test sets.
print('loading ec to label dict')
if os.path.exists(cfg.FILE_EC_LABEL_DICT):
    # NOTE(review): allow_pickle=True unpickles arbitrary Python objects;
    # only load dictionary files that were produced by this project.
    dict_ec_label = np.load(cfg.FILE_EC_LABEL_DICT, allow_pickle=True).item()
else:
    dict_ec_label = btrain.make_ec_label(
        train_label=train_set['ec_number'],
        test_label=test_set['ec_number'],
        file_save=cfg.FILE_EC_LABEL_DICT,
        force_model_update=cfg.UPDATE_MODEL,
    )

# Series.map with a dict is a vectorised lookup: it replaces the per-row
# lambda + multi-process dispatch, and leaves a missing value for EC numbers
# that are absent from the dictionary.
train_set['ec_label'] = train_set['ec_number'].map(dict_ec_label)
test_set['ec_label'] = test_set['ec_number'].map(dict_ec_label)

# A cached dictionary can be stale relative to the current data. Report
# unmapped EC numbers here instead of letting them surface later as an
# obscure failure when the labels are cast to int.
for split_name, split_df in (('train', train_set), ('test', test_set)):
    n_unmapped = int(split_df['ec_label'].isna().sum())
    if n_unmapped:
        print('WARNING: {0} rows in the {1} set have an EC number that is missing from the label dict'.format(n_unmapped, split_name))

100%|██████████████████████████████████████████████████████████████████████████████████████████████| 222567/222567 [04:39<00:00, 796.96it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████| 3304/3304 [00:00<00:00, 69170.93it/s]

loading ec to label dict





## 4. Embedding Comparison
### 4.1 one-hot + ML

In [11]:
# Work on copies so the labelled train_set / test_set frames stay untouched
# by the sequence padding done for this embedding.
trainset, testset = train_set.copy(), test_set.copy()

In [12]:
MAX_SEQ_LENGTH = 1500  # maximum sequence length kept per record


def _fit_to_max_length(sequence):
    """Cut ``sequence`` at MAX_SEQ_LENGTH characters and right-pad shorter ones with 'X'."""
    return sequence[0:MAX_SEQ_LENGTH].ljust(MAX_SEQ_LENGTH, 'X')


# Bring every sequence to exactly MAX_SEQ_LENGTH characters.
trainset['seq'] = trainset['seq'].map(_fit_to_max_length)
testset['seq'] = testset['seq'].map(_fit_to_max_length)

f_train = funclib.dna_onehot(trainset)  # encode the training set
f_test = funclib.dna_onehot(testset)  # encode the test set

In [None]:
# Compute the baseline metrics.
# Feature matrices: every column of the encoded frames from the third one onwards.
X_train, X_test = (np.array(encoded.iloc[:, 2:]) for encoded in (f_train, f_test))
# Targets: the EC labels cast to integers.
Y_train, Y_test = (np.array(labelled['ec_label'].astype('int')) for labelled in (trainset, testset))
funclib.run_baseline(X_train, Y_train, X_test, Y_test, type='multi')

 baslineName 		 accuracy 	 precision-macro 	 recall-macro 	 f1-macro
         knn  		0.068579  	0.572562 		0.376389 	0.044914
