# Task3. Enzyme Commission Number Assignment

> author: Shizhenkun   
> email: zhenkun.shi@tib.cas.cn   
> date: 2021-12-09  


## 1. Import packages

In [1]:
import numpy as np
import pandas as pd
import time
import datetime
import sys
import os
from tqdm import tqdm
from functools import reduce
import joblib

sys.path.append("../tools/")
import funclib

sys.path.append("../")
import benchmark_train as btrain
import benchmark_test as btest
import config as cfg
import benchmark_evaluation as eva

from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

from pandarallel import pandarallel  # parallel pandas helpers; `.parallel_apply` is used in the labelling cell
pandarallel.initialize()  # one-time setup; must run before any `.parallel_apply` call

# IPython autoreload in mode 2, so edits to imported project modules
# (funclib, benchmark_*, config) are presumably picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 80 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## 2. Load data

In [2]:
# Read the task-3 train and test tables (feather files located under cfg.DATADIR).
train, test = (
    pd.read_feather(cfg.DATADIR + rel_path)
    for rel_path in ('task3/train.feather', 'task3/test.feather')
)

# Report how many records each split contains.
n_train, n_test = len(train), len(test)
print('train size: {0}\ntest size: {1}'.format(n_train, n_test))

train size: 222567
test size: 3304


## 3. Make label

In [3]:
# Reshape both splits with the project helper
# (presumably one EC number per output row -- confirm in funclib).
train_set = funclib.split_ecdf_to_single_lines(train)
test_set = funclib.split_ecdf_to_single_lines(test)

# Load the EC-number -> label dictionary, building it only when no cached file exists.
print('loading ec to label dict')
if not os.path.exists(cfg.FILE_EC_LABEL_DICT):
    # No cached dictionary: let the project helper create it from the EC numbers
    # of both splits; the target path is handed over via `file_save`.
    dict_ec_label = btrain.make_ec_label(
        train_label=train_set['ec_number'],
        test_label=test_set['ec_number'],
        file_save=cfg.FILE_EC_LABEL_DICT,
        force_model_update=cfg.UPDATE_MODEL,
    )
else:
    # NOTE(review): allow_pickle=True unpickles the file content -- only load
    # dictionary files that come from a trusted source.
    dict_ec_label = np.load(cfg.FILE_EC_LABEL_DICT, allow_pickle=True).item()

# Attach the label of every EC number; dict.get returns None for EC numbers
# that are absent from the dictionary.
for labelled_frame in (train_set, test_set):
    labelled_frame['ec_label'] = labelled_frame.ec_number.parallel_apply(lambda x: dict_ec_label.get(x))

100%|██████████████████████████████████████████████████████████████████████████████████████████████| 222567/222567 [04:46<00:00, 778.03it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████| 3304/3304 [00:00<00:00, 94681.62it/s]

loading ec to label dict





## 4. Embedding Comparison

### 4.5 ESM REP0 + ML

In [4]:
# Work on copies so that adding columns here does not modify train_set / test_set.
trainset = train_set.copy()
testset = test_set.copy()

# Re-encode the training labels as consecutive integer codes 0..n_classes-1.
# The set is built once and enumerated, which yields the same mapping as zipping
# the set against a range of its length.
encode_dict = {label: code for code, label in enumerate(set(trainset.ec_label))}

# NOTE(review): labels that are not in encode_dict (i.e. never seen in training)
# fall back to code 0, which is also the code of a genuine class. Unseen test
# labels are therefore evaluated as if they belonged to class 0 -- confirm this
# is intended before interpreting the metrics.
trainset['ec_label_ecd'] = trainset.ec_label.apply(lambda x: encode_dict.get(x, 0))
testset['ec_label_ecd'] = testset.ec_label.apply(lambda x: encode_dict.get(x, 0))

# Per-sequence embedding tables, keyed by 'id'.
train_esm_0 = pd.read_feather(cfg.DATADIR + 'train_rep32.feather')
test_esm_0 = pd.read_feather(cfg.DATADIR + 'test_rep32.feather')

# Left-join the embeddings onto the labelled rows. validate='many_to_one' makes
# pandas raise MergeError if an id occurs more than once in the embedding table;
# without it such duplicates would silently multiply rows, so the merged frame
# would no longer have one row per row of trainset / testset.
train_esm = trainset.merge(train_esm_0, on='id', how='left', validate='many_to_one')
test_esm = testset.merge(test_esm_0, on='id', how='left', validate='many_to_one')

In [None]:
# Feature matrices: every column from position 5 onwards of the merged frames.
# NOTE(review): the offset 5 assumes the merged frames start with exactly five
# non-feature columns (id, labels, ...) followed by the embedding columns --
# confirm against train_esm.columns.
X_train = np.array(train_esm.iloc[:, 5:])
X_test = np.array(test_esm.iloc[:, 5:])

# Take the encoded labels from the same merged frames as the features, so that
# X and Y are row-aligned by construction instead of relying on the merge having
# left the row count and order of the pre-merge frames unchanged.
Y_train = np.array(train_esm.ec_label_ecd.astype('int'))
Y_test = np.array(test_esm.ec_label_ecd.astype('int'))

# Fit and score the project's baseline classifiers in multi-class mode.
funclib.run_baseline_2(X_train, Y_train, X_test, Y_test, type='multi')

 baslineName 		 accuracy 	 precision-macro 	 recall-macro 	 f1-macro
         knn  		0.440164  	0.473911 		0.667822 	0.239496
          xg  		0.134426  	0.811717 		0.190549 	0.039573
