## Method: SimiProtein
> 2024-11-05

### 1. 导入必要的包

In [1]:
import numpy as np
import pandas as pd
import sys,os
sys.path.insert(0, os.path.dirname(os.path.realpath('__file__')))
sys.path.insert(1,'../')
from config import conf as cfg
from tkinter import _flatten 
from tqdm.notebook import tqdm
tqdm.pandas()
import modules.simi_caculator as simitool
from pandarallel import pandarallel # 导入pandaralle
pandarallel.initialize(progress_bar=False)
import concurrent.futures
import warnings
# pandarallel.initialize(progress_bar=True)

os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 112 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


### 2. Load data and features
#### 2.1 Load datasets

In [2]:

# Maps each embedding-method key to the cfg entry holding the path of its feature file
# (read with pd.read_feather by load_feature).
dict_feature_path ={
    'esm':cfg.FILE_EMBD_PROTEIN_ESM2_L33_650M,
    'unirep':cfg.FILE_EMBD_PROTEIN_UNIREP,
    't5':cfg.FILE_EMBD_PROTEIN_T5_SEQ
}

def load_feature(method):
    """Load the embedding table of one method and normalise its column names.

    Args:
        method: Key of ``dict_feature_path`` selecting the feather file to read.
            For 'esm', 'unirep' and 't5' the method-specific embedding column is
            renamed to the shared name ``features``.

    Returns:
        DataFrame restricted to the columns ``uniprot_id`` and ``features``.

    Raises:
        ValueError: If ``method`` has no entry in ``dict_feature_path``.
    """
    # Fail early with a readable message instead of passing None to read_feather.
    if method not in dict_feature_path:
        raise ValueError(
            f'Unknown embedding method {method!r}; expected one of {sorted(dict_feature_path)}'
        )
    feature_path = dict_feature_path[method]

    # Name of the embedding column in each method's feather file.
    source_columns = {'esm': 'rep33', 'unirep': 'unirep', 't5': 't5_per_protein'}

    print(f'Loading {method} features from {feature_path}')
    feature = pd.read_feather(feature_path)

    # Methods without a known source column are left untouched (as before).
    source_column = source_columns.get(method)
    if source_column is not None:
        feature = feature.rename(columns={source_column: 'features'})
    print(f'{method} features loaded, shape: {feature.shape}')

    return feature[['uniprot_id', 'features']]

def merge_features(embdfeatures, data_train, data_test):
    """Left-join the embedding table onto the train and test frames.

    Args:
        embdfeatures: DataFrame with a ``uniprot_id`` column plus feature columns.
        data_train: Training frame containing ``uniprot_id``.
        data_test: Test frame containing ``uniprot_id``.

    Returns:
        Tuple ``(train_with_features, test_with_features)``. Every row of the
        input frames is kept; rows without a match get missing feature values.
    """
    print('adding features to trian and test data')
    with_features = [
        split.merge(embdfeatures, on='uniprot_id', how='left')
        for split in (data_train, data_test)
    ]
    return with_features[0], with_features[1]

def load_data(path):
    """Read a feather file and keep only the id, reaction and EC columns.

    Args:
        path: Location of the feather file.

    Returns:
        DataFrame with the columns ``uniprot_id``, ``reaction_id`` and ``ec_number``.
    """
    kept_columns = ['uniprot_id', 'reaction_id', 'ec_number']
    return pd.read_feather(path)[kept_columns]

def load_dataset(fold_num):
    """Load the train and validation splits of one cross-validation fold.

    Args:
        fold_num: Fold number used to build the ``validation/fold{fold_num}/`` path
            under ``cfg.DIR_DATASET``.

    Returns:
        Tuple ``(data_train, data_test)`` as produced by ``load_data`` for
        ``train.feather`` and ``valid.feather`` respectively.
    """
    fold_dir = f'{cfg.DIR_DATASET}validation/fold{fold_num}/'
    train_path = fold_dir + 'train.feather'
    test_path = fold_dir + 'valid.feather'

    splits = []
    for split_path in (train_path, test_path):
        print(f'Loading data from {split_path}')
        splits.append(load_data(split_path))
    data_train, data_test = splits

    print(f'Data loaded, Fold: {fold_num}. Train size: {len(data_train)}, Test size: {len(data_test)}')

    return data_train, data_test


def get_top_protein_simi(x_feature, y_feature, y_uniprot_id, topk):
    """Return the ``topk`` best-matching reference proteins under two metrics.

    ``simitool.get_euclidean_distances`` and ``simitool.get_cosine_similarity`` are
    run in two worker threads; element ``[0]`` of each result is rounded to
    6 decimals and paired with ``y_uniprot_id``.

    Args:
        x_feature: Query features handed to the simitool functions.
        y_feature: Reference features handed to the simitool functions.
        y_uniprot_id: Identifiers zipped with the scores. Repeated identifiers
            collapse into a single entry (dict semantics: last score wins).
        topk: Number of hits to keep per metric; must be a non-negative integer.

    Returns:
        Tuple ``(euclidean_hits, cosine_hits)``. Each is a list of
        ``(uniprot_id, score)`` tuples; Euclidean hits are ordered by ascending
        score, cosine hits by descending score.
    """
    import heapq  # local import so the function does not depend on another cell

    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_euclidean = executor.submit(simitool.get_euclidean_distances, x_feature, y_feature)
        future_cosine = executor.submit(simitool.get_cosine_similarity, x_feature, y_feature)

        res_euclidean = np.round(future_euclidean.result()[0], 6)
        res_cosine = np.round(future_cosine.result()[0], 6)

    def score_of(item):
        return item[1]

    # heapq.nsmallest / nlargest are documented as equivalent to
    # sorted(..., key=...)[:n] and sorted(..., key=..., reverse=True)[:n],
    # but avoid fully sorting every reference score just to keep topk of them.
    res_euclidean_dict = dict(zip(y_uniprot_id, res_euclidean))
    final_res_euclidean = heapq.nsmallest(topk, res_euclidean_dict.items(), key=score_of)

    res_cosine_dict = dict(zip(y_uniprot_id, res_cosine))
    final_res_cosine = heapq.nlargest(topk, res_cosine_dict.items(), key=score_of)

    return final_res_euclidean, final_res_cosine


In [6]:

def run_fold(fold_num: int, embd_methd: str, topk: int = 100, max_test_rows=10):
    """Run the embedding-similarity search for one fold and store the result as HDF5.

    Loads the fold's train/validation splits, attaches the chosen embeddings,
    ranks the training proteins for every evaluated test protein with
    ``get_top_protein_simi`` and writes the table to
    ``{cfg.RESULTS_DIR}simi/fold_{fold_num}_{embd_methd}_results.h5``.

    Args:
        fold_num: Fold number passed to ``load_dataset``.
        embd_methd: Embedding method key passed to ``load_feature``.
        topk: Number of hits kept per metric (default 100, the former constant).
        max_test_rows: Evaluate only the first N test rows. The default of 10
            keeps the previously hard-coded limit; pass ``None`` to process
            the whole test split.

    Returns:
        None. Progress and errors are reported via ``print``.
    """
    print(f'Running fold {fold_num} with embedding method {embd_methd}...')

    # Load the training and test splits.
    data_train, data_test = load_dataset(fold_num=fold_num)

    # Load the embedding vectors.
    embd_featuers = load_feature(method=embd_methd)

    # Attach the embeddings to both splits.
    data_train, data_test = merge_features(embdfeatures=embd_featuers, data_train=data_train, data_test=data_test)

    # Build the reference bank: one row per training protein. np.stack raises
    # if the vectors do not share a shape instead of silently reshaping them.
    bank_features = np.stack(data_train.features.values)
    # Hoisted out of the per-row lambda; it was rebuilt for every test protein.
    bank_uniprot_ids = data_train.uniprot_id.to_list()

    if max_test_rows is not None:
        # Copy so the column assignment below does not target a slice.
        data_test = data_test.head(max_test_rows).copy()

    print('Caculating similarity between test set and bank set...')
    data_test[['euclidean', 'cosine']] = data_test.progress_apply(
        lambda x: get_top_protein_simi(x_feature=x.features[np.newaxis, :],  # type: ignore
                                       y_feature=bank_features,
                                       y_uniprot_id=bank_uniprot_ids,
                                       topk=topk),
        axis=1, result_type="expand")

    data_test = data_test[['uniprot_id', 'reaction_id', 'ec_number', 'euclidean', 'cosine']]

    out_path = f'{cfg.RESULTS_DIR}simi/fold_{fold_num}_{embd_methd}_results.h5'
    try:
        # Silence the PerformanceWarning only for this write instead of
        # changing the process-wide warning filters.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", pd.errors.PerformanceWarning)
            # The context manager closes the store; no explicit close() needed.
            with pd.HDFStore(out_path, 'w') as h5:
                h5['data'] = data_test
    except Exception as e:
        print(f"Error writing HDF5 file: {e}")
    else:
        # Report success only when the file was actually written.
        print(f'Similarity calculation for fold {fold_num} and embedding method {embd_methd} is done, file saved to {out_path}')

In [4]:
# Run the similarity search for fold 1 with the 'esm' embeddings.
FOLD_NUM = 1
EMBD_METHD = 'esm'
EMBD_METHDS = ['esm', 'unirep', 't5']  # not referenced in this cell

run_fold(fold_num=FOLD_NUM, embd_methd=EMBD_METHD)

Loading data from /hpcfs/fhome/shizhenkun/codebase/RXNRECer/data/datasets/task240524/validation/fold1/train.feather
Loading data from /hpcfs/fhome/shizhenkun/codebase/RXNRECer/data/datasets/task240524/validation/fold1/valid.feather
Data loaded, Fold: 1. Train size: 457729, Test size: 50858
Loading esm features from /hpcfs/fhome/shizhenkun/codebase/RXNRECer/data/featurebank/esm2_l33_650m.feather
esm features loaded, shape: (565183, 6)
adding features to trian and test data
Caculating similarity between test set and bank set...


  0%|          | 0/10 [00:00<?, ?it/s]

Similarity calculation for fold 1 and embedding method esm is done, file saved to /hpcfs/fhome/shizhenkun/codebase/RXNRECer/results/simi/fold_1_esm_results.h5


In [12]:
# Reload the fold-1 ESM results. The path is derived from cfg.RESULTS_DIR (the
# same pattern run_fold writes to) instead of a machine-specific absolute path.
with pd.HDFStore(f'{cfg.RESULTS_DIR}simi/fold_1_esm_results.h5', 'r') as h5:
    res_esm = h5['data']
res_esm

Unnamed: 0,uniprot_id,reaction_id,ec_number,euclidean,cosine
0,Q9UYB6,-,-,"[(O58185, 0.473194), (Q8U3J1, 0.865424), (O282...","[(O58185, 0.997938), (Q8U3J1, 0.993183), (O282..."
1,C1AQW9,RHEA:19669,3.6.5.-,"[(A1KL96, 0.0), (A5U598, 0.0), (P9WK96, 0.0), ...","[(P65270, 1.0), (A1KL96, 1.0), (A5U598, 1.0), ..."
2,P64647,-,-,"[(P64648, 0.0), (P64646, 4e-06), (P64453, 1.04...","[(P64648, 1.0), (P64646, 1.0), (P64453, 0.9928..."
3,Q9MTM3,RHEA:21248,2.7.7.6,"[(B0Z545, 0.038679), (B0Z5C9, 0.053632), (B0Z4...","[(B0Z545, 0.99999), (B0Z5C9, 0.999982), (B0Z4W..."
4,P45894,RHEA:17989;RHEA:46608,2.7.11.1,"[(Q95ZQ4, 0.71042), (U4PR86, 0.835448), (O7453...","[(Q95ZQ4, 0.995369), (Q09137, 0.993601), (P546..."
5,A9MPW3,RHEA:27473;RHEA:27465,2.7.1.167;2.7.7.70,"[(Q5PC86, 0.166025), (B5BG11, 0.166025), (B5QZ...","[(Q5PC86, 0.999656), (B5BG11, 0.999656), (B5QZ..."
6,P53567,-,-,"[(P26801, 0.292173), (Q3T0B9, 0.353782), (Q8R0...","[(P26801, 0.999124), (Q3T0B9, 0.998702), (Q8R0..."
7,P53866,-,-,"[(A6ZRL6, 0.035527), (Q6CQ59, 0.419905), (Q6FP...","[(A6ZRL6, 0.999992), (Q6CQ59, 0.998936), (Q6FP..."
8,Q8YAC5,RHEA:16373,2.7.1.33,"[(C1KYF4, 0.108294), (Q724J2, 0.108294), (B8DG...","[(C1KYF4, 0.999862), (Q724J2, 0.999862), (B8DG..."
9,Q8Z495,-,-,"[(P0CL00, 0.139058), (E1WAC5, 0.139059), (P0A1...","[(E1WAC5, 0.999834), (P0CL00, 0.999834), (P0A1..."


In [13]:
# Reload the fold-1 ESM results. The path is derived from cfg.RESULTS_DIR (the
# same pattern run_fold writes to) instead of a machine-specific absolute path.
with pd.HDFStore(f'{cfg.RESULTS_DIR}simi/fold_1_esm_results.h5', 'r') as h5:
    res_esm = h5['data']
res_esm

Unnamed: 0,uniprot_id,reaction_id,ec_number,euclidean,cosine
0,Q9UYB6,-,-,"[(O58185, 0.473194), (Q8U3J1, 0.865424), (O282...","[(O58185, 0.997938), (Q8U3J1, 0.993183), (O282..."
1,C1AQW9,RHEA:19669,3.6.5.-,"[(A1KL96, 0.0), (A5U598, 0.0), (P9WK96, 0.0), ...","[(P65270, 1.0), (A1KL96, 1.0), (A5U598, 1.0), ..."
2,P64647,-,-,"[(P64648, 0.0), (P64646, 4e-06), (P64453, 1.04...","[(P64648, 1.0), (P64646, 1.0), (P64453, 0.9928..."
3,Q9MTM3,RHEA:21248,2.7.7.6,"[(B0Z545, 0.038679), (B0Z5C9, 0.053632), (B0Z4...","[(B0Z545, 0.99999), (B0Z5C9, 0.999982), (B0Z4W..."
4,P45894,RHEA:17989;RHEA:46608,2.7.11.1,"[(Q95ZQ4, 0.71042), (U4PR86, 0.835448), (O7453...","[(Q95ZQ4, 0.995369), (Q09137, 0.993601), (P546..."
5,A9MPW3,RHEA:27473;RHEA:27465,2.7.1.167;2.7.7.70,"[(Q5PC86, 0.166025), (B5BG11, 0.166025), (B5QZ...","[(Q5PC86, 0.999656), (B5BG11, 0.999656), (B5QZ..."
6,P53567,-,-,"[(P26801, 0.292173), (Q3T0B9, 0.353782), (Q8R0...","[(P26801, 0.999124), (Q3T0B9, 0.998702), (Q8R0..."
7,P53866,-,-,"[(A6ZRL6, 0.035527), (Q6CQ59, 0.419905), (Q6FP...","[(A6ZRL6, 0.999992), (Q6CQ59, 0.998936), (Q6FP..."
8,Q8YAC5,RHEA:16373,2.7.1.33,"[(C1KYF4, 0.108294), (Q724J2, 0.108294), (B8DG...","[(C1KYF4, 0.999862), (Q724J2, 0.999862), (B8DG..."
9,Q8Z495,-,-,"[(P0CL00, 0.139058), (E1WAC5, 0.139059), (P0A1...","[(E1WAC5, 0.999834), (P0CL00, 0.999834), (P0A1..."


In [5]:
# Run the similarity search for fold 1 with the 'unirep' embeddings.
FOLD_NUM = 1
EMBD_METHD = 'unirep'
EMBD_METHDS = ['esm', 'unirep', 't5']  # not referenced in this cell

run_fold(fold_num=FOLD_NUM, embd_methd=EMBD_METHD)

Loading data from /hpcfs/fhome/shizhenkun/codebase/RXNRECer/data/datasets/task240524/validation/fold1/train.feather
Loading data from /hpcfs/fhome/shizhenkun/codebase/RXNRECer/data/datasets/task240524/validation/fold1/valid.feather
Data loaded, Fold: 1. Train size: 457729, Test size: 50858
Loading unirep features from /hpcfs/fhome/shizhenkun/codebase/RXNRECer/data/featurebank/unirep.feather
unirep features loaded, shape: (522102, 3)
adding features to trian and test data
Caculating similarity between test set and bank set...


  0%|          | 0/10 [00:00<?, ?it/s]

Similarity calculation for fold 1 and embedding method unirep is done, file saved to /hpcfs/fhome/shizhenkun/codebase/RXNRECer/results/simi/fold_1_unirep_results.h5


In [11]:
# Reload the fold-1 UniRep results. The path is derived from cfg.RESULTS_DIR (the
# same pattern run_fold writes to) instead of a machine-specific absolute path.
with pd.HDFStore(f'{cfg.RESULTS_DIR}simi/fold_1_unirep_results.h5', 'r') as h5:
    res_unirep = h5['data']
res_unirep

Unnamed: 0,uniprot_id,reaction_id,ec_number,euclidean,cosine
0,Q9UYB6,-,-,"[(O58185, 1.067479), (Q8U3J1, 1.1344), (O57947...","[(O58185, 0.974287), (Q8U3J1, 0.970858), (O579..."
1,C1AQW9,RHEA:19669,3.6.5.-,"[(P65270, 0.0), (A1KL96, 0.0), (A5U598, 0.0), ...","[(P65270, 1.0), (A1KL96, 1.0), (A5U598, 1.0), ..."
2,P64647,-,-,"[(P64648, 0.0), (P64646, 0.0), (P64453, 1.3582...","[(P64648, 1.0), (P64646, 1.0), (P64453, 0.9711..."
3,Q9MTM3,RHEA:21248,2.7.7.6,"[(B0Z545, 0.018057), (B0Z5C9, 0.04094), (B0Z4W...","[(B0Z545, 0.999991), (B0Z5C9, 0.999955), (B0Z4..."
4,P45894,RHEA:17989;RHEA:46608,2.7.11.1,"[(Q09137, 0.777959), (P54646, 0.786766), (Q289...","[(Q09137, 0.983195), (P54646, 0.982973), (Q289..."
5,A9MPW3,RHEA:27473;RHEA:27465,2.7.1.167;2.7.7.70,"[(B5F696, 0.170626), (Q57JQ9, 0.170626), (B4TI...","[(B5F696, 0.999433), (Q57JQ9, 0.999433), (B4TI..."
6,P53567,-,-,"[(Q3T0B9, 0.415932), (P26801, 0.49377), (F1QW7...","[(Q3T0B9, 0.996505), (P26801, 0.995124), (F1QW..."
7,P53866,-,-,"[(A6ZRL6, 0.042175), (Q6CQ59, 0.768815), (Q875...","[(A6ZRL6, 0.999967), (Q6CQ59, 0.988905), (O132..."
8,Q8YAC5,RHEA:16373,2.7.1.33,"[(C1KYF4, 0.074809), (Q724J2, 0.074809), (B8DG...","[(C1KYF4, 0.999941), (Q724J2, 0.999941), (B8DG..."
9,Q8Z495,-,-,"[(E1WAC5, 0.188964), (P0CL00, 0.188964), (O666...","[(E1WAC5, 0.999263), (P0CL00, 0.999263), (O666..."


In [7]:
# Run the similarity search for fold 1 with the 't5' embeddings.
FOLD_NUM = 1
EMBD_METHD = 't5'
EMBD_METHDS = ['esm', 'unirep', 't5']  # not referenced in this cell

run_fold(fold_num=FOLD_NUM, embd_methd=EMBD_METHD)

Running fold 1 with embedding method t5...
Loading data from /hpcfs/fhome/shizhenkun/codebase/RXNRECer/data/datasets/task240524/validation/fold1/train.feather
Loading data from /hpcfs/fhome/shizhenkun/codebase/RXNRECer/data/datasets/task240524/validation/fold1/valid.feather
Data loaded, Fold: 1. Train size: 457729, Test size: 50858
Loading t5 features from /hpcfs/fhome/shizhenkun/codebase/RXNRECer/data/featurebank/t5seq.feather
t5 features loaded, shape: (522102, 3)
adding features to trian and test data
Caculating similarity between test set and bank set...


  0%|          | 0/10 [00:00<?, ?it/s]

Similarity calculation for fold 1 and embedding method t5 is done, file saved to /hpcfs/fhome/shizhenkun/codebase/RXNRECer/results/simi/fold_1_t5_results.h5


In [10]:
# Reload the fold-1 T5 results. The path is derived from cfg.RESULTS_DIR (the
# same pattern run_fold writes to) instead of a machine-specific absolute path.
with pd.HDFStore(f'{cfg.RESULTS_DIR}simi/fold_1_t5_results.h5', 'r') as h5:
    res_t5 = h5['data']
res_t5

Unnamed: 0,uniprot_id,reaction_id,ec_number,euclidean,cosine
0,Q9UYB6,-,-,"[(O58185, 0.128763), (Q8U3J1, 0.159559), (Q8U4...","[(O58185, 0.997383), (Q8U3J1, 0.996018), (Q8U4..."
1,C1AQW9,RHEA:19669,3.6.5.-,"[(P65270, 0.0), (A1KL96, 0.0), (A5U598, 0.0), ...","[(P65270, 1.0), (A1KL96, 1.0), (A5U598, 1.0), ..."
2,P64647,-,-,"[(P64648, 0.0), (P64646, 0.0), (P64453, 0.5625...","[(P64648, 1.0), (P64646, 1.0), (P64453, 0.9758..."
3,Q9MTM3,RHEA:21248,2.7.7.6,"[(B0Z545, 0.004401), (B0Z5C9, 0.015392), (B0Z4...","[(B0Z545, 0.999998), (B0Z5C9, 0.999979), (B0Z4..."
4,P45894,RHEA:17989;RHEA:46608,2.7.11.1,"[(O94168, 0.362471), (Q00372, 0.375524), (P524...","[(O94168, 0.983767), (Q95ZQ4, 0.982562), (Q003..."
5,A9MPW3,RHEA:27473;RHEA:27465,2.7.1.167;2.7.7.70,"[(Q5PC86, 0.032469), (B5BG11, 0.032469), (B5F6...","[(Q5PC86, 0.999847), (B5BG11, 0.999847), (B5F6..."
6,P53567,-,-,"[(P26801, 0.2172), (Q3T0B9, 0.223511), (Q8W191...","[(P26801, 0.996495), (Q3T0B9, 0.99639), (Q9SM5..."
7,P53866,-,-,"[(A6ZRL6, 0.022946), (A7TQN2, 0.302361), (Q6CQ...","[(A6ZRL6, 0.999958), (A7TQN2, 0.99254), (Q6CQ5..."
8,Q8YAC5,RHEA:16373,2.7.1.33,"[(C1KYF4, 0.023721), (Q724J2, 0.023721), (B8DG...","[(C1KYF4, 0.999931), (Q724J2, 0.999931), (B8DG..."
9,Q8Z495,-,-,"[(E1WAC5, 0.067969), (P0CL00, 0.067969), (P0A1...","[(E1WAC5, 0.999502), (P0CL00, 0.999502), (P0A1..."


In [18]:
33

0

In [None]:
train_with_features = pd.read_csv('/hpcfs/fhome/shizhenkun/codebase/RXNRECer/data/train_with_features.csv')

In [None]:
# Load the three embedding tables, keeping only the id column and the embedding column.
print(f'Loading esm features from {cfg.FILE_EMBD_PROTEIN_ESM2_L33_650M}')
f_esm = pd.read_feather(cfg.FILE_EMBD_PROTEIN_ESM2_L33_650M)[['uniprot_id', 'rep33']].rename(columns={'rep33': 'esm'})
# f-prefix added: without it the placeholder text was printed literally.
print(f'Loading unirep features from {cfg.FILE_EMBD_PROTEIN_UNIREP}')
f_unirep = pd.read_feather(cfg.FILE_EMBD_PROTEIN_UNIREP)[['uniprot_id','unirep']]
# f-prefix added here as well.
print(f'Loading t5 features from {cfg.FILE_EMBD_PROTEIN_T5_SEQ}')
f_t5 = pd.read_feather(cfg.FILE_EMBD_PROTEIN_T5_SEQ)[['uniprot_id', 't5_per_protein']].rename(columns={'t5_per_protein': 't5'})

In [27]:
# Load the three embedding tables, keeping only the id column and the embedding column.
print(f'Loading esm features from {cfg.FILE_EMBD_PROTEIN_ESM2_L33_650M}')
f_esm = pd.read_feather(cfg.FILE_EMBD_PROTEIN_ESM2_L33_650M)[['uniprot_id', 'rep33']].rename(columns={'rep33': 'esm'})
# f-prefix added: without it the placeholder text was printed literally.
print(f'Loading unirep features from {cfg.FILE_EMBD_PROTEIN_UNIREP}')
f_unirep = pd.read_feather(cfg.FILE_EMBD_PROTEIN_UNIREP)[['uniprot_id','unirep']]
# f-prefix added here as well.
print(f'Loading t5 features from {cfg.FILE_EMBD_PROTEIN_T5_SEQ}')
f_t5 = pd.read_feather(cfg.FILE_EMBD_PROTEIN_T5_SEQ)[['uniprot_id', 't5_per_protein']].rename(columns={'t5_per_protein': 't5'})

Loading esm features from /hpcfs/fhome/shizhenkun/codebase/RXNRECer/data/featurebank/esm2_l33_650m.feather
Loading unirep features from {cfg.FILE_EMBD_PROTEIN_UNIREP}


In [40]:
# Left-join the ESM embeddings onto both splits by uniprot_id and preview the test frame.
# NOTE(review): in the visible cells data_train / data_test are only assigned inside
# functions, so this cell appears to rely on leftover kernel state — confirm before a fresh run.
train_esm = data_train.merge(f_esm, on='uniprot_id', how='left')
test_esm = data_test.merge(f_esm, on='uniprot_id', how='left')
test_esm.head(3)

Unnamed: 0,uniprot_id,reaction_id,ec_number,esm
0,Q9UYB6,-,-,"[0.13069916, -0.06665996, -0.0012160758, -0.01..."
1,C1AQW9,RHEA:19669,3.6.5.-,"[0.032065388, -0.040186, -0.005980768, -0.0144..."
2,P64647,-,-,"[0.06691212, -0.03209091, 0.007255429, 0.07624..."


In [42]:
test_esm = test_esm.head(10)

In [43]:
# Rank the training proteins for every row of test_esm and store the two hit lists
# in the 'euclidean' and 'cosine' columns; chained-assignment warnings are muted
# for the duration of the assignment.
# NOTE(review): f_esm_tr is only assigned in the "3.1 ESM" cell further down, so this
# cell depends on that cell having been executed first.
with pd.option_context('mode.chained_assignment', None):
    test_esm[['euclidean', 'cosine']] = test_esm.progress_apply(lambda x : get_top_protein_simi(x_feature= x.esm[np.newaxis, :], y_feature=f_esm_tr, y_uniprot_id = train_esm.uniprot_id.to_list(), topk=100),axis=1, result_type="expand") # type: ignore

  0%|          | 0/10 [00:00<?, ?it/s]

In [44]:
test_esm

Unnamed: 0,uniprot_id,reaction_id,ec_number,esm,euclidean,cosine
0,Q9UYB6,-,-,"[0.13069916, -0.06665996, -0.0012160758, -0.01...","[(O58185, 0.473194), (Q8U3J1, 0.865424), (O282...","[(O58185, 0.997938), (Q8U3J1, 0.993183), (O282..."
1,C1AQW9,RHEA:19669,3.6.5.-,"[0.032065388, -0.040186, -0.005980768, -0.0144...","[(A1KL96, 0.0), (A5U598, 0.0), (P9WK96, 0.0), ...","[(P65270, 1.0), (A1KL96, 1.0), (A5U598, 1.0), ..."
2,P64647,-,-,"[0.06691212, -0.03209091, 0.007255429, 0.07624...","[(P64648, 0.0), (P64646, 4e-06), (P64453, 1.04...","[(P64648, 1.0), (P64646, 1.0), (P64453, 0.9928..."
3,Q9MTM3,RHEA:21248,2.7.7.6,"[0.02079815, -0.018187005, -0.05434854, -0.017...","[(B0Z545, 0.038679), (B0Z5C9, 0.053632), (B0Z4...","[(B0Z545, 0.99999), (B0Z5C9, 0.999982), (B0Z4W..."
4,P45894,RHEA:17989;RHEA:46608,2.7.11.1,"[0.01358087, -0.042656515, 0.041366298, 0.0079...","[(Q95ZQ4, 0.71042), (U4PR86, 0.835448), (O7453...","[(Q95ZQ4, 0.995369), (Q09137, 0.993601), (P546..."
5,A9MPW3,RHEA:27473;RHEA:27465,2.7.1.167;2.7.7.70,"[0.04580853, -0.09927941, 0.03287052, -0.01530...","[(Q5PC86, 0.166025), (B5BG11, 0.166025), (B5QZ...","[(Q5PC86, 0.999656), (B5BG11, 0.999656), (B5QZ..."
6,P53567,-,-,"[0.0827024, -0.17414416, -0.025071751, 0.08520...","[(P26801, 0.292173), (Q3T0B9, 0.353782), (Q8R0...","[(P26801, 0.999124), (Q3T0B9, 0.998702), (Q8R0..."
7,P53866,-,-,"[0.047924682, -0.093538456, 0.036287274, 0.057...","[(A6ZRL6, 0.035527), (Q6CQ59, 0.419905), (Q6FP...","[(A6ZRL6, 0.999992), (Q6CQ59, 0.998936), (Q6FP..."
8,Q8YAC5,RHEA:16373,2.7.1.33,"[0.029553216, -0.01902868, 0.07429806, 0.02683...","[(C1KYF4, 0.108294), (Q724J2, 0.108294), (B8DG...","[(C1KYF4, 0.999862), (Q724J2, 0.999862), (B8DG..."
9,Q8Z495,-,-,"[0.022993626, -0.02239413, 0.00034852725, 0.09...","[(P0CL00, 0.139058), (E1WAC5, 0.139059), (P0A1...","[(E1WAC5, 0.999834), (P0CL00, 0.999834), (P0A1..."


In [37]:
f_unirep.head(3)

Unnamed: 0,uniprot_id,unirep
0,Q928A4,"[-0.0015948936343193054, 0.003045504679903388,..."
1,Q6G9U3,"[0.007866262458264828, 0.020977269858121872, 0..."
2,Q7VP02,"[0.0046709743328392506, 0.06482962518930435, 0..."


In [38]:
f_t5.head(3)

Unnamed: 0,uniprot_id,t5
0,Q928A4,"[-0.0237579345703125, -0.0286865234375, -0.011..."
1,Q6G9U3,"[-0.0121917724609375, -0.050048828125, 0.06677..."
2,Q7VP02,"[0.0684814453125, -0.040740966796875, -0.02244..."


#### 2.2 Load Feature ESM

In [5]:
# Load the ESM embedding table, keeping uniprot_id and the rep33 column renamed to 'esm'.
f_esm = pd.read_feather(cfg.FILE_EMBD_PROTEIN_ESM2_L33_650M)[['uniprot_id', 'rep33']].rename(columns={'rep33': 'esm'})


# Disabled variant of the merge that used ds_train / ds_test:
# train_esm = ds_train.merge(f_esm, on='uniprot_id', how='left')
# test_esm = ds_test.merge(f_esm, on='uniprot_id', how='left')
# test_esm.head(3)

In [6]:
# Left-join the ESM embeddings onto the two frames stored at datasets[0] and preview the result.
# NOTE(review): `datasets` is not assigned in any visible cell; presumably element 0 is a
# (train, test) pair — confirm where it comes from.
train_esm = datasets[0][0].merge(f_esm, on='uniprot_id', how='left')
test_esm = datasets[0][1].merge(f_esm, on='uniprot_id', how='left')
test_esm.head(3)

Unnamed: 0,uniprot_id,reaction_id,ec_number,functionCounts,ec_specific_level,isenzyme,label,esm
0,Q9UYB6,-,-,0,0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.13069916, -0.06665996, -0.0012160758, -0.01..."
1,C1AQW9,RHEA:19669,3.6.5.-,1,4,True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.032065388, -0.040186, -0.005980768, -0.0144..."
2,P64647,-,-,0,0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.06691212, -0.03209091, 0.007255429, 0.07624..."


Unnamed: 0,uniprot_id,esm
0,Q928A4,"[0.00022248982, 0.03423467, -0.016741816, 0.02..."
1,Q6G9U3,"[0.08237157, -0.058032326, 0.006943757, 0.0778..."
2,Q7VP02,"[0.02026703, -0.04411123, -0.036267266, 0.0834..."
3,Q46WG6,"[0.055388793, -0.04616225, -0.059219934, 0.128..."
4,Q8BHJ7,"[0.00061394484, -0.24837753, -0.10457196, 0.09..."
...,...,...
565178,Q731I2,"[-0.008347662, -0.06857681, -0.076824434, 0.04..."
565179,P0C798,"[0.060479004, -0.08771032, 0.035592824, -0.017..."
565180,P16609,"[0.040429622, -0.035487723, -0.008318746, -0.0..."
565181,Q49V37,"[0.00639956, -0.04034964, 0.032943677, 0.00732..."


#### 2.3 Load Unirep

In [12]:
# Load the UniRep embedding table and left-join it onto both splits by uniprot_id.
# NOTE(review): ds_train / ds_test are not assigned in any visible cell; the recorded
# output of this cell is a NameError for ds_train.
f_unirep = pd.read_feather(cfg.FILE_EMBD_PROTEIN_UNIREP)[['uniprot_id','unirep']]
train_unirep = ds_train.merge(f_unirep, on='uniprot_id', how='left')
test_unirep = ds_test.merge(f_unirep, on='uniprot_id', how='left')
test_unirep.head(3)

NameError: name 'ds_train' is not defined

#### 2.4 Load T5

In [7]:
# Load the T5 embedding table and left-join it onto both splits by uniprot_id.
# NOTE(review): ds_train / ds_test are not assigned in any visible cell — confirm their origin.
f_t5 = pd.read_feather(cfg.FILE_EMBD_PROTEIN_T5_SEQ)[['uniprot_id', 't5_per_protein']]
train_t5 = ds_train.merge(f_t5, on='uniprot_id', how='left')
test_t5 = ds_test.merge(f_t5, on='uniprot_id', how='left')
test_t5.head(3)

Unnamed: 0,uniprot_id,reaction_groundtruth,isenzyme_groundtruth,t5_per_protein
0,A9JLI2,-,nonenzyme,"[-0.11700439453125, 0.1304931640625, -0.007064..."
1,A9JLI3,-,nonenzyme,"[-0.0577392578125, 0.07659912109375, 0.0052337..."
2,A9JLI5,-,nonenzyme,"[-0.07373046875, 0.11846923828125, -0.00873565..."


### 3. Similarity Calculation

In [17]:
# # 获取最相似的topk个蛋白   
# def get_top_protein_simi(x_feature, y_feature, y_uniprot_id, topk):
#     res_euclidean = np.round(simitool.get_euclidean_distances(fx=x_feature, fy=y_feature)[0], 6)
    
#     res_euclidean_dict = dict(zip(y_uniprot_id, res_euclidean))
#     final_res_euclidean = sorted(res_euclidean_dict.items(), key=lambda item: item[1])[:topk]

#     res_cosine = np.round(simitool.get_cosine_similarity(fx=x_feature, fy=y_feature)[0], 6)
#     res_cosine_dict = dict(zip(y_uniprot_id, res_cosine))
#     final_res_cosine = sorted(res_cosine_dict.items(), key=lambda item: item[1],  reverse=True)[:topk]

#     return final_res_euclidean,final_res_cosine

# Function to get the top k most similar proteins
def get_top_protein_simi(x_feature, y_feature, y_uniprot_id, topk):
    """Return the ``topk`` best-matching reference proteins under two metrics.

    ``simitool.get_euclidean_distances`` and ``simitool.get_cosine_similarity`` are
    run in two worker threads; element ``[0]`` of each result is rounded to
    6 decimals and paired with ``y_uniprot_id``.

    Args:
        x_feature: Query features handed to the simitool functions.
        y_feature: Reference features handed to the simitool functions.
        y_uniprot_id: Identifiers zipped with the scores. Repeated identifiers
            collapse into a single entry (dict semantics: last score wins).
        topk: Number of hits to keep per metric; must be a non-negative integer.

    Returns:
        Tuple ``(euclidean_hits, cosine_hits)``. Each is a list of
        ``(uniprot_id, score)`` tuples; Euclidean hits are ordered by ascending
        score, cosine hits by descending score.
    """
    import heapq  # local import so the function does not depend on another cell

    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_euclidean = executor.submit(simitool.get_euclidean_distances, x_feature, y_feature)
        future_cosine = executor.submit(simitool.get_cosine_similarity, x_feature, y_feature)

        res_euclidean = np.round(future_euclidean.result()[0], 6)
        res_cosine = np.round(future_cosine.result()[0], 6)

    def score_of(item):
        return item[1]

    # heapq.nsmallest / nlargest are documented as equivalent to
    # sorted(..., key=...)[:n] and sorted(..., key=..., reverse=True)[:n],
    # but avoid fully sorting every reference score just to keep topk of them.
    res_euclidean_dict = dict(zip(y_uniprot_id, res_euclidean))
    final_res_euclidean = heapq.nsmallest(topk, res_euclidean_dict.items(), key=score_of)

    res_cosine_dict = dict(zip(y_uniprot_id, res_cosine))
    final_res_cosine = heapq.nlargest(topk, res_cosine_dict.items(), key=score_of)

    return final_res_euclidean, final_res_cosine

#### 3.1 ESM

In [8]:
# Width of one ESM vector, taken from the first training row.
dim_f_esm = len(train_esm.esm.values[0])
# Concatenate all training vectors and reshape into a 2-D array with dim_f_esm columns.
f_esm_tr = np.concatenate(train_esm.esm.values).reshape(-1,dim_f_esm)


In [None]:
# Rank the training proteins (f_esm_tr / train_esm) for every row of test_esm and store
# the two hit lists in the 'euclidean' and 'cosine' columns; chained-assignment
# warnings are muted for the duration of the assignment.
with pd.option_context('mode.chained_assignment', None):
    test_esm[['euclidean', 'cosine']] = test_esm.progress_apply(lambda x : get_top_protein_simi(x_feature= x.esm[np.newaxis, :], y_feature=f_esm_tr, y_uniprot_id = train_esm.uniprot_id.to_list(), topk=100),axis=1, result_type="expand") # type: ignore



# try:
#     with pd.HDFStore(cfg.FILE_RESULTS_SIMI_ESM,'w') as h5:
#         h5['data'] = test_esm
#         h5.close()
# except Exception as e:
#     print(f"Error writing HDF5 file: {e}")
    
# test_esm.head(3)

In [19]:
aa = test_esm.head(10)

In [20]:
# Same ranking as for test_esm, applied to the frame `aa`; chained-assignment warnings
# are muted for the duration of the assignment.
with pd.option_context('mode.chained_assignment', None):
    aa[['euclidean', 'cosine']] = aa.progress_apply(lambda x : get_top_protein_simi(x_feature= x.esm[np.newaxis, :], y_feature=f_esm_tr, y_uniprot_id = train_esm.uniprot_id.to_list(), topk=100),axis=1, result_type="expand") # type: ignore

  0%|          | 0/10 [00:00<?, ?it/s]

In [21]:
aa

Unnamed: 0,uniprot_id,reaction_id,ec_number,functionCounts,ec_specific_level,isenzyme,label,esm,euclidean,cosine
0,Q9UYB6,-,-,0,0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.13069916, -0.06665996, -0.0012160758, -0.01...","[(O58185, 0.473194), (Q8U3J1, 0.865424), (O282...","[(O58185, 0.997938), (Q8U3J1, 0.993183), (O282..."
1,C1AQW9,RHEA:19669,3.6.5.-,1,4,True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.032065388, -0.040186, -0.005980768, -0.0144...","[(A1KL96, 0.0), (A5U598, 0.0), (P9WK96, 0.0), ...","[(P65270, 1.0), (A1KL96, 1.0), (A5U598, 1.0), ..."
2,P64647,-,-,0,0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.06691212, -0.03209091, 0.007255429, 0.07624...","[(P64648, 0.0), (P64646, 4e-06), (P64453, 1.04...","[(P64648, 1.0), (P64646, 1.0), (P64453, 0.9928..."
3,Q9MTM3,RHEA:21248,2.7.7.6,1,4,True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.02079815, -0.018187005, -0.05434854, -0.017...","[(B0Z545, 0.038679), (B0Z5C9, 0.053632), (B0Z4...","[(B0Z545, 0.99999), (B0Z5C9, 0.999982), (B0Z4W..."
4,P45894,RHEA:17989;RHEA:46608,2.7.11.1,2,4,True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.01358087, -0.042656515, 0.041366298, 0.0079...","[(Q95ZQ4, 0.71042), (U4PR86, 0.835448), (O7453...","[(Q95ZQ4, 0.995369), (Q09137, 0.993601), (P546..."
5,A9MPW3,RHEA:27473;RHEA:27465,2.7.1.167;2.7.7.70,2,4,True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.04580853, -0.09927941, 0.03287052, -0.01530...","[(Q5PC86, 0.166025), (B5BG11, 0.166025), (B5QZ...","[(Q5PC86, 0.999656), (B5BG11, 0.999656), (B5QZ..."
6,P53567,-,-,0,0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0827024, -0.17414416, -0.025071751, 0.08520...","[(P26801, 0.292173), (Q3T0B9, 0.353782), (Q8R0...","[(P26801, 0.999124), (Q3T0B9, 0.998702), (Q8R0..."
7,P53866,-,-,0,0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.047924682, -0.093538456, 0.036287274, 0.057...","[(A6ZRL6, 0.035527), (Q6CQ59, 0.419905), (Q6FP...","[(A6ZRL6, 0.999992), (Q6CQ59, 0.998936), (Q6FP..."
8,Q8YAC5,RHEA:16373,2.7.1.33,1,4,True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.029553216, -0.01902868, 0.07429806, 0.02683...","[(C1KYF4, 0.108294), (Q724J2, 0.108294), (B8DG...","[(C1KYF4, 0.999862), (Q724J2, 0.999862), (B8DG..."
9,Q8Z495,-,-,0,0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.022993626, -0.02239413, 0.00034852725, 0.09...","[(P0CL00, 0.139058), (E1WAC5, 0.139059), (P0A1...","[(E1WAC5, 0.999834), (P0CL00, 0.999834), (P0A1..."


In [20]:
test_esm.head(3)

Unnamed: 0,uniprot_id,seq,reaction_id,ec_number,functionCounts,ec_specific_level,isenzyme,label,esm
0,Q9UYB6,MLPDRVLEILNEMKAERIRGATWLARKGAEAFLALAEELDEALLED...,-,-,0,0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.13069916, -0.06665996, -0.0012160758, -0.01..."
1,C1AQW9,MRTPCSQHRRDRPSAIGSQLPDADTLDTRQPPLQEIPISSFADKTF...,RHEA:19669,3.6.5.-,1,4,True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.032065388, -0.040186, -0.005980768, -0.0144..."
2,P64647,MALFSKILIFYVIGVNISFVIIWFISHEKTHIRLLSAFLVGITWPM...,-,-,0,0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.06691212, -0.03209091, 0.007255429, 0.07624..."


#### 3.2 Unirep    

In [None]:
# Width of one UniRep vector, taken from the first training row.
dim_f_unirep = len(train_unirep.unirep.values[0])
# Concatenate all training vectors and reshape into a 2-D array with dim_f_unirep columns.
f_unirep_tr = np.concatenate(train_unirep.unirep.values).reshape(-1,dim_f_unirep)

# Rank the training proteins for every test row (via pandarallel's parallel_apply) and store
# the hit lists in 'simi_euclidean' / 'simi_cosine'.
with pd.option_context('mode.chained_assignment', None):
    test_unirep[['simi_euclidean', 'simi_cosine']] = test_unirep.parallel_apply(lambda x : get_top_protein_simi(x_feature= x.unirep[np.newaxis, :], y_feature=f_unirep_tr, y_uniprot_id = train_unirep.uniprot_id.to_list(), topk=100),axis=1, result_type="expand") # type: ignore

# Write the result table to the HDF5 file named by cfg.FILE_RESULTS_SIMI_UNIREP; any failure
# is only printed, so the cell continues even when nothing was written.
try:
    with pd.HDFStore(cfg.FILE_RESULTS_SIMI_UNIREP,'w') as h5:
        h5['data'] = test_unirep
        h5.close()
except Exception as e:
    print(f"Error writing HDF5 file: {e}")
    
test_unirep.head(3)

#### 3.3 T5

In [39]:
# Width of one T5 vector, taken from the first training row.
dim_f_t5 = len(train_t5.t5_per_protein.values[0])
# Concatenate all training vectors and reshape into a 2-D array with dim_f_t5 columns.
f_t5_tr = np.concatenate(train_t5.t5_per_protein.values).reshape(-1,dim_f_t5)

# Rank the training proteins for every test row (via pandarallel's parallel_apply) and store
# the hit lists in 'simi_euclidean' / 'simi_cosine'.
with pd.option_context('mode.chained_assignment', None):
    test_t5[['simi_euclidean', 'simi_cosine']] = test_t5.parallel_apply(lambda x : get_top_protein_simi(x_feature= x.t5_per_protein[np.newaxis, :], y_feature=f_t5_tr, y_uniprot_id = train_t5.uniprot_id.to_list(), topk=100),axis=1, result_type="expand") # type: ignore


# Write the result table to the HDF5 file named by cfg.FILE_RESULTS_SIMI_T5; any failure
# is only printed, so the cell continues even when nothing was written.
try:
    with pd.HDFStore(cfg.FILE_RESULTS_SIMI_T5,'w') as h5:
        h5['data'] = test_t5
        h5.close()
except Exception as e:
    print(f"Error writing HDF5 file: {e}")
    
    

### 4. Transform to reaction

In [14]:
# Look up the reaction annotated for the most similar protein.
def get_simi_ref(euclidean, cosine, ref_df):
    """Translate the best hit of each metric into a reaction and an enzyme flag.

    Args:
        euclidean: Hit list whose first entry starts with the best uniprot_id
            under the Euclidean metric.
        cosine: Hit list whose first entry starts with the best uniprot_id
            under the cosine metric.
        ref_df: Reference frame with ``uniprot_id`` and ``reaction_groundtruth`` columns.

    Returns:
        Tuple ``(euclidean_reaction, euclidean_isenzyme, cosine_reaction, cosine_isenzyme)``.
        The flag is 'nonenzyme' when the reaction equals '-', otherwise 'enzyme'.

    Raises:
        IndexError: If a best-hit identifier has no row in ``ref_df``.
    """
    def _reaction_and_flag(hits):
        # Only the top-ranked hit is used; the first matching reference row wins.
        best_uid = hits[0][0]
        matches = ref_df[ref_df.uniprot_id == best_uid]
        reaction = matches.reaction_groundtruth.values[0]
        flag = 'nonenzyme' if reaction == '-' else 'enzyme'
        return reaction, flag

    euclidean_reaction, euclidean_isenzyme = _reaction_and_flag(euclidean)
    cosine_reaction, cosine_isenzyme = _reaction_and_flag(cosine)

    return euclidean_reaction, euclidean_isenzyme, cosine_reaction, cosine_isenzyme

#### 4.1 ESM

In [15]:
# Load the stored ESM similarity table.
with pd.HDFStore(cfg.FILE_RESULTS_SIMI_ESM, 'r') as h5:
    res_esm = h5['data']


# For each of the rep0 / rep32 / rep33 hit-list column pairs, use get_simi_ref to turn the
# best hit per metric into a reaction and an enzyme flag looked up in ds_train.
# NOTE(review): ds_train is not assigned in any visible cell, and the *_rep0/_rep32/_rep33
# input columns are not produced by any visible cell — confirm against the stored file.
res_esm[['euclidean_reaction_rep0', 
          'euclidean_isenzyme_rep0', 
          'cosine_reaction_rep0', 
          'cosine_isenzyme_rep0']] =res_esm.parallel_apply(lambda x: get_simi_ref(euclidean=x.euclidean_rep0, cosine=x.cosine_rep0, ref_df=ds_train) ,axis=1, result_type="expand") # type: ignore

res_esm[['euclidean_reaction_rep32', 
          'euclidean_isenzyme_rep32', 
          'cosine_reaction_rep32', 
          'cosine_isenzyme_rep32']] =res_esm.parallel_apply(lambda x: get_simi_ref(euclidean=x.euclidean_rep32, cosine=x.cosine_rep32, ref_df=ds_train) ,axis=1, result_type="expand") # type: ignore

res_esm[['euclidean_reaction_rep33', 
          'euclidean_isenzyme_rep33', 
          'cosine_reaction_rep33', 
          'cosine_isenzyme_rep33']] =res_esm.parallel_apply(lambda x: get_simi_ref(euclidean=x.euclidean_rep33, cosine=x.cosine_rep33, ref_df=ds_train) ,axis=1, result_type="expand") # type: ignore


# Keep only the identifier, the ground-truth columns and the derived predictions.
res_esm = res_esm[['uniprot_id', 'reaction_groundtruth', 'isenzyme_groundtruth',
         'euclidean_reaction_rep0', 
          'euclidean_isenzyme_rep0', 
          'cosine_reaction_rep0', 
          'cosine_isenzyme_rep0',
          'euclidean_reaction_rep32', 
          'euclidean_isenzyme_rep32', 
          'cosine_reaction_rep32', 
          'cosine_isenzyme_rep32',
          'euclidean_reaction_rep33', 
          'euclidean_isenzyme_rep33', 
          'cosine_reaction_rep33', 
          'cosine_isenzyme_rep33'
         ]]

# Write the reduced table; a failure is only printed, so the cell continues either way.
try:
    with pd.HDFStore(cfg.FILE_RESULTS_SIMI_ESM_REACTION,'w') as h5:
        h5['data'] = res_esm
        h5.close()
        print(f'Write file to: {cfg.FILE_RESULTS_SIMI_ESM_REACTION}')
except Exception as e:
    print(f"Error writing HDF5 file: {e}")
    
res_esm.head(3)

Write file to: /hpcfs/fhome/shizhenkun/codebase/preaction/results240614/simi/exp_test_esm_reaction.h5


Unnamed: 0,uniprot_id,reaction_groundtruth,isenzyme_groundtruth,euclidean_reaction_rep0,euclidean_isenzyme_rep0,cosine_reaction_rep0,cosine_isenzyme_rep0,euclidean_reaction_rep32,euclidean_isenzyme_rep32,cosine_reaction_rep32,cosine_isenzyme_rep32,euclidean_reaction_rep33,euclidean_isenzyme_rep33,cosine_reaction_rep33,cosine_isenzyme_rep33
0,A9JLI2,-,nonenzyme,-,nonenzyme,-,nonenzyme,-,nonenzyme,-,nonenzyme,-,nonenzyme,-,nonenzyme
1,A9JLI3,-,nonenzyme,-,nonenzyme,-,nonenzyme,-,nonenzyme,-,nonenzyme,-,nonenzyme,-,nonenzyme
2,A9JLI5,-,nonenzyme,-,nonenzyme,-,nonenzyme,-,nonenzyme,-,nonenzyme,-,nonenzyme,-,nonenzyme


#### 4.2 Unirep

In [12]:
# Load the stored UniRep similarity table.
with pd.HDFStore(cfg.FILE_RESULTS_SIMI_UNIREP, 'r') as h5:
    res_unirep = h5['data']
    
# Use get_simi_ref to turn the best hit per metric into a reaction and an enzyme flag
# looked up in ds_train.
# NOTE(review): ds_train is not assigned in any visible cell — confirm its origin.
res_unirep[['euclidean_reaction', 
          'euclidean_isenzyme', 
          'cosine_reaction', 
          'cosine_isenzyme']] =res_unirep.parallel_apply(lambda x: get_simi_ref(euclidean=x.simi_euclidean, cosine=x.simi_cosine, ref_df=ds_train) ,axis=1, result_type="expand") # type: ignore

# Keep only the identifier, the ground-truth columns and the derived predictions.
res_unirep = res_unirep[['uniprot_id', 'reaction_groundtruth', 'isenzyme_groundtruth', 'euclidean_reaction', 'euclidean_isenzyme', 'cosine_reaction', 'cosine_isenzyme']]

# Write the reduced table; a failure is only printed, so the cell continues either way.
try:
    with pd.HDFStore(cfg.FILE_RESULTS_SIMI_UNIREP_REACTION,'w') as h5:
        h5['data'] = res_unirep
        h5.close()
        print(f'Write file to: {cfg.FILE_RESULTS_SIMI_UNIREP_REACTION}')
except Exception as e:
    print(f"Error writing HDF5 file: {e}")
        
res_unirep.head(3)

Write file to: /hpcfs/fhome/shizhenkun/codebase/preaction/results240614/simi/exp_test_unirep_reaction.h5


Unnamed: 0,uniprot_id,reaction_groundtruth,isenzyme_groundtruth,euclidean_reaction,euclidean_isenzyme,cosine_reaction,cosine_isenzyme
0,A9JLI2,-,nonenzyme,-,nonenzyme,-,nonenzyme
1,A9JLI3,-,nonenzyme,-,nonenzyme,-,nonenzyme
2,A9JLI5,-,nonenzyme,-,nonenzyme,-,nonenzyme


#### 4.3 T5

In [15]:
# Load the stored T5 similarity table.
with pd.HDFStore(cfg.FILE_RESULTS_SIMI_T5, 'r') as h5:
    res_t5 = h5['data']
        
# Use get_simi_ref to turn the best hit per metric into a reaction and an enzyme flag
# looked up in ds_train.
# NOTE(review): ds_train is not assigned in any visible cell — confirm its origin.
res_t5[['euclidean_reaction', 
          'euclidean_isenzyme', 
          'cosine_reaction', 
          'cosine_isenzyme']] =res_t5.parallel_apply(lambda x: get_simi_ref(euclidean=x.simi_euclidean, cosine=x.simi_cosine, ref_df=ds_train) ,axis=1, result_type="expand") # type: ignore

# Keep only the identifier, the ground-truth columns and the derived predictions.
res_t5 = res_t5[['uniprot_id', 'reaction_groundtruth', 'isenzyme_groundtruth', 'euclidean_reaction', 'euclidean_isenzyme', 'cosine_reaction', 'cosine_isenzyme']]
# Write the reduced table; a failure is only printed, so the cell continues either way.
try:
    with pd.HDFStore(cfg.FILE_RESULTS_SIMI_T5_REACTION,'w') as h5:
        h5['data'] = res_t5
        h5.close()
        print(f'Write file to: {cfg.FILE_RESULTS_SIMI_T5_REACTION}')
except Exception as e:
    print(f"Error writing HDF5 file: {e}")
        



res_t5.head(3)

Write file to: /hpcfs/fhome/shizhenkun/codebase/preaction/results240614/simi/exp_test_t5_reaction.h5


Unnamed: 0,uniprot_id,reaction_groundtruth,isenzyme_groundtruth,euclidean_reaction,euclidean_isenzyme,cosine_reaction,cosine_isenzyme
0,A9JLI2,-,nonenzyme,-,nonenzyme,-,nonenzyme
1,A9JLI3,-,nonenzyme,-,nonenzyme,-,nonenzyme
2,A9JLI5,-,nonenzyme,-,nonenzyme,-,nonenzyme


In [14]:
res_t5.head(3)

Unnamed: 0,uniprot_id,reaction_groundtruth,isenzyme_groundtruth,euclidean_reaction,euclidean_isenzyme,cosine_reaction,cosine_isenzyme
0,A9JLI2,-,nonenzyme,-,nonenzyme,-,nonenzyme
1,A9JLI3,-,nonenzyme,-,nonenzyme,-,nonenzyme
2,A9JLI5,-,nonenzyme,-,nonenzyme,-,nonenzyme
