## Method: SimiProtein
> 2024-11-05

### 1. 导入必要的包

In [1]:
import numpy as np
import pandas as pd
import sys,os
# '__file__' is a quoted string here (not the module attribute), so realpath()
# resolves it against the current working directory and dirname() yields that
# directory. It is put first on the import path, followed by the relative '../'.
sys.path.insert(0, os.path.dirname(os.path.realpath('__file__')))
sys.path.insert(1,'../')
from config import conf as cfg
# NOTE(review): _flatten and tqdm are not referenced in the cells visible here — confirm they are still needed.
from tkinter import _flatten 
from tqdm.notebook import tqdm

import modules.simi_caculator as simitool
from pandarallel import pandarallel # import pandarallel
# Initialise pandarallel (progress bar off) before the .parallel_apply calls used in later cells.
pandarallel.initialize(progress_bar=False)
# pandarallel.initialize(progress_bar=True)

# Restrict CUDA-aware libraries in this process to GPU devices 0 and 1.
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 128 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


### 2. Load data and features
#### 2.1 Load datasets

In [2]:
# Read both splits, keeping only the protein ID and the reaction column;
# the reaction column is renamed to mark it as ground truth.
ds_train = (
    pd.read_feather(cfg.FILE_DS_TRAIN)[['uniprot_id', 'reaction_id']]  # type: ignore
    .rename(columns={'reaction_id': 'reaction_groundtruth'})
)
ds_test = (
    pd.read_feather(cfg.FILE_DS_TEST)[['uniprot_id', 'reaction_id']]  # type: ignore
    .rename(columns={'reaction_id': 'reaction_groundtruth'})
)

# Add the enzyme / non-enzyme ground-truth label: a reaction value whose
# ';'-join equals '-' is labelled 'nonenzyme', anything else 'enzyme'.
ds_train['isenzyme_groundtruth'] = ds_train.reaction_groundtruth.parallel_apply(
    lambda x: 'nonenzyme' if ';'.join(x) == '-' else 'enzyme'
)
ds_test['isenzyme_groundtruth'] = ds_test.reaction_groundtruth.parallel_apply(
    lambda x: 'nonenzyme' if ';'.join(x) == '-' else 'enzyme'
)

ds_train.head(2)

Unnamed: 0,uniprot_id,reaction_groundtruth,isenzyme_groundtruth
0,Q6GZX4,-,nonenzyme
1,Q6GZX3,-,nonenzyme


#### 2.2 Load Feature ESM

In [3]:
# ESM embeddings: keep the protein ID plus the three stored representation columns.
f_esm = pd.read_feather(cfg.FILE_EMBD_PROTEIN_ESM2_L33_650M)[['uniprot_id', 'rep0', 'rep32', 'rep33']]

# Left joins keep every dataset row, even when no embedding row matches its ID.
train_esm = pd.merge(ds_train, f_esm, how='left', on='uniprot_id')
test_esm = pd.merge(ds_test, f_esm, how='left', on='uniprot_id')

test_esm.head(3)

Unnamed: 0,uniprot_id,reaction_groundtruth,isenzyme_groundtruth,rep0,rep32,rep33
0,A9JLI2,-,nonenzyme,"[-0.016734194, 0.0030938622, -0.032814544, -0....","[0.2910796, 1.1923026, -2.5471961, -0.88039047...","[0.027029077, 0.05122046, -0.039942063, -0.040..."
1,A9JLI3,-,nonenzyme,"[-0.0035194962, -0.003754213, -0.024650995, -0...","[4.1588683, -0.9986028, 3.379115, 1.4899616, 0...","[0.10094367, 0.013692737, 0.03674977, 0.025248..."
2,A9JLI5,-,nonenzyme,"[-0.010903073, 0.0012608002, -0.027889619, -0....","[4.230414, -0.97044975, 1.8296818, 2.16777, 0....","[0.05490144, 0.007199925, 0.017326621, 0.00402..."


In [8]:
# Sanity check: shape of the first test protein's rep0 embedding.
test_esm.rep0.values[0].shape

(1280,)

#### 2.3 Load Unirep

In [9]:
# UniRep embeddings: keep the protein ID and the embedding column only.
f_unirep = pd.read_feather(cfg.FILE_EMBD_PROTEIN_UNIREP)[['uniprot_id', 'unirep']]

# Left joins keep every dataset row, even when no embedding row matches its ID.
train_unirep = pd.merge(ds_train, f_unirep, how='left', on='uniprot_id')
test_unirep = pd.merge(ds_test, f_unirep, how='left', on='uniprot_id')

test_unirep.head(3)

Unnamed: 0,uniprot_id,reaction_groundtruth,isenzyme_groundtruth,unirep
0,A9JLI2,-,nonenzyme,"[0.00449784379452467, -0.14125452935695648, 0...."
1,A9JLI3,-,nonenzyme,"[0.008313218131661415, -0.21734574437141418, 0..."
2,A9JLI5,-,nonenzyme,"[0.006142920348793268, -0.12105941027402878, 0..."


In [12]:
# Sanity check: shape of the first UniRep embedding in the feature table.
f_unirep.unirep[0].shape

(1900,)

#### 2.4 Load T5

In [13]:
# T5 embeddings: keep the protein ID and the per-protein embedding column only.
f_t5 = pd.read_feather(cfg.FILE_EMBD_PROTEIN_T5_SEQ)[['uniprot_id', 't5_per_protein']]

# Left joins keep every dataset row, even when no embedding row matches its ID.
train_t5 = pd.merge(ds_train, f_t5, how='left', on='uniprot_id')
test_t5 = pd.merge(ds_test, f_t5, how='left', on='uniprot_id')

test_t5.head(3)

Unnamed: 0,uniprot_id,reaction_groundtruth,isenzyme_groundtruth,t5_per_protein
0,A9JLI2,-,nonenzyme,"[-0.11700439453125, 0.1304931640625, -0.007064..."
1,A9JLI3,-,nonenzyme,"[-0.0577392578125, 0.07659912109375, 0.0052337..."
2,A9JLI5,-,nonenzyme,"[-0.07373046875, 0.11846923828125, -0.00873565..."


In [17]:
# Sanity check: shape of the first T5 per-protein embedding in the feature table.
f_t5.t5_per_protein[0].shape

(1024,)

### 3. Similarity Calculation

In [5]:
# Retrieve the top-k most similar reference proteins
def get_top_protein_simi(x_feature, y_feature, y_uniprot_id, topk):
    """Rank reference proteins against a query embedding under two metrics.

    Args:
        x_feature: Query features, passed as ``fx`` to the ``simitool`` helpers.
            Only row 0 of each helper's result is used.
        y_feature: Reference features, passed as ``fy`` to the ``simitool`` helpers.
        y_uniprot_id: Reference protein IDs, paired positionally with the
            scores in row 0 of each helper's result.
        topk: Maximum number of (id, score) pairs to keep per metric.

    Returns:
        A 2-tuple ``(euclidean_top, cosine_top)``. Each element is a list of
        ``(uniprot_id, score)`` pairs with scores rounded to 6 decimals:
        Euclidean distances in ascending order, cosine similarities in
        descending order.

    Note:
        Scores are collected in a dict keyed by ID, so if an ID occurs more
        than once only its last score is kept.
    """
    def _rank(score_matrix, descending):
        # Pair each reference ID with its rounded score from row 0.
        score_by_id = dict(zip(y_uniprot_id, np.round(score_matrix[0], 6)))
        ordered = sorted(score_by_id.items(), key=lambda pair: pair[1], reverse=descending)
        return ordered[:topk]

    # Smaller distance means more similar, so keep ascending order here.
    nearest_by_euclidean = _rank(
        simitool.get_euclidean_distances(fx=x_feature, fy=y_feature), descending=False
    )
    # Larger cosine means more similar, so rank from high to low.
    nearest_by_cosine = _rank(
        simitool.get_cosine_similarity(fx=x_feature, fy=y_feature), descending=True
    )

    return nearest_by_euclidean, nearest_by_cosine

#### 3.1 ESM

In [7]:
# Embedding width, taken from the first training protein's rep0 vector.
dim_f_esm = len(train_esm.rep0.values[0])
# Concatenate all rep0 vectors and reshape to rows of width dim_f_esm.
# NOTE(review): both statements are repeated verbatim at the top of the following cell, so this cell appears redundant.
f_esm_tr_0 = np.concatenate(train_esm.rep0.values).reshape(-1,dim_f_esm)

In [11]:
# Embedding width, taken from the first training protein's rep0 vector.
dim_f_esm = len(train_esm.rep0.values[0])
# Concatenate each representation column and reshape to rows of width dim_f_esm.
f_esm_tr_0 = np.concatenate(train_esm.rep0.values).reshape(-1,dim_f_esm)
f_esm_tr_32 = np.concatenate(train_esm.rep32.values).reshape(-1,dim_f_esm)
f_esm_tr_33 = np.concatenate(train_esm.rep33.values).reshape(-1,dim_f_esm)

# Build the reference ID list once; previously it was rebuilt inside the lambda for every test row.
train_esm_ids = train_esm.uniprot_id.to_list()

# For each representation, store the top-100 (id, score) lists under both metrics per test protein.
with pd.option_context('mode.chained_assignment', None):
    test_esm[['euclidean_rep0', 'cosine_rep0']] = test_esm.parallel_apply(lambda x : get_top_protein_simi(x_feature= x.rep0[np.newaxis, :], y_feature=f_esm_tr_0, y_uniprot_id = train_esm_ids, topk=100),axis=1, result_type="expand") # type: ignore
    test_esm[['euclidean_rep32', 'cosine_rep32']] = test_esm.parallel_apply(lambda x : get_top_protein_simi(x_feature= x.rep32[np.newaxis, :], y_feature=f_esm_tr_32, y_uniprot_id = train_esm_ids, topk=100),axis=1, result_type="expand") # type: ignore
    test_esm[['euclidean_rep33', 'cosine_rep33']] = test_esm.parallel_apply(lambda x : get_top_protein_simi(x_feature= x.rep33[np.newaxis, :], y_feature=f_esm_tr_33, y_uniprot_id = train_esm_ids, topk=100),axis=1, result_type="expand") # type: ignore


# Persist the results. The `with` block closes the store on exit, so no explicit close() is needed.
# NOTE(review): a failed write is only printed; section 4 reads this file back, so a failure
# here would leave it working from a stale or missing file.
try:
    with pd.HDFStore(cfg.FILE_RESULTS_SIMI_ESM,'w') as h5:
        h5['data'] = test_esm
except Exception as e:
    print(f"Error writing HDF5 file: {e}")

test_esm.head(3)

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['uniprot_id', 'reaction_groundtruth', 'isenzyme_groundtruth', 'rep0',
       'rep32', 'rep33', 'euclidean_rep0', 'cosine_rep0', 'euclidean_rep32',
       'cosine_rep32', 'euclidean_rep33', 'cosine_rep33'],
      dtype='object')]

  h5['data'] = test_esm


Unnamed: 0,uniprot_id,reaction_groundtruth,isenzyme_groundtruth,rep0,rep32,rep33,euclidean_rep0,cosine_rep0,euclidean_rep32,cosine_rep32,euclidean_rep33,cosine_rep33
0,A9JLI2,-,nonenzyme,"[-0.016734194, 0.0030938622, -0.032814544, -0....","[0.2910796, 1.1923026, -2.5471961, -0.88039047...","[0.027029077, 0.05122046, -0.039942063, -0.040...","[(P18560, 0.031595), (P0C9G3, 0.091084), (P0C9...","[(P18560, 0.999272), (P0C9G3, 0.993932), (P0C9...","[(P18560, 8.096243), (P0C9G3, 24.56245), (P0C9...","[(P18560, 0.99988), (P0C9G3, 0.998867), (P0C9G...","[(P18560, 0.133943), (P0C9G3, 0.440141), (P0C9...","[(P18560, 0.999891), (P0C9G3, 0.998819), (P0C9..."
1,A9JLI3,-,nonenzyme,"[-0.0035194962, -0.003754213, -0.024650995, -0...","[4.1588683, -0.9986028, 3.379115, 1.4899616, 0...","[0.10094367, 0.013692737, 0.03674977, 0.025248...","[(P18559, 0.049364), (P0C9G5, 0.122323), (Q9I8...","[(P18559, 0.998138), (P0C9G5, 0.988737), (Q9I8...","[(P18559, 16.431988), (P26708, 66.79439), (P0C...","[(P18559, 0.999564), (P26708, 0.992849), (P0C9...","[(P18559, 0.247578), (P26708, 0.982841), (P0C9...","[(P18559, 0.999649), (P26708, 0.994392), (P0C9..."
2,A9JLI5,-,nonenzyme,"[-0.010903073, 0.0012608002, -0.027889619, -0....","[4.230414, -0.97044975, 1.8296818, 2.16777, 0....","[0.05490144, 0.007199925, 0.017326621, 0.00402...","[(P26707, 0.0), (P18558, 0.068385), (P0C9H1, 0...","[(P26707, 1.0), (P18558, 0.996487), (P0C9H1, 0...","[(P26707, 0.000198), (P18558, 17.919054), (P0C...","[(P26707, 1.0), (P18558, 0.999485), (P0C9H3, 0...","[(P26707, 3e-06), (P18558, 0.272072), (P0C9H3,...","[(P26707, 0.999999), (P18558, 0.999584), (P0C9..."


#### 3.2 Unirep    

In [None]:
# Embedding width, taken from the first training protein's UniRep vector.
dim_f_unirep = len(train_unirep.unirep.values[0])
# Concatenate all UniRep vectors and reshape to rows of width dim_f_unirep.
f_unirep_tr = np.concatenate(train_unirep.unirep.values).reshape(-1,dim_f_unirep)

# Build the reference ID list once; previously it was rebuilt inside the lambda for every test row.
train_unirep_ids = train_unirep.uniprot_id.to_list()

# Store the top-100 (id, score) lists under both metrics for each test protein.
with pd.option_context('mode.chained_assignment', None):
    test_unirep[['simi_euclidean', 'simi_cosine']] = test_unirep.parallel_apply(lambda x : get_top_protein_simi(x_feature= x.unirep[np.newaxis, :], y_feature=f_unirep_tr, y_uniprot_id = train_unirep_ids, topk=100),axis=1, result_type="expand") # type: ignore

# Persist the results. The `with` block closes the store on exit, so no explicit close() is needed.
# NOTE(review): a failed write is only printed; section 4 reads this file back, so a failure
# here would leave it working from a stale or missing file.
try:
    with pd.HDFStore(cfg.FILE_RESULTS_SIMI_UNIREP,'w') as h5:
        h5['data'] = test_unirep
except Exception as e:
    print(f"Error writing HDF5 file: {e}")

test_unirep.head(3)

#### 3.3 T5

In [39]:
# Embedding width, taken from the first training protein's T5 vector.
dim_f_t5 = len(train_t5.t5_per_protein.values[0])
# Concatenate all T5 vectors and reshape to rows of width dim_f_t5.
f_t5_tr = np.concatenate(train_t5.t5_per_protein.values).reshape(-1,dim_f_t5)

# Build the reference ID list once; previously it was rebuilt inside the lambda for every test row.
train_t5_ids = train_t5.uniprot_id.to_list()

# Store the top-100 (id, score) lists under both metrics for each test protein.
with pd.option_context('mode.chained_assignment', None):
    test_t5[['simi_euclidean', 'simi_cosine']] = test_t5.parallel_apply(lambda x : get_top_protein_simi(x_feature= x.t5_per_protein[np.newaxis, :], y_feature=f_t5_tr, y_uniprot_id = train_t5_ids, topk=100),axis=1, result_type="expand") # type: ignore


# Persist the results. The `with` block closes the store on exit, so no explicit close() is needed.
# NOTE(review): a failed write is only printed; section 4 reads this file back, so a failure
# here would leave it working from a stale or missing file.
try:
    with pd.HDFStore(cfg.FILE_RESULTS_SIMI_T5,'w') as h5:
        h5['data'] = test_t5
except Exception as e:
    print(f"Error writing HDF5 file: {e}")
    
    

### 4. Transform to reaction

In [14]:
# Look up the reaction annotated on the most similar reference protein
def get_simi_ref(euclidean, cosine, ref_df):
    """Transfer the reaction annotation of the top-ranked reference protein.

    Args:
        euclidean: Ranked sequence whose first element is a pair starting with
            the ID of the closest protein under Euclidean distance.
        cosine: Ranked sequence whose first element is a pair starting with
            the ID of the closest protein under cosine similarity.
        ref_df: DataFrame with ``uniprot_id`` and ``reaction_groundtruth`` columns.

    Returns:
        ``(euclidean_reaction, euclidean_isenzyme, cosine_reaction, cosine_isenzyme)``.
        Each ``*_isenzyme`` value is ``'nonenzyme'`` when the matching reaction
        equals ``'-'`` and ``'enzyme'`` otherwise.

    Raises:
        IndexError: If a ranked sequence is empty or its top ID has no row in ``ref_df``.
    """
    def _annotation_of_top_hit(ranked):
        top_uid = ranked[0][0]
        matches = ref_df[ref_df.uniprot_id == top_uid]
        # If several rows share the ID, the first one wins.
        reaction = matches.reaction_groundtruth.values[0]
        label = 'nonenzyme' if reaction == '-' else 'enzyme'
        return reaction, label

    euclidean_reaction, euclidean_isenzyme = _annotation_of_top_hit(euclidean)
    cosine_reaction, cosine_isenzyme = _annotation_of_top_hit(cosine)

    return euclidean_reaction, euclidean_isenzyme, cosine_reaction, cosine_isenzyme

#### 4.1 ESM

In [15]:
# Reload the ESM top-k similarity table from the HDF5 store.
with pd.HDFStore(cfg.FILE_RESULTS_SIMI_ESM, 'r') as h5:
    res_esm = h5['data']


# For each representation, take the top-ranked training protein under each metric
# and copy its reaction and enzyme / non-enzyme label from ds_train.
res_esm[['euclidean_reaction_rep0', 
          'euclidean_isenzyme_rep0', 
          'cosine_reaction_rep0', 
          'cosine_isenzyme_rep0']] =res_esm.parallel_apply(lambda x: get_simi_ref(euclidean=x.euclidean_rep0, cosine=x.cosine_rep0, ref_df=ds_train) ,axis=1, result_type="expand") # type: ignore

res_esm[['euclidean_reaction_rep32', 
          'euclidean_isenzyme_rep32', 
          'cosine_reaction_rep32', 
          'cosine_isenzyme_rep32']] =res_esm.parallel_apply(lambda x: get_simi_ref(euclidean=x.euclidean_rep32, cosine=x.cosine_rep32, ref_df=ds_train) ,axis=1, result_type="expand") # type: ignore

res_esm[['euclidean_reaction_rep33', 
          'euclidean_isenzyme_rep33', 
          'cosine_reaction_rep33', 
          'cosine_isenzyme_rep33']] =res_esm.parallel_apply(lambda x: get_simi_ref(euclidean=x.euclidean_rep33, cosine=x.cosine_rep33, ref_df=ds_train) ,axis=1, result_type="expand") # type: ignore


# Keep only the identifiers, ground truth and predictions; the embeddings and top-k lists are dropped.
res_esm = res_esm[['uniprot_id', 'reaction_groundtruth', 'isenzyme_groundtruth',
         'euclidean_reaction_rep0', 
          'euclidean_isenzyme_rep0', 
          'cosine_reaction_rep0', 
          'cosine_isenzyme_rep0',
          'euclidean_reaction_rep32', 
          'euclidean_isenzyme_rep32', 
          'cosine_reaction_rep32', 
          'cosine_isenzyme_rep32',
          'euclidean_reaction_rep33', 
          'euclidean_isenzyme_rep33', 
          'cosine_reaction_rep33', 
          'cosine_isenzyme_rep33'
         ]]

# The `with` block closes the store on exit, so no explicit close() is needed;
# the success message is printed only after the store has been closed.
try:
    with pd.HDFStore(cfg.FILE_RESULTS_SIMI_ESM_REACTION,'w') as h5:
        h5['data'] = res_esm
    print(f'Write file to: {cfg.FILE_RESULTS_SIMI_ESM_REACTION}')
except Exception as e:
    print(f"Error writing HDF5 file: {e}")

res_esm.head(3)

Write file to: /hpcfs/fhome/shizhenkun/codebase/preaction/results240614/simi/exp_test_esm_reaction.h5


Unnamed: 0,uniprot_id,reaction_groundtruth,isenzyme_groundtruth,euclidean_reaction_rep0,euclidean_isenzyme_rep0,cosine_reaction_rep0,cosine_isenzyme_rep0,euclidean_reaction_rep32,euclidean_isenzyme_rep32,cosine_reaction_rep32,cosine_isenzyme_rep32,euclidean_reaction_rep33,euclidean_isenzyme_rep33,cosine_reaction_rep33,cosine_isenzyme_rep33
0,A9JLI2,-,nonenzyme,-,nonenzyme,-,nonenzyme,-,nonenzyme,-,nonenzyme,-,nonenzyme,-,nonenzyme
1,A9JLI3,-,nonenzyme,-,nonenzyme,-,nonenzyme,-,nonenzyme,-,nonenzyme,-,nonenzyme,-,nonenzyme
2,A9JLI5,-,nonenzyme,-,nonenzyme,-,nonenzyme,-,nonenzyme,-,nonenzyme,-,nonenzyme,-,nonenzyme


#### 4.2 Unirep

In [12]:
# Reload the UniRep top-k similarity table from the HDF5 store.
with pd.HDFStore(cfg.FILE_RESULTS_SIMI_UNIREP, 'r') as h5:
    res_unirep = h5['data']

# Take the top-ranked training protein under each metric and copy its reaction
# and enzyme / non-enzyme label from ds_train.
res_unirep[['euclidean_reaction', 
          'euclidean_isenzyme', 
          'cosine_reaction', 
          'cosine_isenzyme']] =res_unirep.parallel_apply(lambda x: get_simi_ref(euclidean=x.simi_euclidean, cosine=x.simi_cosine, ref_df=ds_train) ,axis=1, result_type="expand") # type: ignore

# Keep only the identifiers, ground truth and predictions.
res_unirep = res_unirep[['uniprot_id', 'reaction_groundtruth', 'isenzyme_groundtruth', 'euclidean_reaction', 'euclidean_isenzyme', 'cosine_reaction', 'cosine_isenzyme']]

# The `with` block closes the store on exit, so no explicit close() is needed;
# the success message is printed only after the store has been closed.
try:
    with pd.HDFStore(cfg.FILE_RESULTS_SIMI_UNIREP_REACTION,'w') as h5:
        h5['data'] = res_unirep
    print(f'Write file to: {cfg.FILE_RESULTS_SIMI_UNIREP_REACTION}')
except Exception as e:
    print(f"Error writing HDF5 file: {e}")

res_unirep.head(3)

Write file to: /hpcfs/fhome/shizhenkun/codebase/preaction/results240614/simi/exp_test_unirep_reaction.h5


Unnamed: 0,uniprot_id,reaction_groundtruth,isenzyme_groundtruth,euclidean_reaction,euclidean_isenzyme,cosine_reaction,cosine_isenzyme
0,A9JLI2,-,nonenzyme,-,nonenzyme,-,nonenzyme
1,A9JLI3,-,nonenzyme,-,nonenzyme,-,nonenzyme
2,A9JLI5,-,nonenzyme,-,nonenzyme,-,nonenzyme


#### 4.3 T5

In [15]:
# Reload the T5 top-k similarity table from the HDF5 store.
with pd.HDFStore(cfg.FILE_RESULTS_SIMI_T5, 'r') as h5:
    res_t5 = h5['data']

# Take the top-ranked training protein under each metric and copy its reaction
# and enzyme / non-enzyme label from ds_train.
res_t5[['euclidean_reaction', 
          'euclidean_isenzyme', 
          'cosine_reaction', 
          'cosine_isenzyme']] =res_t5.parallel_apply(lambda x: get_simi_ref(euclidean=x.simi_euclidean, cosine=x.simi_cosine, ref_df=ds_train) ,axis=1, result_type="expand") # type: ignore

# Keep only the identifiers, ground truth and predictions.
res_t5 = res_t5[['uniprot_id', 'reaction_groundtruth', 'isenzyme_groundtruth', 'euclidean_reaction', 'euclidean_isenzyme', 'cosine_reaction', 'cosine_isenzyme']]

# The `with` block closes the store on exit, so no explicit close() is needed;
# the success message is printed only after the store has been closed.
try:
    with pd.HDFStore(cfg.FILE_RESULTS_SIMI_T5_REACTION,'w') as h5:
        h5['data'] = res_t5
    print(f'Write file to: {cfg.FILE_RESULTS_SIMI_T5_REACTION}')
except Exception as e:
    print(f"Error writing HDF5 file: {e}")

res_t5.head(3)

Write file to: /hpcfs/fhome/shizhenkun/codebase/preaction/results240614/simi/exp_test_t5_reaction.h5


Unnamed: 0,uniprot_id,reaction_groundtruth,isenzyme_groundtruth,euclidean_reaction,euclidean_isenzyme,cosine_reaction,cosine_isenzyme
0,A9JLI2,-,nonenzyme,-,nonenzyme,-,nonenzyme
1,A9JLI3,-,nonenzyme,-,nonenzyme,-,nonenzyme
2,A9JLI5,-,nonenzyme,-,nonenzyme,-,nonenzyme


In [14]:
# Preview the first three rows of the T5 reaction table.
res_t5.head(3)

Unnamed: 0,uniprot_id,reaction_groundtruth,isenzyme_groundtruth,euclidean_reaction,euclidean_isenzyme,cosine_reaction,cosine_isenzyme
0,A9JLI2,-,nonenzyme,-,nonenzyme,-,nonenzyme
1,A9JLI3,-,nonenzyme,-,nonenzyme,-,nonenzyme
2,A9JLI5,-,nonenzyme,-,nonenzyme,-,nonenzyme
