## Methos: SimiProtein
> 2024-11-05

### 1. 导入必要的包

In [2]:
import pandas as pd
import sys,os
sys.path.insert(0, os.path.dirname(os.path.realpath('__file__')))
sys.path.insert(1,'../')
from config import conf as cfg
from tqdm import tqdm
from tools import btools
import rxnrecer as production
from pandarallel import pandarallel # 导入pandaralle
pandarallel.initialize(progress_bar=False)


%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 128 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


### 2. Excucte simi script


In [2]:
# for slurm job submission
# ! bash s1_simi_protein_10_fold.sh

# for python run
# ! python simi_protein_10_fold.py

### 3. Load test data

In [2]:
def load_10folds_test_data():
    test_path = [f'{cfg.DIR_DATASET}validation/fold{fold_num+1}/valid.feather' for fold_num in range(10)]
    res = [pd.read_feather(path)[['uniprot_id','seq','reaction_id']].rename(columns={'reaction_id': 'rxn_groundtruth'}) for path in test_path]
    return res


print('Loading uniprot_rxn_dict ...' )
d1 = pd.read_feather(cfg.FILE_DS_TRAIN)
d2 = pd.read_feather(cfg.FILE_DS_TEST)
uniprot_rxn_dict = pd.concat([d1,d2], axis=0).reset_index(drop=True)[['uniprot_id', 'reaction_id']].set_index('uniprot_id')['reaction_id'].to_dict()

print('Loading 10-folds test data ...' )
data_test = load_10folds_test_data()
data_test[0].head(3)

Loading uniprot_rxn_dict ...
Loading 10-folds test data ...


Unnamed: 0,uniprot_id,seq,rxn_groundtruth
105768,Q9UYB6,MLPDRVLEILNEMKAERIRGATWLARKGAEAFLALAEELDEALLED...,-
195319,C1AQW9,MRTPCSQHRRDRPSAIGSQLPDADTLDTRQPPLQEIPISSFADKTF...,RHEA:19669
135884,P64647,MALFSKILIFYVIGVNISFVIIWFISHEKTHIRLLSAFLVGITWPM...,-


### 3. Load results
#### 3.1 Blast

In [4]:
def merge_groundtruth(ds_test, ds_pred):
    res = []
    for i in tqdm(range(10)):
        # ds_test[i].merge(ds_pred[i], on='uniprot_id', how='left')
        res = res + [ds_test[i].merge(ds_pred[i], on='uniprot_id', how='left').fillna('NO-PREDICTION').reset_index(drop=True)]
    return res

In [5]:
vali_res_file_path = [f'{cfg.DIR_RES_BASELINE}results/direct_methods/blast/fold{item}.tsv'  for item in range(1, 11)]
data_blast = [pd.read_csv(file_path, sep='\t') for file_path in vali_res_file_path]
res_blast = merge_groundtruth(ds_test=data_test, ds_pred=data_blast)

#保存加groundtruth的结果
save_vali_res_file_path = [f'{cfg.DIR_PROJECT_ROOT}/results/intermediate/direct/blast_fold{item}.tsv' for item in range(1, 11)]
for i in tqdm(range(10)):
    res_blast[i].to_csv(save_vali_res_file_path[i], sep='\t', index=False)
    
res_blast[i].head(3)

100%|██████████| 10/10 [00:00<00:00, 49.60it/s]
100%|██████████| 10/10 [00:00<00:00, 24.92it/s]


Unnamed: 0,uniprot_id,rxn_groundtruth,rxn_blast
0,Q66DF3,-,-
1,Q57L31,RHEA:36899,RHEA:36899
2,Q6CUV7,-,-


#### 3.2 Unirep

In [None]:
embd_methd = 'unirep'
file_res_unirep = [f'{cfg.RESULTS_DIR}simi/fold_{fold_num}_{embd_methd}_results.h5' for fold_num in range(1,11)]
res_unirep = [btools.read_h5_file(item)for item in tqdm(file_res_unirep)]

In [27]:
# 获取反应ID
print('Get reaction ID')
for i in tqdm(range(10)):
    res_unirep[i]['rxn_euclidean'] = res_unirep[i].euclidean.apply(lambda x : btools.get_simi_Pred(pred_list=x, uniprot_rxn_dict=uniprot_rxn_dict))
    res_unirep[i]['rxn_cosine'] = res_unirep[i].cosine.apply(lambda x : btools.get_simi_Pred(pred_list=x, uniprot_rxn_dict=uniprot_rxn_dict))
    
print('Save file')
save_vali_res_file_path=[f'{cfg.DIR_PROJECT_ROOT}/results/intermediate/direct/{embd_methd}_fold{item}.tsv' for item in range(1, 11)]
for i in tqdm(range(10)):
    res_unirep[i][['uniprot_id','reaction_id', 'rxn_euclidean', 'rxn_cosine']].rename(columns={'reaction_id': 'rxn_groundtruth', 
                                                                                               'rxn_euclidean':'rxn_unirep_euclidean',
                                                                                               'rxn_cosine':'rxn_unirep_cosine'}).to_csv(save_vali_res_file_path[i], index=False, sep='\t')

Get reaction ID


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:13<00:00,  1.30s/it]


Save file


100%|██████████| 10/10 [00:00<00:00, 17.51it/s]


#### 3.3 ESM

In [32]:
embd_methd = 'esm'
file_res_esm = [f'{cfg.RESULTS_DIR}simi/fold_{fold_num}_{embd_methd}_results.h5' for fold_num in range(1,11)]
res_esm = [btools.read_h5_file(item)for item in tqdm(file_res_esm)]



100%|██████████| 10/10 [10:37<00:00, 63.77s/it]


In [35]:
# 获取反应ID
print('Get reaction ID')
for i in tqdm(range(10)):
    res_esm[i]['rxn_euclidean'] = res_esm[i].euclidean.apply(lambda x : btools.get_simi_Pred(pred_list=x, uniprot_rxn_dict=uniprot_rxn_dict))
    res_esm[i]['rxn_cosine'] = res_esm[i].cosine.apply(lambda x : btools.get_simi_Pred(pred_list=x, uniprot_rxn_dict=uniprot_rxn_dict))
    
print('Save file')
save_vali_res_file_path=[f'{cfg.DIR_PROJECT_ROOT}/results/intermediate/direct/{embd_methd}_fold{item}.tsv' for item in range(1, 11)]
for i in tqdm(range(10)):
    res_esm[i][['uniprot_id','reaction_id', 'rxn_euclidean', 'rxn_cosine']].rename(columns={'reaction_id': 'rxn_groundtruth', 
                                                                                               'rxn_euclidean':'rxn_esm_euclidean',
                                                                                               'rxn_cosine':'rxn_esm_cosine'}).to_csv(save_vali_res_file_path[i], index=False, sep='\t')
    
pd.read_csv(save_vali_res_file_path[0], sep='\t').head(3)


Get reaction ID


100%|██████████| 10/10 [00:13<00:00,  1.36s/it]


Save file


100%|██████████| 10/10 [00:00<00:00, 18.57it/s]


Unnamed: 0,uniprot_id,rxn_groundtruth,rxn_esm_euclidean,rxn_esm_cosine
0,Q9UYB6,-,RHEA:32243;-,RHEA:32243;-
1,C1AQW9,RHEA:19669,RHEA:19669,RHEA:19669
2,P64647,-,-,-


#### 3.4 T5

In [36]:
embd_methd = 't5'
file_res_t5 = [f'{cfg.RESULTS_DIR}simi/fold_{fold_num}_{embd_methd}_results.h5' for fold_num in range(1,11)]
res_t5 = [btools.read_h5_file(item)for item in tqdm(file_res_t5)]

# 获取反应ID
print('Get reaction ID')
for i in tqdm(range(10)):
    res_t5[i]['rxn_euclidean'] = res_t5[i].euclidean.apply(lambda x : btools.get_simi_Pred(pred_list=x, uniprot_rxn_dict=uniprot_rxn_dict))
    res_t5[i]['rxn_cosine'] = res_t5[i].cosine.apply(lambda x : btools.get_simi_Pred(pred_list=x, uniprot_rxn_dict=uniprot_rxn_dict))
    
print('Save file')
save_vali_res_file_path=[f'{cfg.DIR_PROJECT_ROOT}/results/intermediate/direct/{embd_methd}_fold{item}.tsv' for item in range(1, 11)]
for i in tqdm(range(10)):
    res_t5[i][['uniprot_id','reaction_id', 'rxn_euclidean', 'rxn_cosine']].rename(columns={'reaction_id': 'rxn_groundtruth', 
                                                                                               'rxn_euclidean':'rxn_t5_euclidean',
                                                                                               'rxn_cosine':'rxn_t5_cosine'}).to_csv(save_vali_res_file_path[i], index=False, sep='\t')
    
pd.read_csv(save_vali_res_file_path[0], sep='\t').head(3)    

100%|██████████| 10/10 [10:32<00:00, 63.29s/it]


Get reaction ID


100%|██████████| 10/10 [00:13<00:00,  1.34s/it]


Save file


100%|██████████| 10/10 [00:00<00:00, 18.65it/s]


Unnamed: 0,uniprot_id,rxn_groundtruth,rxn_t5_euclidean,rxn_t5_cosine
0,Q9UYB6,-,RHEA:32243;-,RHEA:32243;-
1,C1AQW9,RHEA:19669,RHEA:19669,RHEA:19669
2,P64647,-,-,-


### 3.5 RXNRECer

In [17]:
embd_methd = 'RXNRECer'
file_res_alfp = [f'{cfg.RESULTS_DIR}simi/fold{fold_num}_{embd_methd}_results.feather' for fold_num in range(1,11)]
save_vali_res_file_path=[f'{cfg.DIR_PROJECT_ROOT}/results/intermediate/direct/{embd_methd}_fold{item}.tsv' for item in range(1, 11)]

if not os.path.exists(save_vali_res_file_path[0]):
    # 模型预测结果
    for i in tqdm(range(10)):
        res =  production.step_by_step_prediction_with_protein_df(input_protein_df=data_test[i][['uniprot_id', 'seq']].reset_index(drop=True), 
                                            dict_rxn2id=cfg.FILE_DS_DICT_RXN2ID
                                            )
        res.to_feather(file_res_alfp[i])

    # 拼合groundtruth    
    alfp_preds = [pd.read_feather(path) for path in file_res_alfp]
    
    for i in range(10):
        alfp_preds[i] =alfp_preds[i].rename(columns={'input_id': 'uniprot_id', 'RXNRECer': 'rxn_RXNRECer'}).merge(data_test[i][['uniprot_id', 'rxn_groundtruth']], on='uniprot_id', how='left')
        alfp_preds[i].to_csv(save_vali_res_file_path[i], sep='\t', index=False)
else:
    alfp_preds = [pd.read_csv(path, sep='\t') for path in save_vali_res_file_path]
    
alfp_preds[0].head(3)

Unnamed: 0,uniprot_id,rxn_RXNRECer,rxn_groundtruth
0,Q9UYB6,-,-
1,C1AQW9,RHEA:19669,RHEA:19669
2,P64647,-,-
