# 此文件用于筛选fpocket的输出结果

In [4]:
import os
import copy
from utils import statistic_pocket_interaction
import logging
# Timestamped INFO-level logging for every step of the filtering run.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Folder whose sub-folders are the per-structure fpocket outputs ("<id>_out").
# Raw string: the Windows backslashes must stay literal; written as a normal
# string, "\R", "\S", "\D" and "\c" are invalid escape sequences that Python
# warns about (and has announced it will eventually reject).
FPOCKET_OUT_PATH = r"E:\Research\SM_miRNA\Data\complex_structure\complex-pdb-structures"

# 函数定义

In [13]:
def is_dibind_or_only_rna(file: str, or_only_rna=True) -> bool:
    """Tell whether a pocket file mixes protein and RNA residues.

    Only ``ATOM`` records are inspected. The residue name is taken from
    ``line[17:20]`` and classified by the length of its stripped text,
    following this file's convention: 1 character for an RNA residue,
    3 characters for a protein residue.

    Args:
        file: path of the pocket file (all atoms lining one pocket).
        or_only_rna: when true, a pocket whose ATOM records all carry
            one-character residue names (RNA only) is accepted as well.

    Returns:
        True as soon as an ATOM record has a residue-name length different
        from the first ATOM record (both kinds present), or when the pocket
        is RNA-only and ``or_only_rna`` is set. False otherwise, including
        when the file contains no ATOM record at all.
    """
    first_length = None
    with open(file, 'r') as f:
        for line in f:
            if not line.startswith('ATOM'):
                continue
            current_length = len(line[17:20].strip())
            if first_length is None:
                # The reference length must come from this very record;
                # pulling another line from the file here would both skip
                # a record and measure the wrong one.
                first_length = current_length
            elif current_length != first_length:
                # Two different residue-name lengths: protein and RNA.
                return True

    if first_length is None:
        # No ATOM record: nothing to classify.
        return False
    if or_only_rna and first_length == 1:
        logging.info(f'只有RNA的pocket: {file}')
        return True
    # Homogeneous pocket that is not an accepted RNA-only pocket.
    return False

def read_druggability(pockets_info_file: str):
    """Read the druggability score of every pocket from an fpocket info file.

    The file is split into pocket entries on blank lines. Within each entry
    the score is the third tab-separated field of the third line.

    Args:
        pockets_info_file: path of the pocket information file.

    Returns:
        dict mapping the pocket number (1-based position of the entry in
        the file) to its druggability score as a float. An empty or
        whitespace-only file yields an empty dict.
    """
    with open(pockets_info_file, 'r') as f:
        content = f.read().strip()
    if not content:
        # No pocket entries at all; splitting '' would produce one bogus
        # empty entry and fail on the line lookup below.
        return {}

    score_dict = {}
    # Entries are separated by one blank line (two consecutive newlines).
    for number, entry in enumerate(content.split('\n\n'), start=1):
        score_line = entry.split('\n')[2]
        score_dict[number] = float(score_line.split('\t')[2])
    return score_dict


def fpocket_filter(fpocket_out_folder: str, druggability_low_line: float=0.5) -> list:
    """Return the numbers of the pockets in one fpocket output folder that pass both filters.

    A pocket is kept when its druggability score is strictly greater than
    ``druggability_low_line`` and ``is_dibind_or_only_rna`` accepts its
    ``pockets/pocket<N>_atm.pdb`` file.

    Args:
        fpocket_out_folder: output folder fpocket produced for one PDB file;
            the text before the first underscore of the folder name is used
            as the structure id for locating ``<id>_info.txt``.
        druggability_low_line: exclusive lower bound on the druggability score.

    Returns:
        list of accepted pocket numbers, in the order of the keys returned
        by ``read_druggability``.
    """
    structure_id = os.path.basename(fpocket_out_folder).split('_')[0]
    logging.info(f'正在处理{structure_id}')

    atoms_dir = os.path.join(fpocket_out_folder, 'pockets')
    info_path = os.path.join(fpocket_out_folder, structure_id + '_info.txt')
    scores = read_druggability(info_path)

    # The score test comes first so that the pocket file is only opened for
    # pockets that already clear the druggability threshold.
    return [
        number
        for number, score in scores.items()
        if score > druggability_low_line
        and is_dibind_or_only_rna(os.path.join(atoms_dir, f'pocket{number}_atm.pdb'))
    ]


def rewrite_out_file(pdb_out_file:str, pockets:list):
    """Write filtered, renumbered copies of an fpocket ``_out.pdb`` and its ``.pml``.

    Every ``HETATM`` record whose residue name is ``STP`` is dropped when its
    pocket number (``line[22:26]``) is not in ``pockets``; otherwise its
    number is replaced by the 1-based position of that pocket in ``pockets``,
    right-aligned in four characters. All other lines are copied unchanged.

    Files written next to the input:
        ``<name>_new.pdb``: the filtered structure (``X_out.pdb`` gives
            ``X_out_new.pdb``).
        ``X_new.pml``: copy of ``X.pml`` whose second line is replaced by a
            ``load`` of the new pdb path.

    Args:
        pdb_out_file: path of the ``*_out.pdb`` file to filter.
        pockets: pocket numbers to keep, in their new order.

    Raises:
        IndexError: if the ``.pml`` file has fewer than two lines.
    """
    # Old pocket number -> new number. setdefault keeps the first occurrence,
    # which is what list.index would report, without a linear scan per line.
    renumber = {}
    for new_number, old_number in enumerate(pockets, start=1):
        renumber.setdefault(old_number, new_number)

    with open(pdb_out_file, 'r') as f:
        raw_lines = f.read().strip().split('\n')

    out_lines = []
    for raw in raw_lines:
        line = raw + '\n'
        if line[:6] == 'HETATM' and line[17:20] == 'STP':
            new_number = renumber.get(int(line[22:26]))
            if new_number is None:
                # Pocket was filtered out: drop its record.
                continue
            line = line[:22] + str(new_number).rjust(4) + line[26:]
        out_lines.append(line)

    # Derive the output names from the file name's own suffix only; a plain
    # str.replace over the whole path would also rewrite '.pdb' occurring in
    # a directory name, and would resolve to the input file itself when the
    # expected suffix is absent.
    root, ext = os.path.splitext(pdb_out_file)
    new_pdb_out_file = f'{root}_new{ext}'
    with open(new_pdb_out_file, 'w') as f:
        f.writelines(out_lines)

    pml_root = root[:-len('_out')] if root.endswith('_out') else root
    with open(pml_root + '.pml', 'r') as f:
        lines = f.readlines()
    # Point the script's load line (second line) at the new pdb file. Done
    # before opening the output so a too-short script leaves no empty file.
    lines[1] = f'load {new_pdb_out_file}\n'
    with open(pml_root + '_new.pml', 'w') as f:
        f.writelines(lines)
        

# main函数

In [17]:
if __name__ == "__main__":
    # Filter every fpocket output folder found directly under FPOCKET_OUT_PATH
    # and write the filtered pdb/pml files next to the originals.
    # Example of one fpocket output folder: "E:\tmp\fpocket\6v5b_out"
    # fpocket_out_folder = r'E:/Research/SM_miRNA/Data/Dock/complex/pdb/1ibr_out'
    path = FPOCKET_OUT_PATH
    # Structure id -> list of pocket numbers that passed the filter.
    all_pockets = dict()
    # Every directory entry that is not a regular file is treated as an fpocket output folder.
    for fpocket_out_folder in [dir_tmp for dir_tmp in os.listdir(path) if not os.path.isfile(os.path.join(path, dir_tmp))]:
        logging.info(f'正在处理{fpocket_out_folder}')
        # From here on the loop variable holds the full path, not just the folder name.
        fpocket_out_folder = os.path.join(path, fpocket_out_folder)
        logging.info(f'fpocket_out_folder: {fpocket_out_folder}')
        # Structure id = folder name up to the first underscore (e.g. "2LI8_out" -> "2LI8").
        pdb_id = os.path.split(fpocket_out_folder)[-1].split('_')[0]
        pdb_out_file = os.path.join(fpocket_out_folder, pdb_id+'_out.pdb')
        # pml_out_file = os.path.join(fpocket_out_folder, fpocket_out_folder.split('/')[-1].replace('_out.pdb', '.pml'))
        # Threshold 0 overrides the function's default of 0.5; the comparison is
        # strict, so a pocket still needs a druggability score above 0.
        pockets = fpocket_filter(fpocket_out_folder, druggability_low_line=0)
        all_pockets[pdb_id] = pockets
        # NOTE(review): this message prints the literal text "pdb_id", not the id
        # itself; possibly {pdb_id} was intended. Confirm before changing.
        logging.info(f'pdb_id have pockets: {pockets}')
        rewrite_out_file(pdb_out_file, pockets)
        logging.info(f'处理完成{pdb_id}')

2023-03-07 20:41:20,755 - INFO - 正在处理2LI8_out
2023-03-07 20:41:20,755 - INFO - fpocket_out_folder: E:\Research\SM_miRNA\Data\complex_structure\complex-pdb-structures\2LI8_out
2023-03-07 20:41:20,756 - INFO - 正在处理2LI8
2023-03-07 20:41:20,757 - INFO - pdb_id have pockets: [4]
2023-03-07 20:41:20,760 - INFO - 处理完成2LI8
2023-03-07 20:41:20,760 - INFO - 正在处理2N82_out
2023-03-07 20:41:20,760 - INFO - fpocket_out_folder: E:\Research\SM_miRNA\Data\complex_structure\complex-pdb-structures\2N82_out
2023-03-07 20:41:20,761 - INFO - 正在处理2N82
2023-03-07 20:41:20,765 - INFO - pdb_id have pockets: [2, 5, 8, 9, 10]
2023-03-07 20:41:20,770 - INFO - 处理完成2N82
2023-03-07 20:41:20,770 - INFO - 正在处理3A6P_out
2023-03-07 20:41:20,771 - INFO - fpocket_out_folder: E:\Research\SM_miRNA\Data\complex_structure\complex-pdb-structures\3A6P_out
2023-03-07 20:41:20,771 - INFO - 正在处理3A6P
2023-03-07 20:41:20,810 - INFO - pdb_id have pockets: [5, 14, 32, 34, 36, 40, 41, 50, 61, 73, 98, 115, 117, 119, 122, 154, 160, 161, 165

In [6]:
# Structure id of whatever folder `fpocket_out_folder` currently refers to:
# the part of its last path component that precedes the first underscore.
# read_druggability(fpocket_out_folder+'/6v5c_info.txt')
pdb_id = os.path.basename(fpocket_out_folder).partition('_')[0]

# 输出文件

In [15]:
# File name -> result of statistic_pocket_interaction on that file's text,
# for every entry of the local 'fpocket' folder.
total_dict = dict()
for file in os.listdir('fpocket'):
    # The context manager closes each handle as soon as its text is read,
    # so open files do not accumulate over the loop.
    with open(os.path.join('fpocket', file), 'r') as f:
        pocket_atom = f.read()
    total_dict[file] = statistic_pocket_interaction(pocket_atom)


In [16]:
import pandas as pd
# Tabulate the collected statistics, one column per key of total_dict
# (the pocket file names). As the last expression of the notebook cell,
# the resulting table is what the cell displays.
pd.DataFrame(total_dict)

Unnamed: 0,3a6p_pocket5_atm.pdb,4z4c_pocket1_atm.pdb,4z4d_pocket7_atm.pdb,5zal_pocket7_atm.pdb,5zam_pocket5_atm.pdb,6cbd_pocket44_atm.pdb,6dcl_pocket2_atm.pdb,6lxd_pocket90_atm.pdb,6v5b_pocket19_atm.pdb
3,85,140,57,64,86,36,60,88,48
1,16,63,48,5,4,29,2,2,4
