# 分批对接结果的综合

## 导入模块

In [1]:
import pickle
import os
import gzip
from utils import ZincPdbqt
import pandas as pd
# Show the contents of the current working directory
os.listdir()

['batch_dock_speed_test.sh',
 'receptor',
 'log',
 'batch_dock.sh',
 'outputs',
 '.ipynb_checkpoints',
 'ligand',
 '__pycache__',
 'batch_dock_job_.py',
 'utils.py',
 'autodock_vina_batch_dock.py',
 'batch_dock_2_jobs.sh',
 'slurm-5975874.out']

## 数据目录

In [2]:
# Directory whose .pkl and .pdbqt.gz files are listed and merged below
data_dir = 'outputs/3a6p_1m/'

## 列出数据文件

In [3]:
# Split the directory listing by file suffix
data_files = os.listdir(data_dir)
pkl_files = [name for name in data_files if name.endswith('.pkl')]
pdbqt_gz_files = [name for name in data_files if name.endswith('.pdbqt.gz')]
# Quick look: how many of each kind, plus the first five names of each
len(pkl_files), len(pdbqt_gz_files), pkl_files[:5], pdbqt_gz_files[:5]

(44,
 50,
 ['3a6p_dOthers_apH_dock_energy_38862_58293_32_20230414141038.pkl',
  '3a6p_dOthers_apH_dock_energy_349758_369189_32_20230414141536.pkl',
  '3a6p_dOthers_apH_dock_energy_388620_408051_32_20230414141534.pkl',
  '3a6p_dOthers_apH_dock_energy_369189_388620_32_20230414141536.pkl',
  '3a6p_dOthers_apH_dock_energy_427482_446913_32_20230414141534.pkl'],
 ['3a6p_dOthers_apH_dock_results_641223_660654_32_20230414141605.pdbqt.gz',
  '3a6p_dOthers_apH_dock_results_563499_582930_32_20230414141606.pdbqt.gz',
  '3a6p_dOthers_apH_dock_results_893826_913257_32_20230414141605.pdbqt.gz',
  '3a6p_dOthers_apH_dock_results_932688_952125_32_20230414141607.pdbqt.gz',
  '3a6p_dOthers_apH_dock_results_505206_524637_32_20230414141605.pdbqt.gz'])

## 拼接数据

In [4]:
# Merge the per-batch dictionaries into a single mapping; on duplicate keys
# the batch loaded last wins (dict.update semantics).
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load files that were produced by our own docking jobs.
total_data = dict()
for pkl_name in pkl_files:
    # The merged dump is written into this same directory later on; skip it so
    # that re-running the notebook does not fold a stale merge back in.
    if pkl_name == 'total_data_dock_energy.pkl':
        continue
    # Use a distinct name for the handle instead of shadowing the loop variable
    with open(os.path.join(data_dir, pkl_name), 'rb') as f:
        data = pickle.load(f)
        total_data.update(data)
len(total_data)

835276

In [11]:
# Persist the merged dictionary as a single pickle inside data_dir
merged_pkl_path = os.path.join(data_dir, 'total_data_dock_energy.pkl')
with open(merged_pkl_path, 'wb') as f:
    pickle.dump(total_data, f)

## 构造hdf5数据集格式，包括两个表，一个坐标表，一个统计信息表。

In [5]:
def analyze_zinc_pdbqt_gz(pdbqt_gz_path: str):
    """Parse a gzipped multi-molecule PDBQT file into two tables.

    Molecules are delimited by lines starting with ``REMARK  Name = ``; the
    rest of such a line is taken as the molecule id. Every line starting with
    ``ATOM`` contributes one row to the coordinate table.

    Args:
        pdbqt_gz_path: Path to the ``.pdbqt.gz`` file.

    Returns:
        A ``(coor, index)`` tuple of DataFrames.

        coor: columns ``atom``, ``x``, ``y``, ``z``; one row per ATOM line in
            file order. ``atom`` is the stripped text of bytes 12-13 of the
            line; x/y/z are parsed from bytes 30-37, 38-45 and 46-53.
        index: indexed by ``zinc_id`` with columns ``start`` and ``end``; the
            atoms of a molecule are rows ``start`` .. ``end - 1`` of ``coor``.

        Both tables are empty (with the same columns) when the file contains
        no molecule header. ATOM lines that appear before the first header are
        kept in ``coor`` but belong to no ``index`` entry.
    """
    coor = []
    index = []
    zinc_id = None  # id of the molecule being read; None until the first header
    t_start = 0  # row in coor where the current molecule starts
    t_end = 0  # number of atoms stored so far (exclusive end of the current molecule)
    with gzip.open(pdbqt_gz_path, 'rb') as f:
        for line in f:
            if line.startswith(b'ATOM'):
                coor.append([
                    str(line[12:14].strip(), 'utf-8'),
                    float(line[30:38]),
                    float(line[38:46]),
                    float(line[46:54]),
                ])
                t_end += 1
            elif line.startswith(b'REMARK  Name = '):  # header of a new molecule
                # Close the previous molecule, if any. Tracking the id itself
                # (rather than the atom count) keeps a leading molecule without
                # atoms and avoids touching an unbound id.
                if zinc_id is not None:
                    index.append([zinc_id, t_start, t_end])
                zinc_id = str(line[15:].strip(), 'utf-8')
                t_start = t_end
    # Close the last molecule; skipped when the file held no header at all
    if zinc_id is not None:
        index.append([zinc_id, t_start, t_end])
    coor_df = pd.DataFrame(coor, columns=['atom', 'x', 'y', 'z'])
    index_df = pd.DataFrame(index, columns=['zinc_id', 'start', 'end']).set_index('zinc_id', drop=True)
    return coor_df, index_df

In [6]:
# Build the coordinate table and the per-molecule index table from the ligand file
coor_df, index_df = analyze_zinc_pdbqt_gz('ligand/zinc20_druglike_random_sample_molecule_1f600.pdbqt.gz')

In [7]:
# Attach docking energies to the table
# Extract the best docking energy: keep only the first entry of each molecule's result
total_data_best = {k: v[0] for k, v in total_data.items()}


In [12]:
# Sanity check: every best-pose entry must carry exactly 5 values, one per
# column of the table built from it. The message names the offending molecule.
for zinc_id, energy in total_data_best.items():
    assert energy.shape[0] == 5, '{}: expected 5 energy terms, got {}'.format(zinc_id, energy.shape[0])

In [21]:
# Build the table: one row per dictionary key, one column per energy term
total_data_best_df = pd.DataFrame.from_dict(total_data_best, columns=['total', 'inter', 'intra', 'torsions', 'intra best pose'], orient='index')

In [22]:
# Merge the tables: name the index, then attach each molecule's start/end atom range
# NOTE(review): a later cell reports 835610 rows after this join versus 835276
# entries in total_data, so index_df presumably contains repeated zinc_id
# values that multiply rows here — confirm and decide whether to de-duplicate.
total_data_best_df.index.name = 'zinc_id'
total_data_best_df = total_data_best_df.join(index_df)

In [None]:
# Save both tables to one HDF5 file: 'label' holds the energy/index table,
# 'pos' holds the atom coordinates. The context manager closes the store even
# if one of the writes raises.
with pd.HDFStore('outputs/3a6p_1m/total_data_best_df.h5') as store:
    store['label'] = total_data_best_df
    store['pos'] = coor_df

In [36]:
# Compare the row count of the joined table with the number of entries in total_data
total_data_best_df.shape[0], len(total_data)

(835610, 835276)

In [46]:
# Print the first (index, row) pair of the table, if there is one
first_row = next(total_data_best_df.iterrows(), None)
if first_row is not None:
    print(first_row)

('ZINC000000023541', total                   -6.764
inter                   -7.947
intra                   -0.264
torsions                 1.186
intra best pose         -0.260
start              4441801.000
end                4441822.000
Name: ZINC000000023541, dtype: float64)


In [58]:
# Map each element symbol to a small integer id, numbered in the order listed
element_symbols = ('C', 'N', 'O', 'H', 'F', 'S', 'CL', 'BR', 'I', 'SI', 'P')
elements_dict = {symbol: idx for idx, symbol in enumerate(element_symbols)}
# Same mapping as a one-column int8 table indexed by symbol
ele_df = pd.DataFrame.from_dict(
    elements_dict,
    orient='index',
    columns=['element_id'],
    dtype='int8',
)


In [31]:
# coor_df['id'] = coor_df['atom'].map(ele_df['element_id'])
# Look at the rows that contain missing values
# NOTE(review): the next expression is neither stored nor printed and is not the
# last expression of the cell, so its result is discarded; only head() is shown.
total_data_best_df[total_data_best_df.isnull().any(axis=1)]
total_data_best_df.head()

Unnamed: 0_level_0,total,inter,intra,torsions,intra best pose,start,end
zinc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ZINC000000023541,-6.764,-7.947,-0.264,1.186,-0.26,4441801,4441822
ZINC000000027943,-6.242,-7.414,-0.065,1.095,-0.142,5758277,5758294
ZINC000000029829,-6.333,-7.251,-0.638,1.111,-0.445,2564893,2564911
ZINC000000030076,-6.643,-7.823,-0.261,1.165,-0.276,3789268,3789290
ZINC000000030450,-5.653,-7.676,-0.402,1.983,-0.441,3788253,3788272
