# 分批对接结果的综合

## 设置项目路径

In [None]:
# Modules needed to configure the session.
import os
import sys

# Switch to the project data directory; the relative paths used in the
# following cells (e.g. "outputs/...", "ligand/...") are resolved against it.
os.chdir("/mnt/e/wsl_project_data/SMTarRNA")

# Extend the module search path with the project checkout
# (presumably where the `src` package imported below lives).
sys.path.append("/home/huabei/projects/SMTarRNA")

## 导入模块

In [None]:
import gzip
import pickle

import pandas as pd

from src.dock_utils import ZincPdbqt

# Cell output: entries of the current working directory.
os.listdir(".")

## 数据目录

In [None]:
# Directory (relative to the working directory) that is scanned below for
# ".pkl" and ".pdbqt.gz" result files.
data_dir = "outputs/4z4c_1m/"

## 列出数据文件

In [None]:
# Enumerate the data directory once and bucket its entries by extension.
data_files = os.listdir(data_dir)
pkl_files, pdbqt_gz_files = [], []
for entry_name in data_files:
    if entry_name.endswith(".pkl"):
        pkl_files.append(entry_name)
    elif entry_name.endswith(".pdbqt.gz"):
        pdbqt_gz_files.append(entry_name)
# Cell output: the two counts followed by the first five names of each kind.
len(pkl_files), len(pdbqt_gz_files), pkl_files[:5], pdbqt_gz_files[:5]

## 拼接数据

In [None]:
# Merge the dictionaries stored in the individual pickle files into one mapping.
# NOTE: pickle.load can execute arbitrary code; only load files you produced.
total_data = {}
# os.listdir returns names in arbitrary order, so iterate in sorted order:
# when the same key appears in several files, the surviving value is then
# reproducible between runs (later file names override earlier ones).
for pkl_name in sorted(pkl_files):
    # Use a distinct name for the handle so it does not shadow the loop variable.
    with open(os.path.join(data_dir, pkl_name), "rb") as pkl_file:
        total_data.update(pickle.load(pkl_file))
print(f"total data: {len(total_data)}")

In [None]:
# Persist the merged dictionary as a single pickle inside the data directory.
# NOTE(review): the output ends in ".pkl" and lives in data_dir, so the
# file-listing cell will pick it up as an input when the notebook is re-run.
merged_pkl_path = os.path.join(data_dir, "1m_total_data_dock_energy.pkl")
with open(merged_pkl_path, "wb") as out_file:
    pickle.dump(total_data, out_file)

## 构造hdf5数据集格式，包括两个表，一个坐标表，一个统计信息表。

In [None]:
def analyze_zinc_pdbqt_gz(pdbqt_gz_path: str):
    """Parse a gzipped multi-molecule PDBQT file into two tables.

    A molecule starts at a ``REMARK  Name = <id>`` line; every following
    ``ATOM`` line (until the next name line) belongs to that molecule.

    Args:
        pdbqt_gz_path: Path of the ``.pdbqt.gz`` file to read.

    Returns:
        A ``(coor, index)`` tuple of DataFrames.

        * ``coor`` has columns ``atom``, ``x``, ``y``, ``z`` with one row per
          ``ATOM`` record, in file order. ``atom`` is the stripped text of
          columns 13-14 of the record.
        * ``index`` is indexed by ``zinc_id`` and has columns ``start`` and
          ``end``: the half-open row range ``[start, end)`` of that molecule
          inside ``coor``. Ids that occur several times in the file produce
          several rows.

        A file without any name line yields an empty ``index`` table instead
        of failing, and a leading molecule without atoms is kept with
        ``start == end``. ``ATOM`` records that precede the first name line
        are stored in ``coor`` but are not covered by any ``index`` row.
    """
    name_tag = b"REMARK  Name = "
    coor = []
    index = []
    zinc_id = None  # id of the molecule currently being read; None before the first name line
    t_start = 0  # row in `coor` where the current molecule begins
    with gzip.open(pdbqt_gz_path, "rb") as f:
        for line in f:
            if line.startswith(b"ATOM"):
                # Fixed-width PDB-style columns: name in 13-14, x/y/z in 31-54.
                coor.append(
                    [
                        line[12:14].strip().decode("utf-8"),
                        float(line[30:38]),
                        float(line[38:46]),
                        float(line[46:54]),
                    ]
                )
            elif line.startswith(name_tag):
                # A new molecule begins: close the previous one, if there was one.
                if zinc_id is not None:
                    index.append([zinc_id, t_start, len(coor)])
                zinc_id = line[len(name_tag):].strip().decode("utf-8")
                t_start = len(coor)
    # Close the last molecule; skipped when the file contained no name line.
    if zinc_id is not None:
        index.append([zinc_id, t_start, len(coor)])
    coor_df = pd.DataFrame(coor, columns=["atom", "x", "y", "z"])
    index_df = pd.DataFrame(index, columns=["zinc_id", "start", "end"]).set_index(
        "zinc_id", drop=True
    )
    return coor_df, index_df

In [None]:
# 获取训练样本的数据
coor_df, index_df = analyze_zinc_pdbqt_gz(
    "ligand/zinc20_druglike_random_sample_molecule_1f600.pdbqt.gz"
)

In [None]:
# Prepare the docking energies that will be attached to the molecule table:
# keep only the first record per molecule (treated here as the best docking
# energy — presumably the records are sorted best-first; confirm upstream).
total_data_best = {}
for molecule_id, records in total_data.items():
    total_data_best[molecule_id] = records[0]
len(total_data_best)

In [None]:
# Sanity check: every best record must have length 5 along its first axis,
# matching the five column names used when the energy table is built.
assert all(energy.shape[0] == 5 for energy in total_data_best.values())

In [None]:
# 生成最佳能量表
total_data_best_df = pd.DataFrame.from_dict(
    total_data_best,
    columns=["total", "inter", "intra", "torsions", "intra best pose"],
    orient="index",
)

In [None]:
# Merge the tables: left-join the energies onto the molecule index table by
# "zinc_id", so every molecule parsed from the ligand file keeps its row even
# when no energy is available for it.
total_data_best_df = index_df.join(
    total_data_best_df.rename_axis("zinc_id"), how="left"
)
total_data_best_df.head()

In [None]:
# Cell output: number of rows in the joined table.
total_data_best_df.shape[0]

In [None]:
# 保存为hdf5文件
store = pd.HDFStore("outputs/4z4d-1m_total_data_best_df.h5")
store["label"] = total_data_best_df
store["pos"] = coor_df
store.close()

In [None]:
# Cell output: joined-table row count vs. number of keys in the merged
# results; per the original note, there are duplicate molecules.
len(total_data_best_df), len(total_data)

In [None]:
# Print the first (index, row) pair of the joined table, if there is one.
first_row = next(total_data_best_df.iterrows(), None)
if first_row is not None:
    print(first_row)

In [None]:
# Element symbols in id order: the position of a symbol is its integer id.
element_symbols = ["C", "N", "O", "H", "F", "S", "CL", "BR", "I", "SI", "P"]
elements_dict = {symbol: code for code, symbol in enumerate(element_symbols)}

# Lookup table indexed by element symbol with a single int8 "element_id" column.
ele_df = pd.DataFrame.from_dict(
    data=elements_dict,
    orient="index",
    columns=["element_id"],
    dtype="int8",
)

In [None]:
# coor_df['id'] = coor_df['atom'].map(ele_df['element_id'])
# Inspect the rows whose index value is flagged as a repeat of an earlier one.
dup_mask = total_data_best_df.index.duplicated()
dup_ids = total_data_best_df.index[dup_mask]
total_data_best_df.loc[dup_ids].head()

# 生成测试数据集