# 定位一个zinc id在一个数据集中存储的位置
因为数据集比较大，而且是分块存储的，所以需要定位一个zinc id在哪个块中

流程：
- 读取需要定位的zinc id
- 读取数据集的索引文件
- 求它们的交集
- 输出结果

## workspace

In [None]:
import os

# Make the results folder the working directory so that every relative
# file name used below resolves inside it, then show what it contains.
workspace_dir = (
    "/mnt/f/SMTarRNA_total_results/total_dataset_results/total_dataset_results"
)
os.chdir(workspace_dir)
os.listdir()

## module

In [None]:
import logging

import numpy as np
import pandas as pd
from tqdm import tqdm

## 1. 读取需要定位的zinc id

In [None]:
# Names of the complexes whose top-ranked ZINC ids have to be located.
complex_str = ["3a6p", "4z4c", "4z4d", "6cbd"]

# The CSV inputs read below were converted once from the torch dumps with:
#     import torch
#     for c in complex_str:
#         data = torch.load(c + '_top_data.np.pt')
#         np.savetxt(c + '_top_data.csv', data, delimiter=',')
#         print(c + '_top_data.csv')

# Map every complex name to the set of integer ids stored in column 0 of
# its "<complex>_top_data.csv" file.
complex_top_zinc_id = {}
for c in complex_str:
    csv_name = c + "_top_data.csv"
    # ndmin=2 keeps the array two-dimensional even when the file holds a
    # single row or a single column; without it loadtxt returns a 1-D array
    # and the column slice below raises IndexError.
    top_data = np.loadtxt(csv_name, delimiter=",", ndmin=2)
    complex_top_zinc_id[c] = set(top_data[:, 0].astype(int).tolist())
    logging.info(csv_name)

## 2. 读取数据集的索引文件

In [None]:
# Folder holding the dataset index files; every directory entry found here
# is treated as one index file by the scan further down.
index_folder = "/mnt/e/wsl_project_data/SMTarRNA/ligand/index"
index_file_list = os.listdir(path=index_folder)


def create_zinc_id(id: int) -> str:
    """Format a numeric id as a ``"ZINC"``-prefixed identifier string.

    The number is zero-padded to at least 12 digits, e.g. ``123`` becomes
    ``"ZINC000000000123"``.

    Args:
        id: Non-negative numeric id. It is converted with ``int()`` first.

    Returns:
        The identifier string.

    Raises:
        ValueError: If the id is negative.
    """
    number = int(id)
    if number < 0:
        raise ValueError(f"ZINC id must be non-negative, got {id!r}")
    # Pure integer formatting replaces the former ``str(int(id + 1e12))[1:]``
    # trick, which silently dropped the leading digit of ids >= 10**12 (so
    # 10**12 collided with 0) and went through float arithmetic.
    return f"ZINC{number:012d}"

In [None]:
from collections import defaultdict

# results[complex][hdf_key] -> set of integer ids that are both stored under
# that key of an index file and listed among the complex's top ids.
# NOTE(review): a key that occurs in more than one index file overwrites the
# earlier entry - confirm keys are unique across files.
results = defaultdict(dict)
for file in index_file_list:
    file_path = os.path.join(index_folder, file)
    # Open read-only (the default mode is append) and via a context manager,
    # so the store is closed even if reading one of its keys raises.
    with pd.HDFStore(file_path, mode="r") as index_hf:
        for k in tqdm(index_hf.keys()):
            # Drop the first four characters of every index label
            # (presumably the "ZINC" prefix) and parse the rest as an int.
            index_set = {int(i[4:]) for i in index_hf[k].index.to_list()}
            for c in complex_str:
                # Intersection of this chunk's ids with the complex's ids.
                zinc_id_set = index_set & complex_top_zinc_id[c]
                if zinc_id_set:
                    results[c][k] = zinc_id_set

## 明文保存结果

In [None]:
# Save the results as plain text, one file per complex. Each record is the
# index key on its own line, followed by one formatted ZINC id per line and
# a terminating blank line.
for c in complex_str:
    out_name = c + "_zinc_id_index.txt"
    with open(out_name, "w") as f:
        for k, zinc_ids in results[c].items():
            f.write(k + "\n")
            f.writelines(create_zinc_id(i) + "\n" for i in zinc_ids)
            f.write("\n")
    # Log the name of the file that was actually written (the message used
    # to say "<complex>_zinc_id.txt", which does not exist).
    logging.info(out_name)

### 计算不同复合体的并集

In [None]:
# Gather every index key that holds at least one hit for any complex.
key_union_set = set()
for complex_name in complex_str:
    key_union_set = key_union_set.union(set(results[complex_name].keys()))

# For each of those keys, merge the id sets of all complexes that hit it.
results_union = defaultdict(set)
for key in key_union_set:
    for complex_name in complex_str:
        hits = results[complex_name].get(key)
        if hits is None:
            continue
        results_union[key] = results_union[key].union(hits)

## 保存四个复合体并集结果

In [None]:
# Dump the merged results: per index key, the key line, one formatted ZINC
# id per line, then a blank line closing the record.
with open("four_complex_zinc_id_index.txt", "w") as out:
    for key, zinc_ids in results_union.items():
        record = [key, *(create_zinc_id(zinc_id) for zinc_id in zinc_ids), ""]
        out.write("\n".join(record) + "\n")

In [None]:
# Read the saved union file back into a dict that maps each index key to
# the list of ZINC id strings written under it.
data = {}
with open("four_complex_zinc_id_index.txt", "r") as f:
    # Records are separated by blank lines; the first line of a record is
    # the index key and the remaining lines are its ZINC ids.
    for record in f.read().strip().split("\n\n"):
        # An empty file splits into one empty record; skip it instead of
        # storing a bogus "" key.
        if not record:
            continue
        key, *zinc_ids = record.split("\n")
        data[key] = zinc_ids

In [None]:
# Preview the first ten index keys that were read back.
list(data)[:10]
# Example lookup of a single key:
# data['/ED/ADRN/EDADRN_xazi']