# 定位一个zinc id在一个数据集中存储的位置
因为数据集比较大，而且是分块存储的，所以需要定位一个zinc id在哪个块中

流程：
- 读取需要定位的zinc id
- 读取数据集的索引文件
- 求他们的交集
- 输出结果

## workspace

In [28]:
import os
os.chdir("/mnt/f/SMTarRNA_total_results/total_dataset_results/total_dataset_results")
os.listdir()

['3a6p',
 '3a6p_top_data.csv',
 '3a6p_top_data.np.pt',
 '4z4c',
 '4z4c_top_data.csv',
 '4z4c_top_data.np.pt',
 '4z4d',
 '4z4d_top_data.csv',
 '4z4d_top_data.np.pt',
 '6cbd',
 '6cbd_top_data.csv',
 '6cbd_top_data.np.pt',
 'chemfp_clustering',
 'data_3a6p_zinc_id_smiles_frame.csv',
 'data_4z4c_zinc_id_smiles_frame.csv',
 'data_4z4d_zinc_id_smiles_frame.csv',
 'data_6cbd_zinc_id_smiles_frame.csv',
 'fps',
 'smiles',
 'smiles.tar.gz',
 'total_zinc_id_set.pt',
 'zinc_id_smiles_frame.csv']

## module

In [29]:
import pandas as pd
from tqdm import tqdm
import numpy as np
import logging

## 1. 读取需要定位的zinc id

In [30]:
# import torch
complex_str = ['3a6p', '4z4c', '4z4d', '6cbd']
# # 将.pt文件转换为csv文件
# for c in complex_str:
#     data = torch.load(c + '_top_data.np.pt')
#     np.savetxt(c + '_top_data.csv', data, delimiter = ',')
#     print(c + '_top_data.csv')
complex_top_zinc_id = {}
for c in complex_str:
    complex_top_zinc_id[c] = set(np.loadtxt(c + '_top_data.csv', delimiter = ',')[:, 0].astype(int).tolist())
    logging.info(c + '_top_data.csv')


## 2. 读取数据集的索引文件

In [31]:
index_folder = '/mnt/e/wsl_project_data/SMTarRNA/ligand/index'
index_file_list = os.listdir(index_folder)

def create_zinc_id(id: int):
    return 'ZINC'+str(int(id + 1e12))[1:]


In [32]:
from collections import defaultdict
results = defaultdict(dict)
for file in index_file_list:
    file_path = os.path.join(index_folder, file)
    index_hf = pd.HDFStore(file_path)
    for k in tqdm(index_hf.keys()):
        index_set = set([int(i[4:]) for i in index_hf[k].index.to_list()])
        for c in complex_str:
            # 求交集
            zinc_id_set = index_set & complex_top_zinc_id[c]
            if zinc_id_set:
                results[c][k] = zinc_id_set
    index_hf.close()


100%|██████████| 94/94 [00:00<00:00, 214.40it/s]
100%|██████████| 111/111 [00:00<00:00, 122.55it/s]
100%|██████████| 147/147 [00:02<00:00, 59.69it/s]
100%|██████████| 210/210 [00:05<00:00, 38.02it/s] 
100%|██████████| 158/158 [00:02<00:00, 53.47it/s]
100%|██████████| 140/140 [00:02<00:00, 60.14it/s]
100%|██████████| 102/102 [00:01<00:00, 88.07it/s]
100%|██████████| 76/76 [00:00<00:00, 136.20it/s]
100%|██████████| 58/58 [00:00<00:00, 232.75it/s]
100%|██████████| 40/40 [00:00<00:00, 277.01it/s]
100%|██████████| 101/101 [00:00<00:00, 144.14it/s]
100%|██████████| 171/171 [00:04<00:00, 38.26it/s]
100%|██████████| 329/329 [00:12<00:00, 27.16it/s] 
100%|██████████| 401/401 [00:19<00:00, 20.17it/s]
100%|██████████| 295/295 [00:11<00:00, 24.72it/s]
100%|██████████| 280/280 [00:11<00:00, 25.28it/s]
100%|██████████| 271/271 [00:10<00:00, 25.73it/s] 
100%|██████████| 179/179 [00:04<00:00, 37.46it/s]
100%|██████████| 96/96 [00:00<00:00, 133.42it/s]
100%|██████████| 77/77 [00:00<00:00, 224.34it/s]
1

## 明文保存结果

In [33]:
# 将结果保存为文本
for c in complex_str:
    with open(c + '_zinc_id_index.txt', 'w') as f:
        for k in results[c].keys():
            f.write(k + '\n')
            for i in results[c][k]:
                f.write(create_zinc_id(i) + '\n')
            f.write('\n')
    logging.info(c + '_zinc_id.txt')

### 计算不同复合体的并集

In [34]:
# 计算四个复合体的并集
key_union_set = set()
for c in complex_str:
    key_union_set = key_union_set | set(results[c].keys())

# 获取每个键下的并集
results_union = defaultdict(set)
for k in key_union_set:
    for c in complex_str:
        if k in results[c].keys():
            results_union[k] = results_union[k] | results[c][k]



## 保存四个复合体并集结果

In [35]:
with open('four_complex_zinc_id_index.txt', 'w') as f:
    for k in results_union.keys():
        f.write(k + '\n')
        for i in results_union[k]:
            f.write(create_zinc_id(i) + '\n')
        f.write('\n')

In [36]:
# 读取数据
data = {}
with open('four_complex_zinc_id_index.txt', 'r') as f:
    its = f.read().strip().split('\n\n')
    for i in its:
        t = i.split('\n')
        data[t[0]] = t[1:]
    

In [40]:
list(data.keys())[:10]
# data['/ED/ADRN/EDADRN_xazi']

['/ED/ADRN/EDADRN_xazi',
 '/DD/EDML/DDEDML_xaa',
 '/GJ/EBRM/GJEBRM_xaa',
 '/DF/ADRN/DFADRN_xvy',
 '/DD/AARO/DDAARO_xaa',
 '/EC/ADRN/ECADRN_xez',
 '/DH/EDRN/DHEDRN_xac',
 '/EB/ADRN/EBADRN_xbi',
 '/ED/ADRN/EDADRN_xavi',
 '/GH/ADRN/GHADRN_xdc']