In [24]:

import os
from Bio.PDB import PDBParser
import pandas as pd
from tqdm import tqdm

min_length = 13
# 初始化氨基酸计数列表（假设有20种标准氨基酸）
gt_condition_acid_list = [0] * 20
amino_acid_map = {
    'ALA': 1, 'ARG': 2, 'ASN': 3, 'ASP': 4, 'CYS': 5,
    'GLN': 6, 'GLU': 7, 'GLY': 8, 'HIS': 9, 'ILE': 10,
    'LEU': 11, 'LYS': 12, 'MET': 13, 'PHE': 14, 'PRO': 15,
    'SER': 16, 'THR': 17, 'TRP': 18, 'TYR': 19, 'VAL': 20
}

file_path = '/data/private/jdp/PepGLAD/datasets/train_valid/all.txt'
df = pd.read_csv(file_path, sep='\t', header=None, names=['index', 'protein_id', 'peptide_id', 'label'])
df.set_index('index', inplace=True)

# 筛选 label 为 0 的行
df = df[df['label'] == 0]
for idx in tqdm(df.index):
    peptide_path = os.path.join('/data/private/jdp/PepGLAD/datasets/train_valid/pdbs', idx + '.pdb')
    if not os.path.exists(peptide_path):
        print(f"File not found: {peptide_path}")
        continue
    parser = PDBParser(QUIET=True)
    try:
        structure = parser.get_structure('peptide', peptide_path)
    except Exception as e:
        print(f"Error parsing {peptide_path}: {e}")
        continue
    peptide_id = df.loc[idx, 'peptide_id']
    try:
        chain = structure[0][peptide_id]
    except KeyError:
        print(f"Chain {peptide_id} not found in {peptide_path}")
        continue

    residues = list(chain.get_residues())

    if len(residues) < min_length:
        continue

    for res in residues:
        resname = res.get_resname() 
        if resname in amino_acid_map:
            gt_condition_acid_list[amino_acid_map[resname] - 1] += 1
        else:
            print(f"Unknown residue {resname} in {peptide_path}")

file_path = '/data/private/jdp/PepGLAD/datasets/LNR/test.txt'
df = pd.read_csv(file_path, sep='\t', header=None, names=['index', 'protein_id', 'peptide_id', 'label'])
df.set_index('index', inplace=True)

# 筛选 label 为 0 的行
df = df[df['label'] == 0]
for idx in tqdm(df.index):
    peptide_path = os.path.join('/data/private/jdp/PepGLAD/datasets/LNR/pdbs', idx + '.pdb')
    if not os.path.exists(peptide_path):
        print(f"File not found: {peptide_path}")
        continue
    parser = PDBParser(QUIET=True)
    try:
        structure = parser.get_structure('peptide', peptide_path)
    except Exception as e:
        print(f"Error parsing {peptide_path}: {e}")
        continue
    peptide_id = df.loc[idx, 'peptide_id']
    try:
        chain = structure[0][peptide_id]
    except KeyError:
        print(f"Chain {peptide_id} not found in {peptide_path}")
        continue

    residues = list(chain.get_residues())

    if len(residues) < min_length:
        continue

    for res in residues:
        resname = res.get_resname() 
        if resname in amino_acid_map:
            gt_condition_acid_list[amino_acid_map[resname] - 1] += 1
        else:
            print(f"Unknown residue {resname} in {peptide_path}")
print("Final amino acid counts:", gt_condition_acid_list)

    

    

  0%|          | 0/4612 [00:00<?, ?it/s]

100%|██████████| 4612/4612 [02:51<00:00, 26.92it/s]
  5%|▌         | 5/93 [00:00<00:05, 16.93it/s]

Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/1d4t.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/1d4t.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/1d4t.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/1d4t.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/1d4t.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/1d4t.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/1d4t.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/1d4t.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/1d4t.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/1d4t.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/1d4t.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/1d4t.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/1d4t.pdb
Unknown resi

 15%|█▌        | 14/93 [00:00<00:03, 21.88it/s]

Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/1t3l.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/1t3l.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/1t3l.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/1t3l.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/1t3l.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/1t3l.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/1t3l.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/1t3l.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/1t4f.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/1t4f.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/1t4f.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/1t4f.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/1t4f.pdb
Unknown resi

 23%|██▎       | 21/93 [00:01<00:04, 16.78it/s]

Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/2hwl.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/2hwl.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/2hwl.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/2qa9.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/2qa9.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/2qa9.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/2qa9.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/2qa9.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/2qa9.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/2qa9.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/2qa9.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/2qa9.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/2qos.pdb
Unknown resi

 26%|██▌       | 24/93 [00:01<00:05, 13.23it/s]

Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/2v8x.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/2v8x.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/2v8x.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/2v8x.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/2v8x.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/2v8x.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/2v8x.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/2v8x.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/2v8x.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/2v8x.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/2v8x.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/2v8x.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/2v8x.pdb
Unknown resi

 34%|███▍      | 32/93 [00:02<00:04, 14.74it/s]

Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/3ayu.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/3ayu.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/3ayu.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/3ayu.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/3ayu.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/3ayu.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/3ayu.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/3ayu.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/3ayu.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/3ayu.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/3ayu.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/3ayu.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/3ayu.pdb
Unknown resi

 41%|████      | 38/93 [00:02<00:03, 15.83it/s]

Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/3pkn.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/3pkn.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/3pkn.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/3pkn.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/3pkn.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/3pkn.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/3pkn.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/3r42.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/3r42.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/3r42.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/3r42.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/3r42.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/3r42.pdb
Unknown resi

 48%|████▊     | 45/93 [00:03<00:02, 16.68it/s]

Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/3ro3.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/3ro3.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/3ro3.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/3ro3.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/3ro3.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/3ro3.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/3ro3.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/3ro3.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/3ro3.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/3ro3.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/3ro3.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/3ro3.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/3ro3.pdb
Unknown resi

 54%|█████▍    | 50/93 [00:03<00:03, 12.34it/s]

Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/4cu4.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/4cy3.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/4cy3.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/4cy3.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/4cy3.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/4cy3.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/4cy3.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/4cy3.pdb


 59%|█████▉    | 55/93 [00:04<00:03, 11.89it/s]

Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/4k0u.pdb


 61%|██████▏   | 57/93 [00:04<00:03, 11.89it/s]

Unknown residue SO4 in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/4mn8.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/4piq.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/4piq.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/4piq.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/4piq.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/4piq.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/4piq.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/4piq.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/4rs9.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/4rs9.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/4rs9.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/4rs9.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/4rs9.pdb
Unknown resi

 66%|██████▌   | 61/93 [00:04<00:03, 10.36it/s]

Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/4tzm.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/4w50.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/4w50.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/4w50.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/4w50.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/4w50.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/4x3h.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/4x3h.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/4x3h.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/4x3h.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/4x3h.pdb


 69%|██████▉   | 64/93 [00:05<00:03,  8.41it/s]

Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/4xob.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/4xob.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/4z2o.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/4z2o.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/4z2o.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/4z2o.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/4z2o.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/4z2o.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/4z2o.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/4z2o.pdb
Unknown residue SRT in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/5a29.pdb
Unknown residue SRT in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/5a29.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/5a29.pdb
Unknown resi

 75%|███████▌  | 70/93 [00:05<00:02, 10.18it/s]

Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/5l9b.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/5l9b.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/5l9b.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/5l9b.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/5l9b.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/5l9b.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/5l9b.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/5l9b.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/5l9b.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/5l9b.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/5l9b.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/5l9b.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/5l9b.pdb
Unknown resi

 86%|████████▌ | 80/93 [00:06<00:01, 11.14it/s]

Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/5yc2.pdb


 88%|████████▊ | 82/93 [00:06<00:00, 11.31it/s]

Unknown residue ACE in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/6efk.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/6efk.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/6efk.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/6efk.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/6efk.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/6efk.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/6efk.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/6efk.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/6efk.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/6efk.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/6efk.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/6efk.pdb


 91%|█████████▏| 85/93 [00:07<00:00,  9.85it/s]

Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/6g86.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/6g86.pdb


 98%|█████████▊| 91/93 [00:08<00:00,  9.30it/s]

Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/6j0x.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/6j0x.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/6j0x.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/6j0x.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/6j0x.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/6n3e.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/6n3e.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/6n3e.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/6n3e.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/6n3e.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/6n3e.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/6n3e.pdb
Unknown residue HOH in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/6qg8.pdb
Unknown resi

100%|██████████| 93/93 [00:08<00:00, 11.05it/s]

Unknown residue ACE in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/6seo.pdb
Unknown residue NH2 in /data/private/jdp/PepGLAD/datasets/LNR/pdbs/6seo.pdb
Final amino acid counts: [1977, 1767, 940, 1384, 441, 1047, 1672, 1493, 480, 1284, 2302, 1698, 541, 875, 1536, 1718, 1163, 395, 781, 1207]





In [32]:
#Calculate the KL acid type
min_length = 13
import numpy as np
amino_acid_map = {
    'A': 1,  # Alanine
    'R': 2,  # Arginine
    'N': 3,  # Asparagine
    'D': 4,  # Aspartic acid
    'C': 5,  # Cysteine
    'E': 6,  # Glutamic acid
    'Q': 7,  # Glutamine
    'G': 8,  # Glycine
    'H': 9,  # Histidine
    'I': 10, # Isoleucine
    'L': 11, # Leucine
    'K': 12, # Lysine
    'M': 13, # Methionine
    'F': 14, # Phenylalanine
    'P': 15, # Proline
    'S': 16, # Serine
    'T': 17, # Threonine
    'W': 18, # Tryptophan
    'Y': 19, # Tyrosine
    'V': 20  # Valine
}
def kl_divergence(P, Q):
    # 归一化
    P = np.array(P) / np.sum(P)
    Q = np.array(Q) / np.sum(Q)
    
    # 确保Q中没有零值
    epsilon = 1e-10
    P = P + epsilon
    Q = Q + epsilon
    
    # 计算KL散度
    return np.sum(P * np.log(P / Q))

condition_acid_list = [0]*20
import json
import numpy as np
import os
from Bio.PDB import PDBParser
import mdtraj as md
condition_directory = '/home/jiangdapeng/PepGLAD/results/condition4_w10_5samples/results.jsonl'
with open(condition_directory, 'r', encoding='utf-8') as f:
    for line in f:
        # 跳过空行
        if line.strip():
            # 将每行解析为 JSON 对象
            json_object = json.loads(line)
            id = json_object['id']
            peptide_path = json_object['gen_pdb']
            peptide_path = os.path.join('..',peptide_path)
            peptide = json_object['gen_seq']
            parser = PDBParser(QUIET=True)
            structure = parser.get_structure('peptide', peptide_path)
            chain = structure[0][json_object['lig_chain']]  # 假设只取第一个模型和指定的链
            residues = list(chain.get_residues())
            if len(peptide)<min_length:
                continue
            for acid in peptide:
                condition_acid_list[amino_acid_map[acid]-1]+=1

gt1_condition_acid_list = [x for i, x in enumerate(gt_condition_acid_list) if i not in {4}]
condition1_acid_list = [x for i, x in enumerate(condition_acid_list) if i not in {4}]
kl_div = kl_divergence(gt1_condition_acid_list, condition1_acid_list)

print(kl_div)

1.2677048442060763
