In [13]:
# Condition11

import json
import numpy as np
import os
from Bio.PDB import PDBParser

# 文件路径
directory = '/home/jiangdapeng/PepGLAD/results/condition11_w5_5samples/results.jsonl'
directory_good = '/home/jiangdapeng/PepGLAD/results/condition11_w5_5samples/good_results_multiple.jsonl'

# 初始化变量
all_peptide_list = []
meet_peptide_dic1 = {}
meet_peptide_dic_path1 = {}
total_counter = 0
counter2 = 0  # 满足至少 1 个结构的多肽数
counter3 = 0  # 满足超过 2 个结构的多肽数

def check_kde_structures(peptide, residues, distance_threshold=(4, 6.5), min_structures=3):
    """
    检查多肽是否存在超过 2 个不交叉的 K-D/E 结构。
    - peptide: 多肽序列
    - residues: PDB 残基列表
    - distance_threshold: 距离阈值 (lower, upper)
    - min_structures: 至少需要的结构数量
    返回:
    - True/False: 是否满足条件
    - valid_positions: 满足条件的结构起始位置列表
    """
    valid_positions = []  # 存储满足条件的结构起始位置
    lower, upper = distance_threshold
    for i in range(len(residues) - 4):
        atom1 = residues[i]['C']
        atom2 = residues[i + 3]['C']
        atom3 = residues[i + 4]['C']
        distance1 = np.linalg.norm(atom1.coord - atom2.coord)
        distance2 = np.linalg.norm(atom1.coord - atom3.coord)

        # 检查是否满足 K-D/E 条件和距离要求
        if peptide[i] == 'K':
            if ((peptide[i + 3] in 'DE' and lower < distance1 < upper) or
                (peptide[i + 4] in 'DE' and lower < distance2 < upper)):
                # 检查是否与已有结构重叠
                if not any(i in [start, start + 4] for start in valid_positions):
                    valid_positions.append(i)
                    # 如果满足的结构数量达到阈值，提前返回
                    if len(valid_positions) >= min_structures:
                        return True, valid_positions
    return False, valid_positions

# 主逻辑
with open(directory, 'r', encoding='utf-8') as f, open(directory_good, 'w', encoding='utf-8') as good_file:
    for line in f:
        if line.strip():  # 跳过空行
            json_object = json.loads(line)
            python_object = json.loads(line.strip())
            id = json_object['id']
            peptide_path = json_object['gen_pdb']
            peptide_path = os.path.join('..', peptide_path)
            peptide = json_object['gen_seq']
            parser = PDBParser(QUIET=True)
            structure = parser.get_structure('peptide', peptide_path)
            chain = structure[0][json_object['lig_chain']]
            residues = list(chain.get_residues())

            if len(peptide) < 10:  # 跳过长度小于 5 的多肽
                continue

            if id not in all_peptide_list:
                all_peptide_list.append(id)

            total_counter += 1

            # 检查是否有超过 2 个不交叉的 K-D/E 结构
            satisfies, positions = check_kde_structures(peptide, residues,min_structures=2)
            if satisfies:
                counter3 += 1  # 计数满足超过 2 个结构的多肽
                if id not in meet_peptide_dic1.keys():
                    meet_peptide_dic1[id] = 1
                    meet_peptide_dic_path1[id] = [peptide_path]
                    good_file.write(json.dumps(python_object) + '\n')
                else:
                    meet_peptide_dic1[id] += 1
                    if peptide_path not in meet_peptide_dic_path1[id]:
                        meet_peptide_dic_path1[id].append(peptide_path)
                        good_file.write(json.dumps(python_object) + '\n')

            # 记录满足单个结构条件的多肽数
            if len(positions) > 0:
                counter2 += 1

# 输出结果统计
print(f'total samples: {total_counter}')
print(f'meet the requirement (at least 1 K-D/E structure): {counter2}')
print(f'meet the requirement (more than 1 K-D/E structures): {counter3}')
print(len(all_peptide_list))
print(len(meet_peptide_dic1.keys()))

total samples: 215
meet the requirement (at least 1 K-D/E structure): 29
meet the requirement (more than 2 K-D/E structures): 4
43
4


In [None]:
# Condition 2
all_peptide_list = []
meet_peptide_dic2 = {}
meet_peptide_dic_path2 = {}
import json
import numpy as np
import os
from Bio.PDB import PDBParser
import mdtraj as md
directory = '/home/jiangdapeng/PepGLAD/results/condition2_w1_50samples/results.jsonl'
directory_good = '/home/jiangdapeng/PepGLAD/results/condition2_w1_50samples/good_results.jsonl'
counter = 0
counter2 = 0
counter3 = 0
total_counter = 0
distance_list = []
with open(directory, 'r', encoding='utf-8') as f, open(directory_good, 'w', encoding='utf-8') as good_file:
    for line in f:
        # 跳过空行
        if line.strip():
            # 将每行解析为 JSON 对象
            json_object = json.loads(line)
            python_object = json.loads(line.strip())
            id = json_object['id']
            if id not in all_peptide_list:
                all_peptide_list.append(id)
            peptide_path = json_object['gen_pdb']
            peptide_path = os.path.join('..',peptide_path)
            peptide = json_object['gen_seq']
            parser = PDBParser(QUIET=True)
            structure = parser.get_structure('peptide', peptide_path)
            chain = structure[0][json_object['lig_chain']]  # 假设只取第一个模型和指定的链
            residues = list(chain.get_residues())
            total_counter=total_counter+1
            first_atom = residues[0]['C']  # N端的氮原子
            last_atom = residues[-1]['C']   # C端的羧基碳原子
            # 计算欧几里得距离
            distance = np.linalg.norm(first_atom.coord - last_atom.coord)
            distance_list.append(distance)
            if distance<5:
                if id not in meet_peptide_dic2.keys():
                    meet_peptide_dic2[id] = 1
                    meet_peptide_dic_path2[id] = [peptide_path]
                    good_file.write(json.dumps(python_object) + '\n')
                else:
                    meet_peptide_dic2[id] += 1
                    if peptide_path not in meet_peptide_dic_path2[id]:
                        meet_peptide_dic_path2[id].append(peptide_path)
                        good_file.write(json.dumps(python_object) + '\n')
                counter+=1

print(f'mean distance is {np.mean(distance_list)}')
print(f'meet the requirement Distance: {counter}')
print(f'total samples:{total_counter}')
print(len(meet_peptide_dic2.keys()))
print(len(all_peptide_list))

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7f02aca75bb0>>
Traceback (most recent call last):
  File "/data/private/jdp/envs/PepGLAD/lib/python3.9/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


In [10]:
import json
import numpy as np
import os
from Bio.PDB import PDBParser

# Condition32
# 文件路径
directory = '/home/jiangdapeng/PepGLAD/results/condition32_w2_5samples/results.jsonl'
directory_good = '/home/jiangdapeng/PepGLAD/results/condition32_w2_5samples/good_results_double.jsonl'

# 初始化变量
all_peptide_list = []
meet_peptide_dic3 = {}
meet_peptide_dic_path3 = {}
total_counter = 0
counter = 0  # 满足距离条件的结构数
counter2 = 0  # 满足至少一个 C-C 条件的多肽数
counter3 = 0  # 满足至少两个非相邻结构的多肽数
distance_list = []

def check_cc_structures(peptide, residues, distance_threshold=6, min_structures=1):
    """
    检查多肽是否存在至少两个不交叉且非相邻的 C-C 结构，且距离小于 distance_threshold。
    - peptide: 多肽序列
    - residues: PDB 残基列表
    - distance_threshold: 距离阈值（默认 6 埃）
    - min_structures: 至少需要的结构数量
    返回:
    - True/False: 是否满足条件
    - valid_pairs: 满足条件的残基对列表
    """
    valid_pairs = []  # 存储满足条件的残基对 (i, j)
    for i in range(len(residues)):
        for j in range(i + 1, len(residues)):
            # 检查是否是非相邻残基
            if j - i <= 2:
                continue
            # 检查残基是否为 C（半胱氨酸）
            if peptide[i] == 'C' and peptide[j] == 'C':
                first_atom = residues[i]['C']
                last_atom = residues[j]['C']
                distance = np.linalg.norm(first_atom.coord - last_atom.coord)
                # 检查距离条件
                if distance < distance_threshold:
                    # 检查是否与已有对重叠
                    # if not any((start <= i <= end or start <= j <= end) for start, end in valid_pairs):
                    if not any((i in pair) or (j in pair) for pair in valid_pairs):
                        valid_pairs.append((i, j))
                        # 如果满足的结构数量达到阈值，提前返回
                        if len(valid_pairs) >= min_structures:
                            return True, valid_pairs
    return False, valid_pairs

# 主逻辑
with open(directory, 'r', encoding='utf-8') as f, open(directory_good, 'w', encoding='utf-8') as good_file:
    for line in f:
        if line.strip():  # 跳过空行
            json_object = json.loads(line)
            python_object = json.loads(line.strip())
            peptide_path = json_object['gen_pdb']
            peptide_path = os.path.join('..', peptide_path)
            peptide = json_object['gen_seq']
            id = json_object['id']
            parser = PDBParser(QUIET=True)
            structure = parser.get_structure('peptide', peptide_path)
            chain = structure[0][json_object['lig_chain']]
            residues = list(chain.get_residues())

            first_atom = residues[0]['C']  # N端的氮原子
            last_atom = residues[-1]['C']   # C端的羧基碳原子
            # 计算欧几里得距离
            distance = np.linalg.norm(first_atom.coord - last_atom.coord)


            if len(peptide) < 8:  # 跳过长度小于 8 的多肽
                continue

            if id not in all_peptide_list:
                all_peptide_list.append(id)

            total_counter += 1

            # 检查是否有至少两个不交叉且非相邻的 C-C 结构
            satisfies, valid_pairs = check_cc_structures(peptide, residues,min_structures=1)
            if satisfies and distance<6:
                counter3 += 1  # 计数满足至少两个结构的多肽
                if id not in meet_peptide_dic3.keys():
                    meet_peptide_dic3[id] = 1
                    meet_peptide_dic_path3[id] = [peptide_path]
                    good_file.write(json.dumps(python_object) + '\n')
                else:
                    meet_peptide_dic3[id] += 1
                    if peptide_path not in meet_peptide_dic_path3[id]:
                        meet_peptide_dic_path3[id].append(peptide_path)
                        good_file.write(json.dumps(python_object) + '\n')

            # 记录满足单个结构条件的数量
            counter += len(valid_pairs)
            if len(valid_pairs) > 0:
                counter2 += 1

# 输出结果统计
print(f'mean distance is {np.mean(distance_list) if distance_list else 0}')
print(f'meet the requirement Distance: {counter}')
print(f'total samples: {total_counter}')
print(f'meet the requirement Cys: {counter2}')
print(len(all_peptide_list))
print(len(meet_peptide_dic3.keys()))

mean distance is 0
meet the requirement Distance: 5
total samples: 54
meet the requirement Cys: 5
30
5


In [1]:
import json
import numpy as np
import os
from Bio.PDB import PDBParser

# Condition333
# 文件路径
directory = '/home/jiangdapeng/PepGLAD/results/condition333_w2_5samples/results.jsonl'
directory_good = '/home/jiangdapeng/PepGLAD/results/condition333_w2_5samples/good_results_double.jsonl'

# 初始化变量
all_peptide_list = []
meet_peptide_dic3 = {}
meet_peptide_dic_path3 = {}
total_counter = 0
counter = 0  # 满足距离条件的结构数
counter2 = 0  # 满足至少一个 C-C 条件的多肽数
counter3 = 0  # 满足至少两个非相邻结构的多肽数
distance_list = []

def check_cc_structures(peptide, residues, distance_threshold=6, min_structures=2):
    """
    检查多肽是否存在至少两个不交叉且非相邻的 C-C 结构，且距离小于 distance_threshold。
    - peptide: 多肽序列
    - residues: PDB 残基列表
    - distance_threshold: 距离阈值（默认 6 埃）
    - min_structures: 至少需要的结构数量
    返回:
    - True/False: 是否满足条件
    - valid_pairs: 满足条件的残基对列表
    """
    valid_pairs = []  # 存储满足条件的残基对 (i, j)
    for i in range(len(residues)):
        for j in range(i + 1, len(residues)):
            # 检查是否是非相邻残基
            if j - i <= 2:
                continue
            # 检查残基是否为 C（半胱氨酸）
            if peptide[i] == 'C' and peptide[j] == 'C':
                first_atom = residues[i]['C']
                last_atom = residues[j]['C']
                distance = np.linalg.norm(first_atom.coord - last_atom.coord)
                # 检查距离条件
                if distance < distance_threshold:
                    # 检查是否与已有对重叠
                    # if not any((start <= i <= end or start <= j <= end) for start, end in valid_pairs):
                    if not any((i in pair) or (j in pair) for pair in valid_pairs):
                        valid_pairs.append((i, j))
                        # 如果满足的结构数量达到阈值，提前返回
                        if len(valid_pairs) >= min_structures:
                            return True, valid_pairs
    return False, valid_pairs

# 主逻辑
with open(directory, 'r', encoding='utf-8') as f, open(directory_good, 'w', encoding='utf-8') as good_file:
    for line in f:
        if line.strip():  # 跳过空行
            json_object = json.loads(line)
            python_object = json.loads(line.strip())
            peptide_path = json_object['gen_pdb']
            peptide_path = os.path.join('..', peptide_path)
            peptide = json_object['gen_seq']
            id = json_object['id']
            parser = PDBParser(QUIET=True)
            structure = parser.get_structure('peptide', peptide_path)
            chain = structure[0][json_object['lig_chain']]
            residues = list(chain.get_residues())

            if len(peptide) < 8:  # 跳过长度小于 8 的多肽
                continue

            if id not in all_peptide_list:
                all_peptide_list.append(id)

            total_counter += 1

            # 检查是否有至少两个不交叉且非相邻的 C-C 结构
            satisfies, valid_pairs = check_cc_structures(peptide, residues,min_structures=3)
            if satisfies:
                counter3 += 1  # 计数满足至少两个结构的多肽
                if id not in meet_peptide_dic3.keys():
                    meet_peptide_dic3[id] = 1
                    meet_peptide_dic_path3[id] = [peptide_path]
                    good_file.write(json.dumps(python_object) + '\n')
                else:
                    meet_peptide_dic3[id] += 1
                    if peptide_path not in meet_peptide_dic_path3[id]:
                        meet_peptide_dic_path3[id].append(peptide_path)
                        good_file.write(json.dumps(python_object) + '\n')

            # 记录满足单个结构条件的数量
            counter += len(valid_pairs)
            if len(valid_pairs) > 0:
                counter2 += 1

# 输出结果统计
print(f'mean distance is {np.mean(distance_list) if distance_list else 0}')
print(f'meet the requirement Distance: {counter}')
print(f'total samples: {total_counter}')
print(f'meet the requirement Cys: {counter2}')
print(len(all_peptide_list))
print(len(meet_peptide_dic3.keys()))



mean distance is 0
meet the requirement Distance: 35
total samples: 76
meet the requirement Cys: 24
50
2


In [7]:
meet_peptide_dic_path3

{'1d4t': ['.././results/condition33_w2_5samples/candidates/1d4t/1d4t_gen_0.pdb',
  '.././results/condition33_w2_5samples/candidates/1d4t/1d4t_gen_4.pdb'],
 '2x72': ['.././results/condition33_w2_5samples/candidates/2x72/2x72_gen_0.pdb',
  '.././results/condition33_w2_5samples/candidates/2x72/2x72_gen_1.pdb'],
 '2xyi': ['.././results/condition33_w2_5samples/candidates/2xyi/2xyi_gen_0.pdb'],
 '3pkn': ['.././results/condition33_w2_5samples/candidates/3pkn/3pkn_gen_0.pdb',
  '.././results/condition33_w2_5samples/candidates/3pkn/3pkn_gen_2.pdb',
  '.././results/condition33_w2_5samples/candidates/3pkn/3pkn_gen_4.pdb'],
 '4dcb': ['.././results/condition33_w2_5samples/candidates/4dcb/4dcb_gen_0.pdb',
  '.././results/condition33_w2_5samples/candidates/4dcb/4dcb_gen_3.pdb'],
 '5vao': ['.././results/condition33_w2_5samples/candidates/5vao/5vao_gen_0.pdb'],
 '6g86': ['.././results/condition33_w2_5samples/candidates/6g86/6g86_gen_0.pdb',
  '.././results/condition33_w2_5samples/candidates/6g86/6g86_g

In [32]:
#Condition 4
import json
import numpy as np
import os
from Bio.PDB import PDBParser
import mdtraj as md
directory = '/home/jiangdapeng/PepGLAD/results/condition4_w5_5samples/results.jsonl'
directory_good = '/home/jiangdapeng/PepGLAD/results/condition4_w5_5samples/good_results.jsonl'
counter = 0
counter2 = 0
counter3 = 0
total_counter = 0
all_peptide_list = []
meet_peptide_dic4 = {}
meet_peptide_dic_path4 = {}
with open(directory, 'r', encoding='utf-8') as f, open(directory_good, 'w', encoding='utf-8') as good_file:
    for line in f:
        # 跳过空行
        if line.strip():
            # 将每行解析为 JSON 对象
            json_object = json.loads(line)
            python_object = json.loads(line.strip())
            peptide_path = json_object['gen_pdb']
            peptide_path = os.path.join('..',peptide_path)
            peptide = json_object['gen_seq']
            id = json_object['id']
            parser = PDBParser(QUIET=True)
            structure = parser.get_structure('peptide', peptide_path)
            chain = structure[0][json_object['lig_chain']]  # 假设只取第一个模型和指定的链
            residues = list(chain.get_residues())
            if len(peptide)<13:
                continue
            if id not in all_peptide_list:
                all_peptide_list.append(id)
            total_counter=total_counter+1
            
            flag=0
            flag_distance = 0
            counter2_flag = 0
            for i in range(0,len(residues)):
                for j in range(i+1,len(residues)):
                    for k in range(j+1,len(residues)):
                        flag1=False
                        flag2 = False
                        atom1 = residues[i]['C'] 
                        atom2 = residues[j]['C'] 
                        atom3 = residues[k]['C']
                        # 计算欧几里得距离
                        distance1 = np.linalg.norm(atom1.coord - atom2.coord)
                        distance2 = np.linalg.norm(atom2.coord - atom3.coord)
                        distance3 = np.linalg.norm(atom3.coord - atom1.coord)
                        if distance1<10 and distance2<10 and distance3<10:
                            flag1 = True
                        if peptide[i] == 'C' and peptide[j] == 'C' and peptide[k] == 'C': 
                            counter2_flag = 1
                            flag2 = True
                        if flag1&flag2:
                            flag=1
                            if id not in meet_peptide_dic4.keys():
                                meet_peptide_dic4[id] = 1
                                meet_peptide_dic_path4[id] = [peptide_path]
                                good_file.write(json.dumps(python_object) + '\n')
                            else:
                                meet_peptide_dic4[id] += 1
                                if peptide_path not in meet_peptide_dic_path4[id]:
                                    meet_peptide_dic_path4[id].append(peptide_path)
                                    good_file.write(json.dumps(python_object) + '\n')
            counter2+=counter2_flag
            counter3+=flag
            counter+=flag_distance

print(f'total samples:{total_counter}')
print(f'meet the requirement Cys: {counter2}')
print(f'meet the requirement all: {counter3}')
print(len(all_peptide_list))
print(len(meet_peptide_dic4.keys()))

total samples:130
meet the requirement Cys: 58587
meet the requirement all: 52047
26
22
