In [1]:
from collections import Counter

def load_scored_molecules(score_file):
    """加载已打分分子，保留原始ID（带编号）和基础ID"""
    with open(score_file, 'r') as f:
        lines = f.readlines()[1:]  # 跳过标题行
    
    scored_full_ids = []  # 保留完整ID（带编号）
    scored_base_ids = []  # 只保留基础ID
    
    for line in lines:
        full_id = line.split('\t')[0].strip()
        base_id = full_id.split('-')[0]
        scored_full_ids.append(full_id)
        scored_base_ids.append(base_id)
    
    return scored_full_ids, scored_base_ids

def load_sdf_molecules(sdf_file):
    """加载SDF文件中的所有分子ID"""
    with open(sdf_file, 'r') as f:
        content = f.readlines()
    
    sdf_molecules = []
    i = 0
    while i < len(content):
        line = content[i].strip()
        if line and not line.startswith(('RDKit', '#')):
            sdf_molecules.append(line)
            i += 1  # 跳过RDKit行
            while i < len(content) and not content[i].startswith('$$$$'):
                i += 1
        i += 1
    return sdf_molecules

In [10]:
score_file = "outputs/4BVC_EC/EC_combined_fixed_score.dat"
sdf_file = "screening/EC_combined_fixed.sdf"

scored_full_ids, scored_base_ids = load_scored_molecules(score_file)
sdf_molecules = load_sdf_molecules(sdf_file)

# 创建查找集合（使用基础ID）
scored_base_set = set(scored_base_ids)

# 找出未打分的分子
unscored_molecules = [mol for mol in sdf_molecules if mol not in scored_base_set]

# 统计结果
print(f"已打分分子记录数（含编号重复）: {len(scored_full_ids)}")
print(f"已打分分子唯一数（去编号后）: {len(set(scored_base_ids))}")
print(f"SDF文件中分子总数: {len(sdf_molecules)}")
print(f"SDF文件中唯一分子数: {len(set(sdf_molecules))}")
print(f"未被打分分子数: {len(unscored_molecules)}")

# 详细检查
if len(sdf_molecules) - len(set(scored_base_ids)) != len(unscored_molecules):
    print("\n警告：数量不一致，可能原因：")
    print("1. SDF中有重复分子但分数文件中只出现一次")
    print("2. ID匹配方式有问题（如大小写不一致）")
    
    # 找出可能的匹配问题
    sdf_set = set(sdf_molecules)
    scored_set = set(scored_base_ids)
    
    # 找出在SDF中但不在分数文件中的分子
    true_missing = sdf_set - scored_set
    print(f"\n真正未打分的唯一分子数: {len(true_missing)}")
    print("这些分子是:")
    for mol in sorted(true_missing):
        print(mol)
    
    # 检查是否有大小写问题
    case_issues = set()
    for sdf_mol in sdf_set:
        if sdf_mol.lower() in [x.lower() for x in scored_set] and sdf_mol not in scored_set:
            case_issues.add(sdf_mol)
    
    if case_issues:
        print(f"\n发现 {len(case_issues)} 个可能的大小写不一致分子:")
        for mol in sorted(case_issues):
            print(mol)

已打分分子记录数（含编号重复）: 115180
已打分分子唯一数（去编号后）: 89941
SDF文件中分子总数: 115193
SDF文件中唯一分子数: 89952
未被打分分子数: 11

警告：数量不一致，可能原因：
1. SDF中有重复分子但分数文件中只出现一次
2. ID匹配方式有问题（如大小写不一致）

真正未打分的唯一分子数: 11
这些分子是:
ZINC000000331927
ZINC000014920475
ZINC000014920478
ZINC000042888388
ZINC000045070506
ZINC000045070509
ZINC000230114930
ZINC000248003605
ZINC000257440659
ZINC000261493160
ZINC000261493163


In [3]:
def get_sdf_ids(sdf_file):
    """从SDF文件中提取所有分子ID"""
    with open(sdf_file) as f:
        return [line.strip() for line in f if line.strip() and not line.startswith((' ', 'RDKit', '#'))]

def extract_molecules_by_ids(sdf_file, ids_to_extract, output_file):
    """根据ID列表提取完整的SDF分子记录"""
    collecting = False
    current_mol = []
    extracted_count = 0
    
    with open(sdf_file) as fin, open(output_file, 'w') as fout:
        for line in fin:
            if line.strip() and not line.startswith((' ', 'RDKit', '#')):  # ID行
                mol_id = line.strip()
                collecting = mol_id in ids_to_extract
                current_mol = [line] if collecting else []
            elif collecting:
                current_mol.append(line)
                if line.startswith('$$$$'):  # 分子结束
                    fout.writelines(current_mol)
                    extracted_count += 1
                    current_mol = []
    
    return extracted_count

score_file = "outputs/4BVC_EC/EC_combined_fixed_score.dat"
sdf_file = "screening/EC_combined_fixed.sdf"

# 使用示例
scored_full_ids, scored_base_ids = load_scored_molecules(score_file)
sdf_ids = get_sdf_ids(sdf_file)

# 找出未打分的分子ID
missing_ids = set(sdf_ids) - set(scored_base_ids)

# 提取并保存这些分子
count = extract_molecules_by_ids(
    "screening/EC_combined_fixed.sdf",
    missing_ids,
    "missing_molecules.sdf"
)

print(f"已提取 {count} 个未打分分子到 missing_molecules.sdf")

已提取 0 个未打分分子到 missing_molecules.sdf


In [8]:
import re

def debug_sdf_processing(score_file, sdf_file):
    """终极调试函数，带详细诊断信息"""
    # 1. 打印文件基本信息
    print("=== 文件基本信息 ===")
    print(f"分数文件行数: {sum(1 for _ in open(score_file))}")
    print(f"SDF文件行数: {sum(1 for _ in open(sdf_file))}\n")
    
    # 2. 加载已打分分子ID（显示前5个）
    with open(score_file) as f:
        scored_ids = {line.split('\t')[0].split('-')[0] for line in f if not line.startswith('#')}
    print(f"=== 已加载 {len(scored_ids)} 个已打分分子 ===")
    print("示例（前5个）:", list(scored_ids)[:5], "\n")
    
    # 3. 扫描SDF文件并诊断
    print("=== SDF文件扫描诊断 ===")
    mol_count = 0
    current_id = None
    id_pattern = re.compile(r'^ZINC\d{12}$')  # 匹配ZINC+12位数字
    
    with open(sdf_file) as f:
        for line in f:
            line = line.strip()
            # 检测分子ID行
            if line and not any(line.startswith(x) for x in [' ', 'RDKit', '#', 'M']):
                if id_pattern.match(line):
                    current_id = line
                    mol_count += 1
                    status = "未打分" if current_id not in scored_ids else "已打分"
                    if mol_count <= 5:  # 打印前5个分子的诊断信息
                        print(f"分子 {mol_count}: ID='{current_id}' ({status})")
                        print("所在行内容:", repr(line))
                        print("是否在已打分集合:", current_id in scored_ids)
                        print("---")
                else:
                    print(f"警告：发现非标准ID: {line}")
    
    # 4. 实际提取未打分分子
    print("\n=== 开始提取未打分分子 ===")
    missing_count = 0
    with open(sdf_file) as fin, open("MISSING.sdf", 'w') as fout:
        saving = False
        for line in fin:
            line_stripped = line.strip()
            if line_stripped and not any(line.startswith(x) for x in [' ', 'RDKit', '#', 'M']):
                if id_pattern.match(line_stripped):
                    saving = line_stripped not in scored_ids
                    if saving: 
                        missing_count += 1
                        print(f"发现未打分分子 #{missing_count}: {line_stripped}")
            if saving:
                fout.write(line)
    
    print(f"\n=== 最终结果 ===")
    print(f"总分子数: {mol_count}")
    print(f"已打分分子: {len(scored_ids)}")
    print(f"未打分分子: {missing_count}")
    print(f"结果已保存到: MISSING.sdf")

# 使用示例
debug_sdf_processing(
    "outputs/4BVC_EC/EC_combined_fixed_score.dat",
    "screening/EC_combined_fixed.sdf"
)

=== 文件基本信息 ===
分数文件行数: 115181


IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



发现未打分分子 #1: ZINC000230114930
发现未打分分子 #2: ZINC000045070506
发现未打分分子 #3: ZINC000248003605
发现未打分分子 #4: ZINC000261493160
发现未打分分子 #5: ZINC000261493163
发现未打分分子 #6: ZINC000014920478
发现未打分分子 #7: ZINC000045070509
发现未打分分子 #8: ZINC000014920475
发现未打分分子 #9: ZINC000042888388
发现未打分分子 #10: ZINC000257440659
发现未打分分子 #11: ZINC000000331927

=== 最终结果 ===
总分子数: 115193
已打分分子: 89941
未打分分子: 11
结果已保存到: MISSING.sdf


In [6]:
with open("screening/EC_combined_fixed.sdf") as f:
    print("SDF文件前5行:")
    for i in range(5):
        print(f"{i+1}: {repr(f.readline())}")

SDF文件前5行:
1: 'ZINC000019330849\n'
2: '     RDKit          3D\n'
3: '\n'
4: ' 49 52  0  0  1  0  0  0  0  0999 V2000\n'
5: '    0.4542    1.6241    0.1472 C   0  0  1  0  0  0  0  0  0  0  0  0\n'


In [34]:
mol_id

'$$$$'

In [35]:
current_mol

['$$$$\n']