In [4]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdFMCS

# Load the SDF file
sdf_file = "molecule_set_largest_cluster.sdf"
supplier = Chem.SDMolSupplier(sdf_file)

# Filter valid molecules
molecules = [mol for mol in supplier if mol is not None]

# Compute the Maximum Common Substructure (MCS)
mcs_result = rdFMCS.FindMCS(molecules)
mcs_smarts = mcs_result.smartsString
mcs_mol = Chem.MolFromSmarts(mcs_smarts)

# Convert MCS to SMILES for reference
mcs_smiles = Chem.MolToSmiles(mcs_mol)
print("MCS SMILES:", mcs_smiles)

# Perform optimizations on each molecule by adding fragments (e.g., halogens, methyl groups)
optimized_smiles = []
for mol in molecules:
    # Create editable molecule
    editable_mol = Chem.EditableMol(mol)
    
    # Add an example optimization, such as adding a chlorine atom (Cl) and a methyl group (-CH3)
    editable_mol.AddAtom(Chem.Atom(17))  # Add a chlorine atom
    editable_mol.AddAtom(Chem.Atom(6))   # Add a carbon atom (CH3)
    
    # Get the new molecule and convert to SMILES
    optimized_mol = editable_mol.GetMol()
    optimized_smiles.append(Chem.MolToSmiles(optimized_mol))

# Select the first 50 optimized molecules
optimized_smiles_50 = optimized_smiles[:50]

# Output optimized SMILES strings
for i, smiles in enumerate(optimized_smiles_50):
    print(f"Optimized molecule {i+1}: {smiles}")


MCS SMILES: CCOC1:C:C2:N:C:N:C(NC3:C:C:C:C:C:3):C:2:C:C:1NC(C)=O
Optimized molecule 1: C.C=CC(=O)NCCOc1cc2ncnc(Nc3ccc(Br)cc3F)c2cc1NC(=O)C=C.Cl
Optimized molecule 2: C.C=CC(=O)Nc1cc2c(Nc3ccc(F)c(Br)c3)ncnc2cc1OCCCN1CCOCC1.Cl
Optimized molecule 3: C.C=CC(=O)Nc1cc2c(Nc3ccc(Br)cc3F)ncnc2cc1OCCCNC(=O)CN(C)C.Cl
Optimized molecule 4: C.C=CC(=O)Nc1cc2c(Nc3ccc(Br)cc3F)ncnc2cc1OCCNC(=O)NCC.Cl
Optimized molecule 5: C.C=CC(=O)Nc1cc2c(Nc3ccc(Br)cc3F)ncnc2cc1OCCNC(=O)CN(C)C.Cl
Optimized molecule 6: C.C=CC(=O)Nc1cc2c(Nc3ccc(Br)cc3F)ncnc2cc1OCCNC(C)=O.Cl
Optimized molecule 7: C.C=CC(=O)Nc1cc2c(Nc3ccc(Br)cc3F)ncnc2cc1OCCCNC.Cl
Optimized molecule 8: C.COCCOc1cc2ncnc(Nc3ccc(Br)cc3F)c2cc1NC(=O)/C=C/CN(C)C.Cl
Optimized molecule 9: C.C=CC(=O)Nc1cc2c(Nc3ccc(Br)cc3F)ncnc2cc1OCCNCC(C)=O.Cl
Optimized molecule 10: C.C=CC(=O)Nc1cc2c(Nc3cccc(Br)c3)ncnc2cc1OCCCN1CCOCC1.Cl
Optimized molecule 11: C.C=CC(=O)Nc1cc2c(Nc3cc(Cl)c(Cl)cc3F)ncnc2cc1OCCNCC(C)=O.Cl
Optimized molecule 12: C.C=CC(=O)Nc1cc2c(Nc3ccc(Br)cc3F)ncnc2

In [9]:
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors, QED
import pandas as pd

# 加载SDF文件
sdf_file = "molecule_set_largest_cluster.sdf"
supplier = Chem.SDMolSupplier(sdf_file)

# 过滤有效的分子
molecules = [mol for mol in supplier if mol is not None]

# 计算最大公共子结构（MCS）
mcs_result = rdFMCS.FindMCS(molecules)
mcs_smarts = mcs_result.smartsString
mcs_mol = Chem.MolFromSmarts(mcs_smarts)

# 将MCS转换为SMILES格式以供参考
mcs_smiles = Chem.MolToSmiles(mcs_mol)
print("MCS SMILES:", mcs_smiles)

# 评估分子是否符合Lipinski's Rule of Five
def lipinski_rule_of_five(mol):
    mol_weight = Descriptors.MolWt(mol)
    logp = Descriptors.MolLogP(mol)
    hbd = Descriptors.NumHDonors(mol)
    hba = Descriptors.NumHAcceptors(mol)
    
    # Lipinski's Rule of Five
    return mol_weight <= 500 and logp <= 5 and hbd <= 5 and hba <= 10

# 评估分子是否符合CNS药物设计规则
def cns_drug_likeness(mol):
    mol_weight = Descriptors.MolWt(mol)
    logp = Descriptors.MolLogP(mol)
    tpsa = Descriptors.TPSA(mol)
    
    # CNS药物规则：分子量小于400，logP在2到3之间，TPSA小于90
    return mol_weight <= 400 and 2 <= logp <= 3 and tpsa <= 90

# 初始化结果列表
results = []

# 对每个分子进行优化并计算属性
for i, mol in enumerate(molecules):
    # 创建可编辑的分子副本以进行修改
    mol_copy = Chem.Mol(mol)
    editable_mol = Chem.EditableMol(mol_copy)
    
    # 添加一个氯原子（Cl）和一个甲基（CH3）作为优化示例
    chlorine_atom = Chem.Atom(17)  # 氯原子
    carbon_atom = Chem.Atom(6)     # 碳原子（用于甲基）

    editable_mol.AddAtom(chlorine_atom)
    editable_mol.AddAtom(carbon_atom)
    
    # 获取优化后的分子并进行氢原子的补充计算
    optimized_mol = editable_mol.GetMol()
    AllChem.Compute2DCoords(optimized_mol)  # 重新计算坐标
    AllChem.AddHs(optimized_mol)  # 添加隐式氢原子

    # 计算QED分数
    qed_score = QED.qed(optimized_mol)
    
    # 检查是否符合Lipinski's Rule of Five和CNS药物设计规则
    lipinski_pass = lipinski_rule_of_five(optimized_mol)
    cns_pass = cns_drug_likeness(optimized_mol)
    
    # 将结果保存到列表中
    results.append({
        'id': i,
        'smiles': Chem.MolToSmiles(optimized_mol),
        'QED': qed_score,
        'Lipinski': lipinski_pass,
        'CNS': cns_pass
    })

# 将结果保存为CSV文件
df = pd.DataFrame(results)
output_file = "filtered_drug_like_molecules.csv"
df.to_csv(output_file, index=False)

print(f"已将筛选后的分子保存为CSV文件: {output_file}")


MCS SMILES: CCOC1:C:C2:N:C:N:C(NC3:C:C:C:C:C:3):C:2:C:C:1NC(C)=O
已将筛选后的分子保存为CSV文件: filtered_drug_like_molecules.csv


In [10]:
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors, QED
import pandas as pd

# 加载SDF文件
sdf_file = "molecule_set_largest_cluster.sdf"
supplier = Chem.SDMolSupplier(sdf_file)

# 过滤有效的分子
molecules = [mol for mol in supplier if mol is not None]

# 计算最大公共子结构（MCS）
mcs_result = rdFMCS.FindMCS(molecules)
mcs_smarts = mcs_result.smartsString
mcs_mol = Chem.MolFromSmarts(mcs_smarts)

# 将MCS转换为SMILES格式以供参考
mcs_smiles = Chem.MolToSmiles(mcs_mol)
print("MCS SMILES:", mcs_smiles)

# 评估分子是否符合Lipinski's Rule of Five
def lipinski_rule_of_five(mol):
    mol_weight = Descriptors.MolWt(mol)
    logp = Descriptors.MolLogP(mol)
    hbd = Descriptors.NumHDonors(mol)
    hba = Descriptors.NumHAcceptors(mol)
    
    # Lipinski's Rule of Five
    return mol_weight <= 500 and logp <= 5 and hbd <= 5 and hba <= 10

# 评估分子是否符合CNS药物设计规则
def cns_drug_likeness(mol):
    mol_weight = Descriptors.MolWt(mol)
    logp = Descriptors.MolLogP(mol)
    tpsa = Descriptors.TPSA(mol)
    
    # CNS药物规则：分子量小于400，logP在2到3之间，TPSA小于90
    return mol_weight <= 400 and 2 <= logp <= 3 and tpsa <= 90

# 计算分子的额外属性
def calculate_additional_properties(mol):
    # 脂水分布系数
    logp = Descriptors.MolLogP(mol)

    # 氢键供体和受体
    hbd = Descriptors.NumHDonors(mol)
    hba = Descriptors.NumHAcceptors(mol)

    # 芳香环和非芳香环数目
    num_aromatic_rings = Chem.rdMolDescriptors.CalcNumAromaticRings(mol)
    num_aliphatic_rings = Chem.rdMolDescriptors.CalcNumAliphaticRings(mol)

    # 计算非碳原子数占非氢原子数的比率
    num_atoms = mol.GetNumAtoms()
    num_non_h_atoms = sum(1 for atom in mol.GetAtoms() if atom.GetAtomicNum() > 1)  # 非氢原子
    num_non_c_atoms = sum(1 for atom in mol.GetAtoms() if atom.GetAtomicNum() != 6)  # 非碳原子
    non_c_ratio = num_non_c_atoms / num_non_h_atoms if num_non_h_atoms > 0 else 0

    return logp, hbd, hba, num_aromatic_rings, num_aliphatic_rings, non_c_ratio

# 初始化结果列表
results = []

# 对每个分子进行优化并计算属性
for i, mol in enumerate(molecules):
    # 创建可编辑的分子副本以进行修改
    mol_copy = Chem.Mol(mol)
    editable_mol = Chem.EditableMol(mol_copy)
    
    # 添加一个氯原子（Cl）和一个甲基（CH3）作为优化示例
    chlorine_atom = Chem.Atom(17)  # 氯原子
    carbon_atom = Chem.Atom(6)     # 碳原子（用于甲基）

    editable_mol.AddAtom(chlorine_atom)
    editable_mol.AddAtom(carbon_atom)
    
    # 获取优化后的分子并进行氢原子的补充计算
    optimized_mol = editable_mol.GetMol()
    AllChem.Compute2DCoords(optimized_mol)  # 重新计算坐标
    AllChem.AddHs(optimized_mol)  # 添加隐式氢原子

    # 计算QED分数
    qed_score = QED.qed(optimized_mol)
    
    # 检查是否符合Lipinski's Rule of Five和CNS药物设计规则
    lipinski_pass = lipinski_rule_of_five(optimized_mol)
    cns_pass = cns_drug_likeness(optimized_mol)

    # 计算额外的分子属性
    logp, hbd, hba, num_aromatic_rings, num_aliphatic_rings, non_c_ratio = calculate_additional_properties(optimized_mol)
    
    # 将结果保存到列表中
    results.append({
        'id': i,
        'smiles': Chem.MolToSmiles(optimized_mol),
        'QED': qed_score,
        'Lipinski': lipinski_pass,
        'CNS': cns_pass,
        'LogP': logp,
        'HBD': hbd,
        'HBA': hba,
        'Aromatic Rings': num_aromatic_rings,
        'Aliphatic Rings': num_aliphatic_rings,
        'Non-C Ratio': non_c_ratio
    })

# 将结果保存为CSV文件
df = pd.DataFrame(results)
output_file = "filtered_drug_like_molecules_with_properties.csv"
df.to_csv(output_file, index=False)

print(f"已将筛选后的分子保存为CSV文件: {output_file}")


MCS SMILES: CCOC1:C:C2:N:C:N:C(NC3:C:C:C:C:C:3):C:2:C:C:1NC(C)=O
已将筛选后的分子保存为CSV文件: filtered_drug_like_molecules_with_properties.csv


In [13]:
# Updated code with logging for debugging molecule processing

from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors, QED
import pandas as pd

# Load SDF file
sdf_file = "molecule_set_largest_cluster.sdf"
supplier = Chem.SDMolSupplier(sdf_file)

# Filter valid molecules
molecules = [mol for mol in supplier if mol is not None]

# Calculate MCS
mcs_result = rdFMCS.FindMCS(molecules)
mcs_smarts = mcs_result.smartsString
mcs_mol = Chem.MolFromSmarts(mcs_smarts)

# Logging MCS result
mcs_smiles = Chem.MolToSmiles(mcs_mol)
print(f"MCS SMILES: {mcs_smiles}")

# Function to check Lipinski's Rule of Five
def lipinski_rule_of_five(mol):
    mol_weight = Descriptors.MolWt(mol)
    logp = Descriptors.MolLogP(mol)
    hbd = Descriptors.NumHDonors(mol)
    hba = Descriptors.NumHAcceptors(mol)
    return mol_weight <= 500 and logp <= 5 and hbd <= 5 and hba <= 10

# Function to check CNS drug likeness
def cns_drug_likeness(mol):
    mol_weight = Descriptors.MolWt(mol)
    logp = Descriptors.MolLogP(mol)
    tpsa = Descriptors.TPSA(mol)
    return mol_weight <= 400 and 2 <= logp <= 3 and tpsa <= 90

# Additional property calculations
def calculate_additional_properties(mol):
    logp = Descriptors.MolLogP(mol)
    hbd = Descriptors.NumHDonors(mol)
    hba = Descriptors.NumHAcceptors(mol)
    num_aromatic_rings = Chem.rdMolDescriptors.CalcNumAromaticRings(mol)
    num_aliphatic_rings = Chem.rdMolDescriptors.CalcNumAliphaticRings(mol)
    num_non_h_atoms = sum(1 for atom in mol.GetAtoms() if atom.GetAtomicNum() > 1)
    num_non_c_atoms = sum(1 for atom in mol.GetAtoms() if atom.GetAtomicNum() != 6)
    non_c_ratio = num_non_c_atoms / num_non_h_atoms if num_non_h_atoms > 0 else 0
    return logp, hbd, hba, num_aromatic_rings, num_aliphatic_rings, non_c_ratio

# Initialize results and counters
results = []
count = 0
max_count = 100000

# Process molecules
for mol in molecules:
    # Make molecule editable and add atoms
    mol_copy = Chem.Mol(mol)
    editable_mol = Chem.EditableMol(mol_copy)
    editable_mol.AddAtom(Chem.Atom(17))  # Chlorine
    editable_mol.AddAtom(Chem.Atom(6))   # Carbon (for CH3)

    # Finalize molecule and compute properties
    optimized_mol = editable_mol.GetMol()
    AllChem.Compute2DCoords(optimized_mol)
    AllChem.AddHs(optimized_mol)

    # Compute QED score
    qed_score = QED.qed(optimized_mol)
    
    # Check filtering conditions
    lipinski_pass = lipinski_rule_of_five(optimized_mol)
    cns_pass = cns_drug_likeness(optimized_mol)

    # Logging each step
    print(f"Molecule {count}: Lipinski {lipinski_pass}, CNS {cns_pass}, QED {qed_score}")

    # Save only molecules that pass all criteria
    if lipinski_pass and cns_pass and qed_score > 0.7:
        logp, hbd, hba, num_aromatic_rings, num_aliphatic_rings, non_c_ratio = calculate_additional_properties(optimized_mol)
        results.append({
            'id': count,
            'smiles': Chem.MolToSmiles(optimized_mol),
            'QED': qed_score,
            'Lipinski': lipinski_pass,
            'CNS': cns_pass,
            'LogP': logp,
            'HBD': hbd,
            'HBA': hba,
            'Aromatic Rings': num_aromatic_rings,
            'Aliphatic Rings': num_aliphatic_rings,
            'Non-C Ratio': non_c_ratio
        })
        count += 1
        if count >= max_count:
            break

# Save results to CSV
df = pd.DataFrame(results)
output_file = "filtered_100k_drug_like_molecules_updated.csv"
df.to_csv(output_file, index=False)

print(f"Filtered molecules saved to: {output_file}")



MCS SMILES: CCOC1:C:C2:N:C:N:C(NC3:C:C:C:C:C:3):C:2:C:C:1NC(C)=O
Molecule 0: Lipinski False, CNS False, QED 0.2508287348406419
Molecule 0: Lipinski False, CNS False, QED 0.25231273714008856
Molecule 0: Lipinski False, CNS False, QED 0.2130544083536871
Molecule 0: Lipinski False, CNS False, QED 0.20903044148285244
Molecule 0: Lipinski False, CNS False, QED 0.2380016337858809
Molecule 0: Lipinski False, CNS False, QED 0.2751429575501083
Molecule 0: Lipinski False, CNS False, QED 0.2539599782879669
Molecule 0: Lipinski False, CNS False, QED 0.2487330311459943
Molecule 0: Lipinski False, CNS False, QED 0.2388335395886044
Molecule 0: Lipinski False, CNS False, QED 0.2649989670685144
Molecule 0: Lipinski False, CNS False, QED 0.17431621766701716
Molecule 0: Lipinski False, CNS False, QED 0.24285483892209014
Molecule 0: Lipinski True, CNS False, QED 0.4046798228156765
Molecule 0: Lipinski False, CNS False, QED 0.24519514402023274
Molecule 0: Lipinski False, CNS False, QED 0.20127064082158047


In [17]:
import random
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors, QED
import pandas as pd

# 常见的元素和基团库
element_choices = [
    (Chem.Atom(17), "Cl"),  # 氯
    (Chem.Atom(9), "F"),    # 氟
    (Chem.Atom(35), "Br"),  # 溴
    (Chem.Atom(53), "I"),   # 碘
    (Chem.Atom(6), "CH3"),  # 甲基（CH3）
    (Chem.Atom(7), "NH2"),  # 氨基
    (Chem.Atom(8), "OH"),   # 羟基
    (Chem.Atom(16), "SH"),  # 硫氢基（SH）
    (Chem.Atom(15), "PH"),  # 磷氢基（PH）
]

# 加载SDF文件
sdf_file = "molecule_set_largest_cluster.sdf"
supplier = Chem.SDMolSupplier(sdf_file)

# 过滤有效的分子
molecules = [mol for mol in supplier if mol is not None]

# 函数：添加随机元素或基团
def add_random_element(editable_mol):
    atom, label = random.choice(element_choices)
    editable_mol.AddAtom(atom)  # 添加随机选择的原子
    print(f"Added element: {label}")

# 初始化结果和计数器
results = []
count = 0
max_count = 100000

# 对每个分子进行处理
for mol in molecules:
    # 创建分子副本进行编辑
    mol_copy = Chem.Mol(mol)
    editable_mol = Chem.EditableMol(mol_copy)

    # 添加两个随机元素或基团
    add_random_element(editable_mol)
    add_random_element(editable_mol)
    
    # 完成分子并计算性质
    optimized_mol = editable_mol.GetMol()
    AllChem.Compute2DCoords(optimized_mol)
    AllChem.AddHs(optimized_mol)

    # 计算QED分数
    qed_score = QED.qed(optimized_mol)
    
    # 检查是否符合Lipinski和CNS规则
    lipinski_pass = lipinski_rule_of_five(optimized_mol)
    cns_pass = cns_drug_likeness(optimized_mol)

    # 打印每个步骤日志
    print(f"Molecule {count}: Lipinski {lipinski_pass}, CNS {cns_pass}, QED {qed_score}")

    # 保存符合条件的分子
    if lipinski_pass and cns_pass and qed_score > 0.7:
        logp, hbd, hba, num_aromatic_rings, num_aliphatic_rings, non_c_ratio = calculate_additional_properties(optimized_mol)
        results.append({
            'id': count,
            'smiles': Chem.MolToSmiles(optimized_mol),
            'QED': qed_score,
            'Lipinski': lipinski_pass,
            'CNS': cns_pass,
            'LogP': logp,
            'HBD': hbd,
            'HBA': hba,
            'Aromatic Rings': num_aromatic_rings,
            'Aliphatic Rings': num_aliphatic_rings,
            'Non-C Ratio': non_c_ratio
        })
        count += 1
        if count >= max_count:
            break

# 保存结果为CSV文件
df = pd.DataFrame(results)
output_file = "filtered_100k_drug_like_molecules_with_random_elements.csv"
df.to_csv(output_file, index=False)

print(f"Filtered molecules saved to: {output_file}")


Added element: I
Added element: SH
Molecule 0: Lipinski False, CNS False, QED 0.1727812908506852
Added element: F
Added element: CH3
Molecule 0: Lipinski False, CNS False, QED 0.2689686171720111
Added element: OH
Added element: NH2
Molecule 0: Lipinski False, CNS False, QED 0.19756566947289975
Added element: F
Added element: CH3
Molecule 0: Lipinski False, CNS False, QED 0.2216729918379981
Added element: OH
Added element: SH
Molecule 0: Lipinski False, CNS False, QED 0.2464146252753658
Added element: NH2
Added element: I
Molecule 0: Lipinski False, CNS False, QED 0.16036876066066946
Added element: OH
Added element: CH3
Molecule 0: Lipinski False, CNS False, QED 0.29453678728702787
Added element: OH
Added element: PH
Molecule 0: Lipinski False, CNS False, QED 0.217289307787076
Added element: SH
Added element: Cl
Molecule 0: Lipinski False, CNS False, QED 0.2428204973211695
Added element: I
Added element: Cl
Molecule 0: Lipinski False, CNS False, QED 0.17839348422037538
Added element: I


In [30]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors, QED
import torch
import torch.nn as nn
from torch_geometric.data import Data
from sklearn.model_selection import train_test_split

# 读取EGFR compounds数据
egfr_csv_file = "EGFR_compounds_lipinski_noPAINS_noBrenk.csv"
egfr_df = pd.read_csv(egfr_csv_file)

# 根据pIC50值划分数据集（假设pIC50列名为"pIC50"）
high_activity_df = egfr_df[egfr_df["pIC50"] >= 8]  # 活性较好的数据（pIC50 >= 8）
low_activity_df = egfr_df[egfr_df["pIC50"] < 8]    # 活性较低的数据

# 加载SDF文件
sdf_file = "molecule_set_largest_cluster.sdf"
supplier = Chem.SDMolSupplier(sdf_file)

# 过滤有效分子
molecules = [mol for mol in supplier if mol is not None]

# 构建GCN模型
class GCN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GCN, self).__init__()
        self.conv1 = nn.Linear(input_dim, hidden_dim)
        self.conv2 = nn.Linear(hidden_dim, output_dim)
        self.fc = nn.Linear(output_dim, 1)

    def forward(self, x):
        x = torch.relu(self.conv1(x))
        x = torch.relu(self.conv2(x))
        x = torch.mean(x, dim=0)
        return self.fc(x)

# 将SMILES转换为分子图
def smiles_to_graph(smiles):
    mol = Chem.MolFromSmiles(smiles)
    atoms = [atom.GetAtomicNum() for atom in mol.GetAtoms()]
    bonds = []
    for bond in mol.GetBonds():
        bonds.append((bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()))
    data = Data(x=torch.tensor(atoms, dtype=torch.float).view(-1, 1),
                edge_index=torch.tensor(bonds, dtype=torch.long).t().contiguous())
    return data

# 将EGFR数据集转换为图表示
high_activity_smiles = high_activity_df["smiles"].values
low_activity_smiles = low_activity_df["smiles"].values
high_activity_data_list = [smiles_to_graph(smiles) for smiles in high_activity_smiles]
low_activity_data_list = [smiles_to_graph(smiles) for smiles in low_activity_smiles]

# 按pIC50分为训练集和测试集
train_data, test_data = train_test_split(high_activity_data_list, test_size=0.2, random_state=42)

# 使用GCN模型预测并优化分子
def optimize_molecule(mol, model):
    mol_data = smiles_to_graph(Chem.MolToSmiles(mol))
    return model(mol_data.x)

# 创建GCN模型实例
input_dim = 1
hidden_dim = 32
output_dim = 16
model = GCN(input_dim, hidden_dim, output_dim)

# 训练模型
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = nn.MSELoss()

for epoch in range(100):
    model.train()
    total_loss = 0
    for data in train_data:
        optimizer.zero_grad()
        output = model(data.x)
        target = torch.tensor([Descriptors.MolWt(Chem.MolFromSmiles('CCO'))], dtype=torch.float)
        loss = loss_fn(output, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f'Epoch {epoch}, Loss: {total_loss}')

# 优化并生成新分子
optimized_results = []
for mol in molecules:
    optimized_score = optimize_molecule(mol, model)
    smiles = Chem.MolToSmiles(mol)
    qed_score = QED.qed(mol)
    if qed_score > 0.5:  # 只保留QED大于0.7的分子
        optimized_results.append({
            "smiles": smiles,
            "optimized_score": optimized_score.item(),
            "QED": qed_score
        })

# 保存优化后的分子为CSV文件
df = pd.DataFrame(optimized_results)
output_file = "optimized_drug_like_molecules.csv"
df.to_csv(output_file, index=False)

print(f"Optimized molecules saved to: {output_file}")


Epoch 0, Loss: 36755.35322060525
Epoch 1, Loss: 2784.254277016502
Epoch 2, Loss: 2670.54922376413
Epoch 3, Loss: 2515.8356804414652
Epoch 4, Loss: 2346.6379591668374
Epoch 5, Loss: 2167.320482035051
Epoch 6, Loss: 1978.3645905772137
Epoch 7, Loss: 1778.7025899024447
Epoch 8, Loss: 1566.6522970463848
Epoch 9, Loss: 1347.5483860050008
Epoch 10, Loss: 1122.337716943468
Epoch 11, Loss: 917.2711724587134
Epoch 12, Loss: 685.5865127347206
Epoch 13, Loss: 493.48083153396146
Epoch 14, Loss: 332.7167346635979
Epoch 15, Loss: 214.93820264813257
Epoch 16, Loss: 124.28177051326202
Epoch 17, Loss: 61.55685647108476
Epoch 18, Loss: 23.917921364627546
Epoch 19, Loss: 7.071251437577303
Epoch 20, Loss: 1.6458271469309693
Epoch 21, Loss: 0.2495762168400688
Epoch 22, Loss: 0.0261760383436922
Epoch 23, Loss: 0.0016904692893149331
Epoch 24, Loss: 5.1553884986788034e-05
Epoch 25, Loss: 1.7226120689883828e-06
Epoch 26, Loss: 4.480534698814154e-08
Epoch 27, Loss: 9.880750440061092e-09
Epoch 28, Loss: 9.022187

In [31]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors, QED
import torch
import torch.nn as nn
from torch_geometric.data import Data
from sklearn.model_selection import train_test_split

# 读取EGFR compounds数据
egfr_csv_file = "EGFR_compounds_lipinski_noPAINS_noBrenk.csv"
egfr_df = pd.read_csv(egfr_csv_file)

# 根据pIC50值划分数据集（假设pIC50列名为"pIC50"）
high_activity_df = egfr_df[egfr_df["pIC50"] >= 8]  # 活性较好的数据（pIC50 >= 8）
low_activity_df = egfr_df[egfr_df["pIC50"] < 8]    # 活性较低的数据

# 加载SDF文件
sdf_file = "molecule_set_largest_cluster.sdf"
supplier = Chem.SDMolSupplier(sdf_file)

# 过滤有效分子
molecules = [mol for mol in supplier if mol is not None]

# 构建GCN模型
class GCN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GCN, self).__init__()
        self.conv1 = nn.Linear(input_dim, hidden_dim)
        self.conv2 = nn.Linear(hidden_dim, output_dim)
        self.fc = nn.Linear(output_dim, 1)

    def forward(self, x):
        x = torch.relu(self.conv1(x))
        x = torch.relu(self.conv2(x))
        x = torch.mean(x, dim=0)
        return self.fc(x)

# 将SMILES转换为分子图
def smiles_to_graph(smiles):
    mol = Chem.MolFromSmiles(smiles)
    atoms = [atom.GetAtomicNum() for atom in mol.GetAtoms()]
    bonds = []
    for bond in mol.GetBonds():
        bonds.append((bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()))
    data = Data(x=torch.tensor(atoms, dtype=torch.float).view(-1, 1),
                edge_index=torch.tensor(bonds, dtype=torch.long).t().contiguous())
    return data

# 将数据集转换为图表示
high_activity_smiles = high_activity_df["smiles"].values
high_activity_data_list = [smiles_to_graph(smiles) for smiles in high_activity_smiles]

# 训练集和测试集划分
train_data, test_data = train_test_split(high_activity_data_list, test_size=0.2, random_state=2)

# 使用GCN模型预测并优化分子
def optimize_molecule(mol, model):
    mol_data = smiles_to_graph(Chem.MolToSmiles(mol))
    return model(mol_data.x)

# 创建GCN模型实例
input_dim = 1
hidden_dim = 32
output_dim = 16
model = GCN(input_dim, hidden_dim, output_dim)

# 训练模型
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = nn.MSELoss()

for epoch in range(100):
    model.train()
    total_loss = 0
    for data in train_data:
        optimizer.zero_grad()
        output = model(data.x)
        # 这里的目标值可以根据EGFR活性数据进行修改
        target = torch.tensor([1.0], dtype=torch.float)  # 示例目标
        loss = loss_fn(output, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f'Epoch {epoch}, Loss: {total_loss}')

# 评估分子是否符合Lipinski's Rule of Five
def lipinski_rule_of_five(mol):
    mol_weight = Descriptors.MolWt(mol)
    logp = Descriptors.MolLogP(mol)
    hbd = Descriptors.NumHDonors(mol)
    hba = Descriptors.NumHAcceptors(mol)
    
    # Lipinski's Rule of Five
    return mol_weight <= 500 and logp <= 5 and hbd <= 5 and hba <= 10

# 评估分子是否符合CNS药物设计规则
def cns_drug_likeness(mol):
    mol_weight = Descriptors.MolWt(mol)
    logp = Descriptors.MolLogP(mol)
    tpsa = Descriptors.TPSA(mol)
    
    # CNS药物规则：分子量小于400，logP在2到3之间，TPSA小于90
    return mol_weight <= 400 and 2 <= logp <= 3 and tpsa <= 90

# 计算分子的额外属性
def calculate_additional_properties(mol):
    logp = Descriptors.MolLogP(mol)
    hbd = Descriptors.NumHDonors(mol)
    hba = Descriptors.NumHAcceptors(mol)
    num_aromatic_rings = Chem.rdMolDescriptors.CalcNumAromaticRings(mol)
    num_aliphatic_rings = Chem.rdMolDescriptors.CalcNumAliphaticRings(mol)
    num_non_h_atoms = sum(1 for atom in mol.GetAtoms() if atom.GetAtomicNum() > 1)
    num_non_c_atoms = sum(1 for atom in mol.GetAtoms() if atom.GetAtomicNum() != 6)
    non_c_ratio = num_non_c_atoms / num_non_h_atoms if num_non_h_atoms > 0 else 0
    return logp, hbd, hba, num_aromatic_rings, num_aliphatic_rings, non_c_ratio

# 优化并生成新分子
optimized_results = []
for mol in molecules:
    optimized_score = optimize_molecule(mol, model)
    smiles = Chem.MolToSmiles(mol)
    qed_score = QED.qed(mol)
    
    # 评估分子是否符合药物设计规则
    lipinski_pass = lipinski_rule_of_five(mol)
    cns_pass = cns_drug_likeness(mol)
    logp, hbd, hba, num_aromatic_rings, num_aliphatic_rings, non_c_ratio = calculate_additional_properties(mol)

    # 只保留符合所有规则且QED大于0.7的分子
    if qed_score > 0.7 and lipinski_pass and cns_pass:
        optimized_results.append({
            "smiles": smiles,
            "optimized_score": optimized_score.item(),
            "QED": qed_score,
            "Lipinski": lipinski_pass,
            "CNS": cns_pass,
            "LogP": logp,
            "HBD": hbd,
            "HBA": hba,
            "Aromatic Rings": num_aromatic_rings,
            "Aliphatic Rings": num_aliphatic_rings,
            "Non-C Ratio": non_c_ratio
        })

# 保存优化后的分子为CSV文件
df = pd.DataFrame(optimized_results)
output_file = "optimized_drug_like_molecules_with_properties.csv"
df.to_csv(output_file, index=False)

print(f"Optimized molecules saved to: {output_file}")


Epoch 0, Loss: 3.820669558181308
Epoch 1, Loss: 0.15938795288310814
Epoch 2, Loss: 0.001077499355258027
Epoch 3, Loss: 6.264048799309307e-05
Epoch 4, Loss: 6.065685270328913e-05
Epoch 5, Loss: 1.9606471770572398e-05
Epoch 6, Loss: 1.1328380328023968e-05
Epoch 7, Loss: 1.4471897582524207e-05
Epoch 8, Loss: 8.171737594153683e-06
Epoch 9, Loss: 4.823461222969172e-05
Epoch 10, Loss: 0.026289895486616643
Epoch 11, Loss: 0.12788284045204534
Epoch 12, Loss: 0.04200148868119413
Epoch 13, Loss: 0.004117860125507633
Epoch 14, Loss: 0.001073681488556133
Epoch 15, Loss: 0.0024058764258612086
Epoch 16, Loss: 0.00041380684676184387
Epoch 17, Loss: 0.0003224942307760159
Epoch 18, Loss: 0.005351376177056721
Epoch 19, Loss: 0.0171978619811739
Epoch 20, Loss: 0.050753731544489256
Epoch 21, Loss: 0.014226661852173805
Epoch 22, Loss: 0.12406663464784629
Epoch 23, Loss: 0.011260406475834372
Epoch 24, Loss: 0.0033463208166395475
Epoch 25, Loss: 0.0016930565857649071
Epoch 26, Loss: 0.001023851328515235
Epoc