In [None]:
# 导入所需的库
from rdkit import rdBase, Chem, DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem.Draw import rdMolDraw2D, IPythonConsole
import rdkit
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster import hierarchy
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

# Print the RDKit version in use.
print(rdBase.rdkitVersion)

# Folder that contains the input SDF files.
folder_path = 'D:/pubchem/cluster/sdf'  # replace with your own folder path

# Collect the paths of all SDF files in the folder.  os.listdir() returns
# entries in arbitrary order, so the listing is sorted to keep the molecule
# order (and therefore the downstream cluster numbering) reproducible.
sdf_files = []
for file in sorted(os.listdir(folder_path)):
    if file.endswith('.sdf'):
        sdf_files.append(os.path.join(folder_path, file))

# mols_free holds the loaded molecules; sdf_file_names holds one name per
# loaded molecule, so sdf_file_names[i] always describes mols_free[i].
mols_free = []
sdf_file_names = []

# Load every readable molecule from every SDF file.  Names are recorded
# together with the molecules (not while listing the folder) so that a file
# with an unparseable record, or with several records, cannot shift the
# name <-> molecule pairing of the files that follow it.
for sdf_file in sdf_files:
    suppl = Chem.SDMolSupplier(sdf_file)
    # The supplier yields None for records it cannot parse; drop those.
    file_mols = [x for x in suppl if x is not None]
    base_name = os.path.basename(sdf_file)
    if len(file_mols) == 1:
        file_names = [base_name]
    else:
        # Several records in one file: give each its own name
        # ("<stem>_<k>.sdf", k starting at 1) so later lookups by name stay
        # unambiguous.  A file with no readable record contributes nothing.
        stem, ext = os.path.splitext(base_name)
        file_names = [f'{stem}_{k}{ext}' for k in range(1, len(file_mols) + 1)]
    mols_free.extend(file_mols)
    sdf_file_names.extend(file_names)

# Encode every molecule as a Morgan fingerprint bit vector
# (radius 2, 2048 bits).
morgan_fp = []
for mol in mols_free:
    morgan_fp.append(AllChem.GetMorganFingerprintAsBitVect(mol, 2, 2048))

# Build the full pairwise matrix, one row per molecule, by comparing each
# fingerprint against the whole fingerprint list.  returnDistance=True asks
# RDKit for Tanimoto distances instead of similarities.
dis_matrix = [
    DataStructs.BulkTanimotoSimilarity(fp, morgan_fp, returnDistance=True)
    for fp in morgan_fp
]
dis_array = np.array(dis_matrix)

# Cluster the molecules hierarchically with Ward linkage; each row of the
# distance matrix is used as that molecule's feature vector.
# Ward linkage only supports Euclidean distances, which is also
# scikit-learn's default, so the distance keyword is left out on purpose:
# it was called `affinity` in older releases and `metric` in newer ones,
# and omitting it works with both.
n_clusters = 12
ward = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
ward.fit(dis_array)

# Count the molecules in each cluster and show the result
# (pd.value_counts is deprecated; Series.value_counts is the replacement).
cluster_sizes = pd.Series(ward.labels_).value_counts()
print(cluster_sizes)

# There must be exactly one name per clustered molecule; otherwise the
# names below would be attached to the wrong molecules without any error.
if len(sdf_file_names) != len(ward.labels_):
    raise ValueError(
        f'{len(sdf_file_names)} SDF file names but {len(ward.labels_)} '
        'clustered molecules; names and molecules are out of step'
    )

# Map each cluster label to the SDF file names of its members.
ward_library = {i: [] for i in range(n_clusters)}
for label, name in zip(ward.labels_, sdf_file_names):
    ward_library[label].append(name)

# Additional import needed for rendering molecules as images.
from rdkit.Chem import Draw

# Root folder for the rendered molecule images.
output_folder = 'D:/pubchem/cluster/cluster_molecule_images_1'
os.makedirs(output_folder, exist_ok=True)

# Name -> molecule lookup built once instead of scanning the name list for
# every image.  setdefault keeps the first molecule seen for a name, which
# is the same entry list.index() would have selected.
mol_by_name = {}
for name, mol in zip(sdf_file_names, mols_free):
    mol_by_name.setdefault(name, mol)

# Write one sub-folder per cluster containing a PNG for each member molecule.
for cluster_label, cluster_molecules in ward_library.items():
    cluster_output_folder = os.path.join(output_folder, f'cluster_{cluster_label}')
    os.makedirs(cluster_output_folder, exist_ok=True)

    for sdf_file in cluster_molecules:
        mol = mol_by_name[sdf_file]
        img = Draw.MolToImage(mol, size=(300, 300))
        # Drop the file extension with splitext.  str.strip(".sdf") removes
        # any leading/trailing '.', 's', 'd' or 'f' characters rather than
        # the suffix, which mangles names such as "sample.sdf" -> "ample".
        image_name = os.path.splitext(sdf_file)[0]
        img_path = os.path.join(cluster_output_folder, f'{image_name}.png')
        img.save(img_path)
