# 药物靶点识别预测模型——构建一个基于SMILES和pIC50数据的药物靶点识别预测模型：

1. **数据预处理**：读取`chembl.csv`文件，提取SMILES和pIC50数据。
2. **标签生成**：根据pIC50值，将大于8的标记为1，其他为0。
3. **特征提取**：将SMILES转换为模型可接受的数值特征。
4. **模型训练和预测**：使用RF、SVM、ANN、CNN、GNN、GCN、GAN等模型进行训练和预测。（新增：添加 **Naive Bayes（NB）**、**K-Nearest Neighbors（KNN）** 和 **Extreme Gradient Boosting（XGBoost）** 这三个模型的实现。）
5. **结果评估**：评估模型的性能，输出预测结果。


## 1. 导入必要的库

In [1]:
# 数据处理
import pandas as pd
import numpy as np

# 化学信息学
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors

# 特征提取
from rdkit.Chem import MACCSkeys

# 机器学习模型
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# 神经网络模型
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten

# 图神经网络
import torch
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv

# 忽略警告
import warnings
warnings.filterwarnings('ignore')


2024-10-03 14:20:43.653345: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## 2. 数据预处理

In [2]:
from pathlib import Path
import os

# Resolve the current working directory and make sure it contains a
# `data/` sub-folder; the resolved folder is printed for reference.
HERE = Path.cwd()
DATA = HERE.joinpath('data')
if not DATA.exists():
    DATA.mkdir(parents=True, exist_ok=True)
print(DATA)


/Users/wangyang/Desktop/AIDD/08_Drug_target_recognition_prediction_model_based_on_SMILES_and_pIC50_data/data


In [14]:
# Load the ChEMBL table from the data directory and preview the first rows.
chembl_csv = DATA / "./chembl.csv"
data = pd.read_csv(chembl_csv)
data.head()

Unnamed: 0,molecule_chembl_id,IC50,smiles,pIC50,molecular_weight,n_hba,n_hbd,logp,ro5_fulfilled
0,CHEMBL5189340,0.023,CN(C)c1ccc(/C=N/NC(=O)Cn2nc(Cc3ccc(Cl)cc3)c3cc...,10.638272,473.161853,6,1,3.857,True
1,CHEMBL429743,0.03,COc1cc2nccc(Oc3ccc4c(C(=O)Nc5ccc(Cl)cc5)cccc4c...,10.522879,484.118985,5,1,7.1032,True
2,CHEMBL5186748,0.12,CNC(=O)c1cc(Oc2ccc(NC(=O)c3nn(-c4ccc(Cl)cc4)cc...,9.920819,489.120382,7,2,3.99352,True
3,CHEMBL3586072,0.14,CNC(=O)c1ccc(-c2ccc(NC(=O)Nc3cc(Br)cc(C(F)(F)F...,9.853872,492.040873,3,3,5.5335,True
4,CHEMBL3586071,0.15,CNC(=O)c1cc(-c2ccc(NC(=O)Nc3ccc(Cl)c(C(F)(F)F)...,9.823909,448.091388,3,3,5.4244,True


In [15]:

# Pull out the SMILES and pIC50 columns.
smiles_list = data['smiles']
pIC50_list = data['pIC50']

# Binary labels: 1 where pIC50 > 8, otherwise 0 (kept as a plain list).
labels = (pIC50_list > 8).astype(int).tolist()

# Assemble the working DataFrame used by the feature-extraction cells.
df = pd.DataFrame(
    {
        'smiles': smiles_list,
        'pIC50': pIC50_list,
        'label': labels,
    }
)




## 3. 特征提取
- 我们将使用不同的方法来提取特征，以适应不同的模型。

### 3.1 分子指纹（适用于RF和SVM）

In [16]:
def smiles_to_fingerprints(smiles, radius=2, n_bits=1024):
    """Convert a SMILES string into a Morgan fingerprint bit array.

    Args:
        smiles: SMILES string of the molecule.
        radius: Morgan fingerprint radius (default 2).
        n_bits: Length of the fingerprint bit vector (default 1024).

    Returns:
        A 1-D float NumPy array of length ``n_bits``. Input that is not a
        string, or that RDKit cannot parse, yields an all-zero array so that
        every row of the resulting feature matrix has the same width.
    """
    # Pre-size the output so the fallback and the success path are guaranteed
    # to have the same length (the width is defined in exactly one place).
    fingerprint = np.zeros((n_bits,))

    # Non-string values (e.g. NaN for a missing CSV cell) are treated the
    # same way as an unparsable SMILES instead of being passed to RDKit.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        return fingerprint

    bit_vect = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=n_bits)
    Chem.DataStructs.ConvertToNumpyArray(bit_vect, fingerprint)
    return fingerprint


# Apply to the dataset: one fingerprint row per molecule.
fingerprints = np.array([smiles_to_fingerprints(s) for s in df['smiles']])


### 3.2 分子描述符（适用于ANN）

In [17]:
def smiles_to_descriptors(smiles):
    """Compute four RDKit descriptors for a SMILES string.

    Returns a list ``[MolWt, MolLogP, NumHDonors, NumHAcceptors]``; when the
    SMILES cannot be parsed the list ``[0, 0, 0, 0]`` is returned instead.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return [0, 0, 0, 0]

    descriptor_fns = (
        Descriptors.MolWt,
        Descriptors.MolLogP,
        Descriptors.NumHDonors,
        Descriptors.NumHAcceptors,
    )
    return [fn(mol) for fn in descriptor_fns]


# Apply to the dataset: one descriptor row per molecule.
descriptors = np.array(list(map(smiles_to_descriptors, df['smiles'])))


### 3.3 图数据（适用于GNN和GCN）

In [18]:
# Convert SMILES strings into PyTorch Geometric data objects.
from torch_geometric.utils import from_networkx
import networkx as nx


def smiles_to_graph(smiles, label):
    """Build a PyTorch Geometric graph for one molecule.

    Each atom becomes a node whose feature ``x`` is its atomic number (a
    single float); each bond becomes an undirected edge. The class label is
    stored on the graph as ``y``. Returns ``None`` when the SMILES cannot be
    parsed.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    mol_graph = nx.Graph()
    mol_graph.add_nodes_from(
        (atom.GetIdx(), {'x': torch.tensor([atom.GetAtomicNum()], dtype=torch.float)})
        for atom in mol.GetAtoms()
    )
    mol_graph.add_edges_from(
        (bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()) for bond in mol.GetBonds()
    )

    graph = from_networkx(mol_graph)
    graph.y = torch.tensor([label], dtype=torch.long)
    return graph


# Apply to the dataset, dropping molecules that could not be parsed.
graph_data_list = [
    graph
    for graph in (smiles_to_graph(s, l) for s, l in zip(df['smiles'], df['label']))
    if graph is not None
]


## 4. 数据集拆分
### 4.1 传统机器学习模型的数据集

In [19]:
# Both feature sets are split with the same test fraction and random seed.
split_options = {'test_size': 0.2, 'random_state': 42}

# Train/test split on the fingerprint features.
X_train_fp, X_test_fp, y_train_fp, y_test_fp = train_test_split(
    fingerprints, df['label'], **split_options
)

# Train/test split on the descriptor features.
X_train_desc, X_test_desc, y_train_desc, y_test_desc = train_test_split(
    descriptors, df['label'], **split_options
)


### 4.2 图神经网络的数据集

In [20]:
# Split the graphs 80/20 into train and test sets.
# The graphs are in the same order as the CSV rows, and the preview of the
# CSV shows rows ordered by descending pIC50; slicing the list sequentially
# would therefore put mostly label-0 molecules in the test set. Shuffle with
# a fixed seed instead, using the same settings as the fingerprint and
# descriptor splits.
train_graphs, test_graphs = train_test_split(
    graph_data_list, test_size=0.2, random_state=42
)
train_size = len(train_graphs)

# Create the mini-batch loaders (only the training loader reshuffles).
train_loader = DataLoader(train_graphs, batch_size=32, shuffle=True)
test_loader = DataLoader(test_graphs, batch_size=32, shuffle=False)


## 5. 模型训练和预测
### 5.1 随机森林（RF）

In [21]:
# Build a 100-tree random forest (fixed seed) and fit it on the training
# fingerprints; `fit` returns the estimator itself.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_fp, y_train_fp
)

# Class predictions for the held-out fingerprints.
rf_predictions = rf_model.predict(X_test_fp)


### 5.2 支持向量机（SVM）

In [22]:
# Build an RBF-kernel SVM with probability estimates enabled and fit it on
# the training fingerprints; `fit` returns the estimator itself.
svm_model = SVC(kernel='rbf', probability=True).fit(X_train_fp, y_train_fp)

# Class predictions for the held-out fingerprints.
svm_predictions = svm_model.predict(X_test_fp)


### 5.3 人工神经网络（ANN）

In [23]:
# Fully connected network on the descriptor features:
# 64 -> 32 hidden units (ReLU) and a single sigmoid output.
ann_model = Sequential([
    Dense(64, activation='relu', input_shape=(descriptors.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy objective, Adam optimiser, accuracy as the metric.
ann_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train silently for 50 epochs.
ann_model.fit(X_train_desc, y_train_desc, epochs=50, batch_size=32, verbose=0)

# Threshold the predicted probabilities at 0.5 to obtain 0/1 labels.
ann_probabilities = ann_model.predict(X_test_desc)
ann_predictions = (ann_probabilities > 0.5).astype(int).flatten()




### 5.4 卷积神经网络（CNN）
由于SMILES序列的特殊性，我们需要将其转换为适合CNN输入的格式。这里，我们简单地将SMILES字符映射为整数索引，然后进行One-Hot编码。

In [24]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Character-level tokenisation of the SMILES strings.
# lower=False is required: Tokenizer lowercases its input by default, and in
# SMILES the case of a letter is meaningful (e.g. 'C' is an aliphatic carbon
# while 'c' is an aromatic one), so lowercasing would merge distinct tokens.
tokenizer = Tokenizer(char_level=True, lower=False)
tokenizer.fit_on_texts(df['smiles'])
sequences = tokenizer.texts_to_sequences(df['smiles'])
max_len = max(len(seq) for seq in sequences)
# Pad at the end with index 0 so every sequence has length max_len.
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')

# One-hot encode the integer indices -> (n_samples, max_len, n_tokens).
X_cnn = tf.keras.utils.to_categorical(padded_sequences)

# Train/test split with the same fraction and seed as the other feature sets.
X_train_cnn, X_test_cnn, y_train_cnn, y_test_cnn = train_test_split(X_cnn, df['label'], test_size=0.2, random_state=42)

# Model: one 1-D convolution (64 filters, kernel size 3) followed by a
# sigmoid output unit.
cnn_model = Sequential()
cnn_model.add(Conv1D(64, 3, activation='relu', input_shape=(max_len, X_cnn.shape[2])))
cnn_model.add(Flatten())
cnn_model.add(Dense(1, activation='sigmoid'))

# Binary cross-entropy objective, Adam optimiser, accuracy as the metric.
cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train silently for 10 epochs.
cnn_model.fit(X_train_cnn, y_train_cnn, epochs=10, batch_size=32, verbose=0)

# Threshold the predicted probabilities at 0.5 to obtain 0/1 labels.
cnn_predictions = (cnn_model.predict(X_test_cnn) > 0.5).astype(int).flatten()




### 5.5 图卷积神经网络（GCN）

In [None]:
# Define the GCN model.
# `torch_geometric` itself is never bound as a name by the notebook's
# imports, so the pooling function is imported explicitly here.
from torch_geometric.nn import global_mean_pool


class GCN(torch.nn.Module):
    """Two-layer graph convolutional network for binary graph classification.

    Node features (one value per node) pass through two GCNConv layers with
    ReLU, are mean-pooled per graph, and a linear layer maps the pooled
    32-dim vector to two class logits.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(1, 64)
        self.conv2 = GCNConv(64, 32)
        self.fc = torch.nn.Linear(32, 2)

    def forward(self, data):
        """Return class logits of shape (num_graphs, 2) for a batched graph.

        Args:
            data: A batched graph object providing ``x``, ``edge_index`` and
                ``batch`` (the node-to-graph assignment vector).
        """
        x, edge_index = data.x, data.edge_index
        x = torch.relu(self.conv1(x, edge_index))
        x = torch.relu(self.conv2(x, edge_index))
        # Average the node embeddings within each graph of the mini-batch.
        x = global_mean_pool(x, data.batch)
        return self.fc(x)


# Initialise the model, optimiser and loss.
gcn_model = GCN()
optimizer = torch.optim.Adam(gcn_model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

# Train for 10 epochs.
# The loop variable is named `batch` rather than `data` so the DataFrame
# `data` loaded earlier in the notebook is not overwritten.
for epoch in range(10):
    gcn_model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        out = gcn_model(batch)
        loss = criterion(out, batch.y)
        loss.backward()
        optimizer.step()

# Evaluate on the test graphs; gradients are not needed for inference.
gcn_model.eval()
correct = 0
with torch.no_grad():
    for batch in test_loader:
        out = gcn_model(batch)
        pred = out.argmax(dim=1)
        correct += (pred == batch.y).sum().item()

accuracy = correct / len(test_graphs)
print(f'GCN Test Accuracy: {accuracy:.4f}')


### 5.6 生成对抗网络（GAN）
GAN主要用于生成数据，这里可以尝试用它生成新的分子结构。但由于实现较为复杂，这里仅提供一个简单的框架。

In [None]:
# For time and complexity reasons only a minimal GAN skeleton is given here.
from tensorflow.keras.layers import Reshape


def build_generator():
    """Build the generator: a 100-dim input mapped to a tensor of shape
    (max_len, X_cnn.shape[2]) with sigmoid activations."""
    seq_len, n_channels = max_len, X_cnn.shape[2]
    return Sequential([
        Dense(128, activation='relu', input_dim=100),
        Dense(seq_len * n_channels, activation='sigmoid'),
        Reshape((seq_len, n_channels)),
    ])


def build_discriminator():
    """Build the discriminator: one Conv1D layer followed by a single
    sigmoid unit, taking input of shape (max_len, X_cnn.shape[2])."""
    seq_shape = (max_len, X_cnn.shape[2])
    return Sequential([
        Conv1D(64, 3, activation='relu', input_shape=seq_shape),
        Flatten(),
        Dense(1, activation='sigmoid'),
    ])


generator = build_generator()
discriminator = build_discriminator()

# Compile the discriminator first, then mark it non-trainable before it is
# wired into the combined model below.
discriminator.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
discriminator.trainable = False

# Combined model: input vector -> generator -> discriminator.
gan_input = tf.keras.Input(shape=(100,))
generated_smiles = generator(gan_input)
gan_output = discriminator(generated_smiles)
gan = tf.keras.Model(gan_input, gan_output)
gan.compile(optimizer='adam', loss='binary_crossentropy')

# Training a GAN takes a long time, so the training loop is not shown here.


## 6. 模型评估

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# (display name, true labels, predicted labels) for each model, in the order
# the reports are printed: RF, SVM, ANN, CNN.
evaluations = (
    ("Random Forest", y_test_fp, rf_predictions),
    ("SVM", y_test_fp, svm_predictions),
    ("ANN", y_test_desc, ann_predictions),
    ("CNN", y_test_cnn, cnn_predictions),
)

# Print the accuracy followed by the per-class report for every model.
for model_name, y_true, y_pred in evaluations:
    print(f"{model_name} Accuracy:", accuracy_score(y_true, y_pred))
    print(classification_report(y_true, y_pred))
