<a href="https://colab.research.google.com/github/hanxi898/Polymer-kaggle/blob/main/models_selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ==============================
# 1. 环境准备
# ==============================
!pip install rdkit-pypi scikit-learn xgboost joblib deepchem

import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import deepchem as dc
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

Collecting rdkit-pypi
  Downloading rdkit_pypi-2022.9.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Collecting deepchem
  Downloading deepchem-2.8.0-py3-none-any.whl.metadata (2.0 kB)
Collecting rdkit (from deepchem)
  Downloading rdkit-2025.3.5-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.1 kB)
Downloading rdkit_pypi-2022.9.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading deepchem-2.8.0-py3-none-any.whl (1.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m39.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rdkit-2025.3.5-cp311-cp311-manylinux_2_28_x86_64.whl (36.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.3/36.3 MB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit-pypi, rdkit, deepchem
Succ

Instructions for updating:
experimental_relax_shapes is deprecated, use reduce_retracing instead


In [2]:
# ==============================
# 2. 数据读取
# 数据格式要求：csv，至少包含 2 列： SMILES, Tg
# ==============================
df = pd.read_csv("/content/dataset3.csv")  # 例： "SMILES,Tg"
print(df.head())

                                              SMILES          Tg
0  *=Nc1ccc(N=C(C)Nc2ccc(-c3ccc(NC(=*)C)c(C(=O)O)...   89.380459
1   *C(=O)OC(=O)COc1ccc(OCC(=O)OC(=O)c2ccc(*)nc2)cc1  155.970957
2  *C(=O)c1ccc(C(=O)c2ccc(C=C3CCC(=Cc4ccc(*)cc4)C...  192.209684
3  *C=C(*)c1ccc(OCCCCCC(=O)Oc2c(F)c(F)c(F)c(F)c2F...   73.831985
4                     *C=CC1C=CC(*)c2ccc(CCCCCC)cc21    9.704073


In [3]:
# --- GraphConv 特征 ---
def featurize_graphconv(smiles_list):
    X = []
    try:
        import deepchem as dc
        featurizer = dc.feat.ConvMolFeaturizer()
        for smi in smiles_list:
            try:
                feat = featurizer.featurize([smi])[0]
                arr = feat.get_atom_features().mean(axis=0)
            except:
                arr = np.zeros(75)
            X.append(arr)
    except:
        print("请安装 deepchem 才能使用 GraphConv 特征")
        X = [np.zeros(75)]*len(smiles_list)
    return np.array(X)

# --- Morgan 指纹 ---
def featurize_morgan(smiles_list, radius=2, nBits=2048):
    X = []
    from rdkit import DataStructs
    for smi in smiles_list:
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            arr = np.zeros(nBits)
        else:
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits)
            arr = np.zeros(nBits, dtype=int)
            DataStructs.ConvertToNumpyArray(fp, arr)
        X.append(arr)
    return np.array(X)

# --- 分子描述符 ---
def featurize_desc(smiles_list):
    X = []
    for smi in smiles_list:
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            arr = np.zeros(len(Descriptors._descList))
        else:
            arr = np.array([d[1](mol) for d in Descriptors._descList], dtype=float)
        X.append(arr)
    return np.array(X)

# --- 混合特征 ---
def featurize_mixed(smiles_list):
    X_morgan = featurize_morgan(smiles_list)
    X_desc = featurize_desc(smiles_list)
    X_graph = featurize_graphconv(smiles_list)
    X_all = np.hstack([X_morgan, X_desc, X_graph])
    return X_all

# ================= 数据准备 =================
X = featurize_mixed(df["SMILES"].tolist())
y = df["Tg"].values

# 替换 NaN/Inf，并归一化
X = np.nan_to_num(X, nan=0.0, posinf=1e10, neginf=-1e10)
scaler = StandardScaler()
X = scaler.fit_transform(X).astype(np.float32)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)




接下来，尝试不同的深度学习模型（CNN、LSTM/RNN、Transformer)，观察他们的表现

用pytorch，首先需要将数据转为tensor格式

树模型 (RF/XGB) → 接受 [n_samples, n_features]。

深度学习模型 → 通常需要 [n_samples, seq_len, embedding_dim]

所以我们要 reshape 一下特征矩阵

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# 转换为 PyTorch Tensor
X_train_t = torch.tensor(X_train, dtype=torch.float32)
y_train_t = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
X_test_t = torch.tensor(X_test, dtype=torch.float32)
y_test_t = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)

train_ds = TensorDataset(X_train_t, y_train_t)
test_ds = TensorDataset(X_test_t, y_test_t)

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=64, shuffle=False)

In [5]:
# ============== 模型定义 ==============
# 1. CNN 模型
class CNNRegressor(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.conv1 = nn.Conv1d(1, 32, kernel_size=5, padding=2)
        self.conv2 = nn.Conv1d(32, 64, kernel_size=5, padding=2)
        self.fc1 = nn.Linear(64 * input_dim, 128)
        self.fc2 = nn.Linear(128, 1)

    def forward(self, x):
        x = x.unsqueeze(1)  # [batch, 1, features]
        x = torch.relu(self.conv1(x))
        x = torch.relu(self.conv2(x))
        x = x.view(x.size(0), -1)  # flatten
        x = torch.relu(self.fc1(x))
        return self.fc2(x)

In [6]:
# 2. LSTM 模型
class LSTMRegressor(nn.Module):
    def __init__(self, input_dim, hidden_dim=128, num_layers=2):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        x = x.unsqueeze(1)  # [batch, seq_len=1, features]
        _, (h, _) = self.lstm(x)
        return self.fc(h[-1])


In [7]:
# 3. Transformer 模型
class TransformerRegressor(nn.Module):
    def __init__(self, input_dim, hidden_dim=128, num_heads=4, num_layers=2):
        super().__init__()
        self.embedding = nn.Linear(input_dim, hidden_dim)
        encoder_layer = nn.TransformerEncoderLayer(d_model=hidden_dim,
                                                   nhead=num_heads,
                                                   batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        x = x.unsqueeze(1)  # [batch, seq_len=1, features]
        x = self.embedding(x)
        x = self.transformer(x)
        return self.fc(x[:, 0, :])  # 取第一个 token


In [8]:
# ============== 训练函数 ==============
def train_model(model, train_loader, test_loader, epochs=20, lr=1e-3):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            preds = model(xb)
            loss = criterion(preds, yb)
            loss.backward()
            optimizer.step()
            total_loss += loss.item() * xb.size(0)
        train_loss = total_loss / len(train_loader.dataset)

        # 验证
        model.eval()
        with torch.no_grad():
            preds, trues = [], []
            for xb, yb in test_loader:
                xb, yb = xb.to(device), yb.to(device)
                out = model(xb)
                preds.append(out.cpu())
                trues.append(yb.cpu())
            preds = torch.cat(preds).squeeze()
            trues = torch.cat(trues).squeeze()
            val_loss = criterion(preds, trues).item()

        print(f"Epoch {epoch+1}/{epochs} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

    return model

In [9]:
# ============== 选择并训练模型 ==============
input_dim = X_train.shape[1]

print("\n=== CNN ===")
cnn_model = train_model(CNNRegressor(input_dim), train_loader, test_loader)

print("\n=== LSTM ===")
lstm_model = train_model(LSTMRegressor(input_dim), train_loader, test_loader)

print("\n=== Transformer ===")
trans_model = train_model(TransformerRegressor(input_dim), train_loader, test_loader)


=== CNN ===
Epoch 1/20 | Train Loss: 28208.6484 | Val Loss: 23566.0059
Epoch 2/20 | Train Loss: 19178.4141 | Val Loss: 17106.1738
Epoch 3/20 | Train Loss: 11663.1533 | Val Loss: 15902.7715
Epoch 4/20 | Train Loss: 9224.5537 | Val Loss: 20594.5156
Epoch 5/20 | Train Loss: 13058.3691 | Val Loss: 19681.8242
Epoch 6/20 | Train Loss: 12099.4072 | Val Loss: 16663.0820
Epoch 7/20 | Train Loss: 9308.0508 | Val Loss: 14927.0000
Epoch 8/20 | Train Loss: 7845.7793 | Val Loss: 14761.1875
Epoch 9/20 | Train Loss: 7868.1519 | Val Loss: 15160.3232
Epoch 10/20 | Train Loss: 8305.5332 | Val Loss: 15420.0684
Epoch 11/20 | Train Loss: 8443.0586 | Val Loss: 15281.0752
Epoch 12/20 | Train Loss: 8051.6875 | Val Loss: 14773.9043
Epoch 13/20 | Train Loss: 7198.8901 | Val Loss: 14077.7471
Epoch 14/20 | Train Loss: 6108.3994 | Val Loss: 13452.8379
Epoch 15/20 | Train Loss: 5094.7637 | Val Loss: 13165.8535
Epoch 16/20 | Train Loss: 4476.9722 | Val Loss: 13218.4297
Epoch 17/20 | Train Loss: 4303.6162 | Val Loss:

In [None]:
总体对比：树模型 vs 深度学习模型

In [10]:
# ============== 树模型部分 ==============
def evaluate_sklearn_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    results = {
        "Train_MSE": mean_squared_error(y_train, y_pred_train),
        "Test_MSE": mean_squared_error(y_test, y_pred_test),
        "Test_MAE": mean_absolute_error(y_test, y_pred_test),
        "Test_R2": r2_score(y_test, y_pred_test),
    }
    return results

# RF
rf_model = RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1)
rf_results = evaluate_sklearn_model(rf_model, X_train, y_train, X_test, y_test)
print("\n=== Random Forest ===")
print(rf_results)

# XGB
xgb_model = XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)
xgb_results = evaluate_sklearn_model(xgb_model, X_train, y_train, X_test, y_test)
print("\n=== XGBoost ===")
print(xgb_results)


=== Random Forest ===
{'Train_MSE': 523.2371494907771, 'Test_MSE': 4059.7692541384217, 'Test_MAE': 55.23278569759348, 'Test_R2': 0.7324682141117189}

=== XGBoost ===
{'Train_MSE': 7.271185353415391e-06, 'Test_MSE': 3813.0952711721607, 'Test_MAE': 50.431835565761716, 'Test_R2': 0.7487236037814318}


In [11]:
# ============== 深度学习模型部分 (前面写好的 train_model 可复用) ==============
def evaluate_dl_model(model, train_loader, test_loader, epochs=20):
    model = train_model(model, train_loader, test_loader, epochs=epochs)

    # 最终评估
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    with torch.no_grad():
        preds, trues = [], []
        for xb, yb in test_loader:
            xb, yb = xb.to(device), yb.to(device)
            out = model(xb)
            preds.append(out.cpu())
            trues.append(yb.cpu())
        preds = torch.cat(preds).squeeze().numpy()
        trues = torch.cat(trues).squeeze().numpy()

    results = {
        "Test_MSE": mean_squared_error(trues, preds),
        "Test_MAE": mean_absolute_error(trues, preds),
        "Test_R2": r2_score(trues, preds),
    }
    return results

input_dim = X_train.shape[1]

print("\n=== CNN ===")
cnn_results = evaluate_dl_model(CNNRegressor(input_dim), train_loader, test_loader, epochs=20)
print(cnn_results)

print("\n=== LSTM ===")
lstm_results = evaluate_dl_model(LSTMRegressor(input_dim), train_loader, test_loader, epochs=20)
print(lstm_results)

print("\n=== Transformer ===")
trans_results = evaluate_dl_model(TransformerRegressor(input_dim), train_loader, test_loader, epochs=20)
print(trans_results)


=== CNN ===
Epoch 1/20 | Train Loss: 28213.3359 | Val Loss: 25434.6328
Epoch 2/20 | Train Loss: 21175.2910 | Val Loss: 19605.1289
Epoch 3/20 | Train Loss: 14462.0488 | Val Loss: 15652.6357
Epoch 4/20 | Train Loss: 9501.7422 | Val Loss: 17317.9570
Epoch 5/20 | Train Loss: 10207.4258 | Val Loss: 20031.8105
Epoch 6/20 | Train Loss: 12521.0771 | Val Loss: 18669.3418
Epoch 7/20 | Train Loss: 11092.2656 | Val Loss: 16296.4326
Epoch 8/20 | Train Loss: 8766.8438 | Val Loss: 14908.9639
Epoch 9/20 | Train Loss: 7463.6147 | Val Loss: 14646.3262
Epoch 10/20 | Train Loss: 7252.2344 | Val Loss: 14909.2441
Epoch 11/20 | Train Loss: 7478.8994 | Val Loss: 15162.1045
Epoch 12/20 | Train Loss: 7588.3540 | Val Loss: 15147.0332
Epoch 13/20 | Train Loss: 7330.9824 | Val Loss: 14819.3154
Epoch 14/20 | Train Loss: 6686.2480 | Val Loss: 14223.9355
Epoch 15/20 | Train Loss: 5733.6992 | Val Loss: 13563.6357
Epoch 16/20 | Train Loss: 4726.7554 | Val Loss: 13057.1768
Epoch 17/20 | Train Loss: 3947.6904 | Val Loss

In [12]:
# ============== 最终结果汇总 ==============
print("\n📊 模型对比结果")
all_results = {
    "RandomForest": rf_results,
    "XGBoost": xgb_results,
    "CNN": cnn_results,
    "LSTM": lstm_results,
    "Transformer": trans_results
}
for model, res in all_results.items():
    print(f"{model:12s} | Test_MSE: {res['Test_MSE']:.4f} | Test_MAE: {res['Test_MAE']:.4f} | Test_R2: {res['Test_R2']:.4f}")


📊 模型对比结果
RandomForest | Test_MSE: 4059.7693 | Test_MAE: 55.2328 | Test_R2: 0.7325
XGBoost      | Test_MSE: 3813.0953 | Test_MAE: 50.4318 | Test_R2: 0.7487
CNN          | Test_MSE: 11724.2910 | Test_MAE: 99.3856 | Test_R2: 0.2274
LSTM         | Test_MSE: 31632.9492 | Test_MAE: 128.4100 | Test_R2: -1.0846
Transformer  | Test_MSE: 29533.4121 | Test_MAE: 120.2857 | Test_R2: -0.9462


树模型（RF / XGB）明显更优

R² ≈ 0.73–0.75，说明能解释 ~73–75% 的方差。

MAE 在 50 左右，相当不错。

XGBoost 略优于 RF（更低的 MSE/MAE、更高的 R²）。

深度学习模型表现差

CNN 稍微有点学习能力（R²≈0.23），但远不如树模型。

LSTM / Transformer 严重欠拟合（R²<0，表示比“预测均值”还差）。

为什么深度学习效果差？

**数据量问题**：
深度学习需要大量样本（成千上万分子）才能超过树模型；如果你的数据集只有几百/几千个分子，树模型会更稳。

**特征维度问题**：
你现在的输入是 拼接后的手工特征 (Morgan+desc+GraphConv)。这些特征本身就是高层次、稀疏、非序列化的 → 树模型更擅长处理稀疏、离散的输入。
CNN/LSTM/Transformer其实并不适合直接吃这种“全拼接特征矩阵”。

模型设计问题：
目前的 CNN/LSTM/Transformer 都是 简单版本，相当于是“硬套”，没有针对分子数据的特殊结构做优化（不像 Graph Neural Network, ChemBERTa 这种专门为分子设计的模型）。

✅ 建议

**如果数据量不大（< 1w 分子） → 继续用 XGBoost / RF，**并做 超参数调优 (GridSearch/Optuna)。

如果想用深度学习 → 不要用 CNN/LSTM/Transformer 硬套拼接特征，可以尝试：

Graph Neural Networks (**GNN**)：GCN、GIN、MPNN 等，直接吃分子图。

Transformer for SMILES：比如 ChemBERTa, SMILES-BERT。

这样深度模型会比树模型更有优势。

混合方法：

**先用 GNN / Transformer 提取 embedding，再接一个 XGB 做回归，往往比纯 DL 或纯树模型更强。**