<a href="https://colab.research.google.com/gist/lisasimakova/d41d3630a6b5a1313431a91dae94c3d8/mlp-cnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Обоснование выбора признаков, модели и параметров

В качестве признаков использованы дескрипторы Mordred.

Для предсказания pIC50 выбрана нейронная сеть (MLP). Эта архитектура подходит для работы с числовыми признаками фиксированной размерности и может выявлять нелинейные зависимости между дескрипторами и целевой переменной. Используется два скрытых слоя с ReLU-активацией и Dropout для борьбы с переобучением.

Модель обучается с ранней остановкой по валидационной ошибке, что позволяет предотвратить переобучение без необходимости вручную подбирать число эпох. Используется Adam-оптимизатор и MSELoss как функция ошибки, стандартные для регрессии.

Для оценки обобщающей способности применяется 5-кратная кросс-валидация, после чего модель дообучается на всём train+val и тестируется на отложенной выборке.

In [1]:
import os, random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

df = pd.read_csv('/content/Mordred_nonagg_pca_processed.csv')
X = df.drop(columns=['pIC50']).values.astype(np.float32)
y = df['pIC50'].values.astype(np.float32)

scaler = StandardScaler()
X = scaler.fit_transform(X)

X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

class MLP(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )
    def forward(self, x):
        return self.model(x)

def train_with_early_stopping(model, train_loader, val_loader=None, patience=10, max_epochs=100):
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    best_loss = float('inf')
    best_model_wts = None
    patience_counter = 0

    for epoch in range(max_epochs):
        model.train()
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            pred = model(xb)
            loss = criterion(pred, yb)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if val_loader is not None:
            model.eval()
            val_losses = []
            with torch.no_grad():
                for xb, yb in val_loader:
                    xb, yb = xb.to(device), yb.to(device)
                    pred = model(xb)
                    loss = criterion(pred, yb)
                    val_losses.append(loss.item())
            val_loss = np.mean(val_losses)
            print(f"Epoch {epoch+1}: val_loss={val_loss:.4f}")

            if val_loss < best_loss:
                best_loss = val_loss
                best_model_wts = model.state_dict()
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    print("Early stopping triggered.")
                    break
        else:
            # если нет валидации, просто сохраняем последнюю модель
            best_model_wts = model.state_dict()

    if best_model_wts:
        model.load_state_dict(best_model_wts)
    return model

def train_model(X_trainval, y_trainval):
    kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
    mae_scores, rmse_scores, r2_scores = [], [], []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X_trainval), 1):
        print(f"\n--- Fold {fold} ---")
        X_train, X_val = X_trainval[train_idx], X_trainval[val_idx]
        y_train, y_val = y_trainval[train_idx], y_trainval[val_idx]

        X_train_t = torch.from_numpy(X_train).float()
        y_train_t = torch.from_numpy(y_train).float().unsqueeze(1)
        X_val_t = torch.from_numpy(X_val).float()
        y_val_t = torch.from_numpy(y_val).float().unsqueeze(1)

        train_loader = DataLoader(TensorDataset(X_train_t, y_train_t), batch_size=32, shuffle=True)
        val_loader   = DataLoader(TensorDataset(X_val_t, y_val_t), batch_size=32, shuffle=False)

        model = MLP(input_dim=X_train.shape[1]).to(device)
        model = train_with_early_stopping(model, train_loader, val_loader)

        model.eval()
        with torch.no_grad():
            preds = model(X_val_t.to(device)).squeeze().cpu().numpy()

        mae = mean_absolute_error(y_val, preds)
        rmse = np.sqrt(mean_squared_error(y_val, preds))
        r2 = r2_score(y_val, preds)
        print(f"MAE : {mae:.4f}, RMSE: {rmse:.4f}, R²: {r2:.4f}")

        mae_scores.append(mae)
        rmse_scores.append(rmse)
        r2_scores.append(r2)

    return mae_scores, rmse_scores, r2_scores

mae_scores, rmse_scores, r2_scores = train_model(X_trainval, y_trainval)

print("\n=== Метрики на кросс-валидации ===")
print(f"MAE : {np.mean(mae_scores):.4f} ± {np.std(mae_scores):.4f}")
print(f"RMSE: {np.mean(rmse_scores):.4f} ± {np.std(rmse_scores):.4f}")
print(f"R²  : {np.mean(r2_scores):.4f} ± {np.std(r2_scores):.4f}")

print("\n=== Финальное обучение на всём train+val ===")
X_trainval_t = torch.from_numpy(X_trainval).float()
y_trainval_t = torch.from_numpy(y_trainval).float().unsqueeze(1)
trainval_loader = DataLoader(TensorDataset(X_trainval_t, y_trainval_t), batch_size=32, shuffle=True)

final_model = MLP(input_dim=X_trainval.shape[1]).to(device)
final_model = train_with_early_stopping(final_model, trainval_loader, val_loader=None)

# сохраняем финальную модель
model_path = 'final_mlp_model.pth'
torch.save(final_model.state_dict(), model_path)
print(f"Final model saved to {model_path}")

# загрузка модели из файла и тест
print("\n=== Загрузка модели из файла и тестирование ===")
loaded_model = MLP(input_dim=X_trainval.shape[1]).to(device)
loaded_model.load_state_dict(torch.load(model_path))
loaded_model.eval()

X_test_t = torch.from_numpy(X_test).float().to(device)
with torch.no_grad():
    y_test_pred = loaded_model(X_test_t).squeeze().cpu().numpy()

print(f"Test MAE : {mean_absolute_error(y_test, y_test_pred):.4f}")
print(f"Test RMSE: {np.sqrt(mean_squared_error(y_test, y_test_pred)):.4f}")
print(f"Test R²  : {r2_score(y_test, y_test_pred):.4f}")

Using device: cuda

--- Fold 1 ---
Epoch 1: val_loss=2.7134
Epoch 2: val_loss=1.7722
Epoch 3: val_loss=1.5445
Epoch 4: val_loss=1.5410
Epoch 5: val_loss=1.3890
Epoch 6: val_loss=1.3647
Epoch 7: val_loss=1.3131
Epoch 8: val_loss=1.2601
Epoch 9: val_loss=1.2245
Epoch 10: val_loss=1.1919
Epoch 11: val_loss=1.1661
Epoch 12: val_loss=1.1504
Epoch 13: val_loss=1.2299
Epoch 14: val_loss=1.1238
Epoch 15: val_loss=1.0973
Epoch 16: val_loss=1.1152
Epoch 17: val_loss=1.1330
Epoch 18: val_loss=1.0765
Epoch 19: val_loss=1.0426
Epoch 20: val_loss=1.0907
Epoch 21: val_loss=1.0655
Epoch 22: val_loss=1.1237
Epoch 23: val_loss=1.0530
Epoch 24: val_loss=1.1237
Epoch 25: val_loss=1.0872
Epoch 26: val_loss=1.0750
Epoch 27: val_loss=1.0832
Epoch 28: val_loss=1.0563
Epoch 29: val_loss=1.0315
Epoch 30: val_loss=1.1221
Epoch 31: val_loss=1.1340
Epoch 32: val_loss=1.1890
Epoch 33: val_loss=1.0785
Epoch 34: val_loss=1.0374
Epoch 35: val_loss=1.1646
Epoch 36: val_loss=1.0505
Epoch 37: val_loss=1.0790
Epoch 38: va

Обоснование выбора признаков, модели и параметров

В качестве признаков использованы дескрипторы Mordred.

Для предсказания pIC50 выбрана модель глубокой нейросети (MLP) с тремя скрытыми слоями размерностью 256, 128 и 64 нейрона. Используются LeakyReLU-активации, которые помогают избежать затухающего градиента, и Dropout (0.3) для регуляризации и борьбы с переобучением.

Обучение проводится с ранней остановкой (patience = 15, max_epochs = 200) и оптимизатором Adam с пониженным шагом обучения (lr=0.0005) — это позволяет модели сходиться стабильнее и не перепрыгивать минимум.

Для оценки используется 5-кратная кросс-валидация. После подбора и обучения модель переобучается на всех обучающих данных и тестируется на отложенной выборке.

In [2]:
class MLP(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.LeakyReLU(0.1),
            nn.Linear(64, 1)
        )
    def forward(self, x):
        return self.model(x)

def train_with_early_stopping(model, train_loader, val_loader=None, patience=15, max_epochs=200):
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
    best_loss = float('inf')
    best_model_wts = None
    patience_counter = 0

    for epoch in range(max_epochs):
        model.train()
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            pred = model(xb)
            loss = criterion(pred, yb)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if val_loader is not None:
            model.eval()
            val_losses = []
            with torch.no_grad():
                for xb, yb in val_loader:
                    xb, yb = xb.to(device), yb.to(device)
                    pred = model(xb)
                    loss = criterion(pred, yb)
                    val_losses.append(loss.item())
            val_loss = np.mean(val_losses)
            print(f"Epoch {epoch+1}: val_loss={val_loss:.4f}")

            if val_loss < best_loss:
                best_loss = val_loss
                best_model_wts = model.state_dict()
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    print("Early stopping triggered.")
                    break
        else:
            best_model_wts = model.state_dict()

    if best_model_wts:
        model.load_state_dict(best_model_wts)
    return model

def train_model(X_trainval, y_trainval):
    kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
    mae_scores, rmse_scores, r2_scores = [], [], []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X_trainval), 1):
        print(f"\n--- Fold {fold} ---")
        X_train, X_val = X_trainval[train_idx], X_trainval[val_idx]
        y_train, y_val = y_trainval[train_idx], y_trainval[val_idx]

        X_train_t = torch.from_numpy(X_train).float()
        y_train_t = torch.from_numpy(y_train).float().unsqueeze(1)
        X_val_t = torch.from_numpy(X_val).float()
        y_val_t = torch.from_numpy(y_val).float().unsqueeze(1)

        train_loader = DataLoader(TensorDataset(X_train_t, y_train_t), batch_size=64, shuffle=True)
        val_loader   = DataLoader(TensorDataset(X_val_t, y_val_t), batch_size=64, shuffle=False)

        model = MLP(input_dim=X_train.shape[1]).to(device)
        model = train_with_early_stopping(model, train_loader, val_loader)

        model.eval()
        with torch.no_grad():
            preds = model(X_val_t.to(device)).squeeze().cpu().numpy()

        mae = mean_absolute_error(y_val, preds)
        rmse = np.sqrt(mean_squared_error(y_val, preds))
        r2 = r2_score(y_val, preds)
        print(f"MAE : {mae:.4f}, RMSE: {rmse:.4f}, R²: {r2:.4f}")

        mae_scores.append(mae)
        rmse_scores.append(rmse)
        r2_scores.append(r2)

    return mae_scores, rmse_scores, r2_scores
mae_scores, rmse_scores, r2_scores = train_model(X_trainval, y_trainval)

print("\n=== Метрики на кросс-валидации ===")
print(f"MAE : {np.mean(mae_scores):.4f} ± {np.std(mae_scores):.4f}")
print(f"RMSE: {np.mean(rmse_scores):.4f} ± {np.std(rmse_scores):.4f}")
print(f"R²  : {np.mean(r2_scores):.4f} ± {np.std(r2_scores):.4f}")

print("\n=== Финальное обучение на всём train+val ===")
X_trainval_t = torch.from_numpy(X_trainval).float()
y_trainval_t = torch.from_numpy(y_trainval).float().unsqueeze(1)
trainval_loader = DataLoader(TensorDataset(X_trainval_t, y_trainval_t), batch_size=32, shuffle=True)

final_model = MLP(input_dim=X_trainval.shape[1]).to(device)
final_model = train_with_early_stopping(final_model, trainval_loader, val_loader=None)

# сохраняем финальную модель
model_path = 'final_mlp_model.pth'
torch.save(final_model.state_dict(), model_path)
print(f"Final model saved to {model_path}")

# загрузка модели из файла и тест
print("\n=== Загрузка модели из файла и тестирование ===")
loaded_model = MLP(input_dim=X_trainval.shape[1]).to(device)
loaded_model.load_state_dict(torch.load(model_path))
loaded_model.eval()

X_test_t = torch.from_numpy(X_test).float().to(device)
with torch.no_grad():
    y_test_pred = loaded_model(X_test_t).squeeze().cpu().numpy()

print(f"Test MAE : {mean_absolute_error(y_test, y_test_pred):.4f}")
print(f"Test RMSE: {np.sqrt(mean_squared_error(y_test, y_test_pred)):.4f}")
print(f"Test R²  : {r2_score(y_test, y_test_pred):.4f}")



--- Fold 1 ---
Epoch 1: val_loss=6.2429
Epoch 2: val_loss=4.0710
Epoch 3: val_loss=3.0160
Epoch 4: val_loss=2.7243
Epoch 5: val_loss=2.5666
Epoch 6: val_loss=2.4386
Epoch 7: val_loss=2.3777
Epoch 8: val_loss=2.3001
Epoch 9: val_loss=2.3259
Epoch 10: val_loss=2.2793
Epoch 11: val_loss=2.2531
Epoch 12: val_loss=2.2145
Epoch 13: val_loss=2.1238
Epoch 14: val_loss=2.1562
Epoch 15: val_loss=2.1207
Epoch 16: val_loss=2.0943
Epoch 17: val_loss=2.0447
Epoch 18: val_loss=2.0842
Epoch 19: val_loss=2.0257
Epoch 20: val_loss=1.9722
Epoch 21: val_loss=2.0247
Epoch 22: val_loss=1.9841
Epoch 23: val_loss=1.8994
Epoch 24: val_loss=1.9525
Epoch 25: val_loss=1.9193
Epoch 26: val_loss=1.9439
Epoch 27: val_loss=1.8228
Epoch 28: val_loss=1.7876
Epoch 29: val_loss=1.8049
Epoch 30: val_loss=1.8979
Epoch 31: val_loss=1.8015
Epoch 32: val_loss=1.7978
Epoch 33: val_loss=1.8398
Epoch 34: val_loss=1.6870
Epoch 35: val_loss=1.6704
Epoch 36: val_loss=1.6651
Epoch 37: val_loss=1.7259
Epoch 38: val_loss=1.5961
Epoch

Та же модель и параметры но на дескрипторах RDKit

На другом датасете rdkit. Ниже два кода с двумя комбинациями параметров.

In [3]:
import os, random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

# reproducibility
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

df = pd.read_csv('/content/RDKit_nonagg_pca_processed.csv')
X = df.drop(columns=['pIC50']).values.astype(np.float32)
y = df['pIC50'].values.astype(np.float32)

scaler = StandardScaler()
X = scaler.fit_transform(X)

X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

class MLP(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )
    def forward(self, x):
        return self.model(x)

def train_with_early_stopping(model, train_loader, val_loader=None, patience=10, max_epochs=100):
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    best_loss = float('inf')
    best_model_wts = None
    patience_counter = 0

    for epoch in range(max_epochs):
        model.train()
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            pred = model(xb)
            loss = criterion(pred, yb)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if val_loader is not None:
            model.eval()
            val_losses = []
            with torch.no_grad():
                for xb, yb in val_loader:
                    xb, yb = xb.to(device), yb.to(device)
                    pred = model(xb)
                    loss = criterion(pred, yb)
                    val_losses.append(loss.item())
            val_loss = np.mean(val_losses)
            print(f"Epoch {epoch+1}: val_loss={val_loss:.4f}")

            if val_loss < best_loss:
                best_loss = val_loss
                best_model_wts = model.state_dict()
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    print("Early stopping triggered.")
                    break
        else:
            # если нет валидации, просто сохраняем последнюю модель
            best_model_wts = model.state_dict()

    if best_model_wts:
        model.load_state_dict(best_model_wts)
    return model

def train_model(X_trainval, y_trainval):
    kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
    mae_scores, rmse_scores, r2_scores = [], [], []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X_trainval), 1):
        print(f"\n--- Fold {fold} ---")
        X_train, X_val = X_trainval[train_idx], X_trainval[val_idx]
        y_train, y_val = y_trainval[train_idx], y_trainval[val_idx]

        X_train_t = torch.from_numpy(X_train).float()
        y_train_t = torch.from_numpy(y_train).float().unsqueeze(1)
        X_val_t = torch.from_numpy(X_val).float()
        y_val_t = torch.from_numpy(y_val).float().unsqueeze(1)

        train_loader = DataLoader(TensorDataset(X_train_t, y_train_t), batch_size=32, shuffle=True)
        val_loader   = DataLoader(TensorDataset(X_val_t, y_val_t), batch_size=32, shuffle=False)

        model = MLP(input_dim=X_train.shape[1]).to(device)
        model = train_with_early_stopping(model, train_loader, val_loader)

        model.eval()
        with torch.no_grad():
            preds = model(X_val_t.to(device)).squeeze().cpu().numpy()

        mae = mean_absolute_error(y_val, preds)
        rmse = np.sqrt(mean_squared_error(y_val, preds))
        r2 = r2_score(y_val, preds)
        print(f"MAE : {mae:.4f}, RMSE: {rmse:.4f}, R²: {r2:.4f}")

        mae_scores.append(mae)
        rmse_scores.append(rmse)
        r2_scores.append(r2)

    return mae_scores, rmse_scores, r2_scores

mae_scores, rmse_scores, r2_scores = train_model(X_trainval, y_trainval)

print("\n=== Метрики на кросс-валидации ===")
print(f"MAE : {np.mean(mae_scores):.4f} ± {np.std(mae_scores):.4f}")
print(f"RMSE: {np.mean(rmse_scores):.4f} ± {np.std(rmse_scores):.4f}")
print(f"R²  : {np.mean(r2_scores):.4f} ± {np.std(r2_scores):.4f}")

print("\n=== Финальное обучение на всём train+val ===")
X_trainval_t = torch.from_numpy(X_trainval).float()
y_trainval_t = torch.from_numpy(y_trainval).float().unsqueeze(1)
trainval_loader = DataLoader(TensorDataset(X_trainval_t, y_trainval_t), batch_size=32, shuffle=True)

final_model = MLP(input_dim=X_trainval.shape[1]).to(device)
final_model = train_with_early_stopping(final_model, trainval_loader, val_loader=None)

# сохраняем финальную модель
model_path = 'final_mlp_model.pth'
torch.save(final_model.state_dict(), model_path)
print(f"Final model saved to {model_path}")

# загрузка модели из файла и тест
print("\n=== Загрузка модели из файла и тестирование ===")
loaded_model = MLP(input_dim=X_trainval.shape[1]).to(device)
loaded_model.load_state_dict(torch.load(model_path))
loaded_model.eval()

X_test_t = torch.from_numpy(X_test).float().to(device)
with torch.no_grad():
    y_test_pred = loaded_model(X_test_t).squeeze().cpu().numpy()

print(f"Test MAE : {mean_absolute_error(y_test, y_test_pred):.4f}")
print(f"Test RMSE: {np.sqrt(mean_squared_error(y_test, y_test_pred)):.4f}")
print(f"Test R²  : {r2_score(y_test, y_test_pred):.4f}")

Using device: cuda

--- Fold 1 ---
Epoch 1: val_loss=3.5534
Epoch 2: val_loss=2.9390
Epoch 3: val_loss=2.6685
Epoch 4: val_loss=2.5458
Epoch 5: val_loss=2.5130
Epoch 6: val_loss=2.4303
Epoch 7: val_loss=2.3991
Epoch 8: val_loss=2.4348
Epoch 9: val_loss=2.4415
Epoch 10: val_loss=2.3084
Epoch 11: val_loss=2.2556
Epoch 12: val_loss=2.1886
Epoch 13: val_loss=2.2903
Epoch 14: val_loss=2.3667
Epoch 15: val_loss=2.1256
Epoch 16: val_loss=2.1702
Epoch 17: val_loss=2.2289
Epoch 18: val_loss=2.1823
Epoch 19: val_loss=2.1711
Epoch 20: val_loss=2.2563
Epoch 21: val_loss=2.1966
Epoch 22: val_loss=2.1325
Epoch 23: val_loss=2.2420
Epoch 24: val_loss=2.1634
Epoch 25: val_loss=2.1350
Early stopping triggered.
MAE : 0.8320, RMSE: 1.4617, R²: -0.3883

--- Fold 2 ---
Epoch 1: val_loss=2.6962
Epoch 2: val_loss=1.8456
Epoch 3: val_loss=1.6169
Epoch 4: val_loss=1.5147
Epoch 5: val_loss=1.4401
Epoch 6: val_loss=1.3969
Epoch 7: val_loss=1.3483
Epoch 8: val_loss=1.3082
Epoch 9: val_loss=1.3237
Epoch 10: val_los

In [4]:
class MLP(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.LeakyReLU(0.1),
            nn.Linear(64, 1)
        )
    def forward(self, x):
        return self.model(x)

def train_with_early_stopping(model, train_loader, val_loader=None, patience=15, max_epochs=200):
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
    best_loss = float('inf')
    best_model_wts = None
    patience_counter = 0

    for epoch in range(max_epochs):
        model.train()
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            pred = model(xb)
            loss = criterion(pred, yb)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if val_loader is not None:
            model.eval()
            val_losses = []
            with torch.no_grad():
                for xb, yb in val_loader:
                    xb, yb = xb.to(device), yb.to(device)
                    pred = model(xb)
                    loss = criterion(pred, yb)
                    val_losses.append(loss.item())
            val_loss = np.mean(val_losses)
            print(f"Epoch {epoch+1}: val_loss={val_loss:.4f}")

            if val_loss < best_loss:
                best_loss = val_loss
                best_model_wts = model.state_dict()
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    print("Early stopping triggered.")
                    break
        else:
            best_model_wts = model.state_dict()

    if best_model_wts:
        model.load_state_dict(best_model_wts)
    return model

def train_model(X_trainval, y_trainval):
    kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
    mae_scores, rmse_scores, r2_scores = [], [], []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X_trainval), 1):
        print(f"\n--- Fold {fold} ---")
        X_train, X_val = X_trainval[train_idx], X_trainval[val_idx]
        y_train, y_val = y_trainval[train_idx], y_trainval[val_idx]

        X_train_t = torch.from_numpy(X_train).float()
        y_train_t = torch.from_numpy(y_train).float().unsqueeze(1)
        X_val_t = torch.from_numpy(X_val).float()
        y_val_t = torch.from_numpy(y_val).float().unsqueeze(1)

        train_loader = DataLoader(TensorDataset(X_train_t, y_train_t), batch_size=64, shuffle=True)
        val_loader   = DataLoader(TensorDataset(X_val_t, y_val_t), batch_size=64, shuffle=False)

        model = MLP(input_dim=X_train.shape[1]).to(device)
        model = train_with_early_stopping(model, train_loader, val_loader)

        model.eval()
        with torch.no_grad():
            preds = model(X_val_t.to(device)).squeeze().cpu().numpy()

        mae = mean_absolute_error(y_val, preds)
        rmse = np.sqrt(mean_squared_error(y_val, preds))
        r2 = r2_score(y_val, preds)
        print(f"MAE : {mae:.4f}, RMSE: {rmse:.4f}, R²: {r2:.4f}")

        mae_scores.append(mae)
        rmse_scores.append(rmse)
        r2_scores.append(r2)

    return mae_scores, rmse_scores, r2_scores
mae_scores, rmse_scores, r2_scores = train_model(X_trainval, y_trainval)

print("\n=== Метрики на кросс-валидации ===")
print(f"MAE : {np.mean(mae_scores):.4f} ± {np.std(mae_scores):.4f}")
print(f"RMSE: {np.mean(rmse_scores):.4f} ± {np.std(rmse_scores):.4f}")
print(f"R²  : {np.mean(r2_scores):.4f} ± {np.std(r2_scores):.4f}")

print("\n=== Финальное обучение на всём train+val ===")
X_trainval_t = torch.from_numpy(X_trainval).float()
y_trainval_t = torch.from_numpy(y_trainval).float().unsqueeze(1)
trainval_loader = DataLoader(TensorDataset(X_trainval_t, y_trainval_t), batch_size=32, shuffle=True)

final_model = MLP(input_dim=X_trainval.shape[1]).to(device)
final_model = train_with_early_stopping(final_model, trainval_loader, val_loader=None)

# сохраняем финальную модель
model_path = 'final_mlp_model.pth'
torch.save(final_model.state_dict(), model_path)
print(f"Final model saved to {model_path}")

# загрузка модели из файла и тест
print("\n=== Загрузка модели из файла и тестирование ===")
loaded_model = MLP(input_dim=X_trainval.shape[1]).to(device)
loaded_model.load_state_dict(torch.load(model_path))
loaded_model.eval()

X_test_t = torch.from_numpy(X_test).float().to(device)
with torch.no_grad():
    y_test_pred = loaded_model(X_test_t).squeeze().cpu().numpy()

print(f"Test MAE : {mean_absolute_error(y_test, y_test_pred):.4f}")
print(f"Test RMSE: {np.sqrt(mean_squared_error(y_test, y_test_pred)):.4f}")
print(f"Test R²  : {r2_score(y_test, y_test_pred):.4f}")



--- Fold 1 ---
Epoch 1: val_loss=7.0923
Epoch 2: val_loss=3.5996
Epoch 3: val_loss=2.6609
Epoch 4: val_loss=2.4363
Epoch 5: val_loss=2.2923
Epoch 6: val_loss=2.2291
Epoch 7: val_loss=2.2986
Epoch 8: val_loss=2.1039
Epoch 9: val_loss=2.0284
Epoch 10: val_loss=1.9729
Epoch 11: val_loss=1.9097
Epoch 12: val_loss=1.8637
Epoch 13: val_loss=1.8075
Epoch 14: val_loss=1.8024
Epoch 15: val_loss=1.7879
Epoch 16: val_loss=1.7695
Epoch 17: val_loss=1.7094
Epoch 18: val_loss=1.7118
Epoch 19: val_loss=1.6742
Epoch 20: val_loss=1.6710
Epoch 21: val_loss=1.6703
Epoch 22: val_loss=1.6578
Epoch 23: val_loss=1.7170
Epoch 24: val_loss=1.6326
Epoch 25: val_loss=1.7184
Epoch 26: val_loss=1.6157
Epoch 27: val_loss=1.5404
Epoch 28: val_loss=1.5462
Epoch 29: val_loss=1.5406
Epoch 30: val_loss=1.5270
Epoch 31: val_loss=1.5441
Epoch 32: val_loss=1.4796
Epoch 33: val_loss=1.4780
Epoch 34: val_loss=1.4998
Epoch 35: val_loss=1.5139
Epoch 36: val_loss=1.4890
Epoch 37: val_loss=1.5076
Epoch 38: val_loss=1.4949
Epoch

Обоснование выбора признаков, модели и параметров


В качестве входных признаков используются дескрипторы Mordred.

В качестве модели выбрана одномерная сверточная нейросеть (1D-CNN), способная выявлять локальные паттерны в последовательности признаков. Используется Conv1d с ReLU, MaxPool1d, AdaptiveMaxPool1d и Dropout (0.3–0.4).

Модель обучается с помощью оптимизатора Adam (lr=0.001) и ранней остановки (patience=10), что позволяет избежать переобучения. Используется 5-кратная кросс-валидация для устойчивой оценки качества. Финальная модель переобучается на всём train+val и сохраняется для тестирования.


In [5]:
import os, random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

# reproducibility
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# === Загрузка данных ===
df = pd.read_csv('/content/Mordred_nonagg_pca_processed.csv')
X = df.drop(columns=['pIC50']).values.astype(np.float32)
y = df['pIC50'].values.astype(np.float32)

scaler = StandardScaler()
X = scaler.fit_transform(X)

X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# === CNN-модель ===
class CNN1D(nn.Module):
    def __init__(self, input_len, out_channels=16, kernel_size=3, dropout=0.3):
        super().__init__()
        self.model = nn.Sequential(
            nn.Conv1d(1, out_channels, kernel_size=kernel_size),
            nn.ReLU(),
            nn.MaxPool1d(2),
            nn.Conv1d(out_channels, out_channels * 2, kernel_size=kernel_size),
            nn.ReLU(),
            nn.AdaptiveMaxPool1d(1),
            nn.Flatten(),
            nn.Dropout(dropout),
            nn.Linear(out_channels * 2, 1)
        )

    def forward(self, x):
        x = x.unsqueeze(1)  # -> (batch_size, 1, input_len)
        return self.model(x)

# === Обучение с ранней остановкой ===
def train_with_early_stopping(model, train_loader, val_loader=None, patience=10, max_epochs=100):
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    best_loss = float('inf')
    best_model_wts = None
    patience_counter = 0

    for epoch in range(max_epochs):
        model.train()
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            pred = model(xb)
            loss = criterion(pred, yb)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if val_loader is not None:
            model.eval()
            val_losses = []
            with torch.no_grad():
                for xb, yb in val_loader:
                    xb, yb = xb.to(device), yb.to(device)
                    pred = model(xb)
                    loss = criterion(pred, yb)
                    val_losses.append(loss.item())
            val_loss = np.mean(val_losses)
            print(f"Epoch {epoch+1}: val_loss={val_loss:.4f}")

            if val_loss < best_loss:
                best_loss = val_loss
                best_model_wts = model.state_dict()
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    print("Early stopping triggered.")
                    break
        else:
            best_model_wts = model.state_dict()

    if best_model_wts:
        model.load_state_dict(best_model_wts)
    return model

# === Кросс-валидация ===
def train_model(X_trainval, y_trainval):
    kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
    mae_scores, rmse_scores, r2_scores = [], [], []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X_trainval), 1):
        print(f"\n--- Fold {fold} ---")
        X_train, X_val = X_trainval[train_idx], X_trainval[val_idx]
        y_train, y_val = y_trainval[train_idx], y_trainval[val_idx]

        X_train_t = torch.from_numpy(X_train).float()
        y_train_t = torch.from_numpy(y_train).float().unsqueeze(1)
        X_val_t = torch.from_numpy(X_val).float()
        y_val_t = torch.from_numpy(y_val).float().unsqueeze(1)

        train_loader = DataLoader(TensorDataset(X_train_t, y_train_t), batch_size=32, shuffle=True)
        val_loader   = DataLoader(TensorDataset(X_val_t, y_val_t), batch_size=32, shuffle=False)

        model = CNN1D(input_len=X_train.shape[1], out_channels=64, kernel_size=7, dropout=0.4).to(device)

        model = train_with_early_stopping(model, train_loader, val_loader)

        model.eval()
        with torch.no_grad():
            preds = model(X_val_t.to(device)).squeeze().cpu().numpy()

        mae = mean_absolute_error(y_val, preds)
        rmse = np.sqrt(mean_squared_error(y_val, preds))
        r2 = r2_score(y_val, preds)
        print(f"MAE : {mae:.4f}, RMSE: {rmse:.4f}, R²: {r2:.4f}")

        mae_scores.append(mae)
        rmse_scores.append(rmse)
        r2_scores.append(r2)

    return mae_scores, rmse_scores, r2_scores

mae_scores, rmse_scores, r2_scores = train_model(X_trainval, y_trainval)

print("\n=== Метрики на кросс-валидации ===")
print(f"MAE : {np.mean(mae_scores):.4f} ± {np.std(mae_scores):.4f}")
print(f"RMSE: {np.mean(rmse_scores):.4f} ± {np.std(rmse_scores):.4f}")
print(f"R²  : {np.mean(r2_scores):.4f} ± {np.std(r2_scores):.4f}")

# === Финальное обучение ===
print("\n=== Финальное обучение на train+val ===")
X_trainval_t = torch.from_numpy(X_trainval).float()
y_trainval_t = torch.from_numpy(y_trainval).float().unsqueeze(1)
trainval_loader = DataLoader(TensorDataset(X_trainval_t, y_trainval_t), batch_size=32, shuffle=True)

final_model = CNN1D(input_len=X_trainval.shape[1], out_channels=32, kernel_size=5, dropout=0.2).to(device)
final_model = train_with_early_stopping(final_model, trainval_loader, val_loader=None)

# === Сохранение модели ===
model_path = 'final_cnn_model.pth'
torch.save(final_model.state_dict(), model_path)
print(f"Final model saved to {model_path}")

# === Тестирование ===
print("\n=== Загрузка модели и тестирование ===")
loaded_model = CNN1D(input_len=X_trainval.shape[1], out_channels=32, kernel_size=5, dropout=0.2).to(device)
loaded_model.load_state_dict(torch.load(model_path))
loaded_model.eval()

X_test_t = torch.from_numpy(X_test).float().to(device)
with torch.no_grad():
    y_test_pred = loaded_model(X_test_t).squeeze().cpu().numpy()

print(f"Test MAE : {mean_absolute_error(y_test, y_test_pred):.4f}")
print(f"Test RMSE: {np.sqrt(mean_squared_error(y_test, y_test_pred)):.4f}")
print(f"Test R²  : {r2_score(y_test, y_test_pred):.4f}")


Using device: cuda

--- Fold 1 ---
Epoch 1: val_loss=8.1758
Epoch 2: val_loss=4.1332
Epoch 3: val_loss=2.5221
Epoch 4: val_loss=1.7465
Epoch 5: val_loss=1.5759
Epoch 6: val_loss=1.5190
Epoch 7: val_loss=1.5173
Epoch 8: val_loss=1.4837
Epoch 9: val_loss=1.4778
Epoch 10: val_loss=1.4790
Epoch 11: val_loss=1.4492
Epoch 12: val_loss=1.4450
Epoch 13: val_loss=1.4543
Epoch 14: val_loss=1.4242
Epoch 15: val_loss=1.4700
Epoch 16: val_loss=1.4104
Epoch 17: val_loss=1.4156
Epoch 18: val_loss=1.3945
Epoch 19: val_loss=1.4013
Epoch 20: val_loss=1.3996
Epoch 21: val_loss=1.3957
Epoch 22: val_loss=1.3776
Epoch 23: val_loss=1.4829
Epoch 24: val_loss=1.3648
Epoch 25: val_loss=1.3598
Epoch 26: val_loss=1.3512
Epoch 27: val_loss=1.3662
Epoch 28: val_loss=1.3420
Epoch 29: val_loss=1.3479
Epoch 30: val_loss=1.3865
Epoch 31: val_loss=1.3335
Epoch 32: val_loss=1.3680
Epoch 33: val_loss=1.3231
Epoch 34: val_loss=1.3104
Epoch 35: val_loss=1.3064
Epoch 36: val_loss=1.3062
Epoch 37: val_loss=1.3254
Epoch 38: va

Обоснование выбора признаков, модели и параметров

В качестве входных признаков используются дескрипторы Mordred, прошедшие стандартизацию через StandardScaler.

В качестве модели выбрана одномерная сверточная нейросеть (1D-CNN), способная выявлять локальные зависимости между признаками. Архитектура включает два слоя Conv1d с ReLU, MaxPool1d и AdaptiveMaxPool1d, завершающихся полносвязным выходом. Используется Dropout (0.3–0.4) для регуляризации.

Обучение производится с помощью оптимизатора Adam (lr=0.001) и ранней остановки (patience=10) для предотвращения переобучения. Качество модели оценивается по 5-кратной кросс-валидации. После этого модель дообучается на всех обучающих данных и сохраняется для тестирования.

In [6]:
import os, random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

# reproducibility
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# === Загрузка данных ===
df = pd.read_csv('/content/Mordred_nonagg_pca_processed.csv')
X = df.drop(columns=['pIC50']).values.astype(np.float32)
y = df['pIC50'].values.astype(np.float32)

scaler = StandardScaler()
X = scaler.fit_transform(X)

X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# === CNN-модель ===
class CNN1D(nn.Module):
    def __init__(self, input_len, out_channels=16, kernel_size=3, dropout=0.3):
        super().__init__()
        self.model = nn.Sequential(
            nn.Conv1d(1, out_channels, kernel_size=kernel_size),
            nn.ReLU(),
            nn.MaxPool1d(2),
            nn.Conv1d(out_channels, out_channels * 2, kernel_size=kernel_size),
            nn.ReLU(),
            nn.AdaptiveMaxPool1d(1),
            nn.Flatten(),
            nn.Dropout(dropout),
            nn.Linear(out_channels * 2, 1)
        )

    def forward(self, x):
        x = x.unsqueeze(1)  # -> (batch_size, 1, input_len)
        return self.model(x)

# === Обучение с ранней остановкой ===
def train_with_early_stopping(model, train_loader, val_loader=None, patience=10, max_epochs=100):
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    best_loss = float('inf')
    best_model_wts = None
    patience_counter = 0

    for epoch in range(max_epochs):
        model.train()
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            pred = model(xb)
            loss = criterion(pred, yb)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if val_loader is not None:
            model.eval()
            val_losses = []
            with torch.no_grad():
                for xb, yb in val_loader:
                    xb, yb = xb.to(device), yb.to(device)
                    pred = model(xb)
                    loss = criterion(pred, yb)
                    val_losses.append(loss.item())
            val_loss = np.mean(val_losses)
            print(f"Epoch {epoch+1}: val_loss={val_loss:.4f}")

            if val_loss < best_loss:
                best_loss = val_loss
                best_model_wts = model.state_dict()
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    print("Early stopping triggered.")
                    break
        else:
            best_model_wts = model.state_dict()

    if best_model_wts:
        model.load_state_dict(best_model_wts)
    return model

# === Кросс-валидация ===
def train_model(X_trainval, y_trainval):
    kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
    mae_scores, rmse_scores, r2_scores = [], [], []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X_trainval), 1):
        print(f"\n--- Fold {fold} ---")
        X_train, X_val = X_trainval[train_idx], X_trainval[val_idx]
        y_train, y_val = y_trainval[train_idx], y_trainval[val_idx]

        X_train_t = torch.from_numpy(X_train).float()
        y_train_t = torch.from_numpy(y_train).float().unsqueeze(1)
        X_val_t = torch.from_numpy(X_val).float()
        y_val_t = torch.from_numpy(y_val).float().unsqueeze(1)

        train_loader = DataLoader(TensorDataset(X_train_t, y_train_t), batch_size=32, shuffle=True)
        val_loader   = DataLoader(TensorDataset(X_val_t, y_val_t), batch_size=32, shuffle=False)

        model = CNN1D(input_len=X_train.shape[1], out_channels=64, kernel_size=7, dropout=0.4).to(device)


        model = train_with_early_stopping(model, train_loader, val_loader)

        model.eval()
        with torch.no_grad():
            preds = model(X_val_t.to(device)).squeeze().cpu().numpy()

        mae = mean_absolute_error(y_val, preds)
        rmse = np.sqrt(mean_squared_error(y_val, preds))
        r2 = r2_score(y_val, preds)
        print(f"MAE : {mae:.4f}, RMSE: {rmse:.4f}, R²: {r2:.4f}")

        mae_scores.append(mae)
        rmse_scores.append(rmse)
        r2_scores.append(r2)

    return mae_scores, rmse_scores, r2_scores

mae_scores, rmse_scores, r2_scores = train_model(X_trainval, y_trainval)

print("\n=== Метрики на кросс-валидации ===")
print(f"MAE : {np.mean(mae_scores):.4f} ± {np.std(mae_scores):.4f}")
print(f"RMSE: {np.mean(rmse_scores):.4f} ± {np.std(rmse_scores):.4f}")
print(f"R²  : {np.mean(r2_scores):.4f} ± {np.std(r2_scores):.4f}")

# === Финальное обучение ===
print("\n=== Финальное обучение на train+val ===")
X_trainval_t = torch.from_numpy(X_trainval).float()
y_trainval_t = torch.from_numpy(y_trainval).float().unsqueeze(1)
trainval_loader = DataLoader(TensorDataset(X_trainval_t, y_trainval_t), batch_size=32, shuffle=True)

final_model = CNN1D(input_len=X_trainval.shape[1], out_channels=64, kernel_size=7, dropout=0.4).to(device)

final_model = train_with_early_stopping(final_model, trainval_loader, val_loader=None)

# === Сохранение модели ===
model_path = 'final_cnn_model.pth'
torch.save(final_model.state_dict(), model_path)
print(f"Final model saved to {model_path}")

# === Тестирование ===
print("\n=== Загрузка модели и тестирование ===")
loaded_model = CNN1D(input_len=X_trainval.shape[1], out_channels=64, kernel_size=7, dropout=0.4).to(device)

loaded_model.load_state_dict(torch.load(model_path))
loaded_model.eval()

X_test_t = torch.from_numpy(X_test).float().to(device)
with torch.no_grad():
    y_test_pred = loaded_model(X_test_t).squeeze().cpu().numpy()

print(f"Test MAE : {mean_absolute_error(y_test, y_test_pred):.4f}")
print(f"Test RMSE: {np.sqrt(mean_squared_error(y_test, y_test_pred)):.4f}")
print(f"Test R²  : {r2_score(y_test, y_test_pred):.4f}")


Using device: cuda

--- Fold 1 ---
Epoch 1: val_loss=4.4922
Epoch 2: val_loss=2.1715
Epoch 3: val_loss=1.5804
Epoch 4: val_loss=1.5158
Epoch 5: val_loss=1.4812
Epoch 6: val_loss=1.4928
Epoch 7: val_loss=1.4476
Epoch 8: val_loss=1.4119
Epoch 9: val_loss=1.4255
Epoch 10: val_loss=1.3927
Epoch 11: val_loss=1.3717
Epoch 12: val_loss=1.3639
Epoch 13: val_loss=1.3368
Epoch 14: val_loss=1.3515
Epoch 15: val_loss=1.3143
Epoch 16: val_loss=1.3038
Epoch 17: val_loss=1.2869
Epoch 18: val_loss=1.2957
Epoch 19: val_loss=1.3016
Epoch 20: val_loss=1.2761
Epoch 21: val_loss=1.3795
Epoch 22: val_loss=1.2543
Epoch 23: val_loss=1.2510
Epoch 24: val_loss=1.2326
Epoch 25: val_loss=1.3476
Epoch 26: val_loss=1.2281
Epoch 27: val_loss=1.2322
Epoch 28: val_loss=1.2087
Epoch 29: val_loss=1.2043
Epoch 30: val_loss=1.1986
Epoch 31: val_loss=1.2045
Epoch 32: val_loss=1.3002
Epoch 33: val_loss=1.1827
Epoch 34: val_loss=1.1579
Epoch 35: val_loss=1.1551
Epoch 36: val_loss=1.4994
Epoch 37: val_loss=1.1612
Epoch 38: va

Далее датасет RDKit с теми же параметрами

In [7]:
import os, random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

# reproducibility
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# === Загрузка данных ===
df = pd.read_csv('/content/RDKit_nonagg_pca_processed.csv')
X = df.drop(columns=['pIC50']).values.astype(np.float32)
y = df['pIC50'].values.astype(np.float32)

scaler = StandardScaler()
X = scaler.fit_transform(X)

X_trainval, X_test, y_trainval, y_test = train_test_split(X,
                                                          y, test_size=0.2, random_state=SEED)

# === CNN-модель ===
class CNN1D(nn.Module):
    def __init__(self, input_len, out_channels=16, kernel_size=3, dropout=0.3):
        super().__init__()
        self.model = nn.Sequential(
            nn.Conv1d(1, out_channels, kernel_size=kernel_size),
            nn.ReLU(),
            nn.MaxPool1d(2),
            nn.Conv1d(out_channels, out_channels * 2, kernel_size=kernel_size),
            nn.ReLU(),
            nn.AdaptiveMaxPool1d(1),
            nn.Flatten(),
            nn.Dropout(dropout),
            nn.Linear(out_channels * 2, 1)
        )

    def forward(self, x):
        x = x.unsqueeze(1)  # -> (batch_size, 1, input_len)
        return self.model(x)

# === Обучение с ранней остановкой ===
def train_with_early_stopping(model, train_loader, val_loader=None, patience=10, max_epochs=100):
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    best_loss = float('inf')
    best_model_wts = None
    patience_counter = 0

    for epoch in range(max_epochs):
        model.train()
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            pred = model(xb)
            loss = criterion(pred, yb)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if val_loader is not None:
            model.eval()
            val_losses = []
            with torch.no_grad():
                for xb, yb in val_loader:
                    xb, yb = xb.to(device), yb.to(device)
                    pred = model(xb)
                    loss = criterion(pred, yb)
                    val_losses.append(loss.item())
            val_loss = np.mean(val_losses)
            print(f"Epoch {epoch+1}: val_loss={val_loss:.4f}")

            if val_loss < best_loss:
                best_loss = val_loss
                best_model_wts = model.state_dict()
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    print("Early stopping triggered.")
                    break
        else:
            best_model_wts = model.state_dict()

    if best_model_wts:
        model.load_state_dict(best_model_wts)
    return model

# === Кросс-валидация ===
def train_model(X_trainval, y_trainval):
    kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
    mae_scores, rmse_scores, r2_scores = [], [], []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X_trainval), 1):
        print(f"\n--- Fold {fold} ---")
        X_train, X_val = X_trainval[train_idx], X_trainval[val_idx]
        y_train, y_val = y_trainval[train_idx], y_trainval[val_idx]

        X_train_t = torch.from_numpy(X_train).float()
        y_train_t = torch.from_numpy(y_train).float().unsqueeze(1)
        X_val_t = torch.from_numpy(X_val).float()
        y_val_t = torch.from_numpy(y_val).float().unsqueeze(1)

        train_loader = DataLoader(TensorDataset(X_train_t, y_train_t), batch_size=32, shuffle=True)
        val_loader   = DataLoader(TensorDataset(X_val_t, y_val_t), batch_size=32, shuffle=False)

        model = CNN1D(input_len=X_train.shape[1], out_channels=64, kernel_size=7, dropout=0.4).to(device)

        model = train_with_early_stopping(model, train_loader, val_loader)

        model.eval()
        with torch.no_grad():
            preds = model(X_val_t.to(device)).squeeze().cpu().numpy()

        mae = mean_absolute_error(y_val, preds)
        rmse = np.sqrt(mean_squared_error(y_val, preds))
        r2 = r2_score(y_val, preds)
        print(f"MAE : {mae:.4f}, RMSE: {rmse:.4f}, R²: {r2:.4f}")

        mae_scores.append(mae)
        rmse_scores.append(rmse)
        r2_scores.append(r2)

    return mae_scores, rmse_scores, r2_scores

mae_scores, rmse_scores, r2_scores = train_model(X_trainval, y_trainval)

print("\n=== Метрики на кросс-валидации ===")
print(f"MAE : {np.mean(mae_scores):.4f} ± {np.std(mae_scores):.4f}")
print(f"RMSE: {np.mean(rmse_scores):.4f} ± {np.std(rmse_scores):.4f}")
print(f"R²  : {np.mean(r2_scores):.4f} ± {np.std(r2_scores):.4f}")

# === Финальное обучение ===
print("\n=== Финальное обучение на train+val ===")
X_trainval_t = torch.from_numpy(X_trainval).float()
y_trainval_t = torch.from_numpy(y_trainval).float().unsqueeze(1)
trainval_loader = DataLoader(TensorDataset(X_trainval_t, y_trainval_t), batch_size=32, shuffle=True)

final_model = CNN1D(input_len=X_trainval.shape[1], out_channels=32, kernel_size=5, dropout=0.2).to(device)
final_model = train_with_early_stopping(final_model, trainval_loader, val_loader=None)

# === Сохранение модели ===
model_path = 'final_cnn_model.pth'
torch.save(final_model.state_dict(), model_path)
print(f"Final model saved to {model_path}")

# === Тестирование ===
print("\n=== Загрузка модели и тестирование ===")
loaded_model = CNN1D(input_len=X_trainval.shape[1], out_channels=32, kernel_size=5, dropout=0.2).to(device)
loaded_model.load_state_dict(torch.load(model_path))
loaded_model.eval()

X_test_t = torch.from_numpy(X_test).float().to(device)
with torch.no_grad():
    y_test_pred = loaded_model(X_test_t).squeeze().cpu().numpy()

print(f"Test MAE : {mean_absolute_error(y_test, y_test_pred):.4f}")
print(f"Test RMSE: {np.sqrt(mean_squared_error(y_test, y_test_pred)):.4f}")
print(f"Test R²  : {r2_score(y_test, y_test_pred):.4f}")


Using device: cuda

--- Fold 1 ---
Epoch 1: val_loss=3.2672
Epoch 2: val_loss=1.5712
Epoch 3: val_loss=1.5016
Epoch 4: val_loss=1.4701
Epoch 5: val_loss=1.4478
Epoch 6: val_loss=1.4318
Epoch 7: val_loss=1.4745
Epoch 8: val_loss=1.4323
Epoch 9: val_loss=1.3720
Epoch 10: val_loss=1.5644
Epoch 11: val_loss=1.3916
Epoch 12: val_loss=1.4103
Epoch 13: val_loss=1.3140
Epoch 14: val_loss=1.4867
Epoch 15: val_loss=1.3193
Epoch 16: val_loss=1.3992
Epoch 17: val_loss=1.2943
Epoch 18: val_loss=1.3112
Epoch 19: val_loss=1.3624
Epoch 20: val_loss=1.5252
Epoch 21: val_loss=1.2585
Epoch 22: val_loss=1.3295
Epoch 23: val_loss=1.2585
Epoch 24: val_loss=1.4233
Epoch 25: val_loss=1.3383
Epoch 26: val_loss=1.2473
Epoch 27: val_loss=1.2923
Epoch 28: val_loss=1.2715
Epoch 29: val_loss=1.2662
Epoch 30: val_loss=1.3426
Epoch 31: val_loss=1.2039
Epoch 32: val_loss=1.3016
Epoch 33: val_loss=1.3323
Epoch 34: val_loss=1.3212
Epoch 35: val_loss=1.2599
Epoch 36: val_loss=1.3192
Epoch 37: val_loss=1.3246
Epoch 38: va

In [8]:
import os, random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

# reproducibility
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# === Загрузка данных ===
df = pd.read_csv('/content/RDKit_nonagg_pca_processed.csv')
X = df.drop(columns=['pIC50']).values.astype(np.float32)
y = df['pIC50'].values.astype(np.float32)

scaler = StandardScaler()
X = scaler.fit_transform(X)

X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# === CNN-модель ===
class CNN1D(nn.Module):
    def __init__(self, input_len, out_channels=16, kernel_size=3, dropout=0.3):
        super().__init__()
        self.model = nn.Sequential(
            nn.Conv1d(1, out_channels, kernel_size=kernel_size),
            nn.ReLU(),
            nn.MaxPool1d(2),
            nn.Conv1d(out_channels, out_channels * 2, kernel_size=kernel_size),
            nn.ReLU(),
            nn.AdaptiveMaxPool1d(1),
            nn.Flatten(),
            nn.Dropout(dropout),
            nn.Linear(out_channels * 2, 1)
        )

    def forward(self, x):
        x = x.unsqueeze(1)  # -> (batch_size, 1, input_len)
        return self.model(x)

# === Обучение с ранней остановкой ===
def train_with_early_stopping(model, train_loader, val_loader=None, patience=10, max_epochs=100):
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    best_loss = float('inf')
    best_model_wts = None
    patience_counter = 0

    for epoch in range(max_epochs):
        model.train()
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            pred = model(xb)
            loss = criterion(pred, yb)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if val_loader is not None:
            model.eval()
            val_losses = []
            with torch.no_grad():
                for xb, yb in val_loader:
                    xb, yb = xb.to(device), yb.to(device)
                    pred = model(xb)
                    loss = criterion(pred, yb)
                    val_losses.append(loss.item())
            val_loss = np.mean(val_losses)
            print(f"Epoch {epoch+1}: val_loss={val_loss:.4f}")

            if val_loss < best_loss:
                best_loss = val_loss
                best_model_wts = model.state_dict()
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    print("Early stopping triggered.")
                    break
        else:
            best_model_wts = model.state_dict()

    if best_model_wts:
        model.load_state_dict(best_model_wts)
    return model

# === Кросс-валидация ===
def train_model(X_trainval, y_trainval):
    kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
    mae_scores, rmse_scores, r2_scores = [], [], []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X_trainval), 1):
        print(f"\n--- Fold {fold} ---")
        X_train, X_val = X_trainval[train_idx], X_trainval[val_idx]
        y_train, y_val = y_trainval[train_idx], y_trainval[val_idx]

        X_train_t = torch.from_numpy(X_train).float()
        y_train_t = torch.from_numpy(y_train).float().unsqueeze(1)
        X_val_t = torch.from_numpy(X_val).float()
        y_val_t = torch.from_numpy(y_val).float().unsqueeze(1)

        train_loader = DataLoader(TensorDataset(X_train_t, y_train_t), batch_size=32, shuffle=True)
        val_loader   = DataLoader(TensorDataset(X_val_t, y_val_t), batch_size=32, shuffle=False)

        model = CNN1D(input_len=X_train.shape[1], out_channels=64, kernel_size=7, dropout=0.4).to(device)


        model = train_with_early_stopping(model, train_loader, val_loader)

        model.eval()
        with torch.no_grad():
            preds = model(X_val_t.to(device)).squeeze().cpu().numpy()

        mae = mean_absolute_error(y_val, preds)
        rmse = np.sqrt(mean_squared_error(y_val, preds))
        r2 = r2_score(y_val, preds)
        print(f"MAE : {mae:.4f}, RMSE: {rmse:.4f}, R²: {r2:.4f}")

        mae_scores.append(mae)
        rmse_scores.append(rmse)
        r2_scores.append(r2)

    return mae_scores, rmse_scores, r2_scores

mae_scores, rmse_scores, r2_scores = train_model(X_trainval, y_trainval)

print("\n=== Метрики на кросс-валидации ===")
print(f"MAE : {np.mean(mae_scores):.4f} ± {np.std(mae_scores):.4f}")
print(f"RMSE: {np.mean(rmse_scores):.4f} ± {np.std(rmse_scores):.4f}")
print(f"R²  : {np.mean(r2_scores):.4f} ± {np.std(r2_scores):.4f}")

# === Финальное обучение ===
print("\n=== Финальное обучение на train+val ===")
X_trainval_t = torch.from_numpy(X_trainval).float()
y_trainval_t = torch.from_numpy(y_trainval).float().unsqueeze(1)
trainval_loader = DataLoader(TensorDataset(X_trainval_t, y_trainval_t), batch_size=32, shuffle=True)

final_model = CNN1D(input_len=X_trainval.shape[1], out_channels=64, kernel_size=7, dropout=0.4).to(device)

final_model = train_with_early_stopping(final_model, trainval_loader, val_loader=None)

# === Сохранение модели ===
model_path = 'final_cnn_model.pth'
torch.save(final_model.state_dict(), model_path)
print(f"Final model saved to {model_path}")

# === Тестирование ===
print("\n=== Загрузка модели и тестирование ===")
loaded_model = CNN1D(input_len=X_trainval.shape[1], out_channels=64, kernel_size=7, dropout=0.4).to(device)

loaded_model.load_state_dict(torch.load(model_path))
loaded_model.eval()

X_test_t = torch.from_numpy(X_test).float().to(device)
with torch.no_grad():
    y_test_pred = loaded_model(X_test_t).squeeze().cpu().numpy()

print(f"Test MAE : {mean_absolute_error(y_test, y_test_pred):.4f}")
print(f"Test RMSE: {np.sqrt(mean_squared_error(y_test, y_test_pred)):.4f}")
print(f"Test R²  : {r2_score(y_test, y_test_pred):.4f}")


Using device: cuda

--- Fold 1 ---
Epoch 1: val_loss=3.2672
Epoch 2: val_loss=1.5712
Epoch 3: val_loss=1.5016
Epoch 4: val_loss=1.4702
Epoch 5: val_loss=1.4462
Epoch 6: val_loss=1.4273
Epoch 7: val_loss=1.4771
Epoch 8: val_loss=1.4316
Epoch 9: val_loss=1.3728
Epoch 10: val_loss=1.5801
Epoch 11: val_loss=1.3940
Epoch 12: val_loss=1.4072
Epoch 13: val_loss=1.3207
Epoch 14: val_loss=1.4998
Epoch 15: val_loss=1.3189
Epoch 16: val_loss=1.3919
Epoch 17: val_loss=1.3000
Epoch 18: val_loss=1.3078
Epoch 19: val_loss=1.4292
Epoch 20: val_loss=1.4716
Epoch 21: val_loss=1.2580
Epoch 22: val_loss=1.3122
Epoch 23: val_loss=1.2307
Epoch 24: val_loss=1.4294
Epoch 25: val_loss=1.3153
Epoch 26: val_loss=1.2327
Epoch 27: val_loss=1.3025
Epoch 28: val_loss=1.2406
Epoch 29: val_loss=1.2403
Epoch 30: val_loss=1.2687
Epoch 31: val_loss=1.1770
Epoch 32: val_loss=1.2590
Epoch 33: val_loss=1.2950
Epoch 34: val_loss=1.3366
Epoch 35: val_loss=1.1841
Epoch 36: val_loss=1.2436
Epoch 37: val_loss=1.2788
Epoch 38: va