In [6]:
#import sys
import torch
import torch.nn as nn
from sympy.stats.sampling.sample_numpy import numpy
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
#from core.model import MultiOutputRegressor
from sklearn.multioutput import MultiOutputRegressor
#from core.visualize import export_and_visualize_model
import joblib

In [7]:
if __name__ == "__main__":
    DF = pd.read_csv("new_merged_features_IC50_g12c.csv", index_col=False)
    DF = DF.dropna()
    DF = DF.loc[:, ~DF.columns.str.contains('^Unnamed')]
    DF['IC50 (nM)'] = pd.to_numeric(DF['IC50 (nM)'], errors='coerce')
    # Calculate Q1 (25th percentile) and Q3 (75th percentile)
    Q1 = DF['IC50 (nM)'].quantile(0.25)
    Q3 = DF['IC50 (nM)'].quantile(0.75)

    # Calculate IQR
    IQR = Q3 - Q1
    # Relax the conditions by increasing the multiplier
    multiplier = 0.6  # You can change this to a value that suits your needs

    # Remove outliers using the relaxed IQR method
    DF = DF[(DF['IC50 (nM)'] >= (Q1 - multiplier * IQR)) & (DF['IC50 (nM)'] <= (Q3 + multiplier * IQR))]
    DF = DF.reset_index(drop=True)

    # Filter and sample data
    DF = DF[DF['FC'] == 0]

    DF = DF.sample(frac=10, replace=True, random_state=42)  # Shuffle data

    y = DF[["IC50 (nM)"]] #  "IC50 (nM)"
    X = DF.drop(columns=["FC", "IC50 (nM)", "Smiles", "ChEMBL ID"])

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Scale the data
    scaler_X = StandardScaler()
    X_train_scaled = scaler_X.fit_transform(X_train)
    X_test_scaled = scaler_X.transform(X_test)

    scaler_y = StandardScaler()
    y_train_scaled = scaler_y.fit_transform(y_train)
    y_test_scaled = scaler_y.transform(y_test)


    #joblib.dump(scaler_X, f"saved_models/scaler_X_{chembel_id}_SV.pkl")
    #joblib.dump(scaler_y, f"saved_models/scaler_y_{chembel_id}_SV.pkl")

    # Convert to tensors
    X_train_tensor = torch.from_numpy(X_train_scaled).float()
    X_test_tensor = torch.from_numpy(X_test_scaled).float()
    y_train_tensor = torch.from_numpy(y_train_scaled).float()
    y_test_tensor = torch.from_numpy(y_test_scaled).float()


    # Model, loss, optimizer, and scheduler
    input_dim = X_train_tensor.shape[1]
    hidden_dim = 104  # Increased hidden dimension
    num_hidden_layers = 4
    output_dim = y_train_tensor.shape[1]
    dropout_rate = 0.0


    class MultiOutputNN(nn.Module):
        def __init__(self, input_dim, hidden_dim, output_dim, num_hidden_layers, dropout_rate):
            super(MultiOutputNN, self).__init__()
            layers = []
            layers.append(nn.Linear(input_dim, hidden_dim))
            layers.append(nn.ReLU())
            for _ in range(num_hidden_layers - 1):
                layers.append(nn.Linear(hidden_dim, hidden_dim))
                layers.append(nn.ReLU())
                layers.append(nn.Dropout(dropout_rate))
            layers.append(nn.Linear(hidden_dim, output_dim))
            self.model = nn.Sequential(*layers)

        def forward(self, x):
            return self.model(x)


    # Now create the model
    model = MultiOutputNN(input_dim, hidden_dim, output_dim, num_hidden_layers, dropout_rate)

    criterion = nn.SmoothL1Loss()  # Huber loss
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0041, weight_decay=1e-6)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.5)

    # Prepare DataLoader
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
    train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=1024, shuffle=False)

    # Training loop
    num_epochs = 100
    train_losses, test_losses, r2_scores = [], [], []

    for epoch in range(num_epochs):
        # Training
        model.train()
        epoch_train_loss = 0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            y_pred = model(X_batch)
            loss = criterion(y_pred, y_batch)
            loss.backward()
            optimizer.step()
            epoch_train_loss += loss.item()
        train_losses.append(epoch_train_loss / len(train_loader))

        # Validation
        model.eval()
        epoch_test_loss = 0
        y_pred_test_all, y_test_all = [], []
        with torch.no_grad():
            for X_batch, y_batch in test_loader:
                y_pred = model(X_batch)
                loss = criterion(y_pred, y_batch)
                epoch_test_loss += loss.item()
                y_pred_test_all.append(y_pred)
                y_test_all.append(y_batch)
        test_losses.append(epoch_test_loss / len(test_loader))

        # Calculate R2 score
        y_pred_test_all = torch.cat(y_pred_test_all, dim=0).cpu().numpy()
        y_test_all = torch.cat(y_test_all, dim=0).cpu().numpy()
        r2 = r2_score(y_test_all, y_pred_test_all, multioutput='variance_weighted')
        r2_scores.append(r2)

        scheduler.step()

        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_losses[-1]:.4f}, Test Loss: {test_losses[-1]:.4f}, R2: {r2:.4f}")

Epoch 1/100, Train Loss: 0.3031, Test Loss: 0.2185, R2: 0.4490
Epoch 2/100, Train Loss: 0.2105, Test Loss: 0.1836, R2: 0.5766
Epoch 3/100, Train Loss: 0.1612, Test Loss: 0.1488, R2: 0.6747
Epoch 4/100, Train Loss: 0.1321, Test Loss: 0.1182, R2: 0.7312
Epoch 5/100, Train Loss: 0.1173, Test Loss: 0.1027, R2: 0.7636
Epoch 6/100, Train Loss: 0.1042, Test Loss: 0.1028, R2: 0.7844
Epoch 7/100, Train Loss: 0.0958, Test Loss: 0.0915, R2: 0.8086
Epoch 8/100, Train Loss: 0.0879, Test Loss: 0.0842, R2: 0.8218
Epoch 9/100, Train Loss: 0.0829, Test Loss: 0.0824, R2: 0.8220
Epoch 10/100, Train Loss: 0.0797, Test Loss: 0.0842, R2: 0.8317
Epoch 11/100, Train Loss: 0.0787, Test Loss: 0.0987, R2: 0.8200
Epoch 12/100, Train Loss: 0.0765, Test Loss: 0.0700, R2: 0.8316
Epoch 13/100, Train Loss: 0.0766, Test Loss: 0.0681, R2: 0.8359
Epoch 14/100, Train Loss: 0.0720, Test Loss: 0.0776, R2: 0.8327
Epoch 15/100, Train Loss: 0.0698, Test Loss: 0.0670, R2: 0.8472
Epoch 16/100, Train Loss: 0.0662, Test Loss: 0.05

In [8]:
# Evaluate final R2
final_r2 = r2_scores[-1]
MAE = mean_squared_error(y_test_all, y_pred_test_all)
print(f"Final R2 Score: {final_r2:.4f}")
print(f"MSE: {MAE:.4f}")

Final R2 Score: 0.9660
MSE: 0.0342
