<a href="https://colab.research.google.com/github/jwasswa2023/Physpropnet/blob/main/GNN_GATMODEL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install CPU-only PyTorch 2.0.1 and matching torchdata
!pip install torch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 torchdata==0.6.1

# Install DeepChem (Torch-compatible)
!pip install "numpy==2.3.1" "deepchem[torch]"

# Install PyTorch Geometric (CPU-only)
!pip install torch-scatter torch-sparse torch-cluster torch-spline-conv -f https://data.pyg.org/whl/torch-2.0.1+cpu.html
!pip install torch-geometric

# Install DGL (CPU version)
!pip install dgl==1.1.2

In [None]:
import torch
import torchdata
import torch_geometric
import dgl
import deepchem as dc
import numpy as np

# Report the installed version of every core dependency, one per line, so the
# environment behind a run can be read straight from the cell output.
for _label, _module in (
    ("Torch:", torch),
    ("TorchData:", torchdata),
    ("TorchGeometric:", torch_geometric),
    ("DGL:", dgl),
    ("DeepChem:", dc),
    ("NumPy:", np),
):
    print(_label, _module.__version__)


In [None]:
import pandas as pd
import numpy as np
from deepchem.feat import PagtnMolGraphFeaturizer
from deepchem.data import NumpyDataset
from deepchem.models.torch_models import GATModel
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Load dataset
df = pd.read_csv("/content/desalted_WS.csv")  # Adjust path as needed
smiles_list = df["SMILES"].tolist()
labels = df["LogWS"].values

# Featurize molecules into graph objects (edge features included).
featurizer = dc.feat.MolGraphConvFeaturizer(use_edges=True)
features = featurizer.featurize(smiles_list)

# Filter out failed featurizations.
# DeepChem does not return None for a molecule it cannot featurize; it appends
# an empty NumPy array as a placeholder. An `is not None` test therefore keeps
# every failure, so keep only the entries that are real GraphData objects.
valid_data = [
    (f, l, s)
    for f, l, s in zip(features, labels, smiles_list)
    if isinstance(f, dc.feat.GraphData)
]

n_dropped = len(smiles_list) - len(valid_data)
if n_dropped:
    print(f"Dropped {n_dropped} molecule(s) that could not be featurized.")
if not valid_data:
    # zip(*[]) would otherwise fail with an opaque unpacking error.
    raise ValueError("No molecule could be featurized; check the SMILES column.")

X, y, ids = zip(*valid_data)

# Convert to numpy arrays. X holds arbitrary Python objects (one graph per
# molecule), so the object dtype is requested explicitly.
X = np.array(X, dtype=object)
y = np.array(y)
ids = np.array(ids)

# Hold out 20% of the molecules as the final test set; the remaining 80% forms
# the train+validation pool used later for cross-validation.
split_options = {"test_size": 0.2, "random_state": 42, "shuffle": True}
(
    X_train_val, X_test,
    y_train_val, y_test,
    ids_train_val, ids_test,
) = train_test_split(X, y, ids, **split_options)

# Wrap each partition (features, labels, SMILES ids) as a DeepChem dataset.
train_val_dataset, test_dataset = (
    NumpyDataset(X=feats, y=targets, ids=names)
    for feats, targets, names in (
        (X_train_val, y_train_val, ids_train_val),
        (X_test, y_test, ids_test),
    )
)


# Cross-validation setup on training+validation set
num_folds = 5
CV_EPOCHS = 30  # training epochs per fold
# KFold splits indices based on the number of samples in the dataset
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Collect one value per fold for each metric
r2_scores = []
rmse_scores = []
mae_scores = []

# Cross-validation loop
# kf.split only needs something of the right length; the ids array provides it.
for fold, (train_idx, valid_idx) in enumerate(kf.split(train_val_dataset.ids)):
    print(f"\nFold {fold+1}/{num_folds}")

    # Select data for the current fold's train and validation sets using .select()
    train_dataset = train_val_dataset.select(train_idx)
    valid_dataset = train_val_dataset.select(valid_idx)

    # Re-instantiate the model so no weights carry over between folds.
    # The former `batch_normalize=False` argument was dropped: GATModel has no
    # such parameter, so it was silently absorbed by **kwargs and had no effect.
    model = GATModel(1, mode="regression")

    # Train model
    model.fit(train_dataset, nb_epoch=CV_EPOCHS)

    # Predict and evaluate. Both arrays are flattened to 1-D so the metrics
    # never see mismatched (n,) vs (n, 1) shapes.
    preds = np.asarray(model.predict(valid_dataset)).ravel()
    true = np.asarray(valid_dataset.y).ravel()

    r2 = r2_score(true, preds)
    rmse = np.sqrt(mean_squared_error(true, preds))
    mae = mean_absolute_error(true, preds)

    r2_scores.append(r2)
    rmse_scores.append(rmse)
    mae_scores.append(mae)

# Report mean ± std for each metric
print("\n=== Cross-Validation Summary ===")
print(f"R²     : {np.mean(r2_scores):.4f} ± {np.std(r2_scores):.4f}")
print(f"RMSE   : {np.mean(rmse_scores):.4f} ± {np.std(rmse_scores):.4f}")
print(f"MAE    : {np.mean(mae_scores):.4f} ± {np.std(mae_scores):.4f}")


Fold 1/5

Fold 2/5

Fold 3/5

Fold 4/5

Fold 5/5

=== Cross-Validation Summary ===
R²     : 0.8678 ± 0.0146
RMSE   : 0.7402 ± 0.0145
MAE    : 0.5393 ± 0.0127


In [None]:
import deepchem as dc
import numpy as np
import random
import time
from deepchem.models import GATModel
from deepchem.metrics import Metric, pearson_r2_score

# Start timer
start_time = time.time()

# Seed the RNGs that drive hyperparameter sampling and the hold-out split so
# the set of configurations tried is the same on every run.
SEARCH_SEED = 42
SEARCH_EPOCHS = 30
SEARCH_VALID_FRACTION = 0.2
random.seed(SEARCH_SEED)
np.random.seed(SEARCH_SEED)

# Define hyperparameter grid for GATModel
param_grid = {
    'n_tasks': [1],
    'batch_size': [128],
    'mode': ['regression'],
    'number_atom_features': [30],
    'graph_attention_layers': [[8, 8], [16, 16], [64, 64], [256, 256]],
    'n_attention_heads': [2, 4, 6, 8, 10, 12],
    'agg_modes': [None, ['flatten', 'mean'], ['mean', 'mean'], ['flatten', 'flatten'], ['mean', 'flatten']],
    'residual': [True, False],
    'dropout': [0.0, 0.2, 0.4],
    'alpha': [0.1, 0.2, 0.4],
    'predictor_hidden_feats': [64, 128, 256, 512],
    'predictor_dropout': [0.0, 0.2, 0.4],
    'self_loop': [True],
}

# Evaluation metric
metric = Metric(pearson_r2_score, mode='regression')

# Carve a dedicated hold-out set out of the train+validation pool.
# Candidates must be scored on molecules they were NOT trained on; fitting on
# the whole pool and scoring on a subset of it would reward memorisation and
# make the hyperparameter comparison meaningless.
n_pool = len(train_val_dataset)
if n_pool < 2:
    raise ValueError("Need at least two samples to build a search hold-out split.")
pool_order = np.random.RandomState(SEARCH_SEED).permutation(n_pool)
n_search_valid = min(n_pool - 1, max(1, int(round(SEARCH_VALID_FRACTION * n_pool))))
search_valid_dataset = train_val_dataset.select(np.sort(pool_order[:n_search_valid]))
search_train_dataset = train_val_dataset.select(np.sort(pool_order[n_search_valid:]))

# Random search setup
n_iter = 60
results = []
best_score = -np.inf
best_model = None
best_params = None

for i in range(n_iter):
    # Randomly sample one value per hyperparameter
    params = {k: random.choice(v) for k, v in param_grid.items()}

    print(f"\n🔁 Iteration {i+1}/{n_iter} — Trying params: {params}")

    # Instantiate and train the candidate on the search-training portion only
    model = GATModel(**params)
    model.fit(search_train_dataset, nb_epoch=SEARCH_EPOCHS)

    # Evaluate on the unseen search hold-out
    scores = model.evaluate(search_valid_dataset, [metric])
    score = scores[metric.name]
    print(f"📊 Score: {score:.4f}")

    # Track every trial and remember the best configuration so far
    results.append((score, params))
    if score > best_score:
        best_score = score
        best_model = model
        best_params = params

if best_params is None:
    # Happens when n_iter is 0 or every score was NaN (NaN never compares greater).
    raise RuntimeError("Random search produced no valid score; cannot pick a best model.")

# Refit the winning configuration on the full train+validation pool so that
# `best_model` has seen all non-test data, as downstream cells expect.
best_model = GATModel(**best_params)
best_model.fit(train_val_dataset, nb_epoch=SEARCH_EPOCHS)

# Report best
print("\n✅ Best Hyperparameters Found:")
for k, v in best_params.items():
    print(f"{k}: {v}")
print(f"\n🎯 Best Pearson R²: {best_score:.4f}")

# End timer and report elapsed time
end_time = time.time()
elapsed_time = (end_time - start_time) / 60  # minutes
print(f"\n⏱️ Total execution time: {elapsed_time:.2f} minutes")



🔁 Iteration 1/60 — Trying params: {'n_tasks': 1, 'batch_size': 128, 'mode': 'regression', 'number_atom_features': 30, 'graph_attention_layers': [16, 16], 'n_attention_heads': 6, 'agg_modes': ['flatten', 'mean'], 'residual': False, 'dropout': 0.4, 'alpha': 0.1, 'predictor_hidden_feats': 512, 'predictor_dropout': 0.2, 'self_loop': True}
📊 Score: 0.8475

🔁 Iteration 2/60 — Trying params: {'n_tasks': 1, 'batch_size': 128, 'mode': 'regression', 'number_atom_features': 30, 'graph_attention_layers': [8, 8], 'n_attention_heads': 6, 'agg_modes': ['flatten', 'mean'], 'residual': False, 'dropout': 0.4, 'alpha': 0.2, 'predictor_hidden_feats': 256, 'predictor_dropout': 0.4, 'self_loop': True}
📊 Score: 0.8351

🔁 Iteration 3/60 — Trying params: {'n_tasks': 1, 'batch_size': 128, 'mode': 'regression', 'number_atom_features': 30, 'graph_attention_layers': [64, 64], 'n_attention_heads': 2, 'agg_modes': ['flatten', 'flatten'], 'residual': False, 'dropout': 0.0, 'alpha': 0.4, 'predictor_hidden_feats': 256

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Estimate held-out test performance of the selected hyperparameters.
#
# Predicting several times with one already-trained model is deterministic, so
# every repetition yields the same metrics and the reported std is always 0.
# To make "mean ± std" meaningful, the best configuration is retrained from
# scratch for each run (runs differ through weight initialisation and batch
# order) and each retrained model is scored on the untouched test set.
N_TEST_RUNS = 3
TEST_EPOCHS = 30

# Initialize lists to store MAE, RMSE, and R2 scores across the test runs
mae_scores, rmse_scores, r2_scores = [], [], []

# Labels are flattened once so they always match the flattened predictions.
y_true = np.asarray(test_dataset.y).ravel()

for _ in range(N_TEST_RUNS):
    # Fresh model with the tuned hyperparameters, trained on all non-test data
    run_model = GATModel(**best_params)
    run_model.fit(train_val_dataset, nb_epoch=TEST_EPOCHS)

    # Make predictions
    pred = np.asarray(run_model.predict(test_dataset)).ravel()

    # Compute metrics
    mae = mean_absolute_error(y_true=y_true, y_pred=pred)
    rmse = np.sqrt(mean_squared_error(y_true=y_true, y_pred=pred))
    r2 = r2_score(y_true=y_true, y_pred=pred)

    # Append scores
    mae_scores.append(mae)
    rmse_scores.append(rmse)
    r2_scores.append(r2)

# Compute mean and std for each metric
mean_mae, std_mae = np.mean(mae_scores), np.std(mae_scores)
mean_rmse, std_rmse = np.mean(rmse_scores), np.std(rmse_scores)
mean_r2, std_r2 = np.mean(r2_scores), np.std(r2_scores)

# Display results
print(f"📈 Mean MAE   : {mean_mae:.4f} ± {std_mae:.4f}")
print(f"📈 Mean RMSE  : {mean_rmse:.4f} ± {std_rmse:.4f}")
print(f"📈 Mean R²    : {mean_r2:.4f} ± {std_r2:.4f}")


📈 Mean MAE   : 0.3529 ± 0.0000
📈 Mean RMSE  : 0.4350 ± 0.0000
📈 Mean R²    : 0.6597 ± 0.0000
