<a href="https://colab.research.google.com/github/jwasswa2023/Physpropnet/blob/main/deep_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Deep Model Script - DMPNN

In [None]:
import deepchem as dc
import numpy as np
import torch
from deepchem.hyper import RandomHyperparamOpt
from deepchem.models import DMPNNModel
from deepchem.splits import RandomSplitter

# Load custom dataset from CSV
csv_file = "desalted_BioHC.csv"
tasks = ["LogHalfLife"]

# Fixed seed for the random splits so that Restart & Run All always yields the
# same train / validation / test partition.
SPLIT_SEED = 42

# Initialize the loader for graph-based models: SMILES strings are read from the
# "SMILES" column and featurized with the DMPNN featurizer.
loader = dc.data.CSVLoader(tasks=tasks, feature_field="SMILES", featurizer=dc.feat.DMPNNFeaturizer())
dataset = loader.featurize(csv_file)

# Split the train+val and test dataset (90% / 10% of the full data)
splitter = RandomSplitter()
train_val_dataset, test_dataset = splitter.train_test_split(
    dataset, frac_train=0.9, seed=SPLIT_SEED
)

# Split the train and validation dataset: 8/9 of the remaining 90% goes to
# training, i.e. an overall 80% / 10% / 10% train / validation / test split.
train_dataset, valid_dataset = splitter.train_test_split(
    train_val_dataset, frac_train=8/9, seed=SPLIT_SEED
)

# Sanity check: the three splits together must account for every sample.
assert len(train_dataset) + len(valid_dataset) + len(test_dataset) == len(dataset)

# Define metric
pearson_r2_score = dc.metrics.Metric(dc.metrics.pearson_r2_score)
mae_error = dc.metrics.Metric(dc.metrics.mean_absolute_error)
rms_score = dc.metrics.Metric(dc.metrics.rms_score)



In [None]:
# Number of samples in the train, validation and test splits (in that order)
tuple(len(subset) for subset in (train_dataset, valid_dataset, test_dataset))

(120, 15, 15)

K-Fold Cross-Validation

In [None]:
num_folds = 5
# Partition the train+val data into `num_folds` (train, validation) dataset pairs
train_val_dataset_folds = splitter.k_fold_split(train_val_dataset, k=num_folds)

# Per-fold validation scores, one entry appended per fold
k_r2_scores, k_rmse_scores, k_mae_scores = [], [], []

# Metrics computed on the held-out part of every fold
fold_metrics = [pearson_r2_score, mae_error, rms_score]

# Cross-validation loop
for fold_number, fold_pair in enumerate(train_val_dataset_folds, start=1):
    k_train_dataset, k_valid_dataset = fold_pair
    print(f"\nFold {fold_number}/{num_folds}")

    # A new model is built for every fold so that no trained state carries over
    model = DMPNNModel(n_tasks=len(tasks), mode='regression', learning_rate=1e-3, batch_size=64)
    model.fit(k_train_dataset, nb_epoch=100)

    # Score the fold's validation part and file each value under its metric
    scores = model.evaluate(k_valid_dataset, fold_metrics)
    for collected, metric_key in (
        (k_r2_scores, 'pearson_r2_score'),
        (k_rmse_scores, 'rms_score'),
        (k_mae_scores, 'mean_absolute_error'),
    ):
        collected.append(scores[metric_key])

# Report mean ± std (across folds) for each metric
print("\n=== Cross-Validation Summary ===")
print(f"R²     : {np.mean(k_r2_scores):.4f} ± {np.std(k_r2_scores):.4f}")
print(f"RMSE   : {np.mean(k_rmse_scores):.4f} ± {np.std(k_rmse_scores):.4f}")
print(f"MAE    : {np.mean(k_mae_scores):.4f} ± {np.std(k_mae_scores):.4f}")


Fold 1/5

Fold 2/5

Fold 3/5

Fold 4/5

Fold 5/5

=== Cross-Validation Summary ===
R²     : 0.7116 ± 0.1149
RMSE   : 0.3936 ± 0.0850
MAE    : 0.2976 ± 0.0390


Hyper-Parameter Tuning

In [None]:
# Define the model builder function for DMPNN
def model_builder(model_dir, **model_params):
    """Build a regression DMPNNModel for the tasks declared in `tasks`.

    Passed to RandomHyperparamOpt as its `model_builder`.

    Args:
        model_dir: Directory handed to the model as its `model_dir`.
        **model_params: Additional keyword arguments forwarded unchanged to
            DMPNNModel (the hyperparameters under evaluation).

    Returns:
        A newly constructed DMPNNModel.
    """
    # Settings that are the same for every candidate configuration
    fixed_kwargs = {
        'n_tasks': len(tasks),
        'mode': 'regression',
        'model_dir': model_dir,
    }
    return DMPNNModel(**fixed_kwargs, **model_params)

# Hyperparameter space for DMPNN: each key maps to the list of candidate values
# handed to the random search for that setting.
param_dict = {
    'learning_rate': [1e-4, 1e-3, 1e-2],
    'batch_size': [32, 64, 128],
    'global_features_size': [0],
    'enc_hidden': [60, 150, 300],
    'depth': [2, 3, 4],
    'enc_dropout_p': [0.0, 0.2, 0.4],
    'aggregation': ['mean'],
    'aggregation_norm': [100],
    'ffn_hidden': [60, 150, 300],
    'ffn_layers': [2, 3],
    'ffn_dropout_p': [0.0, 0.2, 0.4],
    'ffn_dropout_at_input_no_act': [True, False],
}

# Number of random-search iterations; change to 60 for full random search
search_iterations = 10

# Set up and run random search, keeping the configuration with the highest
# validation Pearson R² (use_max=True)
search = RandomHyperparamOpt(model_builder=model_builder, max_iter=search_iterations)
best_model, best_params, all_results = search.hyperparam_search(
    param_dict,
    train_dataset,
    valid_dataset,
    pearson_r2_score,
    nb_epoch=100,
    use_max=True,
    logdir="./random_search",
)

# Evaluate best model on the validation set with all three metrics
validation_metrics = [pearson_r2_score, mae_error, rms_score]
print("Validation Set Metrics for best model")
val_scores = best_model.evaluate(valid_dataset, validation_metrics)
print(val_scores)

Validation Set Metrics for best model
{'pearson_r2_score': 0.8593848433030704, 'mean_absolute_error': 0.41409178224563614, 'rms_score': 0.47125810192796763}


Train three models with different random seeds using the best hyperparameters and evaluate them on the test set

In [None]:
# Drop 'model_dir' so it is not passed on when new models are built from the
# best hyperparameters. Rebinding to a filtered copy (instead of `del`) keeps
# this cell idempotent: re-running it no longer raises KeyError, and the dict
# returned by the search is not mutated in place.
best_params = {name: value for name, value in best_params.items() if name != 'model_dir'}

In [None]:
# Hyperparameters for the final models. 'model_dir' is filtered out here as well,
# so this cell gives the same result whether or not it was already removed.
final_params = {name: value for name, value in best_params.items() if name != 'model_dir'}

# One model is trained per seed; explicit seeds make every run repeatable.
seeds = (0, 1, 2)

# Collect metrics (one entry per seed)
r2_scores = []
rmse_scores = []
mae_scores = []

for seed in seeds:
    # Seed NumPy and PyTorch before the model is built and trained, so the run
    # can be reproduced while still differing from the other seeds.
    np.random.seed(seed)
    torch.manual_seed(seed)

    model = DMPNNModel(
        n_tasks=len(tasks),
        mode='regression',
        **final_params
    )
    model.fit(train_dataset, nb_epoch=100)

    # Evaluate on test
    scores = model.evaluate(test_dataset, [pearson_r2_score, mae_error, rms_score])

    r2_scores.append(scores['pearson_r2_score'])
    rmse_scores.append(scores['rms_score'])
    mae_scores.append(scores['mean_absolute_error'])

# Report mean ± std (across seeds) for each metric
print("\n=== Test Results Summary ===")
print(f"R²     : {np.mean(r2_scores):.4f} ± {np.std(r2_scores):.4f}")
print(f"RMSE   : {np.mean(rmse_scores):.4f} ± {np.std(rmse_scores):.4f}")
print(f"MAE    : {np.mean(mae_scores):.4f} ± {np.std(mae_scores):.4f}")


=== Test Results Summary ===
R²     : 0.8259 ± 0.0154
RMSE   : 0.4737 ± 0.0545
MAE    : 0.3088 ± 0.0108
