<a href="https://colab.research.google.com/github/jwasswa2023/Physpropnet/blob/main/new_shallow_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Shallow Model Script - Random Forest Regressor

In [None]:
import deepchem as dc
import numpy as np
from deepchem.splits import RandomSplitter
from deepchem.models import SklearnModel
from sklearn.ensemble import RandomForestRegressor
from deepchem.hyper import RandomHyperparamOpt

# Load custom dataset from CSV
csv_file = "desalted_BioHC.csv"
tasks = ["LogHalfLife"]

# Fixed seed for both splits below, so the train/validation/test partition is
# the same on every Restart & Run All instead of a fresh shuffle per run.
SPLIT_SEED = 42

# Featurize the SMILES column with normalized RDKit descriptors
rdkit_featurizer = dc.feat.RDKitDescriptors(is_normalized=True)
loader = dc.data.CSVLoader(tasks=tasks, feature_field="SMILES", featurizer=rdkit_featurizer)
dataset = loader.featurize(csv_file)

# Hold out 10% of the data as the test set
splitter = RandomSplitter()
train_val_dataset, test_dataset = splitter.train_test_split(
    dataset, frac_train=0.9, seed=SPLIT_SEED
)

# Split the remaining 90% in an 8:1 ratio, i.e. 80/10/10 overall
train_dataset, valid_dataset = splitter.train_test_split(
    train_val_dataset, frac_train=8/9, seed=SPLIT_SEED
)

# Metrics shared by every evaluation cell below
pearson_r2_score = dc.metrics.Metric(dc.metrics.pearson_r2_score)
mae_error = dc.metrics.Metric(dc.metrics.mean_absolute_error)
rms_score = dc.metrics.Metric(dc.metrics.rms_score)



In [None]:
# Number of samples in the train / validation / test partitions
tuple(len(split) for split in (train_dataset, valid_dataset, test_dataset))

(120, 15, 15)

K-Fold Cross-Validation

In [None]:
num_folds = 5
# Fixed seed so fold membership and every forest are reproducible across runs
CV_SEED = 42

# Extra keyword arguments to k_fold_split are passed on to the splitter's
# split(), so `seed` pins the shuffle used to carve out each fold.
train_val_dataset_folds = splitter.k_fold_split(
    train_val_dataset, k=num_folds, seed=CV_SEED
)

# Per-fold metric values
k_r2_scores = []
k_rmse_scores = []
k_mae_scores = []

cv_metrics = [pearson_r2_score, mae_error, rms_score]

# Cross-validation loop
for fold, (k_train_dataset, k_valid_dataset) in enumerate(train_val_dataset_folds):
    print(f"\nFold {fold+1}/{num_folds}")

    # Fresh model per fold; a distinct but deterministic random_state per fold
    # keeps the folds independent while making the run repeatable.
    model = SklearnModel(RandomForestRegressor(random_state=CV_SEED + fold))

    # Train on this fold's training part
    model.fit(k_train_dataset)

    # Score on this fold's held-out part
    scores = model.evaluate(k_valid_dataset, cv_metrics)

    k_r2_scores.append(scores['pearson_r2_score'])
    k_rmse_scores.append(scores['rms_score'])
    k_mae_scores.append(scores['mean_absolute_error'])

# Report mean ± std for each metric (np.std default, i.e. population std)
print("\n=== Cross-Validation Summary ===")
print(f"R²     : {np.mean(k_r2_scores):.4f} ± {np.std(k_r2_scores):.4f}")
print(f"RMSE   : {np.mean(k_rmse_scores):.4f} ± {np.std(k_rmse_scores):.4f}")
print(f"MAE    : {np.mean(k_mae_scores):.4f} ± {np.std(k_mae_scores):.4f}")


Fold 1/5

Fold 2/5

Fold 3/5

Fold 4/5

Fold 5/5

=== Cross-Validation Summary ===
R²     : 0.8056 ± 0.0515
RMSE   : 0.3177 ± 0.0482
MAE    : 0.2381 ± 0.0452


Hyper-Parameter Tuning

In [None]:
# Factory producing the `model_builder` callable expected by the hyperparameter search
def sklearn_model(model):
    """Return a builder that wraps a scikit-learn estimator class in SklearnModel.

    Parameters
    ----------
    model
        Estimator class (or any callable) invoked as ``model(**kwargs)``.

    Returns
    -------
    callable
        ``initialize_sklearn_model(model_dir=None, **kwargs)``, which
        instantiates the estimator from ``kwargs`` and wraps it in a
        ``SklearnModel``. ``model_dir`` is forwarded only when it is not None.
    """

    def initialize_sklearn_model(model_dir: str = None, **kwargs):
        estimator = model(**kwargs)
        # Only pass model_dir through when the caller actually supplied one
        wrapper_options = {} if model_dir is None else {"model_dir": model_dir}
        return SklearnModel(estimator, **wrapper_options)

    return initialize_sklearn_model


# Hyperparameter space for RandomForestRegressor; each trial draws one value per key
param_dict = {
    "n_estimators": [50, 100, 150, 200, 300],
    "max_depth": [None, 5, 10, 20, 50],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "bootstrap": [True, False]
}

# Seed NumPy's global RNG right before the search so this cell is repeatable.
# A RandomForestRegressor built without random_state draws from that global
# RNG; the random search's own sampling presumably does too — TODO confirm
# against the installed DeepChem version.
SEARCH_SEED = 42
np.random.seed(SEARCH_SEED)

# Random search: up to 60 sampled configurations, each fitted on the training
# set and ranked by pearson_r2_score on the validation set (higher is better)
search = RandomHyperparamOpt(model_builder=sklearn_model(RandomForestRegressor), max_iter=60)
best_model, best_params, all_results = search.hyperparam_search(
    param_dict,
    train_dataset,
    valid_dataset,
    pearson_r2_score,
    use_max=True,
    logdir="./random_search_shallow",
)

# Evaluate best model on the validation set
print("Validation Set Metrics for best model")
val_scores = best_model.evaluate(valid_dataset, [pearson_r2_score, mae_error, rms_score])
print(val_scores)

Validation Set Metrics for best model
{'pearson_r2_score': 0.8451389335906981, 'mean_absolute_error': 0.24253312071292907, 'rms_score': 0.3208967668414655}


Three random-seed repeats using the best hyperparameters (evaluated on the test set)

In [None]:
# best_params is later unpacked into RandomForestRegressor, which has no
# `model_dir` argument, so remove that entry. Guarded so that re-running this
# cell does not raise KeyError once the key is already gone.
if 'model_dir' in best_params:
    del best_params['model_dir']

In [None]:
# Collect metrics
r2_scores = []
rmse_scores = []
mae_scores = []

# Estimator arguments only: ignore the search's `model_dir` entry if it is
# still present, so this cell does not depend on the deletion cell having run.
rf_params = {name: value for name, value in best_params.items() if name != "model_dir"}
test_metrics = [pearson_r2_score, mae_error, rms_score]

# One forest per explicit seed, so the spread below reflects seed-to-seed
# variation and can be reproduced exactly.
for seed in range(3):
    model = SklearnModel(RandomForestRegressor(**{**rf_params, "random_state": seed}))
    model.fit(train_dataset)
    # Evaluate on test
    scores = model.evaluate(test_dataset, test_metrics)

    r2_scores.append(scores['pearson_r2_score'])
    rmse_scores.append(scores['rms_score'])
    mae_scores.append(scores['mean_absolute_error'])

# Report mean ± std for each metric (np.std default, i.e. population std)
print("\n=== Test Results Summary ===")
print(f"R²     : {np.mean(r2_scores):.4f} ± {np.std(r2_scores):.4f}")
print(f"RMSE   : {np.mean(rmse_scores):.4f} ± {np.std(rmse_scores):.4f}")
print(f"MAE    : {np.mean(mae_scores):.4f} ± {np.std(mae_scores):.4f}")


=== Test Results Summary ===
R²     : 0.8614 ± 0.0029
RMSE   : 0.3242 ± 0.0038
MAE    : 0.2507 ± 0.0032
