# Installation instructions

- `conda create --name=attentivefp_env python=3.8`
- `conda activate attentivefp_env`
- `pip install deepchem`
- `pip install dgl dgllife`
- `pip install torch`
- `pip install numpy==1.21.1`
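- `pip install hyperopt ipykernel` (the notebook imports `hyperopt`; `ipykernel` is needed to register the kernel)
- `python -m ipykernel install --user --name=attentivefp_env` (run this while the environment is still active so the kernel points at its Python)
- `conda deactivate`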

In [9]:
import glob
import os

import deepchem as dc
import numpy as np
import pandas as pd
from deepchem.models import AttentiveFPModel
from hyperopt import Trials, fmin, hp, tpe
from sklearn.metrics import auc, precision_recall_curve, roc_auc_score

# Prepare dataset

In [2]:
# Label column to learn and the CSV file to read molecules from
tasks = ["hit"]
input_file = "../data/TRAIN_03_19_2022.csv"

# Graph featurizer (use_edges=True) applied to the "SMILES" column of the CSV
featurizer = dc.feat.MolGraphConvFeaturizer(use_edges=True)
loader = dc.data.CSVLoader(
    tasks=tasks,
    feature_field="SMILES",
    featurizer=featurizer,
)

# Featurize the file; the bare expression below displays the dataset summary in the notebook
dataset = loader.create_dataset(input_file)
dataset

[17:32:41] Unusual charge on atom 0 number of radical electrons set to zero
Failed to featurize datapoint 4247, Cl.[NaH]. Appending empty array
Exception message: tuple index out of range
  return np.asarray(features)
Failed to featurize datapoint 1543, Cl.[LiH]. Appending empty array
Exception message: tuple index out of range
  return np.asarray(features)
Failed to featurize datapoint 7461, I.[KH]. Appending empty array
Exception message: tuple index out of range
  return np.asarray(features)


<DiskDataset X.shape: (30964,), y.shape: (30964, 1), w.shape: (30964, 1), task_names: ['hit']>

# Train model - do hyperparameter search on 80% train set

In [3]:
# Fixed training settings shared by every cell below
NUM_FOLDS = 3  # number of seeds trained per hyperparameter combination
SAVE_INTERVAL = 100  # passed as the validation-callback interval and as checkpoint_interval
NB_EPOCH = 10  # epochs per model.fit call
OPT_TRIALS = 10  # number of hyperopt evaluations (max_evals)
BATCH_SIZE = 50  # batch size handed to AttentiveFPModel

# Search space explored by hyperopt; "layer_sizes" is forwarded as graph_feat_size
search_space = {
    "num_layers": hp.choice("num_layers", [1, 2, 3, 4]),
    "layer_sizes": hp.choice("layer_sizes", [200, 400, 600, 800, 1000]),
    "dropout": hp.uniform("dropout", low=0.1, high=0.4),
    "learning_rate": hp.uniform("learning_rate", low=1e-5, high=1e-3),
}

In [None]:
def train_attentive_fp_model(save_dir, num_layers, learning_rate=0.001, dropout=0.1, graph_feat_size=200):
    """
    Trains one AttentiveFP classifier per seed and reports test-set auPR / auROC.

    For every seed in range(NUM_FOLDS) a fresh model is built, the global `dataset`
    is split 80/10/10 with DeepChem's ScaffoldSplitter, the model is fitted with a
    ValidationCallback on the validation split, restored from `model_dir`, and
    evaluated on the test split. Per-seed metrics are written to
    `save_dir + "results.txt"` and the averages are printed.

    Parameters:
    save_dir (str): Existing directory (with trailing "/") where results and
        per-seed model sub-directories are written.
    num_layers (int): Number of graph convolution layers in the AttentiveFP model.
    learning_rate (float, optional): Learning rate for model optimization. Default is 0.001.
    dropout (float, optional): Dropout probability for regularization. Default is 0.1.
    graph_feat_size (int, optional): Dimensionality of learned graph feature vectors. Default is 200.

    Returns:
    float: Average auPR across all seeds.

    Raises:
    FileExistsError: If a per-seed sub-directory already exists (os.mkdir).

    Notes:
    - Requires global variables: NUM_FOLDS, BATCH_SIZE, SAVE_INTERVAL, NB_EPOCH, and dataset.
    - Writes `results.txt` in save_dir and `<seed>/<seed>.txt` (validation log) per seed.
    - NOTE(review): ScaffoldSplitter may ignore `seed`, in which case all "folds"
      share the same split and differ only in model initialisation -- confirm.
    """
    metrics = [
        dc.metrics.Metric(dc.metrics.roc_auc_score),
        dc.metrics.Metric(dc.metrics.prc_auc_score),
    ]

    print("Saving results to " + save_dir)
    auprs = []
    aurocs = []
    # Context manager guarantees results.txt is flushed and closed even if a seed fails.
    with open(save_dir + "results.txt", "w") as results_file:
        for seed in range(NUM_FOLDS):
            print("Seed: " + str(seed))
            model_dir = save_dir + str(seed) + "/"
            os.mkdir(model_dir)

            splitter = dc.splits.ScaffoldSplitter()
            model = AttentiveFPModel(
                model_dir=model_dir,
                mode="classification",
                n_classes=2,
                n_tasks=1,
                batch_size=BATCH_SIZE,
                num_layers=num_layers,
                learning_rate=learning_rate,
                dropout=dropout,
                graph_feat_size=graph_feat_size,
            )
            train, valid, test = splitter.train_valid_test_split(
                dataset, frac_train=0.8, frac_valid=0.1, frac_test=0.1, seed=seed
            )

            # The callback scores the validation split every SAVE_INTERVAL steps, logs to
            # the per-seed text file and saves into model_dir. The log file is closed as
            # soon as fitting ends (it was previously left open).
            # NOTE(review): fit() also checkpoints into model_dir every SAVE_INTERVAL steps,
            # so the checkpoint restored below may not be the best-validation one -- confirm.
            with open(model_dir + str(seed) + ".txt", "w") as validation_log:
                vc = dc.models.ValidationCallback(
                    valid,
                    interval=SAVE_INTERVAL,
                    metrics=metrics,
                    save_dir=model_dir,
                    output_file=validation_log,
                )
                model.fit(train, nb_epoch=NB_EPOCH, checkpoint_interval=SAVE_INTERVAL, callbacks=[vc])

            # Evaluate the restored model on the test split
            model.restore(model_dir=model_dir)
            met = model.evaluate(test, metrics)
            aupr = met["prc_auc_score"]
            auroc = met["roc_auc_score"]

            # save results
            results_file.write("auPR on test set for seed " + str(seed) + ": " + str(aupr) + "\n")
            results_file.write("auROC on test set for seed " + str(seed) + ": " + str(auroc) + "\n")
            auprs.append(aupr)
            aurocs.append(auroc)

    avg_aupr = np.mean(auprs)
    print("Average auPR: " + str(avg_aupr))
    print("Average auROC: " + str(np.mean(aurocs)))
    return avg_aupr

In [None]:
# Objective function for hyperopt (returns negative auPR so it can be minimized)


def fm(args):
    """
    Hyperopt objective: trains AttentiveFP models for one hyperparameter combination.

    Builds a directory name from the hyperparameters, creates it under
    "../models/attentiveFP/hyperopt/", calls `train_attentive_fp_model`, and
    returns the negated average auPR so that hyperopt can minimize it.

    Parameters:
    args (dict): Dictionary of hyperparameters with keys:
        - "num_layers" (int): Number of graph convolution layers.
        - "learning_rate" (float): Learning rate for model training.
        - "dropout" (float): Dropout probability.
        - "layer_sizes" (int): Size of graph feature vectors (graph_feat_size).

    Returns:
    float: Negative of the auPR returned by `train_attentive_fp_model`.

    Raises:
    FileExistsError: If the directory for this combination already exists.

    Side Effects:
    - Creates a save directory (and any missing parents) for the combination.
    - Prints the directory name and auPR to stdout.
    """
    nl = args["num_layers"]
    lr = args["learning_rate"]
    dr = args["dropout"]
    ls = args["layer_sizes"]
    fold = "_".join(str(value) for value in (nl, lr, dr, ls)) + "/"
    save_dir = "../models/attentiveFP/hyperopt/" + fold
    # makedirs also creates the hyperopt/ parent on a fresh checkout; like mkdir it
    # still refuses to reuse an existing trial directory.
    os.makedirs(save_dir)
    pr_auc = train_attentive_fp_model(save_dir, num_layers=nl, learning_rate=lr, dropout=dr, graph_feat_size=ls)
    print(fold, pr_auc)
    return -1 * pr_auc  # need to give something to MINIMIZE

In [None]:
# Run the TPE search: OPT_TRIALS evaluations of fm over search_space, history kept in `trials`
trials = Trials()
best = fmin(
    fn=fm,
    space=search_space,
    algo=tpe.suggest,
    max_evals=OPT_TRIALS,
    trials=trials,
)

# Evaluate best model on 20% test set

In [None]:
def split_param_name(params):
    """
    Decodes a hyperparameter directory name back into its four values.

    The expected layout is
        "<num_layers>_<learning_rate>_<dropout>_<layer_size>"
    optionally followed by "/" (anything after the first "/" in the fourth
    field is ignored).

    Parameters:
    params (str): Underscore-separated hyperparameter string.

    Returns:
    tuple: (nl, lr, dr, ls) where
        - nl (int): Number of layers.
        - lr (float): Learning rate.
        - dr (float): Dropout probability.
        - ls (int): Graph feature vector size.
    """
    fields = params.split("_")
    # Convert in field order so a malformed name fails on its first bad field
    nl = int(fields[0])
    lr = float(fields[1])
    dr = float(fields[2])
    size_text, _, _ = fields[3].partition("/")
    ls = int(size_text)
    return (nl, lr, dr, ls)


# get best model: collect the per-seed metrics of every hyperopt trial and rank by mean auPR
path = "../models/attentiveFP/hyperopt/"
columns = ["name", "NL", "LR", "DR", "LS", "auPR", "auROC"]
rows = []
for name in glob.glob(path + "*/results.txt"):
    # The trial directory name encodes the hyperparameters
    clean_name = os.path.basename(os.path.dirname(name))
    auprs = []
    aurocs = []
    # Each line looks like "<metric> on test set for seed <n>: <value>"; read it directly
    # instead of parsing the printed repr of a pandas row.
    with open(name) as results_file:
        for line in results_file:
            label, sep, value = line.rpartition(":")
            if not sep:
                continue  # blank line or line without a metric value
            val = float(value)
            if "auPR" in label:
                auprs.append(val)
            else:
                aurocs.append(val)
    nl, lr, dr, ls = split_param_name(clean_name)
    rows.append(
        [
            clean_name,
            nl,
            lr,
            dr,
            ls,
            # A trial that crashed before writing any metric stays visible as NaN
            np.mean(auprs) if auprs else np.nan,
            np.mean(aurocs) if aurocs else np.nan,
        ]
    )
ho_df = pd.DataFrame(rows, columns=columns)
ho_df = ho_df.sort_values("auPR", ascending=False)
ho_df.to_csv(path + "all_ho_results.csv", index=False)
ho_df

Unnamed: 0,name,NL,LR,DR,LS,auPR,auROC
9,3_0.00016298505613644663_0.3254185003397073_1000,3,0.000163,0.325419,1000,0.299143,0.733583
5,4_0.00024127546611118732_0.1477493201861702_1000,4,0.000241,0.147749,1000,0.286854,0.73701
8,2_0.0006961750403834422_0.3813669623919579_200,2,0.000696,0.381367,200,0.279951,0.729365
2,4_0.0004397299825071773_0.35543274010899906_800,4,0.00044,0.355433,800,0.279225,0.724282
3,2_0.0006729163068408161_0.39667630722646385_200,2,0.000673,0.396676,200,0.276575,0.727486
6,2_0.00010317567681770247_0.19535478942063683_600,2,0.000103,0.195355,600,0.258253,0.717409
7,3_8.52739668146465e-05_0.13598456595554664_400,3,8.5e-05,0.135985,400,0.249204,0.732744
1,4_0.00012105200153024894_0.3450562161243306_400,4,0.000121,0.345056,400,0.241298,0.716436
0,4_0.0006288632466732576_0.2674342530096203_1000,4,0.000629,0.267434,1000,0.222929,0.65596
4,1_0.0008797286773333985_0.29080578548327574_1000,1,0.00088,0.290806,1000,0.180468,0.59618


In [37]:
# prep test set: same label column and featurizer settings, read from the TEST CSV
tasks = ["hit"]
test_input_file = "../data/TEST_03_19_2022.csv"

featurizer = dc.feat.MolGraphConvFeaturizer(use_edges=True)
loader = dc.data.CSVLoader(
    tasks=tasks,
    feature_field="SMILES",
    featurizer=featurizer,
)

# Featurize the file; the bare expression below displays the dataset summary in the notebook
held_out_test = loader.create_dataset(test_input_file)
held_out_test

<DiskDataset X.shape: (7713,), y.shape: (7713, 1), w.shape: (7713, 1), task_names: ['hit']>

In [41]:
# Directory name of the hyperparameter combination to evaluate
params = "3_0.00016298505613644663_0.3254185003397073_1000/"
metrics = [
    dc.metrics.Metric(dc.metrics.roc_auc_score),
    dc.metrics.Metric(dc.metrics.prc_auc_score),
]
preds = pd.DataFrame()

# The hyperparameters are the same for every seed, so decode them once
nl, lr, dr, ls = split_param_name(params)
for seed in range(NUM_FOLDS):
    correct_dir = f"{path}{params}{seed}/"
    model = AttentiveFPModel(
        model_dir=correct_dir,
        mode="classification",
        n_tasks=1,
        n_classes=2,
        num_layers=nl,
        learning_rate=lr,
        dropout=dr,
        graph_feat_size=ls,
    )
    model.restore(model_dir=correct_dir)

    # Metrics of this single seed's model on the held-out set
    met = model.evaluate(held_out_test, metrics)
    print(met)

    # Keep column 1 of the predictions for each seed so the seeds can be averaged below
    y_pred = model.predict(held_out_test)
    y_pred = y_pred[:, 1]
    preds[str(seed)] = list(y_pred)

# Ensemble prediction = row-wise mean over the per-seed columns
avg_preds = preds.mean(axis=1)
preds["mean"] = avg_preds
preds.to_csv(path + params + "predictions.csv", index=False)

# Score the averaged predictions against the held-out labels
y_true = held_out_test.y.flatten()
precision, recall, threshold = precision_recall_curve(y_true, avg_preds)
pr_auc = auc(recall, precision)
roc = roc_auc_score(y_true, avg_preds)

print("auPR on held-out 20% test set: " + str(pr_auc))
print("auROC on held-out 20% test set: " + str(roc))

{'roc_auc_score': 0.8536963639496812, 'prc_auc_score': 0.286347165353454}
{'roc_auc_score': 0.8611104462783192, 'prc_auc_score': 0.23837542752904364}
{'roc_auc_score': 0.8550949062107084, 'prc_auc_score': 0.2385725525123126}
auPR on held-out 20% test set: 0.2738796075580747
auROC on held-out 20% test set: 0.8635866824950058


# Train final model

In [42]:
# Reload `dataset` from the FULL CSV for the final model
tasks = ["hit"]
input_file = "../data/FULL_03_19_2022.csv"

featurizer = dc.feat.MolGraphConvFeaturizer(use_edges=True)
loader = dc.data.CSVLoader(
    tasks=tasks,
    feature_field="SMILES",
    featurizer=featurizer,
)

# Featurize the file; the bare expression below displays the dataset summary in the notebook
dataset = loader.create_dataset(input_file)
dataset

Failed to featurize datapoint 4247, Cl.[NaH]. Appending empty array
Exception message: tuple index out of range
  return np.asarray(features)
Failed to featurize datapoint 1543, Cl.[LiH]. Appending empty array
Exception message: tuple index out of range
  return np.asarray(features)
Failed to featurize datapoint 7461, I.[KH]. Appending empty array
Exception message: tuple index out of range
  return np.asarray(features)


<DiskDataset X.shape: (38677,), y.shape: (38677, 1), w.shape: (38677, 1), task_names: ['hit']>

In [None]:
# Train the final ensemble with the selected hyperparameters on the current `dataset`
save_dir = "../models/attentiveFP/FINAL_random_split_3_0.00016298505613644663_0.3254185003397073_1000/"
# makedirs creates missing parent directories; it still fails if save_dir already exists
os.makedirs(save_dir)
print("Saving results to " + save_dir)

NUM_FOLDS = 20
NUM_LAYERS = 3
LEARNING_RATE = 0.00016298505613644663
DROPOUT = 0.3254185003397073
GRAPH_FEAT_SIZE = 1000

# Train the model with best parameters
auprs = []
aurocs = []
metrics = [
    dc.metrics.Metric(dc.metrics.roc_auc_score),
    dc.metrics.Metric(dc.metrics.prc_auc_score),
]
for seed in range(NUM_FOLDS):
    print("Seed: " + str(seed))
    model_dir = save_dir + str(seed) + "/"
    os.mkdir(model_dir)

    # Random split here, unlike the scaffold split used during the hyperparameter search
    splitter = dc.splits.RandomSplitter()
    model = AttentiveFPModel(
        model_dir=model_dir,
        mode="classification",
        n_classes=2,
        n_tasks=1,
        batch_size=BATCH_SIZE,
        num_layers=NUM_LAYERS,
        learning_rate=LEARNING_RATE,
        dropout=DROPOUT,
        graph_feat_size=GRAPH_FEAT_SIZE,
    )
    train, valid, test = splitter.train_valid_test_split(
        dataset, frac_train=0.8, frac_valid=0.1, frac_test=0.1, seed=seed
    )

    # The callback scores the validation split every SAVE_INTERVAL steps, logs to the
    # per-seed text file and saves into model_dir. The log is closed once fitting ends
    # (it was previously left open).
    with open(model_dir + str(seed) + ".txt", "w") as validation_log:
        vc = dc.models.ValidationCallback(
            valid,
            interval=SAVE_INTERVAL,
            metrics=metrics,
            save_dir=model_dir,
            output_file=validation_log,
        )
        model.fit(train, nb_epoch=NB_EPOCH, checkpoint_interval=SAVE_INTERVAL, callbacks=[vc])

    # Evaluate the restored model on this seed's test split
    model.restore(model_dir=model_dir)
    met = model.evaluate(test, metrics)
    aupr = met["prc_auc_score"]
    auroc = met["roc_auc_score"]

    # report and collect per-seed results
    print("auPR on test set for seed " + str(seed) + ": " + str(aupr))
    print("auROC on test set for seed " + str(seed) + ": " + str(auroc))
    auprs.append(aupr)
    aurocs.append(auroc)

avg_aupr = np.mean(auprs)
print("Average auPR: " + str(avg_aupr))
avg_auroc = np.mean(aurocs)
# Fixed: this line previously printed avg_aupr under the auROC label
print("Average auROC: " + str(avg_auroc))

# Persist the per-seed metrics next to the models
results_df = pd.DataFrame()
results_df["auPR"] = auprs
results_df["auROC"] = aurocs
results_df.to_csv(save_dir + "summary_results.csv", index=False)
results_df