# Introduction to Neural Force Field

This Jupyter Notebook contains an introduction to the `nff` package. Here, we will load the modules and functions from `nff` to import a dataset, create dataloaders, create a model, train it and check the test stats. We will do most of it manually to illustrate the usage of the API. However, scripts such as the one provided in the `scripts/` folder already automate most of this process.

After the `nff` package has been installed, we start by importing all dependencies for this tutorial.

In [1]:
%load_ext autoreload
%autoreload 2

In [1]:
import sys
sys.path.insert(0, "..")
sys.path.insert(0, "/home/saxelrod/Repo/projects/covid_nff/NeuralForceField")
# sys.path.remove('/home/saxelrod/Repo/projects/ax_autopology/NeuralForceField')

import json
import os
import shutil
import numpy as np
import matplotlib.pyplot as plt

import torch
from torch.optim import Adam
from torch.utils.data import DataLoader

from nff.data import Dataset, split_train_validation_test, collate_dicts, to_tensor
from nff.train import Trainer, get_trainer, get_model, load_model, loss, hooks, metrics, evaluate

from nff.hyperparams.utils import add_morgan, trim_confs, make_class_model

from nff.data.features import add_features

It might also be useful setting the GPU you want to use:

In [2]:
# DEVICE = 1
# OUTDIR = './sandbox'
# model = load_model(OUTDIR)

In [3]:
DEVICE = 0
MAX_EPOCHS = 5000
OUTDIR = './sandbox'
FOLDER_NAME = 'backup'

if os.path.exists(OUTDIR):
    newpath = os.path.join(os.path.dirname(OUTDIR), FOLDER_NAME)
    if os.path.exists(newpath):
        shutil.rmtree(newpath)
        
    shutil.move(OUTDIR, newpath)

# Get best hyperparameters

In [4]:
CWD = "/home/saxelrod/data/sigopt/covid_schnet"
METRIC = "roc_auc"

def get_params(job_dir=CWD):
    dirs = os.listdir(job_dir)
    model_dirs = [dir for dir in dirs if dir.startswith("model_")]
    best_metric = -1
    for dir in model_dirs:
        path = os.path.join(job_dir, dir, "job_info.json")
        if not os.path.isfile(path):
            continue
        with open(path, "r") as f:
            info = json.load(f)
        metric = info[METRIC]
        if metric > best_metric:
            best_metric = metric
            best_path = path
    new_path = os.path.join(job_dir, "best_info.json")
    shutil.copy(best_path, new_path)
    
    with open(new_path, "r") as f:
        dic = json.load(f)
    params = dic["assignments"]
    params.update(dic["set_params"])
    return params
params = get_params()


##
params.update({"num_mol_layers": 2,
              "extra_features": ["morgan", "rdkit_2d_normalized"],
              "extra_feat_length": 2048 + 200})


# params.update({"num_mol_layers": 2,
#               "extra_features": ["morgan"],
#               "extra_feat_length": 2048})


# params.update({"num_mol_layers": 2,
#               "extra_features": ["rdkit_2d_normalized"],
#               "extra_feat_length": 200})

##

## Loading the relevant data

As we usually work with the database, we can pack their information in a class `Dataset`, which is a subclass of `torch.utils.data.Dataset`. It basically wraps information on the atomic numbers, energies, forces and SMILES strings for each one of the geometries. In this example, we already have a pre-compiled `Dataset` to be used. We start by loading this file and creating three slices of the original dataset

In [6]:
# dataset = Dataset.from_file('./data/covid_crest_50.pth.tar')

# dataset = Dataset.from_file('./data/covid_crest.pth.tar')

# dataset = Dataset.from_file('./data/covid_crest_1.pth.tar')




# dataset = Dataset.from_file('./data/covid_crest.pth.tar')
# dataset = Dataset.from_file('./data/covid_mmff94.pth.tar')
# dataset = Dataset.from_file('./data/covid_mmff94_1_geom.pth.tar')



In [7]:
import pdb

def separate_datasets(dataset, split_ratio):

    bind_indices = torch.LongTensor([i  for i, bind in enumerate(dataset.props['bind']) if bind])
    remaining_indices = [i for i in range(len(dataset)) if i not in bind_indices]

    fail_dataset = dataset.copy()
    for key, val in fail_dataset.props.items():
        fail_dataset.props[key] = [val[i] for i in remaining_indices]
    return dataset, fail_dataset, bind_indices

def get_split_bind_indices(bind_indices, split_ratio):
    num_bind = len(bind_indices)
    bind_per_split = (split_ratio * num_bind).astype('int')
    while True:
        for i in range(3):
            if sum(bind_per_split) == num_bind:
                break
            bind_per_split[i] += 1
        if sum(bind_per_split) == num_bind:
                break

    bind_per_split = bind_per_split.tolist()
    split_bind_indices = torch.split(bind_indices, bind_per_split)
    return split_bind_indices

def make_bind_datasets(split_bind_indices, dataset):
    
    datasets = []
    for indices in split_bind_indices:
        new_set = dataset.copy()
        for key, val in dataset.props.items():
            new_set.props[key] = to_tensor([val[i] for i in indices])
        datasets.append(new_set)
    return tuple(datasets)
    

def split_data(dataset, split_ratio):
    dataset, fail_dataset, bind_indices = separate_datasets(dataset, split_ratio)
    split_bind_indices =  get_split_bind_indices(bind_indices, split_ratio)
    bind_datasets = make_bind_datasets(split_bind_indices, dataset)
    
    train, val, test = split_train_validation_test(fail_dataset, val_size=split_ratio[1],
                                                   test_size=split_ratio[2])
    split_sets = [train, val, test]
    
    for i in range(3):
        split_set = split_sets[i]
        bind_set = bind_datasets[i]
        
        for key, value in bind_set.props.items():
            if type(value) is list:
                split_set.props[key] += value
            else:
                split_set.props[key] = torch.cat((split_set.props[key], value))
    
    return train, val, test

        

Code for making the initial split

In [8]:
# # # # split_ratio = np.array([0.6, 0.2, 0.2])

# # # for crest based on its size -- not fully right yet
# split_ratio = np.array([0.6, 0.2, 0.2])
# datasets = split_data(dataset, split_ratio)
# train, val, test = datasets


Loading the same splits used before for this run

In [9]:
# train_name = "train_spec_crest.json"
# val_name = "val_spec_crest.json"
# test_name = "test_spec_crest.json"

# # train_name = "train_spec.json"
# # val_name = "val_spec.json"
# # test_name = "test_spec.json"

# file_names = [train_name, val_name, test_name]
# spec_list = []
# for name in file_names:
    
#     with open(name, 'r') as f:
#         spec_ids = json.load(f)
#     spec_list.append(spec_ids)

# datasets = []
# for spec_ids in spec_list:
#     idx = [i for i, spec_id in enumerate(dataset.props['spec_id']
#             ) if spec_id in spec_ids]
#     print("Found idx")
#     new_dataset = dataset.copy()
#     for key, val in dataset.props.items():
#         new_dataset.props[key] = [val[i] for i in range(len(val)
#             ) if i in idx]
#     datasets.append(new_dataset)
    
# train, val, test = datasets



# # datasets = (,)
# # for name in file_names:
# #     if os.path.isfile(name):
# #         datasets.append(Dataset.from_file(name))
        
# # if len(datasets) != 3:
# #     split_ratio = np.array([0.6, 0.2, 0.2])
# #     datasets = split_data(dataset, split_ratio)
# #     for dataset, name in zip(datasets, file_names):
# #         dataset.save(name)

        
# # train, val, test = datasets




In [None]:
main_path = "/home/saxelrod/data/sigopt/covid_schnet"
train = Dataset.from_file(os.path.join(main_path, "train.pth.tar"))
test = Dataset.from_file(os.path.join(main_path, "test.pth.tar"))
val = Dataset.from_file(os.path.join(main_path, "val.pth.tar"))


# add_morgan(train, params["morgan_length"])
# add_morgan(test, params["morgan_length"])
# add_morgan(val, params["morgan_length"])

names = ["train_new.pth.tar", "val_new.pth.tar", "test_new.pth.tar"]
full_paths = [os.path.join(main_path, name) for name in names]
if all([os.path.isfile(path) for path in full_paths]):
    train, val, test = [Dataset.from_file(path) for path in full_paths]
# else:
l
ll
feature_names = params["extra_features"]
feat_paths = ["{}.npz".format(name) for name in feature_names]
d_sets = [train, val, test]
for d_set, name in zip(d_sets, names):
    data_path = "/tmp/data_path"
    dataset_path = os.path.join(main_path, name)
    add_features(dataset=d_set,
                 data_path=data_path,
                 feature_names=feature_names,
                 dataset_path=dataset_path,
                 feat_paths=feat_paths)

trim_confs(train, params["num_confs"])
trim_confs(test, params["num_confs"])
trim_confs(val, params["num_confs"])

100%|██████████| 3224/3224 [00:00<00:00, 3372.88it/s]


> /home/saxelrod/Repo/projects/covid_nff/NeuralForceField/nff/data/features/rdkit_feat.py(37)add_features()
-> for feature_name, feat_path in zip(feature_names, feat_paths):
(Pdb) l
 32  	    make_csv(dataset=dataset, data_path=data_path)
 33  	    data = get_data(path=data_path)
 34  	
 35  	    pdb.set_trace()
 36  	
 37  ->	    for feature_name, feat_path in zip(feature_names, feat_paths):
 38  	
 39  	        if os.path.isfile(feat_path):
 40  	            features = load_features(feat_path)
 41  	            features = [item.tolist() for item in features]
 42  	        else:
(Pdb) data
<chemprop.data.data.MoleculeDataset object at 0x7fbfaf1d0cc0>
(Pdb) dir(data)
['__add__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '_

100%|██████████| 1075/1075 [00:00<00:00, 2665.94it/s]


> /home/saxelrod/Repo/projects/covid_nff/NeuralForceField/nff/data/features/rdkit_feat.py(37)add_features()
-> for feature_name, feat_path in zip(feature_names, feat_paths):
(Pdb) data.features()
(Pdb) l
 32  	    make_csv(dataset=dataset, data_path=data_path)
 33  	    data = get_data(path=data_path)
 34  	
 35  	    pdb.set_trace()
 36  	
 37  ->	    for feature_name, feat_path in zip(feature_names, feat_paths):
 38  	
 39  	        if os.path.isfile(feat_path):
 40  	            features = load_features(feat_path)
 41  	            features = [item.tolist() for item in features]
 42  	        else:
(Pdb) features
*** NameError: name 'features' is not defined
(Pdb) n
> /home/saxelrod/Repo/projects/covid_nff/NeuralForceField/nff/data/features/rdkit_feat.py(39)add_features()
-> if os.path.isfile(feat_path):
(Pdb) n
> /home/saxelrod/Repo/projects/covid_nff/NeuralForceField/nff/data/features/rdkit_feat.py(40)add_features()
-> features = load_features(feat_path)
(Pdb) n
> /home/saxelrod/R

(Pdb) dir(data)
['__add__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'args', 'compound_names', 'data', 'features', 'features_size', 'mols', 'normalize_features', 'num_tasks', 'scaler', 'set_targets', 'shuffle', 'smiles', 'sort', 'targets']
(Pdb) data. features_size
<bound method MoleculeDataset.features_size of <chemprop.data.data.MoleculeDataset object at 0x7fbf9eba4e80>>
(Pdb) data. features_size
<bound method MoleculeDataset.features_size of <chemprop.data.data.MoleculeDataset object at 0x7fbf9eba4e80>>
(Pdb) data.features_size
<bound method MoleculeDataset.features_size of <chemprop.data.data.MoleculeDataset object at 0x7fbf9eba4e80>>
(Pdb) data.features_size

In [None]:
print(len(train))
print(len(test))
print(len(val))

In [None]:
val.props["rdkit_2d_normalized"][0]

The `nff` code interfaces with the `graphbuilder` module through a git submodule in the repository. `graphbuilder` provides methods to create batches of graphs. In `nff`, we interface that through a custom dataloader called `
GraphLoader`. Here, we create one loader for each one of the slices.

In [None]:
train_loader = DataLoader(train, batch_size=1, collate_fn=collate_dicts)
val_loader = DataLoader(val, batch_size=1, collate_fn=collate_dicts)
test_loader = DataLoader(test, batch_size=1, collate_fn=collate_dicts)

Number of positive binders in train, validation, and test sets:

In [None]:
# print(np.sum(train.props['bind']))
# print(np.sum(val.props['bind']))
# print(np.sum(test.props['bind']))



## Creating a model

`nff` is based on SchNet. It parameterizes interatomic interactions in molecules and materials through a series of convolution layers with continuous filters. Here, we are going to create a simple model using the hyperparameters given on `params`:

In [None]:
model = make_class_model("WeightedConformers", params)

## Creating a trainer

To train our model with the data provided, we have to create a loss function. The easiest way to do that is through the `build_mse_loss` builder. Its argument `rho` is a parameter that will multiply the mean square error (MSE) of the force components before summing it with the MSE of the energy.

In [None]:
loss_fn = loss.build_cross_entropy_loss(loss_coef={'bind': 1.0})
# loss_fn = loss.build_mse_loss(loss_coef={'bind': 1.0})

We should also select an optimizer for our recently created model:

In [None]:
trainable_params = filter(lambda p: p.requires_grad, model.parameters())
optimizer = Adam(trainable_params, lr=1e-5)

### Metrics and hooks

Metrics and hooks allow the customization of the training process. Instead of tweaking directly the code or having to resort to countless flags, we can create submodules (or add-ons) to monitor the progress of the training or customize it.

If we want to monitor the progress of our training, say by looking at the mean absolute error (MAE) of energies and forces, we can simply create metrics to observe them:

In [None]:
train_metrics = [
    metrics.PrAuc('bind'),
    metrics.RocAuc('bind'),

]

Furthermore, if we want to customize how our training procedure is done, we can use hooks which can interrupt or change the train automatically.

In our case, we are adding hooks to:
* Stop the training procedure after 100 epochs;
* Log the training on a machine-readable CSV file under the directory `./sandbox`;
* Print the progress on the screen with custom formatting; and
* Setup a scheduler for the learning rate.

In [None]:
train_hooks = [
    hooks.MaxEpochHook(MAX_EPOCHS),
    hooks.CSVHook(
        OUTDIR,
        metrics=train_metrics,
    ),
    hooks.PrintingHook(
        OUTDIR,
        metrics=train_metrics,
        separator = ' | ',
        time_strf='%M:%S'
    ),
    hooks.ReduceLROnPlateauHook(
        optimizer=optimizer,
        patience=30,
        factor=0.5,
        min_lr=1e-7,
        window_length=1,
        stop_after_min=True
    )
]

### Trainer wrapper

A `Trainer` in the `nff` package is a wrapper to train a model. It automatically creates checkpoints, as well as trains and validates a given model. It also allow further training by loading checkpoints from existing paths, making the training procedure more flexible. Its functionalities can be extended by the hooks we created above. To create a trainer, we have to execute the following command:

In [None]:
T = Trainer(
    model_path=OUTDIR,
    model=model,
    loss_fn=loss_fn,
    optimizer=optimizer,
    train_loader=train_loader,
    validation_loader=val_loader,
    checkpoint_interval=1,
    hooks=train_hooks,
    mini_batches=3
)

Now we can finally train the model using the method `train` from the `Trainer`:

In [None]:
import pdb

T.train(device=DEVICE, n_epochs=MAX_EPOCHS)


Evaluate on the test set

In [None]:
from sklearn.metrics import auc

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve

results, targets, train_loss = evaluate(T.get_best_model(), test_loader, loss_fn, device=DEVICE)

bind_res = torch.cat(results['bind']).reshape(-1)
bind_targ = torch.cat(targets['bind']).reshape(-1)

print(roc_auc_score(y_true=bind_targ, y_score=bind_res))

roc = roc_curve(y_true=bind_targ, y_score=bind_res)

In [None]:
from sklearn.metrics import precision_recall_curve
precision, recall, thresholds = precision_recall_curve(y_true=bind_targ, probas_pred=bind_res)
pr_auc  = auc(recall, precision)

print(pr_auc)
# plt.plot(pcr[0], pcr[1])
# plt.show()

In [None]:
plt.plot(roc[0], roc[1])
plt.plot(roc[0], roc[0], '--')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.show()



In [None]:
# def true_positives(y, yp):
#     actual = y.detach().cpu().numpy().round().reshape(-1)
#     pred = yp.detach().cpu().numpy().round().reshape(-1)

#     all_positives = [i for i, item in enumerate(pred) if item == 1]
#     true_positives = [i for i in all_positives if pred[i] == actual[i]]

#     # number of predicted negatives
#     num_pred = len(all_positives)
#     num_pred_correct = len(true_positives)
    
#     ratio = num_pred_correct / num_pred

#     return ratio

# def true_negatives(y, yp):
#     actual = y.detach().cpu().numpy().round().reshape(-1)
#     pred = yp.detach().cpu().numpy().round().reshape(-1)

#     all_negatives = [i for i, item in enumerate(pred) if item == 0]
#     true_negatives = [i for i in all_negatives if pred[i] == actual[i]]

#     # number of predicted negatives
#     num_pred = len(all_negatives)
#     num_pred_correct = len(true_negatives)
    
#     ratio = num_pred_correct / num_pred

#     return ratio



In [None]:
# y = bind_targ
# yp = bind_res
# pos_rate = true_positives(y, yp)
# neg_rate = true_negatives(y, yp)

In [None]:
# pos_rate

In [None]:
# neg_rate

## Move model, save datasets

In [None]:
# NEW_FOLDER = "ten_confs_four_convolutions"
# NEW_FOLDER = "one_conf_four_convolutions"
# NEW_FOLDER = "ten_confs_four_convs_boltz_nn"

# NEW_FOLDER = "ten_confs_crest_four_convs_boltz_nn"
# NEW_FOLDER = "ten_confs_four_convs_crest"

# NEW_FOLDER = "ten_confs_four_convs_mmff_no_boltz_nn"

# NEW_FOLDER = "ten_confs_four_convs_mmff_with_boltz_nn"

NEW_FOLDER = "ten_confs_four_convs_crest_no_boltz_nn"

# NEW_FOLDER = "fifty_confs_four_convs_crest_no_boltz_nn"


# NEW_FOLDER = "one_conf_four_convs_crest_no_boltz_nn"




if not os.path.isdir(NEW_FOLDER):
    os.makedirs(NEW_FOLDER)

files = os.listdir(NEW_FOLDER)

if not files:
    train.save("{}/train.pth.tar".format(NEW_FOLDER))
    test.save("{}/test.pth.tar".format(NEW_FOLDER))
    val.save("{}/val.pth.tar".format(NEW_FOLDER))


    OUTDIR = './sandbox'
    files = os.listdir(OUTDIR)
    for file in files:
        old_path = os.path.join(OUTDIR, file)
        new_path = os.path.join(NEW_FOLDER, file)
        shutil.move(old_path, new_path)

    


In [None]:
# import json

# test_spec_ids = test.props['spec_id'].numpy().tolist()
# train_spec_ids = train.props['spec_id'].numpy().tolist()
# val_spec_ids = val.props['spec_id'].numpy().tolist()

# with open("test_spec_crest.json", "w") as f:
#     json.dump(test_spec_ids, f)
    
# with open("train_spec_crest.json", "w") as f:
#     json.dump(train_spec_ids, f)
    
# with open("val_spec_crest.json", "w") as f:
#     json.dump(val_spec_ids, f)