# Creating training and benchmarking datasets
This notebook will guide you through the process of creating the quantum chemical (QC) training and benchmarking datasets used in the development of the transferable double exponential force field (DEFF).

## Quantum chemical

To create the valence fitting dataset we follow the process used to create the original Sage fitting dataset, but add an extra filtering stage to remove molecules containing vdW types that are not refit by the physical property training set.

### Warning: QCArchive datasets are not static!
Scripts that filter QCArchive datasets are not always reproducible, as calculations may not have finished or the dataset may later be expanded. We therefore include OpenFF-QCSubmit datasets which contain the record IDs and serve as a static, reproducible dataset. This notebook just shows how the dataset was first built.

In [None]:
import copy
import json
import logging
from collections import defaultdict
from multiprocessing import Pool
from tempfile import NamedTemporaryFile
from pathlib import Path
from openff.qcsubmit.results import (
    OptimizationResultCollection, TorsionDriveResultCollection,
)
from openff.qcsubmit.results.filters import (
    ConformerRMSDFilter, ConnectivityFilter,
    ElementFilter,
    HydrogenBondFilter,
    RecordStatusFilter,
    ResultRecordFilter,
    CMILESResultFilter,
    SMILESFilter
)
import qcelemental
from openeye import oechem
from openff.toolkit.topology import Molecule
from openff.toolkit.topology.molecule import SmilesParsingError
from openff.toolkit.typing.engines.smirnoff import ForceField
from openff.toolkit.utils import UndefinedStereochemistryError
from qcportal import FractalClient
from qcportal.models import TorsionDriveRecord
from qcportal.models.records import RecordStatusEnum
from tqdm import tqdm

In [None]:
class UndefinedStereoFilter(ResultRecordFilter):
    """
    OpenFF-QCSubmit style result filter which rejects entries whose molecule
    has missing stereochemistry information.

    The check round-trips the first conformer through a temporary SDF file and
    reports failure when reading it back raises ``UndefinedStereochemistryError``.
    """

    def _filter_function(self, result, record, molecule) -> bool:
        # Work on a copy holding only the first conformer so the molecule
        # passed in by the caller is left untouched.
        single_conformer = copy.deepcopy(molecule)
        single_conformer._conformers = [single_conformer.conformers[0]]

        try:
            with NamedTemporaryFile(suffix=".sdf") as sdf_file:
                single_conformer.to_file(sdf_file.name, "SDF")
                single_conformer.from_file(sdf_file.name)
        except UndefinedStereochemistryError:
            return False

        return True

In [None]:
def label_ids(record_and_molecule, force_field, parameter_types):
    """
    Return the ids of the force field parameters applied to a molecule.

    ``record_and_molecule`` is a ``(record, molecule)`` pair. The molecule is
    labelled with ``force_field`` and the ids of every parameter found under
    the handlers named in ``parameter_types`` (bonds, angles, torsions ...) are
    collected. For torsion drive records only parameters whose two central
    atoms match the central atoms of the first driven dihedral are counted.

    The result is a list of unique parameter ids.
    """
    record, molecule = record_and_molecule

    applied_labels = force_field.label_molecules(molecule.to_topology())[0]

    exercised_ids = set()

    for handler_name in parameter_types:
        for atom_indices, parameter in applied_labels[handler_name].items():

            if isinstance(record, TorsionDriveRecord):
                driven_bond = set(record.keywords.dihedrals[0][1:3])
                # Skip torsions which do not sit on the scanned central bond.
                if set(atom_indices[1:3]) != driven_bond:
                    continue

            exercised_ids.add(parameter.id)

    return list(exercised_ids)

In [None]:
def select_parameters(training_set, parameter_types, output_path, force_field):
    """Write the SMIRKS of the parameters which should be optimised to a JSON file.

    Every record in ``training_set`` is labelled with ``force_field`` and the
    number of records exercising each parameter id is counted. A parameter is
    selected when it is exercised by at least 5 records and can be found in the
    handler of the given parameter type. The output maps each parameter type to
    its list of selected SMIRKS patterns.
    """

    # Count how many records exercise each parameter id.
    usage = defaultdict(int)

    progress = tqdm(training_set.to_records(), total=training_set.n_results)

    for entry in progress:
        exercised = label_ids(
            record_and_molecule=entry,
            force_field=force_field,
            parameter_types=parameter_types,
        )
        for parameter_id in exercised:
            usage[parameter_id] += 1

    # Save out the SMIRKS which should be trained against this set.
    with open(output_path, "w") as file:

        chosen = {}

        for parameter_type in parameter_types:
            for parameter_id, n_records in usage.items():

                matches = force_field.get_parameter_handler(
                    parameter_type
                ).get_parameter({"id": parameter_id})

                if n_records >= 5 and len(matches) > 0:
                    chosen.setdefault(parameter_type, []).append(matches[0].smirks)

        json.dump(chosen, file)

Set up a set of common filters which will be applied to both the torsiondrive and optimisation valence training sets. Here we use Sage to work out which SMIRKS types to optimise, as they are the same in the DEFF.

In [None]:
# log at INFO level and make sure the output directory exists
logging.basicConfig(level=logging.INFO)
Path("./quantum-chemical-data-sets").mkdir(parents=True, exist_ok=True)
# use Sage as it has the same SMIRKS types as DEFF
initial_forcefield = ForceField('openff_unconstrained-2.0.0.offxml', load_plugins=True, allow_cosmetic_attributes=True)
# set up the common filters, these are shared by the torsion drive and optimisation sets
default_filters = [
    # only keep complete records
    RecordStatusFilter(status=RecordStatusEnum.complete),
    # keep records with no connectivity changes (tautomers of the input molecule)
    ConnectivityFilter(tolerance=1.2),
    # remove molecules with missing stereochemistry
    UndefinedStereoFilter(),
    ElementFilter(
        # The elements supported by SMIRNOFF
        allowed_elements=["H", "C", "N", "O", "Cl", "Br"]
    ),
]

We use OpenFF-QCSubmit to build a single torsiondrive collection from many torsion datasets stored on the public QCArchive instance. 

In [None]:
# Pull down the torsion drive datasets from the QCArchive server and combine them
# into a single collection. No filtering happens here; records which have not
# completed or which contain intra-molecular h-bonds are removed in later cells.
client = FractalClient()

torsion_set = TorsionDriveResultCollection.from_server(
    client=client,
    datasets=[
        "OpenFF Gen 2 Torsion Set 1 Roche 2",
        "OpenFF Gen 2 Torsion Set 2 Coverage 2",
        "OpenFF Gen 2 Torsion Set 3 Pfizer Discrepancy 2",
        "OpenFF Gen 2 Torsion Set 4 eMolecules Discrepancy 2",
        "OpenFF Gen 2 Torsion Set 5 Bayer 2",
        "OpenFF Gen 2 Torsion Set 6 supplemental 2",
        "OpenFF Amide Torsion Set v1.0"
    ],
    spec_name="default",
)

In [None]:
# Discard the torsion drives whose optimization histories are inconsistent or
# which cause failures in ForceBalance.
torsion_set.entries[client.address] = [
    entry
    for entry in torsion_set.entries[client.address]
    if entry.record_id not in {"6098580", "2703504", "2703505", "18045478"}
]

With this single dataset, we can now apply some OpenFF-QCSubmit filters to curate it. Here we remove any torsiondrives with internal hydrogen bonds and apply the common filters set up earlier.

In [None]:
# apply the h-bond filter (baker-hubbard method) followed by all of the common filters
torsion_set = torsion_set.filter(
    HydrogenBondFilter(method="baker-hubbard"), *default_filters
)

We can then save the final torsion drive collection to a JSON file. This is a static dataset which contains just the record IDs of the entries left after filtering; the collection can later be loaded from the file using the `TorsionDriveResultCollection.parse_file` method.

In [None]:
# write the filtered torsion drive collection to disk as JSON
Path("quantum-chemical-data-sets/reduced-set-td-set.json").write_text(
    torsion_set.json()
)

We can now analyse the final dataset to establish which proper torsion SMIRKS should be optimised.

In [None]:
# select the proper torsion SMIRKS with enough coverage in the torsion set to be refit
select_parameters(
    torsion_set,
    parameter_types=["ProperTorsions"],
    output_path="quantum-chemical-data-sets/reduced-set-proper-torsions-params-smirks.json",
    force_field=initial_forcefield,
)

Repeat the process on the optimisation datasets.

In [None]:
# Build the optimisation collection from the Gen 2 optimisation datasets.
# Reuse the `client` created for the torsion drive download rather than opening
# a second connection, so both collections come from the same server instance
# and are keyed by the same `client.address`.
optimization_set = OptimizationResultCollection.from_server(
    client=client,
    datasets=[
        "OpenFF Gen 2 Opt Set 1 Roche",
        "OpenFF Gen 2 Opt Set 2 Coverage",
        "OpenFF Gen 2 Opt Set 3 Pfizer Discrepancy",
        "OpenFF Gen 2 Opt Set 4 eMolecules Discrepancy",
        "OpenFF Gen 2 Opt Set 5 Bayer",
    ],
    spec_name="default",
)

In [None]:
# filter redundant similar conformers (ConformerRMSDFilter with max_conformers=10)
# and then apply the common filters
optimization_set = optimization_set.filter(
    ConformerRMSDFilter(max_conformers=10),
    *default_filters,

)

In [None]:
# write the filtered optimisation collection to disk as JSON
Path("quantum-chemical-data-sets/reduced-set-opt-set.json").write_text(
    optimization_set.json()
)

In [None]:
# work out which angle SMIRKS should be refit: those exercised by enough
# optimisation records are written to the JSON file below
select_parameters(
    optimization_set,
    parameter_types=["Angles"],
    output_path="quantum-chemical-data-sets/reduced-set-angles-params-smirks.json",
    force_field=initial_forcefield,
)

In [None]:
# work out which bond SMIRKS should be refit: those exercised by enough
# optimisation records are written to the JSON file below
select_parameters(
    optimization_set,
    parameter_types=["Bonds"],
    output_path="quantum-chemical-data-sets/reduced-set-bonds-params-smirks.json",
    force_field=initial_forcefield,
)

# Quantum Chemical Benchmarks

We will now construct the valence benchmark dataset using OpenFF-QCSubmit. This is a subset of the benchmark dataset used in the development of the Sage force field, with molecules containing un-optimised vdW types removed.

In [None]:
import warnings
from openff.toolkit.utils import (
    GLOBAL_TOOLKIT_REGISTRY,
    OpenEyeToolkitWrapper,
    UndefinedStereochemistryError,RDKitToolkitWrapper
)

# Silence all Python warnings and anything below ERROR from the toolkit logger.
warnings.filterwarnings("ignore")
logging.getLogger("openff.toolkit").setLevel(logging.ERROR)

# Deregister every toolkit wrapper except OpenEye so that only OE is used
# consistently in this script.
for toolkit in GLOBAL_TOOLKIT_REGISTRY.registered_toolkits:
    if not isinstance(toolkit, OpenEyeToolkitWrapper):
        GLOBAL_TOOLKIT_REGISTRY.deregister_toolkit(toolkit)

Load the dataset from our cached OpenFF-QCSubmit collection as downloading the dataset can take a long time.

In [None]:
# Read the raw benchmark collection from the cached JSON file rather than the server.
result_collection = OptimizationResultCollection.parse_file(
    "quantum-chemical-data-sets/openff-industry-benchmark-v1.1-raw.json"
)

Manually remove some molecules which have implicit hydrogens in the SMILES, which stops us from building the molecule with the correct atom ordering.

In [None]:
# Indices (ascending) of the entries whose CMILES contains an implicit-hydrogen
# charged nitrogen pattern.
ind_to_del = [
    i
    for i, item in enumerate(result_collection.entries['https://api.qcarchive.molssi.org:443/'])
    if any(pattern in item.cmiles for pattern in ('NH+', 'nH+'))
]

# Delete from the back so the remaining indices stay valid.
for ind in reversed(ind_to_del):
    print('deleting implicit hydrogen entry: ', ind)
    del result_collection.entries['https://api.qcarchive.molssi.org:443/'][ind]

Apply filters and check the final stereochemistry of each optimised molecule, then try to build an OpenMM system to make sure we can parameterise each entry, including an AM1 charge calculation.

In [None]:
class InvalidCMILESFilter(CMILESResultFilter):
    """Custom invalid SMILES filter which removes any molecules we missed.

    An entry is rejected when building a molecule from its mapped CMILES
    raises ``ValueError`` or ``SmilesParsingError``.
    """

    def _filter_function(self, result) -> bool:
        try:
            Molecule.from_mapped_smiles(result.cmiles, allow_undefined_stereo=True)
        except (ValueError, SmilesParsingError):
            parsed = False
        else:
            parsed = True

        return parsed

# Curate the benchmark collection with the element, CMILES validity, record
# status and connectivity filters.
result_collection = result_collection.filter(
        ElementFilter(
            # The elements supported by SMIRNOFF
            allowed_elements=["H", "C", "N", "O", "Cl", "Br"]
        ),
        InvalidCMILESFilter(),
        RecordStatusFilter(status=RecordStatusEnum.complete),
        ConnectivityFilter(),
    )

In [None]:
# Process the final molecules via OpenEye and check the stereochemistry,
# collecting the unique isomeric SMILES of every molecule whose stereo can be
# perceived from its final conformer.
#
# The molecules are extracted with a comprehension rather than
# ``_, molecules = zip(*records)`` so that an empty collection gives an empty
# list instead of raising ``ValueError`` on unpacking.
molecules = [molecule for _, molecule in result_collection.to_records()]

unique_molecules = set()

for molecule in molecules:

    # Re-perceive the stereochemistry from the final conformer.
    oe_molecule = molecule.to_openeye()
    oechem.OE3DToInternalStereo(oe_molecule)

    try:
        molecule = Molecule.from_openeye(oe_molecule)
    except UndefinedStereochemistryError:
        # `molecule` still refers to the original here as the assignment failed.
        print(f"skipping {molecule.to_smiles()} - un-perceivable stereo")
        continue

    unique_molecules.add(molecule.to_smiles(isomeric=True, mapped=False))

In [None]:
def _can_parameterize(smiles: str):
    """For each molecule try and build and openMM system to ensure we can parameterize the molecule, this includes an AM1 charge calculation."""

    try:

        for toolkit in GLOBAL_TOOLKIT_REGISTRY.registered_toolkits:

            if isinstance(toolkit, OpenEyeToolkitWrapper):
                continue

            GLOBAL_TOOLKIT_REGISTRY.deregister_toolkit(toolkit)

        molecule = Molecule.from_smiles(smiles, allow_undefined_stereo=True)
        force_field = ForceField("openff-2.0.0.offxml")

        force_field.create_openmm_system(molecule.to_topology())

    except:
        return smiles, False

    return smiles, True

In [None]:
# Keep only the SMILES for which a system could be built.
filtered_smiles = {
    checked_smiles
    for checked_smiles, should_retain in (
        _can_parameterize(smiles=candidate)
        for candidate in tqdm(unique_molecules, desc="Parameterising molecules", ncols=80)
    )
    if should_retain
}
        

Only keep molecules for which systems could be made

In [None]:
# Restrict the collection to the SMILES which passed the parameterisation check.
result_collection = result_collection.filter(
    SMILESFilter(smiles_to_include=[*filtered_smiles]),
)

In [None]:
def _process_molecule(record_and_molecule) -> oechem.OEMol:
    """Build a tagged OE molecule from a ``(record, molecule)`` pair.

    The molecule is converted to OpenEye, its stereochemistry is set from the
    3D coordinates, and SD tags holding the SMILES, the final energy (converted
    with ``hartree2kcalmol``) and the record id are attached."""

    record, molecule = record_and_molecule

    oe_molecule = molecule.to_openeye()
    oechem.OE3DToInternalStereo(oe_molecule)

    energy_kcal_mol = record.get_final_energy() * qcelemental.constants.hartree2kcalmol

    # SD tag name -> value, written in this order.
    sd_tags = {
        "SMILES QCArchive": molecule.to_smiles(),
        "Energy QCArchive": str(energy_kcal_mol),
        "Record QCArchive": str(record.id),
    }

    for tag_name, tag_value in sd_tags.items():
        oechem.OESetSDData(oe_molecule, tag_name, tag_value)

    return oe_molecule

Group the molecule conformations and write out SDF files to `quantum-chemical-data-sets/01-processed-qm-sdf`. Note that a `tar` of these files has been included to save space.
We also filter the OpenFF-QCSubmit dataset to contain the same records and save it as `quantum-chemical-data-sets/01-processed-qm.json`.

In [None]:
records_and_molecules = result_collection.to_records()

# Bucket the (record, molecule) pairs by the non-isomeric, explicit-hydrogen
# SMILES of the canonically ordered molecule.
grouped_molecules = defaultdict(list)

for record, molecule in records_and_molecules:
    molecule = molecule.canonical_order_atoms()
    smiles = molecule.to_smiles(isomeric=False, explicit_hydrogens=True)
    grouped_molecules[smiles].append((record, molecule))

# NOTE(review): `grouped_molecules` is not read in the code visible here; the
# tagging below runs over the original, un-reordered records - confirm this is
# intended.
processed_oe_molecules = list(map(_process_molecule, records_and_molecules))

In [None]:
# Write every processed molecule to a single SDF file, recording which QCArchive
# record ids were written so the collection can be reduced to the same records.
output_steam = oechem.oemolostream("quantum-chemical-data-sets/01-processed-qm.sdf")

final_record_ids = set()

try:
    for i, oe_molecule in enumerate(processed_oe_molecules):

        final_record_ids.add(oechem.OEGetSDData(oe_molecule, "Record QCArchive"))

        oe_molecule.SetTitle(f"full_{i + 1}")
        oechem.OEWriteConstMolecule(output_steam, oe_molecule)
finally:
    # Always release the stream, even if writing a molecule fails.
    output_steam.close()

# Filter the entries under every server address held by the collection. This
# avoids depending on a `client` object created in a much earlier section; this
# collection was loaded from a file, not from that client.
for address in list(result_collection.entries):
    result_collection.entries[address] = [
        entry
        for entry in result_collection.entries[address]
        if entry.record_id in final_record_ids
    ]

with open("quantum-chemical-data-sets/01-processed-qm.json", "w") as file:
    file.write(result_collection.json())