# For more details on the data curation and fitting process, please check out these links:
https://openforcefield.org/community/news/science-updates/ff-training-example-2021-07-01/

https://pubs.acs.org/doi/10.1021/acs.jctc.3c00039

Disclaimer: QCArchive is a living repository of data, and some of the datasets used here may have contained unfinished calculations that were completed later, so the exact set of records retrieved may differ between runs. To circumvent this, all the record ids used in the fitting were stored in JSON files for reproducibility. If the downloaded records do not match, please use those files when generating the ForceBalance inputs or re-running the fits.


In [None]:
import copy
import functools
import json
import logging
import os.path

from collections import defaultdict
from multiprocessing import Pool
from tempfile import NamedTemporaryFile
from pathlib import Path

from openff.qcsubmit.results import (
    OptimizationResultCollection, TorsionDriveResultCollection,
)
from openff.qcsubmit.results.filters import (
    ConformerRMSDFilter, ConnectivityFilter,
    ElementFilter,
    HydrogenBondFilter,
    RecordStatusFilter,
    ResultRecordFilter,
    UnperceivableStereoFilter
)
from openff.toolkit.typing.engines.smirnoff import ForceField
from openff.toolkit.utils.exceptions import UnassignedMoleculeChargeException
from qcportal import FractalClient
from qcportal.models import TorsionDriveRecord
from qcportal.models.records import RecordStatusEnum
from tqdm import tqdm


from openff.bespokefit.optimizers.forcebalance import ForceBalanceInputFactory
from openff.bespokefit.schema.fitting import OptimizationSchema, OptimizationStageSchema
from openff.bespokefit.schema.optimizers import ForceBalanceSchema
from openff.bespokefit.schema.smirnoff import AngleHyperparameters, AngleSMIRKS, BondHyperparameters, \
    BondSMIRKS, ProperTorsionHyperparameters, ProperTorsionSMIRKS
from openff.bespokefit.schema.targets import (
    OptGeoTargetSchema,
    TorsionProfileTargetSchema, )

In [None]:
# Configure the root logger so that messages at INFO level and above are emitted.
logging.basicConfig(level=logging.INFO)

# Some utility functions to check for charge assignment failures and to select parameters for optimization

In [None]:
class ChargeCheckFilter(ResultRecordFilter):
    """Result filter that keeps only records whose molecule can be assigned
    ``am1bccelf10`` partial charges."""

    def _filter_function(self, result, record, molecule) -> bool:
        """Return ``True`` if ``molecule`` can be charged with ``am1bccelf10``.

        Some molecules fail charging with am1bccelf10, either because of
        missing BCCs or because conformer generation fails. This cannot always
        be detected from the CMILES stored in the record metadata, so the
        molecule is written to an SDF file, read back, and the re-read
        molecule is charged.

        Parameters
        ----------
        result, record
            Unused here; part of the filter callback signature.
        molecule
            The molecule associated with the record. It is not modified; a
            single-conformer copy is used for the check.

        Returns
        -------
        bool
            ``False`` when charging raises
            ``UnassignedMoleculeChargeException``, ``True`` otherwise.
        """
        # Work on a copy that only carries the first conformer so the caller's
        # molecule is left untouched.
        single_conformer = copy.deepcopy(molecule)
        single_conformer._conformers = [single_conformer.conformers[0]]

        try:
            with NamedTemporaryFile(suffix=".sdf") as file:
                single_conformer.to_file(file.name, "SDF")
                # ``from_file`` is a constructor: it returns the parsed
                # molecule instead of updating the instance it is called on,
                # so its return value must be kept for the round trip to have
                # any effect on what gets charged.
                reloaded = type(single_conformer).from_file(file.name)

            # ``from_file`` returns a list when the file holds several
            # molecules; only one was written, but stay defensive.
            if isinstance(reloaded, list):
                reloaded = reloaded[0]

            reloaded.assign_partial_charges(partial_charge_method="am1bccelf10")

        except UnassignedMoleculeChargeException:
            return False

        return True


def label_ids(record_and_molecule, force_field, parameter_types):
    """Return the ids of the force field parameters applied to one molecule.

    Parameters
    ----------
    record_and_molecule
        A ``(record, molecule)`` pair.
    force_field
        The force field used to label the molecule.
    parameter_types
        Names of the parameter handlers (keys of the labelling result) whose
        parameter ids should be collected.

    Returns
    -------
    list
        The unique parameter ids that matched, in no particular order. For a
        ``TorsionDriveRecord`` only parameters whose two central atoms equal
        the two central atoms of the record's first driven dihedral are kept.
    """
    record, molecule = record_and_molecule

    # Labelling a single-molecule topology yields a one-element list.
    labels = force_field.label_molecules(molecule.to_topology())[0]

    matched_ids = {
        parameter.id
        for parameter_type in parameter_types
        for atom_indices, parameter in labels[parameter_type].items()
        # Non-torsion-drive records keep every match; torsion drives keep only
        # matches centred on the driven bond (compared irrespective of order).
        if not isinstance(record, TorsionDriveRecord)
        or set(atom_indices[1:3]) == set(record.keywords.dihedrals[0][1:3])
    }

    return list(matched_ids)


def select_parameters(
    training_set,
    parameter_types,
    output_path,
    force_field,
    min_coverage=5,
    n_processes=4,
):
    """Select the parameters exercised often enough by a training set.

    Every record of ``training_set`` is labelled with ``force_field`` (in a
    process pool) and the number of records exercising each parameter id is
    counted. The SMIRKS of every parameter seen in at least ``min_coverage``
    records is written to ``output_path`` as JSON, keyed by parameter type.

    Parameters
    ----------
    training_set
        Result collection providing ``to_records()`` and ``n_results``.
    parameter_types
        Names of the parameter handlers to select parameters from.
    output_path
        Path of the JSON file to write.
    force_field
        The force field used for labelling and for looking up SMIRKS.
    min_coverage
        Minimum number of records that must exercise a parameter for it to be
        selected. Defaults to 5.
    n_processes
        Number of worker processes used for labelling. Defaults to 4.

    Notes
    -----
    Every requested parameter type is present in the output, with an empty
    list when nothing qualified, so consumers can index the JSON by parameter
    type without hitting a ``KeyError``.
    """
    labeller = functools.partial(
        label_ids, force_field=force_field, parameter_types=parameter_types
    )

    # Count how many training records exercise each parameter id.
    coverage = defaultdict(int)

    with Pool(n_processes) as pool:

        for parameter_ids in tqdm(
            pool.imap(labeller, training_set.to_records()),
            total=training_set.n_results,
        ):

            for parameter_id in parameter_ids:
                coverage[parameter_id] += 1

    well_covered_ids = [
        parameter_id
        for parameter_id, count in coverage.items()
        if count >= min_coverage
    ]

    # Start from an explicit entry per requested type so that types without any
    # qualifying parameter are still written out (as empty lists).
    selected_parameters = {parameter_type: [] for parameter_type in parameter_types}

    for parameter_type in parameter_types:

        # The handler does not depend on the parameter id; look it up once.
        handler = force_field.get_parameter_handler(parameter_type)

        for parameter_id in well_covered_ids:

            found_parameters = handler.get_parameter({"id": parameter_id})

            # An id counted for another parameter type has no match here.
            if found_parameters:
                selected_parameters[parameter_type].append(found_parameters[0].smirks)

    # Only touch the output file once the selection has succeeded, so a failure
    # above does not leave an empty or truncated file behind.
    with open(output_path, "w") as file:
        json.dump(selected_parameters, file)

# Create a directory to store the dataset information, and read in the force field to optimize

In [None]:
# Ensure the output directory for the curated data sets exists
# (no error is raised if it is already there).
Path("./data-sets").mkdir(parents=True, exist_ok=True)

# Force field used below to label the training molecules when selecting the
# parameters to optimize. Plugin loading is enabled and cosmetic attributes in
# the file are allowed.
initial_forcefield = ForceField('force-field_7.offxml', load_plugins=True, allow_cosmetic_attributes=True)

# Common filters to apply on the downloaded QM data (for both torsion scans and optimized geometries)

In [None]:
# Filters shared by the torsion-drive and the optimization data sets. They are
# applied in this order, after the data-set-specific filter of each cell.
default_filters = [
    # Keep only records whose status is "complete".
    RecordStatusFilter(status=RecordStatusEnum.complete),
    ConnectivityFilter(tolerance=1.2),
    UnperceivableStereoFilter(),
    ElementFilter(
        # The elements being optimized in the DEXP force fields. The remaining
        # elements lack physical property data for vdW training.
        allowed_elements=["H", "C", "N", "O", "Cl", "Br"]
    ),
    # Custom filter defined above: drops molecules that cannot be assigned
    # am1bccelf10 partial charges.
    ChargeCheckFilter(),
]

# Download torsion scan data from QCArchive

In [None]:
# Pull down the torsion drive data sets used for training. No filtering happens
# here beyond removing a few explicitly listed record ids; the quality filters
# are applied in a later cell.
client = FractalClient()

torsion_set = TorsionDriveResultCollection.from_server(
    client=client,
    datasets=[
        "OpenFF Gen 2 Torsion Set 1 Roche 2",
        "OpenFF Gen 2 Torsion Set 2 Coverage 2",
        "OpenFF Gen 2 Torsion Set 3 Pfizer Discrepancy 2",
        "OpenFF Gen 2 Torsion Set 4 eMolecules Discrepancy 2",
        "OpenFF Gen 2 Torsion Set 5 Bayer 2",
        "OpenFF Gen 2 Torsion Set 6 supplemental 2",
        "OpenFF Amide Torsion Set v1.0",
    ],
    spec_name="default",
)

# Record ids with known inconsistent optimization histories, or which cause
# failures in ForceBalance.
excluded_record_ids = {"6098580", "2703504", "2703505", "18045478"}

# Entries are keyed by the address of the server they were retrieved from.
server_address = client.address
torsion_set.entries[server_address] = [
    entry
    for entry in torsion_set.entries[server_address]
    if entry.record_id not in excluded_record_ids
]

In [None]:
# Keep a JSON snapshot of the torsion drive collection as it is before the
# filters are applied.
with open("data-sets/full-td-set-before-filtering.json", "w") as file:
    file.write(torsion_set.json())

# Apply the filters to the downloaded data and store the remaining records in JSON format. Also select the torsion parameters to optimize, based on the molecules present in the training data.

In [None]:
# Apply the hydrogen-bond filter (configured with the "baker-hubbard" method)
# followed by the filters shared by both data sets.
torsion_set = torsion_set.filter(
    HydrogenBondFilter(method="baker-hubbard"), 
    *default_filters
)

# Persist the filtered collection; it is read back when building the
# ForceBalance inputs further down.
with open("data-sets/reduced-set-td-set.json", "w") as file:
    file.write(torsion_set.json())

# Write the SMIRKS of the proper torsion parameters to train against this set.
select_parameters(
    torsion_set,
    parameter_types=["ProperTorsions"],
    output_path="data-sets/reduced-set-proper-torsions-params-smirks.json",
    force_field=initial_forcefield
)

# Download optimized geometries data from QCArchive

In [None]:
# Pull down the optimized-geometry data sets used for training.
# NOTE(review): a new FractalClient is created here instead of reusing `client`
# from the torsion cell - presumably so this cell can run on its own; confirm.
optimization_set = OptimizationResultCollection.from_server(
    client=FractalClient(),
    datasets=[
        "OpenFF Gen 2 Opt Set 1 Roche",
        "OpenFF Gen 2 Opt Set 2 Coverage",
        "OpenFF Gen 2 Opt Set 3 Pfizer Discrepancy",
        "OpenFF Gen 2 Opt Set 4 eMolecules Discrepancy",
        "OpenFF Gen 2 Opt Set 5 Bayer",
    ],
    spec_name="default",
)

# Apply the filters to the downloaded optimized-geometry data, store the remaining record ids in a JSON file, and select the bond and angle parameters to optimize.

In [None]:
# Apply the conformer RMSD filter (configured with max_conformers=10) followed
# by the filters shared by both data sets.
optimization_set = optimization_set.filter(
    ConformerRMSDFilter(max_conformers=10),
    *default_filters,

)

# Persist the filtered collection; it is read back when building the
# ForceBalance inputs further down.
with open("data-sets/reduced-set-opt-set.json", "w") as file:
    file.write(optimization_set.json())

In [None]:
# Select the angle and the bond parameters to train against the optimization
# set. Each parameter type is processed in its own call and written to its own
# JSON file (angles first, then bonds).
for parameter_type, smirks_path in (
    ("Angles", "data-sets/reduced-set-angles-params-smirks.json"),
    ("Bonds", "data-sets/reduced-set-bonds-params-smirks.json"),
):
    select_parameters(
        optimization_set,
        parameter_types=[parameter_type],
        output_path=smirks_path,
        force_field=initial_forcefield,
    )

# Creating ForceBalance inputs

# Reading back the record ids from json files

In [None]:
# Ensure the directory that will hold the serialized optimization schema exists.
Path("./schemas/optimizations/").mkdir(parents=True, exist_ok=True)

# Rebuild the training collections from the JSON files written by the
# filtering cells above, rather than from the in-memory objects.
torsion_training_set = TorsionDriveResultCollection.parse_file(
    "data-sets/reduced-set-td-set.json"
)
optimization_training_set = OptimizationResultCollection.parse_file(
    "data-sets/reduced-set-opt-set.json"
)

# Passing initial forcefield and parameters to optimize

In [None]:
# Load the starting force field, which provides the initial parameter values,
# and write a working copy of it; the optimization schema built below points at
# that copy.
custom_force_field = 'force-field.offxml'
initial_force_field = ForceField('force-field_7.offxml', load_plugins=True, allow_cosmetic_attributes=True)
initial_force_field.to_file(custom_force_field)

# Read back the SMIRKS patterns that were selected for training.
with open("data-sets/reduced-set-angles-params-smirks.json") as file:
    angle_smirks = json.load(file)
with open("data-sets/reduced-set-bonds-params-smirks.json") as file:
    bond_smirks = json.load(file)
with open("data-sets/reduced-set-proper-torsions-params-smirks.json") as file:
    torsion_smirks = json.load(file)

# Angles are trained on their force constant and equilibrium angle.
angle_targets = [
    AngleSMIRKS(smirks=smirks, attributes={"k", "angle"})
    for smirks in angle_smirks["Angles"]
]

# Bonds are trained on their force constant and equilibrium length.
bond_targets = [
    BondSMIRKS(smirks=smirks, attributes={"k", "length"})
    for smirks in bond_smirks["Bonds"]
]

# Proper torsions are trained on one attribute (k1, k2, ...) per entry in the
# parameter's `k` list in the starting force field.
torsion_handler = initial_force_field.get_parameter_handler("ProperTorsions")
torsion_targets = [
    ProperTorsionSMIRKS(
        smirks=smirks,
        attributes={
            f"k{i + 1}"
            for i in range(len(torsion_handler.parameters[smirks].k))
        },
    )
    for smirks in torsion_smirks["ProperTorsions"]
]

# Same ordering as the selection files: angles, bonds, then proper torsions.
target_parameters = [*angle_targets, *bond_targets, *torsion_targets]

# Define the full schema for the optimization, which includes the hyperparameters for optimization, weights of the targets, and target specific hyperparameters, calculation setup for remote workers, and generate the inputs required for a ForceBalance run.


In [None]:
# Optimizer / ForceBalance specific settings.
forcebalance_settings = ForceBalanceSchema(
    max_iterations=50,
    step_convergence_threshold=0.01,
    objective_convergence_threshold=0.1,
    gradient_convergence_threshold=0.1,
    n_criteria=2,
    initial_trust_radius=-1.0,
    extras={"wq_port": "55145", "asynchronous": "True"},
)

# Torsion profile targets to fit against.
torsion_profile_target = TorsionProfileTargetSchema(
    reference_data=torsion_training_set,
    energy_denominator=1.0,
    energy_cutoff=5.0,
    extras={"remote": "1"},
)

# Optimized geometry targets to fit against.
optimized_geometry_target = OptGeoTargetSchema(
    reference_data=optimization_training_set,
    weight=0.1,
    extras={"batch_size": 1, "remote": "1"},
)

# Priors placed on the parameters being refit.
parameter_priors = [
    AngleHyperparameters(priors={'k': 100, 'angle': 20}),
    BondHyperparameters(priors={'k': 100, 'length': 0.1}),
    ProperTorsionHyperparameters(priors={'k': 15}),
]

# Single-stage optimization starting from the working copy of the force field.
optimization_schema = OptimizationSchema(
    id="reduced-set-targets",
    initial_force_field=os.path.abspath(custom_force_field),
    stages=[
        OptimizationStageSchema(
            optimizer=forcebalance_settings,
            targets=[torsion_profile_target, optimized_geometry_target],
            parameters=target_parameters,
            parameter_hyperparameters=parameter_priors,
        )
    ],
)

# Serialize the schema next to the other optimization schemas.
schema_path = os.path.join(
    "./schemas", "optimizations", f"{optimization_schema.id}.json"
)
with open(schema_path, "w") as file:
    file.write(optimization_schema.json())

# Generate the ForceBalance inputs in a directory named after the schema id.
ForceBalanceInputFactory.generate(
    optimization_schema.id,
    optimization_schema.stages[0],
    ForceField(optimization_schema.initial_force_field, load_plugins=True, allow_cosmetic_attributes=True)
)

# Sample ForceBalance run with a single opt-geo target and a single torsion-profile target. `wq_port` is commented out so that no remote workers are needed, and the run uses `maxstep 0`.

An actual ForceBalance run is compute-intensive, and it is advisable to run it on an HPC cluster. Here is a short demonstration of a fitting run with two targets.

In [None]:
# IPython magic: change the notebook's working directory to `toy-fb-run`.
%cd toy-fb-run

In [None]:
# IPython shell escape: run the ForceBalance command-line program on `optimize.in`.
!ForceBalance optimize.in