# Combined liquid property dataset

## Filenames

In [49]:
import os

# BibTeX file from which the source DOIs are read.
BIBFILE = "data/00_physprop-benchmark-sources.bib"
# Raw dataset cached as JSON (loaded below) and exported as a CSV table.
RAW_JSON = "data/01_raw_thermoml_dataset_from_sources.json"
RAW_CSV = "data/01_raw_thermoml_dataset_from_sources.csv"
# Directory and filename template for the single-row CSV files; the
# template is formatted with an integer, zero-padded to five digits.
ENTRY_DIRNAME = "data/01_raw_data_entries"
ENTRY_NAME = os.path.join(ENTRY_DIRNAME, "entry_{:05d}.csv")

## Experimental Data

### Constants

In [2]:
# SMILES strings of 30 molecules.
# NOTE(review): the name suggests these are held out as a test set, but
# nothing in this section uses the list -- confirm where it is consumed.
SMILES_TEST_SET = [
    'CC#N',
    'c1ccc(cc1)Cl',
    'c1ccsc1',
    'C(=O)O',
    'C(Cl)(Cl)Cl',
    'c1cc(cc(c1)Cl)Cl',
    'c1cc(cc(c1)Br)Br',
    'CCI',
    'C1CCNCC1',
    'CCOC(OCC)OCC',
    'c1ccc(cc1)F',
    'C(Br)Br',
    'C(Cl)Br',
    'CCOC(=O)CC(=O)C',
    'CC(=O)CCC(=O)O',
    'C(C(C(F)(F)F)(F)F)O',
    'COc1cccc(c1)Br',
    'C=CC#N',
    'CNCCCN',
    'CCn1ccnc1',
    'CCCI',
    'CCCCCCCCCCCCS',
    'COc1ccccc1O',
    'c1cc(cc(c1)I)F',
    'c1cc(ccc1F)I',
    'c1ccc2c(c1)ncs2',
    'Cc1ccc2c(c1)OCO2',
    'c1ccc(cc1)SCN=[N+]=[N-]',
    'c1cscc1C#N',
    'c1cc(sc1)C#N'
]


In [3]:
from openff.evaluator.datasets.thermoml import ThermoMLDataSet
import pandas as pd

### Parsley benchmark set

The DOIs are in `data/00_physprop-benchmark-sources.bib`.

In [4]:
import re

def read_dois_from_bib(bibfile=BIBFILE):
    """Extract the DOIs listed in a BibTeX file.

    Parameters
    ----------
    bibfile : str
        Path of the BibTeX file to scan. Defaults to ``BIBFILE``.

    Returns
    -------
    list of str
        The value of every ``doi = {...}`` field, in order of appearance.
        Duplicates are kept.
    """
    with open(bibfile, "r") as f:
        contents = f.read()
    # Raw string: "\s" and "\{" are not valid escapes in a normal string
    # literal and trigger a warning on newer Python versions.
    # The value may be any run of characters without braces or whitespace:
    # DOI suffixes can contain "-", "_", "(", ")" and other punctuation,
    # which an alphanumeric-only class would silently skip.
    dois = re.findall(r"doi\s*=\s*\{([^{}\s]+)\}", contents)
    return dois

I downloaded the data for these DOIs using [a modified version of evaluator that skipped un-parseable molecules](https://github.com/openforcefield/openff-evaluator/issues/364). To avoid repeating that download, the resulting dataset is saved to JSON once and loaded from the JSON file afterwards.

In [10]:
# Uncomment the three lines below to re-download the data from the DOIs in
# the bib file and overwrite the cached JSON copy.
# dois = read_dois_from_bib()
# dataset = ThermoMLDataSet.from_doi(*dois)
# dataset.json(RAW_JSON)

# Load the dataset from the cached JSON file.
dataset = ThermoMLDataSet.from_json(RAW_JSON)

In [45]:
# Convert the dataset to a pandas DataFrame and save it as a CSV file.
df = dataset.to_pandas()
df.to_csv(RAW_CSV)

In [47]:
# Number of rows in the DataFrame.
len(df)

16925

In [57]:
import os

# `DataFrame.to_csv` does not create missing directories, so make sure the
# output directory exists before writing into it.
os.makedirs(ENTRY_DIRNAME, exist_ok=True)

# Write every row to its own CSV file. Files are numbered from 1, and the
# one-row slice (rather than a Series) keeps the header and index layout of
# the full table.
for entry_number in range(1, len(df) + 1):
    filename = ENTRY_NAME.format(entry_number)
    df.iloc[entry_number - 1:entry_number].to_csv(filename)

### RESP2

> The program ForceBalance<sup>41</sup> was used to optimize the LJ parameters for RESP1 charges and for RESP2 with values of δ from 0 to 1 in steps of 0.05. To simplify and speed the optimizations, we limited the number of different LJ types to five: C, N, and O, polar H, and apolar H. Polar hydrogens were defined by the following extended SMARTS pattern: `[#1:1]-[#7,#8]`. Because each LJ type has two parameters, $r_{\text{min-half}}$ and $\epsilon$, the optimizations were done in a ten-dimensional parameter space. Starting parameters were drawn from SMIRNOFF v1.0.7 (Supplementary Notes 1). Training was based on measured HOV [heat of vaporization] and pure liquid densities of 15 molecules (Fig. 8a) with a variety of functional groups. The ForceBalance procedure was terminated when the step size for the mathematical parameters fell <0.01 or the objective function changed <1.0 between two iterations; further details are provided below in this section. The resulting parameters were tested against measured HOVs and densities for a separate set of 53 molecules (Fig. 8b), as well as the measured dielectric constants and HFE [hydration free energy] of a subset of these compounds. All experimental values for HOV were taken from ThermoML<sup>76</sup>. Densities were taken from ThermoML when available, and otherwise from PubChem<sup>77</sup>. Dielectric constants were taken from multiple sources<sup>78</sup>. HFE were taken from the FreeSolv database<sup>79</sup>. All values are summarized in Supplementary Tables 2 and 3.

## Evaluator

### Set up schema

In [28]:
from openff.evaluator.client import RequestOptions
from openff.evaluator import unit
from openff.evaluator.properties import (Density,
                                         EnthalpyOfVaporization,
                                         ExcessMolarVolume,
                                         DielectricConstant,
                                         EnthalpyOfMixing)

In [34]:
# Absolute tolerance handed to the default simulation schema of each
# property type.
ABSOLUTE_TOLERANCES = {
    Density: 0.45 * unit.kilogram * unit.meter ** -3,
    DielectricConstant: 1.5 * unit.dimensionless,
    EnthalpyOfVaporization: 0.65 * unit.kilojoule / unit.mole,
    EnthalpyOfMixing: 0.02 * unit.kilojoule / unit.mole,
    ExcessMolarVolume: 2e-8 * unit.meter ** 3 / unit.mole,
}

# Property types to register, in registration order.
PROPERTIES = [
    Density,
    EnthalpyOfVaporization,
    ExcessMolarVolume,
    DielectricConstant,
    EnthalpyOfMixing,
]

# Register one simulation schema per property type, each built from the
# property's default schema with its tolerance from the table above.
estimation_options = RequestOptions()
for prop in PROPERTIES:
    tolerance = ABSOLUTE_TOLERANCES[prop]
    schema = prop.default_simulation_schema(absolute_tolerance=tolerance)
    estimation_options.add_schema("SimulationLayer", prop, schema)

In [35]:
from openff.evaluator.workflow import Workflow

In [42]:
# List the attributes and methods available on the loaded dataset object.
dir(dataset)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_from_file',
 '_from_url',
 '_properties',
 'add_properties',
 'from_doi',
 'from_file',
 'from_json',
 'from_pandas',
 'from_url',
 'from_xml',
 'json',
 'merge',
 'parse_json',
 'properties',
 'properties_by_substance',
 'properties_by_type',
 'property_types',
 'registered_properties',
 'sources',
 'substances',
 'to_pandas',
 'validate']

In [39]:
# Show the registered schemas, keyed by property name and then by layer.
estimation_options.calculation_schemas

{'Density': {'SimulationLayer': <openff.evaluator.layers.simulation.SimulationSchema at 0x166cd6970>},
 'EnthalpyOfVaporization': {'SimulationLayer': <openff.evaluator.layers.simulation.SimulationSchema at 0x166910700>},
 'ExcessMolarVolume': {'SimulationLayer': <openff.evaluator.layers.simulation.SimulationSchema at 0x166ec9fa0>},
 'DielectricConstant': {'SimulationLayer': <openff.evaluator.layers.simulation.SimulationSchema at 0x166957f40>},
 'EnthalpyOfMixing': {'SimulationLayer': <openff.evaluator.layers.simulation.SimulationSchema at 0x166d90100>}}

In [21]:
# Show the calculation layers set on the request options.
# NOTE(review): the output lists 'ReweightingLayer', but only
# 'SimulationLayer' schemas were registered above -- confirm this is intended.
estimation_options.calculation_layers

['ReweightingLayer', 'SimulationLayer']