# Augment training/validation data

In [1]:
import pandas as pd

## Convert stored data to comparable hashes

In [2]:
from eveq.storage.storage import LocalStoredEquilibrationData



In [3]:
# storage = LocalStoredEquilibrationData.from_localfilestorage(
#     lfs_root_directory="/Volumes/Nobbsy/combined_equilibration_data/stored_data",
#     new_root_directory="../data/stored_data"
# )

```u_1379389221493640985 u_1379389221493640985 NC1CCCCC1{solv}{x=1.000000} 2.830299353583582```

In [4]:
# Open the local equilibration-data store rooted at ../data/stored_data and
# print how many objects it currently holds in its retrieval cache.
# NOTE(review): `_cached_retrieved_objects` is a private attribute; presumably
# it is populated when the store is constructed -- confirm, and prefer a public
# accessor if one exists.
storage = LocalStoredEquilibrationData("../data/stored_data")
print(len(storage._cached_retrieved_objects))

4583


In [5]:
# Size of the store's retrieval cache, shown as the cell's output value.
len(storage._cached_retrieved_objects)

4583

## Add new amine data

In [6]:
from openff.evaluator.datasets.datasets import PhysicalPropertyDataSet
from openff.evaluator.datasets.curation.components import filtering, selection, thermoml
from openff.evaluator.datasets.curation.components.selection import State, TargetState
from openff.evaluator.datasets.curation.workflow import (
    CurationWorkflow,
    CurationWorkflowSchema,
)

from openff.evaluator.utils.checkmol import ChemicalEnvironment, analyse_functional_groups
from eveq.storage.storage import PropertyBox

In [6]:
# Load the existing intermediate filtered data set; the first CSV column is the
# index and the "Id" column is converted to strings after parsing.
# NOTE(review): this is an absolute, machine-specific path -- consider deriving
# it from a configurable data directory.
df = pd.read_csv(
    "/Users/lily/pydev/old-ash-sage/01_download-data/physprop/intermediate/output/initial-filtered.csv",
    index_col=0,
).astype({"Id": str})

In [8]:
# # get amines
# curation_schema = CurationWorkflowSchema(
#     component_schemas=[
        
#         filtering.FilterByEnvironmentsSchema(environments=[
#             ChemicalEnvironment.PrimaryAmine,
#             ChemicalEnvironment.SecondaryAmine,
#             ChemicalEnvironment.TertiaryAmine,
#         ]),
#     ]
# )

# amine_properties = CurationWorkflow.apply(df, curation_schema)

INFO:openff.evaluator.datasets.curation.workflow:Applying FilterByEnvironments
INFO:openff.evaluator.datasets.curation.components.components:9082 data points were removed after applying the FilterByEnvironments component.
INFO:openff.evaluator.datasets.curation.workflow:FilterByEnvironments applied


In [10]:
# the filter above is too strict -- all components have to match amines.
# Here a data point is kept as soon as ANY of its components contains a
# functional group whose name includes "Amine".
cols = [x for x in df.columns if x.startswith("Component")]
amine_property_rows = []
# Components that checkmol could not analyse, kept for later inspection.
unparsed_components = set()
for _, row in df.iterrows():
    # Ignore component columns that are empty (NaN) for this row.
    components = [row[col] for col in cols if not pd.isna(row[col])]

    for comp in components:
        environments = analyse_functional_groups(comp)
        if environments is None:
            # analyse_functional_groups returns None when checkmol fails;
            # iterating over it would raise TypeError and abort the whole scan.
            unparsed_components.add(comp)
            continue

        groups = [gp.value for gp in environments]
        if any("Amine" in x for x in groups):
            # Store a plain dict copy of the row; one match is enough.
            amine_property_rows.append(dict(row))
            break

amine_properties = pd.DataFrame(amine_property_rows)

if unparsed_components:
    # Make skipped components visible rather than silently dropping them.
    print(
        f"checkmol could not analyse {len(unparsed_components)} component(s); "
        f"they were not considered: {sorted(unparsed_components)}"
    )

In [11]:
# Report how many data points were kept by the amine selection.
print(f"{len(amine_properties)} amine properties found")

1189 amine properties found


In [12]:
# Convert the selected rows from a pandas frame into an evaluator data set.
amine_dataset = PhysicalPropertyDataSet.from_pandas(amine_properties)

In [13]:
# Spot check: display the substance of the last property in the data set.
amine_dataset.properties[-1].substance

<Substance CCCCOCCO{solv}{x=0.214079}|CNCCO{solv}{x=0.785921}>

In [14]:
# Collect every amine property for which the store does not yet contain all of
# the required property boxes, then report how many there are.
not_equilibrated = [
    candidate
    for candidate in amine_dataset.properties
    if not storage.contains_all_property_boxes(candidate)
]

print(f"{len(not_equilibrated)} properties not equilibrated")

373 properties not equilibrated


In [19]:
import os

# Persist the properties that still need equilibration, both as an evaluator
# JSON data set and as a CSV table.

# Create the output directory first: open(..., "w") and to_csv both fail if
# "amine-equilibration" does not exist yet (e.g. on a fresh checkout).
os.makedirs("amine-equilibration", exist_ok=True)

non_equilibrated_amines = PhysicalPropertyDataSet()
non_equilibrated_amines.add_properties(*not_equilibrated)
with open("amine-equilibration/dataset.json", "w") as f:
    f.write(non_equilibrated_amines.json())

non_equilibrated_amines_df = non_equilibrated_amines.to_pandas()
non_equilibrated_amines_df.to_csv("amine-equilibration/dataset.csv")

In [7]:
# Reload the data set of not-yet-equilibrated amine properties from the JSON
# file, so the cells below can run without repeating the selection.
non_equilibrated_amines = PhysicalPropertyDataSet.from_json("amine-equilibration/dataset.json")

In [8]:
# Unique property boxes (1000 molecules each) generated from the properties
# that still need equilibration; the set removes duplicate boxes.
boxes = {
    property_box
    for physical_property in non_equilibrated_amines.properties
    for property_box in PropertyBox.from_physical_property(
        physical_property, n_molecules=1000
    )
}

In [9]:
# Pick one box to inspect. `boxes` is a set, so index 0 of its list form is
# simply whichever element iteration yields first, not a meaningful "first".
box = list(boxes)[0]
box

PropertyBox(substance=CN(CCO)CCO{solv}{x=1.000000}, n_molecules=1000, thermodynamic_state=T=298.15 K P=101.0 kPa, phase=Liquid)