In [1]:
import qcportal as ptl
import tqdm
import pathlib
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.dataset as ds
from offqcdata.download import download_optimization, get_client, download_torsiondrive

In [2]:
client = get_client("_cache")

In [3]:
IGNORE_IODINE = [
    "OpenFF Discrepancy Benchmark 1",
    "OpenFF Gen 2 Opt Set 2 Coverage",
    "OpenFF Gen 2 Opt Set 4 eMolecules Discrepancy",
    "SMIRNOFF Coverage Set 1",
    "OpenFF Ehrman Informative Optimization v0.2",
    "FDA optimization dataset 1",
    "Kinase Inhibitors: WBO Distributions",

    # ---
    "OpenFF Gen 2 Torsion Set 2 Coverage 2",
    "OpenFF Gen 2 Torsion Set 4 eMolecules Discrepancy 2",
]

In [4]:
OPTIMIZATION_WHITELISTS = [
    "OpenFF Optimization Set 1",
    "SMIRNOFF Coverage Set 1",
    "OpenFF VEHICLe Set 1",
    "OpenFF Discrepancy Benchmark 1",
    "OpenFF Ehrman Informative Optimization v0.2",
    "Pfizer discrepancy optimization dataset 1",
    "FDA optimization dataset 1",
    "Kinase Inhibitors: WBO Distributions",
    "OpenFF Gen 2 Opt Set 1 Roche",
    "OpenFF Gen 2 Opt Set 2 Coverage",
    "OpenFF Gen 2 Opt Set 3 Pfizer Discrepancy",
    "OpenFF Gen 2 Opt Set 4 eMolecules Discrepancy",
    "OpenFF Gen 2 Opt Set 5 Bayer",
    "OpenFF Sandbox CHO PhAlkEthOH v1.0",
    "OpenFF Industry Benchmark Season 1 v1.1",
    "OpenFF Gen2 Optimization Dataset Protomers v1.0",
    "OpenFF Protein Capped 1-mers 3-mers Optimization Dataset v1.0",
    "OpenFF Iodine Chemistry Optimization Dataset v1.0",
    # "OpenFF multi-Br ESP Fragment Conformers v1.0",
    "XtalPi Shared Fragments OptimizationDataset v1.0",
    "XtalPi 20-percent Fragments OptimizationDataset v1.0",
    "OpenFF Torsion Benchmark Supplement v1.0",
    "OpenFF Torsion Multiplicity Optimization Training Coverage Supplement v1.0",
    "OpenFF Torsion Multiplicity Optimization Benchmarking Coverage Supplement v1.0",
    "OpenFF Iodine Fragment Opt v1.0",
    "OpenFF Sulfur Optimization Training Coverage Supplement v1.0",
    "OpenFF Sulfur Optimization Benchmarking Coverage Supplement v1.0",
    "OpenFF Lipid Optimization Training Supplement v1.0",
    "OpenFF Lipid Optimization Benchmark Supplement v1.0",
    # "SPICE DES370k Monomers Lowest E Conformer Optimization Dataset v4.0",
    "OpenFF Cresset Additional Coverage Optimizations v4.0",
    "OpenFF Protein PDB 4-mers v4.0"
]

In [12]:
TORSIONDRIVE_WHITELISTS = [
    "OpenFF Group1 Torsions",
    "SMIRNOFF Coverage Torsion Set 1",
    "OpenFF Substituted Phenyl Set 1",
    "Pfizer discrepancy torsion dataset 1",
    "OpenFF Primary Benchmark 1 Torsion Set",
    "OpenFF Gen 2 Torsion Set 2 Coverage 2",
    "OpenFF Gen 2 Torsion Set 3 Pfizer Discrepancy 2",
    "OpenFF Gen 2 Torsion Set 4 eMolecules Discrepancy 2",
    "OpenFF Gen 2 Torsion Set 5 Bayer 2",
    "OpenFF Gen 2 Torsion Set 6 supplemental 2",
    "OpenFF Fragmenter Validation 1.0",
    "OpenFF DANCE 1 eMolecules t142 v1.0",
    "OpenFF Rowley Biaryl v1.0",
    "OpenFF-benchmark-ligand-fragments-v1.0",
    "OpenFF Protein Fragments TorsionDrives v1.0",
    "OpenFF WBO Conjugated Series v1.0",
    "OpenFF Amide Torsion Set v1.0",
    # "OpenFF Aniline Para Opt v1.0",
    "OpenFF Gen3 Torsion Set v1.0",
    "OpenFF Aniline 2D Impropers v1.0",
    "OpenFF-benchmark-ligand-fragments-v2.0",
    "OpenFF multiplicity correction torsion drive data v1.1",
    "OpenFF Protein Capped 3-mer Omega v1.0",
    "XtalPi Shared Fragments TorsiondriveDataset v1.0",
    "OpenFF Torsion Coverage Supplement v1.0",
    "OpenFF RNA Dinucleoside Monophosphate TorsionDrives v1.0",
    "XtalPi 20-percent Fragments TorsiondriveDataset v1.0",
    "OpenFF Torsion Drive Supplement v1.0",
    "OpenFF Torsion Multiplicity Torsion Drive Coverage Supplement v1.0",
    "OpenFF Phosphate Torsion Drives v1.0",
    "OpenFF Alkane Torsion Drives v1.0",
    "OpenFF Cresset Additional Coverage TorsionDrives v4.0"
]

In [9]:
for dsname in tqdm.tqdm(["SPICE DES370k Monomers Lowest E Conformer Optimization Dataset v4.0"]):
    table = download_optimization(client, dsname)
    print(table)
    if dsname in IGNORE_IODINE:
        df = table.to_pandas()
        n_df = len(df)
        mask = np.array(["I" in smi for smi in df.smiles.values])
        print(df[mask].smiles)
        df = pd.DataFrame(df[~mask])
        print(f"Found {len(mask)} in {dsname} -- filtering from {n_df} to {len(df)}")
        table = pa.Table.from_pandas(df)

    pq.write_table(table, f"offqcdata/data/tables/optimization/{dsname}.parquet")

    

100%|█████████████████████████████████████████████| 1/1 [00:02<00:00,  2.23s/it]

pyarrow.Table

----





In [8]:
for dsname in tqdm.tqdm(OPTIMIZATION_WHITELISTS):
    table = download_optimization(client, dsname)
    if dsname in IGNORE_IODINE:
        df = table.to_pandas()
        n_df = len(df)
        mask = np.array(["I" in smi for smi in df.smiles.values])
        print(df[mask].smiles)
        df = pd.DataFrame(df[~mask])
        print(f"Found {len(mask)} in {dsname} -- filtering from {n_df} to {len(df)}")
        table = pa.Table.from_pandas(df)

    pq.write_table(table, f"offqcdata/data/tables/optimization/{dsname}.parquet")

    

  6%|████████▎                                                                                                                            | 2/32 [00:09<02:29,  4.97s/it]

310        Ic1ccc(OC2CC2)cc1
311        Ic1ccc(OC2CC2)cc1
314       [O-]c1ccc([O-])n1I
315       [O-]c1ccc([O-])n1I
320          [O-]c1ccc(O)n1I
321          [O-]c1ccc(O)n1I
322             Oc1ccc(O)n1I
430           O=C1CCC(=O)N1I
431             Oc1ccc(O)n1I
494    Cc1ccc(S(=O)(=O)I)cc1
495    Cc1ccc(S(=O)(=O)I)cc1
512                     ClCI
742     COc1cc(F)c([O-])cc1I
743     COc1cc(F)c([O-])cc1I
744        COc1cc(F)c(O)cc1I
745        COc1cc(F)c(O)cc1I
Name: smiles, dtype: object
Found 1132 in SMIRNOFF Coverage Set 1 -- filtering from 1132 to 1116


 12%|████████████████▋                                                                                                                    | 4/32 [03:07<28:20, 60.72s/it]

1375     O=C(O)c1cc(/C=N/c2ccccc2I)nc2ccccc12
1376     O=C(O)c1cc(/C=N/c2ccccc2I)nc2ccccc12
1377     O=C(O)c1cc(/C=N/c2ccccc2I)nc2ccccc12
1378     O=C(O)c1cc(/C=N/c2ccccc2I)nc2ccccc12
1379     O=C(O)c1cc(/C=N/c2ccccc2I)nc2ccccc12
                         ...                 
19317                     COc1ccnc2ccc(I)cc12
19424                COC(=O)c1cc(C#N)cc(I)c1O
19425                COC(=O)c1cc(C#N)cc(I)c1O
19527                      COC(=O)c1nc(N)sc1I
19528                      COC(=O)c1nc(N)sc1I
Name: smiles, Length: 264, dtype: object
Found 19714 in OpenFF Discrepancy Benchmark 1 -- filtering from 19714 to 19450


 16%|████████████████████▊                                                                                                                | 5/32 [03:23<20:02, 44.53s/it]

684     O=C(Nc1ccc(Cl)cc1Cl)c1cc(I)cc(I)c1O
685     O=C(Nc1ccc(Cl)cc1Cl)c1cc(I)cc(I)c1O
686     O=C(Nc1ccc(Cl)cc1Cl)c1cc(I)cc(I)c1O
687     O=C(Nc1ccc(Cl)cc1Cl)c1cc(I)cc(I)c1O
688     O=C(Nc1ccc(Cl)cc1Cl)c1cc(I)cc(I)c1O
                       ...                 
3340                       COc1ccc(C=O)cc1I
3341                        COc1ccc(Cl)cc1I
3350              COc1cccc(I)c1[N+](=O)[O-]
3354                         COc1cccc(F)c1I
3362                            COc1ccccc1I
Name: smiles, Length: 74, dtype: object
Found 3485 in OpenFF Ehrman Informative Optimization v0.2 -- filtering from 3485 to 3411


 22%|█████████████████████████████                                                                                                        | 7/32 [03:54<12:20, 29.61s/it]

41                              [O-]c1c(I)cc(Cl)c2cccnc12
42                                 Oc1c(I)cc(Cl)c2cccnc12
790     [NH3+][C@@H](Cc1cc(I)c(Oc2cc(I)c([O-])c(I)c2)c...
791     [NH3+][C@@H](Cc1cc(I)c(Oc2cc(I)c([O-])c(I)c2)c...
792     [NH3+][C@@H](Cc1cc(I)c(Oc2cc(I)c([O-])c(I)c2)c...
                              ...                        
6545    COC(=O)[C@H]1[C@@H](c2ccc(I)cc2)C[C@@H]2CC[C@H...
6546    COC(=O)[C@H]1[C@@H](c2ccc(I)cc2)C[C@@H]2CC[C@H...
6547    COC(=O)[C@H]1[C@@H](c2ccc(I)cc2)C[C@@H]2CC[C@H...
6548    COC(=O)[C@H]1[C@@H](c2ccc(I)cc2)C[C@@H]2CC[C@H...
6549    COC(=O)[C@H]1[C@@H](c2ccc(I)cc2)C[C@@H]2CC[C@H...
Name: smiles, Length: 96, dtype: object
Found 6670 in FDA optimization dataset 1 -- filtering from 6670 to 6574


 25%|█████████████████████████████████▎                                                                                                   | 8/32 [04:20<11:26, 28.62s/it]

Series([], Name: smiles, dtype: object)
Found 958 in Kinase Inhibitors: WBO Distributions -- filtering from 958 to 958


 31%|█████████████████████████████████████████▎                                                                                          | 10/32 [04:25<05:27, 14.87s/it]

133                     ClCI
150             Oc1ccc(O)n1I
207    Cc1ccc(S(=O)(=O)I)cc1
245     COc1cc(F)c([O-])cc1I
Name: smiles, dtype: object
Found 373 in OpenFF Gen 2 Opt Set 2 Coverage -- filtering from 373 to 369


 38%|█████████████████████████████████████████████████▌                                                                                  | 12/32 [04:37<03:34, 10.73s/it]

274                   O=[N+]([O-])c1ccc(Br)cc1I
625                O=[N+]([O-])c1cc(I)c(F)cc1Br
639                          O=C(O)c1cccc(I)c1O
640                          O=C(O)c1cccc(I)c1O
641                          O=C(O)c1cccc(I)c1O
683                      Ic1cccc([C@@H]2CCN2)c1
684                      Ic1cccc([C@@H]2CCN2)c1
712        N#C/C(=N/Nc1ccc(I)cc1)c1nnc2n1CCCCC2
713        N#C/C(=N/Nc1ccc(I)cc1)c1nnc2n1CCCCC2
714        N#C/C(=N/Nc1ccc(I)cc1)c1nnc2n1CCCCC2
715        N#C/C(=N/Nc1ccc(I)cc1)c1nnc2n1CCCCC2
716        N#C/C(=N/Nc1ccc(I)cc1)c1nnc2n1CCCCC2
717        N#C/C(=N/Nc1ccc(I)cc1)c1nnc2n1CCCCC2
718        N#C/C(=N/Nc1ccc(I)cc1)c1nnc2n1CCCCC2
719        N#C/C(=N/Nc1ccc(I)cc1)c1nnc2n1CCCCC2
720        N#C/C(=N/Nc1ccc(I)cc1)c1nnc2n1CCCCC2
721        N#C/C(=N/Nc1ccc(I)cc1)c1nnc2n1CCCCC2
722        N#C/C(=N/Nc1ccc(I)cc1)c1nnc2n1CCCCC2
723        N#C/C(=N/Nc1ccc(I)cc1)c1nnc2n1CCCCC2
724        N#C/C(=N/Nc1ccc(I)cc1)c1nnc2n1CCCCC2
725        N#C/C(=N/Nc1ccc(I)cc1)c1nnc2n

 62%|█████████████████████████████████████████████████████████████████████████████████▉                                                 | 20/32 [45:31<24:35, 122.97s/it][08:59:13] Explicit valence for atom # 10 N, 5, is greater than permitted
[08:59:13] Explicit valence for atom # 10 N, 5, is greater than permitted
[08:59:13] Explicit valence for atom # 10 N, 5, is greater than permitted
[08:59:13] Explicit valence for atom # 10 N, 5, is greater than permitted


Invalid SMILES: [C:1]1([H:13])=[C:2]([H:14])[C:10](=[C:9]2[C:3]([H:15])=[C:6]([H:18])[N:11]([H:21])[C:7]([H:19])=[C:4]2[H:16])[C:8]([H:20])=[N+2:12]([H:22])=[C:5]1[H:17]
Invalid SMILES: [C:1]1([H:13])=[C:2]([H:14])[C:10](=[C:9]2[C:3]([H:15])=[C:6]([H:18])[N:11]([H:21])[C:7]([H:19])=[C:4]2[H:16])[C:8]([H:20])=[N+2:12]([H:22])=[C:5]1[H:17]
Invalid SMILES: [C:1]1([H:13])=[C:2]([H:14])[C:10](=[C:9]2[C:3]([H:15])=[C:6]([H:18])[N:11]([H:21])[C:7]([H:19])=[C:4]2[H:16])[C:8]([H:20])=[N+2:12]([H:22])=[C:5]1[H:17]
Invalid SMILES: [C:1]1([H:13])=[C:2]([H:14])[C:10](=[C:9]2[C:3]([H:15])=[C:6]([H:18])[N:11]([H:21])[C:7]([H:19])=[C:4]2[H:16])[C:8]([H:20])=[N+2:12]([H:22])=[C:5]1[H:17]


[09:02:52] Explicit valence for atom # 10 N, 5, is greater than permitted
[09:02:52] Explicit valence for atom # 10 N, 5, is greater than permitted
[09:02:52] Explicit valence for atom # 10 N, 5, is greater than permitted
[09:02:52] Explicit valence for atom # 10 N, 5, is greater than permitted
[09:02:52] Explicit valence for atom # 10 N, 5, is greater than permitted
[09:02:52] Explicit valence for atom # 10 N, 5, is greater than permitted
[09:02:52] Explicit valence for atom # 10 N, 5, is greater than permitted
[09:02:52] Explicit valence for atom # 10 N, 5, is greater than permitted
[09:02:52] Explicit valence for atom # 10 N, 5, is greater than permitted
[09:02:52] Explicit valence for atom # 10 N, 5, is greater than permitted
[09:02:52] Explicit valence for atom # 10 N, 5, is greater than permitted
[09:02:52] Explicit valence for atom # 10 N, 5, is greater than permitted
[09:02:52] Explicit valence for atom # 10 N, 5, is greater than permitted
[09:02:52] Explicit valence for atom #

Invalid SMILES: [c:1]1([H:16])[c:2]([H:17])[c:4]([H:19])[c:9]([C:6]([N:11]=[C:8]([N:12]=[C:10]2[C:7]([H:23])=[N+2:13]([H:25])=[N:14][O:15]2)[H:24])([H:21])[H:22])[c:5]([H:20])[c:3]1[H:18]
Invalid SMILES: [c:1]1([H:16])[c:2]([H:17])[c:4]([H:19])[c:9]([C:6]([N:11]=[C:8]([N:12]=[C:10]2[C:7]([H:23])=[N+2:13]([H:25])=[N:14][O:15]2)[H:24])([H:21])[H:22])[c:5]([H:20])[c:3]1[H:18]
Invalid SMILES: [c:1]1([H:16])[c:2]([H:17])[c:4]([H:19])[c:9]([C:6]([N:11]=[C:8]([N:12]=[C:10]2[C:7]([H:23])=[N+2:13]([H:25])=[N:14][O:15]2)[H:24])([H:21])[H:22])[c:5]([H:20])[c:3]1[H:18]
Invalid SMILES: [c:1]1([H:16])[c:2]([H:17])[c:4]([H:19])[c:9]([C:6]([N:11]=[C:8]([N:12]=[C:10]2[C:7]([H:23])=[N+2:13]([H:25])=[N:14][O:15]2)[H:24])([H:21])[H:22])[c:5]([H:20])[c:3]1[H:18]
Invalid SMILES: [c:1]1([H:16])[c:2]([H:17])[c:4]([H:19])[c:9]([C:6]([N:11]=[C:8]([N:12]=[C:10]2[C:7]([H:23])=[N+2:13]([H:25])=[N:14][O:15]2)[H:24])([H:21])[H:22])[c:5]([H:20])[c:3]1[H:18]
Invalid SMILES: [c:1]1([H:16])[c:2]([H:17])[c:4]([H:19])[c:9

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [1:03:19<00:00, 118.73s/it]


In [8]:
for dsname in tqdm.tqdm(OPTIMIZATION_WHITELISTS[16:]):
    table = download_optimization(client, dsname)
    if dsname in IGNORE_IODINE:
        df = table.to_pandas()
        n_df = len(df)
        mask = np.array(["I" in smi for smi in df.smiles.values])
        print(df[mask].smiles)
        df = pd.DataFrame(df[~mask])
        print(f"Found {len(mask)} in {dsname} -- filtering from {n_df} to {len(df)}")
        table = pa.Table.from_pandas(df)

    pq.write_table(table, f"offqcdata/data/tables/optimization/{dsname}.parquet")

    

 25%|█████████████████████████████████▎                                                                                                   | 4/16 [00:19<01:04,  5.36s/it][21:29:53] Explicit valence for atom # 10 N, 5, is greater than permitted
 25%|█████████████████████████████████▎                                                                                                   | 4/16 [01:41<05:04, 25.35s/it]


AttributeError: 'NoneType' object has no attribute 'GetAtoms'

In [11]:
for dsname in tqdm.tqdm(OPTIMIZATION_WHITELISTS[21:]):
    table = download_optimization(client, dsname)
    if dsname in IGNORE_IODINE:
        df = table.to_pandas()
        n_df = len(df)
        mask = np.array(["I" in smi for smi in df.smiles.values])
        print(df[mask].smiles)
        df = pd.DataFrame(df[~mask])
        print(f"Found {len(mask)} in {dsname} -- filtering from {n_df} to {len(df)}")
        table = pa.Table.from_pandas(df)

    pq.write_table(table, f"offqcdata/data/tables/optimization/{dsname}.parquet")

    

  0%|                                                                                                                                             | 0/11 [00:02<?, ?it/s]


PortalRequestError: Request failed: Could not find optimization dataset with name 'OpenFF Torsion Benchmark Supplement Optimization Dataset v1.0' (HTTP status 400)

In [13]:
for dsname in tqdm.tqdm(TORSIONDRIVE_WHITELISTS[23:]):
    print(dsname)
    table = download_torsiondrive(client, dsname)
    if dsname in IGNORE_IODINE:
        df = table.to_pandas()
        n_df = len(df)
        mask = np.array(["I" in smi for smi in df.smiles.values])
        print(df[mask].smiles)
        df = pd.DataFrame(df[~mask])
        print(f"Found {len(mask)} in {dsname} -- filtering from {n_df} to {len(df)}")
        table = pa.Table.from_pandas(df)

    pq.write_table(table, f"offqcdata/data/tables/torsiondrive/{dsname}.parquet")

    

  0%|                                                     | 0/8 [00:00<?, ?it/s]

OpenFF Torsion Coverage Supplement v1.0


 12%|█████▋                                       | 1/8 [00:02<00:15,  2.26s/it]

OpenFF RNA Dinucleoside Monophosphate TorsionDrives v1.0


 25%|███████████▎                                 | 2/8 [00:24<01:25, 14.25s/it]

XtalPi 20-percent Fragments TorsiondriveDataset v1.0


[14:58:44] Explicit valence for atom # 10 N, 5, is greater than permitted


Invalid SMILES: [C:1]1([H:13])=[C:2]([H:14])[C:10](=[C:9]2[C:3]([H:15])=[C:6]([H:18])[N:11]([H:21])[C:7]([H:19])=[C:4]2[H:16])[C:8]([H:20])=[N+2:12]([H:22])=[C:5]1[H:17]


[14:58:52] Explicit valence for atom # 10 N, 5, is greater than permitted


Invalid SMILES: [c:1]1([H:16])[c:2]([H:17])[c:4]([H:19])[c:9]([C:6]([N:11]=[C:8]([N:12]=[C:10]2[C:7]([H:23])=[N+2:13]([H:25])=[N:14][O:15]2)[H:24])([H:21])[H:22])[c:5]([H:20])[c:3]1[H:18]


 38%|████████████████▉                            | 3/8 [02:26<05:15, 63.16s/it]

OpenFF Torsion Drive Supplement v1.0


 50%|██████████████████████▌                      | 4/8 [02:28<02:36, 39.01s/it]

OpenFF Torsion Multiplicity Torsion Drive Coverage Supplement v1.0


 62%|████████████████████████████▏                | 5/8 [02:30<01:17, 25.89s/it]

OpenFF Phosphate Torsion Drives v1.0


 75%|█████████████████████████████████▊           | 6/8 [02:39<00:40, 20.12s/it]

OpenFF Alkane Torsion Drives v1.0


 88%|███████████████████████████████████████▍     | 7/8 [02:43<00:14, 14.75s/it]

OpenFF Cresset Additional Coverage TorsionDrives v4.0


100%|█████████████████████████████████████████████| 8/8 [02:46<00:00, 20.76s/it]


In [None]:
from yammbs.checkmol import analyse_functional_groups, ChemicalEnvironment

In [14]:
datasets = client.list_datasets()
datasets

[{'id': 35,
  'dataset_type': 'torsiondrive',
  'dataset_name': 'OpenFF Fragmenter Phenyl Benchmark',
  'record_count': 454},
 {'id': 36,
  'dataset_type': 'torsiondrive',
  'dataset_name': 'OpenFF Group1 Torsions',
  'record_count': 820},
 {'id': 41,
  'dataset_type': 'optimization',
  'dataset_name': 'OpenFF Optimization Set 1',
  'record_count': 937},
 {'id': 42,
  'dataset_type': 'torsiondrive',
  'dataset_name': 'Fragment Stability Benchmark',
  'record_count': 86},
 {'id': 43,
  'dataset_type': 'optimization',
  'dataset_name': 'SMIRNOFF Coverage Set 1',
  'record_count': 1132},
 {'id': 45,
  'dataset_type': 'optimization',
  'dataset_name': 'OpenFF VEHICLe Set 1',
  'record_count': 25500},
 {'id': 48,
  'dataset_type': 'torsiondrive',
  'dataset_name': 'SMIRNOFF Coverage Torsion Set 1',
  'record_count': 585},
 {'id': 49,
  'dataset_type': 'optimization',
  'dataset_name': 'OpenFF NCI250K Boron 1',
  'record_count': 189},
 {'id': 50,
  'dataset_type': 'optimization',
  'dataset_

In [15]:
opts = [ds for ds in datasets if ds["dataset_type"] == "optimization"]

In [16]:
opts

[{'id': 41,
  'dataset_type': 'optimization',
  'dataset_name': 'OpenFF Optimization Set 1',
  'record_count': 937},
 {'id': 43,
  'dataset_type': 'optimization',
  'dataset_name': 'SMIRNOFF Coverage Set 1',
  'record_count': 1132},
 {'id': 45,
  'dataset_type': 'optimization',
  'dataset_name': 'OpenFF VEHICLe Set 1',
  'record_count': 25500},
 {'id': 49,
  'dataset_type': 'optimization',
  'dataset_name': 'OpenFF NCI250K Boron 1',
  'record_count': 189},
 {'id': 50,
  'dataset_type': 'optimization',
  'dataset_name': 'OpenFF Discrepancy Benchmark 1',
  'record_count': 19714},
 {'id': 68,
  'dataset_type': 'optimization',
  'dataset_name': 'Pfizer Discrepancy Optimization Dataset 1',
  'record_count': 352},
 {'id': 69,
  'dataset_type': 'optimization',
  'dataset_name': 'FDA Optimization Dataset 1',
  'record_count': 6670},
 {'id': 196,
  'dataset_type': 'optimization',
  'dataset_name': 'OpenFF Primary Optimization Benchmark 1',
  'record_count': 1885},
 {'id': 197,
  'dataset_type':

In [17]:
dataset_names = [opt["dataset_name"] for opt in opts]
dataset_names

['OpenFF Optimization Set 1',
 'SMIRNOFF Coverage Set 1',
 'OpenFF VEHICLe Set 1',
 'OpenFF NCI250K Boron 1',
 'OpenFF Discrepancy Benchmark 1',
 'Pfizer Discrepancy Optimization Dataset 1',
 'FDA Optimization Dataset 1',
 'OpenFF Primary Optimization Benchmark 1',
 'OpenFF Full Optimization Benchmark 1',
 'Kinase Inhibitors: WBO Distributions',
 'OpenFF Gen 2 Opt Set 1 Roche',
 'OpenFF Gen 2 Opt Set 2 Coverage',
 'OpenFF Gen 2 Opt Set 4 eMolecules Discrepancy',
 'OpenFF Gen 2 Opt Set 3 Pfizer Discrepancy',
 'OpenFF Gen 2 Opt Set 5 Bayer',
 'OpenFF Ehrman Informative Optimization v0.1',
 'OpenFF Ehrman Informative Optimization v0.2',
 'OpenFF Protein Fragments v1.0',
 'OpenFF Protein Fragments v2.0',
 'OpenFF Sandbox CHO PhAlkEthOH v1.0',
 'OpenFF Protein Peptide Fragments constrained v1.0',
 'OpenFF Protein Peptide Fragments unconstrained v1.0',
 'OpenFF PEPCONF OptimizationDataset v1.0',
 'OpenFF Roche Opt Set With Protomers and Tautomers v1.0',
 'Genentech PDB Ligand Expo whole opti