In [1]:
import re
import pathlib
import tqdm
import numpy as np
import pandas as pd

In [20]:
def get_smiles(batch_name: str, target_name: str, indicate_log: str):
    """Load the SDF input of one fitting target and return its SMILES strings.

    The SDF file is looked up inside a ``targets`` directory located four
    levels above ``indicate_log``:

    * if ``batch_name`` starts with ``"opt"``, the file is
      ``targets/<batch_name>/<target_name>.sdf``;
    * otherwise it is ``targets/<target_name>/input.sdf``.

    Parameters
    ----------
    batch_name
        Name of the batch directory the log file belongs to.
    target_name
        Name of the target as written in the log file.
    indicate_log
        Path to the ``indicate.log`` file (anything ``pathlib.Path`` accepts).

    Returns
    -------
    tuple
        ``(mapped_smiles, smiles)`` as produced by ``Molecule.to_smiles``
        with ``mapped=True`` and with default arguments, respectively.
    """
    from openff.toolkit.topology import Molecule

    # Climb four directory levels from the log file, one step at a time.
    base_directory = pathlib.Path(indicate_log)
    for _ in range(4):
        base_directory = base_directory.parent
    targets_root = base_directory / "targets"

    if batch_name.startswith("opt"):
        relative_parts = (batch_name, f"{target_name}.sdf")
    else:
        relative_parts = (target_name, "input.sdf")
    sdf_path = targets_root.joinpath(*relative_parts).resolve()

    molecule = Molecule.from_file(
        str(sdf_path),
        "SDF",
        allow_undefined_stereo=True,
    )
    mapped_smiles = molecule.to_smiles(mapped=True)
    plain_smiles = molecule.to_smiles()
    return mapped_smiles, plain_smiles

In [31]:
def _parse_target_name(name: str):
    """Split a target name into ``(QCArchive ID, batch ID, target type)``.

    Names starting with ``"torsion"`` are read as ``torsion-<QCArchive ID>``
    and get a batch ID of ``-1``; every other name is read as
    ``<QCArchive ID>-<batch ID>`` and labelled ``"optgeo"``.
    """
    if name.startswith("torsion"):
        return int(name.split("-")[1]), -1, "torsion"
    qcarchive_id, batch_id = name.split("-")
    return int(qcarchive_id), int(batch_id), "optgeo"


def read_directory():
    """Collect the per-target terms of every ``indicate.log`` below the cwd.

    Log files are discovered with the glob
    ``*/*/*/optimize.tmp/*/iter_0000/indicate.log`` relative to the current
    working directory. The three leading path components are recorded as
    "Experiment", "Environment" and "Replicate", and the directory below
    ``optimize.tmp`` as "Batch".

    A log line is kept only when its first whitespace-separated field
    contains exactly one ``"-"``; that field is the target name and the last
    field of the line is parsed as the float "Term". SMILES strings for each
    target are obtained from ``get_smiles``.

    Returns
    -------
    pandas.DataFrame
        One row per retained log line. The columns are always present, even
        when no log file is found.

    Raises
    ------
    ValueError
        If a retained line has a non-numeric last field or a target name
        whose ID parts are not integers.
    """
    here = pathlib.Path(".")
    indicate_logs = sorted(
        here.glob("*/*/*/optimize.tmp/*/iter_0000/indicate.log")
    )

    columns = (
        "Experiment",
        "Environment",
        "Replicate",
        "Target type",
        "Batch",
        "Target name",
        "QCArchive ID",
        "Batch ID",
        "Mapped SMILES",
        "SMILES",
        "Term",
    )
    # Column-wise accumulation keeps the column set even with zero rows.
    data = {column: [] for column in columns}

    for logfile in tqdm.tqdm(indicate_logs):
        # Everything derived from the path is constant for the whole file,
        # so compute it once instead of once per line.
        batch_directory = logfile.parent.parent
        rep_directory = batch_directory.parent.parent
        # Use .name rather than .stem: these are directory names, and .stem
        # would drop everything after the last dot (e.g. "fb-0.2.3" ->
        # "fb-0.2"), corrupting both the labels and the path that get_smiles
        # builds from the batch name.
        batch_name = batch_directory.name
        experiment = rep_directory.parent.parent.name
        environment = rep_directory.parent.name
        replicate = rep_directory.name

        with logfile.open("r") as f:
            lines = f.readlines()

        for line in lines:
            fields = line.split()
            # Only lines whose first field has exactly one dash are targets.
            if not fields or fields[0].count("-") != 1:
                continue

            name = fields[0]
            qcarchive_id, batch_id, term_type = _parse_target_name(name)
            term = float(fields[-1])
            mapped_smiles, smiles = get_smiles(batch_name, name, logfile)

            row = {
                "Experiment": experiment,
                "Environment": environment,
                "Replicate": replicate,
                "Target type": term_type,
                "Batch": batch_name,
                "Target name": name,
                "QCArchive ID": qcarchive_id,
                "Batch ID": batch_id,
                "Mapped SMILES": mapped_smiles,
                "SMILES": smiles,
                "Term": term,
            }
            for column, value in row.items():
                data[column].append(value)

    # pd comes from the notebook's top-level import cell.
    return pd.DataFrame(data)

In [38]:
# Gather the per-target terms from every indicate.log found below the
# current working directory into a single DataFrame.
df = read_directory()

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [00:00<00:00, 94.11it/s]


In [39]:
# Show the collected table (one row per target per log file).
df

Unnamed: 0,Experiment,Environment,Replicate,Target type,Batch,Target name,QCArchive ID,Batch ID,Mapped SMILES,SMILES,Term
0,01_opt-48-17,fb-193-tk-010-oe-2022,rep1,optgeo,opt-geo-batch-48,18437974-17,18437974,17,[H:20][c:2]1[c:4]([c:8]([c:5]([c:3]([c:7]1[C:1...,[H]c1c(c(c(c(c1C#N)[H])[H])/C(=N/N([H])c2nc(c(...,2097.572
1,01_opt-48-17,fb-193-tk-010-oe-2022,rep2,optgeo,opt-geo-batch-48,18437974-17,18437974,17,[H:20][c:2]1[c:4]([c:8]([c:5]([c:3]([c:7]1[C:1...,[H]c1c(c(c(c(c1C#N)[H])[H])/C(=N/N([H])c2nc(c(...,2097.572
2,01_opt-48-17,fb-193-tk-010-oe-2022,rep3,optgeo,opt-geo-batch-48,18437974-17,18437974,17,[H:20][c:2]1[c:4]([c:8]([c:5]([c:3]([c:7]1[C:1...,[H]c1c(c(c(c(c1C#N)[H])[H])/C(=N/N([H])c2nc(c(...,2097.572
3,01_opt-48-17,fb-195-tk-013-oe-2022-interchange-replace-cache,rep1,optgeo,opt-geo-batch-48,18437974-17,18437974,17,[H:20][c:2]1[c:4]([c:8]([c:5]([c:3]([c:7]1[C:1...,[H]c1c(c(c(c(c1C#N)[H])[H])/C(=N/N([H])c2nc(c(...,442.121
4,01_opt-48-17,fb-195-tk-013-oe-2022-interchange-replace-cache,rep2,optgeo,opt-geo-batch-48,18437974-17,18437974,17,[H:20][c:2]1[c:4]([c:8]([c:5]([c:3]([c:7]1[C:1...,[H]c1c(c(c(c(c1C#N)[H])[H])/C(=N/N([H])c2nc(c(...,442.121
5,01_opt-48-17,fb-195-tk-013-oe-2022-interchange-replace-cache,rep3,optgeo,opt-geo-batch-48,18437974-17,18437974,17,[H:20][c:2]1[c:4]([c:8]([c:5]([c:3]([c:7]1[C:1...,[H]c1c(c(c(c(c1C#N)[H])[H])/C(=N/N([H])c2nc(c(...,442.121
6,02_opt-71-20,fb-193-tk-010-oe-2022,rep1,optgeo,opt-geo-batch-71,19094129-20,19094129,20,[H:16][c:1]1[c:2]([c:5]([c:6]([c:3]([c:4]1[C:9...,[H]c1c(c(c(c(c1C([H])([H])C([H])([H])N([H])[H]...,470.715
7,02_opt-71-20,fb-193-tk-010-oe-2022,rep2,optgeo,opt-geo-batch-71,19094129-20,19094129,20,[H:16][c:1]1[c:2]([c:5]([c:6]([c:3]([c:4]1[C:9...,[H]c1c(c(c(c(c1C([H])([H])C([H])([H])N([H])[H]...,470.715
8,02_opt-71-20,fb-193-tk-010-oe-2022,rep3,optgeo,opt-geo-batch-71,19094129-20,19094129,20,[H:16][c:1]1[c:2]([c:5]([c:6]([c:3]([c:4]1[C:9...,[H]c1c(c(c(c(c1C([H])([H])C([H])([H])N([H])[H]...,470.715
9,02_opt-71-20,fb-195-tk-013-oe-2022-interchange-replace-cache,rep1,optgeo,opt-geo-batch-71,19094129-20,19094129,20,[H:16][c:1]1[c:2]([c:5]([c:6]([c:3]([c:4]1[C:9...,[H]c1c(c(c(c(c1C([H])([H])C([H])([H])N([H])[H]...,117.418


In [43]:
from openmm import unit

In [47]:
# Extract the bare number through the public Quantity API rather than the
# private `_value` attribute, which is an implementation detail.
(1.1 * unit.elementary_charge).value_in_unit(unit.elementary_charge)

1.1