In [27]:
import re
import pathlib
import tqdm
import numpy as np
import pandas as pd
import MDAnalysis as mda
from openff.toolkit.topology.molecule import Molecule, unit
import seaborn as sns
from matplotlib import pyplot as plt

%matplotlib inline

In [28]:
def get_smiles(batch_name: str, target_name: str):
    from openff.toolkit.topology import Molecule
    
    target_directory = pathlib.Path("../targets")
    
    if batch_name.startswith("opt"):
        file = target_directory / batch_name / f"{target_name}.sdf"
    else:
        file = target_directory / target_name / "input.sdf"
    
    mol = Molecule.from_file(str(file.resolve()), "SDF", allow_undefined_stereo=True)
    return mol.to_smiles(mapped=True), mol.to_smiles()

In [29]:
def read_indicate(logfile):
    import pandas as pd
    
    logfile = pathlib.Path(logfile)

    data = {
        "Target type": [],
        "Batch": [],
        "Target name": [],
        "QCArchive ID": [],
        "Batch ID": [],
        "Mapped SMILES": [],
        "SMILES": [],
        "Term": [],
    }
    with logfile.open("r") as f:
        contents = [x.strip() for x in f.readlines()]
    for line in contents:
        fields = line.split()
        if not fields or not len(re.findall("-", fields[0])) == 1:
            continue
        name = fields[0]
        if name.startswith("torsion"):
            qcarchive_id = name.split("-")[1]
            batch_id = -1
            term_type = "torsion"
            continue
        else:
            qcarchive_id, batch_id = name.split("-")
            term_type = "optgeo"

        batch_name = logfile.parent.parent.stem
        term = float(fields[-1])
        mapped_smiles, smiles = get_smiles(batch_name, name)

        data["Target type"].append(term_type)
        data["Batch"].append(batch_name)
        data["Target name"].append(name)
        data["QCArchive ID"].append(int(qcarchive_id))
        data["Batch ID"].append(int(batch_id))
        data["Mapped SMILES"].append(mapped_smiles)
        data["SMILES"].append(smiles)
        data["Term"].append(term)
    
    df = pd.DataFrame(data)
    df.index = df["Target name"]
    
    directory = logfile.parent.parent.parent.parent
    df["Replicate"] = directory.stem
    df["Environment"] = directory.parent.stem
    df["Project"] = directory.parent.parent.stem
    return df

In [30]:
def read_all():
    here = pathlib.Path(".")
    indicate_logs = sorted(here.glob("*/*/*/optimize.tmp/*/iter_0000/indicate.log"))
    
    dfs = []
    for logfile in tqdm.tqdm(indicate_logs):
        dfs.append(read_indicate(logfile))
    df = pd.concat(dfs)
    return df

In [None]:
df = read_all()

 99%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊  | 30326/30761 [25:06<01:00,  7.14it/s]

In [None]:
df.to_csv("all_targets.csv")

In [None]:
df

In [None]:
df.columns

In [None]:
opt_columns = [
    'Batch', 'QCArchive ID', 'Batch ID',
   'Mapped SMILES', 'SMILES', 'Term', 'Replicate', 'Environment',
   'Project'
]

In [None]:
opt_df = pd.DataFrame(
    df[df["Target type"] == "optgeo"][opt_columns]
)

In [None]:
def get_rmsd(
    batch_name: str,
    target_name: str,
    project_a: str,
    environment_a: str,
    replicate_a: str,
    project_b: str,
    environment_b: str,
    replicate_b: str
):
    from rdkit import Chem
    from rdkit.Chem import AllChem
    from MDAnalysis.analysis import rms
    
    mapped_smiles, _ = get_smiles(batch_name, target_name)
    
    mol = Molecule.from_mapped_smiles(mapped_smiles, allow_undefined_stereo=True)
    
    xyz_subpath = f"optimize.tmp/{batch_name}/iter_0000/{target_name}_mmopt.xyz"
    xyz_a = pathlib.Path(project_a) / environment_a / replicate_a / xyz_subpath
    xyz_b = pathlib.Path(project_b) / environment_b / replicate_b / xyz_subpath
    
    conformer_a = np.array(mda.Universe(xyz_a).atoms.positions).astype(float)
    conformer_b = np.array(mda.Universe(xyz_b).atoms.positions).astype(float)
    
    rmsd = rms.rmsd(conformer_a, conformer_b)
    
    mol._conformers = [conformer_a * unit.angstrom]
    rdmol_a = mol.to_rdkit()
    
    mol._conformers = [conformer_b * unit.angstrom]
    rdmol_b = mol.to_rdkit()
    
    best_rmsd = AllChem.GetBestRMS(rdmol_a, rdmol_b)
    return rmsd, best_rmsd

## Non-1.9.6 differences

In [65]:
potential_problem_targets = {}
consistent = []

opt_df_not_196 = opt_df[opt_df.Environment != "fb-196-ic-0318-oe-2022"]
for (batch_id, target_name), subdf in opt_df_not_196.groupby(by=["Batch", "Target name"]):
    unique_terms = subdf.Term.unique()
    difference = subdf.Term.max() - subdf.Term.min()
    if difference > 0.001:
        potential_problem_targets[(batch_id, target_name)] = subdf
    else:
        consistent.append((batch_id, target_name))

In [66]:
len(potential_problem_targets)

666

In [67]:
keys = list(potential_problem_targets.keys())

In [73]:
key = keys[1]
key

('opt-geo-batch-1', '18433053-13')

In [76]:
potential_problem_targets[key].Term.values

array([555.896, 555.896, 555.896, 555.896, 555.896, 555.896, 555.897,
       555.897, 555.897, 555.896, 555.896, 555.896, 555.896, 555.896,
       555.896, 555.897, 555.897, 555.897, 555.896, 555.896, 555.896,
       555.896, 555.896, 555.896, 555.897, 555.897, 555.897])

In [77]:
help(get_rmsd)

Help on function get_rmsd in module __main__:

get_rmsd(batch_name: str, target_name: str, project_a: str, environment_a: str, replicate_a: str, project_b: str, environment_b: str, replicate_b: str)



In [78]:

project = potential_problem_targets[key].Project.values[-1]
get_rmsd(
    *key,
    project_a=project,
    environment_a="fb-193-tk-010-oe-2022-reordered",
    replicate_a="rep1",
    project_b=project,
    environment_b="fb-195-tk-013-oe-2022-interchange-replace-cache",
    replicate_b="rep1",
)

(1.1425002725123635e-06, 1.1101660563055304e-06)

In [81]:
rmsd_difference = {
    "Difference": [],
    "RMSD": [],
    "Best RMSD": []
}

for key in tqdm.tqdm(potential_problem_targets):
    a = "fb-193-tk-010-oe-2022-reordered"
    b = "fb-195-tk-013-oe-2022-interchange-replace-cache"
    subdf = potential_problem_targets[key]
    sub_a = subdf[subdf.Environment == a]
    sub_b = subdf[subdf.Environment == b]
    difference = abs(sub_a.Term.values[0] - sub_b.Term.values[0])
    project = subdf.Project.values[-1]
    rmsd, best_rmsd = get_rmsd(
        *key,
        project_a=project,
        environment_a=a,
        replicate_a="rep1",
        project_b=project,
        environment_b=b,
        replicate_b="rep1",
    )
    
    rmsd_difference["Difference"].append(difference)
    rmsd_difference["RMSD"].append(rmsd)
    rmsd_difference["Best RMSD"].append(best_rmsd)

rmsd_difference_df = pd.DataFrame(rmsd_difference)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 666/666 [08:09<00:00,  1.36it/s]


In [86]:
rmsd_difference_df.to_csv("rmsd-vs-objective_193-vs-195.csv")

In [93]:
diff_001 = rmsd_difference_df[rmsd_difference_df.Difference <= 0.0011]
print(diff_001.RMSD.min(), diff_001.RMSD.max())
print(diff_001["Best RMSD"].min(), diff_001["Best RMSD"].max())


7.99564522107703e-08 5.3510634345326214e-05
0.0 5.3504443958427124e-05


In [95]:
diff_002 = rmsd_difference_df[
    (rmsd_difference_df.Difference > 0.0011)
    & (rmsd_difference_df.Difference <= 0.0021)
]
print(diff_002.RMSD.min(), diff_002.RMSD.max())
print(diff_002["Best RMSD"].min(), diff_002["Best RMSD"].max())


5.497979818346711e-07 7.2445385466706e-05
5.277620225233711e-07 7.244401154226771e-05


In [104]:
rmsd_difference_df.Difference.max()

1030.8169999999998

In [107]:
for counter in range(100):
    low = 0.0001 + (counter * 0.1)
    high = low + 0.1
    subdf = rmsd_difference_df[
        (rmsd_difference_df.Difference > low)
        & (rmsd_difference_df.Difference <= high)
    ]
    if not len(subdf):
        continue
    rmsd_low = subdf["RMSD"].min()
    rmsd_high = subdf["RMSD"].max()
    best_rmsd_low = subdf["Best RMSD"].min()
    best_rmsd_high = subdf["Best RMSD"].max()
    print(f"{low:.3f}-{high:.3f},{rmsd_low:.0e} to {rmsd_high:.0e},{best_rmsd_low:.0e} to {best_rmsd_high:.0e}")

0.000-0.100,8e-08 to 6e-04,0e+00 to 6e-04
0.100-0.200,6e-05 to 8e-04,6e-05 to 8e-04
0.200-0.300,5e-04 to 8e-04,5e-04 to 8e-04
0.300-0.400,4e-04 to 4e-04,4e-04 to 4e-04
7.500-7.600,1e-01 to 1e-01,1e-01 to 1e-01


## Between-rep differences

In [46]:
potential_problem_targets_all = {}
consistent_all = []
for (batch_id, target_name), subdf in opt_df.groupby(by=["Batch", "Target name"]):
    difference = subdf.Term.max() - subdf.Term.min()
    if difference > 5:
        potential_problem_targets_all[(batch_id, target_name)] = subdf
    else:
        consistent_all.append((batch_id, target_name))

In [47]:
len(potential_problem_targets_all)

44

In [48]:
len(consistent_all)

5394

In [49]:
keys = list(potential_problem_targets_all.keys())

In [50]:
key = keys[0]
key

('opt-geo-batch-108', '19095393-10')

In [51]:
potential_problem_targets_all[key]

Unnamed: 0_level_0,Batch,QCArchive ID,Batch ID,Mapped SMILES,SMILES,Term,Replicate,Environment,Project
Target name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
19095393-10,opt-geo-batch-108,19095393.0,10.0,[H:35][c:1]1[c:3]([c:9]([c:7]([n:27][c:6]1[H:4...,[H]c1c(c(c(nc1[H])[H])C([H])([H])N([H])C(=O)c2...,85.713,rep1,fb-193-tk-010-oe-2022,001_fit-all
19095393-10,opt-geo-batch-108,19095393.0,10.0,[H:35][c:1]1[c:3]([c:9]([c:7]([n:27][c:6]1[H:4...,[H]c1c(c(c(nc1[H])[H])C([H])([H])N([H])C(=O)c2...,85.713,rep2,fb-193-tk-010-oe-2022,001_fit-all
19095393-10,opt-geo-batch-108,19095393.0,10.0,[H:35][c:1]1[c:3]([c:9]([c:7]([n:27][c:6]1[H:4...,[H]c1c(c(c(nc1[H])[H])C([H])([H])N([H])C(=O)c2...,85.713,rep3,fb-193-tk-010-oe-2022,001_fit-all
19095393-10,opt-geo-batch-108,19095393.0,10.0,[H:35][c:1]1[c:3]([c:9]([c:7]([n:27][c:6]1[H:4...,[H]c1c(c(c(nc1[H])[H])C([H])([H])N([H])C(=O)c2...,85.713,rep1,fb-193-tk-010-oe-2022-reordered,001_fit-all
19095393-10,opt-geo-batch-108,19095393.0,10.0,[H:35][c:1]1[c:3]([c:9]([c:7]([n:27][c:6]1[H:4...,[H]c1c(c(c(nc1[H])[H])C([H])([H])N([H])C(=O)c2...,85.713,rep2,fb-193-tk-010-oe-2022-reordered,001_fit-all
19095393-10,opt-geo-batch-108,19095393.0,10.0,[H:35][c:1]1[c:3]([c:9]([c:7]([n:27][c:6]1[H:4...,[H]c1c(c(c(nc1[H])[H])C([H])([H])N([H])C(=O)c2...,85.713,rep3,fb-193-tk-010-oe-2022-reordered,001_fit-all
19095393-10,opt-geo-batch-108,19095393.0,10.0,[H:35][c:1]1[c:3]([c:9]([c:7]([n:27][c:6]1[H:4...,[H]c1c(c(c(nc1[H])[H])C([H])([H])N([H])C(=O)c2...,85.713,rep1,fb-195-tk-013-oe-2022-interchange-replace-cache,001_fit-all
19095393-10,opt-geo-batch-108,19095393.0,10.0,[H:35][c:1]1[c:3]([c:9]([c:7]([n:27][c:6]1[H:4...,[H]c1c(c(c(nc1[H])[H])C([H])([H])N([H])C(=O)c2...,85.713,rep2,fb-195-tk-013-oe-2022-interchange-replace-cache,001_fit-all
19095393-10,opt-geo-batch-108,19095393.0,10.0,[H:35][c:1]1[c:3]([c:9]([c:7]([n:27][c:6]1[H:4...,[H]c1c(c(c(nc1[H])[H])C([H])([H])N([H])C(=O)c2...,85.713,rep3,fb-195-tk-013-oe-2022-interchange-replace-cache,001_fit-all
19095393-10,opt-geo-batch-108,19095393.0,10.0,[H:35][c:1]1[c:3]([c:9]([c:7]([n:27][c:6]1[H:4...,[H]c1c(c(c(nc1[H])[H])C([H])([H])N([H])C(=O)c2...,85.713,rep1,fb-195-tk-013-oe-2022-interchange-replace-cach...,001_fit-all


In [52]:
target_rows = []

for key in keys:
    subdf = potential_problem_targets_all[key]
    for project, project_df in subdf.groupby("Project"):
        row = {"Batch": key[0], "Target": key[1], "Project": project}
        for env, env_df in project_df.groupby("Environment"):
            row[env] = env_df.Term.values[0]
        target_rows.append(row)

target_df = pd.DataFrame.from_records(target_rows)

In [53]:
target_df

Unnamed: 0,Batch,Target,Project,fb-193-tk-010-oe-2022,fb-193-tk-010-oe-2022-reordered,fb-195-tk-013-oe-2022-interchange-replace-cache,fb-195-tk-013-oe-2022-interchange-replace-cache-switching,fb-196-ic-0318-oe-2022,fb-196-ic-0318-oe-2022-crit
0,opt-geo-batch-108,19095393-10,001_fit-all,85.713,85.713,85.713,85.713,46.043,85.713
1,opt-geo-batch-108,19095393-10,002_fit-opt,85.713,85.713,85.713,85.713,46.043,
2,opt-geo-batch-108,19095393-10,111_opt-geo-batch-108,85.713,85.713,85.713,,46.043,
3,opt-geo-batch-109,19095414-4,001_fit-all,973.288,678.567,678.565,678.565,678.593,678.565
4,opt-geo-batch-109,19095414-4,002_fit-opt,973.288,678.567,678.565,678.565,678.593,
...,...,...,...,...,...,...,...,...,...
127,opt-geo-batch-96,19095011-20,002_fit-opt,1561.274,2592.095,1561.278,1561.278,1561.667,
128,opt-geo-batch-96,19095011-20,099_opt-geo-batch-96,1561.274,2592.095,1561.278,,1561.667,
129,opt-geo-batch-96,19095014-23,001_fit-all,239.718,1177.130,239.723,239.723,239.679,239.723
130,opt-geo-batch-96,19095014-23,002_fit-opt,239.718,1177.130,239.723,239.723,239.679,


In [65]:
(target_df["fb-196-ic-0318-oe-2022-crit"] - target_df["fb-196-ic-0318-oe-2022"]).sum()

1183.8810000000008

In [66]:
target_df.to_csv("target_differences.csv")

In [67]:
target_df_1 = target_df[target_df.Project == "001_fit-all"]

In [68]:
target_df_1.to_csv("target_differences_1.csv")

In [61]:
target_df_1

Unnamed: 0,Batch,Target,Project,fb-193-tk-010-oe-2022,fb-193-tk-010-oe-2022-reordered,fb-195-tk-013-oe-2022-interchange-replace-cache,fb-195-tk-013-oe-2022-interchange-replace-cache-switching,fb-196-ic-0318-oe-2022,fb-196-ic-0318-oe-2022-crit
0,opt-geo-batch-108,19095393-10,001_fit-all,85.713,85.713,85.713,85.713,46.043,85.713
3,opt-geo-batch-109,19095414-4,001_fit-all,973.288,678.567,678.565,678.565,678.593,678.565
6,opt-geo-batch-113,19095588-8,001_fit-all,800.755,488.803,488.803,488.803,488.806,488.803
9,opt-geo-batch-115,19095648-2,001_fit-all,241.176,592.293,241.176,241.176,241.209,241.176
12,opt-geo-batch-123,19095890-3,001_fit-all,432.754,859.172,424.808,424.808,424.811,424.808
15,opt-geo-batch-125,19095964-29,001_fit-all,557.615,557.896,557.836,557.836,527.51,557.836
18,opt-geo-batch-127,95602811-22,001_fit-all,828.404,371.009,828.402,828.402,828.471,828.402
21,opt-geo-batch-131,95602486-3,001_fit-all,169.287,395.607,169.287,169.287,169.288,169.287
24,opt-geo-batch-132,110312272-7,001_fit-all,44.947,44.947,44.947,44.947,19.051,44.947
27,opt-geo-batch-132,110312279-12,001_fit-all,19.782,19.782,19.782,19.782,11.507,19.782


In [64]:
df[
    (df["Target name"] == "18438954-4")
    & (df["Project"] == "001_fit-all")
]

Unnamed: 0_level_0,Target type,Batch,Target name,QCArchive ID,Batch ID,Mapped SMILES,SMILES,Term,Replicate,Environment,Project
Target name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
18438954-4,optgeo,opt-geo-batch-58,18438954-4,18438954.0,4.0,[H:16][c:1]1[c:2]([c:4]([c:6]([c:5]([c:3]1[H:1...,[H]c1c(c(c(c(c1[H])[H])SC2=NC(=NN2[H])[N+](=O)...,661.058,rep1,fb-193-tk-010-oe-2022,001_fit-all
18438954-4,optgeo,opt-geo-batch-58,18438954-4,18438954.0,4.0,[H:16][c:1]1[c:2]([c:4]([c:6]([c:5]([c:3]1[H:1...,[H]c1c(c(c(c(c1[H])[H])SC2=NC(=NN2[H])[N+](=O)...,661.058,rep2,fb-193-tk-010-oe-2022,001_fit-all
18438954-4,optgeo,opt-geo-batch-58,18438954-4,18438954.0,4.0,[H:16][c:1]1[c:2]([c:4]([c:6]([c:5]([c:3]1[H:1...,[H]c1c(c(c(c(c1[H])[H])SC2=NC(=NN2[H])[N+](=O)...,661.058,rep3,fb-193-tk-010-oe-2022,001_fit-all
18438954-4,optgeo,opt-geo-batch-58,18438954-4,18438954.0,4.0,[H:16][c:1]1[c:2]([c:4]([c:6]([c:5]([c:3]1[H:1...,[H]c1c(c(c(c(c1[H])[H])SC2=NC(=NN2[H])[N+](=O)...,661.059,rep1,fb-193-tk-010-oe-2022-reordered,001_fit-all
18438954-4,optgeo,opt-geo-batch-58,18438954-4,18438954.0,4.0,[H:16][c:1]1[c:2]([c:4]([c:6]([c:5]([c:3]1[H:1...,[H]c1c(c(c(c(c1[H])[H])SC2=NC(=NN2[H])[N+](=O)...,661.059,rep2,fb-193-tk-010-oe-2022-reordered,001_fit-all
18438954-4,optgeo,opt-geo-batch-58,18438954-4,18438954.0,4.0,[H:16][c:1]1[c:2]([c:4]([c:6]([c:5]([c:3]1[H:1...,[H]c1c(c(c(c(c1[H])[H])SC2=NC(=NN2[H])[N+](=O)...,661.059,rep3,fb-193-tk-010-oe-2022-reordered,001_fit-all
18438954-4,optgeo,opt-geo-batch-58,18438954-4,18438954.0,4.0,[H:16][c:1]1[c:2]([c:4]([c:6]([c:5]([c:3]1[H:1...,[H]c1c(c(c(c(c1[H])[H])SC2=NC(=NN2[H])[N+](=O)...,661.06,rep1,fb-195-tk-013-oe-2022-interchange-replace-cache,001_fit-all
18438954-4,optgeo,opt-geo-batch-58,18438954-4,18438954.0,4.0,[H:16][c:1]1[c:2]([c:4]([c:6]([c:5]([c:3]1[H:1...,[H]c1c(c(c(c(c1[H])[H])SC2=NC(=NN2[H])[N+](=O)...,661.06,rep2,fb-195-tk-013-oe-2022-interchange-replace-cache,001_fit-all
18438954-4,optgeo,opt-geo-batch-58,18438954-4,18438954.0,4.0,[H:16][c:1]1[c:2]([c:4]([c:6]([c:5]([c:3]1[H:1...,[H]c1c(c(c(c(c1[H])[H])SC2=NC(=NN2[H])[N+](=O)...,661.06,rep3,fb-195-tk-013-oe-2022-interchange-replace-cache,001_fit-all
18438954-4,optgeo,opt-geo-batch-58,18438954-4,18438954.0,4.0,[H:16][c:1]1[c:2]([c:4]([c:6]([c:5]([c:3]1[H:1...,[H]c1c(c(c(c(c1[H])[H])SC2=NC(=NN2[H])[N+](=O)...,661.06,rep1,fb-195-tk-013-oe-2022-interchange-replace-cach...,001_fit-all


## 1.9.5 differences

In [29]:
help(get_rmsd)

Help on function get_rmsd in module __main__:

get_rmsd(batch_name: str, target_name: str, project_a: str, environment_a: str, replicate_a: str, project_b: str, environment_b: str, replicate_b: str)



In [23]:
def cross_compare(
    batch_id,
    target_name,
    project="001_fit-all",
    replicate="rep1",
):
    envs = [
        "fb-193-tk-010-oe-2022",
        "fb-193-tk-010-oe-2022-reordered",
        "fb-195-tk-013-oe-2022-interchange-replace-cache",
        "fb-195-tk-013-oe-2022-interchange-replace-cache-switching",
        "fb-196-ic-0318-oe-2022",
    ]
    data = {}
    for env in envs:
        data[env] = dict.fromkeys(envs, 0)
    for i, env_a in enumerate(envs):
        for j, env_b in enumerate(envs[i+1:], i+1):
            rmsd, best_rmsd = get_rmsd(
                batch_id,
                target_name,
                project_a=project,
                environment_a=env_a,
                replicate_a=replicate,
                project_b=project,
                environment_b=env_b,
                replicate_b=replicate,
            )
            data[env_a][env_b] = rmsd
            data[env_b][env_a] = best_rmsd
    rmsd_df = pd.DataFrame.from_records(data)
    rmsd_df.to_csv(f"rmsd_cross_compare/{batch_id}_{target_name}.csv")
    return rmsd_df
        

In [39]:
df_19_17 = cross_compare("opt-geo-batch-19", "18434757-17")

In [40]:
df_19_17

Unnamed: 0,fb-193-tk-010-oe-2022,fb-193-tk-010-oe-2022-reordered,fb-195-tk-013-oe-2022-interchange-replace-cache,fb-196-ic-0318-oe-2022
fb-193-tk-010-oe-2022,0.0,8.041106e-07,0.304596,0.30454
fb-193-tk-010-oe-2022-reordered,0.542704,0.0,0.304597,0.30454
fb-195-tk-013-oe-2022-interchange-replace-cache,0.30625,0.3323345,0.0,6.9e-05
fb-196-ic-0318-oe-2022,0.306197,0.3323941,7.5e-05,0.0


In [41]:
df_20_6 = cross_compare("opt-geo-batch-20", "18434815-6")
df_20_6

Unnamed: 0,fb-193-tk-010-oe-2022,fb-193-tk-010-oe-2022-reordered,fb-195-tk-013-oe-2022-interchange-replace-cache,fb-196-ic-0318-oe-2022
fb-193-tk-010-oe-2022,0.0,1e-06,0.264101,0.264143
fb-193-tk-010-oe-2022-reordered,1e-06,0.0,0.264101,0.264143
fb-195-tk-013-oe-2022-interchange-replace-cache,0.264472,0.264472,0.0,8.1e-05
fb-196-ic-0318-oe-2022,0.264514,0.264514,8.2e-05,0.0


In [26]:
df_58_4 = cross_compare("opt-geo-batch-58", "18438954-4")
df_58_4



Unnamed: 0,fb-193-tk-010-oe-2022,fb-193-tk-010-oe-2022-reordered,fb-195-tk-013-oe-2022-interchange-replace-cache,fb-195-tk-013-oe-2022-interchange-replace-cache-switching,fb-196-ic-0318-oe-2022
fb-193-tk-010-oe-2022,0.0,3e-06,6e-06,5.913733e-06,1.591251
fb-193-tk-010-oe-2022-reordered,4e-06,0.0,3e-06,2.60448e-06,1.591251
fb-195-tk-013-oe-2022-interchange-replace-cache,8e-06,4e-06,0.0,1.040544e-07,1.591251
fb-195-tk-013-oe-2022-interchange-replace-cache-switching,8e-06,4e-06,0.0,0.0,1.591251
fb-196-ic-0318-oe-2022,1.666589,1.666589,1.666589,1.666589,0.0


In [32]:
get_rmsd(
    "opt-geo-batch-19",
    "18434757-17",
    project_a="001_fit-all",
    environment_a="fb-193-tk-010-oe-2022",
    replicate_a="rep1",
    project_b="001_fit-all",
    environment_b="fb-195-tk-013-oe-2022-interchange-replace-cache",
    replicate_b="rep1",
)



(0.30625010957393206, 0.3045961832493517)

In [33]:
get_rmsd(
    "opt-geo-batch-19",
    "18434757-17",
    project_a="001_fit-all",
    environment_a="fb-193-tk-010-oe-2022",
    replicate_a="rep1",
    project_b="001_fit-all",
    environment_b="fb-196-ic-0318-oe-2022",
    replicate_b="rep1",
)

(0.30619653517378803, 0.30453992102407207)

In [34]:
get_rmsd(
    "opt-geo-batch-19",
    "18434757-17",
    project_a="001_fit-all",
    environment_a="fb-193-tk-010-oe-2022",
    replicate_a="rep1",
    project_b="001_fit-all",
    environment_b="fb-193-tk-010-oe-2022-reordered",
    replicate_b="rep1",
)

(0.5427040210800871, 8.041106202144648e-07)

## Make targets smaller