# Diasteromers
The data contains some diastereomeric monomers. We check how consistent these react.

In [26]:
import pathlib
import sys

sys.path.append(str(pathlib.Path().resolve().parents[1]))

import pandas as pd
from sklearn.metrics import accuracy_score

from src.util.db_utils import SynFermDatabaseConnection
from src.definitions import DATA_DIR

In [2]:
con = SynFermDatabaseConnection()

In [4]:
res = con.con.execute("SELECT initiator_long, monomer_long, terminator_long, long_name, l.binary_A, l.binary_B, l.binary_C, l.binary_D, l.binary_E, l.binary_F, l.binary_G, l.binary_H FROM experiments INNER JOIN main.labels l on experiments.id = l.experiment_id WHERE valid IS NULL OR valid NOT LIKE 'ERROR%';").fetchall()
df = pd.DataFrame(res, columns=["initiator", "monomer", "terminator", "long", "binary_A", "binary_B", "binary_C", "binary_D", "binary_E", "binary_F", "binary_G", "binary_H"])
df.head()

Unnamed: 0,initiator,monomer,terminator,long,binary_A,binary_B,binary_C,binary_D,binary_E,binary_F,binary_G,binary_H
0,Ph023,Mon017,TerTH010,Ph023 + Mon017 + TerTH010,1,1,1,0,1,1,1,1.0
1,Ph023,Mon017,TerTH026,Ph023 + Mon017 + TerTH026,1,1,1,0,1,1,1,0.0
2,Ph023,Mon017,TerTH015,Ph023 + Mon017 + TerTH015,1,1,1,0,0,1,1,0.0
3,Ph023,Mon017,TerTH020,Ph023 + Mon017 + TerTH020,1,1,1,1,1,1,1,1.0
4,Ph023,Mon017,TerABT001,Ph023 + Mon017 + TerABT001,1,1,0,1,1,1,1,1.0


In [5]:
enantiomer_pairs = [
["Mon001", "Mon087"],
["Mon003", "Mon078"],
["Mon011", "Mon088"],
["Mon013", "Mon074"],
["Mon014", "Mon090"],
["Mon015", "Mon076"],
["Mon016", "Mon096"],
["Mon017", "Mon075"],
["Mon019", "Mon091"],
["Mon020", "Mon077"],
]

In [35]:
# check the accuracy for one of the enantiomers approximating the outcome for the other
for pair in enantiomer_pairs:
    exp1 = df.loc[df["monomer"] == pair[0]]
    exp2 = df.loc[df["monomer"] == pair[1]]
    both = pd.merge(exp1, exp2, on=["initiator", "terminator"], how="inner")
    print(pair)
    print(f"{len(both)} overlapping samples")
    for s in "ABCDEFGH":
        print(s, ":", accuracy_score(both[f"binary_{s}_x"], both[f"binary_{s}_y"]))

['Mon001', 'Mon087']
149 overlapping samples
A : 0.9798657718120806
B : 0.9060402684563759
C : 0.959731543624161
D : 0.9798657718120806
E : 0.9530201342281879
F : 0.9463087248322147
G : 0.8791946308724832
H : 0.8859060402684564
['Mon003', 'Mon078']
0 overlapping samples
A : nan
B : nan
C : nan
D : nan
E : nan
F : nan
G : nan
H : nan
['Mon011', 'Mon088']
106 overlapping samples
A : 1.0
B : 0.9433962264150944
C : 0.8113207547169812
D : 1.0
E : 0.9528301886792453
F : 1.0
G : 0.9245283018867925
H : 0.7924528301886793
['Mon013', 'Mon074']
123 overlapping samples
A : 1.0
B : 0.983739837398374
C : 0.9105691056910569
D : 0.7804878048780488
E : 0.983739837398374
F : 1.0
G : 0.9512195121951219
H : 0.8536585365853658
['Mon014', 'Mon090']
158 overlapping samples
A : 1.0
B : 0.9810126582278481
C : 0.9493670886075949
D : 0.9556962025316456
E : 1.0
F : 0.9746835443037974
G : 0.9113924050632911
H : 0.8734177215189873
['Mon015', 'Mon076']
976 overlapping samples
A : 0.9989754098360656
B : 0.93852459016

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


## Conclusion
For most products, formation is highly reproducible, especially for A, B, E, and F. H is the worst but still around 85-90% reproducible.
