# SynFerm data preparation
#### Targets:
- Import experiment, representation, and target data from db
- Export to CSV
#### Difference to previous
- Get reactionSMILES from virtuallibrary -> they will not have stereochem on the PG
- Exclude Mon078
- Don't export to JSON (we don't need this anymore)

In [3]:
import datetime
import pathlib
import sys

sys.path.append(str(pathlib.Path().resolve().parents[1]))

import pandas as pd

from src.util.db_utils import SynFermDatabaseConnection
from src.definitions import DATA_DIR

In [4]:
# open the project's database connection object; its `.con` attribute is what the raw SQL query is executed on
con = SynFermDatabaseConnection()  # we will use this for various simple queries

## Import data

In [6]:
# Only valid reactions are selected: the INNER JOIN with the labels table keeps just the
# experiments that have labels, and the WHERE clause excludes Mon078.
query = """SELECT e.id, r.I_long, r.M_long, r.T_long, r.product_A_smiles, r.I_smiles, r.M_smiles, r.T_smiles, r.reaction_smiles, v.reaction_smiles_atom_mapped, l.binary_A, l.binary_B, l.binary_C, l.binary_D, l.binary_E, l.binary_F, l.binary_G, l.binary_H, l.scaled_A, l.scaled_B, l.scaled_C, l.scaled_D, l.scaled_E, l.scaled_F, l.scaled_G, l.scaled_H, l."major_A-C" FROM experiments e LEFT JOIN representations r on e.id = r.experiment_id INNER JOIN labels l on e.id = l.experiment_id LEFT JOIN virtuallibrary v on e.vl_id = v.id WHERE e.monomer_long != 'Mon078';"""
res = con.con.execute(query).fetchall()

# DataFrame column names, listed in the same order as the SELECT list above
columns = (
    ["experiment_id", "I_long", "M_long", "T_long", "product_A_smiles", "I_smiles", "M_smiles", "T_smiles", "reaction_smiles", "reaction_smiles_atom_mapped"]
    + [f"binary_{i}" for i in "ABCDEFGH"]
    + [f"scaled_{i}" for i in "ABCDEFGH"]
    + ["major_A-C"]
)
df = pd.DataFrame(data=res, columns=columns)
print(f'Number of reactions (in total): {len(df)}')

Number of reactions (in total): 39857


In [7]:
# doublecheck we don't have missing values
# (only the scaled_A column is inspected here; the displayed number is its count of NaN entries)
df['scaled_A'].isna().sum()

0

## Aggregate duplicates
For training, we want to remove duplicates from our data.
To aggregate we follow these steps:
1. Take the mean of the scaled values
2. From the mean scaled values, calculate the binary labels and the major_A-C label

In [8]:
# how many duplicates are there?
# (counts rows whose product_A_smiles value already occurred in an earlier row)
df["product_A_smiles"].duplicated().sum()

371

In [9]:
# preview the first rows of the table before duplicates are aggregated
df.head()

Unnamed: 0,experiment_id,I_long,M_long,T_long,product_A_smiles,I_smiles,M_smiles,T_smiles,reaction_smiles,reaction_smiles_atom_mapped,...,binary_H,scaled_A,scaled_B,scaled_C,scaled_D,scaled_E,scaled_F,scaled_G,scaled_H,major_A-C
0,10578,Ph023,Mon017,TerTH010,CC(C)(C)OC(=O)CC[C@@H](Cc1nnc(C=Cc2ccccc2)s1)N...,O=C(c1ccc(Cl)cc1)[B-](F)(F)F.[K+],CC(C)(C)OC(=O)CC[C@H]1C[C@]2(ON1)OC1(CCCCC1)OC2=O,Cl.NNC(=S)/C=C/c1ccccc1,O=C(c1ccc(Cl)cc1)[B-](F)(F)F.CC(C)(C)OC(=O)CC[...,F[B-](F)(F)[C:1](=[O:2])[c:11]1[cH:12][cH:14][...,...,1.0,2.430935,1.277949,0.285254,0.0,0.413053,0.232611,5.452566,0.290557,A
1,10579,Ph023,Mon017,TerTH026,CC(C)(C)OC(=O)CC[C@@H](Cc1nnc(-c2cn[nH]c2)s1)N...,O=C(c1ccc(Cl)cc1)[B-](F)(F)F.[K+],CC(C)(C)OC(=O)CC[C@H]1C[C@]2(ON1)OC1(CCCCC1)OC2=O,Cl.NNC(=S)c1cn[nH]c1,O=C(c1ccc(Cl)cc1)[B-](F)(F)F.CC(C)(C)OC(=O)CC[...,F[B-](F)(F)[C:1](=[O:2])[c:11]1[cH:12][cH:14][...,...,0.0,0.382317,0.926318,0.24002,0.0,0.064906,0.339427,5.693844,0.0,B
2,10580,Ph023,Mon017,TerTH015,CC(C)(C)OC(=O)CC[C@@H](Cc1nnc(-c2cc(Cl)cc(Cl)c...,O=C(c1ccc(Cl)cc1)[B-](F)(F)F.[K+],CC(C)(C)OC(=O)CC[C@H]1C[C@]2(ON1)OC1(CCCCC1)OC2=O,Cl.NNC(=S)c1cc(Cl)cc(Cl)c1,O=C(c1ccc(Cl)cc1)[B-](F)(F)F.CC(C)(C)OC(=O)CC[...,F[B-](F)(F)[C:1](=[O:2])[c:11]1[cH:12][cH:14][...,...,0.0,0.931135,0.867479,0.042005,0.0,0.0,0.291865,5.59629,0.0,A
3,10581,Ph023,Mon017,TerTH020,CN(C)c1cccc(-c2nnc(C[C@H](CCC(=O)OC(C)(C)C)NC(...,O=C(c1ccc(Cl)cc1)[B-](F)(F)F.[K+],CC(C)(C)OC(=O)CC[C@H]1C[C@]2(ON1)OC1(CCCCC1)OC2=O,CN(C)c1cccc(C(=S)NN)c1.Cl,O=C(c1ccc(Cl)cc1)[B-](F)(F)F.CC(C)(C)OC(=O)CC[...,F[B-](F)(F)[C:1](=[O:2])[c:11]1[cH:12][cH:14][...,...,1.0,2.138999,2.543982,0.074154,0.00083,0.283941,0.321137,6.585098,0.213757,B
4,10584,Ph023,Mon017,TerABT001,CC(C)(C)OC(=O)CC[C@@H](Cc1nc2ccccc2s1)NC(=O)c1...,O=C(c1ccc(Cl)cc1)[B-](F)(F)F.[K+],CC(C)(C)OC(=O)CC[C@H]1C[C@]2(ON1)OC1(CCCCC1)OC2=O,Nc1ccccc1S,O=C(c1ccc(Cl)cc1)[B-](F)(F)F.CC(C)(C)OC(=O)CC[...,F[B-](F)(F)[C:1](=[O:2])[c:15]1[cH:16][cH:18][...,...,1.0,2.400752,0.011716,0.0,0.001561,0.209155,1.204209,7.226035,0.811754,A


In [10]:
# aggregate duplicates: rows that agree on all building-block and SMILES columns become one record
# NOTE: pandas' groupby excludes rows with NaN in any key column by default (dropna=True),
# so a row with a missing key value would not appear in the aggregated result.
key_columns = ["I_long", "M_long", "T_long", "product_A_smiles", "I_smiles", "M_smiles", "T_smiles", "reaction_smiles", "reaction_smiles_atom_mapped"]
group = df.groupby(key_columns)
scaled_columns = [f"scaled_{i}" for i in "ABCDEFGH"]

# take the mean of the scaled values
scaled_responses = group[scaled_columns].mean()

# reassign the binary labels: 1 if the mean scaled response is > 0, else 0
# (vectorized comparison; DataFrame.applymap is deprecated in recent pandas versions)
binary_responses = (scaled_responses > 0).astype(int).rename(columns={f"scaled_{i}": f"binary_{i}" for i in "ABCDEFGH"})

# reassign the major_A-C label: whichever of A, B, C has the largest mean scaled response
scaled_abc = scaled_responses[[f"scaled_{i}" for i in "ABC"]]
# remove the literal "scaled_" prefix from the column name; str.strip() would instead treat
# its argument as a set of characters to strip from both ends
major = scaled_abc.idxmax(axis=1).str.replace("scaled_", "", regex=False).rename("major_A-C")
# if none of A, B, C was observed, there is no major product
major.loc[scaled_abc.sum(axis=1) == 0] = "no_product"

# merge the results
# a group with a single experiment keeps its id unchanged; the ids of merged duplicates are joined with "/"
exp_nr = group["experiment_id"].agg(lambda x: x.iloc[0] if len(x) == 1 else "/".join(str(i) for i in x))
df = (
    pd.merge(exp_nr, binary_responses, left_index=True, right_index=True)
    .merge(scaled_responses, left_index=True, right_index=True)
    .merge(major, left_index=True, right_index=True)
    .reset_index()
)
# length should be original length minus number of duplicates
len(df)

39486

In [13]:
# fraction of records whose binary_A label is positive
n_positive = df["binary_A"].sum()
n_positive / len(df)

0.8208478954566175

## Export
Now we have a cleaned dataset. Export to CSV.

In [14]:
# write the dataset to CSV; the file name carries today's date and the number of records
out_dir = DATA_DIR / "curated_data"
out_name = f"synferm_dataset_{datetime.datetime.today().strftime('%Y-%m-%d')}_{len(df)}records.csv"
df.to_csv(out_dir / out_name, index=False)