# SynFerm synthetic data preparation

We have used the 0D model to prepare a synthetic data set.
Here, we export the synthetic data set for training.
Whereever we have a "real" result, this will supersede the synthetic result.

In [1]:
import datetime
import pathlib
import sys

sys.path.append(str(pathlib.Path().resolve().parents[1]))

import pandas as pd

from src.util.db_utils import SynFermDatabaseConnection
from src.definitions import DATA_DIR

In [2]:
con = SynFermDatabaseConnection()

## Obtain real data
Here we just import the previously prepared data set.

In [3]:
real_df = pd.read_csv(DATA_DIR / "curated_data" / f"synferm_dataset_2023-12-20_39486records.csv")[["I_long", "M_long", "T_long", "reaction_smiles_atom_mapped", "binary_A", "binary_B", "binary_C"]]
real_df.head()

Unnamed: 0,I_long,M_long,T_long,reaction_smiles_atom_mapped,binary_A,binary_B,binary_C
0,2-Pyr003,Fused002,TerABT004,F[B-](F)(F)[C:1](=[O:2])[c:15]1[cH:16][cH:18][...,1,1,0
1,2-Pyr003,Fused002,TerABT007,F[B-](F)(F)[C:1](=[O:2])[c:15]1[cH:16][cH:18][...,0,0,0
2,2-Pyr003,Fused002,TerABT013,F[B-](F)(F)[C:1](=[O:2])[c:15]1[cH:16][cH:18][...,0,0,0
3,2-Pyr003,Fused002,TerABT014,F[B-](F)(F)[C:1](=[O:2])[c:15]1[cH:16][cH:18][...,1,1,0
4,2-Pyr003,Fused002,TerTH001,F[B-](F)(F)[C:1](=[O:2])[c:11]1[cH:12][cH:14][...,1,1,0


## Obtain synthetic data

In [6]:
res = con.con.execute("SELECT v.initiator_long, v.monomer_long, v.terminator_long, v.type, v.reaction_smiles_atom_mapped, p.binary_outcome FROM virtuallibrary v INNER JOIN virtuallibrary_predictions p on v.id = p.vl_id WHERE p.binary_model = '2024-01-04-085409_305115_fold0' ORDER BY type;").fetchall()
res_df = pd.DataFrame(res, columns=["I_long", "M_long", "T_long", "product_type", "reaction_smiles_atom_mapped", "binary"])

group = res_df.groupby(["I_long", "M_long", "T_long",])
res_df["reaction_smiles_atom_mapped"] = group["reaction_smiles_atom_mapped"].ffill()  # save b/c ORDER BY type in SQL

In [7]:
syn_df = res_df.pivot(index=["I_long", "M_long", "T_long", "reaction_smiles_atom_mapped",], columns="product_type", values="binary").rename(columns={"A": "binary_A", "B": "binary_B", "C": "binary_C"}).reset_index()
syn_df

product_type,I_long,M_long,T_long,reaction_smiles_atom_mapped,binary_A,binary_B,binary_C
0,2-Pyr003,Fused002,TerABT001,F[B-](F)(F)[C:1](=[O:2])[c:15]1[cH:16][cH:18][...,1,1,0
1,2-Pyr003,Fused002,TerABT004,F[B-](F)(F)[C:1](=[O:2])[c:15]1[cH:16][cH:18][...,1,0,0
2,2-Pyr003,Fused002,TerABT005,F[B-](F)(F)[C:1](=[O:2])[c:15]1[cH:16][cH:18][...,1,0,0
3,2-Pyr003,Fused002,TerABT006,F[B-](F)(F)[C:1](=[O:2])[c:15]1[cH:16][cH:18][...,1,0,0
4,2-Pyr003,Fused002,TerABT007,F[B-](F)(F)[C:1](=[O:2])[c:15]1[cH:16][cH:18][...,0,0,0
...,...,...,...,...,...,...,...
195032,Pyrazine002,Spiro017,TerTH023,F[B-](F)(F)[C:1](=[O:2])[c:11]1[cH:12][n:14][c...,1,0,0
195033,Pyrazine002,Spiro017,TerTH025,F[B-](F)(F)[C:1](=[O:2])[c:11]1[cH:12][n:14][c...,1,1,0
195034,Pyrazine002,Spiro017,TerTH026,F[B-](F)(F)[C:1](=[O:2])[c:11]1[cH:12][n:14][c...,1,0,0
195035,Pyrazine002,Spiro017,TerTH027,F[B-](F)(F)[C:1](=[O:2])[c:11]1[cH:12][n:14][c...,1,0,0


## Merge synthetic and real data
If we have real data, we overwrite the synthetic data

In [8]:
comb = syn_df.merge(real_df, on=["I_long", "M_long", "T_long"], how="left")
comb.head()

Unnamed: 0,I_long,M_long,T_long,reaction_smiles_atom_mapped_x,binary_A_x,binary_B_x,binary_C_x,reaction_smiles_atom_mapped_y,binary_A_y,binary_B_y,binary_C_y
0,2-Pyr003,Fused002,TerABT001,F[B-](F)(F)[C:1](=[O:2])[c:15]1[cH:16][cH:18][...,1,1,0,,,,
1,2-Pyr003,Fused002,TerABT004,F[B-](F)(F)[C:1](=[O:2])[c:15]1[cH:16][cH:18][...,1,0,0,F[B-](F)(F)[C:1](=[O:2])[c:15]1[cH:16][cH:18][...,1.0,1.0,0.0
2,2-Pyr003,Fused002,TerABT005,F[B-](F)(F)[C:1](=[O:2])[c:15]1[cH:16][cH:18][...,1,0,0,,,,
3,2-Pyr003,Fused002,TerABT006,F[B-](F)(F)[C:1](=[O:2])[c:15]1[cH:16][cH:18][...,1,0,0,,,,
4,2-Pyr003,Fused002,TerABT007,F[B-](F)(F)[C:1](=[O:2])[c:15]1[cH:16][cH:18][...,0,0,0,F[B-](F)(F)[C:1](=[O:2])[c:15]1[cH:16][cH:18][...,0.0,0.0,0.0


In [9]:
# where do we have real data?
~comb.isna().any(axis=1)

0         False
1          True
2         False
3         False
4          True
          ...  
195032    False
195033    False
195034    False
195035     True
195036    False
Length: 195037, dtype: bool

In [17]:
import numpy as np

In [28]:
# we save the indices to later distinguish real and synthetic data (since we want to evaluate on real data)
with open(    DATA_DIR / "curated_data" / f"synferm_dataset_{datetime.datetime.today().strftime('%Y-%m-%d')}_{len(comb)}records_synthetic_real-indices.txt", "w") as f:
    for i in comb.loc[~comb.isna().any(axis=1)].index.to_list():
        f.write(f"{i}\n")

In [10]:
# overwrite binary_A where we have real data
comb.loc[~comb.isna().any(axis=1), ["binary_A_x", "binary_B_x", "binary_C_x"]] = comb.loc[~comb.isna().any(axis=1), ["binary_A_y", "binary_B_y", "binary_C_y"]].values

In [11]:
comb.loc[~comb.isna().any(axis=1), ["binary_A_y", "binary_B_y", "binary_C_y"]]

Unnamed: 0,binary_A_y,binary_B_y,binary_C_y
1,1.0,1.0,0.0
4,0.0,0.0,0.0
10,0.0,0.0,0.0
11,1.0,1.0,0.0
18,1.0,1.0,0.0
...,...,...,...
195015,1.0,0.0,0.0
195016,1.0,1.0,0.0
195026,1.0,0.0,0.0
195027,1.0,1.0,0.0


In [12]:
comb

Unnamed: 0,I_long,M_long,T_long,reaction_smiles_atom_mapped_x,binary_A_x,binary_B_x,binary_C_x,reaction_smiles_atom_mapped_y,binary_A_y,binary_B_y,binary_C_y
0,2-Pyr003,Fused002,TerABT001,F[B-](F)(F)[C:1](=[O:2])[c:15]1[cH:16][cH:18][...,1.0,1.0,0.0,,,,
1,2-Pyr003,Fused002,TerABT004,F[B-](F)(F)[C:1](=[O:2])[c:15]1[cH:16][cH:18][...,1.0,1.0,0.0,F[B-](F)(F)[C:1](=[O:2])[c:15]1[cH:16][cH:18][...,1.0,1.0,0.0
2,2-Pyr003,Fused002,TerABT005,F[B-](F)(F)[C:1](=[O:2])[c:15]1[cH:16][cH:18][...,1.0,0.0,0.0,,,,
3,2-Pyr003,Fused002,TerABT006,F[B-](F)(F)[C:1](=[O:2])[c:15]1[cH:16][cH:18][...,1.0,0.0,0.0,,,,
4,2-Pyr003,Fused002,TerABT007,F[B-](F)(F)[C:1](=[O:2])[c:15]1[cH:16][cH:18][...,0.0,0.0,0.0,F[B-](F)(F)[C:1](=[O:2])[c:15]1[cH:16][cH:18][...,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
195032,Pyrazine002,Spiro017,TerTH023,F[B-](F)(F)[C:1](=[O:2])[c:11]1[cH:12][n:14][c...,1.0,0.0,0.0,,,,
195033,Pyrazine002,Spiro017,TerTH025,F[B-](F)(F)[C:1](=[O:2])[c:11]1[cH:12][n:14][c...,1.0,1.0,0.0,,,,
195034,Pyrazine002,Spiro017,TerTH026,F[B-](F)(F)[C:1](=[O:2])[c:11]1[cH:12][n:14][c...,1.0,0.0,0.0,,,,
195035,Pyrazine002,Spiro017,TerTH027,F[B-](F)(F)[C:1](=[O:2])[c:11]1[cH:12][n:14][c...,1.0,0.0,0.0,F[B-](F)(F)[C:1](=[O:2])[c:11]1[cH:12][n:14][c...,1.0,0.0,0.0


## Export
Now we have a cleaned dataset. Export to CSV.

In [13]:
comb[["I_long", "M_long", "T_long", "reaction_smiles_atom_mapped_x", "binary_A_x", "binary_B_x", "binary_C_x"]]\
.rename(columns={"reaction_smiles_atom_mapped_x": "reaction_smiles_atom_mapped", 
                 "binary_A_x": "binary_A", 
                 "binary_B_x": "binary_B", 
                 "binary_C_x": "binary_C"
                })



Unnamed: 0,I_long,M_long,T_long,reaction_smiles_atom_mapped,binary_A,binary_B,binary_C
0,2-Pyr003,Fused002,TerABT001,F[B-](F)(F)[C:1](=[O:2])[c:15]1[cH:16][cH:18][...,1.0,1.0,0.0
1,2-Pyr003,Fused002,TerABT004,F[B-](F)(F)[C:1](=[O:2])[c:15]1[cH:16][cH:18][...,1.0,1.0,0.0
2,2-Pyr003,Fused002,TerABT005,F[B-](F)(F)[C:1](=[O:2])[c:15]1[cH:16][cH:18][...,1.0,0.0,0.0
3,2-Pyr003,Fused002,TerABT006,F[B-](F)(F)[C:1](=[O:2])[c:15]1[cH:16][cH:18][...,1.0,0.0,0.0
4,2-Pyr003,Fused002,TerABT007,F[B-](F)(F)[C:1](=[O:2])[c:15]1[cH:16][cH:18][...,0.0,0.0,0.0
...,...,...,...,...,...,...,...
195032,Pyrazine002,Spiro017,TerTH023,F[B-](F)(F)[C:1](=[O:2])[c:11]1[cH:12][n:14][c...,1.0,0.0,0.0
195033,Pyrazine002,Spiro017,TerTH025,F[B-](F)(F)[C:1](=[O:2])[c:11]1[cH:12][n:14][c...,1.0,1.0,0.0
195034,Pyrazine002,Spiro017,TerTH026,F[B-](F)(F)[C:1](=[O:2])[c:11]1[cH:12][n:14][c...,1.0,0.0,0.0
195035,Pyrazine002,Spiro017,TerTH027,F[B-](F)(F)[C:1](=[O:2])[c:11]1[cH:12][n:14][c...,1.0,0.0,0.0


In [14]:
# export to CSV, with timestamp
comb[
    ["I_long", "M_long", "T_long", "reaction_smiles_atom_mapped_x", "binary_A_x", "binary_B_x", "binary_C_x"]
].rename(
    columns={"reaction_smiles_atom_mapped_x": "reaction_smiles_atom_mapped", 
             "binary_A_x": "binary_A", 
             "binary_B_x": "binary_B", 
             "binary_C_x": "binary_C"
            }
).to_csv(
    DATA_DIR / "curated_data" / f"synferm_dataset_{datetime.datetime.today().strftime('%Y-%m-%d')}_{len(comb)}records_synthetic.csv", 
    index=False
)