# SynFerm synthetic data preparation

We have used the 0D model to prepare a synthetic data set.
Here, we export the synthetic data set for training.
Wherever we have a "real" result, this will supersede the synthetic result.

In [None]:
import datetime
import pathlib
import sys

sys.path.append(str(pathlib.Path().resolve().parents[1]))

import pandas as pd

from src.util.db_utils import SynFermDatabaseConnection
from src.definitions import DATA_DIR

In [None]:
# Instantiate the project's database wrapper; queries are issued through its `.con` attribute.
con = SynFermDatabaseConnection()

## Obtain real data
Here we just import the previously prepared data set.

In [None]:
# Load the previously prepared "real" data set and keep only the columns needed
# for the merge: the three building-block identifiers, the atom-mapped reaction
# SMILES and the three binary outcome labels.
real_df = pd.read_csv(
    # plain string literal: the file name contains no placeholders
    DATA_DIR / "curated_data" / "synferm_dataset_2023-12-20_39486records.csv"
)[
    [
        "I_long",
        "M_long",
        "T_long",
        "reaction_smiles_atom_mapped",
        "binary_A",
        "binary_B",
        "binary_C",
    ]
]
real_df.head()

## Obtain synthetic data

In [None]:
# Fetch every virtual-library row joined with the binary outcome predicted by
# one specific model. The result is ordered by product type.
query = "SELECT v.initiator_long, v.monomer_long, v.terminator_long, v.type, v.reaction_smiles_atom_mapped, p.binary_outcome FROM virtuallibrary v INNER JOIN virtuallibrary_predictions p on v.id = p.vl_id WHERE p.binary_model = '2024-01-04-085409_305115_fold0' ORDER BY type;"
res = con.con.execute(query).fetchall()
res_df = pd.DataFrame(
    res,
    columns=[
        "I_long",
        "M_long",
        "T_long",
        "product_type",
        "reaction_smiles_atom_mapped",
        "binary",
    ],
)

# Forward-fill the atom-mapped reaction SMILES within each (I, M, T) group.
# This is safe because the SQL query orders the rows by type.
group = res_df.groupby(["I_long", "M_long", "T_long"])
res_df["reaction_smiles_atom_mapped"] = group["reaction_smiles_atom_mapped"].ffill()

In [None]:
# Reshape from long to wide: one row per reaction, one column per product type.
syn_df = res_df.pivot(
    index=["I_long", "M_long", "T_long", "reaction_smiles_atom_mapped"],
    columns="product_type",
    values="binary",
)
# Name the product-type columns like the label columns of the real data set.
syn_df = syn_df.rename(columns={"A": "binary_A", "B": "binary_B", "C": "binary_C"})
# Turn the index levels back into ordinary columns.
syn_df = syn_df.reset_index()
syn_df

## Merge synthetic and real data
Wherever real data exists for a reaction, it overwrites the synthetic data.

In [None]:
# Left join: every synthetic row is kept and real data is attached where the
# (I, M, T) key matches. Columns present in both frames get pandas' default
# suffixes: "_x" for the synthetic side and "_y" for the real side.
comb = pd.merge(
    syn_df,
    real_df,
    how="left",
    on=["I_long", "M_long", "T_long"],
)
comb.head()

In [None]:
# where do we have real data?
~comb.isna().any(axis=1)

In [None]:
# we save the indices to later distinguish real and synthetic data (since we want to evaluate on real data)
with open(    DATA_DIR / "curated_data" / f"synferm_dataset_{datetime.datetime.today().strftime('%Y-%m-%d')}_{len(comb)}records_synthetic_real-indices.txt", "w") as f:
    for i in comb.loc[~comb.isna().any(axis=1)].index.to_list():
        f.write(f"{i}\n")

In [None]:
# overwrite binary_A where we have real data
comb.loc[~comb.isna().any(axis=1), ["binary_A_x", "binary_B_x", "binary_C_x"]] = comb.loc[~comb.isna().any(axis=1), ["binary_A_y", "binary_B_y", "binary_C_y"]].values

In [None]:
comb.loc[~comb.isna().any(axis=1), ["binary_A_y", "binary_B_y", "binary_C_y"]]

In [None]:
comb

## Export
Now we have a cleaned dataset. Export to CSV.

In [None]:
# Preview of the table that will be exported: the identifier columns plus the
# "_x" columns, renamed back to their suffix-free names.
export_renames = {
    "reaction_smiles_atom_mapped_x": "reaction_smiles_atom_mapped",
    "binary_A_x": "binary_A",
    "binary_B_x": "binary_B",
    "binary_C_x": "binary_C",
}
comb[
    [
        "I_long",
        "M_long",
        "T_long",
        "reaction_smiles_atom_mapped_x",
        "binary_A_x",
        "binary_B_x",
        "binary_C_x",
    ]
].rename(columns=export_renames)



In [None]:
# Export to CSV. The file name carries today's date and the number of records.
export_date = datetime.datetime.today().strftime('%Y-%m-%d')
export_path = DATA_DIR / "curated_data" / f"synferm_dataset_{export_date}_{len(comb)}records_synthetic.csv"

# Keep the identifier columns plus the "_x" columns of the merged frame.
export_df = comb[
    [
        "I_long",
        "M_long",
        "T_long",
        "reaction_smiles_atom_mapped_x",
        "binary_A_x",
        "binary_B_x",
        "binary_C_x",
    ]
]
# Drop the merge suffix so the exported header uses the plain column names.
export_df = export_df.rename(
    columns={
        "reaction_smiles_atom_mapped_x": "reaction_smiles_atom_mapped",
        "binary_A_x": "binary_A",
        "binary_B_x": "binary_B",
        "binary_C_x": "binary_C",
    }
)
export_df.to_csv(export_path, index=False)