# SynFerm data preparation
#### Targets:
- Import experiment, representation, and target data from db
- Export to CSV

In [1]:
import os
import sys

sys.path.insert(0, os.path.abspath('../..'))

import pandas as pd

from src.util.db_utils import SynFermDatabaseConnection
from src.definitions import DATA_DIR


In [2]:
con = SynFermDatabaseConnection()  # we will use this for various simple queries

In [3]:
# note that we only select valid reaction by using the INNER JOIN with the labels table
res = con.con.execute('SELECT e.id, r.I_long, r.M_long, r.T_long, r.product_A_smiles, r.I_smiles, r.M_smiles, r.T_smiles, r.reaction_smiles, r.reaction_smiles_atom_mapped, l.binary_A, l.binary_B, l.binary_C, l.binary_D, l.binary_E, l.binary_F, l.binary_G, l.binary_H, l.scaled_A, l.scaled_B, l.scaled_C, l.scaled_D, l.scaled_E, l.scaled_F, l.scaled_G, l.scaled_H, l."major_A-C" FROM experiments e LEFT JOIN representations r on e.id = r.experiment_id INNER JOIN labels l on e.id = l.experiment_id;').fetchall()

columns = ["experiment_id", "I_long", "M_long", "T_long", "product_A_smiles", "I_smiles", "M_smiles", "T_smiles", "reaction_smiles", "reaction_smiles_atom_mapped", "binary_A", "binary_B", "binary_C", "binary_D", "binary_E", "binary_F", "binary_G", "binary_H", "scaled_A", "scaled_B", "scaled_C", "scaled_D", "scaled_E", "scaled_F", "scaled_G", "scaled_H", "major_A-C"]
df = pd.DataFrame(res, columns=columns)
print(f'Number of reactions (in total): {len(df)}')

Number of reactions (in total): 40433


In [4]:
# doublecheck we don't have missing values
df['scaled_A'].isna().sum()

0

Now we have a cleaned dataset. Export to CSV.

In [5]:
df.to_csv(DATA_DIR / "curated_data" / f"synferm_dataset_2023-07-20_{len(df)}records.csv", index=False)

Export the same data as hierarchically nested JSON (for d3.js visualizations)

In [None]:
import json

In [9]:
res = {"name": "synfermdata", "children": []}

In [10]:
for i in df["I_long"].unique():
    print(i)  # show progress
    res["children"].append({"name": i, "children": []})
    for m in df.loc[df["I_long"] == i, "M_long"].unique():
        res["children"][-1]["children"].append({"name": m, "children": []})
        for t in df.loc[(df["I_long"] == i) & (df["M_long"] == m), "T_long"].unique():
            values = df.loc[(df["I_long"] == i) & (df["M_long"] == m) & (df["T_long"] == t), ["scaled_A", 'scaled_B', "scaled_C"]].values.flatten().tolist()
            res["children"][-1]["children"][-1]["children"].append(
                {"name": t, "children": 
                    [{"name": "A", "value": values[0]}, {"name": "B", "value": values[1]}, {"name": "C", "value": values[2]}]})

Ph023
Ph010
Ph018
Al013
BiPh006
8-Quin003
6-Quin001
Ph022
5-Quin001
BiAl009
Ph009
Ph017
2-Pyr010
Al002
8-Quin008
Ph007
4-Pym001
2-Pyr006
Al036
Al005
3-Thio001
Ph026
3-Pyr003
Ph005
2-Pyr007
Pyrazine001
Ph027
Al004
Al007
Al038
BiAl008
Ph021
8-Quin004
4-Pyr002
3-Fur001
Ph028
BiPh011
2-Pyr009
2-Pyr008
Al001
BiPyr001
Ph013
4-Pyrazole001
Ph002
Ph020
8-Quin005
BiPh007
BiPh002
2-Pyr003
Ph034
3-Pyr004
BiPh010
Ph033
BiAl007
Al003
Pyrazine002
Ph031
BiPh001
Ph030
2-Thio001
3-Pyr002
Ph006
Ph001
BiPyr004
Ph024
Ph025
BiPh004


In [11]:
with open(DATA_DIR / "curated_data" / "synferm.json", "w") as outfile:
    json.dump(res, outfile)