# SynFerm data preparation
#### Targets:
- Import experiment, representation, and target data from DB
- Export to CSV

In [None]:
import datetime
import json
import pathlib
import sys

sys.path.append(str(pathlib.Path().resolve().parents[1]))

import pandas as pd

from src.util.db_utils import SynFermDatabaseConnection
from src.definitions import DATA_DIR

In [None]:
# Open the project's database wrapper; the queries below reach the underlying
# connection object through its `.con` attribute.
con = SynFermDatabaseConnection()  # we will use this for various simple queries

## Import data

In [None]:
# The INNER JOIN with the labels table means we only select valid reactions:
# experiments without a row in `labels` are not returned.
query = (
    'SELECT e.id, '
    'r.I_long, r.M_long, r.T_long, '
    'r.product_A_smiles, r.I_smiles, r.M_smiles, r.T_smiles, '
    'r.reaction_smiles, r.reaction_smiles_atom_mapped, '
    'l.binary_A, l.binary_B, l.binary_C, l.binary_D, '
    'l.binary_E, l.binary_F, l.binary_G, l.binary_H, '
    'l.scaled_A, l.scaled_B, l.scaled_C, l.scaled_D, '
    'l.scaled_E, l.scaled_F, l.scaled_G, l.scaled_H, '
    'l."major_A-C" '
    'FROM experiments e '
    'LEFT JOIN representations r on e.id = r.experiment_id '
    'INNER JOIN labels l on e.id = l.experiment_id;'
)
res = con.con.execute(query).fetchall()

# DataFrame column names, listed in the same order as the SELECT list above
columns = [
    "experiment_id",
    "I_long", "M_long", "T_long",
    "product_A_smiles", "I_smiles", "M_smiles", "T_smiles",
    "reaction_smiles", "reaction_smiles_atom_mapped",
    *(f"binary_{letter}" for letter in "ABCDEFGH"),
    *(f"scaled_{letter}" for letter in "ABCDEFGH"),
    "major_A-C",
]
df = pd.DataFrame(res, columns=columns)
print(f'Number of reactions (in total): {len(df)}')

In [None]:
# doublecheck we don't have missing values
# (only the scaled_A column is inspected; the displayed count is the number of NaN entries)
df['scaled_A'].isna().sum()

## Aggregate duplicates
For training, we want to remove duplicates from our data.
To aggregate, we follow these steps:
1. Take the mean of the scaled values
2. From the mean scaled values, calculate the binary labels and the major_A-C label

In [None]:
# how many duplicates are there?
# (counts rows whose product_A_smiles already occurred in an earlier row)
df["product_A_smiles"].duplicated().sum()

In [None]:
# preview the first rows of df
df.head()

In [None]:
# aggregate duplicates: rows that agree on every key column below are collapsed into one record
# NOTE(review): groupby drops rows with a missing value in any key column (pandas
# default dropna=True), and the LEFT JOIN on representations can produce NULLs --
# confirm that silently dropping such rows is intended.
key_columns = ["I_long", "M_long", "T_long", "product_A_smiles", "I_smiles", "M_smiles", "T_smiles", "reaction_smiles", "reaction_smiles_atom_mapped"]
scaled_columns = [f"scaled_{i}" for i in "ABCDEFGH"]
scaled_prefix = "scaled_"
group = df.groupby(key_columns)

# take the mean of the scaled values
scaled_responses = group[scaled_columns].mean()

# reassign the binary labels: 1 if the mean scaled response is > 0, else 0
# (vectorised comparison instead of the deprecated DataFrame.applymap)
binary_responses = (scaled_responses > 0).astype(int).rename(columns={f"scaled_{i}": f"binary_{i}" for i in "ABCDEFGH"})

# reassign the major_A-C label: the column with the largest mean among A, B, C
scaled_abc = scaled_responses[[f"scaled_{i}" for i in "ABC"]]
# slice the prefix off; str.strip("scaled_") would strip a *set of characters*
# from both ends and only works by accident for upper-case suffixes
major = scaled_abc.idxmax(axis=1).str[len(scaled_prefix):].rename("major_A-C")
major.loc[scaled_abc.sum(axis=1) == 0] = "no_product"

# merge the results
# a single experiment keeps its id as-is; several ids are joined as "id1/id2/..."
exp_nr = group["experiment_id"].agg(lambda x: x.iloc[0] if len(x) == 1 else "/".join(str(i) for i in x))
# all four pieces share the group-key index, so a column-wise concat lines them up
df = pd.concat([exp_nr, binary_responses, scaled_responses, major], axis=1).reset_index()
# length should be original length minus number of duplicated reactions
len(df)

In [None]:
# sum of the binary_A column, i.e. the number of records labelled 1 (labels are 0/1)
df.binary_A.sum()

## Export
Now we have a cleaned dataset. Export to CSV.

In [None]:
# export to CSV; the file name carries today's date and the number of records
csv_date = datetime.datetime.today().strftime('%Y-%m-%d')
csv_name = f"synferm_dataset_{csv_date}_{len(df)}records.csv"
df.to_csv(DATA_DIR / "curated_data" / csv_name, index=False)

Export the same data as hierarchically nested JSON (for d3.js visualizations)
Note: takes a few minutes to run

In [None]:
# create nested dictionary for JSON export
# layout: root -> I_long -> M_long -> T_long -> leaves "A"/"B"/"C" holding the scaled values
res = {"name": "synfermdata", "children": []}
response_columns = ["scaled_A", "scaled_B", "scaled_C"]

# groupby(sort=False) yields groups in order of first appearance, so the node order
# is the same as iterating over .unique(); each level only scans its own sub-frame
# instead of re-filtering the whole DataFrame for every (I, M, T) combination
for i, i_rows in df.groupby("I_long", sort=False):
    print(i)  # show progress
    i_node = {"name": i, "children": []}
    res["children"].append(i_node)
    for m, m_rows in i_rows.groupby("M_long", sort=False):
        m_node = {"name": m, "children": []}
        i_node["children"].append(m_node)
        for t, t_rows in m_rows.groupby("T_long", sort=False):
            # if several rows share (I, M, T), the first row is used
            values = t_rows[response_columns].iloc[0].tolist()
            m_node["children"].append(
                {
                    "name": t,
                    "children": [
                        {"name": label, "value": value}
                        for label, value in zip("ABC", values)
                    ],
                }
            )

In [None]:
# export to JSON; the file name carries today's date and the number of records
json_date = datetime.datetime.today().strftime('%Y-%m-%d')
json_path = DATA_DIR / "curated_data" / f"synferm_dataset_{json_date}_{len(df)}records.json"
with open(json_path, "w") as outfile:
    json.dump(res, outfile)