# SynFerm data preparation
#### Targets:
- Import experiment, representation, and target data from db
- Export to CSV.
  The CSV should contain the following columns:
  `['I_long', 'M_long', 'T_long', 'product_A_smiles', 'I_smiles', 'M_smiles', 'T_smiles', 'reaction_smiles', 'reaction_smiles_atom_mapped', 'experiment_id', 'binary_A', 'binary_B', 'binary_C', 'binary_D', 'binary_E', 'binary_F', 'binary_G', 'binary_H', 'scaled_A', 'scaled_B', 'scaled_C', 'scaled_D', 'scaled_E', 'scaled_F', 'scaled_G', 'scaled_H', 'major_A-C']`

In [None]:
import datetime
import pathlib
import sys

sys.path.append(str(pathlib.Path().resolve().parents[1]))

import pandas as pd

from src.util.db_utils import SynFermDatabaseConnection
from src.definitions import DATA_DIR
from src.library_design.reaction_generator import SFReactionGenerator

In [None]:
# Open the project's database connection once; the data import query below runs on it.
con = SynFermDatabaseConnection()  # we will use this for various simple queries

## Import data

In [None]:
# Only valid reactions are selected: the INNER JOIN drops every experiment without a row in the labels table.
# Consequence: if anything about the data set still changes, the labels have to be regenerated before using this!
cursor = con.con.execute("""
SELECT e.id, e.initiator_long as I_long, e.monomer_long as M_long, e.terminator_long as T_long, e.product_A_smiles, b_i.SMILES as I_smiles, b_m.SMILES as M_smiles, b_t.SMILES as T_smiles, l.binary_A, l.binary_B, l.binary_C, l.binary_D, l.binary_E, l.binary_F, l.binary_G, l.binary_H, l.scaled_A, l.scaled_B, l.scaled_C, l.scaled_D, l.scaled_E, l.scaled_F, l.scaled_G, l.scaled_H, l."major_A-C"
FROM experiments e
    LEFT JOIN building_blocks b_i on e.initiator_long = b_i.long
    LEFT JOIN building_blocks b_m on e.monomer_long = b_m.long
    LEFT JOIN building_blocks b_t on e.terminator_long = b_t.long
    INNER JOIN labels l on e.id = l.experiment_id;
""")
res = cursor.fetchall()

# DataFrame column names, in the same order as the SELECT list (e.id is stored as experiment_id)
columns = (
    ["experiment_id", "I_long", "M_long", "T_long", "product_A_smiles", "I_smiles", "M_smiles", "T_smiles"]
    + [f"binary_{i}" for i in "ABCDEFGH"]
    + [f"scaled_{i}" for i in "ABCDEFGH"]
    + ["major_A-C"]
)
df = pd.DataFrame(res, columns=columns)
print(f'Number of reactions (in total): {len(df)}')

In [None]:
def make_reaction_smiles(initiator, monomer, terminator, product):
    """Build a plain (not atom-mapped) reactionSMILES string.

    The three reactants are joined with "." and separated from the product
    by ">>", i.e. the result is "initiator.monomer.terminator>>product".
    The arguments are inserted as given; no validation is performed.
    """
    reactants = "{}.{}.{}".format(initiator, monomer, terminator)
    return "{}>>{}".format(reactants, product)

In [None]:
# generate plain reactionSMILES (not desalted or anything)
# The four SMILES columns are iterated in lockstep instead of using df.iterrows(): iterrows builds a
# new Series for every row (slow) and its index was unused. The resulting strings are the same.
reaction_smiles = [
    make_reaction_smiles(initiator, monomer, terminator, product)
    for initiator, monomer, terminator, product in zip(
        df["I_smiles"], df["M_smiles"], df["T_smiles"], df["product_A_smiles"]
    )
]
# one reactionSMILES per row of df
len(reaction_smiles)

In [None]:
# store the plain reactionSMILES as a new column of df
df["reaction_smiles"] = reaction_smiles

In [None]:
gen = SFReactionGenerator()


def get_reaction_smiles(x):
    """Wrap ``gen.get_reaction_smiles`` so that a ValueError does not abort the run.

    Returns whatever ``gen.get_reaction_smiles(x)`` returns. If that call
    raises a ValueError, the error and the offending input ``x`` are printed
    and None is returned instead. Other exception types are not caught.
    """
    try:
        result = gen.get_reaction_smiles(x)
    except ValueError as error:
        # report the problem together with the input that caused it
        print(error)
        print(x)
        result = None
    return result

In [None]:
# generate atom-mapped reactionSMILES from the product SMILES (~15 min)
# entries for which get_reaction_smiles hit a ValueError end up as None
product_smiles = df["product_A_smiles"]
df["reaction_smiles_atom_mapped"] = product_smiles.apply(get_reaction_smiles)
df.head()

In [None]:
# double-check we don't have missing values: counts the NaN entries in scaled_A (should be 0)
df['scaled_A'].isna().sum()

In [None]:
# check we don't have missing features: counts the reactions without an atom-mapped reactionSMILES
# (get_reaction_smiles returns None when the generator raises a ValueError); should be 0
df['reaction_smiles_atom_mapped'].isna().sum()

## Aggregate duplicates
For training, we want to remove duplicates from our data.
To aggregate we follow these steps:
1. Take the mean of the scaled values
2. From the mean scaled values, calculate the binary labels and the major_A-C label

In [None]:
# aggregate duplicates: every unique reaction (identified by the key columns below) becomes one row
# NOTE: with its default dropna=True, groupby excludes rows that have a missing value in any key column,
# e.g. reactions for which no atom-mapped reactionSMILES could be generated.
id_columns = ["I_long", "M_long", "T_long", "product_A_smiles", "I_smiles", "M_smiles", "T_smiles", "reaction_smiles", "reaction_smiles_atom_mapped"]
scaled_columns = [f"scaled_{i}" for i in "ABCDEFGH"]
binary_columns = [f"binary_{i}" for i in "ABCDEFGH"]
group = df.groupby(id_columns)

# take the mean of the scaled values
scaled_responses = group[scaled_columns].mean()

# reassign the binary labels: 1 if the mean scaled value is > 0, else 0
# (vectorized comparison instead of DataFrame.applymap, which is deprecated since pandas 2.1)
binary_responses = (scaled_responses > 0).astype("int64").rename(columns=dict(zip(scaled_columns, binary_columns)))

# reassign the major_A-C label: the product among A, B, C with the highest mean scaled value
# str.replace removes the literal "scaled_" prefix; str.strip("scaled_") would strip a *set of characters*
# from both ends and only worked by accident because the remaining letters are upper-case.
scaled_abc = scaled_responses[[f"scaled_{i}" for i in "ABC"]]
major = scaled_abc.idxmax(axis=1).str.replace("scaled_", "", regex=False).rename("major_A-C")
# if A, B, and C sum to 0, there is no major product
major.loc[scaled_abc.sum(axis=1) == 0] = "no_product"

# merge the results
# experiment ids of merged duplicates are joined with "/"; a single id is kept as-is
# (x.iloc[0] returns the scalar explicitly instead of handing a length-1 Series back to agg)
exp_nr = group["experiment_id"].agg(lambda x: x.iloc[0] if len(x) == 1 else "/".join(str(i) for i in x))
output_columns = id_columns + ["experiment_id"] + binary_columns + scaled_columns + ["major_A-C"]
df_clean = (
    pd.merge(exp_nr, binary_responses, left_index=True, right_index=True)
    .merge(scaled_responses, left_index=True, right_index=True)
    .merge(major, left_index=True, right_index=True)
    .reset_index()[output_columns]
)
# length should be original length minus number of duplicates
len(df_clean)

In [None]:
# fraction of records in df_clean with a positive binary label for A
positives_A = df_clean["binary_A"].sum()
positives_A / len(df_clean)

In [None]:
# fraction of records in df_clean with a positive binary label for B
positives_B = df_clean["binary_B"].sum()
positives_B / len(df_clean)

In [None]:
# fraction of records in df_clean with a positive binary label for C
positives_C = df_clean["binary_C"].sum()
positives_C / len(df_clean)

## Export
Now we have a cleaned dataset. Export to CSV.

In [None]:
# export to CSV; the file name carries today's date (YYYY-MM-DD) and the number of records
timestamp = datetime.datetime.today().strftime('%Y-%m-%d')
output_path = DATA_DIR / "curated_data" / f"synferm_dataset_{timestamp}_{len(df_clean)}records.csv"
# index=False: the row index is not written to the file
df_clean.to_csv(output_path, index=False)