# Generate ML data sets

Generate data sets as used for training the ML models

    a) Assign labels according to outcome by LCMS or isolation
    b) Drop duplicates
    c) Select features

In [None]:
import pandas as pd

In [None]:
# Load the featurized reaction records.
# Run `featurize_slap_reaction.ipynb` first to generate this file from Data S2.
featurized_csv = "../data/featurized_dataS1.csv"
df = pd.read_csv(featurized_csv)
df.head()  # preview the first rows

## Assign labels

For LCMS yield, the label is `0` if the normalized LCMS ratio is <= threshold and `1` if it is > threshold. The threshold is 0.01.


In [None]:
# Derive the binary LCMS label from the normalized LCMS ratio.
lcms_threshold = 0.01

# Min-max-normalize the ratio over ALL reactions that we have data for.
raw_ratio = df["lcms_ratio"]
ratio_min = raw_ratio.min()
ratio_max = raw_ratio.max()
norm_lcms = (raw_ratio - ratio_min) / (ratio_max - ratio_min)

# Two right-closed bins: values <= threshold get label 0, values above it get label 1.
# The small epsilons widen the outer edges so that exactly 0.0 and 1.0 fall inside a bin.
label_bins = (-1e-8, lcms_threshold, 1.0 + 1e-8)
df["lcms_label"] = pd.cut(norm_lcms, bins=label_bins, labels=(0, 1))

### Drop duplicates

We will drop plates 2 and 4 (duplicates of 1 and 3), because the data is less reliable according to our previous analysis. This leaves about 100 duplicates for LCMS data.

For the remaining duplicates, we will merge the records by majority vote. If the vote is tied (e.g. the reaction worked once and failed once), we will consider the reaction successful.

In [None]:
# Build the LCMS data set: keep every record except those from plates 2 and 4.
excluded_plates = (2, 4)
on_excluded_plate = df["plate_nr"].isin(excluded_plates)
# .copy() so that later label edits on df_lcms do not touch df
df_lcms = df.loc[~on_excluded_plate].copy()

In [None]:
# Identify every reaction that occurs more than once in the LCMS data
# (keep=False flags all members of a duplicate group, not only the repeats).
duplicated_reaction_smiles = df_lcms.loc[
    df_lcms.duplicated(subset=["reactionSMILES"], keep=False), "reactionSMILES"
].unique()

# Set all records of a duplicated reaction to the same label, based on majority vote.
for reac in duplicated_reaction_smiles:
    # Build the row mask from df_lcms itself (not from df) so that the mask and the
    # frame it indexes always share the same index; compute it once per reaction.
    is_reac = df_lcms["reactionSMILES"] == reac
    # Mean of the 0/1 labels = fraction of records in which the reaction worked.
    success_rate = df_lcms.loc[is_reac, "lcms_label"].astype("float").mean()
    # A tie (0.5) counts as success, hence the cut-off just below one half.
    df_lcms.loc[is_reac, "lcms_label"] = 1 if success_rate > 0.49 else 0

# The labels within each duplicate group now agree, so one record per reaction is kept.
df_lcms = df_lcms.drop_duplicates(subset=["reactionSMILES"])


## Select features

We select different sets of features for datasets appropriate to different algorithms.

In [None]:
# Write one two-column CSV (SMILES, label) per molecular representation.
# Each entry is (source column holding the SMILES, output file).
export_targets = (
    # unbalanced reactionSMILES + LCMS labels: this is the one we use for the ML models
    # (note that this is the same as Data S3, published as supplementary data)
    ("reactionSMILES", "../data/reactionSMILESunbalanced_LCMS.csv"),
    # imine intermediate SMILES + LCMS labels: not used for experiments in the publication
    ("imines", "../data/imines_LCMS.csv"),
)

for smiles_column, csv_path in export_targets:
    subset = df_lcms[[smiles_column, "lcms_label"]]
    subset = subset.rename(columns={smiles_column: "SMILES", "lcms_label": "label"})
    subset.to_csv(csv_path, index=False)
