# Analyze training data

We want to know which building blocks are contained in the training data of each split, so that at inference time we can tell whether a reaction should be predicted by the 0D, 1D, 2D, or 3D model.

In [17]:
import pathlib
import sys
# NOTE: "__file__" is a string literal here, not the module attribute, so the path is
# resolved against the current working directory: `.absolute()` yields `<cwd>/__file__`
# and `parents[1]` is the parent directory of the cwd. That directory is appended to
# `sys.path` so the `src...` imports below can be found (presumably it is the repository
# root when the kernel runs from the notebook folder; confirm if the cwd differs).
sys.path.append(str(pathlib.Path("__file__").absolute().parents[1]))

import numpy as np
import pandas as pd
from rdkit import Chem

from src.util.definitions import LOG_DIR, DATA_ROOT, TRAINED_MODEL_DIR
from src.util.rdkit_util import canonicalize_smiles, desalt_building_block

In [4]:
def _desalted_smiles(smiles):
    """Run `desalt_building_block` on one SMILES entry and return the result as a SMILES string.

    The result of `desalt_building_block` is passed straight to `Chem.MolToSmiles`
    (presumably an RDKit molecule with counter-ions removed; see `src.util.rdkit_util`).
    """
    return Chem.MolToSmiles(desalt_building_block(smiles))


# Read the full reaction dataset.
all_data = pd.read_csv(DATA_ROOT / "synferm_dataset_2023-09-05_40018records.csv")

# Overwrite the SMILES column of each building-block type (I, M, T) with its desalted form.
for smiles_column in ("I_smiles", "M_smiles", "T_smiles"):
    all_data[smiles_column] = all_data[smiles_column].apply(_desalted_smiles)

all_data.head()

Unnamed: 0,I_long,M_long,T_long,product_A_smiles,I_smiles,M_smiles,T_smiles,reaction_smiles,reaction_smiles_atom_mapped,experiment_id,...,binary_H,scaled_A,scaled_B,scaled_C,scaled_D,scaled_E,scaled_F,scaled_G,scaled_H,major_A-C
0,2-Pyr003,Fused002,TerABT004,COc1ccc(CCOC(=O)N2C[C@H](NC(=O)c3cccc(Cl)n3)[C...,O=C(c1cccc(Cl)n1)[B-](F)(F)F,COc1ccc(CCOC(=O)N2C[C@@H]3NO[C@]4(OC5(CCCCC5)O...,Nc1ccc(F)cc1S,O=C(c1cccc(Cl)n1)[B-](F)(F)F.COc1ccc(CCOC(=O)N...,F[B-](F)(F)[C:2]([c:1]1[cH:16][cH:18][cH:20][c...,56113,...,0,0.036021,0.003427,0.0,0.020975,0.002958,0.941981,0.914281,0.0,A
1,2-Pyr003,Fused002,TerABT007,COc1ccc(CCOC(=O)N2C[C@H](NC(=O)c3cccc(Cl)n3)[C...,O=C(c1cccc(Cl)n1)[B-](F)(F)F,COc1ccc(CCOC(=O)N2C[C@@H]3NO[C@]4(OC5(CCCCC5)O...,Nc1cc(Br)ccc1S,O=C(c1cccc(Cl)n1)[B-](F)(F)F.COc1ccc(CCOC(=O)N...,F[B-](F)(F)[C:2]([c:1]1[cH:16][cH:18][cH:20][c...,56114,...,0,0.0,0.0,0.0,0.006159,0.364398,0.928851,1.106548,0.0,no_product
2,2-Pyr003,Fused002,TerABT013,COc1ccc(CCOC(=O)N2C[C@H](NC(=O)c3cccc(Cl)n3)[C...,O=C(c1cccc(Cl)n1)[B-](F)(F)F,COc1ccc(CCOC(=O)N2C[C@@H]3NO[C@]4(OC5(CCCCC5)O...,Nc1cc(C(F)(F)F)ccc1S,O=C(c1cccc(Cl)n1)[B-](F)(F)F.COc1ccc(CCOC(=O)N...,F[B-](F)(F)[C:2]([c:1]1[cH:16][cH:18][cH:20][c...,56106,...,1,0.0,0.0,0.0,0.014212,2.16642,1.013596,0.537785,0.05686,no_product
3,2-Pyr003,Fused002,TerABT014,COc1ccc(CCOC(=O)N2C[C@H](NC(=O)c3cccc(Cl)n3)[C...,O=C(c1cccc(Cl)n1)[B-](F)(F)F,COc1ccc(CCOC(=O)N2C[C@@H]3NO[C@]4(OC5(CCCCC5)O...,Nc1ccc(Cl)cc1S,O=C(c1cccc(Cl)n1)[B-](F)(F)F.COc1ccc(CCOC(=O)N...,F[B-](F)(F)[C:2]([c:1]1[cH:16][cH:18][cH:20][c...,56112,...,0,0.028915,0.005039,0.0,0.015578,0.504057,0.992614,0.890646,0.0,A
4,2-Pyr003,Fused002,TerTH001,COc1ccc(CCOC(=O)N2C[C@H](NC(=O)c3cccc(Cl)n3)[C...,O=C(c1cccc(Cl)n1)[B-](F)(F)F,COc1ccc(CCOC(=O)N2C[C@@H]3NO[C@]4(OC5(CCCCC5)O...,NNC(=S)c1ccccc1,O=C(c1cccc(Cl)n1)[B-](F)(F)F.COc1ccc(CCOC(=O)N...,F[B-](F)(F)[C:2]([c:1]1[cH:13][cH:15][cH:17][c...,56109,...,0,0.350061,0.643219,0.0,0.031689,0.613596,0.109309,0.439018,0.0,B


In [5]:
# Directory that holds the index files of the 0D "final-retrain" split.
split_dir = DATA_ROOT / "splits" / "synferm_dataset_2023-09-05_0D_split_final-retrain"

# Collect the training index files (`<fold>_train.csv`), ordered by file name.
# The filter inspects the file name only and requires the exact suffix, so neither a
# directory component of the path nor a derived file such as `<fold>_train.csv.bak`
# can be picked up by accident.
train_files = sorted(
    (path for path in split_dir.iterdir() if path.name.endswith("_train.csv")),
    key=lambda path: path.name,
)
train_files

[PosixPath('/home/julian/PycharmProjects/synferm-predictions/data/splits/synferm_dataset_2023-09-05_0D_split_final-retrain/fold0_train.csv')]

In [21]:
# For every train file, write the unique building blocks of the training rows to the
# split directory: one CSV per building-block role, with columns `long` and `smiles`.
building_block_roles = {"I": "initiators", "M": "monomers", "T": "terminators"}

for path in train_files:
    # File names look like `<fold>_train.csv`; the part before the first underscore
    # is reused as the prefix of the output files.
    fold_name = path.name.split('_')[0]

    # The file's values are flattened to a 1-D array and used as row labels of `all_data`.
    train_idx = pd.read_csv(path).to_numpy().flatten()
    train_data = all_data.loc[train_idx]

    for prefix, role in building_block_roles.items():
        long_col, smiles_col = f"{prefix}_long", f"{prefix}_smiles"
        unique_blocks = train_data[[long_col, smiles_col]].drop_duplicates()
        unique_blocks = unique_blocks.rename(columns={long_col: "long", smiles_col: "smiles"})
        unique_blocks.sort_values(by="long").to_csv(
            split_dir / f"{fold_name}_train_{role}.csv", index=False
        )


In [22]:
# Store the same building-block tables alongside the 0D production model that was
# trained on this split.
production_model_run_group = "2023-11-29-145809_246697"
building_block_roles = {"I": "initiators", "M": "monomers", "T": "terminators"}

for path in train_files:
    # One model directory per train file, named `<run group>_<fold>`.
    # `to_csv` does not create missing directories, so this directory must already exist.
    model_dir = TRAINED_MODEL_DIR / f"{production_model_run_group}_{path.name.split('_')[0]}"

    # The file's values are flattened to a 1-D array and used as row labels of `all_data`.
    train_idx = pd.read_csv(path).to_numpy().flatten()
    train_data = all_data.loc[train_idx]

    for prefix, role in building_block_roles.items():
        long_col, smiles_col = f"{prefix}_long", f"{prefix}_smiles"
        unique_blocks = train_data[[long_col, smiles_col]].drop_duplicates()
        unique_blocks = unique_blocks.rename(columns={long_col: "long", smiles_col: "smiles"})
        unique_blocks.sort_values(by="long").to_csv(model_dir / f"train_{role}.csv", index=False)
