# Analyze training data

We want to know which building blocks are contained in the training data of each split, so that at inference time we can tell whether a reaction should be predicted by the 0D, 1D, 2D, or 3D model.

In [None]:
import pathlib
import sys
sys.path.append(str(pathlib.Path().absolute().parent))

import numpy as np
import pandas as pd
from rdkit import Chem

from src.util.definitions import LOG_DIR, DATA_ROOT, TRAINED_MODEL_DIR
from src.util.rdkit_util import canonicalize_smiles, desalt_building_block

In [None]:
# Read the full reaction dataset from the data root.
all_data = pd.read_csv(DATA_ROOT / "synferm_dataset_2023-12-20_39486records.csv")

# Overwrite each building-block SMILES column in place: every entry is passed
# through desalt_building_block and the result is written back as SMILES.
for smiles_col in ("I_smiles", "M_smiles", "T_smiles"):
    all_data[smiles_col] = all_data[smiles_col].apply(
        lambda smi: Chem.MolToSmiles(desalt_building_block(smi))
    )
all_data.head()

In [None]:
# Directory containing the files of the 0D split (final retrain).
split_dir = DATA_ROOT / "splits" / "synferm_dataset_2023-12-20_0D_split_final_retrain"

# Collect every entry whose path contains "_train.csv", ordered by file name.
train_files = [entry for entry in split_dir.iterdir() if "_train.csv" in str(entry)]
train_files.sort(key=lambda entry: entry.name)
train_files

In [None]:
# For every train file, write the unique (long, smiles) pairs of each
# building-block role to a CSV inside the split directory.
for path in train_files:
    # The train file holds row labels of all_data; flatten them to a 1-D array.
    train_idx = pd.read_csv(path).to_numpy().flatten()
    train_data = all_data.loc[train_idx]
    # Output files reuse the part of the file name before the first underscore.
    split_tag = path.name.split('_')[0]
    for letter, role in (("I", "initiators"), ("M", "monomers"), ("T", "terminators")):
        long_col = f"{letter}_long"
        smiles_col = f"{letter}_smiles"
        unique_blocks = train_data[[long_col, smiles_col]].drop_duplicates()
        unique_blocks = unique_blocks.rename(columns={long_col: "long", smiles_col: "smiles"})
        unique_blocks.sort_values(by="long").to_csv(
            split_dir / f"{split_tag}_train_{role}.csv", index=False
        )


In [None]:
# Store the same building-block lists alongside the 0D production model that
# was trained on this split.
production_model_run_group = "2024-01-04-085409_305115"
for path in train_files:
    # The train file holds row labels of all_data; flatten them to a 1-D array.
    train_idx = pd.read_csv(path).to_numpy().flatten()
    train_data = all_data.loc[train_idx]
    # Model directory name: run group + "_" + file-name part before the first underscore.
    split_tag = path.name.split('_')[0]
    model_dir = TRAINED_MODEL_DIR / f"{production_model_run_group}_{split_tag}"
    for letter, role in (("I", "initiators"), ("M", "monomers"), ("T", "terminators")):
        long_col = f"{letter}_long"
        smiles_col = f"{letter}_smiles"
        unique_blocks = train_data[[long_col, smiles_col]].drop_duplicates()
        unique_blocks = unique_blocks.rename(columns={long_col: "long", smiles_col: "smiles"})
        unique_blocks.sort_values(by="long").to_csv(model_dir / f"train_{role}.csv", index=False)


In [None]:
# For the 1D model, we do the same, but we have to do it across all 9 folds.
split_dir = DATA_ROOT / "splits" / "synferm_dataset_2023-12-20_1D_split"

# Collect every entry whose path contains "_train.csv", ordered by file name.
train_files = [entry for entry in split_dir.iterdir() if "_train.csv" in str(entry)]
train_files.sort(key=lambda entry: entry.name)
train_files

In [None]:
# For every train file of the 1D split, write the unique (long, smiles) pairs
# of each building-block role to a CSV inside the split directory.
for path in train_files:
    # The train file holds row labels of all_data; flatten them to a 1-D array.
    train_idx = pd.read_csv(path).to_numpy().flatten()
    train_data = all_data.loc[train_idx]
    # Output files reuse the part of the file name before the first underscore.
    split_tag = path.name.split('_')[0]
    for letter, role in (("I", "initiators"), ("M", "monomers"), ("T", "terminators")):
        long_col = f"{letter}_long"
        smiles_col = f"{letter}_smiles"
        unique_blocks = train_data[[long_col, smiles_col]].drop_duplicates()
        unique_blocks = unique_blocks.rename(columns={long_col: "long", smiles_col: "smiles"})
        unique_blocks.sort_values(by="long").to_csv(
            split_dir / f"{split_tag}_train_{role}.csv", index=False
        )


In [None]:
# Store the same building-block lists alongside the 1D production models that
# were trained on this split.
production_model_run_group = "2024-01-23-063840_864375"
for path in train_files:
    # The train file holds row labels of all_data; flatten them to a 1-D array.
    train_idx = pd.read_csv(path).to_numpy().flatten()
    train_data = all_data.loc[train_idx]
    # Model directory name: run group + "_" + file-name part before the first underscore.
    split_tag = path.name.split('_')[0]
    model_dir = TRAINED_MODEL_DIR / f"{production_model_run_group}_{split_tag}"
    for letter, role in (("I", "initiators"), ("M", "monomers"), ("T", "terminators")):
        long_col = f"{letter}_long"
        smiles_col = f"{letter}_smiles"
        unique_blocks = train_data[[long_col, smiles_col]].drop_duplicates()
        unique_blocks = unique_blocks.rename(columns={long_col: "long", smiles_col: "smiles"})
        unique_blocks.sort_values(by="long").to_csv(model_dir / f"train_{role}.csv", index=False)
