In [None]:
%load_ext lab_black

In [None]:
import pandas as pd
from rdkit.Chem import AllChem, PandasTools, Draw
from rdkit.Chem.rdFMCS import FindMCS

In [None]:
# Ask RDKit's PandasTools to render molecule images in every DataFrame that is
# displayed from here on. NOTE: this patches pandas' display behaviour globally
# for the running kernel, not just for one frame.
PandasTools.RenderImagesInAllDataFrames(images=True)

In [None]:
from pathlib import Path
from anki_model import DeckSet, DeckNumberer, TaxonomyNote, Package
from tqdm.notebook import tqdm

In [None]:
# Published Google Sheets export of the source workbook, delivered as XLSX.
upstream_url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vQgbPmrlTty5Q2luk79OigcbyWyQXAQR4xMpxNWJYHwMPpZvGjhBN7wd88vgAyWGMyzwIedvpR4iiNO/pub?output=xlsx"

# sheet_name=None loads every worksheet into a dict of DataFrames.
# dtype=str together with keep_default_na=False keeps every cell as text, so
# empty cells arrive as "" instead of NaN.
sheets = pd.read_excel(
    upstream_url,
    sheet_name=None,
    dtype=str,
    keep_default_na=False,
)

In [None]:
# Stack all worksheets into one table. ignore_index=True gives every row a
# unique label; without it each sheet restarts at 0 and labels are duplicated.
data = pd.concat(sheets.values(), ignore_index=True)

# Any non-empty value in the `skip` column excludes the row.
data = data[~data.skip.astype(bool)]

# Identifier used to locate the structure file: the PubChem id when present,
# otherwise the ChEMBL id.
structure_id = data.pubchemid.replace({"": None}).fillna(data.chemblid)

data = data.assign(
    ROMol_2D=structure_id.map(
        lambda it: AllChem.MolFromMolFile(f"data/{it}_2D.mol")
    )
)

# MolFromMolFile returns None for a file it cannot parse. Fail here with the
# offending identifiers instead of crashing later inside the alignment step.
unparsed_ids = structure_id[data.ROMol_2D.isna()]
if not unparsed_ids.empty:
    raise ValueError(
        f"Could not parse 2D mol files for: {', '.join(map(str, unparsed_ids))}"
    )

In [None]:
def align_all(mols: list[AllChem.Mol]):
    """Compute 2D depictions so that consecutive molecules are drawn alike.

    Every molecule gets fresh 2D coordinates and is then superimposed onto its
    predecessor using the atoms of their maximum common substructure (MCS).
    The first molecule only receives coordinates, since it has no predecessor.

    Args:
        mols: Iterable of RDKit molecules (a list or a pandas Series both
            work). The molecules are modified in place.

    Returns:
        A new list holding the same molecule objects in their original order.
        An empty input yields an empty list.
    """
    import warnings  # local import: keeps this cell runnable on its own

    mols = list(mols)
    if not mols:
        return mols

    # The first molecule is the initial template; aligning it onto itself
    # would only cost an MCS search, so it just gets coordinates.
    AllChem.Compute2DCoords(mols[0])

    for position, (template, mol) in enumerate(zip(mols, mols[1:]), start=1):
        # timeout=2 bounds the MCS search; on timeout FindMCS returns the
        # best substructure found so far.
        mcs = FindMCS([template, mol], timeout=2)
        AllChem.Compute2DCoords(mol)

        if mcs.numAtoms == 0:
            # Nothing in common: keep the stand-alone depiction.
            continue

        patt = AllChem.MolFromSmarts(mcs.smartsString)
        tpl_match = template.GetSubstructMatch(patt)
        mol_match = mol.GetSubstructMatch(patt)
        if not tpl_match or not mol_match:
            # An empty atom map would make AlignMol pair atoms by index.
            continue

        try:
            AllChem.AlignMol(mol, template, atomMap=list(zip(mol_match, tpl_match)))
        except RuntimeError as error:
            # Alignment is best-effort: the unaligned depiction is still usable.
            warnings.warn(f"Could not align molecule #{position}: {error}")

    return mols

In [None]:
# Output folder (relative to the working directory) that receives the rendered
# SVG grids and the exported .apkg file; it is created on first run.
data_dir = Path("data_taxonomy")
data_dir.mkdir(exist_ok=True)

In [None]:
# Set to False to reuse the SVG grids already present in data_dir instead of
# aligning and rendering the molecules again.
regenerate = True

In [None]:
# Helpers from the project-local anki_model module.
# `decks` receives the notes via add_note() and is created with the root deck
# name; `deck_numberer.number()` maps a top-level taxonomy label to a deck name.
# NOTE(review): the exact numbering/nesting scheme lives in anki_model — confirm there.
decks = DeckSet("B15 Pharmazeutische Chemie (Taxonomie)")
deck_numberer = DeckNumberer()

In [None]:
# One note per classification: an unlabeled grid of the aligned structures
# and the same grid with compound names as legends.
for i, (clf, group) in enumerate(tqdm(data.groupby("classification", sort=False))):
    filename_unlabeled = data_dir.joinpath(f"{i:03d}_unlabeled.svg")
    filename_labeled = data_dir.joinpath(f"{i:03d}_labeled.svg")

    # Render when requested, and also when a cached grid is missing; otherwise
    # the note would reference a media file that does not exist.
    if regenerate or not (filename_unlabeled.exists() and filename_labeled.exists()):
        mols = align_all(group.ROMol_2D)

        # MolsToGridImage returns an IPython SVG object (text in `.data`) when
        # RDKit's notebook integration is active, and a plain string otherwise.
        img_unlabeled = Draw.MolsToGridImage(mols, useSVG=True)
        filename_unlabeled.write_text(getattr(img_unlabeled, "data", img_unlabeled))

        # Bracket access: `group.name` relies on attribute lookup falling
        # through to the column of that name.
        img_labeled = Draw.MolsToGridImage(
            mols, legends=list(group["name"]), useSVG=True
        )
        filename_labeled.write_text(getattr(img_labeled, "data", img_labeled))

    # The classification is a "::"-separated path; its first element selects the deck.
    taxonomy = clf.split("::")
    deck_name = deck_numberer.number(taxonomy[0])

    decks.add_note(
        deck_name,
        TaxonomyNote(
            taxonomy=taxonomy,
            file_unlabeled=filename_unlabeled.name,
            file_labeled=filename_labeled.name,
        ),
    )

In [None]:
# Bundle all decks together with every SVG found in data_dir, then write the
# result next to the images as "<root deck name>.apkg".
package = Package(
    deck_or_decks=decks.to_list(),
    media_files=data_dir.glob("*.svg"),
)
apkg_path = data_dir.joinpath(f"{decks.root_deck_name}.apkg")
package.write_to_file(apkg_path)