In [None]:
#| default_exp machine_learning.definition_and_notation_naming

# machine_learning.definition_and_notation_naming

> Functions for gathering and processing data to train and for using ML models to "name" definitions and notations

`trouver.machine_learning.tokenize.def_and_notat_token_classification` has functions for gathering and processing data to train ML models, and for using those models, to identify definitions and notations introduced in notes via token classification. Identified definitions and notations are marked using HTML tags. It would be convenient to also predict the "names" of these definitions and notations.

TODO: insert examples of definitions and notations with HTML tags and examples of what the "names" of these definitions and notations should be

In [None]:
#| export
import copy
import random
from typing import Literal, Optional, TypedDict

import bs4
from bs4 import BeautifulSoup, Tag
from pathvalidate import sanitize_filename
from transformers import pipelines
import warnings

from trouver.helper.html import (
    remove_html_tags_in_text, add_HTML_tag_data_to_raw_text,
    StrAndHTMLTagsWithIndices, HTMLTagWithIndices)
from trouver.helper.latex.formatting import (
    fix_autogen_formatting
)
from trouver.helper.latex.processing import (
    correct_latex_syntax_error, _list_of_candidates_from_math_mode_strings, math_mode_string_is_syntactically_valid,
)
from trouver.helper.latex.augment import (
    random_char_modification,
    dollar_sign_manipulation, remove_math_keywords, random_word_removal, random_latex_command_removal,
    push_dollar_signs, augment_text,
    change_font_styles_at_random, change_greek_letters_at_random, remove_font_styles_at_random
)

from trouver.markdown.file import MarkdownFile
from trouver.personal_vault.note_processing import process_standard_information_note
from trouver.obsidian.vault import VaultNote

from trouver.machine_learning.notation_summarization import (
    notation_summarization_data_from_note, single_input_for_notation_summarization, NotationSummaryData
)


## TypedDict class for wrapping definition and notation naming data

In [None]:
#| export
class DefNotatNamingData(TypedDict):
    """
    A single datapoint for the task of naming a definition or notation
    that is marked by an HTML tag within a piece of text.
    """
    def_or_notat: Literal["definition", "notation"] # Whether the HTML tag marks an introduced definition or notation.
    name: str # The "name" of the definition or notation.
    text: str # The text in which the definition or notation is introduced. The introduced definition/notation is marked with an HTML tag.
    note_name: str # The name of the VaultNote that `text` comes from.

## Gather ML data from information notes

In [None]:
#| export

# TODO: test
# TODO: change return type to `DefNotatNamingData`
def def_notat_naming_data_from_information_note(
        info_note: VaultNote, # The standard information note from which to draw data.
    ) -> list[DefNotatNamingData]: # Each dict corresponds to a single datapoint, which holds the data of the naming of a single definition or notation (latex str) introduced in `info_note`. 
    """
    Gather definition/notation naming datapoints from a standard information
    note.

    Definitions and notations should be marked by HTML tags (see
    `machine_learning.tokenize.def_and_notat_token_classification`).
    - A definition is marked by an HTML tag with a `definition` attribute
      whose value is the definition's "name", i.e. words and/or phrases
      describing what the definition is called and to what objects/situations
      it applies. If several combinations of words/phrases are appropriate,
      they are separated by a single semicolon `;`. An empty `definition`
      attribute (`""`) means that the name has been marked neither manually
      nor automatically.
    - A notation (technically the full LaTeX string in which the notation is
      introduced) is marked by an HTML tag with a `notation` attribute whose
      value is the notation's "name", i.e. the actual notation introduced in
      the LaTeX string (without surrounding dollar signs (`$` or `$$`)). If
      several notations are appropriate, they are separated by double
      semicolons `;;`. An empty `notation` attribute (`""`) means either that
      the notation has not been marked, or that the LaTeX string (minus the
      surrounding dollar signs) is exactly the introduced notation.

    HTML tags carrying neither attribute are skipped.

    **Returns**
    - list[DefNotatNamingData]
        - One datapoint per definition/notation tag, in the order in which
          the tags are reported by `remove_html_tags_in_text`. Each datapoint
          has the keys `'def_or_notat'`, `'name'` (the value of the tag's
          `definition`/`notation` attribute), `'text'` and `'note_name'`.
          The `'text'` entry is the processed text of `info_note` (see
          `process_standard_information_note`) in which only the tag of that
          datapoint is re-inserted, as a `<b>` tag whose
          `definition`/`notation` attribute is empty.
    """
    # Processes the info note in all ways except for the HTML tags
    markdown_file = process_standard_information_note(
        MarkdownFile.from_vault_note(info_note), info_note.vault,
        True, True, True, True, True, False, True, True, True, True,
        True, True, True, None, True)

    tagless_text, tags_with_indices = remove_html_tags_in_text(
        str(markdown_file))

    datapoints: list[DefNotatNamingData] = []
    for html_tag, start, end in tags_with_indices:
        # The first matching attribute decides the kind; `definition` wins
        # if a tag happens to carry both.
        kind = next(
            (attr for attr in ('definition', 'notation')
             if attr in html_tag.attrs),
            None)
        if kind is None:
            continue

        # A placeholder tag with an empty name attribute marks where the
        # definition/notation sits without leaking its name into the text.
        marker = BeautifulSoup(f'<b {kind}="">{html_tag.text}', 'html.parser')
        marked_text = add_HTML_tag_data_to_raw_text(
            tagless_text, [(marker, start, end)])

        datapoints.append(DefNotatNamingData(
            def_or_notat=kind,
            name=html_tag.attrs[kind],
            text=marked_text,
            note_name=info_note.name))

    return datapoints

## Augment data

In [None]:
#| export
def _split_text_by_html_data_parts(
        datapoint: DefNotatNamingData
        ) -> tuple[str, str, str, bs4.element.Tag]: # The text before the HTML tag, the text inside the HTML tag, the text after the HTML tag, and the HTML tag itself.
    r"""
    Helper function which splits the text of `datapoint` around its first
    HTML tag.

    The HTML tags are stripped from `datapoint['text']` via
    `remove_html_tags_in_text`, and the resulting raw text is cut at the
    start and end indices reported for the first tag. Any further tags
    are ignored.

    **Raises**
    - IndexError
        - If no HTML tag is found in `datapoint['text']`.
    """
    html_data: StrAndHTMLTagsWithIndices = remove_html_tags_in_text(datapoint['text'])
    raw_text: str = html_data.raw_text
    tags: list[HTMLTagWithIndices] = html_data.tags
    if not tags:
        # Same exception type that `tags[0]` would give, but with a message
        # that says what is wrong with the datapoint.
        raise IndexError(
            "Expected the text of the datapoint to contain an HTML tag, "
            "but none was found.")
    first_tag = tags[0]
    start, end = first_tag.start, first_tag.end

    return (raw_text[:start], raw_text[start:end], raw_text[end:], first_tag.tag)

In [None]:
#| export
def augment_def_and_notat_naming_data(
        datapoint: DefNotatNamingData,
        num_augmentation_sets: int = 1, # Each augmentation set consists of an augmentation with low, medium, and high probability modifications.
        seed: Optional[int] = None # If not `None`, the global `random` module is seeded with this value before augmenting.
        ) -> list[DefNotatNamingData]:
    r"""
    Augment a given definition/notation naming datapoint.

    For each augmentation set, three augmented copies of `datapoint` are
    produced, using the `'low'`, `'mid'`, and `'high'` modification levels
    (in that order) of `_augment_def_and_notat_naming_data_once`. The
    returned list therefore has `3 * num_augmentation_sets` entries; the
    original `datapoint` is not included.
    """
    pieces: tuple[str, str, str, bs4.element.Tag] = _split_text_by_html_data_parts(datapoint)
    if seed is not None:
        random.seed(seed)

    augmented_datapoints: list[DefNotatNamingData] = []
    for _ in range(num_augmentation_sets):
        # The level names match the `Literal['low', 'mid', 'high']` that
        # the helper declares.
        for modification in ('low', 'mid', 'high'):
            augmented_datapoints.append(
                _augment_def_and_notat_naming_data_once(
                    pieces, modification, datapoint))
    return augmented_datapoints


def _augment_def_and_notat_naming_data_once(
        pieces: tuple[str, str, str, bs4.element.Tag],
        modification: Literal['low', 'mid', 'high'],
        original_datapoint: DefNotatNamingData,
        ) -> DefNotatNamingData:
    r"""
    Produce one augmented copy of `original_datapoint`.

    `pieces` is the output of `_split_text_by_html_data_parts`: the text
    before the tag, the text inside the tag, the text after the tag, and the
    tag itself. Each of the three text pieces is augmented separately with
    the same randomly chosen set of augmentation methods, and the tag is
    re-inserted around the augmented middle piece.

    `modification` determines both the chance with which each augmentation
    method is included and the factor by which its base probability is
    scaled. Any value other than `'low'` or `'mid'` is treated as the
    strongest level. The `def_or_notat`, `name`, and `note_name` entries are
    copied from `original_datapoint` unchanged.
    """
    # (augmentation method, base probability `p`)
    candidate_methods = [
        # (push_dollar_signs,0.2),
        (remove_font_styles_at_random, 0.1),
        (change_font_styles_at_random, 0.2),
        (change_greek_letters_at_random, 0.1),
        (remove_math_keywords, 0.1),
        (random_latex_command_removal, 0.2),
        (random_word_removal, 0.1),
        (dollar_sign_manipulation, 0.05),
        (random_char_modification, 0.001)]

    # level -> (chance that a method is included, probability scale)
    inclusion_chance, scale = {
        'low': (0.3, 0.5),
        'mid': (0.5, 1.0),
    }.get(modification, (0.8, 1.5))

    def with_scaled_probability(method, p):
        # Bind `method` and `p` now so that each lambda keeps its own pair.
        return lambda x: method(x, p=p*scale)

    # One `random.random()` draw per candidate, in list order.
    chosen_methods = [
        with_scaled_probability(method, p)
        for method, p in candidate_methods
        if random.random() < inclusion_chance]

    text_before, text_inside, text_after, original_tag = pieces

    augmented_before = augment_text(text_before, chosen_methods)
    # Work on a copy so that the tag in `pieces` can be reused by later calls.
    new_tag = copy.copy(original_tag)
    new_tag.string = augment_text(text_inside, chosen_methods)
    augmented_after = augment_text(text_after, chosen_methods)

    return DefNotatNamingData(
        def_or_notat=original_datapoint['def_or_notat'],
        name=original_datapoint['name'],
        note_name=original_datapoint['note_name'],
        text=f'{augmented_before}{str(new_tag)}{augmented_after}'
        )

## Use the ML model

In [None]:
#| export

# TODO: mark the note with and `_auto` tag and make it so that 
def predict_names(
        info_note: VaultNote,
        def_and_notat_pipeline: Optional[pipelines.text2text_generation.SummarizationPipeline], # A pipeline wrapping an ML model which predicts the naming of both definition and notations.
        def_pipeline: Optional[pipelines.text2text_generation.SummarizationPipeline],  # A pipeline wrapping an ML model which predicts the naming of definitions. 
        notat_pipeline: Optional[pipelines.text2text_generation.SummarizationPipeline], # A pipeline wrapping an ML model which predicts the naming of notations. 
        ) -> list[str]:
    r"""
    Predict the names of the definitions and notations marked in `info_note`
    using the trained ML models.

    Either `def_and_notat_pipeline` or both `def_pipeline` and `notat_pipeline`
    should be provided; otherwise a `ValueError` is raised before the note is
    read.

    The returned list has one predicted name per datapoint yielded by
    `def_notat_naming_data_from_information_note`, in the same order.
    """
    has_combined_pipeline = def_and_notat_pipeline is not None
    has_separate_pipelines = (
        def_pipeline is not None and notat_pipeline is not None)
    if not (has_combined_pipeline or has_separate_pipelines):
        raise ValueError(
            "Expected `def_and_notat_pipeline` to be specified or "
            "both `def_pipeline` and `notat_pipeline` to be specified.")

    predictions: list[str] = []
    for data_point in def_notat_naming_data_from_information_note(info_note):
        predictions.append(
            _name_prediction_for_data_point(
                data_point, def_and_notat_pipeline, def_pipeline,
                notat_pipeline))
    return predictions


def _name_prediction_for_data_point(
        data_point: DefNotatNamingData, 
        def_and_notat_pipeline: Optional[pipelines.text2text_generation.SummarizationPipeline], 
        def_pipeline: Optional[pipelines.text2text_generation.SummarizationPipeline],  
        notat_pipeline: Optional[pipelines.text2text_generation.SummarizationPipeline], 
        ) -> str:
    if def_and_notat_pipeline is not None:
        summarizer = def_and_notat_pipeline
    elif data_point['def_or_notat'] == 'definition':
    # elif 'definition' in data_point:
        summarizer = def_pipeline
        summarizer_output = summarizer(data_point['text'])
    else:
        summarizer = notat_pipeline
        summarizer_output = summarizer(data_point['text'], max_length=20, min_length=0)
    return summarizer_output[0]['summary_text']


In [None]:
#| export
def add_names_to_html_tags_in_info_note(
        info_note: VaultNote,
        def_and_notat_pipeline: Optional[pipelines.text2text_generation.SummarizationPipeline] = None, # A pipeline wrapping an ML model which predicts the naming of both definition and notations.
        def_pipeline: Optional[pipelines.text2text_generation.SummarizationPipeline] = None,  # A pipeline wrapping an ML model which predicts the naming of definitions. 
        notat_pipeline: Optional[pipelines.text2text_generation.SummarizationPipeline] = None, # A pipeline wrapping an ML model which predicts the naming of notations. 
        overwrite: bool = False, # If `True`, overwrite pre-existing, nonempty attributes. If `False`, ignore pre-existing, nonempty attributes and only write on attributes that are empty.
        fix_formatting: bool = True, # If `True`, fix the formatting for notation names.
        correct_syntax: bool = True, # If `True`, attempt to fix syntax errors for notation names.
        ) -> None:
    """
    Predict the names of definitions and notations marked with
    HTML tags within `info_note` and write those names in the
    `"definition"` or `"notation"` attributes in each tag.

    Either `def_and_notat_pipeline` or both `def_pipeline` and `notat_pipeline`
    should be provided.

    Predicted names are matched, in order, with the HTML tags of the note
    that carry a `definition` or `notation` attribute. HTML tags carrying
    neither attribute are written back to the note unchanged. If the number
    of predicted names differs from the number of definition/notation tags,
    a warning is issued, names are assigned as far as they go, and the
    remaining tags are left unchanged.

    An `#_auto/def_and_notat_names_added` tag is added to `info_note`
    if at least one name is written. The note is rewritten in either case.
    """
    raw_info_note_text = info_note.text()
    raw_info_note_text_minus_html_tags, tags_and_locats = remove_html_tags_in_text(
        raw_info_note_text)
    predicted_names = predict_names(
        info_note, def_and_notat_pipeline, def_pipeline,
        notat_pipeline)

    # `predict_names` only yields names for definition/notation tags, so
    # those are the only tags the predictions can be compared against.
    num_nameable_tags = sum(
        1 for tag, _, _ in tags_and_locats
        if 'definition' in tag.attrs or 'notation' in tag.attrs)
    if len(predicted_names) != num_nameable_tags:
        warnings.warn(
            "Somehow, an inconsistent number of HTML tags are "
            f"detected in the note: {info_note.name}.\n"
            "This will raise some indexing issues when marking the definition "
            "and notation names")

    remaining_names = iter(predicted_names)
    new_tags_and_locations = []
    any_preds_written = False
    # Iterate over every tag (not over a `zip` with the names) so that no
    # tag is dropped from the rewritten note.
    for tag, start, end in tags_and_locats:
        if 'definition' in tag.attrs:
            def_or_notat = 'definition'
        elif 'notation' in tag.attrs:
            def_or_notat = 'notation'
        else:
            # tag could be neither a definition nor a notation tag.
            def_or_notat = ''

        # Only definition/notation tags consume a predicted name.
        name = next(remaining_names, None) if def_or_notat else None
        if name is not None:
            if def_or_notat == 'notation':
                # NOTE(review): the correction is attempted only when the
                # name is reported as syntactically VALID; this condition
                # may be inverted -- confirm against
                # `correct_latex_syntax_error`.
                if correct_syntax and math_mode_string_is_syntactically_valid(name):
                    name = _correct_syntax(name, tag)
                if fix_formatting:
                    name = fix_autogen_formatting(name)
            if tag.attrs[def_or_notat] == "" or overwrite:
                tag[def_or_notat] = name
                any_preds_written = True
        new_tags_and_locations.append((tag, start, end))

    new_info_note_text = add_HTML_tag_data_to_raw_text(
        raw_info_note_text_minus_html_tags, new_tags_and_locations)
    mf = MarkdownFile.from_string(new_info_note_text)
    if any_preds_written:
        mf.add_tags('_auto/def_and_notat_names_added')
    mf.write(info_note)


def _correct_syntax(
        name: str,
        tag: Tag
        ) -> str:
    """
    Helper function of `add_names_to_html_tags_in_info_note`.

    Passes `name` to `correct_latex_syntax_error`, using the candidates
    that `_list_of_candidates_from_math_mode_strings` derives from the text
    of `tag` as replacement candidates, and returns the result.
    """
    return correct_latex_syntax_error(
        name, _list_of_candidates_from_math_mode_strings(tag.text))

# Naming notation notes

Another convenient functionality is to name notation notes automatically.

In [None]:
#| export

# TODO: test
def autogen_name_from_notation_note(
        notation_note: VaultNote, pipeline):
    """
    Predict a name for `notation_note` with `pipeline`.

    The summarization data of the note is gathered with
    `notation_summarization_data_from_note` and turned into a single model
    input with `single_input_for_notation_summarization`. Returns the
    `'summary_text'` of the first pipeline output, or `None` if no
    summarization data could be gathered from the note.
    """
    summary_data: NotationSummaryData = notation_summarization_data_from_note(
        notation_note, notation_note.vault,
        check_for_actual_summarization=False)
    if summary_data is not None:
        # TODO: change classical_formatting to False after retraining the model for this formatting
        model_input = single_input_for_notation_summarization(
            summary_data, classical_formatting=True)
        return pipeline(model_input)[0]['summary_text']
    return None

def sanitize_autogen_name(autogen_name):
    """
    Strip spaces, square brackets, and periods from `autogen_name`, then
    pass the result through `sanitize_filename`.
    """
    for unwanted in (' ', '[', ']', '.'):
        autogen_name = autogen_name.replace(unwanted, '')
    return sanitize_filename(autogen_name)


def add_autogen_name_to_notation_note(
        notation_note: VaultNote,
        autogen_name: str
        ) -> None:
    """
    Record `autogen_name` in the YAML frontmatter metadata of
    `notation_note`.

    A metadata section is added to the note if it has none. The
    `autogen_name` field is set to a one-element list holding
    `autogen_name`, and the note is written back.
    """
    markdown_file = MarkdownFile.from_vault_note(notation_note)
    if not markdown_file.has_metadata():
        markdown_file.add_metadata_section()

    updated_metadata = markdown_file.metadata()
    updated_metadata['autogen_name'] = [autogen_name]
    markdown_file.replace_metadata(
        updated_metadata, enquote_entries_in_fields=['latex_in_original'])

    markdown_file.write(notation_note)



In [None]:
#| export

# TODO: test
def predict_name_and_add_to_notation_note(
        notation_note: VaultNote,
        notation_note_naming_pipeline: pipelines.text2text_generation.SummarizationPipeline,
        reference: str
        ) -> None:
    """
    Predict an appropriate name for the notation note and add it in the YAML frontmatter metadata.

    Nothing is done if the note has the `_meta/notation_note_named` tag, if
    the string `autogen_name` occurs anywhere in the note, or if no name
    could be predicted. Otherwise the sanitized predicted name is recorded
    as `{reference}_notation_{name}`.
    """
    markdown_file = MarkdownFile.from_vault_note(notation_note)
    already_named = (
        markdown_file.has_tag('_meta/notation_note_named')
        or 'autogen_name' in str(markdown_file))
    if already_named:
        return

    predicted_name = autogen_name_from_notation_note(
        notation_note, notation_note_naming_pipeline)
    if predicted_name is None:
        return

    sanitized_name = sanitize_autogen_name(predicted_name)
    add_autogen_name_to_notation_note(
        notation_note, f'{reference}_notation_{sanitized_name}')