In [None]:
#| default_exp markdown.obsidian.personal.machine_learning.note_linking

# markdown.obsidian.personal.machine_learning.note_linking
> Functions for gathering and processing training data for ML models that predict whether information notes and notation notes should be linked to one another, and for using such models.

In [None]:
#| export

from abc import ABC, abstractmethod
from enum import Enum
from os import PathLike
import re
from typing import Literal, NamedTuple, Optional, TypedDict, TypeVar, Union

from fastcore.basics import patch

from trouver.helper.regex import find_regex_in_text
from trouver.markdown.markdown.file import MarkdownFile
from trouver.markdown.obsidian.links import links_from_text, LinkType, ObsidianLink, MARKDOWNLINK_CAPTURE_PATTERN
from trouver.markdown.obsidian.personal.notation.parse import NotationNoteParsed, parse_notation_note, _notat_str
from trouver.markdown.obsidian.personal.note_processing import process_standard_information_note
from trouver.markdown.obsidian.personal.note_type import (
    PersonalNoteTypeEnum, assert_note_is_of_type, note_is_of_type, type_of_note
)
from trouver.markdown.obsidian.vault import VaultNote

In [None]:
from fastcore.test import *

## Gather ML data

There are different "levels" at which gathering linking data makes sense --- in particular links are typically, but not always, made between notes in the same "reference folder", a group of notes belonging to a single mathematical text. Moreover, it is easier to mark a "positive" instance of linking, i.e. by creating the appropriate link, than it is to mark a "negative" instance of linking --- in practice, it should be convenient to assume that most pairs of notes without links between them are not positive instances of linking.


### `NoteLinkEnum` and `NoteData` classes

In [None]:
#| export

class NoteLinkEnum(Enum):
    r"""
    The type of (not necessarily direct) link specified by two notes.

    In the member names, "INFO" abbreviates an information note and "NOTAT"
    abbreviates a notation note. In the comments below, "origin" is the note
    containing the link and "relied" is the note being linked to.
    """
    # The two notes are not linked.
    NO_LINK = 0
    # Typically, a Wikistyle link to the other info note in the content of the info note.
    INFO_TO_INFO_IN_CONTENT = 1
    # A Wikistyle link to the other info note in the "See Also" section.
    INFO_TO_INFO_IN_SEE_ALSO = 2
    # When a notation note is embedded into an info note, typically in a footnote,
    # this enum signifies the notation note's main info note as being indirectly
    # linked by the info note.
    INFO_TO_INFO_VIA_NOTAT = 3
    # When a notation note is embedded into an info note, typically in a footnote.
    INFO_TO_NOTAT_VIA_EMBEDDING = 4
    # A Wikistyle link to an info note in the notation note's content
    # (Currently, this is not a type of link that is focused on)
    NOTAT_TO_INFO = 5
    # When a notation note (relied) is linked in a notation note (origin), typically
    # as a Markdownstyle link, this enum signifies the main info note of relied as
    # being indirectly linked by origin.
    NOTAT_TO_INFO_VIA_NOTAT = 6
    # When a notation note is linked in a notation note, typically as a Markdownstyle link.
    NOTAT_TO_NOTAT = 7



In [None]:
#| export
# class LinkDataTuple(NamedTuple):
#     linked_note_name: str
#     link_type: NoteLinkEnum

In [None]:
#| export
def _replace_underscores_with_spaces(text: str|None) -> str|None:
    """
    Replace all underscores in the input string with spaces.

    This is a helper function to the `NoteData` constructor.

    Args:
    text (str): The input string containing underscores.

    Returns:
    str: A new string with underscores replaced by spaces.
    """
    if text is None:
        return text
    return text.replace("_", " ")


In [None]:
#| export

class NoteData(ABC):
    """
    Base class holding the positional and linking data gathered for one note.

    Subclasses are expected to supply their own `from_note` constructor.
    """
    reference: str|None  # Name of the reference

    #For the following attributes, the "note" more accurately refers to the note itself if the
    # note is an info note or to the main note of the note if the note is a notation note.
    section: str|None # Name of the chapter/section that the note belongs to.
    section_num: int|None # The number n such that `section` is the nth section in `reference`.
    note_num_in_section: int|None # The number n such that the note is the nth note in `section`.
    char_displacement_in_section: int|None # The number n such that the note roughly starts off with the nth character of section.
    subsection: str|None # Name of the section/subsection that the note belongs to.
    subsection_num: int|None # The number n such that `subsection` is the nth subsection in `section`.
    note_num_in_subsection: int|None # The number n such that the note is the nth note in `subsection`.
    char_displacement_in_subsection: int|None # The number n such that the note roughly starts off with the nth character of subsection.

    note_name: str
    note_content: str  # Typically, an output of `str(process_standard_information_note)`.
    directly_linked_notes: dict[str, set[NoteLinkEnum]]  # The names of the notes that the note directly links to along with the type of link


    def __init__(
            self,
            reference: str | None,
            section: str | None,
            section_num: int | None,
            note_num_in_section: int | None,
            char_displacement_in_section: int | None,
            subsection: str | None,
            subsection_num: int | None,
            note_num_in_subsection: int | None,
            char_displacement_in_subsection: int | None,
            note_name: str,
            note_content: str,
            directly_linked_notes: dict[str, set[NoteLinkEnum]]):
        """
        Store the given values as attributes of the same names.

        Underscores in `section` and `subsection` are replaced by spaces
        before being stored; every other argument is stored as given.
        """
        # Identity and content of the note.
        self.note_name = note_name
        self.note_content = note_content
        self.directly_linked_notes = directly_linked_notes

        # Where the note sits within its reference.
        self.reference = reference

        # Section-level position; the section name is stored with spaces.
        self.section = _replace_underscores_with_spaces(section)
        self.section_num = section_num
        self.note_num_in_section = note_num_in_section
        self.char_displacement_in_section = char_displacement_in_section

        # Subsection-level position; the subsection name is stored with spaces.
        self.subsection = _replace_underscores_with_spaces(subsection)
        self.subsection_num = subsection_num
        self.note_num_in_subsection = note_num_in_subsection
        self.char_displacement_in_subsection = char_displacement_in_subsection

    def direct_links_to_note(self, relied_note_name: str) -> set[NoteLinkEnum]:
        """
        Return the link types recorded from this note to `relied_note_name`.

        When no entry exists for `relied_note_name` in `directly_linked_notes`,
        the singleton set `{NoteLinkEnum.NO_LINK}` is returned instead.
        """
        return self.directly_linked_notes.get(
            relied_note_name, {NoteLinkEnum.NO_LINK})

    @classmethod
    # @abstractmethod
    def from_note(cls, note: VaultNote):
        """
        Placeholder constructor; the base implementation does nothing and
        returns `None`.
        """
        return None


In [None]:
#| export
K = TypeVar('K')
V = TypeVar('V')
def _update_dict(
        dict: dict[K, set[V]],
        key: K, 
        value: V,
        ) -> None:
    """
    Appropriately add the values to a set in the dict, or create a new set if needed.
    """
    if key not in dict:
        dict[key] = {}
    dict[key].update(value)



In [None]:
#| export
class InfoNoteData(NoteData):
    """
    The `NoteData` subclass representing the linked note data for a standard information note.

    Note that `directly_linked_notes` maps the names of the notes that the
    info note links to onto the set of `NoteLinkEnum` link types found for each.
    """
    def __init__(
            self,
            reference: str | None,
            section: str | None,
            section_num: int | None,
            note_num_in_section: int | None,
            char_displacement_in_section: int | None,
            subsection: str | None,
            subsection_num: int | None,
            note_num_in_subsection: int | None,
            char_displacement_in_subsection: int | None,
            note_name: str,
            note_content: str,
            directly_linked_notes: dict[str, set[NoteLinkEnum]]
            ):
        """
        Forward every argument unchanged to the `NoteData` constructor.
        """
        super().__init__(
            reference=reference,
            section=section,
            section_num=section_num,
            note_num_in_section=note_num_in_section,
            char_displacement_in_section=char_displacement_in_section,
            subsection=subsection,
            subsection_num=subsection_num,
            note_num_in_subsection=note_num_in_subsection,
            char_displacement_in_subsection=char_displacement_in_subsection,
            note_name=note_name,
            note_content=note_content,
            directly_linked_notes=directly_linked_notes,
        )


In [None]:
# Build an `InfoNoteData` by hand (no vault needed) to exercise `direct_links_to_note`.
sample_note_data = InfoNoteData(
    reference='Sample_Reference_Book',
    section='Chapter_3_Data_Structures',
    section_num=3,
    note_num_in_section=5,
    char_displacement_in_section=1200,
    subsection='3.2_Lists_and_Tuples',
    subsection_num=2,
    note_num_in_subsection=2,
    char_displacement_in_subsection=450,
    note_name='List_Comprehensions',
    note_content='List comprehensions provide a concise way to create lists...',
    directly_linked_notes={
        'info_note_1': {NoteLinkEnum.INFO_TO_INFO_IN_CONTENT},
        'info_note_2': {NoteLinkEnum.INFO_TO_INFO_IN_SEE_ALSO},
        'info_note_3': {NoteLinkEnum.INFO_TO_INFO_IN_CONTENT, NoteLinkEnum.INFO_TO_INFO_VIA_NOTAT},
        'notat_note_1': {NoteLinkEnum.INFO_TO_NOTAT_VIA_EMBEDDING}
    }
)

# A note linked in exactly one way yields a singleton set.
test_eq(
    sample_note_data.direct_links_to_note('info_note_1'),
    {NoteLinkEnum.INFO_TO_INFO_IN_CONTENT})

# A note linked in several ways yields every recorded link type.
test_eq(
    sample_note_data.direct_links_to_note('info_note_3'),
    {NoteLinkEnum.INFO_TO_INFO_IN_CONTENT, NoteLinkEnum.INFO_TO_INFO_VIA_NOTAT})

# A note that is not linked at all yields `{NoteLinkEnum.NO_LINK}`.
test_eq(
    sample_note_data.direct_links_to_note('non_existent'),
    {NoteLinkEnum.NO_LINK})


In [None]:
#| export
def _get_see_also_note_links(
        mf: MarkdownFile, # The MarkdownFile representing the info note.
        vault: PathLike,
        directly_linked_notes: dict[str, set[NoteLinkEnum]],
        ) -> None:
    """
    Add names of linked notes in the `'See Also'` section of an info note to `directly_linked_notes`, where
    notes are listed in bulleted lists in the form `'- [[note_name]] <- Optional description of the note>'`

    The notation notes that are linked in this way and any other note linked in an
    optional description in the `'See Also'` section are ignored.

    `directly_linked_notes` is modified in place; nothing is returned.

    Helper function to `InfoNoteData.from_note`
    """
    # NOTE(review): this relies on `mf.remove_section('See Also')` returning the
    # removed section in a form accepted by `MarkdownFile.from_list`, and it may
    # also mutate `mf` -- confirm against the `MarkdownFile` API.
    see_also_mf = MarkdownFile.from_list(mf.remove_section('See Also'))
    # 1.1. parse the links in see also
    abstract_link = ObsidianLink(
        is_embedded=False,
        file_name=-1, anchor=-1, custom_text=-1, link_type=LinkType.WIKILINK)
    bullet = '- '
    pattern = f'{bullet}{abstract_link.to_regex()}'
    see_also_text = str(see_also_mf)
    ranges = find_regex_in_text(see_also_text, pattern)
    # Each match starts at the bullet, so skip past it to reach the link itself.
    # `from_text` must be *called* on the matched substring (it was previously
    # subscripted, which raises `TypeError`).
    links = [
        ObsidianLink.from_text(see_also_text[start + len(bullet):end])
        for start, end in ranges]
    note_names = [link.file_name for link in links]

    for note_name in note_names:
        linked_note = VaultNote(vault=vault, name=note_name)
        # 1.2. if links are notation notes, then probably ignore
        if (linked_note.exists() and
                type_of_note(linked_note) == PersonalNoteTypeEnum.STANDARD_INFORMATION_NOTE):
            # 1.3. if links are info notes, then add NoteLinkEnum.INFO_TO_INFO_IN_SEE_ALSO
            _update_dict(directly_linked_notes, note_name, NoteLinkEnum.INFO_TO_INFO_IN_SEE_ALSO)



In [None]:
#| export
def _get_note_names_from_abstract_link(
        abstract_link: ObsidianLink,
        note_text: str
        ) -> list[str]:
    """
    Return the file names of the links in `note_text` that match `abstract_link`.

    The regex produced by `abstract_link.to_regex()` is searched for in
    `note_text`; each matched substring is parsed with `ObsidianLink.from_text`
    and its `file_name` is collected, in order of occurrence (duplicates kept).
    """
    pattern = abstract_link.to_regex()
    ranges = find_regex_in_text(note_text, pattern)
    # `from_text` must be *called* on the matched slice of `note_text`
    # (it was previously subscripted, which raises `TypeError`).
    links = [ObsidianLink.from_text(note_text[start:end]) for start, end in ranges]
    return [link.file_name for link in links]

In [None]:
#| export
def _get_notation_embedded_note_links(
        mf: MarkdownFile, # The MarkdownFile representing the info note.
        vault: PathLike,
        directly_linked_notes: dict[str, set[NoteLinkEnum]],
        ):
    """
    Record in `directly_linked_notes` each notation note that is embedded
    anywhere in the text of an info note.

    Only embedded Wikistyle links whose targets exist in `vault` and are
    notation notes are recorded, with `NoteLinkEnum.INFO_TO_NOTAT_VIA_EMBEDDING`.

    Helper function to `InfoNoteData.from_note`
    """
    embedded_wikilink = ObsidianLink(
        is_embedded=True,
        file_name=-1,
        anchor=-1,
        custom_text=-1,
        link_type=LinkType.WIKILINK)

    for embedded_name in _get_note_names_from_abstract_link(embedded_wikilink, str(mf)):
        candidate = VaultNote(vault=vault, name=embedded_name)
        if not candidate.exists():
            continue
        if type_of_note(candidate) != PersonalNoteTypeEnum.NOTATION_NOTE:
            continue
        # 2.1. Get the embedded note links that are to notation notes, whose main notes are not `note`.
        # TODO: figure out if the main note of notat note should be added with INFO_TO_INFO_VIA_NOTAT
        # "On the second pass".
        _update_dict(
            directly_linked_notes, embedded_name, NoteLinkEnum.INFO_TO_NOTAT_VIA_EMBEDDING)

In [None]:
#| export
def _get_links_to_info_notes_in_content(
        mf: MarkdownFile, # The MarkdownFile representing the info note.
        vault: PathLike,
        directly_linked_notes: dict[str, set[NoteLinkEnum]],
        ) -> None:
    """
    Add names of info notes linked throughout the content of an info note to
    `directly_linked_notes`

    Only non-embedded Wikistyle links whose targets exist in `vault` and are
    standard information notes are recorded, each with
    `NoteLinkEnum.INFO_TO_INFO_IN_CONTENT`. `directly_linked_notes` is modified
    in place; nothing is returned.

    Helper function to `InfoNoteData.from_note`
    """
    abstract_link = ObsidianLink(
        is_embedded=False,
        file_name=-1, anchor=-1, custom_text=-1, link_type=LinkType.WIKILINK)
    note_names = _get_note_names_from_abstract_link(abstract_link, str(mf))

    for note_name in note_names:
        linked_note = VaultNote(vault=vault, name=note_name)
        # This function records info-to-info links, so the target must be an
        # information note (the check previously tested for notation notes).
        if (linked_note.exists() and
                type_of_note(linked_note) == PersonalNoteTypeEnum.STANDARD_INFORMATION_NOTE):
            _update_dict(directly_linked_notes, note_name, NoteLinkEnum.INFO_TO_INFO_IN_CONTENT)

In [None]:
#| export
@patch(cls_method=True)
def from_note(
        cls: InfoNoteData,
        note: VaultNote,
        reference: str | None,
        section: str | None,
        section_num: int | None,
        note_num_in_section: int | None,
        char_displacement_in_section: int | None,
        subsection: str | None,
        subsection_num: int | None,
        note_num_in_subsection: int | None,
        char_displacement_in_subsection: int | None,
        ):
    """
    Build an `InfoNoteData` for `note`, computing its `directly_linked_notes`
    and processed content from the note's file.

    `note` is asserted to be a standard information note. The positional
    arguments (`reference` through `char_displacement_in_subsection`) are
    passed through to the constructor unchanged.
    """
    assert_note_is_of_type(note, PersonalNoteTypeEnum.STANDARD_INFORMATION_NOTE)
    note_file = MarkdownFile.from_vault_note(note)
    # TODO get the linked notes
    links_found: dict[str, set[NoteLinkEnum]] = {}

    # The steps below mutate `note_file` as they go, so their order matters.
    # 1. Links listed in the 'See Also' section.
    _get_see_also_note_links(note_file, note.vault, links_found)
    # 2. Embedded notation notes, looked for outside of 'See Also'.
    note_file.remove_section('See Also')
    _get_notation_embedded_note_links(note_file, note.vault, links_found)
    # 3. Remaining links in the content, once footnotes to embedded links are gone.
    note_file.remove_footnotes_to_embedded_links()
    _get_links_to_info_notes_in_content(note_file, note.vault, links_found)

    processed_content = str(process_standard_information_note(note_file))
    return cls(
        reference,
        section,
        section_num,
        note_num_in_section,
        char_displacement_in_section,
        subsection,
        subsection_num,
        note_num_in_subsection,
        char_displacement_in_subsection,
        note_name=note.name,
        note_content=processed_content,
        directly_linked_notes=links_found)

In [None]:
#| export
class NotatNoteData(NoteData):
    """
    The `NoteData` subclass representing the linked note data for a notation note.

    On top of the `NoteData` attributes, it keeps the parsed form of the
    notation note and the name of its main note.
    """

    parsed: NotationNoteParsed|None
    main_note: str|None # The name of the main note
    # main_note_content: str|None 

    def __init__(
            self,
            reference: str | None,
            section: str | None,
            section_num: int | None,
            note_num_in_section: int | None,
            char_displacement_in_section: int | None,
            subsection: str | None,
            subsection_num: int | None,
            note_num_in_subsection: int | None,
            char_displacement_in_subsection: int | None,
            note_name: str,
            note_content: str,
            directly_linked_notes: dict[str, set[NoteLinkEnum]],
            parsed: NotationNoteParsed|None,
            main_note: str|None,
            # main_note_content: str|None,
            ):
        """
        Forward the shared arguments to the `NoteData` constructor, then store
        `parsed` and `main_note` as attributes.
        """
        super().__init__(
            reference=reference,
            section=section,
            section_num=section_num,
            note_num_in_section=note_num_in_section,
            char_displacement_in_section=char_displacement_in_section,
            subsection=subsection,
            subsection_num=subsection_num,
            note_num_in_subsection=note_num_in_subsection,
            char_displacement_in_subsection=char_displacement_in_subsection,
            note_name=note_name,
            note_content=note_content,
            directly_linked_notes=directly_linked_notes,
        )
        # Attributes specific to notation notes.
        self.main_note = main_note
        self.parsed = parsed
        # self.main_note_content = main_note_content


In [None]:
#| export
# TODO: Maybe import these from `34_markdown.obsidian.personal.machine_learning.notation_linking.ipynb`, or delete those functions if that module is deprecated.

def _linked_note_names_from_content(
        content: str) -> list[str]:
    """
    Return the names taken from every match of `MARKDOWNLINK_CAPTURE_PATTERN`
    in `content`, in order of occurrence.

    The name is the second captured group of each match (presumably the link
    destination -- confirm against the pattern), with one trailing `'.md'`
    stripped when present.
    """
    return [
        captured[1].removesuffix('.md')
        for captured in re.findall(MARKDOWNLINK_CAPTURE_PATTERN, content)
    ]
        

def _linked_notat_note_names_from_content(
        content: str, vault: PathLike) -> list[str]:
    """
    Return the names of the notes linked in `content` that are notation notes
    in `vault`, in order of occurrence.

    Candidate names come from `_linked_note_names_from_content`; each is looked
    up as a `VaultNote` (without updating the cache) and kept only when
    `note_is_of_type` reports it as a notation note.
    """
    # Lazily build one `VaultNote` per candidate so that each note is
    # constructed immediately before it is type-checked.
    candidate_notes = (
        VaultNote(vault, name=candidate_name, update_cache=False)
        for candidate_name in _linked_note_names_from_content(content))
    return [
        candidate.name
        for candidate in candidate_notes
        if note_is_of_type(candidate, PersonalNoteTypeEnum.NOTATION_NOTE)
    ]

In [None]:
#| export
def _get_links_to_notat_notes(
        mf: MarkdownFile, # The MarkdownFile representing the notation note.
        vault: PathLike,
        directly_linked_notes: dict[str, set[NoteLinkEnum]],
        parsed: NotationNoteParsed,
        ) -> None:
    """
    Add to `directly_linked_notes` the names of the notes linked by a notation note:

    1. notation notes in the trailing bulleted list (`NoteLinkEnum.NOTAT_TO_NOTAT`),
    2. notation notes linked in the main content (`NoteLinkEnum.NOTAT_TO_NOTAT`),
    3. info notes linked via Wikistyle links (`NoteLinkEnum.NOTAT_TO_INFO`).

    `directly_linked_notes` is modified in place; nothing is returned.

    Helper function to `NotatNoteData.from_note`
    """
    # 1. Identify notation notes in the trailing bulleted list within `parsed`
    # TODO: figure out if the main note of a linked notat note should be added
    # with NOTAT_TO_INFO_VIA_NOTAT during the "second pass".
    for _, linked_notat_note_name in parsed.linked_notation_notes:
        _update_dict(
            directly_linked_notes, linked_notat_note_name, NoteLinkEnum.NOTAT_TO_NOTAT)

    # 2. Identify notation notes linked in the content.
    content = str(parsed.main_content_markdown_file)
    for linked_notat_note_name in _linked_notat_note_names_from_content(content, vault):
        _update_dict(
            directly_linked_notes, linked_notat_note_name, NoteLinkEnum.NOTAT_TO_NOTAT)

    # 3. Identify info notes linked in the content.
    abstract_link = ObsidianLink(
        is_embedded=False,
        file_name=-1, anchor=-1, custom_text=-1, link_type=LinkType.WIKILINK)
    for linked_note_name in _get_note_names_from_abstract_link(abstract_link, str(mf)):
        linked_note = VaultNote(vault=vault, name=linked_note_name)
        # NOTAT_TO_INFO marks a link to an *info* note, so the target must be an
        # information note (the check previously tested for notation notes).
        if (linked_note.exists() and
                type_of_note(linked_note) == PersonalNoteTypeEnum.STANDARD_INFORMATION_NOTE):
            _update_dict(
                directly_linked_notes, linked_note_name, NoteLinkEnum.NOTAT_TO_INFO)


In [None]:
#| export
@patch(cls_method=True)
def from_note(
        cls: NotatNoteData,
        note: VaultNote,
        reference: str | None,
        section: str | None,
        section_num: int | None,
        note_num_in_section: int | None,
        char_displacement_in_section: int | None,
        subsection: str | None,
        subsection_num: int | None,
        note_num_in_subsection: int | None,
        char_displacement_in_subsection: int | None,
        parsed: NotationNoteParsed|None,
        # main_note: str|None,
        # main_note_content: str|None,
        ):

    """
    Return an `InfoNoteData` object by setting up the `directly_linked_notes` attribute.
    """
    assert_note_is_of_type(note, PersonalNoteTypeEnum.NOTATION_NOTE)
    mf = MarkdownFile.from_vault_note(note)
    # TODO get the linked notes
    directly_linked_notes: dict[str, set[NoteLinkEnum]] = {}
    _get_links_to_notat_notes(mf, note.vault, directly_linked_notes)

    # content = str(process_standard_information_note(mf))
    return cls(
        reference, section, section_num, note_num_in_section, char_displacement_in_section,
        subsection, subsection_num, note_num_in_subsection, char_displacement_in_subsection,
        note_name=note.name, note_content=str(parsed.main_content_markdown_file),
        directly_linked_notes=directly_linked_notes,
        parsed=parsed, main_note=parsed.name_of_main_note)
        # main_note_content=main_note_content)

In [None]:
#| export
class NotePairData(TypedDict):
    """
    A pair of notes together with a type of link recorded for the pair.
    """
    origin_note: NoteData  # The note containing the link.
    relied_note: NoteData  # The note being linked to.
    linked_type: NoteLinkEnum  # The type of link from `origin_note` to `relied_note`.

### Going through reference to gather note data

In [None]:
#| export
def note_data_from_index_note(
        vault: PathLike,
        reference: str,
        index_note_name: str,
        starting_char: int,
        ) -> dict[str, NoteData]:
    """
    Return `NoteData` concerning the info notes linked to an index note
    in a reference along with the associated notation notes.

    NOTE(review): this function is unfinished. It currently only loads the
    index note, discards the result of `get_headings_and_text`, never uses
    `reference` or `starting_char`, and implicitly returns `None` rather than
    the annotated `dict[str, NoteData]`.
    """
    # Load the index note's file from the vault.
    index_note = VaultNote(vault, name=index_note_name)
    mf = MarkdownFile.from_vault_note(index_note)
    # NOTE(review): `mf` is passed as an argument to its own method and the
    # return value is dropped -- confirm the intended call.
    mf.get_headings_and_text(mf)

In [None]:
#| export
def note_data_from_reference(
        vault: PathLike,
        reference: str
        ) -> dict[str, NoteData]:
    """
    Placeholder for gathering `NoteData` across a whole reference.

    Not yet implemented: both arguments are ignored and `None` is returned,
    despite the annotated return type.
    """
    return None


SyntaxError: incomplete input (1321492353.py, line 6)