In [None]:
#| default_exp machine_learning.note_data

# machine_learning.note_data
> Functions for preliminarily gathering data about definition and notation notes and how they might be related.

In [None]:
#| export
from abc import ABC, abstractmethod
import copy
from enum import Enum
from pathlib import Path
from os import PathLike
import re
from typing import Literal, NamedTuple, Optional, Self, TypedDict, TypeVar, Union

from fastcore.basics import patch

from trouver.helper.latex.augment import (
augment_text, choose_modification_methods_at_random, remove_font_styles_at_random, change_font_styles_at_random, change_greek_letters_at_random, remove_math_keywords, random_latex_command_removal, random_word_removal, dollar_sign_manipulation, random_char_modification
)
from trouver.helper.regex import find_regex_in_text, latex_indices
from trouver.obsidian.file import MarkdownFile
from trouver.obsidian.links import links_from_text, LinkType, ObsidianLink, MARKDOWNLINK_CAPTURE_PATTERN
from trouver.notation.in_standard_info_note import notation_notes_linked_in_see_also_section
from trouver.notation.parse import NotationNoteParsed, parse_notation_note, notation_in_note, main_of_notation
from trouver.personal_vault.note_processing import process_standard_information_note, ProcessNoteError
from trouver.personal_vault.note_type import (
    PersonalNoteTypeEnum, assert_note_is_of_type, note_is_of_type, type_of_note)

from trouver.personal_vault.notes import (
    notes_linked_in_note,  notes_linked_in_notes_linked_in_note)
from trouver.personal_vault.reference import index_note_for_reference, all_paths_to_notes_in_reference_folder
from trouver.obsidian.vault import VaultNote


In [None]:
from unittest.mock import MagicMock
from unittest.mock import patch as mock_patch

from fastcore.test import *

from nbdev.showdoc import show_doc

## Gather ML data

There are different "levels" at which gathering linking data makes sense. In particular, links are typically, but not always, made between notes in the same "reference folder", i.e. a group of notes belonging to a single mathematical text. Moreover, it is easier to mark a "positive" instance of linking (by creating the appropriate link) than it is to mark a "negative" instance of linking --- in practice, it is convenient to assume that most pairs of notes without links between them are not positive instances of linking.


### `NoteLinkEnum` and `NoteData` classes

In [None]:
#| export

class NoteLinkEnum(Enum):
    r"""
    The type of (not necessarily direct) link specified by two notes.

    In the member names, `INFO` refers to an info note and `NOTAT` refers to a
    notation note; the name reads as "<origin>_TO_<target>_<how>".
    """
    # No link is recorded between the two notes.
    NO_LINK = 0
    # Typically, a Wikistyle link to the other info note in the content of the info note.
    INFO_TO_INFO_IN_CONTENT = 1
    # A Wikistyle link to the other info note in the "See Also" section.
    INFO_TO_INFO_IN_SEE_ALSO = 2
    # When a notation note is embedded into an info note, typically in a footnote,
    # this enum signifies the notation note's main info note as being indirectly
    # linked by the info note.
    INFO_TO_INFO_VIA_NOTAT = 3
    # When a notation note is embedded into an info note, typically in a footnote.
    INFO_TO_NOTAT_VIA_EMBEDDING = 4
    # A Wikistyle link to an info note in the notation note's content.
    # (Currently, this is not a type of link that is focused on.)
    NOTAT_TO_INFO = 5
    # When a notation note (relied) is linked in a notation note (origin), typically
    # as a Markdownstyle link, this enum signifies the main info note of relied as
    # being indirectly linked by origin.
    NOTAT_TO_INFO_VIA_NOTAT = 6
    # When a notation note is linked in a notation note, typically as a Markdownstyle link.
    NOTAT_TO_NOTAT = 7



In [None]:
#| export
# class LinkDataTuple(NamedTuple):
#     linked_note_name: str
#     link_type: NoteLinkEnum

In [None]:
#| export
def _replace_underscores_with_spaces(text: str|None) -> str|None:
    """
    Replace all underscores in the input string with spaces.

    This is a helper function to the `NoteData` constructor.

    Args:
    text (str): The input string containing underscores.

    Returns:
    str: A new string with underscores replaced by spaces.
    """
    if text is None:
        return text
    return text.replace("_", " ")


In [None]:
#| hide
# Underscores become spaces, and `None` is passed through unchanged.
test_text = 'this_is_a_name'
test_eq(_replace_underscores_with_spaces(test_text), 'this is a name')
test_eq(_replace_underscores_with_spaces(None), None)

In [None]:
#| export


# Names of the `NoteData` attributes that describe where a note sits inside its
# reference. The order here is the order used by `positional_data_string`.
POSITION_DATA_ATTRIBUTES = [
    'section',
    'section_num',
    'note_num_in_section',
    'char_displacement_in_section',
    'subsection',
    'subsection_num',
    'note_num_in_subsection',
    'char_displacement_in_subsection',
]
class NoteData(ABC):
    """
    Abstract base class for the data gathered about a single note: where it sits
    in its reference, its content, its tags, and which notes it links to.

    Subclasses must implement `data_string`.
    """
    reference: str|None  # Name of the reference

    # For the following attributes, the "note" more accurately refers to the note itself if the
    # note is an info note or to the main note of the note if the note is a notation note.
    section: str|None # Name of the chapter/section that the note belongs to.
    section_num: int|None # The number n such that `section` is the nth section in `reference`.
    note_num_in_section: int|None # The number n such that the note is the nth note in `section`.
    char_displacement_in_section: int|None # The number n such that the note roughly starts off with the nth character of section.
    subsection: str|None # Name of the section/subsection that the note belongs to.
    subsection_num: int|None # The number n such that `subsection` is the nth subsection in `section`.
    note_num_in_subsection: int|None # The number n such that the note is the nth note in `subsection`.
    char_displacement_in_subsection: int|None # The number n such that the note roughly starts off with the nth character of subsection.

    note_name: str
    note_content: str  # Typically, an output of `str(process_standard_information_note)`.
    directly_linked_notes: dict[str, set[NoteLinkEnum]]  # The names of the notes that the note directly links to along with the type of link
    reverse_linked_notes: dict[str, set[NoteLinkEnum]] # The names of the notes that link to the note that this `NoteData` instance represents and the type of link (from the other note). Use the `find_reverse_links` function to add data here.
    tags: set[str] # The tags in the YAML frontmatter metadata of the note.


    def __init__(
            self,
            reference: str | None,
            section: str | None,
            section_num: int | None,
            note_num_in_section: int | None,
            char_displacement_in_section: int | None,
            subsection: str | None,
            subsection_num: int | None,
            note_num_in_subsection: int | None,
            char_displacement_in_subsection: int | None,
            note_name: str,
            note_content: str,
            directly_linked_notes: dict[str, set[NoteLinkEnum]],
            tags: set[str] | None,
            ):
        """
        Store the given data on the instance.

        Underscores in `section` and `subsection` are replaced by spaces; every
        other argument is stored as given. The `reverse_linked_notes` attribute
        is not populated during construction (it starts out as an empty dict).
        Use `find_reverse_links` to add data there.
        """
        # Identity and content of the note.
        self.reference = reference
        self.note_name = note_name
        self.note_content = note_content
        self.tags = tags

        # Position within the section.
        self.section = _replace_underscores_with_spaces(section)
        self.section_num = section_num
        self.note_num_in_section = note_num_in_section
        self.char_displacement_in_section = char_displacement_in_section

        # Position within the subsection.
        self.subsection = _replace_underscores_with_spaces(subsection)
        self.subsection_num = subsection_num
        self.note_num_in_subsection = note_num_in_subsection
        self.char_displacement_in_subsection = char_displacement_in_subsection

        # Link data; the reverse direction is filled in after construction.
        self.directly_linked_notes = directly_linked_notes
        self.reverse_linked_notes = {}

    def direct_links_to_note(self, relied_note_name: str) -> set[NoteLinkEnum]:
        """
        Return the link types recorded from this note to `relied_note_name`,
        or `{NoteLinkEnum.NO_LINK}` if no link to that note is recorded.
        """
        return self.directly_linked_notes.get(
            relied_note_name, {NoteLinkEnum.NO_LINK})

    @classmethod
    # @abstractmethod
    def from_note(cls, note: VaultNote):
        """
        Placeholder constructor from a `VaultNote`; this base version does
        nothing and returns `None`.
        """
        return None


    def _metadata_string(
            self,
            items: list[str]) -> str:
        """
        Return one `'<name>: <value>'` line per attribute named in `items`,
        skipping attributes that are missing or `None`.

        Helper function for `positional_data_string`.
        """
        named_values = ((name, getattr(self, name, None)) for name in items)
        return '\n'.join(
            f'{name}: {value}'
            for name, value in named_values
            if value is not None)


    # @patch
    def positional_data_string(
            self) -> str:
        """
        Return a string of the positional data of this object relevant for training and prediction.
        """
        return self._metadata_string(POSITION_DATA_ATTRIBUTES)

    @abstractmethod
    def data_string(
            self,
            format: Literal['bert', 't5'], # The model type for which the data string is used. This affects the formatting/special tokens used.
            ) -> str:
        """
        Return a string of the data of this object relevant for training and prediction on an NLP model.
        """


    def deepcopy(self):
        """
        Return a deep copy of this object (via `copy.deepcopy`).
        """
        duplicate = copy.deepcopy(self)
        return duplicate


In [None]:
#| export
def note_data_order_cmp(
        note_data_1: NoteData,
        note_data_2: NoteData,
        ) -> Literal[1, 0, -1] | None: # -1 if (the main information note associated with) `note_data_1` precedes (that of) `note_data_2`. 1 if `note_data_2` precedes `note_data_1`. 0 if `note_data_1` and `note_data_2` seem to come from the same information note. `None` if the positions of the two notes that the `NoteData` objects represent are not comparable (e.g. the positional data is not recorded, or the references are different).
    """
    Determine which (information note of) note precedes the (information note of the) other note.

    The notes are ordered first by `section_num` and then by
    `note_num_in_section`. `None` is returned when the two notes cannot be
    compared: either reference is `None`, the references differ, or a positional
    number needed to decide the order is `None` on either side.
    """
    if note_data_1.reference is None or note_data_2.reference is None:
        return None
    if note_data_1.reference != note_data_2.reference:
        return None
    # Compare lexicographically by (section_num, note_num_in_section). A missing
    # number makes the notes incomparable at that level instead of raising a
    # `TypeError` from comparing `None` with `<`.
    for attribute in ('section_num', 'note_num_in_section'):
        value_1 = getattr(note_data_1, attribute)
        value_2 = getattr(note_data_2, attribute)
        if value_1 is None or value_2 is None:
            return None
        if value_1 < value_2:
            return -1
        if value_1 > value_2:
            return 1
    return 0

In [None]:
#| export
@patch
def randomly_modify(
        self: NoteData,
        augmentation: Optional[Literal['high', 'mid', 'low']], # The augmentation strength. NOTE(review): any value other than 'low' or 'mid' (including `None`) currently selects the 'high' settings --- confirm that `None` is not meant to disable augmentation.
        erase_position_metadata: bool = False, # If `True`, erase the 'section', 'section_num', etc. metadata.
        # attributes_to_modify: list[str] = POSITION_DATA_ATTRIBUTES,
        ) -> None:
    """
    Modify this object at random for data augmentation.

    The attributes listed in `POSITION_DATA_ATTRIBUTES` and `note_content` are
    overwritten in place. The changes made are essentially permanent, so only
    apply this method to copies (See `deepcopy`).
    """
    # Optionally blank out all positional metadata; the loop further below then
    # skips these attributes because they are `None`.
    if erase_position_metadata:
        for item in POSITION_DATA_ATTRIBUTES:
            setattr(self, item, None)
    # Translate the augmentation level into the two knobs passed to
    # `choose_modification_methods_at_random` and the int modifiers.
    if augmentation == 'low':
        method_inclusion_chance = 0.3
        scale = 0.5
    elif augmentation == 'mid':
        method_inclusion_chance = 0.5
        scale = 1.0
    else:
        method_inclusion_chance = 0.8
        scale = 1.5
    # Perturb each positional attribute that is present.
    for item in POSITION_DATA_ATTRIBUTES:
        attribute = getattr(self, item, None)
        if attribute is None:
            continue
        elif isinstance(attribute, str):
            # String attributes get text augmentation.
            methods = [
                (remove_font_styles_at_random, 0.1), (change_font_styles_at_random, 0.2), (change_greek_letters_at_random, 0.1), 
                (remove_math_keywords,0.1), (random_latex_command_removal,0.2),
                (random_word_removal,0.1), (dollar_sign_manipulation,0.05),
                (random_char_modification, 0.01)]
            random_methods = choose_modification_methods_at_random(
                methods, method_inclusion_chance, scale)
            modified_attribute = augment_text(attribute, random_methods)
        elif item.startswith('char_displacement'): 
            # NOTE(review): `modify_int_by_at_most_at_most_offset` is neither defined
            # nor imported in the visible part of this file --- confirm that it
            # exists under this exact name.
            modified_attribute = modify_int_by_at_most_at_most_offset(
                attribute, offset=0.10*scale, lower_bound=0)
        else: # presumably isinstance(attribute, int) holds here
            # NOTE(review): `modify_int_by_at_most_at_most_value` is likewise not
            # visible from here --- confirm.
            modified_attribute = modify_int_by_at_most_at_most_value(
                attribute, value=int(5*scale), lower_bound=1)
        setattr(self, item, modified_attribute)
    # Finally, augment the note content itself. This list is the same as the one
    # used for string attributes above except that `random_char_modification` is
    # paired with 0.001 rather than 0.01.
    methods = [
        (remove_font_styles_at_random, 0.1), (change_font_styles_at_random, 0.2), (change_greek_letters_at_random, 0.1), 
        (remove_math_keywords,0.1), (random_latex_command_removal,0.2),
        (random_word_removal,0.1), (dollar_sign_manipulation,0.05),
        (random_char_modification, 0.001)]
    random_methods = choose_modification_methods_at_random(
        methods, method_inclusion_chance, scale)
    self.note_content = augment_text(self.note_content, random_methods)
    

In [None]:
#| export

In [None]:
#| export
K = TypeVar('K')
V = TypeVar('V')
def _update_dict(
        dict: dict[K, set[V]],
        key: K, 
        value: V,
        ) -> None:
    """
    Appropriately add the values to a set in the dict, or create a new set if needed.
    """
    if key not in dict:
        dict[key] = set()
    dict[key].add(value)



In [None]:
#| hide
# A new key gets a fresh set; an existing key has the value added to its set.
dicty = {}
_update_dict(dicty, 'key1', 'value1')
test_eq(dicty, {'key1': {'value1'}})
_update_dict(dicty, 'key1', 'value2')
test_eq(dicty, {'key1': {'value1', 'value2'}})
_update_dict(dicty, 'key2', 'value3')
test_eq(dicty, {'key1': {'value1', 'value2'}, 'key2': {'value3'}})

In [None]:
#| export
class InfoNoteData(NoteData):
    """
    The `NoteData` subclass representing the linked note data for a standard
    information note.

    The constructor takes the same arguments as `NoteData` and forwards them
    unchanged.
    """
    def __init__(
            self,
            reference: str | None,
            section: str | None,
            section_num: int | None,
            note_num_in_section: int | None,
            char_displacement_in_section: int | None,
            subsection: str | None,
            subsection_num: int | None,
            note_num_in_subsection: int | None,
            char_displacement_in_subsection: int | None,
            note_name: str,
            note_content: str,
            directly_linked_notes: dict[str, set[NoteLinkEnum]],
            tags: set[str] | None,
            ):
        super().__init__(
            reference=reference,
            section=section,
            section_num=section_num,
            note_num_in_section=note_num_in_section,
            char_displacement_in_section=char_displacement_in_section,
            subsection=subsection,
            subsection_num=subsection_num,
            note_num_in_subsection=note_num_in_subsection,
            char_displacement_in_subsection=char_displacement_in_subsection,
            note_name=note_name,
            note_content=note_content,
            directly_linked_notes=directly_linked_notes,
            tags=tags)

    def data_string(
            self,
            format: Literal['bert', 't5'], # The model type for which the data string is used. This affects the formatting/special tokens used.
            ) -> str:
        """
        Return the positional data followed by the note content.

        For the `'bert'` format a `[SEP]` line is placed between the two parts;
        for any other format they are separated by a single newline.
        """
        header = super().positional_data_string()
        separator = '\n[SEP]\n' if format == 'bert' else '\n'
        return f"{header}{separator}{self.note_content}"
            

In [None]:
# Build an `InfoNoteData` by hand (no vault needed) to exercise `direct_links_to_note`.
sample_note_data = InfoNoteData(
    reference='Sample_Reference_Book',
    section='Chapter_3_Data_Structures',
    section_num=3,
    note_num_in_section=5,
    char_displacement_in_section=1200,
    subsection='3.2_Lists_and_Tuples',
    subsection_num=2,
    note_num_in_subsection=2,
    char_displacement_in_subsection=450,
    note_name='List_Comprehensions',
    note_content='List comprehensions provide a concise way to create lists...',
    directly_linked_notes={
        'info_note_1': {NoteLinkEnum.INFO_TO_INFO_IN_CONTENT},
        'info_note_2': {NoteLinkEnum.INFO_TO_INFO_IN_SEE_ALSO},
        'info_note_3': {NoteLinkEnum.INFO_TO_INFO_IN_CONTENT, NoteLinkEnum.INFO_TO_INFO_VIA_NOTAT},
        'notat_note_1': {NoteLinkEnum.INFO_TO_NOTAT_VIA_EMBEDDING}
    },
    tags=None
)

# A note with a single recorded link type.
test_eq(
    sample_note_data.direct_links_to_note('info_note_1'),
    {NoteLinkEnum.INFO_TO_INFO_IN_CONTENT})

# A note with several recorded link types: all of them are returned.
test_eq(
    sample_note_data.direct_links_to_note('info_note_3'),
    {NoteLinkEnum.INFO_TO_INFO_IN_CONTENT, NoteLinkEnum.INFO_TO_INFO_VIA_NOTAT})

# A note that is not linked at all yields the `NO_LINK` marker.
test_eq(
    sample_note_data.direct_links_to_note('non_existent'),
    {NoteLinkEnum.NO_LINK})


In [None]:
#| export
def _get_see_also_note_links(
        mf: MarkdownFile, # The MarkdownFile representing the info note.
        vault: PathLike,
        directly_linked_notes: dict[str, set[NoteLinkEnum]], # This will eventually be the `directly_linked_notes` attribute of the `InfoNoteData` object representing the info note.
        ) -> None:
    """
    Record the info notes listed in the `'See Also'` section of an info note.

    The section is expected to be a bulleted list whose items look like
    `'- [[note_name]] <- Optional description of the note>'`. Only the wikilink
    that opens a bullet is considered, so links inside an optional description
    are ignored. Linked notes that do not exist or that are not standard
    information notes (e.g. notation notes) are ignored as well. Every remaining
    note name is added to `directly_linked_notes` with
    `NoteLinkEnum.INFO_TO_INFO_IN_SEE_ALSO`.

    Helper function to `InfoNoteData.from_note`
    """
    # 1.1. Take the 'See Also' section out of `mf` and render it as text.
    # NOTE(review): `remove_section` presumably also strips the section from
    # `mf` itself --- confirm against `MarkdownFile`.
    see_also_text = str(MarkdownFile(mf.remove_section('See Also')))

    # 1.2. Match non-embedded wikilinks that directly follow a bullet marker.
    wikilink_template = ObsidianLink(
        is_embedded=False,
        file_name=-1, anchor=-1, custom_text=-1, link_type=LinkType.WIKILINK)
    bullet_link_regex = re.compile(
        f'^- {wikilink_template.to_regex()}', flags=re.MULTILINE)
    spans = find_regex_in_text(see_also_text, bullet_link_regex)
    # `start+2` skips the leading '- ' so that only the link text is parsed.
    note_names = [
        ObsidianLink.from_text(see_also_text[start+2:end]).file_name
        for start, end in spans]

    # 1.3. Keep only the links to existing standard information notes.
    for note_name in note_names:
        linked_note = VaultNote(vault=vault, name=note_name)
        is_info_note = (
            linked_note.exists() and
            type_of_note(linked_note) == PersonalNoteTypeEnum.STANDARD_INFORMATION_NOTE)
        if is_info_note:
            _update_dict(directly_linked_notes, note_name, NoteLinkEnum.INFO_TO_INFO_IN_SEE_ALSO)



In [None]:
#| hide
# Two bulleted links in 'See Also': the first is treated as a notation note and
# the second as an info note. Only the info note should be recorded.
text = r'''
# Topic[^1]
asdfasdf
# See Also
- [[notat_note]]
- [[info_note]]
'''
mf = MarkdownFile.from_string(text)

# `VaultNote` and `type_of_note` are mocked so that no vault on disk is needed.
with mock_patch('__main__.VaultNote') as mock_vault_note, \
     mock_patch('__main__.type_of_note') as mock_type_of_note:

    # `exists()` on a bare `MagicMock` returns another (truthy) `MagicMock`.
    mock_notat_note = MagicMock()
    mock_info_note = MagicMock()
    # The side effects are consumed in the order in which the links are visited.
    mock_vault_note.side_effect = [mock_notat_note, mock_info_note]
    mock_type_of_note.side_effect = [
        PersonalNoteTypeEnum.NOTATION_NOTE,
        PersonalNoteTypeEnum.STANDARD_INFORMATION_NOTE]

    directly_linked_notes = {}
    _get_see_also_note_links(mf, Path('/fake/vault'), directly_linked_notes)
    test_eq(directly_linked_notes, {'info_note': {NoteLinkEnum.INFO_TO_INFO_IN_SEE_ALSO}})

In [None]:
# Same as the previous test, but the first bullet carries a description that
# contains another link. That trailing link must not be looked up: only two
# side effects are provided, so a third lookup would raise `StopIteration`.
text = r'''
# Topic[^1]
asdfasdf
# See Also
- [[notat_note]] - [[do_not_get_info_from_this|text]]
- [[info_note]]
'''
mf = MarkdownFile.from_string(text)

# `VaultNote` and `type_of_note` are mocked so that no vault on disk is needed.
with mock_patch('__main__.VaultNote') as mock_vault_note, \
     mock_patch('__main__.type_of_note') as mock_type_of_note:

    mock_notat_note = MagicMock()
    mock_info_note = MagicMock()
    mock_vault_note.side_effect = [mock_notat_note, mock_info_note]
    mock_type_of_note.side_effect = [
        PersonalNoteTypeEnum.NOTATION_NOTE,
        PersonalNoteTypeEnum.STANDARD_INFORMATION_NOTE]

    directly_linked_notes = {}
    _get_see_also_note_links(mf, Path('/fake/vault'), directly_linked_notes)
    test_eq(directly_linked_notes, {'info_note': {NoteLinkEnum.INFO_TO_INFO_IN_SEE_ALSO}})

In [None]:
#| export
def _get_note_names_from_abstract_link(
        abstract_link: ObsidianLink,
        note_text: str
        ) -> list[str]:
    """
    Return the `file_name` of every link in `note_text` that matches the regex
    of `abstract_link`, in the order in which the matches are reported by
    `find_regex_in_text`.
    """
    spans = find_regex_in_text(note_text, abstract_link.to_regex())
    matched_links = [
        ObsidianLink.from_text(note_text[begin:stop]) for begin, stop in spans]
    return [matched_link.file_name for matched_link in matched_links]

In [None]:
#| hide

In [None]:
#| export
def _get_notation_embedded_note_links(
        mf: MarkdownFile, # The MarkdownFile representing the info note.
        vault: PathLike,
        directly_linked_notes: dict[str, set[NoteLinkEnum]],
        ):
    """
    Record the notation notes that are embedded anywhere in an info note.

    Every embedded wikilink in the text of `mf` is looked up in `vault`. If the
    linked note exists and is a notation note, its name is added to
    `directly_linked_notes` with `NoteLinkEnum.INFO_TO_NOTAT_VIA_EMBEDDING`.

    Helper function to `InfoNoteData.from_note`
    """
    embedded_wikilink = ObsidianLink(
        is_embedded=True,
        file_name=-1, anchor=-1, custom_text=-1, link_type=LinkType.WIKILINK)

    for note_name in _get_note_names_from_abstract_link(embedded_wikilink, str(mf)):
        linked_note = VaultNote(
            vault=vault, name=note_name, update_cache=False)
        # Skip links whose target is missing from the vault.
        if not linked_note.exists():
            continue
        # 2.1. Only embedded links to notation notes are recorded.
        if type_of_note(linked_note) == PersonalNoteTypeEnum.NOTATION_NOTE:
            _update_dict(directly_linked_notes, note_name, NoteLinkEnum.INFO_TO_NOTAT_VIA_EMBEDDING)

In [None]:
#| hide

# The note contains one plain link and one embedded link (inside a footnote).
# Only the embedded link should be recorded.
text = r'''
# Topic[^1]

asdfasdf[^2]

[[non_embedded_note]]

[^2]: ![[embedded_notat_note]]
'''
mf = MarkdownFile.from_string(text)

# `VaultNote` and `type_of_note` are mocked so that no vault on disk is needed.
with mock_patch('__main__.VaultNote') as mock_vault_note, \
     mock_patch('__main__.type_of_note') as mock_type_of_note:

    mock_non_embedded_note = MagicMock()
    mock_non_embedded_note.exists.return_value = True
    mock_embedded_notation_note = MagicMock()
    mock_embedded_notation_note.exists.return_value = True
    # NOTE(review): `type_of_note` has a single side effect, so presumably only
    # one `VaultNote` is created (for the embedded link) and it receives the
    # first mock in this list, regardless of that mock's name --- confirm.
    mock_vault_note.side_effect = [mock_non_embedded_note, mock_embedded_notation_note]

    mock_type_of_note.side_effect = [
        PersonalNoteTypeEnum.NOTATION_NOTE]
    directly_linked_notes = {}
    _get_notation_embedded_note_links(mf, Path('/fake/vault'), directly_linked_notes)
    test_eq(
        directly_linked_notes,
        {'embedded_notat_note': {NoteLinkEnum.INFO_TO_NOTAT_VIA_EMBEDDING}})
        

In [None]:
#| export
def _get_links_to_info_notes_in_content(
        mf: MarkdownFile, # The MarkdownFile representing the info note.
        vault: PathLike,
        directly_linked_notes: dict[str, set[NoteLinkEnum]],
        ):
    """
    Record the info notes that are linked throughout the content of an info note.

    Every non-embedded wikilink in the text of `mf` is looked up in `vault`. If
    the linked note exists and is a standard information note, its name is added
    to `directly_linked_notes` with `NoteLinkEnum.INFO_TO_INFO_IN_CONTENT`.

    Helper function to `InfoNoteData.from_note`
    """
    plain_wikilink = ObsidianLink(
        is_embedded=False,
        file_name=-1, anchor=-1, custom_text=-1, link_type=LinkType.WIKILINK)

    for note_name in _get_note_names_from_abstract_link(plain_wikilink, str(mf)):
        linked_note = VaultNote(
            vault=vault, name=note_name, update_cache=False)
        # Skip links whose target is missing from the vault.
        if not linked_note.exists():
            continue
        # Only links to standard information notes are recorded.
        if type_of_note(linked_note) == PersonalNoteTypeEnum.STANDARD_INFORMATION_NOTE:
            _update_dict(directly_linked_notes, note_name, NoteLinkEnum.INFO_TO_INFO_IN_CONTENT)

In [None]:
#| hide
# The note contains a single plain wikilink, which is treated as an info note
# and should therefore be recorded.
text = r'''
# Topic[^1]

asdfasdf

[[non_embedded_note]]

'''
mf = MarkdownFile.from_string(text)

# `VaultNote` and `type_of_note` are mocked so that no vault on disk is needed.
with mock_patch('__main__.VaultNote') as mock_vault_note, \
     mock_patch('__main__.type_of_note') as mock_type_of_note:

    mock_non_embedded_note = MagicMock()
    mock_non_embedded_note.exists.return_value = True
    mock_embedded_notation_note = MagicMock()
    mock_embedded_notation_note.exists.return_value = True
    # NOTE(review): the text has only one link, so the second entry of each
    # side-effect list below is presumably never consumed --- confirm.
    mock_vault_note.side_effect = [mock_non_embedded_note, mock_embedded_notation_note]

    mock_type_of_note.side_effect = [
        PersonalNoteTypeEnum.STANDARD_INFORMATION_NOTE,
        PersonalNoteTypeEnum.NOTATION_NOTE,
        ]
    directly_linked_notes = {}
    _get_links_to_info_notes_in_content(mf, Path('/fake/vault'), directly_linked_notes)
    test_eq(
        directly_linked_notes,
        {'non_embedded_note': {NoteLinkEnum.INFO_TO_INFO_IN_CONTENT}})
        

In [None]:
#| export
@patch(cls_method=True)
def from_note(
        cls: InfoNoteData,
        note: VaultNote,
        reference: str | None,
        section: str | None,
        section_num: int | None,
        note_num_in_section: int | None,
        char_displacement_in_section: int | None,
        subsection: str | None,
        subsection_num: int | None,
        note_num_in_subsection: int | None,
        char_displacement_in_subsection: int | None,
        ):
    """
    Return an `InfoNoteData` object by setting up the `directly_linked_notes` attribute.

    `note` must be a standard information note; this is asserted first. The
    positional arguments (`reference`, `section`, ...) are passed through to the
    constructor unchanged, while the tags, the links and the content are read
    from the note itself.

    In practice, the `_find_indirect_links_to_info_notes_via_notat_notes` helper
    function should be applied (see also `note_data_from_reference`) for
    `directly_linked_notes` to really be complete.

    When looking at links in the see also section of the information note, only the
    link of the "subject" note (i.e. the note that starts off the line, so the bulleted
    item looks like "- [[subject_note_name]] - <comments/description...>") is
    considered.
    """
    assert_note_is_of_type(note, PersonalNoteTypeEnum.STANDARD_INFORMATION_NOTE)
    mf = MarkdownFile.from_vault_note(note) 
    tags = mf.tags()
    directly_linked_notes: dict[str, set[NoteLinkEnum]] = {}
    # The steps below successively modify `mf`, so their order matters.
    # 1. get see also note links
    _get_see_also_note_links(mf, note.vault, directly_linked_notes)
    # 2. get notation embedded note links
    # NOTE(review): step 1 already calls `mf.remove_section('See Also')`; this
    # second call is presumably a harmless no-op --- confirm.
    mf.remove_section('See Also')
    _get_notation_embedded_note_links(mf, note.vault, directly_linked_notes)
    # 3. get other note links in the content
    mf.remove_footnotes_to_embedded_links()
    _get_links_to_info_notes_in_content(mf, note.vault, directly_linked_notes)
    # The stored content is the processed form of what is left of `mf`.
    content = str(process_standard_information_note(mf, note.vault))
    return cls(
        reference, section, section_num, note_num_in_section, char_displacement_in_section, subsection, subsection_num, note_num_in_subsection, char_displacement_in_subsection, note_name=note.name, note_content=content, directly_linked_notes=directly_linked_notes, tags=tags)

In [None]:
show_doc(InfoNoteData.positional_data_string)

---

[source](https://github.com/hyunjongkimmath/trouver/blob/main/trouver/markdown/obsidian/personal/machine_learning/note_data.py#L166){target="_blank" style="float:right; font-size:smaller"}

### NoteData.positional_data_string

>      NoteData.positional_data_string ()

*Return a string of the positional data of this object relevant for training and prediction.*

In [None]:
show_doc(InfoNoteData.data_string)

---

[source](https://github.com/hyunjongkimmath/trouver/blob/main/trouver/markdown/obsidian/personal/machine_learning/note_data.py#L326){target="_blank" style="float:right; font-size:smaller"}

### InfoNoteData.data_string

>      InfoNoteData.data_string (format:Literal['bert','t5'])

*Return a string of the data of this object relevant for training and prediction on an NLP model.*

|    | **Type** | **Details** |
| -- | -------- | ----------- |
| format | Literal | The model type for whcih the data string is used. This affects the formatting/special tokens used. |
| **Returns** | **str** | **note_data: Optional[dict[str, NoteData]] = None,** |

In [None]:
# Build an `InfoNoteData` by hand and show its data string in both formats.
sample_note_data = InfoNoteData(
    reference='Sample_Reference_Book',
    section='Chapter_3_Data_Structures',
    section_num=3,
    note_num_in_section=5,
    char_displacement_in_section=1200,
    subsection='3.2_Lists_and_Tuples',
    subsection_num=2,
    note_num_in_subsection=2,
    char_displacement_in_subsection=450,
    note_name='List_Comprehensions',
    note_content='List comprehensions provide a concise way to create lists...',
    directly_linked_notes={
        'info_note_1': {NoteLinkEnum.INFO_TO_INFO_IN_CONTENT},
        'info_note_2': {NoteLinkEnum.INFO_TO_INFO_IN_SEE_ALSO},
        'info_note_3': {NoteLinkEnum.INFO_TO_INFO_IN_CONTENT, NoteLinkEnum.INFO_TO_INFO_VIA_NOTAT},
        'notat_note_1': {NoteLinkEnum.INFO_TO_NOTAT_VIA_EMBEDDING}
    },
    tags=None
)

# The 'bert' format puts a `[SEP]` line between the positional data and the
# content; the 't5' format separates them with a newline only.
print(sample_note_data.data_string(format='bert'))
print(sample_note_data.data_string(format='t5'))

section: Chapter 3 Data Structures
section_num: 3
note_num_in_section: 5
char_displacement_in_section: 1200
subsection: 3.2 Lists and Tuples
subsection_num: 2
note_num_in_subsection: 2
char_displacement_in_subsection: 450
[SEP]
List comprehensions provide a concise way to create lists...
section: Chapter 3 Data Structures
section_num: 3
note_num_in_section: 5
char_displacement_in_section: 1200
subsection: 3.2 Lists and Tuples
subsection_num: 2
note_num_in_subsection: 2
char_displacement_in_subsection: 450
List comprehensions provide a concise way to create lists...


In [None]:
# TODO: test

In [None]:
#| hide

In [None]:
#| export
class NotatNoteData(NoteData):
    """
    The `NoteData` subclass representing the linked note data for a notation note.

    The attributes `section`, `section_num`, `note_num_in_section`, `char_displacement_in_section`, `subsection`,
    `subsection_num`, `note_num_in_subsection`, `char_displacement_in_subsection` are all those of the main
    note of the notation note, i.e. the note whose name is `main_note`.
    """

    parsed: NotationNoteParsed|None # The parsed notation note, if available.
    main_note: str|None # The name of the main note
    main_note_content: str|None # The content of the main note
    notation_str: str|None
    latex_in_original: str|None

    def __init__(
            self,
            reference: str | None,
            section: str | None,
            section_num: int | None,
            note_num_in_section: int | None,
            char_displacement_in_section: int | None,
            subsection: str | None,
            subsection_num: int | None,
            note_num_in_subsection: int | None,
            char_displacement_in_subsection: int | None,
            note_name: str,
            note_content: str,
            directly_linked_notes: dict[str, set[NoteLinkEnum]],
            tags: set[str] | None,
            parsed: NotationNoteParsed|None,
            main_note: str|None,
            main_note_content: str|None,
            notation_str: str|None,
            latex_in_original: str|None,
            ):
        """
        Store the notation-note specific data on top of what `NoteData` stores.
        """
        super().__init__(
            reference,
            section,
            section_num,
            note_num_in_section,
            char_displacement_in_section,
            subsection,
            subsection_num,
            note_num_in_subsection,
            char_displacement_in_subsection,
            note_name,
            note_content,
            directly_linked_notes,
            tags
        )
        self.parsed = parsed
        self.main_note = main_note
        self.main_note_content = main_note_content
        self.notation_str = notation_str
        self.latex_in_original = latex_in_original


    def data_string(
            self,
            format: Literal['bert', 't5'], # The type of model that will handle the data; this affects the formatting of the string.
            ) -> str:
        """
        Return a string of the data of this object relevant for training and prediction on an NLP model.

        For a `NotatNoteData` object, this can contain its notation string and `"latex_in_original"` attribute
        in the notation note's YAML frontmatter metadata, along with the content of the main note of the
        notation note.

        The parts are joined by newlines in this order: the positional data, the
        notation string and the first `latex_in_original` entry (both read from
        `self.parsed`, when present), a separator, the note content followed by
        another separator (when the content is non-empty), and the main note
        content (when non-empty). The separator is `'[SEP]'` for the `'bert'`
        format and `'</s>'` otherwise.
        """
        separator = '[SEP]' if format == 'bert' else '</s>'
        parts: list[str] = [self.positional_data_string()]
        if self.parsed:
            if self.parsed.notation_str:
                parts.append(f'notation_str: {self.parsed.notation_str}')
            meta = self.parsed.yaml_frontmatter_meta
            if meta and 'latex_in_original' in meta and len(meta['latex_in_original']) > 0:
                # Bound to a local first: reusing the f-string's own quote
                # character inside a replacement field is a syntax error before
                # Python 3.12.
                # NOTE(review): presumably `latex_in_original` is a list of
                # strings, of which only the first is used --- confirm.
                latex_in_original = meta['latex_in_original']
                parts.append(f'latex_in_original: {latex_in_original[0]}')
        parts.append(separator)
        if self.note_content:
            parts.append(self.note_content)
            parts.append(separator)
        if self.main_note_content:
            parts.append(f'main_note_content: {self.main_note_content}')
        return '\n'.join(parts)

In [None]:
#| export
# TODO: Maybe import these from `34_markdown.obsidian.personal.machien_learning.notation_linking.ipynb` or delete those functions if deprecating that module.

def _linked_note_names_from_content(
        content: str) -> list[str]:
    """
    Return the link targets found by `MARKDOWNLINK_CAPTURE_PATTERN` in `content`.

    For every match, the element at index 1 is taken as the link target
    (presumably the second capture group of the pattern), and a trailing
    `.md` extension is dropped when present. Order and duplicates are kept.
    """
    return [
        link_match[1].removesuffix('.md')
        for link_match in re.findall(MARKDOWNLINK_CAPTURE_PATTERN, content)]
        

def _linked_notat_note_names_from_content(
        content: str, vault: PathLike) -> list[str]:
    """
    Return the names of the notation notes that are linked in `content`.

    Every link target reported by `_linked_note_names_from_content` is
    wrapped in a `VaultNote` (without updating the cache), and only those
    which `note_is_of_type` recognizes as notation notes are kept.
    """
    # The generator builds each `VaultNote` right before it is type-checked.
    candidate_notes = (
        VaultNote(vault, name=target_name, update_cache=False)
        for target_name in _linked_note_names_from_content(content))
    return [
        candidate.name
        for candidate in candidate_notes
        if note_is_of_type(candidate, PersonalNoteTypeEnum.NOTATION_NOTE)]

In [None]:
#| export
def _get_links_to_notat_notes(
        mf: MarkdownFile, # The MarkdownFile representing the notation note.
        vault: PathLike,
        directly_linked_notes: dict[str, set[NoteLinkEnum]],
        parsed: NotationNoteParsed,
        ):
    """
    Record in `directly_linked_notes` the notes that a notation note links to.

    Three kinds of links are gathered, in this order:

    1. notation notes listed in `parsed.linked_notation_notes`
       (recorded as `NOTAT_TO_NOTAT`),
    2. notation notes linked inside `parsed.main_content_markdown_file`
       (recorded as `NOTAT_TO_NOTAT`),
    3. existing standard information notes that are wikilinked in `mf`
       (recorded as `NOTAT_TO_INFO`).

    `directly_linked_notes` is modified in place; nothing is returned.

    Helper function to `NotatNoteData.from_note`
    """
    def record(note_name, link_type):
        _update_dict(directly_linked_notes, note_name, link_type)

    # 1. Notation notes from the parsed list of linked notation notes.
    for _, listed_notat_name in parsed.linked_notation_notes:
        record(listed_notat_name, NoteLinkEnum.NOTAT_TO_NOTAT)

    # 2. Notation notes linked within the main content.
    main_content = str(parsed.main_content_markdown_file)
    for content_notat_name in _linked_notat_note_names_from_content(
            main_content, vault):
        record(content_notat_name, NoteLinkEnum.NOTAT_TO_NOTAT)

    # 3. Information notes that are wikilinked in the note's text.
    # The `-1` fields presumably act as wildcards for the abstract link.
    wikilink_template = ObsidianLink(
        is_embedded=False,
        file_name=-1, anchor=-1, custom_text=-1, link_type=LinkType.WIKILINK)
    wikilinked_names = _get_note_names_from_abstract_link(
        wikilink_template, str(mf))
    for wikilinked_name in wikilinked_names:
        candidate = VaultNote(
            vault=vault, name=wikilinked_name, update_cache=False)
        if not candidate.exists():
            continue
        if type_of_note(candidate) != PersonalNoteTypeEnum.STANDARD_INFORMATION_NOTE:
            continue
        record(wikilinked_name, NoteLinkEnum.NOTAT_TO_INFO)


In [None]:
#| hide
# A notation note body with one wikilink (to an information note) and one
# Markdown-style link (to another notation note).
text = r'''the blah blah of the [[info_note_providing_data|something]].
It is defined as the [$asd$](other_notation_note.md) blah blah.
'''
mf = MarkdownFile.from_string(text)

# NOTE: `main_content_markdown_file` is deliberately the placeholder string
# 'mf' (not the `mf` object): `_get_links_to_notat_notes` only calls `str` on
# it, so the content scan finds no links and `VaultNote` is constructed only
# once, for the wikilinked information note.
parsed = NotationNoteParsed(
    yaml_frontmatter_meta=None, notation_str='$asdf$',
    name_of_main_note='main_note',
    main_content_markdown_file='mf',
    linked_notation_notes = [('$X$', 'other_Notation_note_2.md')])

with mock_patch('__main__.VaultNote') as mock_vault_note, \
     mock_patch('__main__.type_of_note') as mock_type_of_note:

    # Configure the mock that `VaultNote(...)` actually returns; the previous
    # version configured a name that is not defined in this cell.
    mock_info_note_providing_data = MagicMock()
    mock_info_note_providing_data.exists.return_value = True
    mock_vault_note.side_effect = [mock_info_note_providing_data]

    mock_type_of_note.side_effect = [
        PersonalNoteTypeEnum.STANDARD_INFORMATION_NOTE,
        ]
    directly_linked_notes = {}

    _get_links_to_notat_notes(mf, Path('/fake/vault'), directly_linked_notes, parsed)
    print(directly_linked_notes)
    test_eq(
        directly_linked_notes,
        {'other_Notation_note_2.md': {NoteLinkEnum.NOTAT_TO_NOTAT},
         'info_note_providing_data': {NoteLinkEnum.NOTAT_TO_INFO}}
    )
        

{'other_Notation_note_2.md': {<NoteLinkEnum.NOTAT_TO_NOTAT: 7>}, 'info_note_providing_data': {<NoteLinkEnum.NOTAT_TO_INFO: 5>}}


In [None]:
#| export
@patch(cls_method=True)
def from_note(
        cls: NotatNoteData,
        note: VaultNote,
        reference: str | None,
        section: str | None,
        section_num: int | None,
        note_num_in_section: int | None,
        char_displacement_in_section: int | None,
        subsection: str | None,
        subsection_num: int | None,
        note_num_in_subsection: int | None,
        char_displacement_in_subsection: int | None,
        parsed: NotationNoteParsed|None, # The parsed notation note; if `None`, `note` is parsed with `parse_notation_note`.
        ):
    """
    Return an `NotatNoteData` object. 

    The positional arguments (`reference`, `section`, ..., 
    `char_displacement_in_subsection`) are passed to the constructor
    unchanged. The note content is obtained by applying
    `process_standard_information_note` to the parsed main content, and
    `notation_str`/`latex_in_original` are read from `parsed`
    (`latex_in_original` is the first entry of the corresponding list in the
    YAML frontmatter metadata, if there is one).

    The `main_note_content` attribute is not set up via this method
    (it is set to `None`).

    In practice, the `_find_indirect_links_to_info_notes_via_notat_notes` helper
    function should be applied (see also `note_data_from_reference`) for
    `directly_linked_notes` to really be complete.

    **Raises**
    
    - Whatever `assert_note_is_of_type` raises if `note` is not a notation
      note, and whatever `process_standard_information_note` raises
      (e.g. `ProcessNoteError`) while processing the content.
    """
    assert_note_is_of_type(note, PersonalNoteTypeEnum.NOTATION_NOTE)
    # `parsed` is dereferenced unconditionally below, so a `None` argument
    # (which the annotation allows) is resolved by parsing the note here.
    if parsed is None:
        parsed = parse_notation_note(note)
    mf = MarkdownFile.from_vault_note(note)
    tags = mf.tags()
    directly_linked_notes: dict[str, set[NoteLinkEnum]] = {}
    _get_links_to_notat_notes(mf, note.vault, directly_linked_notes, parsed)

    # Normalize an empty/missing notation string to `None`.
    notation_str = parsed.notation_str if parsed.notation_str else None

    # Only a non-empty list under 'latex_in_original' is used.
    latex_in_original = None
    meta = parsed.yaml_frontmatter_meta
    if (meta and 'latex_in_original' in meta
            and isinstance(meta['latex_in_original'], list)
            and len(meta['latex_in_original']) > 0):
        latex_in_original = meta['latex_in_original'][0]

    processed_mf = process_standard_information_note(
        parsed.main_content_markdown_file, note.vault)
    return cls(
        reference, section, section_num, note_num_in_section, char_displacement_in_section,
        subsection, subsection_num, note_num_in_subsection, char_displacement_in_subsection,
        note_name=note.name, note_content=str(processed_mf),
        directly_linked_notes=directly_linked_notes, tags=tags,
        parsed=parsed, main_note=parsed.name_of_main_note, main_note_content=None, notation_str=notation_str,
        latex_in_original=latex_in_original)

In [None]:
show_doc(NotatNoteData.data_string)

---

[source](https://github.com/hyunjongkimmath/trouver/blob/main/trouver/markdown/obsidian/personal/machine_learning/note_data.py#L535){target="_blank" style="float:right; font-size:smaller"}

### NotatNoteData.data_string

>      NotatNoteData.data_string (format:Literal['bert','t5'])

*Return a string of the data of this object relevant for training and prediction on an NLP model.

For a `NotatNoteData` object, this can contain its notation string and `"latex_in_original"` attribute
in the notation note's YAML frontmatter metadata, along with the content of the main note of the
notation note.*

|    | **Type** | **Details** |
| -- | -------- | ----------- |
| format | Literal | The type of model that will handle the data; this affects the formatting of the string. |
| **Returns** | **str** | **note_data: Optional[dict[str, NoteData]] = None, # A dict from which to find information of the main onte of the notation note. content data** |

In [None]:

# A hand-built `NotatNoteData` whose `parsed` attribute is initially `None`.
sample_note_data = NotatNoteData(
    reference='Sample_Reference_Book',
    section='Chapter_3_Data_Structures',
    section_num=3,
    note_num_in_section=5,
    char_displacement_in_section=1200,
    subsection='3.2_Lists_and_Tuples',
    subsection_num=2,
    note_num_in_subsection=2,
    char_displacement_in_subsection=450,
    note_name='notation_O_X_something',
    note_content='sheaf of regular functions...',
    directly_linked_notes={
        'info_note_1': {NoteLinkEnum.INFO_TO_INFO_IN_CONTENT},
        'info_note_2': {NoteLinkEnum.INFO_TO_INFO_IN_SEE_ALSO},
        'info_note_3': {NoteLinkEnum.INFO_TO_INFO_IN_CONTENT, NoteLinkEnum.INFO_TO_INFO_VIA_NOTAT},
        'notat_note_1': {NoteLinkEnum.INFO_TO_NOTAT_VIA_EMBEDDING}
    },
    tags=None,
    parsed=None,
    main_note='sample_main_note',
    main_note_content='List comprehensions provide a concise way to create lists...',
    notation_str=r"$\mathcal{O}_X$",
    latex_in_original=r"\\mathcal{O}_X",
)

# Data for the main note of the sample notation note. It is not passed to
# `data_string` below; the main note content shown in the output comes from
# `sample_note_data.main_note_content`.
note_data: dict[str, NoteData] = {}
note_data['sample_main_note'] = InfoNoteData(
    reference='Sample_Reference_Book',
    section='Chapter_3_Data_Structures',
    section_num=3,
    note_num_in_section=5,
    char_displacement_in_section=1200,
    subsection='3.2_Lists_and_Tuples',
    subsection_num=2,
    note_num_in_subsection=2,
    char_displacement_in_subsection=450,
    note_name='sample_main_note',
    note_content='List comprehensions provide a concise way to create lists...',
    directly_linked_notes={},
    tags=None
)

# With `parsed=None`, no `notation_str:`/`latex_in_original:` lines are emitted.
print(sample_note_data.data_string(format='bert'))
print('')
print(sample_note_data.data_string(format='t5'))

# Once `parsed` is set, `data_string` reads the notation string and the first
# `latex_in_original` entry from it.
sample_note_data.parsed = NotationNoteParsed(
    yaml_frontmatter_meta={'latex_in_original': [r'\\mathcal{O}_X']},
    notation_str=r'$\mathcal{O}_X$',
    name_of_main_note='sample_main_note',
    main_content_markdown_file="sheaf of regular functions...",
    linked_notation_notes = [],
)
# note_data['sample_main_note'].parsed = NotationNoteParsed()

print('')
print(sample_note_data.data_string(format='bert'))
print('')
print(sample_note_data.data_string(format='t5'))

section: Chapter 3 Data Structures
section_num: 3
note_num_in_section: 5
char_displacement_in_section: 1200
subsection: 3.2 Lists and Tuples
subsection_num: 2
note_num_in_subsection: 2
char_displacement_in_subsection: 450
[SEP]
sheaf of regular functions...
[SEP]
main_note_content: List comprehensions provide a concise way to create lists...

section: Chapter 3 Data Structures
section_num: 3
note_num_in_section: 5
char_displacement_in_section: 1200
subsection: 3.2 Lists and Tuples
subsection_num: 2
note_num_in_subsection: 2
char_displacement_in_subsection: 450
</s>
sheaf of regular functions...
</s>
main_note_content: List comprehensions provide a concise way to create lists...

section: Chapter 3 Data Structures
section_num: 3
note_num_in_section: 5
char_displacement_in_section: 1200
subsection: 3.2 Lists and Tuples
subsection_num: 2
note_num_in_subsection: 2
char_displacement_in_subsection: 450
notation_str: $\mathcal{O}_X$
latex_in_original: \\mathcal{O}_X
[SEP]
sheaf of regular fun

In [None]:
#| export
@patch
def randomly_modify(
        self: NotatNoteData,
        augmentation: Optional[Literal['high', 'mid', 'low']],
        erase_position_metadata: bool = False, # If `True`, erase the 'section', 'section_num', etc. metadata.
        ) -> None:
    """
    Modify this object at random for data augmentation. 

    The parent class's `randomly_modify` is applied first. Then, for the
    `NotatNoteData` class, a random selection of text augmentation methods
    is chosen and applied to `main_note_content`. If `parsed` is set, the
    same methods are also applied to `notation_str` and `latex_in_original`,
    and with probability `0.1 * method_inclusion_chance` the `note_content`
    is erased (set to `None`). If `parsed` is not set, only
    `main_note_content` is augmented.

    The strength of the augmentation is `'low'`, `'mid'`, or anything else;
    NOTE(review): any other value, including `None`, currently falls through
    to the strongest (`'high'`) settings --- confirm that this is intended.
    
    The changes made are essentially permanent, so only apply this method to
    copies (See `deepcopy`).
    """
    # Imported locally so that this method does not depend on `random`
    # being imported at module level.
    import random

    super(NotatNoteData, self).randomly_modify(augmentation, erase_position_metadata)

    if augmentation == 'low':
        method_inclusion_chance = 0.3
        scale = 0.5
    elif augmentation == 'mid':
        method_inclusion_chance = 0.5
        scale = 1.0
    else:
        method_inclusion_chance = 0.8
        scale = 1.5

    # Each entry pairs an augmentation method with a numeric weight that is
    # handed to `choose_modification_methods_at_random` along with `scale`.
    methods = [
        (remove_font_styles_at_random, 0.1), (change_font_styles_at_random, 0.2), (change_greek_letters_at_random, 0.1), 
        (remove_math_keywords,0.1), (random_latex_command_removal,0.2),
        (random_word_removal,0.1), (dollar_sign_manipulation,0.05),
        (random_char_modification, 0.001)]
    random_methods = choose_modification_methods_at_random(
        methods, method_inclusion_chance, scale)

    # The main note content is augmented whether or not `parsed` is set.
    if self.main_note_content:
        self.main_note_content = augment_text(
            self.main_note_content, random_methods)
    # Everything below is skipped for objects without a parsed notation note.
    if not self.parsed:
        return
    
    if self.notation_str:
        self.notation_str = augment_text(self.notation_str, random_methods)
    if self.latex_in_original:
        self.latex_in_original = augment_text(self.latex_in_original, random_methods)
    # Randomly remove the notation note content.
    if self.note_content and random.random() < method_inclusion_chance * 0.1:
        self.note_content = None


In [None]:
show_doc(NotatNoteData.randomly_modify)

---

[source](https://github.com/hyunjongkimmath/trouver/blob/main/trouver/markdown/obsidian/personal/machine_learning/note_data.py#L696){target="_blank" style="float:right; font-size:smaller"}

### NotatNoteData.randomly_modify

>      NotatNoteData.randomly_modify
>                                     (augmentation:Optional[Literal['high','mid
>                                     ','low']],
>                                     erase_position_metadata:bool=False)

*Modify this object at random for data augmentation. 

For the `NotatNoteData` class, the 

The changes made are essentially permanent, so only apply this method to
copies (See `deepcopy`).*

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| augmentation | Optional |  |  |
| erase_position_metadata | bool | False | If `True`, erase the 'section', 'section_num', etc. metadata. |
| **Returns** | **None** |  | **attributes_to_modify: list[str] = POSITION_DATA_ATTRIBUTES + ** |

### Going through reference to gather note data

In [None]:
#| export
def _non_heading_section_has_links(
        headings_text_dict: dict[str, str] ,
        vault: PathLike,
        ) -> bool:
    """
    Return `True` if the sectionless text at the beginning of an
    index note has links to information notes.

    Helper function to `note_data_from_index_note`.
    """
    if '' not in headings_text_dict:
        return False
    text = headings_text_dict['']
    links_in_index_note: list[ObsidianLink] = links_from_text(text)
    for link in links_in_index_note:
        note = VaultNote(vault, name=link.file_name)
        if note_is_of_type(
            note, PersonalNoteTypeEnum.STANDARD_INFORMATION_NOTE):
            return True
    return False

In [None]:
#| hide
# Index-note-like text with one link before the first heading and one
# link under the heading.
mf = MarkdownFile.from_string( r'''
[[link1]]
# hi
[[link2]]
''')
listy = mf.get_headings(levels=1)
dicty = mf.get_headings_and_text(levels=1)

with mock_patch('__main__.VaultNote') as mock_vault_note, \
     mock_patch('__main__.note_is_of_type') as mock_note_is_of_type:

    mock_note1 = MagicMock()
    mock_note2 = MagicMock()
    # Each `VaultNote(...)` call made by the function consumes the next mock.
    mock_vault_note.side_effect = [mock_note1, mock_note2]

    # No linked note is reported to be an information note.
    mock_note_is_of_type.return_value = False
    assert not _non_heading_section_has_links(dicty, Path('/fake/vault'))

    # Every linked note is reported to be an information note.
    mock_note_is_of_type.return_value = True 
    assert _non_heading_section_has_links(dicty, Path('/fake/vault'))

    # mock_note_is_of_type.side_effect = [False, True]
    # assert _non_heading_section_has_links(dicty, Path('/fake/vault'))


# _non_heading_section_has_links(dicty)

In [None]:
#| export


def _note_data_under_heading_in_index_note(
        vault: PathLike,
        reference: str,
        index_note_name: str,
        section_num: int | None,
        note_num_in_section: int,
        char_displacement_in_section: int,
        subsection_num: int,
        heading: str, # Heading of the section in the index note
        text: str, # Text of the section in the index note under the heading 
        ) -> tuple[dict[str, InfoNoteData], dict[str, NotatNoteData], int, int]: # The two dicts for the data for the info notes and their notation notes under the heading, along with the updated values for `note_num_in_section` and `char_displacement_in_section` .
    """
    Gather the note data of the information notes linked under one heading
    of an index note, together with the data of the notation notes linked
    in the see-also sections of those information notes.

    Note: A "section" in the index note is a "subsection" for the reference;
    the reference's section name is derived from `index_note_name[7:]`
    (presumably dropping a 7-character prefix --- TODO confirm) with
    underscores replaced by spaces.

    Links to notes that are not standard information notes are skipped, and
    a note raising `ProcessNoteError` is reported via `print` and skipped.
    The returned note number is advanced by the number of *all* links in
    `text`.

    Helper function to `note_data_from_index_note`.
    """
    links_under_heading = links_from_text(text)
    if heading.startswith('#'):
        heading = heading.strip('#').strip(' ')
    subsection = heading
    section = _replace_underscores_with_spaces(index_note_name[7:])

    info_by_name = {}
    notat_by_name = {}
    subsection_offset = 0
    for link_index, link in enumerate(links_under_heading):
        note = VaultNote(vault, name=link.file_name)
        if not note_is_of_type(note, PersonalNoteTypeEnum.STANDARD_INFORMATION_NOTE):
            continue
        try:
            # The info note and its notation notes share the same position.
            position = (
                reference, section, section_num,
                note_num_in_section + link_index,
                char_displacement_in_section, subsection, subsection_num,
                link_index + 1, subsection_offset)
            info_point = InfoNoteData.from_note(note, *position)
            for notat_note in notation_notes_linked_in_see_also_section(note, vault):
                parsed_notat_note = parse_notation_note(notat_note)
                notat_by_name[notat_note.name] = NotatNoteData.from_note(
                    notat_note, *position, parsed_notat_note)
            info_by_name[note.name] = info_point
            content_length = len(info_point.note_content)
            subsection_offset += content_length
            char_displacement_in_section += content_length
        except ProcessNoteError:
            print(f"The following note could not be properly processed: {note}")

    return (
        info_by_name, notat_by_name,
        note_num_in_section + len(links_under_heading),
        char_displacement_in_section,
        )


In [None]:
#| export
def _remove_only_heading_titles_from_level_2_or_higher_headings(
        index_note_mf: MarkdownFile):
    """
    Remove level 2 or greater headings (just the headings, not the contents)

    Ensures that level 2 or greater headings are ignored, but their
    contents are not ignored when parsing index notes.

    Helper function to `note_data_from_index_note`.

    """
    headings: dict[int, str] = index_note_mf.get_headings_by_line_number(
        range(2,7), include_start=False)
    lines_to_remove = sorted(list(headings))
    lines_to_remove.reverse()
    for line_to_remove in lines_to_remove:
        index_note_mf.remove_line(line_to_remove)

In [None]:
#| export
def note_data_from_index_note(
        vault: PathLike,
        reference: str,
        index_note_name: str,
        section_num: int | None,
        # starting_char: int | None,
        ) -> tuple[dict[str, InfoNoteData], dict[str, NotatNoteData]]: # The note data
    """
    Return the `NoteData` of the info notes linked in an index note
    of a reference, along with that of the associated notation notes.

    The index note is processed one level 1 heading at a time (headings of
    level 2 or deeper are removed beforehand, keeping their contents). The
    text before the first heading is only taken into account if it links to
    an information note. Note numbers and character displacements are
    carried over from one heading to the next.
    """
    index_note_mf = MarkdownFile.from_vault_note(
        VaultNote(vault, name=index_note_name))
    _remove_only_heading_titles_from_level_2_or_higher_headings(index_note_mf)
    headings = index_note_mf.get_headings(levels=1, include_start=True)
    text_by_heading = index_note_mf.get_headings_and_text()

    # Drop the heading-less start of the note if it does not link to info notes.
    drop_leading_text = (
        not _non_heading_section_has_links(text_by_heading, vault)
        and '' in text_by_heading)
    if drop_leading_text:
        del text_by_heading['']
        if headings[0] == '':
            headings.pop(0)

    info_by_name = {}
    notat_by_name = {}
    next_note_num = 1
    section_offset = 0
    for subsection_num, heading in enumerate(headings, start=1):
        (heading_info, heading_notat,
         next_note_num, section_offset) = _note_data_under_heading_in_index_note(
            vault, reference, index_note_name, section_num,
            next_note_num, section_offset,
            subsection_num, heading, text_by_heading[heading])
        info_by_name.update(heading_info)
        notat_by_name.update(heading_notat)
    return info_by_name, notat_by_name

In [None]:
# A Markdown file without any text before its first heading.
mf = MarkdownFile.from_string( r'''
# hi
asdf
asdf
''')
print(str(mf))
listy = mf.get_headings(levels=1)
# dicty['']
dicty = mf.get_headings_and_text(levels=1)
# Per this cell's output, the '' key is present and maps to an empty string
# even though nothing precedes the first heading.
dicty['']


# hi
asdf
asdf


''

In [None]:
#| export
def _find_indirect_links_to_info_notes_via_notat_notes(
        info_note_data: dict[str, InfoNoteData],
        notat_note_data: dict[str, NotatNoteData],
        ) -> None:
    """
    Add the `INFO_TO_INFO_VIA_NOTAT` and `NOTAT_TO_INFO_VIA_NOTAT` link data
    to the values of `info_note_data` and `notat_note_data`, in place.

    - An info note that links to a notation note via
      `INFO_TO_NOTAT_VIA_EMBEDDING` gets an `INFO_TO_INFO_VIA_NOTAT` link to
      the main note of that notation note.
    - A notation note that links to a notation note via `NOTAT_TO_NOTAT`
      gets a `NOTAT_TO_INFO_VIA_NOTAT` link to the main note of that
      notation note.

    Only linked notation notes that are keys of `notat_note_data` are
    considered.

    Helper function to `note_data_from_reference`.
    """
    def main_notes_reached_via(data_point, direct_link_type):
        # Built as a full list up front because the caller then modifies
        # `data_point.directly_linked_notes`.
        return [
            notat_note_data[linked_name].main_note
            for linked_name, link_types in data_point.directly_linked_notes.items()
            if direct_link_type in link_types and linked_name in notat_note_data]

    for info_point in info_note_data.values():
        for main_note_name in main_notes_reached_via(
                info_point, NoteLinkEnum.INFO_TO_NOTAT_VIA_EMBEDDING):
            _update_dict(
                info_point.directly_linked_notes, main_note_name,
                NoteLinkEnum.INFO_TO_INFO_VIA_NOTAT)

    for notat_point in notat_note_data.values():
        for main_note_name in main_notes_reached_via(
                notat_point, NoteLinkEnum.NOTAT_TO_NOTAT):
            _update_dict(
                notat_point.directly_linked_notes, main_note_name,
                NoteLinkEnum.NOTAT_TO_INFO_VIA_NOTAT)

In [None]:
#| export
def _note_data_from_vault_note_on_the_fly(
        note: VaultNote,
        reference: Optional[str] = None,
        note_data: Optional[dict[str, NoteData]] = None, # A means to access the main note's `NoteData` for getting the positional data, in case `note` represents a notation note.
        ) -> NoteData:
    """
    Build the `NoteData` of `note` with the appropriate `from_note` factory
    method, without going through an index note.

    An information note gets no positional data. A notation note takes the
    positional data of its main note if that is found in `note_data`, and
    gets no positional data otherwise. `None` is returned for a note of any
    other type.
    """
    position_fields = (
        'section', 'section_num', 'note_num_in_section',
        'char_displacement_in_section', 'subsection', 'subsection_num',
        'note_num_in_subsection', 'char_displacement_in_subsection')

    if type_of_note(note) == PersonalNoteTypeEnum.STANDARD_INFORMATION_NOTE:
        return InfoNoteData.from_note(
            note, reference, **dict.fromkeys(position_fields))
    if type_of_note(note) != PersonalNoteTypeEnum.NOTATION_NOTE:
        return None

    parsed_notat_note = parse_notation_note(note)
    main_note_name = parsed_notat_note.name_of_main_note
    if note_data and main_note_name in note_data:
        # Borrow the position of the main note.
        main_note_data = note_data[main_note_name]
        position = {
            field: getattr(main_note_data, field) for field in position_fields}
    else:
        position = dict.fromkeys(position_fields)
    return NotatNoteData.from_note(
        note, reference, parsed=parsed_notat_note, **position)

In [None]:
#| export
def _note_data_from_notes_not_reachable_from_index_notes(
        vault: PathLike,
        reference: str,
        info_note_data: dict[str, InfoNoteData], # The information note data from notes that are linked in index notes
        notat_note_data: dict[str, NotatNoteData], # The notation note data from notes that are reachable from index notes.
        ) -> tuple[dict[str, InfoNoteData], dict[str, NotatNoteData]]:
    """
    Gather the note data of the information and notation notes in the
    reference folder that are not already in `info_note_data` or
    `notat_note_data`.

    Such notes should usually be information notes that are somehow not
    linked in index notes and notation notes that are not linked to their
    main information notes.

    The name of each such note is printed. Any exception raised while
    building the data of a note is printed and that note is skipped.
    """
    paths_by_name = all_paths_to_notes_in_reference_folder(
        vault, reference, as_dict=True)
    extra_info_note_data = {}
    extra_notat_note_data = {}
    already_gathered = info_note_data | notat_note_data
    relevant_types = (
        PersonalNoteTypeEnum.STANDARD_INFORMATION_NOTE,
        PersonalNoteTypeEnum.NOTATION_NOTE)
    print(f"Printing names to notes not reachable from index notes in the reference: {reference}...")
    for name, rel_paths in paths_by_name.items():
        if name in already_gathered:
            continue
        # Only the first path recorded for the name is used.
        vn = VaultNote(vault, rel_path=rel_paths[0])
        if type_of_note(vn) not in relevant_types:
            continue
        print(f'[[{name}]]')
        try:
            # Only the info note data is offered for looking up main notes.
            gathered = _note_data_from_vault_note_on_the_fly(
                vn, reference, info_note_data)
            if type_of_note(vn) == PersonalNoteTypeEnum.STANDARD_INFORMATION_NOTE:
                extra_info_note_data[name] = gathered
            elif type_of_note(vn) == PersonalNoteTypeEnum.NOTATION_NOTE:
                extra_notat_note_data[name] = gathered
        except Exception as error:
            print(error)
    print('Finished printing unreachable note names')
    return extra_info_note_data, extra_notat_note_data


In [None]:
#| export

def note_data_from_reference(
        vault: PathLike,
        reference: str,
        add_via_notat_note_linking: bool = True, # If `True`, find and add the `INFO_TO_INFO_VIA_NOTAT` and `NOTAT_TO_INFO_VIA_NOTAT` enums in the `NoteData`s' directly_linked_notes attributes as appropriate.
        ) -> tuple[dict[str, InfoNoteData], dict[str, NotatNoteData]]:
    """
    Obtain the `InfoNoteData` and the `NotatNoteData` of the information
    notes and notation notes of the reference as available.

    The notes linked in the index note of the reference are treated as the
    index notes of the sections, numbered from 1 in the order in which they
    are linked. Notes of the reference folder that are not reached this way
    are gathered afterwards, without going through an index note.
    """
    reference_index_note = index_note_for_reference(
        vault, reference, update_cache=True)
    section_index_notes = notes_linked_in_note(
        reference_index_note, as_dict=False)

    info_by_name = {}
    notat_by_name = {}
    for section_num, index_note in enumerate(section_index_notes, start=1):
        section_info, section_notat = note_data_from_index_note(
            vault, reference, index_note.name, section_num)
        info_by_name.update(section_info)
        notat_by_name.update(section_notat)

    extra_info, extra_notat = _note_data_from_notes_not_reachable_from_index_notes(
        vault, reference, info_by_name, notat_by_name)
    info_by_name.update(extra_info)
    notat_by_name.update(extra_notat)

    if add_via_notat_note_linking:
        _find_indirect_links_to_info_notes_via_notat_notes(
            info_by_name, notat_by_name)

    return info_by_name, notat_by_name



In [None]:
# TODO: test

In [None]:
#| export
def find_reverse_links(
        info_note_data: dict[str, InfoNoteData],
        notat_note_data: dict[str, NotatNoteData],
        ) -> None:
    """
    Update the `reverse_linked_notes` attributes in the values of
    `info_note_data` and `notat_note_data`, in place.

    For every note `A` that directly links to a note `B` present in either
    dict, `B.reverse_linked_notes[A's name]` is set to a copy of the link
    types of that link. Information notes are processed before notation
    notes, and a linked name is looked up in `info_note_data` first.
    Links to notes found in neither dict are ignored.
    """
    pools = (info_note_data, notat_note_data)
    for pool in pools:
        for source_name, source_point in pool.items():
            for target_name, link_types in source_point.directly_linked_notes.items():
                # First dict (info notes first) that knows the linked note.
                owner = next((p for p in pools if target_name in p), None)
                if owner is None:
                    continue
                owner[target_name].reverse_linked_notes[source_name] = set(link_types)
    

In [None]:
# find_reverse_links(info_note_data, notat_note_data)

In [None]:
# info_note_data['fulton_it_Cycles 1.5'].directly_linked_notes

In [None]:
#| export
def get_main_note_content_of_notat_note_data(
        info_note_data: dict[str, InfoNoteData],
        notat_note_data: dict[str, NotatNoteData],
        ) -> None:
    """
    Set the `main_note_content` attribute of each value in `notat_note_data`
    to the `note_content` of its main note, in place.

    Values whose `main_note` is not a key of `info_note_data` are left
    untouched.
    """
    for notat_point in notat_note_data.values():
        main_note_name = notat_point.main_note
        if main_note_name in info_note_data:
            notat_point.main_note_content = info_note_data[main_note_name].note_content
    

In [None]:
# get_main_note_content_of_notat_note_data(
    # info_note_data, notat_note_data)