In [None]:
#| default_exp markdown.obsidian.personal.machine_learning.information_note_types

# markdown.obsidian.personal.machine_learning.information_note_types

> Functions for gathering machine learning data on the types of math information notes from tags and for using ML models trained on such data to predict types of math information notes.

Some common types of components in mathematical writing include: definitions, notations, concepts (e.g. theorems, propositions, corollaries, lemmas), proofs. The functions in this module gather data from labeled "standard information notes" (formatted in `trouver`'s standard formatting) in an `Obsidian.md` vault about the types of these notes. Such data can be used to train a categorization ML model to predict types of unlabeled notes.

The labels are given by Markdown tags in the notes' YAML frontmatter meta (so tags in the body of the Markdown file are ignored). For example, the note 

```Markdown
---
cssclass: clean-embeds
aliases: []
tags: [_meta/literature_note, _meta/definition]
---
# This is a title of a note[^1]

We could talk about many things. I like to talk about rings!

A **ring** is a set equipped with two binary operators $+$ and $\cdot$
such that...

# See Also

# Meta
## References
![[_reference_sample_reference]]

## Citations and Footnotes
[^1]: Author names, Some way to identify where this information comes from
```


has the tag `#_meta/definition` [^1]

[^1]: Note that the tag in the YAML frontmatter meta is notated as `_meta/definition`, which lacks the starting hashtag `#`.



In [None]:
#| export
# Tags (each including the leading hashtag `#`) that can label the type(s)
# of an information note. The label columns of the gathered data follow the
# order of this list.
LABEL_TAGS = [
    # Main note types.
    '#_meta/concept',
    '#_meta/exercise',
    '#_meta/definition',
    '#_meta/example',
    '#_meta/narrative',
    '#_meta/notation',
    '#_meta/proof',
    '#_meta/remark',
    # Housekeeping tags.
    '#_meta/TODO/split',
    '#_meta/TODO/merge',
    '#_meta/TODO/delete',
    # Further note types.
    '#_meta/hint',
    '#_meta/how_to',
    '#_meta/conjecture',
    '#_meta/convention',
    '#_meta/context',
]

`LABEL_TAGS` above lists the tags that we would like to eventually train a model to predict. The following are the tags for which the author of `trouver` has ample labeled data: 

- `#_meta/concept` labels a note that contains a general concept, e.g. by virtue of stating a theorem/proposition/lemma/corollary. 
- `#_meta/definition` labels a note that introduces a definition.
- `#_meta/exercise` labels a note that contains an exercise/problem.
- `#_meta/example` labels a note that contains an example.
- `#_meta/narrative` labels a note that contains narrative - explanations of the material that is presented. Narratives most usually occur at the start/end of a chapter/section of a book/text/paper and in-between definitions/theorems/etc.
- `#_meta/notation` labels a note that introduces a notation.
- `#_meta/proof` labels a note that contains a proof.
- `#_meta/remark` labels a note that contains a remark.

It is often appropriate to label a single note with more than one of these tags. For example, a note containing the statement "We define the ring $\mathbb{Z}/n\mathbb{Z}$ of integers modulo $n$" is both a definition note and a notation note because it both introduces the notion of the ring of integers modulo $n$ and gives notation for the ring.


In [None]:
#| export
import os
from os import PathLike
from pathlib import Path
import shutil

import pandas as pd

from trouver.helper import current_time_formatted_to_minutes
from trouver.markdown.markdown.file import MarkdownFile
from trouver.markdown.obsidian.personal.machine_learning.database_update import max_ID, append_to_database
from trouver.markdown.obsidian.personal.note_processing import process_standard_information_note
from trouver.markdown.obsidian.vault import VaultNote

In [None]:
import shutil
import tempfile
from unittest import mock

from fastcore.test import *

from trouver.helper import _test_directory
from trouver.markdown.obsidian.personal.notes import notes_linked_in_note

## Gather and label data

In [None]:
#| export 


# export
def note_is_labeled_with_tag(
        note: VaultNote,
        label_tag: str # A tag which labels a type that `note` is. Includes the beginning hashtag `#`, e.g. `#_meta/definition`, `#_meta/TODO/split`
        ) -> bool: # `True` if `note` is labeled as type `label_tag`.
    """
    Return `True` if the standard information note is labeled as
    being a specified type.

    Only the `tags` field of the note's YAML frontmatter metadata is
    consulted. A note that has no frontmatter metadata, no `tags` field,
    or an empty `tags` field is considered to be labeled with no tag at
    all, so `False` is returned for it.

    **Raises**

    - `ValueError`
        - If `label_tag` does not include the beginning hashtag `#`.
    """
    if not label_tag.startswith('#'):
        raise ValueError(f"`label_tag` does not start with a hashtag `#`: {label_tag}")
    # Tags in the YAML frontmatter are written without the leading `#`.
    tag_without_hashtag = label_tag[1:]
    mf = MarkdownFile.from_vault_note(note)
    # Tolerate missing metadata (assumed to be reported as `None` or as a
    # dict lacking `'tags'` -- TODO confirm against `MarkdownFile.metadata`)
    # and an empty `tags:` entry instead of raising `TypeError`/`KeyError`.
    metadata = mf.metadata() or {}
    tags = metadata.get('tags') or []
    return tag_without_hashtag in tags



In [None]:
# Example/test for `note_is_labeled_with_tag`: a note whose frontmatter
# lists the tags `_meta/literature_note` and `_meta/definition`.
sample_text = r"""---
cssclass: clean-embeds
aliases: []
tags: [_meta/literature_note, _meta/definition]
---
# This is a title of a note[^1]

We could talk about many things. I like to talk about rings!

A **ring** is a set equipped with two binary operators $+$ and $\cdot$
such that...

# See Also

# Meta
## References
![[_reference_sample_reference]]

## Citations and Footnotes
[^1]: Author names, Some way to identify where this information comes from
"""
sample_mf = MarkdownFile.from_string(sample_text)

# Patch `MarkdownFile.from_vault_note` so that no actual vault/note is needed.
with mock.patch("__main__.MarkdownFile.from_vault_note", return_value=sample_mf) as mock_markdownfile_from_vault_note:
    # The note itself is never read because of the patch, so `None` suffices.
    mock_note = None
    # This is setup in such a way that the invocation to
    # `note_is_labeled_with_tag` will use
    # a note whose text is `sample_text`.
    assert note_is_labeled_with_tag(mock_note, '#_meta/definition')
    assert not note_is_labeled_with_tag(mock_note, '#_meta/notation')
    assert not note_is_labeled_with_tag(mock_note, '#_meta/concept')

    with ExceptionExpected(ValueError):
        # The argument to `label_tag` requires the starting hashtag `#`.
        note_is_labeled_with_tag(mock_note, '_meta/definition')


In [None]:
#| export
def note_labels(
        note: VaultNote
        ) -> dict[str, str]: # Each key is a string, which is a tag, including the starting hashtag `#`. Each value is a string, either `'IS {tag}'` or `'NOT {tag}'`.
    """Return a dict indicating what labels a note has.

    There is one entry for each tag in the `LABEL_TAGS` list, in the
    order of that list.
    """
    labels = {}
    for tag in LABEL_TAGS:
        prefix = 'IS' if note_is_labeled_with_tag(note, tag) else 'NOT'
        labels[tag] = f'{prefix} {tag}'
    return labels
    

In [None]:
# Example/test for `note_labels`: a note whose frontmatter lists the tags
# `_meta/literature_note` and `_meta/definition`.
sample_text = r"""---
cssclass: clean-embeds
aliases: []
tags: [_meta/literature_note, _meta/definition]
---
# This is a title of a note[^1]

We could talk about many things. I like to talk about rings!

A **ring** is a set equipped with two binary operators $+$ and $\cdot$
such that...

# See Also

# Meta
## References
![[_reference_sample_reference]]

## Citations and Footnotes
[^1]: Author names, Some way to identify where this information comes from
"""
sample_mf = MarkdownFile.from_string(sample_text)

# Patch `MarkdownFile.from_vault_note` so that no actual vault/note is needed.
with mock.patch("__main__.MarkdownFile.from_vault_note", return_value=sample_mf) as mock_markdownfile_from_vault_note:
    # The note itself is never read because of the patch, so `None` suffices.
    mock_note = None
    # This is setup in such a way that the invocation to
    # `note_labels` will use
    # a note whose text is `sample_text`.
    sample_output = note_labels(mock_note)
    test_eq(sample_output['#_meta/definition'], 'IS #_meta/definition')
    test_eq(sample_output['#_meta/concept'], 'NOT #_meta/concept')
    # Every tag in `LABEL_TAGS` gets an entry, whether or not the note has it.
    for label_tag in LABEL_TAGS:
        assert label_tag in sample_output
    print(sample_output)

    

{'#_meta/concept': 'NOT #_meta/concept', '#_meta/exercise': 'NOT #_meta/exercise', '#_meta/definition': 'IS #_meta/definition', '#_meta/example': 'NOT #_meta/example', '#_meta/narrative': 'NOT #_meta/narrative', '#_meta/notation': 'NOT #_meta/notation', '#_meta/proof': 'NOT #_meta/proof', '#_meta/remark': 'NOT #_meta/remark', '#_meta/TODO/split': 'NOT #_meta/TODO/split', '#_meta/TODO/merge': 'NOT #_meta/TODO/merge', '#_meta/TODO/delete': 'NOT #_meta/TODO/delete', '#_meta/hint': 'NOT #_meta/hint', '#_meta/how_to': 'NOT #_meta/how_to', '#_meta/conjecture': 'NOT #_meta/conjecture', '#_meta/convention': 'NOT #_meta/convention', '#_meta/context': 'NOT #_meta/context'}


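The way that data for information note types should be obtained is fairly simple - for each note, record the name of the note, its full content, its processed content, and, for each tag in `LABEL_TAGS`, whether or not the note is labeled with that tag.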

In [None]:
#| export
def gather_information_note_types(
        vault: PathLike,
        notes: list[VaultNote],
        ) -> pd.DataFrame: # Has columns `Time added`, `Time modified`, `Note name`, `Full note content`, `Processed note content` as well as columns for each tag label. See `append_to_information_note_type_database` for more details about these columns.
    """
    Return a `pandas.DataFrame` encapsulating the data of note labels.

    Each note in `notes` contributes one row. `Time added` and
    `Time modified` are both set to the value returned by
    `current_time_formatted_to_minutes` during this call. The label
    columns come from `note_labels`.

    The returned frame always has the documented columns, even when
    `notes` is empty (in which case it has no rows).
    """
    labels_of_notes = [note_labels(note) for note in notes]
    current_time = current_time_formatted_to_minutes()
    rows = []
    for note, labels_of_note in zip(notes, labels_of_notes):
        mf = MarkdownFile.from_vault_note(note)
        rows.append({
            'Time added': current_time,
            'Time modified': current_time,
            'Note name': note.name,
            'Full note content': str(mf),
            'Processed note content': str(process_standard_information_note(
                mf, vault)),
            **labels_of_note
        })
    # Passing the columns explicitly keeps the same column order as the row
    # dicts above while guaranteeing the columns exist when `rows` is empty.
    columns = [
        'Time added', 'Time modified', 'Note name',
        'Full note content', 'Processed note content',
        *LABEL_TAGS]
    return pd.DataFrame(rows, columns=columns)


In [None]:
# Example/test for `gather_information_note_types` using the notes linked
# in an index note of the test vault `test_vault_6`.
test_vault = _test_directory() / 'test_vault_6'
index_note = VaultNote(test_vault, name='_index_1_introduction_reference_with_tag_labels')
# There are just 5 notes
notes = notes_linked_in_note(index_note, as_dict=False)
df = gather_information_note_types(test_vault, notes)
# One row per note.
test_eq(len(df), 5)
df.head()

Unnamed: 0,Time added,Time modified,Note name,Full note content,Processed note content,#_meta/concept,#_meta/exercise,#_meta/definition,#_meta/example,#_meta/narrative,...,#_meta/proof,#_meta/remark,#_meta/TODO/split,#_meta/TODO/merge,#_meta/TODO/delete,#_meta/hint,#_meta/how_to,#_meta/conjecture,#_meta/convention,#_meta/context
0,2023-01-12T17:04,2023-01-12T17:04,reference_with_tag_labels_something_something,---\ncssclass: clean-embeds\naliases: []\ntags...,"In this chapter, we describe some basics of ri...",NOT #_meta/concept,NOT #_meta/exercise,NOT #_meta/definition,NOT #_meta/example,IS #_meta/narrative,...,NOT #_meta/proof,NOT #_meta/remark,NOT #_meta/TODO/split,NOT #_meta/TODO/merge,NOT #_meta/TODO/delete,NOT #_meta/hint,NOT #_meta/how_to,NOT #_meta/conjecture,NOT #_meta/convention,NOT #_meta/context
1,2023-01-12T17:04,2023-01-12T17:04,reference_with_tag_labels_Definition 1,---\ncssclass: clean-embeds\naliases: []\ntags...,A ring is a set with binary operators $+$ and ...,NOT #_meta/concept,NOT #_meta/exercise,NOT #_meta/definition,NOT #_meta/example,NOT #_meta/narrative,...,NOT #_meta/proof,NOT #_meta/remark,NOT #_meta/TODO/split,NOT #_meta/TODO/merge,NOT #_meta/TODO/delete,NOT #_meta/hint,NOT #_meta/how_to,NOT #_meta/conjecture,NOT #_meta/convention,NOT #_meta/context
2,2023-01-12T17:04,2023-01-12T17:04,reference_with_tag_labels_Definition 2,---\ncssclass: clean-embeds\naliases: []\ntags...,Let $n \geq 1$ be an integer. The ring of inte...,NOT #_meta/concept,NOT #_meta/exercise,IS #_meta/definition,NOT #_meta/example,NOT #_meta/narrative,...,NOT #_meta/proof,NOT #_meta/remark,NOT #_meta/TODO/split,NOT #_meta/TODO/merge,NOT #_meta/TODO/delete,NOT #_meta/hint,NOT #_meta/how_to,NOT #_meta/conjecture,NOT #_meta/convention,NOT #_meta/context
3,2023-01-12T17:04,2023-01-12T17:04,reference_with_tag_labels_Exercise 1,---\ncssclass: clean-embeds\naliases: [referen...,Show that $\mathbb{Z}/n\mathbb{Z}$ is a ring.\n,NOT #_meta/concept,IS #_meta/exercise,NOT #_meta/definition,NOT #_meta/example,NOT #_meta/narrative,...,NOT #_meta/proof,NOT #_meta/remark,NOT #_meta/TODO/split,NOT #_meta/TODO/merge,NOT #_meta/TODO/delete,NOT #_meta/hint,NOT #_meta/how_to,NOT #_meta/conjecture,NOT #_meta/convention,NOT #_meta/context
4,2023-01-12T17:04,2023-01-12T17:04,reference_with_tag_labels_Theorem 1,---\ncssclass: clean-embeds\naliases: []\ntags...,Theorem 1. Let $R$ be a UFD. Then $R[x]$ is a ...,IS #_meta/concept,NOT #_meta/exercise,NOT #_meta/definition,NOT #_meta/example,NOT #_meta/narrative,...,IS #_meta/proof,NOT #_meta/remark,NOT #_meta/TODO/split,NOT #_meta/TODO/merge,NOT #_meta/TODO/delete,NOT #_meta/hint,NOT #_meta/how_to,NOT #_meta/conjecture,NOT #_meta/convention,NOT #_meta/context


In [None]:
#| export
def append_to_information_note_type_database(
        vault: PathLike, # The vault from which the data is drawn
        file: PathLike, # The path to a CSV file
        notes: list[VaultNote], # the notes to add to the database
        backup: bool = True # If `True`, makes a copy of `file` in the same directory and with the same name, except with an added extension of `.bak`.
        ) -> None:
    """
    Either create a `csv` file containing data for information note type
    labels or append to an existing `csv` file.

    Nothing is done if `notes` is empty.

    The columns of the database file are as follows:

    - `Time added` - The time when the row was added.
    - `Time modified` - The time when the row was last updated.
    - `Note name` - The name of the note from which the data for the row
      was derived.
    - `Full note content` - The entire content/text of the note.
    - `Processed note content` - The "raw" content of the note without
      the YAML frontmatter meta, Markdown headings, links, footnotes, etc. 
    - One column for each tag in `LABEL_TAGS`, indicating whether the note
      is labeled with that tag.

    All timestamps are in UTC time and specify time to minutes
    (i.e. no seconds/microseconds).
    
    If a "new" note has the same processed content as a pre-existing
    note and anything is different about the "new" note, then update
    the row of the existing note. In particular, the following are updated:
    - Time modified (set to current time)
    - Note name (overwritten)
    - Full note content (overwritten)
    - Columns for categorization (overwritten)
    
    This method assumes that all the processed content in the
    CSV file are all distinct if the CSV file exists.
    """
    if not notes:
        return
    file = Path(file)
    new_df = gather_information_note_types(vault, notes)
    # The existing CSV file is not read here; creating/appending/updating
    # (and the optional backup) is delegated entirely to `append_to_database`.
    cols = [
        'Time added', 'Time modified', 'Note name',
        'Full note content', 'Processed note content',
        *LABEL_TAGS]
    # `Time added` and the pivot column `Processed note content` are not
    # among the columns to update.
    cols_to_update = [
        'Time modified', 'Note name', 'Full note content',
        *LABEL_TAGS]
    append_to_database(
        file, new_df, cols, 'Processed note content', cols_to_update, backup)


In [None]:
# Example/test for `append_to_information_note_type_database`. The test
# vault is copied into a temporary directory so that the CSV file is written
# to the copy rather than to the original test vault.
with tempfile.TemporaryDirectory(prefix='temp_dir', dir=os.getcwd()) as temp_dir:
    temp_vault = Path(temp_dir) / 'test_vault_6'
    shutil.copytree(_test_directory() / 'test_vault_6', temp_vault)

    index_note = VaultNote(temp_vault, name='_index_1_introduction_reference_with_tag_labels')
    notes = notes_linked_in_note(index_note, as_dict=False)
    file = temp_vault / '_ml_data' / 'information_note_type_labels.csv'
    append_to_information_note_type_database(
         temp_vault, file, notes)

    # Uncomment these lines to see `temp_vault` and its contents.
    # os.startfile(os.getcwd())
    # input()
    # Read the CSV file back to show what was written.
    df = pd.read_csv(file)
    print(df.head())
    



         Time added     Time modified  \
0  2023-01-12T17:07  2023-01-12T17:07   
1  2023-01-12T17:07  2023-01-12T17:07   
2  2023-01-12T17:07  2023-01-12T17:07   
3  2023-01-12T17:07  2023-01-12T17:07   
4  2023-01-12T17:07  2023-01-12T17:07   

                                       Note name  \
0  reference_with_tag_labels_something_something   
1         reference_with_tag_labels_Definition 1   
2         reference_with_tag_labels_Definition 2   
3           reference_with_tag_labels_Exercise 1   
4            reference_with_tag_labels_Theorem 1   

                                   Full note content  \
0  ---\ncssclass: clean-embeds\naliases: []\ntags...   
1  ---\ncssclass: clean-embeds\naliases: []\ntags...   
2  ---\ncssclass: clean-embeds\naliases: []\ntags...   
3  ---\ncssclass: clean-embeds\naliases: [referen...   
4  ---\ncssclass: clean-embeds\naliases: []\ntags...   

                              Processed note content      #_meta/concept  \
0  In this chapter, we desc