In [None]:
#| default_exp markdown.obsidian.personal.machine_learning.definition_identification

# markdown.obsidian.personal.machine_learning.definition_identification
> Functions for finding definitions

In [None]:
#| export
import os 
from os import PathLike
from pathlib import Path
from typing import Union

import pandas as pd

from trouver.helper.date_and_time import current_time_formatted_to_minutes
from trouver.helper.definition_and_notation import definition_asterisk_indices
from trouver.markdown.markdown.file import MarkdownFile
from trouver.markdown.obsidian.personal.note_processing import process_standard_information_note
from trouver.markdown.obsidian.vault import VaultNote

# Gather ML data from information notes

In [None]:
#| export
def definitions_in_text(
        text: str
        ) -> list[str]:
    """
    Collect the definitions marked in `text`.

    `definition_asterisk_indices` supplies `(start, end)` index pairs;
    for each pair the slice of `text` between them is taken with two
    characters dropped from each side (presumably the surrounding `**`
    markers -- confirm against `definition_asterisk_indices`).

    The definitions are returned in the order the index pairs are given.
    """
    found = []
    for start, end in definition_asterisk_indices(text):
        # Skip the two leading and two trailing marker characters.
        found.append(text[start + 2:end - 2])
    return found

In [None]:
# TODO: examples

In [None]:
#| export
# TODO: implement a way to opt a note out of definition identification data
# gathering, e.g. by detecting a `_auto/definition_identification` tag.
def definition_identification_data_from_note(
        note: VaultNote, # The information note from which to draw the data
        vault: PathLike # Passed on to `process_standard_information_note`
        ) -> Union[dict[str, Union[str, list[str]]], None]: # The keys to the dict are "Note name", "Raw text", "Definitions". However, `None` is returned if `note` does not exist.
    """Obtain definition identification data from the information note.

    The note is read as a `MarkdownFile` and processed with
    `process_standard_information_note`, keeping the double asterisks
    so that the definitions can still be located in the processed text.

    The returned dict has the following entries:

    - `"Note name"` (`str`) - The name of `note`.
    - `"Raw text"` (`str`) - The processed text with every `**` removed.
    - `"Definitions"` (`list[str]`) - The definitions found in the
      processed text, cf. `definitions_in_text`.

    `None` is returned if `note` does not exist.
    """
    if not note.exists():
        return None
    mf = MarkdownFile.from_vault_note(note)
    # Keep the `**` markers for now; they are needed to find the definitions.
    mf = process_standard_information_note(
        mf, vault, remove_double_asterisks=False)
    mf_text = str(mf)
    definitions = definitions_in_text(mf_text)
    # Only strip the markers after the definitions have been extracted.
    raw_text = mf_text.replace('**', '')
    return {
        "Note name": note.name,
        "Raw text": raw_text,
        "Definitions": definitions}

In [None]:
# TODO: examples

In [None]:
#| export
def gather_definition_identification_data(
        vault: PathLike, # Passed on to `definition_identification_data_from_note`
        notes: list[VaultNote] # The notes from which to draw the data
        ) -> pd.DataFrame: # One row per note for which data could be drawn
    """
    Build a `pandas.DataFrame` holding the definition identification
    data of `notes`.

    Each row is produced by `definition_identification_data_from_note`;
    notes for which that function returns `None` are left out. Every
    row additionally gets a `'Time added'` and a `'Time modified'`
    entry, both set to the same timestamp obtained from a single call to
    `current_time_formatted_to_minutes`.

    This function is mainly used in
    `append_to_definition_identification_database`.
    """
    rows = []
    for note in notes:
        entry = definition_identification_data_from_note(note, vault)
        if entry is not None:
            rows.append(entry)
    # One timestamp shared by all rows, taken after the data is gathered.
    timestamp = current_time_formatted_to_minutes()
    stamped_rows = [
        {**entry, 'Time added': timestamp, 'Time modified': timestamp}
        for entry in rows]
    return pd.DataFrame(stamped_rows)

In [None]:
# TODO: examples

In [None]:
#| export

In [None]:
def append_to_definition_identification_database(
        vault: PathLike, # The vault from which the data is drawn
        file: PathLike, # The path to a CSV file
        notes: list[VaultNote], # The information notes to consider adding to the database
        backup: bool = True # If `True`, makes a copy of `file` in the same directory and with the same name, except with an added extension of `.bak`.
        ) -> None:
    """
    Either create a `csv` file containing data for definition
    identification or append to an existing `csv` file.

    The new rows are drawn with `gather_definition_identification_data`,
    so the columns of the database file are expected to be as follows:

    - `Time added` - The time when the row was added.
    - `Time modified` - The time when the row was last modified
      (equal to `Time added` for a freshly gathered row).
    - `Note name` - The name of the note from which the data for the row
      was derived.
    - `Raw text` - The processed text of the note with the `**`
      markers removed.
    - `Definitions` - The definitions found in the note.

    All timestamps specify time to minutes (i.e. no seconds/microseconds)
    and are stated to be in UTC time -- TODO confirm against
    `current_time_formatted_to_minutes`.

    NOTE(review): as written in this cell, the function returns early
    when `notes` is empty or no data could be gathered, reads the
    existing CSV (if any) and gathers the new data, but does not yet
    merge or write anything; `backup` is not used yet either.

    TODO: implement updating rows and rewrite the next paragraph to
    accurately reflect the implementation.
    NOTE(review): the paragraph below mentions columns ('Notation',
    'Latex in original', 'Summary', 'Main note name',
    'Processed main note contents') that
    `gather_definition_identification_data` does not produce; it looks
    carried over from a different database -- confirm and revise.

    If a "new" note has the same processed content as a pre-existing
    note and anything is different about the "new" note, then update
    the row of the existing note. In particular, the following are updated:
    - Time modified (set to current time)
    - Notation (overwritten)
    - Latex in original (overwritten)
    - Summary (overwritten)
    - Main note name (overwritten)
    - Processed main note contents (overwritten)
    
    This method assumes that all the processed content in the
    CSV file are all distinct if the CSV file exists.
    """
    # Nothing to add.
    if not notes:
        return
    file = Path(file)
    # Existing database, or `None` if the CSV file has not been created yet.
    ddf = pd.read_csv(file) if os.path.exists(file) else None
    new_df = gather_definition_identification_data(vault, notes)
    # None of the notes yielded data (e.g. none of them exist).
    if new_df.empty:
        return
    # TODO: the column list is still empty.
    cols = [
        
    ]