In [18]:
from pathlib import Path
import regex
from pprint import pformat

# Location of the markdown dev-notes file that the rest of the notebook parses.
# NOTE(review): this is an absolute path on one user's machine, so the notebook
# will not run elsewhere without editing it — consider deriving it from a
# project-root constant instead.
path = Path(
    "/Users/jonathan/mres_thesis/wine_analysis_hplc_uv/src/wine_analysis_hplc_uv/notes/devnotes.md"
)


def get_notes_from_path(path) -> list[str]:
    """
    Read the notes file at `path` and split it into one string per note.

    A note boundary is a blank line that directly follows a closing ']' and is
    directly followed by a 'YYYY-MM-DD HH:MM:SS' timestamp.

    Parameters
    ----------
    path : path-like
        Location of the notes file to read.

    Returns
    -------
    list[str]
        The individual note strings, in file order.

    Raises
    ------
    ValueError
        If any resulting note (including an empty one) does not start with '2'
        or does not end with ']'.
    """
    with open(path, "r") as f:
        # Surrounding whitespace (typically a trailing newline at end of file)
        # is not part of any note and would otherwise fail the validation below.
        string = f.read().strip()

    pattern = r"(?<=\])\n\n(?=\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})"
    notes = regex.split(pattern=pattern, string=string)

    # simple validation of the regex split. All notes should start with a '2' and end with a ']'
    # startswith/endswith are used instead of indexing so that an empty note
    # (e.g. from an empty file) raises the ValueError below, not an IndexError.
    for note in notes:
        if not note.startswith("2"):
            raise ValueError("note doesnt start with a 2")
        if not note.endswith("]"):
            raise ValueError("note doesnt end with a ']'")

    return notes


# Read the dev-notes file and split it into a list of raw note strings.
notes = get_notes_from_path(path=path)


In [19]:
def extract_note_fields(note: str) -> dict[str, str] | None:
    """
    Pull the named fields out of a single raw note string.

    The pattern is assembled from ordered fragments (datetime, separator,
    title, content, tags) and matched from the start of `note`.

    Returns
    -------
    dict[str, str] | None
        The named groups 'datetime', 'title', 'content' and 'tags' when the
        note matches the combined pattern, otherwise None.
    """
    # Fragment order matters: the dict values are concatenated as-is, and the
    # numbered backreferences (\2, \3) rely on the capture-group positions.
    pattern_parts = {
        "datetime": r"(?P<datetime>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})",
        "separator": r" - ",
        "title": r"(?P<title>.+?(?=\.|\?))",
        "content": r"(?<=\2)\. (?P<content>.+?)(?= tag)",
        "tags": r"(?<=\3) tags: (?P<tags>.*)$",
    }
    full_pattern = "".join(pattern_parts.values())

    found = regex.match(full_pattern, note)

    return found.groupdict() if found else None


def check_notes_without_matches(notes, decomposed_notes):
    # number of notes that didnt match
    n_no_match = len(notes) - len([note for note in decomposed_notes if note])

    # index of each that didnt match
    no_match_indexes = [idx for idx, note in enumerate(decomposed_notes) if not note]
    # the notes themselves
    no_match_notes = [notes[i] for i in no_match_indexes]

    if no_match_indexes:
        with ValueError as e:
            e.add_note(f"number of notes without match: {n_no_match}")
            e.add_note(f"no match notes indexes: {no_match_indexes}")
            e.add_note(
                f"notes that didnt match:\n\n{pformat(dict(zip(no_match_indexes, no_match_notes)))}"
            )


def decompose_notes(notes) -> list[dict[str, str]]:
    """
    Decompose note strings in `notes` into a list of dicts of prespecified fields: 'datetime', 'title', 'content', 'tags'

    Parameters
    ----------
    notes : list[str]
        The raw note strings.

    Returns
    -------
    list[dict[str, str]]
        One field dict per note, in the same order as `notes`. An empty
        `notes` list yields an empty list.

    Raises
    ------
    ValueError
        If any note could not be decomposed.
    """
    decomposed_notes = [extract_note_fields(note) for note in notes]
    check_notes_without_matches(decomposed_notes=decomposed_notes, notes=notes)

    # Backstop for the declared return type: every entry must be a field dict.
    # `all` (not `any`) is required so that a single unmatched note is caught,
    # rather than only the case where every note failed.
    if not all(decomposed_notes):
        raise ValueError("a match was not found in a note")

    return decomposed_notes


# Split every raw note into its 'datetime', 'title', 'content' and 'tags' fields.
decomp_notes = decompose_notes(notes=notes)


In [20]:
# validate datetimes

from dateutil.parser import parse, ParserError


def validate_datetimes(notes: list[dict[str, str]]):
    """
    Rewrite each note's 'datetime' value as an ISO-format string, in place.

    Every note is attempted; notes whose datetime cannot be parsed keep their
    original value and are collected so they can all be reported together.

    Returns
    -------
    list[dict[str, str]]
        The same `notes` list, with 'datetime' values replaced.

    Raises
    ------
    ValueError
        If one or more datetimes could not be parsed; the message lists the
        'datetime' and 'title' of each offending note.
    """
    unparsed = []

    for entry in notes:
        raw = entry["datetime"]
        try:
            entry["datetime"] = parse(raw).isoformat()
        except ParserError:
            # leave the value as it was and remember the note for the report
            entry["datetime"] = raw
            unparsed.append(entry)

    if not unparsed:
        return notes

    n_no_parse = len(unparsed)
    date_titles = []
    for entry in unparsed:
        summary = {
            key: value for key, value in entry.items() if key in ["title", "datetime"]
        }
        date_titles.append(summary)

    raise ValueError(
        f"Some note datetimes were unable to be parsed. {n_no_parse} were not parsed. They are as follows:\n\n{pformat(date_titles)}"
    )


# Convert each note's 'datetime' to ISO format in place; as the last expression
# in the cell, the returned list is what gets displayed below.
validate_datetimes(notes=decomp_notes)


[{'datetime': '2023-09-07T13:18:55',
  'title': 'Adding nbstripout as precommit',
  'content': 'Have added [nbstripout](https://github.com/kynan/nbstripout) as a pre-commit hook to ensure that notebooks are never commited with output. This hook will check if there is output, and clear it if so, simply requiring you to add the change before commiting again.',
  'tags': '[wine_analysis_hplc_uv, notebooks, nbstripout, precommits, git, commit, log, project_management]'},
 {'datetime': '2024-05-03T15:24:29',
  'title': 'Designing a Query API',
  'content': "I need specific logic for each table and column type. this is divided into: whether the input is an iterable or a scalar. What are the modes of operation? If no argument is submitted, dont add a WHERE to the query. if its an iterable, use IN, and if its a scalar, use '='. Further logic is needed for the wavelength and mins columns, where ease of use requires the input of ranges. I could continue to subset the relation as it is lazily eva

In [21]:
# add a 'filename' field derived from each title: lowercased, stripped, trailing punctuation removed, spaces replaced with underscores

from wine_analysis_hplc_uv import definitions


def add_filenames(notes: list[dict[str, str]]) -> list[dict[str, str]]:
    """
    Add a 'filename' pair to every note, built from its 'title' string.

    The title is lowercased and stripped, one trailing '?', ',' or '.' is
    dropped, spaces become underscores and '.md' is appended. Each name is
    validated by creating (and immediately deleting) a file of that name in a
    scratch directory next to the project root.

    Parameters
    ----------
    notes : list[dict[str, str]]
        Note dicts containing at least a 'title' key. Mutated in place.

    Returns
    -------
    list[dict[str, str]]
        The same `notes` list, each dict now holding a 'filename' key.

    Raises
    ------
    ValueError
        If a title is empty once stripped.
    OSError
        If a file with the derived name cannot be created; the offending name
        is attached as an exception note.
    """
    temp_dir: Path = Path(definitions.ROOT_DIR).parent / "test_valid_names"

    # Created outside the try block so the cleanup below only runs once the
    # directory is known to exist. exist_ok tolerates an empty directory left
    # behind by an interrupted earlier run.
    temp_dir.mkdir(exist_ok=True)

    try:
        for note in notes:
            title = note["title"]

            # clean
            name = title.lower().strip()
            if not name:
                raise ValueError(f"note has an empty title: {title!r}")

            # remove any trailing punctuation if present
            if name.endswith(("?", ",", ".")):
                name = name[:-1]

            # replace spaces and add ".md"
            name = name.replace(" ", "_") + ".md"

            # test the new file
            temp_outpath = temp_dir / name

            try:
                temp_outpath.touch(exist_ok=False)
            except OSError as e:
                e.add_note(f"file name potentially  invalid: {name}")
                raise
            finally:
                # missing_ok: when touch() failed there is nothing to delete,
                # and a FileNotFoundError here would mask the real error.
                temp_outpath.unlink(missing_ok=True)

            # only record names that passed the write test
            note["filename"] = name
    finally:
        temp_dir.rmdir()

    return notes


# Attach a 'filename' to every note dict in place; as the last expression in
# the cell, the returned list is what gets displayed below.
add_filenames(notes=decomp_notes)


[{'datetime': '2023-09-07T13:18:55',
  'title': 'Adding nbstripout as precommit',
  'content': 'Have added [nbstripout](https://github.com/kynan/nbstripout) as a pre-commit hook to ensure that notebooks are never commited with output. This hook will check if there is output, and clear it if so, simply requiring you to add the change before commiting again.',
  'tags': '[wine_analysis_hplc_uv, notebooks, nbstripout, precommits, git, commit, log, project_management]',
  'filename': 'adding_nbstripout_as_precommit.md'},
 {'datetime': '2024-05-03T15:24:29',
  'title': 'Designing a Query API',
  'content': "I need specific logic for each table and column type. this is divided into: whether the input is an iterable or a scalar. What are the modes of operation? If no argument is submitted, dont add a WHERE to the query. if its an iterable, use IN, and if its a scalar, use '='. Further logic is needed for the wavelength and mins columns, where ease of use requires the input of ranges. I could 

In [24]:
def parse_tags(notes):
    """
    Convert the tags from strings to lists of strings.

    Each note's 'tags' value is expected to look like '[tag_a, tag_b, ...]'.
    The first and last characters (the brackets) are dropped, one trailing
    comma is removed if present, and the remainder is split on commas. The
    result is stored under a new 'cleaned_tags' key; 'tags' is left untouched.

    Parameters
    ----------
    notes : list[dict]
        Note dicts containing 'tags' and 'title' keys. Mutated in place.

    Returns
    -------
    list[dict]
        The same `notes` list, each dict now holding a 'cleaned_tags' list.
        A note with no tags (e.g. '[]') gets an empty list.

    Raises
    ------
    ValueError
        If any parsed tag contains a space, which indicates that the tag
        string was not comma separated as expected.
    """
    for note in notes:
        tags = note["tags"].strip()
        # remove brackets
        tags_without_brackets = tags[1:-1].strip()

        if not tags_without_brackets:
            # nothing between the brackets: no tags, and nothing to index into
            cleaned_tags = []
        else:
            # if trailing comma, remove
            if tags_without_brackets.endswith(","):
                tags_without_brackets = tags_without_brackets[:-1]

            cleaned_tags = [tag.strip() for tag in tags_without_brackets.split(",")]

        for tag in cleaned_tags:
            if " " in tag:
                raise ValueError(
                    f"something went wrong when parsing {note['title']}, space detected.."
                )

        note["cleaned_tags"] = cleaned_tags

    return notes


# parse_tags adds 'cleaned_tags' to each dict in place and returns the same
# list it was given, so this rebinding leaves decomp_notes pointing at the
# same object.
decomp_notes = parse_tags(notes=decomp_notes)


In [23]:
import frontmatter

# Build one frontmatter Post per note: the note's 'content' becomes the body
# and every other field becomes front-matter metadata.
# NOTE(review): assumes python-frontmatter's Post(content, **metadata)
# signature, and that all remaining fields are wanted as metadata — confirm.
frontmatter_objs = []

for note in decomp_notes:
    metadata = {key: value for key, value in note.items() if key != "content"}
    frontmatter_objs.append(frontmatter.Post(note["content"], **metadata))


ModuleNotFoundError: No module named 'frontmatter'