In [None]:
%load_ext autoreload
%autoreload 2
import polars as pl
import frontmatter
from pathlib import Path
from dateutil.parser import parse as dateutil_parse, ParserError as dateutil_ParserError

# Root folder of the vault whose notes are inspected.
vault_path = Path("/Users/jonathan/001_obsidian_vault")

# Glob patterns (relative to the vault root) of the folders to scan, in the
# order their matches should appear in `paths`.
note_globs = (
    "zettel/*.md",
    "to_be_processed/*.md",
    "z_literature_notes/*.md",
)

# Flatten the per-folder matches into a single list of markdown files.
paths = [note for pattern in note_globs for note in vault_path.glob(pattern)]

print(len(paths))


In [None]:
import markdown_parser

# Parse every collected note file.
# NOTE(review): markdown_parser is not shown here (presumably a project-local
# module). Later cells read each element of `posts` as d["path"] and
# d["post"].get("cdt"), so parse() presumably returns one such dict per note —
# confirm against the module.
posts = markdown_parser.parse(paths=paths)


In [None]:
# checking for all-digit cdts with an incorrect number of digits

import re


def _reformat_dates_starting_with_22(
    dates: list[dict[str, str]],
) -> list[dict[str, str]]:
    """
    Add an ISO-style "new_cdt" to every record, derived from its "cdt".

    A cdt consisting of "22" followed by 12 digits (read as YYYYMMDDHHMM) is
    rewritten as "YYYY-MM-DDTHH:MM:00". Any cdt that does not have this shape
    is copied to "new_cdt" unchanged.

    The records are updated in place and the very same list is returned.
    """
    prefixed_cdt = re.compile(r"^22(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})$")
    iso_template = r"\1-\2-\3T\4:\5:00"

    for record in dates:
        record["new_cdt"] = prefixed_cdt.sub(iso_template, record["cdt"])

    return dates


def reformat_dates_starting_with_22(
    posts: list[dict[str, Path | frontmatter.Post]],
) -> list[dict[str, str]]:
    """
    Find the cdts selected by find_dates_starting_with_22 and attach their
    ISO-style rewrite.

    Args:
        posts: parsed notes. Other cells read each element as d["path"] and
            d["post"], so each dict is keyed by those strings; the value types
            (a path and a frontmatter post) are presumed — verify against
            markdown_parser.parse.

    Returns:
        The records produced by find_dates_starting_with_22 for ``posts``,
        each extended with a "new_cdt" entry by
        _reformat_dates_starting_with_22.

    NOTE(review): find_dates_starting_with_22 is not defined in this cell; it
    must already exist in the kernel namespace when this function is called,
    otherwise a NameError is raised.
    """
    return _reformat_dates_starting_with_22(
        dates=find_dates_starting_with_22(posts=posts)
    )


# Run the "22"-prefix clean-up over all parsed posts; each returned record
# carries its rewritten value under "new_cdt".
dates_starting_with_22_formatted = reformat_dates_starting_with_22(posts=posts)


In [None]:
import re

# observe the string formats
#
# Each cdt format gets a bucket (a list of collected entries) and, where
# applicable, a regular expression describing the format plus a substitution
# template that rewrites it as "YYYY-MM-DDTHH:MM:SS".

# posts without any cdt
no_cdt = []

# digit-only cdts that fit none of the digit patterns below
all_digits = []

# "22" followed by 12 digits (no seconds), e.g. "22202204281129"
regexp_with_22_match_no_seconds = r"22\d{12}"
match_regexp_22_no_seconds = []

# "2" followed by 12 digits (no seconds), e.g. "2202204281129"
regexp_2_match_no_seconds = r"2\d{12}"
match_regexp_2_no_seconds = []

# 10 digits, e.g. 2022071900
regexp_match_10_digits = r"\d{10}"
digits_10 = []

# NOTE: not used by the classification in this cell
not_match_regexp_22 = []

# "YYYY-MM-DD HH:MM", e.g. "2022-04-28 11:29"
space_colon_regexp = r"(\d{4})-(\d{2})-(\d{2}) (\d{2}):(\d{2})"
space_colon_sub = r"\1-\2-\3T\4:\5:00"
space_colons = []

# "YYYY-MM-DD-HH-MM:SS", e.g. "2022-04-28-11-29:30"
no_space_colon_regexp = r"(\d{4})-(\d{2})-(\d{2})-(\d{2})-(\d{2}):(\d{2})"
no_space_colon_sub = r"\1-\2-\3T\4:\5:\6"
no_space_colon = []

# "YYYY-MM-DD-HH-MM-SS", e.g. "2022-04-28-11-29-30"
all_hyphen_regexp = r"(\d{4})-(\d{2})-(\d{2})-(\d{2})-(\d{2})-(\d{2})"
all_hyphen_sub = r"\1-\2-\3T\4:\5:\6"
all_hyphen = []

# "YYYY-MM-DD-HH:MM:SS", e.g. "2022-04-28-11:29:30"
min_sec_colon_regexp = r"(\d{4})-(\d{2})-(\d{2})-(\d{2}):(\d{2}):(\d{2})"
min_sec_colon_sub = r"\1-\2-\3T\4:\5:\6"
min_sec_colon = []

# "YYYY-MM-DD" (date only), e.g. "2022-04-28".
# The template has its own name so it no longer overwrites min_sec_colon_sub.
yyyy_mm_dd_hyphen_regexp = r"(\d{4})-(\d{2})-(\d{2})"
yyyy_mm_dd_hyphen_sub = r"\1-\2-\3T00:00:00"
yyyy_mm_dd_hyphens = []

# everything that fits none of the formats above
other = []
# Sort every post into exactly one bucket according to the shape of its cdt.
#
# Patterns are applied with re.fullmatch: each one describes a complete value,
# whereas re.match only anchors the start and would let longer strings slip
# into the wrong bucket (e.g. the 14-digit "20220428112900" being filed under
# the 13-digit "2 + 12 digits" pattern, or "2022-04-28 11:29:30" being
# rewritten to the malformed "2022-04-28T11:29:00:30"). Values with extra
# trailing content now end up in `other` for manual inspection.
for d in posts:
    path = d["path"]
    cdt = d["post"].get("cdt")

    if not cdt:
        no_cdt.append(path)
        continue

    # cdt may not be a str (depends on what the frontmatter parser returned),
    # so normalise before matching.
    cdt_str = str(cdt)
    record = {"path": path, "cdt": cdt_str}

    if cdt_str.isdigit():
        if re.fullmatch(regexp_with_22_match_no_seconds, cdt_str):
            # prefixed by 22 but no seconds..
            match_regexp_22_no_seconds.append(record)
        elif re.fullmatch(regexp_2_match_no_seconds, cdt_str):
            # prefixed by 2 but no seconds..
            match_regexp_2_no_seconds.append(record)
        elif re.fullmatch(regexp_match_10_digits, cdt_str):
            # 10 digit string
            digits_10.append(record)
        else:
            # all other digit strings: let dateutil interpret them, and fail
            # loudly (with the offending file attached) when it cannot.
            try:
                new_cdt = dateutil_parse(cdt_str).isoformat()
            except (dateutil_ParserError, OverflowError) as e:
                e.add_note(f"while parsing {path}..")
                e.add_note(f"with cdt = {cdt}..")
                raise
            all_digits.append({**record, "new_cdt": new_cdt})
    elif re.fullmatch(space_colon_regexp, cdt_str):
        space_colons.append(
            {
                **record,
                "new_cdt": re.sub(space_colon_regexp, space_colon_sub, cdt_str),
            }
        )
    elif re.fullmatch(no_space_colon_regexp, cdt_str):
        no_space_colon.append(record)
    elif re.fullmatch(all_hyphen_regexp, cdt_str):
        all_hyphen.append(record)
    elif re.fullmatch(min_sec_colon_regexp, cdt_str):
        min_sec_colon.append(record)
    elif re.fullmatch(yyyy_mm_dd_hyphen_regexp, cdt_str):
        yyyy_mm_dd_hyphens.append(record)
    else:
        other.append(record)

# Bucket name -> entries collected by the classification loop above.
# "digits_10" is included so that every bucket the loop fills is reported.
results = {
    "no_cdt": no_cdt,
    "all_digits": all_digits,
    "prefix_22": match_regexp_22_no_seconds,
    "prefix_2": match_regexp_2_no_seconds,
    "digits_10": digits_10,
    "space_colons": space_colons,
    "no_space_colon": no_space_colon,
    "all_hyphens": all_hyphen,
    "min_sec_colon": min_sec_colon,
    "yyyy_mm_dd_hyphen": yyyy_mm_dd_hyphens,
    "other": other,
}

print("results:\n")
for k, v in results.items():
    print(k, len(v))

print("")
print("total_files:", len(paths))

# Every post lands in exactly one bucket, so this is the number of posts that
# were classified; compare it with total_files to spot notes that went missing.
tot_parsed_files = sum(len(cdts) for cdts in results.values())
print("n parsed files", tot_parsed_files)


579 files don't have a cdt. It may instead be in the file name or somewhere within the content. Let's leave them for now and focus on the 'other' bucket.


In [None]:
# Dump every unclassified cdt next to the path of its note for manual inspection.
for unclassified in other:
    note_path, raw_cdt = unclassified["path"], unclassified["cdt"]
    print(note_path, "\n", raw_cdt)
