In [1]:
from oltk.dataset import VaultDataset
from oltk.features import extract_header_props

I personally do not own an obsidian grapth big enough to check certain features.
I do not own any rights on the repository and it's contents by any means, the following data is in full posession of it's rightful author [Luke Tianpei](https://github.com/TianpeiLuke)

In [2]:
# !mkdir ./tmp_data
# %cd ./tmp_data
# !git clone https://github.com/TianpeiLuke/SelfStudyNotes
# %cd ../

In [3]:
vault = VaultDataset("./tmp_data/SelfStudyNotes/obsidian_notes")

In [4]:
%%time
vault.construct()

CPU times: user 652 ms, sys: 272 ms, total: 924 ms
Wall time: 886 ms


In [5]:
%%time
samples = vault.get_v()

Error reading : [Errno 2] No such file or directory: ''
Error reading : [Errno 2] No such file or directory: ''
CPU times: user 73.8 ms, sys: 62 ms, total: 136 ms
Wall time: 134 ms


In [6]:
from oltk.utils.re_constants import JINJA_EXPR, TEMPLATER_EXPR
import re
import polars as pl

In [7]:
def tidy_up_template(content: str) -> str:
    content = re.sub(JINJA_EXPR, "__TEMPLATE_IMPUTATION__", content)
    content = re.sub(TEMPLATER_EXPR, "__TEMPLATE_IMPUTATION__", content)
    return content

In [8]:
samples = samples.with_columns(
    pl.col("text")
    .map_elements(
        tidy_up_template,
        return_dtype=str,
    )
    .alias("text_without_templates")
).drop(pl.col("text"))

In [9]:
def extract_tags_and_date(x):
    props = extract_header_props(x)
    return {"tags": props.get("tags", []), "date": str(props.get("date of note", ""))}

In [10]:
samples = samples.with_columns(
    pl.col("text_without_templates")
    .map_elements(
        lambda x: extract_tags_and_date(x),
        return_dtype=pl.Struct({"tags": pl.List(str), "date": str}),
    )
    .alias("stat")
)

In [11]:
samples

id,title,text_without_templates,stat
u32,str,str,struct[2]
0,"""Talk with my girl""","""--- tags:  - restaurant regio…","{[""restaurant""],""2025-07-24""}"
1,"""Skin on Leg Arm and Food""","""--- tags:  - health  - skin …","{[""health"", ""skin""],""2025-02-02""}"
2,"""Skin on Head""","""--- tags:  - health  - skin …","{[""health"", ""skin""],""2025-02-02""}"
3,"""Flowers""","""--- tags: aliases: date of n…","{null,""2024-12-23""}"
4,"""Chocolate Box Godvia""","""--- tags: aliases: date of n…","{null,""2025-02-05""}"
…,…,…,…
4177,"""Blog Snippet Card""","""--- tags:  - excerpt aliases:…","{[""excerpt""],""__TEMPLATE_IMPUTATION__""}"
4178,"""Literature Summary Note Card""","""--- tags: - summary aliases…","{[""summary""],""__TEMPLATE_IMPUTATION__""}"
4179,"""Literature Reading Question Ca…","""--- tags: - critical_thinki…","{[""critical_thinking/raise_questions""],""__TEMPLATE_IMPUTATION__""}"
4180,"""Model Card""","""--- tags: - model aliases: …","{[""model""],""__TEMPLATE_IMPUTATION__""}"


In [12]:
instances = (
    samples.select("stat")
    .unnest("stat")
    .explode("tags")
    .with_columns(pl.col("date").str.to_date(strict=False).alias("inferred_date"))
    .drop("date")
)

In [13]:
pl.Config.set_tbl_rows(50)
pl.Config.set_fmt_str_lengths(128)

polars.config.Config

In [14]:
instances.select(pl.col("tags").value_counts()).unnest("tags").sort(
    by="count", descending=True
).limit(50)

tags,count
str,u32
"""concept""",3140
"""math/functional_analysis""",366
"""code""",324
"""code_snippet""",317
"""math/differential_geometry""",301
"""book""",266
"""math/linear_algebra""",237
"""math/topology""",222
"""math/stochastic_process""",221
"""machine_learning/models""",184


In [15]:
from oltk.iziviz import progress_gem, focus_chladni
import plotly.io as pio

In [16]:
pio.templates.default = "plotly_dark"

In [17]:
progress_gem(
    instances,
    "tags",
    "inferred_date",
    ts_delta="2w",
    groups=[
        ["code", "code_package"],
        ["large_language_models", "natural_language_processing"],
        ["math/topology", "math/graph_theory"],
        ["optimization/algorithm", "optimization/theory"],
        "math/differential_geometry",
        "math/real_analysis",
    ],
).show()

In [20]:
focus_chladni(
    instances,
    "tags",
    "inferred_date",
    ts_delta="2w",
    groups=[
        ["code", "code_package"],
        ["large_language_models", "natural_language_processing"],
        ["math/topology", "math/graph_theory"],
        ["optimization/algorithm", "optimization/theory"],
        "math/differential_geometry",
        "math/real_analysis",
    ],
).show()

In [19]:
# !rm -rf ./tmp_data