In [1]:
from oltk.dataset import VaultDataset
from oltk.features import extract_header_props
from oltk.utils.re_constants import JINJA_EXPR, TEMPLATER_EXPR
import re
import polars as pl

In [2]:
# NOTE(review): absolute, machine-specific path — presumably has to be adjusted on any other checkout.
vault = VaultDataset("/root/igkh/codes/ObsidianLTK/examples/data/DataScienceNotes")

In [3]:
# Build step of the dataset — presumably reads/parses the vault directory; confirm against VaultDataset.
vault.construct()

In [4]:
# Presumably a polars DataFrame with a "text" column (later cells use pl.col("text")) — TODO confirm.
# NOTE(review): the saved output of this cell logs "Error reading" with an empty file path twice;
# worth checking which entries resolve to no path.
samples = vault.get_v()

Error reading : [Errno 2] No such file or directory: ''
Error reading : [Errno 2] No such file or directory: ''


In [5]:
def tidy_up_template(content: str) -> str:
    """Mask template expressions in a note body.

    Matches of ``JINJA_EXPR`` are substituted first, then matches of
    ``TEMPLATER_EXPR`` in the already-substituted text. Both kinds are
    replaced by the same ``__TEMPLATE_IMPUTATION__`` placeholder.
    """
    for template_pattern in (JINJA_EXPR, TEMPLATER_EXPR):
        content = re.sub(template_pattern, "__TEMPLATE_IMPUTATION__", content)
    return content

In [6]:
# Swap the raw "text" column for a copy whose template expressions are masked.
samples = samples.with_columns(
    text_without_templates=pl.col("text").map_elements(
        tidy_up_template,
        return_dtype=str,
    )
).drop("text")

In [7]:
def extract_tags_and_date(x):
    """Pull the ``tags`` and ``date`` header properties out of a note.

    Returns a dict with exactly two keys:
    ``"tags"`` — the header's ``tags`` value, or an empty list when the key is absent;
    ``"date"`` — the header's ``date`` value passed through ``str``, or ``""`` when absent.
    """
    header = extract_header_props(x)
    tags = header.get("tags", [])
    date_text = str(header.get("date", ""))
    return {"tags": tags, "date": date_text}


In [8]:
# Attach a struct column "stat" holding each note's tags and its date as a string.
samples = samples.with_columns(
    stat=pl.col("text_without_templates").map_elements(
        extract_tags_and_date,
        return_dtype=pl.Struct({"tags": pl.List(str), "date": str}),
    )
)

In [155]:
# One row per (tag, note): unpack the struct, explode the tag list, parse the date string.
# NOTE(review): the saved output contains years such as 0007 and 0002, so some date strings are
# presumably not year-first; an explicit format may be needed — confirm against the raw headers.
instances = (
    samples
    .select("stat")
    .unnest("stat")
    .explode("tags")
    .with_columns(inferred_date=pl.col("date").str.to_date(strict=False))
    .drop("date")
)

In [156]:
# Display the exploded (tags, inferred_date) rows.
instances

tags,inferred_date
str,date
"""DevOPS""",2025-10-28
"""NLP""",0007-10-25
"""obsidian_meta""",0007-10-25
"""MathModeling""",2025-10-27
"""Search""",2025-10-27
"""STATS""",
"""RL""",
"""accelerate""",0002-10-25
"""py""",0002-10-25
"""RL/Bandits""",


In [176]:
def delta_cum_stat_by_ts(df:pl.DataFrame, val_col, ts_col, delta, min_ts=None, max_ts=None, cold_start=True):
    """Build cumulative per-category instance counts on a regular date grid.

    Parameters
    ----------
    df
        Frame holding at least ``val_col`` and ``ts_col``.
    val_col
        Name of the category column; every distinct value becomes one output column.
    ts_col
        Name of the timestamp column. The grid is built with ``pl.date_range``,
        so this is presumably a Date column — confirm for other dtypes.
    delta
        Bucket interval forwarded to ``pl.date_range`` (e.g. ``"1d"``).
    min_ts, max_ts
        Optional window bounds. Each is clamped to the observed range of
        ``ts_col``; when omitted the observed minimum / maximum is used.
    cold_start
        ``True``: rows whose timestamp is before ``min_ts`` (or null) are
        discarded, so every count starts from zero.
        ``False``: null timestamps are replaced by ``min_ts`` and earlier rows
        are kept; the forward as-of join assigns them to the first bucket.

    Returns
    -------
    pl.DataFrame
        Wide frame with a ``__ts_bucket__`` column and one column per distinct
        ``val_col`` value holding the running total up to and including that bucket.

    Notes
    -----
    NOTE(review): rows whose ``val_col`` is null presumably never match in the
    left join onto the grid (polars joins do not match nulls by default), so
    the ``null`` column would stay at zero — confirm whether that is intended.
    """
    # Clamp the requested window to what the data actually covers.
    if min_ts is None:
        min_ts = df[ts_col].min()
    else:
        min_ts = pl.max_horizontal(min_ts, df[ts_col].min())
    if max_ts is None:
        max_ts = df[ts_col].max()
    else:
        max_ts = pl.min_horizontal(max_ts, df[ts_col].max())

    obs_range = pl.DataFrame(
        pl.date_range(min_ts, max_ts, delta, eager=True, closed="both").alias("__ts_bucket__")
    )

    if not cold_start:
        df_filtered = df.with_columns(pl.col(ts_col).fill_null(min_ts))
    else:
        df_filtered = df.filter(pl.col(ts_col) >= min_ts)

    df_filtered = df_filtered.select([ts_col, val_col]).filter(pl.col(ts_col) <= max_ts)

    # Count instances per (category, exact timestamp); join_asof needs the key sorted.
    dense_count_before_buckets = (
        df_filtered
        .group_by([val_col, ts_col])
        .agg(pl.len().alias("__instance_cnt__"))
        .sort(ts_col)
    )

    # Assign each timestamp to the first bucket edge at or after it.
    bucketized_count = dense_count_before_buckets.join_asof(
        obs_range,
        left_on=ts_col,
        right_on="__ts_bucket__",
        strategy="forward",
        tolerance=None,
    ).drop(ts_col)

    # Several timestamps of one category can land in the same bucket (delta coarser
    # than the data, or pre-window rows when cold_start is False). Collapse them to a
    # single row per (bucket, category); otherwise the left join below duplicates grid
    # rows and the final pivot sums partial running totals, inflating the result.
    bucketized_count = bucketized_count.group_by(["__ts_bucket__", val_col]).agg(
        pl.col("__instance_cnt__").sum()
    )

    # Full (bucket x category) grid so that buckets without events show up as zeros.
    grid = obs_range.join(df.select(val_col).unique(subset=val_col), how="cross")

    sparse_count_bucketized = grid.join(
        bucketized_count, on=["__ts_bucket__", val_col], how="left"
    ).with_columns(pl.col("__instance_cnt__").fill_null(0))

    sparse_cum_count_bucketized = sparse_count_bucketized.with_columns(
        pl.col("__instance_cnt__")
        .cum_sum()
        .over(val_col, order_by="__ts_bucket__")
        .alias("__cum_cnt__")
    ).drop("__instance_cnt__")

    return sparse_cum_count_bucketized.pivot(
        on=val_col,
        index="__ts_bucket__",
        values="__cum_cnt__",
        aggregate_function="sum",
    )

In [177]:
# Daily cumulative tag counts, window limited to calendar year 2025.
stat = delta_cum_stat_by_ts(
    instances,
    val_col="tags",
    ts_col="inferred_date",
    delta="1d",
    min_ts=pl.date(2025, 1, 1),
    max_ts=pl.date(2025, 12, 31),
)

In [110]:
# Widen polars' display limits: up to 128 table rows and 128 characters per string cell.
pl.Config.set_tbl_rows(128)
pl.Config.set_fmt_str_lengths(128)

polars.config.Config

In [178]:
# Display the cumulative counts: one row per time bucket, one column per tag.
stat

__ts_bucket__,obsidian_meta,MATH/OP,accelerate,TODO,ML/Trees,ALGO/Trees,NLP,null,MATH,MATH/FN,RL,MathModeling,STATS/BayessianInference,DevOPS,CV,WISDOM,RL/PG,GraphML,Graphs,py,STATS,CXX,iOS,AIGen,ML,RL/Bandits,Search
date,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
2025-01-01,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2025-01-02,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2025-01-03,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2025-01-04,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2025-01-05,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2025-01-06,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2025-01-07,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2025-01-08,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2025-01-09,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2025-01-10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [45]:
!pip install plotly

Collecting plotly
  Downloading plotly-6.5.2-py3-none-any.whl.metadata (8.5 kB)
Collecting narwhals>=1.15.1 (from plotly)
  Downloading narwhals-2.15.0-py3-none-any.whl.metadata (13 kB)
Downloading plotly-6.5.2-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m1.9 MB/s[0m  [33m0:00:05[0mm0:00:01[0m00:01[0m
[?25hDownloading narwhals-2.15.0-py3-none-any.whl (432 kB)
Installing collected packages: narwhals, plotly
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [plotly]2m1/2[0m [plotly]
[1A[2KSuccessfully installed narwhals-2.15.0 plotly-6.5.2


In [None]:
def cat_groups_resolver(df: pl.DataFrame, src_col:str, dst_col:str, groups):
    """Keep only the listed categories and merge grouped ones under a joint label.

    Parameters
    ----------
    df
        Input frame containing ``src_col``.
    src_col
        Column holding the original category values.
    dst_col
        Name of the column that receives the resolved category. May equal
        ``src_col``, in which case the column is rewritten in place.
    groups
        Iterable whose items are either a single category (kept under its own
        name) or a list/tuple of categories, which are merged under the label
        ``" & ".join(members)``.

    Returns
    -------
    pl.DataFrame
        Rows of ``df`` whose ``src_col`` value appears anywhere in ``groups``,
        with ``dst_col`` holding the resolved label and ``src_col`` removed
        (unless it is the same column as ``dst_col``).
    """
    mapping = {}
    use_cats = set()

    for member in groups:
        if isinstance(member, (list, tuple)):
            merged_label = " & ".join(member)
            for submem in member:
                mapping[submem] = merged_label
            use_cats.update(member)
        else:
            # A bare category must be added whole: set("DevOPS") would split the
            # string into single characters and the category would be filtered out.
            use_cats.add(member)

    # With no merged groups there is nothing to replace; keep the values as they are.
    resolved = pl.col(src_col).replace(mapping) if mapping else pl.col(src_col)

    result = (
        df
        .filter(pl.col(src_col).is_in(list(use_cats)))
        .with_columns(resolved.alias(dst_col))
    )

    # Dropping src_col when it doubles as dst_col would discard the result itself.
    if dst_col == src_col:
        return result
    return result.drop(src_col)


{'A': 'A & B', 'B': 'A & B', 'E': 'E & F', 'F': 'E & F'}
{'C', 'B', 'D', 'G', 'E', 'A', 'F'}
