Bootstrap: repo root + fixed paths + basic logger

In [1]:
# [CELL 01-00] Bootstrap: repo root + fixed paths + basic logger

import os
import sys
import json
import time
import uuid
import hashlib
from pathlib import Path
from datetime import datetime
from typing import Any, Dict, List

import numpy as np
import pandas as pd

# Record and echo when and where the bootstrap cell ran.
# NOTE: t0 is a datetime here; later cells rebind t0 to the float returned by cell_start().
t0 = datetime.now()
print(f"[CELL 01-00] start={t0.isoformat(timespec='seconds')}")
print("[CELL 01-00] CWD:", Path.cwd().resolve())

def find_repo_root(start: Path) -> Path:
    """Locate the repository root by searching upward for PROJECT_STATE.md.

    Args:
        start: Directory to begin from; it is resolved before searching.

    Returns:
        The nearest directory (``start`` itself or one of its ancestors)
        that contains a ``PROJECT_STATE.md`` entry.

    Raises:
        RuntimeError: If neither ``start`` nor any ancestor contains the marker.
    """
    origin = start.resolve()
    lineage = (origin, *origin.parents)
    # Nearest match wins: the lineage is ordered from `origin` outwards.
    found = next((d for d in lineage if (d / "PROJECT_STATE.md").exists()), None)
    if found is None:
        raise RuntimeError("Could not find PROJECT_STATE.md. Open notebook from within the repo.")
    return found

# Resolve the repository root once, starting from the current working directory.
REPO_ROOT = find_repo_root(Path.cwd())
print("[CELL 01-00] REPO_ROOT:", REPO_ROOT)

# Fixed project locations; every entry is derived from REPO_ROOT.
PATHS = {
    "PROJECT_STATE": REPO_ROOT / "PROJECT_STATE.md",
    "META_REGISTRY": REPO_ROOT / "meta.json",
    "DATA_RAW": REPO_ROOT / "data" / "raw",
    "DATA_INTERIM": REPO_ROOT / "data" / "interim",
    "DATA_PROCESSED": REPO_ROOT / "data" / "processed",
    "REPORTS": REPO_ROOT / "reports",
}
# Echo each resolved path so the run log shows where files are read and written.
for k, v in PATHS.items():
    print(f"[CELL 01-00] {k}={v}")

def cell_start(cell_id: str, title: str, **kwargs: Any) -> float:
    """Print a cell's opening log lines and return its start time.

    Emits a blank line, the title line, a ``start=`` line with the current
    local time (second precision), then one ``key=value`` line per keyword
    argument, each prefixed with ``[cell_id]``.

    Args:
        cell_id: Tag placed in square brackets at the start of every line.
        title: Human-readable cell title.
        **kwargs: Extra context values to log, in the order given.

    Returns:
        ``time.time()`` captured on entry, intended to be passed to ``cell_end``.
    """
    started_at = time.time()
    tag = f"[{cell_id}]"
    print(f"\n{tag} {title}")
    print(f"{tag} start={datetime.now().isoformat(timespec='seconds')}")
    for name in kwargs:
        print(f"{tag} {name}={kwargs[name]}")
    return started_at

def cell_end(cell_id: str, t0: float, **kwargs: Any) -> None:
    """Print a cell's closing log lines.

    Emits one ``key=value`` line per keyword argument, then the elapsed
    wall-clock seconds since ``t0`` (two decimals) and a final ``done`` line,
    each prefixed with ``[cell_id]``.

    Args:
        cell_id: Tag placed in square brackets at the start of every line.
        t0: Start time as returned by ``cell_start`` (a ``time.time()`` value).
        **kwargs: Summary values to log, in the order given.
    """
    tag = f"[{cell_id}]"
    for name, value in kwargs.items():
        print(f"{tag} {name}={value}")
    elapsed = time.time() - t0
    print(f"{tag} elapsed={elapsed:.2f}s")
    print(f"{tag} done")

print("[CELL 01-00] done")


[CELL 01-00] start=2026-01-06T21:26:07
[CELL 01-00] CWD: C:\anonymous-users-mooc-session-meta\notebooks
[CELL 01-00] REPO_ROOT: C:\anonymous-users-mooc-session-meta
[CELL 01-00] PROJECT_STATE=C:\anonymous-users-mooc-session-meta\PROJECT_STATE.md
[CELL 01-00] META_REGISTRY=C:\anonymous-users-mooc-session-meta\meta.json
[CELL 01-00] DATA_RAW=C:\anonymous-users-mooc-session-meta\data\raw
[CELL 01-00] DATA_INTERIM=C:\anonymous-users-mooc-session-meta\data\interim
[CELL 01-00] DATA_PROCESSED=C:\anonymous-users-mooc-session-meta\data\processed
[CELL 01-00] REPORTS=C:\anonymous-users-mooc-session-meta\reports
[CELL 01-00] done


Reproducibility: seed everything

In [2]:
# [CELL 01-01] Reproducibility: seed everything

t0 = cell_start("CELL 01-01", "Seed everything")

GLOBAL_SEED = 20260106

def seed_everything(seed: int) -> None:
    """Seed the random number generators this notebook uses.

    Seeds Python's ``random`` module and NumPy's legacy global generator
    (``np.random.seed``) with the same value.

    Args:
        seed: Integer seed applied to both generators.
    """
    import random as py_random

    for reseed in (py_random.seed, np.random.seed):
        reseed(seed)

seed_everything(GLOBAL_SEED)

cell_end("CELL 01-01", t0, seed=GLOBAL_SEED)



[CELL 01-01] Seed everything
[CELL 01-01] start=2026-01-06T21:26:07
[CELL 01-01] seed=20260106
[CELL 01-01] elapsed=0.00s
[CELL 01-01] done


JSON IO (atomic) + hashing helpers

In [3]:
# [CELL 01-02] JSON IO (atomic) + hashing helpers

t0 = cell_start("CELL 01-02", "JSON IO + hashing")

def write_json_atomic(path: Path, obj: Any, indent: int = 2) -> None:
    """Serialize ``obj`` as UTF-8 JSON to ``path`` via write-then-rename.

    The payload is first written to a uniquely named temporary file next to
    ``path`` and then moved over ``path`` with ``Path.replace``, so readers
    never see a half-written file. If serialization or the rename fails, the
    temporary file is removed and the original exception is re-raised; an
    existing ``path`` is left untouched in that case.

    Args:
        path: Destination file; missing parent directories are created.
        obj: Any object ``json.dump`` can serialize.
        indent: Indentation width passed to ``json.dump``.

    Raises:
        TypeError / ValueError: Propagated from ``json.dump`` for
            unserializable input.
        OSError: Propagated from file creation or the rename.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    # Same directory as the target so the final rename stays on one filesystem.
    tmp = path.with_suffix(path.suffix + f".tmp_{uuid.uuid4().hex}")
    try:
        with tmp.open("w", encoding="utf-8") as f:
            json.dump(obj, f, ensure_ascii=False, indent=indent)
        tmp.replace(path)
    except BaseException:
        # Best-effort cleanup: never let a failed unlink mask the real error.
        try:
            tmp.unlink()
        except OSError:
            pass
        raise

def read_json(path: Path) -> Any:
    """Load and return the JSON document stored at ``path``.

    Args:
        path: File to read; decoded as UTF-8.

    Returns:
        The parsed JSON value.

    Raises:
        RuntimeError: If ``path`` does not exist.
    """
    if path.exists():
        with path.open("r", encoding="utf-8") as handle:
            return json.load(handle)
    raise RuntimeError(f"Missing JSON file: {path}")

def sha256_file(path: Path, chunk_size: int = 1024 * 1024) -> str:
    """Return the SHA-256 hex digest of a file, read in fixed-size chunks.

    Args:
        path: File to hash; opened in binary mode.
        chunk_size: Number of bytes requested per read (default 1 MiB).

    Returns:
        Lower-case hexadecimal digest string.
    """
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        # iter() with a sentinel stops at the first empty read (EOF).
        for piece in iter(lambda: stream.read(chunk_size), b""):
            digest.update(piece)
    return digest.hexdigest()

def assert_nonempty_df(df: pd.DataFrame, name: str) -> None:
    """Fail fast unless ``df`` is a pandas DataFrame with at least one row.

    Args:
        df: Object to validate.
        name: Label used in the error message.

    Raises:
        RuntimeError: If ``df`` is ``None``, not a DataFrame, or has zero rows.
    """
    # isinstance() is False for None, so one check covers both invalid cases.
    is_frame = isinstance(df, pd.DataFrame)
    if is_frame and df.shape[0] != 0:
        return
    raise RuntimeError(f"{name} is empty or invalid DataFrame")

cell_end("CELL 01-02", t0)



[CELL 01-02] JSON IO + hashing
[CELL 01-02] start=2026-01-06T21:26:07
[CELL 01-02] elapsed=0.00s
[CELL 01-02] done


Run tagging + report/config/manifest + meta.json append-only

In [4]:
# [CELL 01-03] Run tagging + report/config/manifest + meta.json registry

t0 = cell_start("CELL 01-03", "Start run + init run files + meta.json")

# Run identity. RUN_TAG has one-second resolution and names the output folder;
# RUN_ID is a random hex id that is unique per execution of this cell.
NOTEBOOK_NAME = "01_ingest_mars"
RUN_TAG = datetime.now().strftime("%Y%m%d_%H%M%S")
RUN_ID = uuid.uuid4().hex

# Per-run output folder. exist_ok=True means a second execution within the same
# second reuses the folder and overwrites the three files initialised below.
OUT_DIR = PATHS["REPORTS"] / NOTEBOOK_NAME / RUN_TAG
OUT_DIR.mkdir(parents=True, exist_ok=True)

REPORT_PATH = OUT_DIR / "report.json"
CONFIG_PATH = OUT_DIR / "config.json"
MANIFEST_PATH = OUT_DIR / "manifest.json"

# Input folder and the two interim artifacts this notebook produces.
RAW_DIR = PATHS["DATA_RAW"] / "mars"
OUT_PARQUET = PATHS["DATA_INTERIM"] / "mars_events_raw.parquet"
OUT_DUCKDB = PATHS["DATA_INTERIM"] / "mars.duckdb"

# Run configuration, persisted verbatim to config.json.
CFG = {
    "notebook": NOTEBOOK_NAME,
    "run_id": RUN_ID,
    "run_tag": RUN_TAG,
    "seed": GLOBAL_SEED,
    "paths": {
        "raw_dir": str(RAW_DIR),
        "out_parquet": str(OUT_PARQUET),
        "out_duckdb": str(OUT_DUCKDB),
        "out_dir": str(OUT_DIR),
    },
    "ingest": {
        # NOTE(review): no visible cell reads "accepted_ext" — confirm it is still needed.
        "accepted_ext": [".json", ".jsonl", ".csv", ".parquet"],
        "parquet_compression": "zstd",
        "duckdb_view": "mars_events_raw",
    }
}

write_json_atomic(CONFIG_PATH, CFG)

# Report skeleton; later cells re-read this file, fill sections in, and write it back.
report = {
    "run_id": RUN_ID,
    "notebook": NOTEBOOK_NAME,
    "run_tag": RUN_TAG,
    "created_at": datetime.now().isoformat(timespec="seconds"),
    "repo_root": str(REPO_ROOT),
    "metrics": {},
    "key_findings": [],
    "sanity_samples": {},
    "data_fingerprints": {},
    "notes": [],
}
write_json_atomic(REPORT_PATH, report)

# Manifest skeleton; artifact records are appended later in the notebook.
manifest = {
    "run_id": RUN_ID,
    "notebook": NOTEBOOK_NAME,
    "run_tag": RUN_TAG,
    "artifacts": [],
}
write_json_atomic(MANIFEST_PATH, manifest)

# meta.json registry: created on first use, then read, extended with one
# entry for this run, and rewritten as a whole. "Append-only" is a convention
# of this code (entries are only ever added), not something the file enforces.
META_PATH = PATHS["META_REGISTRY"]
if not META_PATH.exists():
    write_json_atomic(META_PATH, {"schema_version": 1, "runs": []})

meta = read_json(META_PATH)
# Refuse to touch a registry that does not have the expected {"runs": [...]} shape.
if "runs" not in meta or not isinstance(meta["runs"], list):
    raise RuntimeError("meta.json invalid: missing 'runs' list")

meta["runs"].append({
    "run_id": RUN_ID,
    "notebook": NOTEBOOK_NAME,
    "run_tag": RUN_TAG,
    "out_dir": str(OUT_DIR),
    "created_at": datetime.now().isoformat(timespec="seconds"),
})
write_json_atomic(META_PATH, meta)

cell_end("CELL 01-03", t0,
         out_dir=str(OUT_DIR),
         report=str(REPORT_PATH),
         config=str(CONFIG_PATH),
         manifest=str(MANIFEST_PATH),
         meta=str(META_PATH))



[CELL 01-03] Start run + init run files + meta.json
[CELL 01-03] start=2026-01-06T21:26:07
[CELL 01-03] out_dir=C:\anonymous-users-mooc-session-meta\reports\01_ingest_mars\20260106_212607
[CELL 01-03] report=C:\anonymous-users-mooc-session-meta\reports\01_ingest_mars\20260106_212607\report.json
[CELL 01-03] config=C:\anonymous-users-mooc-session-meta\reports\01_ingest_mars\20260106_212607\config.json
[CELL 01-03] manifest=C:\anonymous-users-mooc-session-meta\reports\01_ingest_mars\20260106_212607\manifest.json
[CELL 01-03] meta=C:\anonymous-users-mooc-session-meta\meta.json
[CELL 01-03] elapsed=0.02s
[CELL 01-03] done


DuckDB dependency check

In [5]:
# [CELL 01-04] DuckDB dependency check

t0 = cell_start("CELL 01-04", "Import duckdb")

# Import duckdb here so a failure surfaces early with install instructions.
# Any exception raised during the import is wrapped (and chained via `from e`),
# so the original cause stays visible in the traceback.
try:
    import duckdb
except Exception as e:
    raise RuntimeError(
        "Missing duckdb. Install via:\n"
        "  conda install -c conda-forge duckdb\n"
        "or\n"
        "  pip install duckdb\n"
    ) from e

# Log the version actually loaded into the kernel.
print("[CELL 01-04] duckdb_version:", duckdb.__version__)
cell_end("CELL 01-04", t0)



[CELL 01-04] Import duckdb
[CELL 01-04] start=2026-01-06T21:26:07
[CELL 01-04] duckdb_version: 1.4.3
[CELL 01-04] elapsed=0.03s
[CELL 01-04] done


Select raw input file (explicit_ratings_en.csv) + fingerprint

In [6]:
# [CELL 01-05] Select explicit_ratings_en.csv only + fingerprint

t0 = cell_start("CELL 01-05", "Select explicit_ratings_en.csv only", raw_dir=str(RAW_DIR))

# The single raw input this notebook ingests.
target_name = "explicit_ratings_en.csv"
target_path = RAW_DIR / target_name

if not target_path.exists():
    # show what exists to avoid guessing
    existing = sorted([p.name for p in RAW_DIR.rglob("*") if p.is_file()])
    raise RuntimeError(
        f"Required file not found: {target_path}\n"
        f"Files under {RAW_DIR} (first 50): {existing[:50]}"
    )

# Fingerprint of the raw input: size in bytes plus SHA-256 of the file contents.
# Kept as a one-element "files" list; n_files is fixed at 1 to match.
raw_fp = {
    "root": str(RAW_DIR),
    "n_files": 1,
    "files": [{
        "name": target_path.name,
        "relpath": str(target_path.relative_to(RAW_DIR)),
        "bytes": int(target_path.stat().st_size),
        "sha256": sha256_file(target_path),
        "suffix": target_path.suffix.lower(),
    }]
}

print("[CELL 01-05] using:", target_path.name)
print(json.dumps(raw_fp["files"][0], indent=2))

cell_end("CELL 01-05", t0)



[CELL 01-05] Select explicit_ratings_en.csv only
[CELL 01-05] start=2026-01-06T21:26:07
[CELL 01-05] raw_dir=C:\anonymous-users-mooc-session-meta\data\raw\mars
[CELL 01-05] using: explicit_ratings_en.csv
{
  "name": "explicit_ratings_en.csv",
  "relpath": "explicit_ratings_en.csv",
  "bytes": 145724,
  "sha256": "8190b9f10afcb44b7542616648ee3c7825f42f7bed832784a57e083b53773708",
  "suffix": ".csv"
}
[CELL 01-05] elapsed=0.00s
[CELL 01-05] done


Load raw → DataFrame (single CSV via pandas.read_csv)

Load explicit_ratings_en.csv only

In [7]:
# [CELL 01-06] Load explicit_ratings_en.csv only

t0 = cell_start("CELL 01-06", f"Load {target_name}")

# Read exactly the file that CELL 01-05 validated and fingerprinted, instead of
# spelling the file name a second time: this keeps the loaded data and the
# recorded SHA-256 tied to the same path if target_name is ever changed.
events = pd.read_csv(target_path)
assert_nonempty_df(events, "events")

# Provenance column: the raw file each row came from.
events["__source_file"] = target_name

print("[CELL 01-06] shape:", events.shape)
print("[CELL 01-06] columns:", list(events.columns))

# minimal preview
print("[CELL 01-06] head(3):")
print(events.head(3).to_string(index=False))

cell_end("CELL 01-06", t0, rows=int(events.shape[0]), cols=int(events.shape[1]))



[CELL 01-06] Load explicit_ratings_en.csv
[CELL 01-06] start=2026-01-06T21:26:07
[CELL 01-06] shape: (3659, 6)
[CELL 01-06] columns: ['user_id', 'item_id', 'watch_percentage', 'created_at', 'rating', '__source_file']
[CELL 01-06] head(3):
 user_id  item_id  watch_percentage          created_at  rating           __source_file
  224557      510               100 2018-09-28 16:18:29      10 explicit_ratings_en.csv
  224557      615               100 2018-09-28 16:22:22      10 explicit_ratings_en.csv
  224557     7680               100 2018-09-28 16:23:34      10 explicit_ratings_en.csv
[CELL 01-06] rows=3659
[CELL 01-06] cols=6
[CELL 01-06] elapsed=0.01s
[CELL 01-06] done


Save canonical Parquet (raw → parquet)

In [8]:
# [CELL 01-07] Save canonical Parquet (raw → parquet)

t0 = cell_start("CELL 01-07", "Save canonical raw table as Parquet", out_parquet=str(OUT_PARQUET))

# Write the loaded table to Parquet without the pandas index, using the
# compression codec recorded in CFG.
OUT_PARQUET.parent.mkdir(parents=True, exist_ok=True)
events.to_parquet(OUT_PARQUET, index=False, compression=CFG["ingest"]["parquet_compression"])

# Size and content hash of the file just written; reused by CELL 01-09 for the report.
parq_bytes = int(OUT_PARQUET.stat().st_size)
parq_sha = sha256_file(OUT_PARQUET)

print("[CELL 01-07] saved:", OUT_PARQUET)
print("[CELL 01-07] bytes:", parq_bytes)
print("[CELL 01-07] sha256:", parq_sha)

cell_end("CELL 01-07", t0)



[CELL 01-07] Save canonical raw table as Parquet
[CELL 01-07] start=2026-01-06T21:26:07
[CELL 01-07] out_parquet=C:\anonymous-users-mooc-session-meta\data\interim\mars_events_raw.parquet
[CELL 01-07] saved: C:\anonymous-users-mooc-session-meta\data\interim\mars_events_raw.parquet
[CELL 01-07] bytes: 43047
[CELL 01-07] sha256: 3e261bffc99b67cd75a4b03b71785b5dbb854a6aea5db131f4763cb57d51fc9d
[CELL 01-07] elapsed=0.03s
[CELL 01-07] done


DuckDB: create DB + view from Parquet (parquet → duckdb)

In [11]:
# [CELL 01-08] DuckDB DB + view from Parquet

t0 = cell_start("CELL 01-08", "Create DuckDB and VIEW from Parquet", out_duckdb=str(OUT_DUCKDB))

OUT_DUCKDB.parent.mkdir(parents=True, exist_ok=True)

view = CFG["ingest"]["duckdb_view"]
# Double any single quote so the path is a valid SQL string literal.
# The view stores this path as written, so it points at OUT_PARQUET's current location.
parquet_sql_path = str(OUT_PARQUET).replace("'", "''")

con = duckdb.connect(str(OUT_DUCKDB))
try:
    # Drop-and-create keeps the cell re-runnable against an existing database file.
    con.execute(f"DROP VIEW IF EXISTS {view};")
    con.execute(f"""
CREATE VIEW {view} AS
SELECT * FROM read_parquet('{parquet_sql_path}')
""")

    n = con.execute(f"SELECT COUNT(*) FROM {view}").fetchone()[0]
    schema_df = con.execute(f"DESCRIBE {view}").fetchdf()

    print("[CELL 01-08] view:", view)
    print("[CELL 01-08] rows:", int(n))
    print("[CELL 01-08] schema_head:")
    print(schema_df.head(40).to_string(index=False))
finally:
    # Always release the read-write handle, even if a statement above failed;
    # otherwise the database file can stay locked for the later cells.
    con.close()
    print("[CELL 01-08] closed DuckDB connection")

cell_end("CELL 01-08", t0, rows=int(n))



[CELL 01-08] Create DuckDB and VIEW from Parquet
[CELL 01-08] start=2026-01-06T21:27:00
[CELL 01-08] out_duckdb=C:\anonymous-users-mooc-session-meta\data\interim\mars.duckdb
[CELL 01-08] view: mars_events_raw
[CELL 01-08] rows: 3659
[CELL 01-08] schema_head:
     column_name column_type null  key default extra
         user_id      BIGINT  YES None    None  None
         item_id      BIGINT  YES None    None  None
watch_percentage      BIGINT  YES None    None  None
      created_at     VARCHAR  YES None    None  None
          rating      BIGINT  YES None    None  None
   __source_file     VARCHAR  YES None    None  None
[CELL 01-08] closed DuckDB connection
[CELL 01-08] rows=3659
[CELL 01-08] elapsed=0.02s
[CELL 01-08] done


Update report + manifest (fingerprints + sanity sample)

In [12]:
# [CELL 01-09] Update report + manifest

t0 = cell_start("CELL 01-09", "Write fingerprints + sanity sample to report + manifest")

# Re-read both files from disk so this cell extends what earlier cells persisted.
report = read_json(REPORT_PATH)
manifest = read_json(MANIFEST_PATH)

# Sanity sample: the first three rows by position, limited to the first 20 columns.
# head(3) is positional, so it does not depend on the frame's index labels.
sample_cols = list(events.columns)[:20]
head3 = events[sample_cols].head(3).to_dict(orient="records")

report["data_fingerprints"]["mars_raw_files"] = {
    "root": raw_fp["root"],
    "n_files": raw_fp["n_files"],
    "files_first3": raw_fp["files"][:3],
}
report["data_fingerprints"]["mars_events_raw_parquet"] = {
    "path": str(OUT_PARQUET),
    "bytes": parq_bytes,
    "sha256": parq_sha,
}
report["sanity_samples"]["mars_events_raw_head3"] = head3

# Only add the note once so re-running this cell does not pile up duplicates.
storage_note = "Storage rule enforced: raw -> parquet -> duckdb (VIEW mars_events_raw)"
if storage_note not in report["notes"]:
    report["notes"].append(storage_note)

write_json_atomic(REPORT_PATH, report)

def add_artifact(path: Path) -> None:
    """Record an output file in the in-memory ``manifest`` (idempotent per path).

    Builds a record with the file's path, size in bytes, and SHA-256 digest,
    and stores it in ``manifest["artifacts"]``. Any existing record for the
    same path is replaced, so re-running the cell does not duplicate entries.

    Args:
        path: Existing file to register.

    Raises:
        OSError: If ``path`` cannot be stat'ed (e.g. it does not exist), or if
            hashing fails with anything other than ``PermissionError``.
    """
    rec = {
        "path": str(path),
        "bytes": int(path.stat().st_size),
        "sha256": None,
        "sha256_error": None,
    }
    try:
        rec["sha256"] = sha256_file(path)
    except PermissionError as e:
        # Windows file lock (e.g., duckdb still open somewhere): keep the
        # record but note why the digest is missing.
        rec["sha256_error"] = f"PermissionError: {e}"
        print(f"[CELL 01-09] WARN: could not hash (locked): {path}")
    # Drop any stale record for this path before appending the fresh one.
    manifest["artifacts"] = [
        existing
        for existing in manifest["artifacts"]
        if not (isinstance(existing, dict) and existing.get("path") == rec["path"])
    ]
    manifest["artifacts"].append(rec)

add_artifact(OUT_PARQUET)
add_artifact(OUT_DUCKDB)

write_json_atomic(MANIFEST_PATH, manifest)

print("[CELL 01-09] updated_report:", REPORT_PATH)
print("[CELL 01-09] updated_manifest:", MANIFEST_PATH)

cell_end("CELL 01-09", t0)



[CELL 01-09] Write fingerprints + sanity sample to report + manifest
[CELL 01-09] start=2026-01-06T21:27:22
[CELL 01-09] updated_report: C:\anonymous-users-mooc-session-meta\reports\01_ingest_mars\20260106_212607\report.json
[CELL 01-09] updated_manifest: C:\anonymous-users-mooc-session-meta\reports\01_ingest_mars\20260106_212607\manifest.json
[CELL 01-09] elapsed=0.03s
[CELL 01-09] done


Next-step checklist (Notebook 02)

In [13]:
# [CELL 01-10] Next step checklist

t0 = cell_start("CELL 01-10", "Next step checklist")

print("Next: 02_sessionize_mars.ipynb")
print("Input source for all queries: DuckDB view mars_events_raw in data/interim/mars.duckdb")
print("Notebook 02 will:")
print("  - Identify actual schema fields (user/item/timestamp/event).")
print("  - Normalize timestamps and enforce ordering.")
print("  - Sessionize with justified gap threshold (later).")
print("No sessionization is performed in Notebook 01.")

cell_end("CELL 01-10", t0)



[CELL 01-10] Next step checklist
[CELL 01-10] start=2026-01-06T21:28:20
Next: 02_sessionize_mars.ipynb
Input source for all queries: DuckDB view mars_events_raw in data/interim/mars.duckdb
Notebook 02 will:
  - Identify actual schema fields (user/item/timestamp/event).
  - Normalize timestamps and enforce ordering.
  - Sessionize with justified gap threshold (later).
No sessionization is performed in Notebook 01.
[CELL 01-10] elapsed=0.00s
[CELL 01-10] done


EDA setup + safe DuckDB open/close

In [14]:
# [CELL 01-11] EDA setup (DuckDB open/close; Windows-safe)

t0 = cell_start("CELL 01-11", "EDA setup (DuckDB connection)")

# duckdb was already imported in CELL 01-04; repeating the import is harmless.
import duckdb

DB_PATH = OUT_DUCKDB
VIEW = CFG["ingest"]["duckdb_view"]  # mars_events_raw

# Read-only connection shared by the EDA cells that follow (they all use `con`);
# it is closed in CELL 01-19.
con = duckdb.connect(str(DB_PATH), read_only=True)

print("[CELL 01-11] db:", DB_PATH)
print("[CELL 01-11] view:", VIEW)

cell_end("CELL 01-11", t0)



[CELL 01-11] EDA setup (DuckDB connection)
[CELL 01-11] start=2026-01-06T21:31:23
[CELL 01-11] db: C:\anonymous-users-mooc-session-meta\data\interim\mars.duckdb
[CELL 01-11] view: mars_events_raw
[CELL 01-11] elapsed=0.02s
[CELL 01-11] done


Schema + top-level stats (rows/cols, column names)

In [15]:
# [CELL 01-12] Schema + basic stats

t0 = cell_start("CELL 01-12", "Schema + basic stats")

# Column names/types of the view as DuckDB reports them, plus the total row count.
schema_df = con.execute(f"DESCRIBE {VIEW}").fetchdf()
n_rows = con.execute(f"SELECT COUNT(*) AS n FROM {VIEW}").fetchone()[0]

print("[CELL 01-12] rows:", int(n_rows))
print("[CELL 01-12] schema:")
print(schema_df.to_string(index=False))

# `cols` is read as a global by guess_col() in CELL 01-13.
cols = schema_df["column_name"].tolist()
print("[CELL 01-12] columns:", cols)

cell_end("CELL 01-12", t0, rows=int(n_rows), n_cols=len(cols))



[CELL 01-12] Schema + basic stats
[CELL 01-12] start=2026-01-06T21:31:43
[CELL 01-12] rows: 3659
[CELL 01-12] schema:
     column_name column_type null  key default extra
         user_id      BIGINT  YES None    None  None
         item_id      BIGINT  YES None    None  None
watch_percentage      BIGINT  YES None    None  None
      created_at     VARCHAR  YES None    None  None
          rating      BIGINT  YES None    None  None
   __source_file     VARCHAR  YES None    None  None
[CELL 01-12] columns: ['user_id', 'item_id', 'watch_percentage', 'created_at', 'rating', '__source_file']
[CELL 01-12] rows=3659
[CELL 01-12] n_cols=6
[CELL 01-12] elapsed=0.01s
[CELL 01-12] done


Auto-guess key columns (user/item/rating/timestamp)

In [16]:
# [CELL 01-13] Auto-guess key columns (no assumptions)

t0 = cell_start("CELL 01-13", "Guess key columns (user/item/rating/timestamp)")

# Lower-cased copy of the column names from CELL 01-12.
# NOTE(review): no visible cell reads cols_lower (guess_col lower-cases names itself) — confirm before removing.
cols_lower = [c.lower() for c in cols]

def guess_col(candidates, columns=None):
    """Return the first column whose name contains one of ``candidates``.

    Matching is a case-insensitive substring test. Patterns are tried in the
    order given (earlier patterns have priority); for each pattern the columns
    are scanned in order and the first hit is returned with its original casing.

    Args:
        candidates: Iterable of substrings to look for, highest priority first.
        columns: Column names to search. Defaults to the notebook-level
            ``cols`` list, which keeps existing one-argument calls working.

    Returns:
        The matching column name, or ``None`` if nothing matches.
    """
    pool = cols if columns is None else columns
    for pat in candidates:
        needle = pat.lower()
        for c in pool:
            if needle in c.lower():
                return c
    return None

# One guess per role. Within each list, earlier patterns take priority, and
# matching is by substring (see guess_col), so e.g. "created" selects "created_at".
guess = {
    "user": guess_col(["user", "learner", "student", "uid"]),
    "item": guess_col(["item", "course", "resource", "content", "cid", "iid"]),
    "rating": guess_col(["rating", "rate", "score", "stars"]),
    "ts": guess_col(["timestamp", "time", "date", "created", "ts"]),
}

print("[CELL 01-13] guessed:", guess)

# Hard requirement for session work later: user + item + timestamp (or orderable time)
missing = [k for k in ["user", "item", "ts"] if guess[k] is None]
if missing:
    raise RuntimeError(
        f"Missing required column guesses: {missing}. "
        f"Cannot proceed to sessionization without them. Columns={cols}"
    )

# Column-name globals interpolated into the SQL of the EDA cells below.
USER_COL = guess["user"]
ITEM_COL = guess["item"]
TS_COL   = guess["ts"]
RATING_COL = guess["rating"]  # may be None

print("[CELL 01-13] USER_COL:", USER_COL)
print("[CELL 01-13] ITEM_COL:", ITEM_COL)
print("[CELL 01-13] TS_COL:", TS_COL)
print("[CELL 01-13] RATING_COL:", RATING_COL)

cell_end("CELL 01-13", t0)



[CELL 01-13] Guess key columns (user/item/rating/timestamp)
[CELL 01-13] start=2026-01-06T21:32:07
[CELL 01-13] guessed: {'user': 'user_id', 'item': 'item_id', 'rating': 'rating', 'ts': 'created_at'}
[CELL 01-13] USER_COL: user_id
[CELL 01-13] ITEM_COL: item_id
[CELL 01-13] TS_COL: created_at
[CELL 01-13] RATING_COL: rating
[CELL 01-13] elapsed=0.00s
[CELL 01-13] done


Core dataset counts (users/items/interactions, duplicates)

In [17]:
# [CELL 01-14] Core counts + duplicates

t0 = cell_start("CELL 01-14", "Counts (users/items/interactions) + duplicates")

# Total rows plus distinct users and items, computed in one pass over the view.
q = f"""
SELECT
  COUNT(*) AS n_interactions,
  COUNT(DISTINCT {USER_COL}) AS n_users,
  COUNT(DISTINCT {ITEM_COL}) AS n_items
FROM {VIEW}
"""
base = con.execute(q).fetchdf().iloc[0].to_dict()

# Counts (user, item, timestamp) combinations that occur more than once.
# This is the number of duplicated groups, not the number of surplus rows.
dup_q = f"""
SELECT COUNT(*) AS n_dups
FROM (
  SELECT {USER_COL}, {ITEM_COL}, {TS_COL}, COUNT(*) AS c
  FROM {VIEW}
  GROUP BY 1,2,3
  HAVING COUNT(*) > 1
)
"""
n_dups = int(con.execute(dup_q).fetchone()[0])

print("[CELL 01-14] base:", base)
print("[CELL 01-14] duplicate (user,item,ts) groups:", n_dups)

# Values are cast to plain int before logging.
cell_end("CELL 01-14", t0, **{k:int(v) for k,v in base.items()}, n_dup_groups=n_dups)



[CELL 01-14] Counts (users/items/interactions) + duplicates
[CELL 01-14] start=2026-01-06T21:32:51
[CELL 01-14] base: {'n_interactions': 3659, 'n_users': 822, 'n_items': 776}
[CELL 01-14] duplicate (user,item,ts) groups: 4
[CELL 01-14] n_interactions=3659
[CELL 01-14] n_users=822
[CELL 01-14] n_items=776
[CELL 01-14] n_dup_groups=4
[CELL 01-14] elapsed=0.01s
[CELL 01-14] done


Timestamp health (nulls + min/max + parse check)

In [18]:
# [CELL 01-15] Timestamp health

t0 = cell_start("CELL 01-15", "Timestamp health (null/min/max)")

# Null count
null_ts = int(con.execute(f"SELECT COUNT(*) FROM {VIEW} WHERE {TS_COL} IS NULL").fetchone()[0])

# Min/max (as-is): computed on the stored column type without parsing. For a
# text column this is string ordering, which matches chronological order only
# when every value uses the same zero-padded "YYYY-MM-DD HH:MM:SS" layout.
minmax = con.execute(f"SELECT MIN({TS_COL}) AS min_ts, MAX({TS_COL}) AS max_ts FROM {VIEW}").fetchdf().iloc[0].to_dict()

# Parse check: count non-null values DuckDB cannot interpret as a TIMESTAMP.
# The value is cast to VARCHAR first so the check works whatever the stored
# column type is; TRY_CAST yields NULL instead of raising on a bad value.
# The count is reported only; this cell does not fail on unparseable values.
unparseable_ts = int(con.execute(f"""
SELECT COUNT(*)
FROM {VIEW}
WHERE {TS_COL} IS NOT NULL
  AND TRY_CAST(CAST({TS_COL} AS VARCHAR) AS TIMESTAMP) IS NULL
""").fetchone()[0])

print("[CELL 01-15] null_ts:", null_ts)
print("[CELL 01-15] minmax:", minmax)
print("[CELL 01-15] unparseable_ts:", unparseable_ts)

cell_end("CELL 01-15", t0, null_ts=null_ts, unparseable_ts=unparseable_ts)



[CELL 01-15] Timestamp health (null/min/max)
[CELL 01-15] start=2026-01-06T21:33:26
[CELL 01-15] null_ts: 0
[CELL 01-15] minmax: {'min_ts': '2018-09-28 14:38:15', 'max_ts': '2021-09-20 16:26:06'}
[CELL 01-15] null_ts=0
[CELL 01-15] elapsed=0.00s
[CELL 01-15] done


Interactions per user distribution (quantiles)

In [19]:
# [CELL 01-16] Interactions per user distribution (quantiles)

t0 = cell_start("CELL 01-16", "Per-user interaction count distribution")

# Interactions per user, summarised as min / selected quantiles / max.
# Quantiles come from DuckDB's approx_quantile, i.e. they are approximations.
per_user_q = f"""
WITH c AS (
  SELECT {USER_COL} AS u, COUNT(*) AS cnt
  FROM {VIEW}
  GROUP BY 1
)
SELECT
  COUNT(*) AS n_users,
  MIN(cnt) AS min_cnt,
  approx_quantile(cnt, 0.50) AS p50,
  approx_quantile(cnt, 0.90) AS p90,
  approx_quantile(cnt, 0.95) AS p95,
  approx_quantile(cnt, 0.99) AS p99,
  MAX(cnt) AS max_cnt
FROM c
"""
user_dist = con.execute(per_user_q).fetchdf().iloc[0].to_dict()
print("[CELL 01-16] user_dist:", user_dist)

# Ten most active users. Ties on cnt have no secondary sort key, so their
# relative order is whatever DuckDB returns.
top_users = con.execute(f"""
SELECT {USER_COL} AS user_id, COUNT(*) AS cnt
FROM {VIEW}
GROUP BY 1
ORDER BY cnt DESC
LIMIT 10
""").fetchdf()
print("[CELL 01-16] top_users:")
print(top_users.to_string(index=False))

cell_end("CELL 01-16", t0)



[CELL 01-16] Per-user interaction count distribution
[CELL 01-16] start=2026-01-06T21:33:44
[CELL 01-16] user_dist: {'n_users': 822, 'min_cnt': 1, 'p50': 2, 'p90': 8, 'p95': 16, 'p99': 59, 'max_cnt': 134}
[CELL 01-16] top_users:
 user_id  cnt
  604039  134
  537825  127
  277945  101
  295494   67
  605299   64
  284287   64
  492163   63
  279350   62
  259390   62
  483016   51
[CELL 01-16] elapsed=0.01s
[CELL 01-16] done


Interactions per item distribution (quantiles)

In [20]:
# [CELL 01-17] Interactions per item distribution (quantiles)

t0 = cell_start("CELL 01-17", "Per-item interaction count distribution")

# Interactions per item, summarised as min / selected quantiles / max.
# Same query shape as CELL 01-16 with the item column as the grouping key;
# quantiles come from DuckDB's approx_quantile, i.e. they are approximations.
per_item_q = f"""
WITH c AS (
  SELECT {ITEM_COL} AS i, COUNT(*) AS cnt
  FROM {VIEW}
  GROUP BY 1
)
SELECT
  COUNT(*) AS n_items,
  MIN(cnt) AS min_cnt,
  approx_quantile(cnt, 0.50) AS p50,
  approx_quantile(cnt, 0.90) AS p90,
  approx_quantile(cnt, 0.95) AS p95,
  approx_quantile(cnt, 0.99) AS p99,
  MAX(cnt) AS max_cnt
FROM c
"""
item_dist = con.execute(per_item_q).fetchdf().iloc[0].to_dict()
print("[CELL 01-17] item_dist:", item_dist)

# Ten most frequent items. Ties on cnt have no secondary sort key, so their
# relative order is whatever DuckDB returns.
top_items = con.execute(f"""
SELECT {ITEM_COL} AS item_id, COUNT(*) AS cnt
FROM {VIEW}
GROUP BY 1
ORDER BY cnt DESC
LIMIT 10
""").fetchdf()
print("[CELL 01-17] top_items:")
print(top_items.to_string(index=False))

cell_end("CELL 01-17", t0)



[CELL 01-17] Per-item interaction count distribution
[CELL 01-17] start=2026-01-06T21:34:08
[CELL 01-17] item_dist: {'n_items': 776, 'min_cnt': 1, 'p50': 2, 'p90': 8, 'p95': 14, 'p99': 35, 'max_cnt': 179}
[CELL 01-17] top_items:
 item_id  cnt
     510  179
   43457  109
     512   78
    7626   76
     733   48
     545   44
   16366   42
   43458   35
   32022   33
  183716   31
[CELL 01-17] elapsed=0.01s
[CELL 01-17] done


Rating distribution (only if rating exists)

In [21]:
# [CELL 01-18] Rating distribution (optional)

t0 = cell_start("CELL 01-18", "Rating distribution (if available)")

# rating_summary stays None when CELL 01-13 found no rating column.
rating_summary = None
if RATING_COL is not None:
    # Row count, null count, min, approximate median / 90th percentile, and max
    # of the rating column.
    rating_summary = con.execute(f"""
    SELECT
      COUNT(*) AS n,
      COUNT(*) FILTER (WHERE {RATING_COL} IS NULL) AS n_null,
      MIN({RATING_COL}) AS min,
      approx_quantile({RATING_COL}, 0.50) AS p50,
      approx_quantile({RATING_COL}, 0.90) AS p90,
      MAX({RATING_COL}) AS max
    FROM {VIEW}
    """).fetchdf().iloc[0].to_dict()
    print("[CELL 01-18] rating_summary:", rating_summary)
else:
    print("[CELL 01-18] No rating column detected; skipping.")

cell_end("CELL 01-18", t0)



[CELL 01-18] Rating distribution (if available)
[CELL 01-18] start=2026-01-06T21:34:26
[CELL 01-18] rating_summary: {'n': 3659, 'n_null': 0, 'min': 1, 'p50': 10, 'p90': 10, 'max': 10}
[CELL 01-18] elapsed=0.00s
[CELL 01-18] done


Write EDA summary into report.json + close DB

In [22]:
# [CELL 01-19] Persist EDA summary into report.json + close DuckDB (avoid Windows lock)

t0 = cell_start("CELL 01-19", "Write EDA summary to report + close DuckDB")

try:
    # Re-read the report from disk so this cell extends what earlier cells persisted.
    report = read_json(REPORT_PATH)

    # Everything the EDA cells (01-13 .. 01-18) computed, gathered into one record.
    eda = {
        "guessed_columns": {
            "user": USER_COL,
            "item": ITEM_COL,
            "timestamp": TS_COL,
            "rating": RATING_COL,
        },
        "base_counts": {k:int(v) for k,v in base.items()},
        "n_dup_groups_user_item_ts": int(n_dups),
        "timestamp_minmax": minmax,
        "timestamp_null_count": int(null_ts),
        "user_interaction_dist": user_dist,
        "item_interaction_dist": item_dist,
        "top_users_head10": top_users.to_dict(orient="records"),
        "top_items_head10": top_items.to_dict(orient="records"),
    }

    if RATING_COL is not None:
        eda["rating_summary"] = rating_summary

    # Only add the finding once so re-running this cell does not pile up duplicates.
    finding = "Added EDA (DuckDB-based) before sessionization to characterize MARS explicit_ratings_en.csv."
    if finding not in report["key_findings"]:
        report["key_findings"].append(finding)
    report["sanity_samples"]["mars_eda_summary"] = eda

    write_json_atomic(REPORT_PATH, report)
    print("[CELL 01-19] updated_report:", REPORT_PATH)
finally:
    # Release the database handle even if building or writing the report failed,
    # so the file is not left locked (the Windows lock this cell exists to avoid).
    con.close()
    print("[CELL 01-19] closed DuckDB connection")

cell_end("CELL 01-19", t0)



[CELL 01-19] Write EDA summary to report + close DuckDB
[CELL 01-19] start=2026-01-06T21:34:40
[CELL 01-19] updated_report: C:\anonymous-users-mooc-session-meta\reports\01_ingest_mars\20260106_212607\report.json
[CELL 01-19] closed DuckDB connection
[CELL 01-19] elapsed=0.02s
[CELL 01-19] done


Re-open DuckDB (read-only) for plotting

In [23]:
# [CELL 01-20] Re-open DuckDB for plotting (read-only, Windows-safe)

t0 = cell_start("CELL 01-20", "Open DuckDB read-only for plots")

# duckdb was already imported in CELL 01-04; matplotlib is first imported here.
import duckdb
import matplotlib.pyplot as plt

# Fresh read-only connection for the plotting cells (the previous one was
# closed in CELL 01-19).
con = duckdb.connect(str(OUT_DUCKDB), read_only=True)
VIEW = CFG["ingest"]["duckdb_view"]

# Plots are stored inside this run's report folder.
PLOTS_DIR = OUT_DIR / "plots"
PLOTS_DIR.mkdir(parents=True, exist_ok=True)

print("[CELL 01-20] db:", OUT_DUCKDB)
print("[CELL 01-20] view:", VIEW)
print("[CELL 01-20] plots_dir:", PLOTS_DIR)

cell_end("CELL 01-20", t0)



[CELL 01-20] Open DuckDB read-only for plots
[CELL 01-20] start=2026-01-06T21:36:02
[CELL 01-20] db: C:\anonymous-users-mooc-session-meta\data\interim\mars.duckdb
[CELL 01-20] view: mars_events_raw
[CELL 01-20] plots_dir: C:\anonymous-users-mooc-session-meta\reports\01_ingest_mars\20260106_212607\plots
[CELL 01-20] elapsed=0.86s
[CELL 01-20] done


Plot: interactions per user (log10 buckets)

In [24]:
# [CELL 01-21] Plot interactions-per-user distribution (log10 buckets)
#
# Counts interactions per user in DuckDB, groups users by floor(log10(count)),
# and saves a bar chart of users per bucket (log-scaled y axis) under PLOTS_DIR.

t0 = cell_start("CELL 01-21", "Plot: interactions per user (log10 buckets)")

q = f"""
WITH per_user AS (
  SELECT {USER_COL} AS u, COUNT(*) AS cnt
  FROM {VIEW}
  GROUP BY 1
),
buckets AS (
  SELECT
    CASE
      WHEN cnt <= 0 THEN -1
      WHEN cnt = 1 THEN 0
      ELSE CAST(floor(log10(cnt)) AS INTEGER)
    END AS log10_bucket,
    COUNT(*) AS n_users
  FROM per_user
  GROUP BY 1
)
SELECT log10_bucket, n_users
FROM buckets
ORDER BY log10_bucket;
"""
df = con.execute(q).fetchdf()

print("[CELL 01-21] bucket_rows:", df.shape[0])
print(df.head(10).to_string(index=False))

# Bucket k holds every count in [10^k, 10^(k+1) - 1]. Bucket 0 is therefore
# 1..9 interactions (not exactly 1), so each tick spells out the full range.
# The -1 bucket (cnt <= 0) keeps its "<=0?" marker.
x = df["log10_bucket"].astype(int).tolist()
y = df["n_users"].astype(int).tolist()
xticklabels = [
    f"{10**k}-{10**(k + 1) - 1}" if k >= 0 else "<=0?"
    for k in x
]

fig, ax = plt.subplots()
ax.bar(range(len(x)), y)
ax.set_xticks(range(len(x)))
ax.set_xticklabels(xticklabels, rotation=45, ha="right")
ax.set_yscale("log")  # default log scaling; helps sparse tails
ax.set_xlabel("Interactions per user (log10 bucket)")
ax.set_ylabel("Number of users (log scale)")
ax.set_title("MARS: User activity distribution (log-bucketed)")
fig.tight_layout()

out_png = PLOTS_DIR / "mars_user_interactions_logbucket.png"
fig.savefig(out_png, dpi=200)
plt.close(fig)

print("[CELL 01-21] saved:", out_png)

cell_end("CELL 01-21", t0, out_plot=str(out_png))



[CELL 01-21] Plot: interactions per user (log10 buckets)
[CELL 01-21] start=2026-01-06T21:36:18
[CELL 01-21] bucket_rows: 3
 log10_bucket  n_users
            0      749
            1       70
            2        3
[CELL 01-21] saved: C:\anonymous-users-mooc-session-meta\reports\01_ingest_mars\20260106_212607\plots\mars_user_interactions_logbucket.png
[CELL 01-21] out_plot=C:\anonymous-users-mooc-session-meta\reports\01_ingest_mars\20260106_212607\plots\mars_user_interactions_logbucket.png
[CELL 01-21] elapsed=0.50s
[CELL 01-21] done


Plot: interactions per item (log10 buckets)

In [25]:
# [CELL 01-22] Plot interactions-per-item distribution (log10 buckets)
#
# Counts interactions per item in DuckDB, groups items by floor(log10(count)),
# and saves a bar chart of items per bucket (log-scaled y axis) under PLOTS_DIR.

t0 = cell_start("CELL 01-22", "Plot: interactions per item (log10 buckets)")

q = f"""
WITH per_item AS (
  SELECT {ITEM_COL} AS i, COUNT(*) AS cnt
  FROM {VIEW}
  GROUP BY 1
),
buckets AS (
  SELECT
    CASE
      WHEN cnt <= 0 THEN -1
      WHEN cnt = 1 THEN 0
      ELSE CAST(floor(log10(cnt)) AS INTEGER)
    END AS log10_bucket,
    COUNT(*) AS n_items
  FROM per_item
  GROUP BY 1
)
SELECT log10_bucket, n_items
FROM buckets
ORDER BY log10_bucket;
"""
df = con.execute(q).fetchdf()

print("[CELL 01-22] bucket_rows:", df.shape[0])
print(df.head(10).to_string(index=False))

# Bucket k holds every count in [10^k, 10^(k+1) - 1]. Bucket 0 is therefore
# 1..9 interactions (not exactly 1), so each tick spells out the full range.
# The -1 bucket (cnt <= 0) keeps its "<=0?" marker.
x = df["log10_bucket"].astype(int).tolist()
y = df["n_items"].astype(int).tolist()
xticklabels = [
    f"{10**k}-{10**(k + 1) - 1}" if k >= 0 else "<=0?"
    for k in x
]

fig, ax = plt.subplots()
ax.bar(range(len(x)), y)
ax.set_xticks(range(len(x)))
ax.set_xticklabels(xticklabels, rotation=45, ha="right")
ax.set_yscale("log")
ax.set_xlabel("Interactions per item (log10 bucket)")
ax.set_ylabel("Number of items (log scale)")
ax.set_title("MARS: Item popularity distribution (log-bucketed)")
fig.tight_layout()

out_png = PLOTS_DIR / "mars_item_interactions_logbucket.png"
fig.savefig(out_png, dpi=200)
plt.close(fig)

print("[CELL 01-22] saved:", out_png)

cell_end("CELL 01-22", t0, out_plot=str(out_png))



[CELL 01-22] Plot: interactions per item (log10 buckets)
[CELL 01-22] start=2026-01-06T21:36:38
[CELL 01-22] bucket_rows: 3
 log10_bucket  n_items
            0      713
            1       61
            2        2
[CELL 01-22] saved: C:\anonymous-users-mooc-session-meta\reports\01_ingest_mars\20260106_212607\plots\mars_item_interactions_logbucket.png
[CELL 01-22] out_plot=C:\anonymous-users-mooc-session-meta\reports\01_ingest_mars\20260106_212607\plots\mars_item_interactions_logbucket.png
[CELL 01-22] elapsed=0.24s
[CELL 01-22] done


Plot: rating histogram (only if rating exists)

In [26]:
# [CELL 01-23] Plot rating histogram (if rating column exists)
#
# When RATING_COL is set, counts rows per non-null rating value in DuckDB and
# saves a bar chart under PLOTS_DIR; otherwise only logs that it was skipped.

t0 = cell_start("CELL 01-23", "Plot: rating histogram (optional)")

out_png = None

if RATING_COL is not None:
    # Use DuckDB to get rating counts (works for numeric or discrete values)
    rating_sql = f"""
    SELECT {RATING_COL} AS rating, COUNT(*) AS n
    FROM {VIEW}
    WHERE {RATING_COL} IS NOT NULL
    GROUP BY 1
    ORDER BY 1
    """
    df = con.execute(rating_sql).fetchdf()

    print("[CELL 01-23] rating_rows:", df.shape[0])
    print(df.head(20).to_string(index=False))

    # Ratings are drawn as string categories, in the order the query returned them.
    rating_labels = df["rating"].astype(str)
    rating_counts = df["n"].astype(int)

    fig = plt.figure()
    plt.bar(rating_labels, rating_counts)
    plt.xticks(rotation=45, ha="right")
    plt.xlabel("Rating value")
    plt.ylabel("Count")
    plt.title("MARS: Rating distribution")
    fig.tight_layout()

    out_png = PLOTS_DIR / "mars_rating_distribution.png"
    fig.savefig(out_png, dpi=200)
    plt.close(fig)

    print("[CELL 01-23] saved:", out_png)
else:
    print("[CELL 01-23] No rating column detected; skipping.")

# out_plot stays None when no figure was written.
out_plot = None if out_png is None else str(out_png)
cell_end("CELL 01-23", t0, out_plot=out_plot)



[CELL 01-23] Plot: rating histogram (optional)
[CELL 01-23] start=2026-01-06T21:37:06
[CELL 01-23] rating_rows: 10
 rating    n
      1  200
      2  135
      3   90
      4  105
      5   61
      6   74
      7   72
      8   79
      9   96
     10 2747
[CELL 01-23] saved: C:\anonymous-users-mooc-session-meta\reports\01_ingest_mars\20260106_212607\plots\mars_rating_distribution.png
[CELL 01-23] out_plot=C:\anonymous-users-mooc-session-meta\reports\01_ingest_mars\20260106_212607\plots\mars_rating_distribution.png
[CELL 01-23] elapsed=0.14s
[CELL 01-23] done


Plot: interactions over time (only if timestamp parses)

In [27]:
# [CELL 01-24] Plot interactions over time (if timestamp parse succeeds)
#
# First measures how many rows of TS_COL can be TRY_CAST to TIMESTAMP. If the
# view is non-empty and at least MIN_TS_PARSE_OK_RATIO of rows parse, saves a
# daily interaction-count line chart under PLOTS_DIR; otherwise skips the plot.

t0 = cell_start("CELL 01-24", "Plot: interactions over time (timestamp parse check)")

# Minimum share of rows that must parse for the time plot to be drawn.
MIN_TS_PARSE_OK_RATIO = 0.90

# Try parse TS_COL into TIMESTAMP in DuckDB
# If TS_COL is already timestamp/date, TRY_CAST will succeed.
# n_failed is COUNT(*) - COUNT(ts_parsed) rather than SUM(CASE ...): SUM over
# zero rows is NULL, which would break the int() conversion below on an empty
# view, whereas the COUNT difference is always an integer. Rows whose raw
# value is NULL still count as failed, as before.
parse_check = con.execute(f"""
WITH x AS (
  SELECT
    {TS_COL} AS raw_ts,
    TRY_CAST({TS_COL} AS TIMESTAMP) AS ts_parsed
  FROM {VIEW}
)
SELECT
  COUNT(*) AS n,
  COUNT(*) - COUNT(ts_parsed) AS n_failed
FROM x
""").fetchdf().iloc[0].to_dict()

n = int(parse_check["n"])
n_failed = int(parse_check["n_failed"])
# max(1, n) only guards the division; the n == 0 case is handled explicitly below.
ok_ratio = 1.0 - (n_failed / max(1, n))

print("[CELL 01-24] parse_check:", {"n": n, "n_failed": n_failed, "ok_ratio": ok_ratio})

out_png = None
if n == 0:
    # With no rows ok_ratio is trivially 1.0; there is nothing to plot.
    print("[CELL 01-24] View has no rows; skipping time plot.")
elif ok_ratio < MIN_TS_PARSE_OK_RATIO:
    print("[CELL 01-24] Timestamp parse success < 90%; skipping time plot (we’ll normalize timestamps in Notebook 02).")
else:
    # Daily counts over the rows whose timestamp parsed.
    df = con.execute(f"""
    WITH x AS (
      SELECT TRY_CAST({TS_COL} AS TIMESTAMP) AS ts
      FROM {VIEW}
      WHERE {TS_COL} IS NOT NULL
    )
    SELECT
      date_trunc('day', ts) AS day,
      COUNT(*) AS n
    FROM x
    WHERE ts IS NOT NULL
    GROUP BY 1
    ORDER BY 1
    """).fetchdf()

    print("[CELL 01-24] daily_rows:", df.shape[0])
    print(df.head(10).to_string(index=False))

    plt.figure()
    plt.plot(df["day"], df["n"])
    plt.xticks(rotation=45, ha="right")
    plt.xlabel("Day")
    plt.ylabel("Interactions")
    plt.title("MARS: Interactions over time (daily)")
    plt.tight_layout()

    out_png = PLOTS_DIR / "mars_interactions_over_time_daily.png"
    plt.savefig(out_png, dpi=200)
    plt.close()

    print("[CELL 01-24] saved:", out_png)

cell_end("CELL 01-24", t0, out_plot=str(out_png) if out_png else None)



[CELL 01-24] Plot: interactions over time (timestamp parse check)
[CELL 01-24] start=2026-01-06T21:37:34
[CELL 01-24] parse_check: {'n': 3659, 'n_failed': 0, 'ok_ratio': 1.0}
[CELL 01-24] daily_rows: 595
       day  n
2018-09-28 44
2018-09-29  2
2018-10-01  1
2018-10-02  2
2018-10-04 31
2018-10-05  3
2018-10-06  1
2018-10-07 10
2018-10-08  5
2018-10-09  6
[CELL 01-24] saved: C:\anonymous-users-mooc-session-meta\reports\01_ingest_mars\20260106_212607\plots\mars_interactions_over_time_daily.png
[CELL 01-24] out_plot=C:\anonymous-users-mooc-session-meta\reports\01_ingest_mars\20260106_212607\plots\mars_interactions_over_time_daily.png
[CELL 01-24] elapsed=0.20s
[CELL 01-24] done


Register plot files in manifest + report, then close DB

In [28]:
# [CELL 01-25] Add plot artifacts to manifest/report + close DuckDB
#
# Records every PNG found in PLOTS_DIR in the manifest (path, size, sha256),
# notes the plots in the report, rewrites both JSON files, and closes the
# DuckDB connection.

t0 = cell_start("CELL 01-25", "Update manifest/report with plot artifacts + close DuckDB")

report = read_json(REPORT_PATH)
manifest = read_json(MANIFEST_PATH)

plot_files = sorted(p for p in PLOTS_DIR.glob("*.png") if p.is_file())
print("[CELL 01-25] plot_files:", [p.name for p in plot_files])

# Upsert by path. The manifest is re-read from disk on each run, so a plain
# append would register the same PNG again every time this cell is re-executed.
# An existing entry is refreshed in place so its size/hash match the current file.
artifacts = manifest["artifacts"]
slot_by_path = {
    a["path"]: i
    for i, a in enumerate(artifacts)
    if isinstance(a, dict) and "path" in a
}
for p in plot_files:
    entry = {
        "path": str(p),
        "bytes": int(p.stat().st_size),
        "sha256": sha256_file(p),
    }
    slot = slot_by_path.get(entry["path"])
    if slot is None:
        slot_by_path[entry["path"]] = len(artifacts)
        artifacts.append(entry)
    else:
        artifacts[slot].update(entry)

# Same idea for the note: add it only if this exact line is not already there.
plots_note = f"Saved {len(plot_files)} EDA plots under {str(PLOTS_DIR)}"
if plots_note not in report["notes"]:
    report["notes"].append(plots_note)
report["sanity_samples"]["mars_eda_plots"] = [p.name for p in plot_files]

write_json_atomic(MANIFEST_PATH, manifest)
write_json_atomic(REPORT_PATH, report)

con.close()
print("[CELL 01-25] closed DuckDB connection")

cell_end("CELL 01-25", t0, n_plots=len(plot_files))



[CELL 01-25] Update manifest/report with plot artifacts + close DuckDB
[CELL 01-25] start=2026-01-06T21:37:56
[CELL 01-25] plot_files: ['mars_interactions_over_time_daily.png', 'mars_item_interactions_logbucket.png', 'mars_rating_distribution.png', 'mars_user_interactions_logbucket.png']
[CELL 01-25] closed DuckDB connection
[CELL 01-25] n_plots=4
[CELL 01-25] elapsed=0.02s
[CELL 01-25] done
