Bootstrap: repo root + fixed paths + basic logger

In [1]:
# [CELL 01-00] Bootstrap: repo root + fixed paths + basic logger

import os
import sys
import json
import time
import uuid
import hashlib
from pathlib import Path
from datetime import datetime
from typing import Any, Dict, List

import numpy as np
import pandas as pd

t0 = datetime.now()
print(f"[CELL 01-00] start={t0.isoformat(timespec='seconds')}")
print("[CELL 01-00] CWD:", Path.cwd().resolve())

def find_repo_root(start: Path) -> Path:
    start = start.resolve()
    for p in [start, *start.parents]:
        if (p / "PROJECT_STATE.md").exists():
            return p
    raise RuntimeError("Could not find PROJECT_STATE.md. Open notebook from within the repo.")

REPO_ROOT = find_repo_root(Path.cwd())
print("[CELL 01-00] REPO_ROOT:", REPO_ROOT)

PATHS = {
    "PROJECT_STATE": REPO_ROOT / "PROJECT_STATE.md",
    "META_REGISTRY": REPO_ROOT / "meta.json",
    "DATA_RAW": REPO_ROOT / "data" / "raw",
    "DATA_INTERIM": REPO_ROOT / "data" / "interim",
    "DATA_PROCESSED": REPO_ROOT / "data" / "processed",
    "REPORTS": REPO_ROOT / "reports",
}
for k, v in PATHS.items():
    print(f"[CELL 01-00] {k}={v}")

def cell_start(cell_id: str, title: str, **kwargs: Any) -> float:
    t = time.time()
    print(f"\n[{cell_id}] {title}")
    print(f"[{cell_id}] start={datetime.now().isoformat(timespec='seconds')}")
    for k, v in kwargs.items():
        print(f"[{cell_id}] {k}={v}")
    return t

def cell_end(cell_id: str, t0: float, **kwargs: Any) -> None:
    for k, v in kwargs.items():
        print(f"[{cell_id}] {k}={v}")
    print(f"[{cell_id}] elapsed={time.time()-t0:.2f}s")
    print(f"[{cell_id}] done")

print("[CELL 01-00] done")


[CELL 01-00] start=2026-01-06T21:23:21
[CELL 01-00] CWD: C:\anonymous-users-mooc-session-meta\notebooks
[CELL 01-00] REPO_ROOT: C:\anonymous-users-mooc-session-meta
[CELL 01-00] PROJECT_STATE=C:\anonymous-users-mooc-session-meta\PROJECT_STATE.md
[CELL 01-00] META_REGISTRY=C:\anonymous-users-mooc-session-meta\meta.json
[CELL 01-00] DATA_RAW=C:\anonymous-users-mooc-session-meta\data\raw
[CELL 01-00] DATA_INTERIM=C:\anonymous-users-mooc-session-meta\data\interim
[CELL 01-00] DATA_PROCESSED=C:\anonymous-users-mooc-session-meta\data\processed
[CELL 01-00] REPORTS=C:\anonymous-users-mooc-session-meta\reports
[CELL 01-00] done


Reproducibility: seed everything

In [2]:
# [CELL 01-01] Reproducibility: seed everything

t0 = cell_start("CELL 01-01", "Seed everything")

GLOBAL_SEED = 20260106

def seed_everything(seed: int) -> None:
    import random
    random.seed(seed)
    np.random.seed(seed)

seed_everything(GLOBAL_SEED)

cell_end("CELL 01-01", t0, seed=GLOBAL_SEED)



[CELL 01-01] Seed everything
[CELL 01-01] start=2026-01-06T21:23:21
[CELL 01-01] seed=20260106
[CELL 01-01] elapsed=0.00s
[CELL 01-01] done


JSON IO (atomic) + hashing helpers

In [3]:
# [CELL 01-02] JSON IO (atomic) + hashing helpers

t0 = cell_start("CELL 01-02", "JSON IO + hashing")

def write_json_atomic(path: Path, obj: Any, indent: int = 2) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp = path.with_suffix(path.suffix + f".tmp_{uuid.uuid4().hex}")
    with tmp.open("w", encoding="utf-8") as f:
        json.dump(obj, f, ensure_ascii=False, indent=indent)
    tmp.replace(path)

def read_json(path: Path) -> Any:
    if not path.exists():
        raise RuntimeError(f"Missing JSON file: {path}")
    with path.open("r", encoding="utf-8") as f:
        return json.load(f)

def sha256_file(path: Path, chunk_size: int = 1024 * 1024) -> str:
    h = hashlib.sha256()
    with path.open("rb") as f:
        while True:
            b = f.read(chunk_size)
            if not b:
                break
            h.update(b)
    return h.hexdigest()

def assert_nonempty_df(df: pd.DataFrame, name: str) -> None:
    if df is None or not isinstance(df, pd.DataFrame) or df.shape[0] == 0:
        raise RuntimeError(f"{name} is empty or invalid DataFrame")

cell_end("CELL 01-02", t0)



[CELL 01-02] JSON IO + hashing
[CELL 01-02] start=2026-01-06T21:23:21
[CELL 01-02] elapsed=0.00s
[CELL 01-02] done


Run tagging + report/config/manifest + meta.json append-only

In [4]:
# [CELL 01-03] Run tagging + report/config/manifest + meta.json registry

t0 = cell_start("CELL 01-03", "Start run + init run files + meta.json")

NOTEBOOK_NAME = "01_ingest_mars"
RUN_TAG = datetime.now().strftime("%Y%m%d_%H%M%S")
RUN_ID = uuid.uuid4().hex

OUT_DIR = PATHS["REPORTS"] / NOTEBOOK_NAME / RUN_TAG
OUT_DIR.mkdir(parents=True, exist_ok=True)

REPORT_PATH = OUT_DIR / "report.json"
CONFIG_PATH = OUT_DIR / "config.json"
MANIFEST_PATH = OUT_DIR / "manifest.json"

RAW_DIR = PATHS["DATA_RAW"] / "mars"
OUT_PARQUET = PATHS["DATA_INTERIM"] / "mars_events_raw.parquet"
OUT_DUCKDB = PATHS["DATA_INTERIM"] / "mars.duckdb"

CFG = {
    "notebook": NOTEBOOK_NAME,
    "run_id": RUN_ID,
    "run_tag": RUN_TAG,
    "seed": GLOBAL_SEED,
    "paths": {
        "raw_dir": str(RAW_DIR),
        "out_parquet": str(OUT_PARQUET),
        "out_duckdb": str(OUT_DUCKDB),
        "out_dir": str(OUT_DIR),
    },
    "ingest": {
        "accepted_ext": [".json", ".jsonl", ".csv", ".parquet"],
        "parquet_compression": "zstd",
        "duckdb_view": "mars_events_raw",
    }
}

write_json_atomic(CONFIG_PATH, CFG)

report = {
    "run_id": RUN_ID,
    "notebook": NOTEBOOK_NAME,
    "run_tag": RUN_TAG,
    "created_at": datetime.now().isoformat(timespec="seconds"),
    "repo_root": str(REPO_ROOT),
    "metrics": {},
    "key_findings": [],
    "sanity_samples": {},
    "data_fingerprints": {},
    "notes": [],
}
write_json_atomic(REPORT_PATH, report)

manifest = {
    "run_id": RUN_ID,
    "notebook": NOTEBOOK_NAME,
    "run_tag": RUN_TAG,
    "artifacts": [],
}
write_json_atomic(MANIFEST_PATH, manifest)

# meta.json append-only
META_PATH = PATHS["META_REGISTRY"]
if not META_PATH.exists():
    write_json_atomic(META_PATH, {"schema_version": 1, "runs": []})

meta = read_json(META_PATH)
if "runs" not in meta or not isinstance(meta["runs"], list):
    raise RuntimeError("meta.json invalid: missing 'runs' list")

meta["runs"].append({
    "run_id": RUN_ID,
    "notebook": NOTEBOOK_NAME,
    "run_tag": RUN_TAG,
    "out_dir": str(OUT_DIR),
    "created_at": datetime.now().isoformat(timespec="seconds"),
})
write_json_atomic(META_PATH, meta)

cell_end("CELL 01-03", t0,
         out_dir=str(OUT_DIR),
         report=str(REPORT_PATH),
         config=str(CONFIG_PATH),
         manifest=str(MANIFEST_PATH),
         meta=str(META_PATH))



[CELL 01-03] Start run + init run files + meta.json
[CELL 01-03] start=2026-01-06T21:23:21
[CELL 01-03] out_dir=C:\anonymous-users-mooc-session-meta\reports\01_ingest_mars\20260106_212321
[CELL 01-03] report=C:\anonymous-users-mooc-session-meta\reports\01_ingest_mars\20260106_212321\report.json
[CELL 01-03] config=C:\anonymous-users-mooc-session-meta\reports\01_ingest_mars\20260106_212321\config.json
[CELL 01-03] manifest=C:\anonymous-users-mooc-session-meta\reports\01_ingest_mars\20260106_212321\manifest.json
[CELL 01-03] meta=C:\anonymous-users-mooc-session-meta\meta.json
[CELL 01-03] elapsed=0.01s
[CELL 01-03] done


DuckDB dependency check

In [5]:
# [CELL 01-04] DuckDB dependency check

t0 = cell_start("CELL 01-04", "Import duckdb")

try:
    import duckdb
except Exception as e:
    raise RuntimeError(
        "Missing duckdb. Install via:\n"
        "  conda install -c conda-forge duckdb\n"
        "or\n"
        "  pip install duckdb\n"
    ) from e

print("[CELL 01-04] duckdb_version:", duckdb.__version__)
cell_end("CELL 01-04", t0)



[CELL 01-04] Import duckdb
[CELL 01-04] start=2026-01-06T21:23:21
[CELL 01-04] duckdb_version: 1.4.3
[CELL 01-04] elapsed=0.04s
[CELL 01-04] done


Enumerate raw files + fingerprint

In [6]:
# [CELL 01-05] Select explicit_ratings_en.csv only + fingerprint

t0 = cell_start("CELL 01-05", "Select explicit_ratings_en.csv only", raw_dir=str(RAW_DIR))

target_name = "explicit_ratings_en.csv"
target_path = RAW_DIR / target_name

if not target_path.exists():
    # show what exists to avoid guessing
    existing = sorted([p.name for p in RAW_DIR.rglob("*") if p.is_file()])
    raise RuntimeError(
        f"Required file not found: {target_path}\n"
        f"Files under {RAW_DIR} (first 50): {existing[:50]}"
    )

raw_fp = {
    "root": str(RAW_DIR),
    "n_files": 1,
    "files": [{
        "name": target_path.name,
        "relpath": str(target_path.relative_to(RAW_DIR)),
        "bytes": int(target_path.stat().st_size),
        "sha256": sha256_file(target_path),
        "suffix": target_path.suffix.lower(),
    }]
}

print("[CELL 01-05] using:", target_path.name)
print(json.dumps(raw_fp["files"][0], indent=2))

cell_end("CELL 01-05", t0)



[CELL 01-05] Select explicit_ratings_en.csv only
[CELL 01-05] start=2026-01-06T21:23:21
[CELL 01-05] raw_dir=C:\anonymous-users-mooc-session-meta\data\raw\mars
[CELL 01-05] using: explicit_ratings_en.csv
{
  "name": "explicit_ratings_en.csv",
  "relpath": "explicit_ratings_en.csv",
  "bytes": 145724,
  "sha256": "8190b9f10afcb44b7542616648ee3c7825f42f7bed832784a57e083b53773708",
  "suffix": ".csv"
}
[CELL 01-05] elapsed=0.00s
[CELL 01-05] done


Load raw → DataFrame (auto by extension)

Load explicit_ratings_en.csv only

In [7]:
# [CELL 01-06] Load explicit_ratings_en.csv only

t0 = cell_start("CELL 01-06", "Load explicit_ratings_en.csv")

events = pd.read_csv(RAW_DIR / "explicit_ratings_en.csv")
assert_nonempty_df(events, "events")

events["__source_file"] = "explicit_ratings_en.csv"

print("[CELL 01-06] shape:", events.shape)
print("[CELL 01-06] columns:", list(events.columns))

# minimal preview
print("[CELL 01-06] head(3):")
print(events.head(3).to_string(index=False))

cell_end("CELL 01-06", t0, rows=int(events.shape[0]), cols=int(events.shape[1]))



[CELL 01-06] Load explicit_ratings_en.csv
[CELL 01-06] start=2026-01-06T21:23:21
[CELL 01-06] shape: (3659, 6)
[CELL 01-06] columns: ['user_id', 'item_id', 'watch_percentage', 'created_at', 'rating', '__source_file']
[CELL 01-06] head(3):
 user_id  item_id  watch_percentage          created_at  rating           __source_file
  224557      510               100 2018-09-28 16:18:29      10 explicit_ratings_en.csv
  224557      615               100 2018-09-28 16:22:22      10 explicit_ratings_en.csv
  224557     7680               100 2018-09-28 16:23:34      10 explicit_ratings_en.csv
[CELL 01-06] rows=3659
[CELL 01-06] cols=6
[CELL 01-06] elapsed=0.01s
[CELL 01-06] done


Save canonical Parquet (raw → parquet)

In [8]:
# [CELL 01-07] Save canonical Parquet (raw → parquet)

t0 = cell_start("CELL 01-07", "Save canonical raw table as Parquet", out_parquet=str(OUT_PARQUET))

OUT_PARQUET.parent.mkdir(parents=True, exist_ok=True)
events.to_parquet(OUT_PARQUET, index=False, compression=CFG["ingest"]["parquet_compression"])

parq_bytes = int(OUT_PARQUET.stat().st_size)
parq_sha = sha256_file(OUT_PARQUET)

print("[CELL 01-07] saved:", OUT_PARQUET)
print("[CELL 01-07] bytes:", parq_bytes)
print("[CELL 01-07] sha256:", parq_sha)

cell_end("CELL 01-07", t0)



[CELL 01-07] Save canonical raw table as Parquet
[CELL 01-07] start=2026-01-06T21:23:21
[CELL 01-07] out_parquet=C:\anonymous-users-mooc-session-meta\data\interim\mars_events_raw.parquet
[CELL 01-07] saved: C:\anonymous-users-mooc-session-meta\data\interim\mars_events_raw.parquet
[CELL 01-07] bytes: 43047
[CELL 01-07] sha256: 3e261bffc99b67cd75a4b03b71785b5dbb854a6aea5db131f4763cb57d51fc9d
[CELL 01-07] elapsed=0.02s
[CELL 01-07] done


DuckDB: create DB + view from Parquet (parquet → duckdb)

In [9]:
# [CELL 01-08] DuckDB DB + view from Parquet

t0 = cell_start("CELL 01-08", "Create DuckDB and VIEW from Parquet", out_duckdb=str(OUT_DUCKDB))

OUT_DUCKDB.parent.mkdir(parents=True, exist_ok=True)
con = duckdb.connect(str(OUT_DUCKDB))

view = CFG["ingest"]["duckdb_view"]
con.execute(f"DROP VIEW IF EXISTS {view};")
con.execute(f"""
CREATE VIEW {view} AS
SELECT * FROM read_parquet('{str(OUT_PARQUET).replace("'", "''")}')
""")

n = con.execute(f"SELECT COUNT(*) FROM {view}").fetchone()[0]
schema_df = con.execute(f"DESCRIBE {view}").fetchdf()

print("[CELL 01-08] view:", view)
print("[CELL 01-08] rows:", int(n))
print("[CELL 01-08] schema_head:")
print(schema_df.head(40).to_string(index=False))

cell_end("CELL 01-08", t0, rows=int(n))



[CELL 01-08] Create DuckDB and VIEW from Parquet
[CELL 01-08] start=2026-01-06T21:23:21
[CELL 01-08] out_duckdb=C:\anonymous-users-mooc-session-meta\data\interim\mars.duckdb
[CELL 01-08] view: mars_events_raw
[CELL 01-08] rows: 3659
[CELL 01-08] schema_head:
     column_name column_type null  key default extra
         user_id      BIGINT  YES None    None  None
         item_id      BIGINT  YES None    None  None
watch_percentage      BIGINT  YES None    None  None
      created_at     VARCHAR  YES None    None  None
          rating      BIGINT  YES None    None  None
   __source_file     VARCHAR  YES None    None  None
[CELL 01-08] rows=3659
[CELL 01-08] elapsed=0.03s
[CELL 01-08] done


Update report + manifest (fingerprints + sanity sample)

In [11]:
# [CELL 01-09] Update report + manifest

t0 = cell_start("CELL 01-09", "Write fingerprints + sanity sample to report + manifest")

report = read_json(REPORT_PATH)
manifest = read_json(MANIFEST_PATH)

sample_cols = list(events.columns)[:20]
head3 = events.loc[:2, sample_cols].to_dict(orient="records")

report["data_fingerprints"]["mars_raw_files"] = {
    "root": raw_fp["root"],
    "n_files": raw_fp["n_files"],
    "files_first3": raw_fp["files"][:3],
}
report["data_fingerprints"]["mars_events_raw_parquet"] = {
    "path": str(OUT_PARQUET),
    "bytes": parq_bytes,
    "sha256": parq_sha,
}
report["sanity_samples"]["mars_events_raw_head3"] = head3
report["notes"].append("Storage rule enforced: raw -> parquet -> duckdb (VIEW mars_events_raw)")

write_json_atomic(REPORT_PATH, report)

def add_artifact(path: Path) -> None:
    manifest["artifacts"].append({
        "path": str(path),
        "bytes": int(path.stat().st_size),
        "sha256": sha256_file(path),
    })

add_artifact(OUT_PARQUET)
add_artifact(OUT_DUCKDB)

write_json_atomic(MANIFEST_PATH, manifest)

print("[CELL 01-09] updated_report:", REPORT_PATH)
print("[CELL 01-09] updated_manifest:", MANIFEST_PATH)

cell_end("CELL 01-09", t0)



[CELL 01-09] Write fingerprints + sanity sample to report + manifest
[CELL 01-09] start=2026-01-06T21:25:08


PermissionError: [Errno 13] Permission denied: 'C:\\anonymous-users-mooc-session-meta\\data\\interim\\mars.duckdb'