# Notebook 01: Ingest MARS Dataset

**Purpose:** Load, explore, and clean the MARS (MOOCs And Recommender Systems) dataset, then save the cleaned interactions.

**Input:** `data/raw/mars/explicit_ratings_en.csv`

**Output:** 
- `data/interim/mars/interactions.parquet` - Cleaned interactions
- `reports/01_ingest_mars/<run_tag>/report.json`

**MARS Dataset:**
- Explicit ratings from an English MOOC platform
- Columns: user_id, item_id, watch_percentage, created_at, rating

In [1]:
# [CELL 01-00] Bootstrap: repo root + paths + imports

import os
import sys
import json
import time
import uuid
import hashlib
from pathlib import Path
from datetime import datetime
from typing import Any, Dict, List

import numpy as np
import pandas as pd

# Remember when the bootstrap cell began and show where the kernel is running.
t0 = datetime.now()
print("[CELL 01-00] start=" + t0.isoformat(timespec="seconds"))
print(f"[CELL 01-00] CWD: {Path.cwd().resolve()}")

def find_repo_root(start: Path) -> Path:
    """Return the closest directory at or above ``start`` containing PROJECT_STATE.md.

    ``start`` is resolved first; it and then each of its parents are checked
    in order, nearest first.

    Raises:
        RuntimeError: if no checked directory contains PROJECT_STATE.md.
    """
    origin = start.resolve()
    candidates = (origin, *origin.parents)
    match = next(
        (folder for folder in candidates if (folder / "PROJECT_STATE.md").exists()),
        None,
    )
    if match is None:
        raise RuntimeError("Could not find PROJECT_STATE.md")
    return match

# Locate the repository once; every project path below is derived from it.
REPO_ROOT = find_repo_root(Path.cwd())
print("[CELL 01-00] REPO_ROOT:", REPO_ROOT)

# Project locations, keyed by the names the later cells look up.
PATHS = dict(
    META_REGISTRY=REPO_ROOT.joinpath("meta.json"),
    DATA_RAW=REPO_ROOT.joinpath("data", "raw"),
    DATA_INTERIM=REPO_ROOT.joinpath("data", "interim"),
    DATA_PROCESSED=REPO_ROOT.joinpath("data", "processed"),
    REPORTS=REPO_ROOT.joinpath("reports"),
)

# Echo the resolved locations so the run log records them.
for k, v in PATHS.items():
    print(f"[CELL 01-00] {k}={v}")

def cell_start(cell_id: str, title: str, **kwargs: Any) -> float:
    """Print a cell's opening banner and return its start time.

    Args:
        cell_id: Tag shown in square brackets at the start of every line.
        title: Human-readable cell title printed on the first line.
        **kwargs: Extra ``name=value`` pairs, printed one per line.

    Returns:
        The ``time.time()`` reading taken before anything is printed.
    """
    started = time.time()
    tag = f"[{cell_id}]"
    print(f"\n{tag} {title}")
    print(f"{tag} start={datetime.now().isoformat(timespec='seconds')}")
    for name, value in kwargs.items():
        print(f"{tag} {name}={value}")
    return started

def cell_end(cell_id: str, t0: float, **kwargs: Any) -> None:
    """Print a cell's closing lines: extra values, elapsed seconds, then "done".

    Args:
        cell_id: Tag shown in square brackets at the start of every line.
        t0: Start time as a ``time.time()`` reading; elapsed time is measured from it.
        **kwargs: Extra ``name=value`` pairs, printed one per line before the timing.
    """
    tag = f"[{cell_id}]"
    for name, value in kwargs.items():
        print(f"{tag} {name}={value}")
    elapsed = time.time() - t0
    print(f"{tag} elapsed={elapsed:.2f}s")
    print(f"{tag} done")

def write_json_atomic(path: Path, obj: Any, indent: int = 2) -> None:
    """Serialise ``obj`` as UTF-8 JSON to ``path`` via a temporary sibling file.

    The JSON is first written to a uniquely named ``.tmp_<uuid>`` file in the
    same directory and then moved over ``path``, so an existing ``path`` is
    only replaced once serialisation has succeeded. Missing parent
    directories are created.

    Args:
        path: Destination file.
        obj: Any value ``json.dump`` can serialise.
        indent: Indentation passed through to ``json.dump``.

    Raises:
        TypeError, ValueError: propagated from ``json.dump``.
        OSError: propagated from the file system operations.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp = path.with_suffix(path.suffix + f".tmp_{uuid.uuid4().hex}")
    try:
        with tmp.open("w", encoding="utf-8") as f:
            json.dump(obj, f, ensure_ascii=False, indent=indent)
        tmp.replace(path)
    except BaseException:
        # A failed dump or replace must not leave a half-written temp file
        # next to the target; remove it, then surface the original error.
        tmp.unlink(missing_ok=True)
        raise

def read_json(path: Path) -> Any:
    """Read the UTF-8 file at ``path`` and return its parsed JSON content."""
    text = path.read_text(encoding="utf-8")
    return json.loads(text)

# Mark the end of the bootstrap cell in the run log.
print("[CELL 01-00] done")

[CELL 01-00] start=2026-01-12T22:17:28
[CELL 01-00] CWD: C:\anonymous-users-mooc-session-meta\notebooks
[CELL 01-00] REPO_ROOT: C:\anonymous-users-mooc-session-meta
[CELL 01-00] META_REGISTRY=C:\anonymous-users-mooc-session-meta\meta.json
[CELL 01-00] DATA_RAW=C:\anonymous-users-mooc-session-meta\data\raw
[CELL 01-00] DATA_INTERIM=C:\anonymous-users-mooc-session-meta\data\interim
[CELL 01-00] DATA_PROCESSED=C:\anonymous-users-mooc-session-meta\data\processed
[CELL 01-00] REPORTS=C:\anonymous-users-mooc-session-meta\reports
[CELL 01-00] done


In [2]:
# [CELL 01-01] Setup run tagging
# Creates this run's output directory under reports/<notebook>/<run_tag>/ and
# initialises the in-memory `report` dict that later cells fill in.

t0 = cell_start("CELL 01-01", "Setup run tagging")

NOTEBOOK_NAME = "01_ingest_mars"
RUN_TAG = f"{datetime.now():%Y%m%d_%H%M%S}"  # second-resolution timestamp tag
RUN_ID = uuid.uuid4().hex

OUT_DIR = PATHS["REPORTS"].joinpath(NOTEBOOK_NAME, RUN_TAG)
OUT_DIR.mkdir(parents=True, exist_ok=True)

REPORT_PATH = OUT_DIR.joinpath("report.json")
CONFIG_PATH = OUT_DIR.joinpath("config.json")

# Skeleton of the run report; metrics, findings and samples start empty.
report = dict(
    run_id=RUN_ID,
    notebook=NOTEBOOK_NAME,
    run_tag=RUN_TAG,
    created_at=datetime.now().isoformat(timespec="seconds"),
    metrics={},
    key_findings=[],
    sanity_samples={},
)

cell_end("CELL 01-01", t0, out_dir=str(OUT_DIR))


[CELL 01-01] Setup run tagging
[CELL 01-01] start=2026-01-12T22:17:28
[CELL 01-01] out_dir=C:\anonymous-users-mooc-session-meta\reports\01_ingest_mars\20260112_221728
[CELL 01-01] elapsed=0.00s
[CELL 01-01] done


In [3]:
# [CELL 01-02] Load raw MARS data
# Reads the raw ratings CSV into `df_raw` and prints its shape, columns and dtypes.

t0 = cell_start("CELL 01-02", "Load raw MARS data")

RAW_PATH = PATHS["DATA_RAW"].joinpath("mars", "explicit_ratings_en.csv")
print(f"[CELL 01-02] Loading: {RAW_PATH}")

# Check for the file up front so a missing dataset gives a clear message.
if RAW_PATH.exists():
    df_raw = pd.read_csv(RAW_PATH)
else:
    raise FileNotFoundError(f"MARS raw data not found: {RAW_PATH}")

print(f"[CELL 01-02] Raw shape: {df_raw.shape}")
print(f"[CELL 01-02] Columns: {df_raw.columns.tolist()}")
print(f"[CELL 01-02] Dtypes:\n{df_raw.dtypes}")

cell_end("CELL 01-02", t0, n_rows=df_raw.shape[0])


[CELL 01-02] Load raw MARS data
[CELL 01-02] start=2026-01-12T22:17:28
[CELL 01-02] Loading: C:\anonymous-users-mooc-session-meta\data\raw\mars\explicit_ratings_en.csv
[CELL 01-02] Raw shape: (3659, 5)
[CELL 01-02] Columns: ['user_id', 'item_id', 'watch_percentage', 'created_at', 'rating']
[CELL 01-02] Dtypes:
user_id              int64
item_id              int64
watch_percentage     int64
created_at          object
rating               int64
dtype: object
[CELL 01-02] n_rows=3659
[CELL 01-02] elapsed=0.02s
[CELL 01-02] done


In [4]:
# [CELL 01-03] Explore data statistics
# Prints descriptive statistics for the raw (not yet de-duplicated) ratings in
# `df_raw` and records the headline counts in report["metrics"].

t0 = cell_start("CELL 01-03", "Explore data statistics")

print("[CELL 01-03] Sample rows:")
print(df_raw.head(10))

print("\n[CELL 01-03] Basic statistics:")
n_users = df_raw["user_id"].nunique()
n_items = df_raw["item_id"].nunique()
n_interactions = len(df_raw)

# Density = observed interactions / possible (user, item) pairs. A CSV with
# no data rows has zero possible pairs; report 0% instead of dividing by zero.
n_pairs = n_users * n_items
density_pct = n_interactions / n_pairs * 100 if n_pairs else 0.0

print(f"  - Total interactions: {n_interactions:,}")
print(f"  - Unique users: {n_users:,}")
print(f"  - Unique items: {n_items:,}")
print(f"  - Density: {density_pct:.4f}%")

print("\n[CELL 01-03] Interactions per user:")
user_counts = df_raw.groupby("user_id").size()
print(f"  - Min: {user_counts.min()}")
print(f"  - Max: {user_counts.max()}")
print(f"  - Mean: {user_counts.mean():.2f}")
print(f"  - Median: {user_counts.median():.2f}")

print("\n[CELL 01-03] Rating distribution:")
print(df_raw["rating"].value_counts().sort_index())

print("\n[CELL 01-03] Watch percentage distribution:")
print(df_raw["watch_percentage"].describe())

# These counts describe the raw frame; the de-duplicated count is recorded
# separately once the cleaned data has been produced.
report["metrics"]["n_interactions"] = n_interactions
report["metrics"]["n_users"] = n_users
report["metrics"]["n_items"] = n_items

cell_end("CELL 01-03", t0)


[CELL 01-03] Explore data statistics
[CELL 01-03] start=2026-01-12T22:17:28
[CELL 01-03] Sample rows:
   user_id  item_id  watch_percentage           created_at  rating
0   224557      510               100  2018-09-28 16:18:29      10
1   224557      615               100  2018-09-28 16:22:22      10
2   224557     7680               100  2018-09-28 16:23:34      10
3   224293      510               100  2018-09-28 17:20:30      10
4   224293      515               100  2018-09-28 17:40:02      10
5   224293      516                32  2018-09-28 17:41:45       4
6   224442      510               100  2018-10-01 20:11:35      10
7   196845      510                 7  2018-10-06 13:12:52       1
8   265845      510               100  2018-10-10 21:17:26      10
9   224470      510               100  2018-10-11 19:59:49      10

[CELL 01-03] Basic statistics:
  - Total interactions: 3,659
  - Unique users: 822
  - Unique items: 776
  - Density: 0.5736%

[CELL 01-03] Interactions per us

In [5]:
# [CELL 01-04] Parse timestamps and clean data
# Adds `timestamp` (datetime) and `ts_epoch` (whole seconds since the Unix
# epoch) columns to `df_raw`, then builds `df_clean` by dropping exact
# (user, item, second) duplicates and sorting by user and time.

t0 = cell_start("CELL 01-04", "Parse timestamps and clean data")

# Parse timestamps
df_raw["timestamp"] = pd.to_datetime(df_raw["created_at"])

# Missing created_at values parse to NaT, which has no meaningful epoch value;
# stop here rather than writing a bogus ts_epoch downstream.
n_bad_ts = int(df_raw["timestamp"].isna().sum())
if n_bad_ts:
    raise ValueError(
        f"[CELL 01-04] {n_bad_ts:,} rows have a missing or unparseable created_at"
    )

# Convert to epoch seconds by subtracting the epoch and floor-dividing by one
# second. Unlike casting to int64 and dividing by 10**9, this does not assume
# the datetimes are stored with nanosecond resolution.
_ts_is_aware = df_raw["timestamp"].dt.tz is not None
_epoch = pd.Timestamp("1970-01-01", tz="UTC" if _ts_is_aware else None)
df_raw["ts_epoch"] = (df_raw["timestamp"] - _epoch) // pd.Timedelta(seconds=1)

print("[CELL 01-04] Date range:")
print(f"  - Min: {df_raw['timestamp'].min()}")
print(f"  - Max: {df_raw['timestamp'].max()}")

# Remove duplicates (same user, same item, same timestamp)
n_before = len(df_raw)
df_clean = df_raw.drop_duplicates(subset=["user_id", "item_id", "ts_epoch"])
n_after = len(df_clean)

print(f"[CELL 01-04] Removed {n_before - n_after:,} duplicate interactions")

# Sort by user and timestamp
df_clean = df_clean.sort_values(["user_id", "ts_epoch"]).reset_index(drop=True)

cell_end("CELL 01-04", t0, n_clean=len(df_clean))


[CELL 01-04] Parse timestamps and clean data
[CELL 01-04] start=2026-01-12T22:17:28
[CELL 01-04] Date range:
  - Min: 2018-09-28 14:38:15
  - Max: 2021-09-20 16:26:06
[CELL 01-04] Removed 4 duplicate interactions
[CELL 01-04] n_clean=3655
[CELL 01-04] elapsed=0.05s
[CELL 01-04] done


In [6]:
# [CELL 01-05] Save cleaned interactions
# Writes the five output columns of `df_clean` to
# data/interim/mars/interactions.parquet and records the row count and a
# summary line in `report`.

t0 = cell_start("CELL 01-05", "Save cleaned interactions")

# Keep only the columns that go to disk
df_out = df_clean.loc[
    :, ["user_id", "item_id", "ts_epoch", "watch_percentage", "rating"]
].copy()

# Write the parquet file, creating the target directory if needed
OUT_INTERIM = PATHS["DATA_INTERIM"].joinpath("mars")
OUT_INTERIM.mkdir(parents=True, exist_ok=True)

out_path = OUT_INTERIM.joinpath("interactions.parquet")
df_out.to_parquet(out_path, index=False)

print(f"[CELL 01-05] Saved: {out_path}")
print(f"[CELL 01-05] Shape: {df_out.shape}")
print(f"[CELL 01-05] File size: {out_path.stat().st_size / 1024 / 1024:.2f} MB")

report["metrics"].update(n_clean_interactions=len(df_out))
report["key_findings"].append(f"MARS dataset: {n_users:,} users, {n_items:,} items, {len(df_out):,} interactions")

cell_end("CELL 01-05", t0)


[CELL 01-05] Save cleaned interactions
[CELL 01-05] start=2026-01-12T22:17:28
[CELL 01-05] Saved: C:\anonymous-users-mooc-session-meta\data\interim\mars\interactions.parquet
[CELL 01-05] Shape: (3655, 5)
[CELL 01-05] File size: 0.04 MB
[CELL 01-05] elapsed=0.11s
[CELL 01-05] done


In [7]:
# [CELL 01-06] Save report
# Writes this run's report.json, appends an entry for the run to the
# meta.json registry, and prints a closing summary.

t0 = cell_start("CELL 01-06", "Save report")

# Add sanity samples
report["sanity_samples"]["first_3_rows"] = df_out.head(3).to_dict(orient="records")

write_json_atomic(REPORT_PATH, report)
print(f"[CELL 01-06] Saved: {REPORT_PATH}")

# Update meta.json
META_PATH = PATHS["META_REGISTRY"]
if META_PATH.exists():
    meta = read_json(META_PATH)
else:
    # First run: start a fresh registry in memory; it is written once below.
    meta = {"schema_version": 1, "runs": []}

# setdefault tolerates an existing registry that has no "runs" list yet.
meta.setdefault("runs", []).append({
    "run_id": RUN_ID,
    "notebook": NOTEBOOK_NAME,
    "run_tag": RUN_TAG,
    "out_dir": str(OUT_DIR),
    "created_at": datetime.now().isoformat(timespec="seconds"),
})
write_json_atomic(META_PATH, meta)

print("\n" + "="*60)
print("NOTEBOOK 01 COMPLETE: MARS Ingestion")
print("="*60)
print(f"Users: {n_users:,}")
print(f"Items: {n_items:,}")
print(f"Interactions: {len(df_out):,}")
print(f"Output: {out_path}")

cell_end("CELL 01-06", t0)


[CELL 01-06] Save report
[CELL 01-06] start=2026-01-12T22:17:28
[CELL 01-06] Saved: C:\anonymous-users-mooc-session-meta\reports\01_ingest_mars\20260112_221728\report.json

NOTEBOOK 01 COMPLETE: MARS Ingestion
Users: 822
Items: 776
Interactions: 3,655
Output: C:\anonymous-users-mooc-session-meta\data\interim\mars\interactions.parquet
[CELL 01-06] elapsed=0.01s
[CELL 01-06] done
