Notebook header + imports + version logging

In [1]:
# [CELL 02-00] Notebook 02 — EDA Target MOOC (MARS)
# Goal: load raw target dataset (MARS), inspect schema, compute EDA counts, and export canonical events parquet.
# Canonical schema (agreed): user_id, item_id, timestamp, signal_type, value
# Guardrail: NO session-gap analysis, NO sessionization, NO prefix->label samples.

import os
import sys
import json
import math
import time
import platform
from pathlib import Path
from datetime import datetime

import pandas as pd

try:
    import duckdb
except Exception as e:
    raise RuntimeError("duckdb is required for this notebook. Please install duckdb.") from e

print("[02-00] Python:", sys.version)
print("[02-00] Platform:", platform.platform())
print("[02-00] pandas:", pd.__version__)
print("[02-00] duckdb:", duckdb.__version__)
print("[02-00] CWD:", Path.cwd().resolve())


[02-00] Python: 3.11.14 | packaged by Anaconda, Inc. | (main, Oct 21 2025, 18:30:03) [MSC v.1929 64 bit (AMD64)]
[02-00] Platform: Windows-10-10.0.22621-SP0
[02-00] pandas: 2.3.3
[02-00] duckdb: 1.4.3
[02-00] CWD: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\notebooks


Locate repo root (Windows-safe) + standard folders

In [2]:
# [CELL 02-01] Bootstrap: locate repo root reliably (Windows-safe)

from pathlib import Path

CWD = Path.cwd().resolve()
print("[02-01] Initial CWD:", CWD)

def find_repo_root(start: Path) -> Path:
    for p in [start, *start.parents]:
        if (p / "PROJECT_STATE.md").exists():
            print(f"[02-01] Found PROJECT_STATE.md in: {p}")
            return p
    for p in [start, *start.parents]:
        if (p / ".git").exists():
            print(f"[02-01] Found .git in: {p}")
            return p
    raise FileNotFoundError("Could not locate repo root (PROJECT_STATE.md or .git not found).")

REPO_ROOT = find_repo_root(CWD)
print("[02-01] REPO_ROOT:", REPO_ROOT)

NOTEBOOKS_DIR = REPO_ROOT / "notebooks"
DATA_DIR      = REPO_ROOT / "data"
RAW_DIR       = DATA_DIR / "raw"
PROCESSED_DIR = DATA_DIR / "processed"
REPORTS_DIR   = REPO_ROOT / "reports"

for d in [DATA_DIR, RAW_DIR, PROCESSED_DIR, REPORTS_DIR]:
    d.mkdir(parents=True, exist_ok=True)

print("[02-01] RAW_DIR:", RAW_DIR)
print("[02-01] PROCESSED_DIR:", PROCESSED_DIR)
print("[02-01] REPORTS_DIR:", REPORTS_DIR)


[02-01] Initial CWD: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\notebooks
[02-01] Found PROJECT_STATE.md in: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta
[02-01] REPO_ROOT: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta
[02-01] RAW_DIR: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\data\raw
[02-01] PROCESSED_DIR: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\data\processed
[02-01] REPORTS_DIR: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\reports


Create RUN_TAG + run folders + helpers

In [3]:
# [CELL 02-02] Run tag + output folders + helpers

RUN_TAG = datetime.now().strftime("%Y%m%d_%H%M%S")
RUN_REPORT_DIR = REPORTS_DIR / "02_eda_target_mooc" / RUN_TAG
RUN_REPORT_DIR.mkdir(parents=True, exist_ok=True)

TARGET_NAME = "mars"
TARGET_PROCESSED_DIR = PROCESSED_DIR / TARGET_NAME
TARGET_PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

print("[02-02] RUN_TAG:", RUN_TAG)
print("[02-02] RUN_REPORT_DIR:", RUN_REPORT_DIR)
print("[02-02] TARGET_PROCESSED_DIR:", TARGET_PROCESSED_DIR)

def save_json(path: Path, obj: dict):
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps(obj, indent=2, ensure_ascii=False), encoding="utf-8")
    print("[02-02] Wrote JSON:", path)

def save_df_csv(path: Path, df: pd.DataFrame, index=False):
    path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(path, index=index)
    print("[02-02] Wrote CSV:", path, "| shape:", df.shape)

run_meta = {
    "run_tag": RUN_TAG,
    "notebook": "02_eda_target_mooc.ipynb",
    "created_at": datetime.now().isoformat(timespec="seconds"),
    "repo_root": str(REPO_ROOT),
    "raw_dir": str(RAW_DIR),
    "processed_dir": str(PROCESSED_DIR),
    "target_processed_dir": str(TARGET_PROCESSED_DIR),
    "python": sys.version,
    "pandas": pd.__version__,
    "duckdb": duckdb.__version__,
    "platform": platform.platform(),
}
save_json(RUN_REPORT_DIR / "run_meta.json", run_meta)


[02-02] RUN_TAG: 20251229_115541
[02-02] RUN_REPORT_DIR: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\reports\02_eda_target_mooc\20251229_115541
[02-02] TARGET_PROCESSED_DIR: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\data\processed\mars
[02-02] Wrote JSON: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\reports\02_eda_target_mooc\20251229_115541\run_meta.json


Locate MARS raw files (auto-discover; fail loudly if missing)

In [4]:
# [CELL 02-03] Discover MARS raw files (explicit + implicit)

MARS_RAW_DIR = RAW_DIR / "mars"
if not MARS_RAW_DIR.exists():
    raise FileNotFoundError(f"Missing MARS folder: {MARS_RAW_DIR}")

explicit_csv = MARS_RAW_DIR / "explicit_ratings_en.csv"
implicit_csv = MARS_RAW_DIR / "implicit_ratings_en.csv"

print("[02-03] MARS dir:", MARS_RAW_DIR)
print("[02-03] explicit exists:", explicit_csv.exists(), "|", explicit_csv)
print("[02-03] implicit exists:", implicit_csv.exists(), "|", implicit_csv)

if not explicit_csv.exists():
    raise FileNotFoundError("Missing explicit_ratings_en.csv under data/raw/mars/")
if not implicit_csv.exists():
    raise FileNotFoundError("Missing implicit_ratings_en.csv under data/raw/mars/")

# Optional context tables
items_csv = MARS_RAW_DIR / "items_en.csv"
users_csv = MARS_RAW_DIR / "users_en.csv"
print("[02-03] items exists:", items_csv.exists(), "|", items_csv)
print("[02-03] users exists:", users_csv.exists(), "|", users_csv)


[02-03] MARS dir: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\data\raw\mars
[02-03] explicit exists: True | D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\data\raw\mars\explicit_ratings_en.csv
[02-03] implicit exists: True | D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\data\raw\mars\implicit_ratings_en.csv
[02-03] items exists: True | D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\data\raw\mars\items_en.csv
[02-03] users exists: True | D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\data\raw\mars\users_en.csv


Validate file types (must be consistent)

In [5]:
# [CELL 02-04] Confirm file formats (expected CSV)

assert explicit_csv.suffix.lower() == ".csv"
assert implicit_csv.suffix.lower() == ".csv"
print("[02-04] OK: MARS interaction files are CSV.")


[02-04] OK: MARS interaction files are CSV.


DuckDB: build a raw view (no memory blowups)

In [6]:
# [CELL 02-05] Build DuckDB raw view for MARS ratings (explicit + implicit) safely

con = duckdb.connect(database=":memory:")
con.execute("PRAGMA threads=8;")

RAW_VIEW = "raw_target"

explicit_path = str(explicit_csv.resolve())
implicit_path = str(implicit_csv.resolve())

print("[02-05] explicit:", explicit_path)
print("[02-05] implicit:", implicit_path)

con.execute(f"CREATE OR REPLACE VIEW raw_explicit AS SELECT * FROM read_csv_auto('{explicit_path}', header=True);")
con.execute(f"CREATE OR REPLACE VIEW raw_implicit AS SELECT * FROM read_csv_auto('{implicit_path}', header=True);")

# UNION BY NAME aligns columns; missing columns become NULL
con.execute(
    f"""
    CREATE OR REPLACE VIEW {RAW_VIEW} AS
    SELECT * FROM raw_explicit
    UNION BY NAME
    SELECT * FROM raw_implicit;
    """
)

schema_df = con.execute(f"DESCRIBE SELECT * FROM {RAW_VIEW}").df()
print("[02-05] Unified raw schema:")
display(schema_df)

sample_df = con.execute(f"SELECT * FROM {RAW_VIEW} LIMIT 5").df()
print("[02-05] Sample rows:")
display(sample_df)


[02-05] explicit: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\data\raw\mars\explicit_ratings_en.csv
[02-05] implicit: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\data\raw\mars\implicit_ratings_en.csv
[02-05] Unified raw schema:


Unnamed: 0,column_name,column_type,null,key,default,extra
0,user_id,BIGINT,YES,,,
1,item_id,BIGINT,YES,,,
2,watch_percentage,BIGINT,YES,,,
3,created_at,TIMESTAMP,YES,,,
4,rating,BIGINT,YES,,,


[02-05] Sample rows:


Unnamed: 0,user_id,item_id,watch_percentage,created_at,rating
0,224557,7680,100,2018-09-28 16:23:34,10
1,267435,510,100,2018-10-17 10:52:40,10
2,224453,615,100,2018-10-31 19:13:20,10
3,277123,70385,25,2020-02-05 18:50:53,3
4,277945,512,100,2018-11-20 16:23:15,10


Column mapping to canonical event schema (auto-detect + override)

In [7]:
# [CELL 02-06] Map raw columns -> canonical schema
# Canonical events schema (agreed): user_id, item_id, timestamp, signal_type, value
# For MARS: item_id is a COURSE id.

raw_cols = con.execute(f"SELECT * FROM {RAW_VIEW} LIMIT 1").df().columns.tolist()
raw_cols_lower = [c.lower() for c in raw_cols]
print("[02-06] Raw columns:", raw_cols)

def first_match(candidates):
    for cand in candidates:
        if cand in raw_cols_lower:
            return raw_cols[raw_cols_lower.index(cand)]
    return None

COL_USER = first_match(["user_id", "userid", "user", "uid"])
COL_ITEM = first_match(["item_id", "item", "course_id", "cid"])
COL_TIME = first_match(["created_at", "timestamp", "time", "event_time", "ts"])

# these may exist only for some rows
COL_RATING = first_match(["rating"])
COL_WATCH  = first_match(["watch_percentage", "watch_percent", "watch_pct"])

print("[02-06] Detected mapping:")
print("  COL_USER  =", COL_USER)
print("  COL_ITEM  =", COL_ITEM)
print("  COL_TIME  =", COL_TIME)
print("  COL_RATING=", COL_RATING)
print("  COL_WATCH =", COL_WATCH)

missing = [k for k,v in {"COL_USER": COL_USER, "COL_ITEM": COL_ITEM, "COL_TIME": COL_TIME}.items() if v is None]
if missing:
    raise ValueError("Missing required columns: " + ", ".join(missing))


[02-06] Raw columns: ['user_id', 'item_id', 'watch_percentage', 'created_at', 'rating']
[02-06] Detected mapping:
  COL_USER  = user_id
  COL_ITEM  = item_id
  COL_TIME  = created_at
  COL_RATING= rating
  COL_WATCH = watch_percentage


Build a canonical events_target view (types + null filtering; no sessions)

In [8]:
# [CELL 02-07] Create canonical view with agreed schema:
# user_id, item_id, timestamp, signal_type, value
# signal_type:
#   - explicit_rating: value = rating/10 (assumes rating is 0..10)
#   - implicit_watch:  value = watch_percentage/100 (assumes 0..100)

def qident(name: str) -> str:
    return '"' + name.replace('"', '""') + '"'

CANON_VIEW = "events_target"

u = qident(COL_USER)
i = qident(COL_ITEM)
t = qident(COL_TIME)

rating_expr = "NULL" if COL_RATING is None else f"try_cast({qident(COL_RATING)} AS DOUBLE)"
watch_expr  = "NULL" if COL_WATCH  is None else f"try_cast({qident(COL_WATCH)} AS DOUBLE)"

con.execute(
    f"""
    CREATE OR REPLACE VIEW {CANON_VIEW} AS
    SELECT
      CAST({u} AS VARCHAR) AS user_id,
      CAST({i} AS VARCHAR) AS item_id,
      CAST({t} AS TIMESTAMP) AS timestamp,

      CASE
        WHEN {rating_expr} IS NOT NULL THEN 'explicit_rating'
        WHEN {watch_expr}  IS NOT NULL THEN 'implicit_watch'
        ELSE 'interaction'
      END AS signal_type,

      CASE
        WHEN {rating_expr} IS NOT NULL THEN ({rating_expr} / 10.0)
        WHEN {watch_expr}  IS NOT NULL THEN ({watch_expr}  / 100.0)
        ELSE 1.0
      END AS value

    FROM {RAW_VIEW}
    WHERE {u} IS NOT NULL
      AND {i} IS NOT NULL
      AND {t} IS NOT NULL
    """
)

canon_sample = con.execute(f"SELECT * FROM {CANON_VIEW} LIMIT 10").df()
print("[02-07] Canonical sample:")
display(canon_sample)

bad_ts = con.execute(f"SELECT COUNT(*) AS n_bad_ts FROM {CANON_VIEW} WHERE timestamp IS NULL").df().iloc[0]["n_bad_ts"]
print("[02-07] Bad timestamp rows (timestamp is NULL after cast):", int(bad_ts))
if bad_ts > 0:
    raise ValueError("Timestamp casting produced NULLs. Check created_at format.")


[02-07] Canonical sample:


Unnamed: 0,user_id,item_id,timestamp,signal_type,value
0,224557,7680,2018-09-28 16:23:34,explicit_rating,1.0
1,267435,510,2018-10-17 10:52:40,explicit_rating,1.0
2,224453,615,2018-10-31 19:13:20,explicit_rating,1.0
3,277123,70385,2020-02-05 18:50:53,explicit_rating,0.3
4,277945,512,2018-11-20 16:23:15,explicit_rating,1.0
5,277945,804,2019-10-10 21:20:24,explicit_rating,0.5
6,277945,4418,2019-09-17 17:43:19,explicit_rating,1.0
7,277945,4422,2019-08-21 21:59:00,explicit_rating,0.4
8,277945,31972,2020-01-27 21:09:13,explicit_rating,1.0
9,277945,32024,2021-04-29 00:24:05,explicit_rating,0.1


[02-07] Bad timestamp rows (timestamp is NULL after cast): 0


Core EDA (counts, time range, top actions, quantiles)

In [9]:
# [CELL 02-08] EDA metrics for target dataset (MARS)

counts = con.execute(
    f"""
    SELECT
      COUNT(*) AS n_events,
      COUNT(DISTINCT user_id) AS n_users,
      COUNT(DISTINCT item_id) AS n_items,
      MIN(timestamp) AS min_ts,
      MAX(timestamp) AS max_ts
    FROM {CANON_VIEW};
    """
).df()
print("[02-08] Counts overview:")
display(counts)

signal_dist = con.execute(
    f"""
    SELECT signal_type, COUNT(*) AS n
    FROM {CANON_VIEW}
    GROUP BY 1
    ORDER BY n DESC;
    """
).df()
print("[02-08] signal_type distribution:")
display(signal_dist)

events_per_user = con.execute(
    f"""
    WITH per_user AS (
      SELECT user_id, COUNT(*) AS cnt
      FROM {CANON_VIEW}
      GROUP BY 1
    )
    SELECT
      approx_quantile(cnt, [0.5, 0.9, 0.99]) AS q,
      MIN(cnt) AS min_cnt,
      MAX(cnt) AS max_cnt,
      AVG(cnt) AS avg_cnt
    FROM per_user;
    """
).df()
print("[02-08] Events-per-user quantiles + min/max/avg:")
display(events_per_user)

value_stats = con.execute(
    f"""
    SELECT
      signal_type,
      COUNT(*) AS n,
      MIN(value) AS min_value,
      MAX(value) AS max_value,
      AVG(value) AS avg_value
    FROM {CANON_VIEW}
    GROUP BY 1
    ORDER BY 1;
    """
).df()
print("[02-08] Value stats by signal_type:")
display(value_stats)


[02-08] Counts overview:


Unnamed: 0,n_events,n_users,n_items,min_ts,max_ts
0,25212,3007,958,2016-10-28 19:13:15,2021-09-22 17:24:54


[02-08] signal_type distribution:


Unnamed: 0,signal_type,n
0,interaction,21553
1,explicit_rating,3659


[02-08] Events-per-user quantiles + min/max/avg:


Unnamed: 0,q,min_cnt,max_cnt,avg_cnt
0,"[3, 18, 89]",1,420,8.384436


[02-08] Value stats by signal_type:


Unnamed: 0,signal_type,n,min_value,max_value,avg_value
0,explicit_rating,3659,0.1,1.0,0.857584
1,interaction,21553,1.0,1.0,1.0


USER SEQUENCE CHARACTERISTICS

In [14]:
print("\n" + "="*70)
print("USER SEQUENCE CHARACTERISTICS")
print("="*70)

items_per_user = con.execute(f"""
    WITH per_user AS (
      SELECT 
        user_id, 
        COUNT(DISTINCT item_id) AS n_items,
        COUNT(*) AS n_events,
        MIN(timestamp) AS first_event,
        MAX(timestamp) AS last_event
      FROM {CANON_VIEW}
      GROUP BY user_id
    )
    SELECT
      approx_quantile(n_items,  [0.5, 0.9, 0.99]) AS items_quantiles,
      approx_quantile(n_events, [0.5, 0.9, 0.99]) AS events_quantiles,
      MIN(n_items) AS min_items,
      MAX(n_items) AS max_items,
      AVG(n_items) AS avg_items,
      AVG(EXTRACT(EPOCH FROM (last_event - first_event)) / 86400.0) AS avg_span_days
    FROM per_user
""").df()

print("\nItems per user:")
display(items_per_user)

sufficient_history = con.execute(f"""
    WITH per_user AS (
      SELECT user_id, COUNT(*) AS n_events
      FROM {CANON_VIEW}
      GROUP BY user_id
    )
    SELECT
      COUNT(*) AS total_users,
      SUM(CASE WHEN n_events >= 2  THEN 1 ELSE 0 END) AS users_2plus,
      SUM(CASE WHEN n_events >= 5  THEN 1 ELSE 0 END) AS users_5plus,
      SUM(CASE WHEN n_events >= 10 THEN 1 ELSE 0 END) AS users_10plus,
      SUM(CASE WHEN n_events >= 20 THEN 1 ELSE 0 END) AS users_20plus
    FROM per_user
""").df()

print("\nUsers by sequence length:")
display(sufficient_history)

row = sufficient_history.iloc[0]
total = int(row["total_users"])
def pct(x): 
    return (float(x) / total * 100.0) if total > 0 else 0.0

print(f"\n  ✅ {int(row['users_2plus']):,} users ({pct(row['users_2plus']):.1f}%) have >=2 events")
print(f"  ✅ {int(row['users_5plus']):,} users ({pct(row['users_5plus']):.1f}%) have >=5 events")
print(f"  ✅ {int(row['users_10plus']):,} users ({pct(row['users_10plus']):.1f}%) have >=10 events")



USER SEQUENCE CHARACTERISTICS

Items per user:


Unnamed: 0,items_quantiles,events_quantiles,min_items,max_items,avg_items,avg_span_days
0,"[2, 11, 44]","[3, 18, 88]",1,186,4.98304,57.154309



Users by sequence length:


Unnamed: 0,total_users,users_2plus,users_5plus,users_10plus,users_20plus
0,3007,2204.0,1150.0,629.0,274.0



  ✅ 2,204 users (73.3%) have >=2 events
  ✅ 1,150 users (38.2%) have >=5 events
  ✅ 629 users (20.9%) have >=10 events


ITEM (COURSE) CHARACTERISTICS

In [15]:
print("\n" + "="*70)
print("ITEM (COURSE) CHARACTERISTICS")
print("="*70)

events_per_item = con.execute(f"""
    WITH per_item AS (
      SELECT 
        item_id,
        COUNT(*) AS n_events,
        COUNT(DISTINCT user_id) AS n_users,
        MIN(timestamp) AS first_event,
        MAX(timestamp) AS last_event
      FROM {CANON_VIEW}
      GROUP BY item_id
    )
    SELECT
      COUNT(*) AS total_items,
      approx_quantile(n_events, [0.5, 0.9, 0.99]) AS events_quantiles,
      approx_quantile(n_users,  [0.5, 0.9, 0.99]) AS users_quantiles,
      MIN(n_events) AS min_events,
      MAX(n_events) AS max_events,
      AVG(n_events) AS avg_events,
      SUM(CASE WHEN n_events = 1 THEN 1 ELSE 0 END) AS items_single_event,
      SUM(CASE WHEN n_events <= 5 THEN 1 ELSE 0 END) AS items_cold_5
    FROM per_item
""").df()

print("\nEvents per item (course):")
display(events_per_item)

top_items = con.execute(f"""
    SELECT 
      item_id,
      COUNT(*) AS n_events,
      COUNT(DISTINCT user_id) AS n_users,
      AVG(value) AS avg_value
    FROM {CANON_VIEW}
    GROUP BY item_id
    ORDER BY n_events DESC
    LIMIT 10
""").df()

print("\nTop 10 most popular courses:")
display(top_items)



ITEM (COURSE) CHARACTERISTICS

Events per item (course):


Unnamed: 0,total_items,events_quantiles,users_quantiles,min_events,max_events,avg_events,items_single_event,items_cold_5
0,958,"[11, 59, 305]","[6, 35, 172]",1,910,26.317328,62.0,310.0



Top 10 most popular courses:


Unnamed: 0,item_id,n_events,n_users,avg_value
0,510,910,414,0.977692
1,911,523,399,0.992925
2,43457,392,143,0.960969
3,512,389,204,0.950129
4,511,373,185,0.984182
5,884,371,222,0.997574
6,7626,365,149,0.932055
7,3374,339,244,1.0
8,913,322,252,0.984783
9,940,318,251,0.997484


TIMESTAMP QUALITY CHECKS

In [16]:
print("\n" + "="*70)
print("TIMESTAMP QUALITY CHECKS")
print("="*70)

future_check = con.execute(f"""
    SELECT COUNT(*) AS n_future
    FROM {CANON_VIEW}
    WHERE timestamp > CURRENT_TIMESTAMP
""").df()
print(f"\nFuture timestamps: {int(future_check.iloc[0]['n_future'])}")

dup_check = con.execute(f"""
    WITH dups AS (
      SELECT user_id, item_id, timestamp, COUNT(*) AS cnt
      FROM {CANON_VIEW}
      GROUP BY 1, 2, 3
      HAVING COUNT(*) > 1
    )
    SELECT COUNT(*) AS n_dup_groups, COALESCE(SUM(cnt), 0) AS n_dup_rows
    FROM dups
""").df()

print("\nDuplicate (user, item, timestamp) tuples:")
display(dup_check)

order_check = con.execute(f"""
    WITH ordered AS (
      SELECT 
        user_id,
        timestamp,
        LAG(timestamp) OVER (PARTITION BY user_id ORDER BY timestamp) AS prev_ts
      FROM {CANON_VIEW}
    )
    SELECT COUNT(*) AS n_out_of_order
    FROM ordered
    WHERE prev_ts IS NOT NULL AND timestamp < prev_ts
""").df()

n_out = int(order_check.iloc[0]["n_out_of_order"])
print(f"\nOut-of-order events: {n_out}")
if n_out > 0:
    print("  ⚠️  WARNING: Some timestamps are not monotonic within users!")



TIMESTAMP QUALITY CHECKS

Future timestamps: 0

Duplicate (user, item, timestamp) tuples:


Unnamed: 0,n_dup_groups,n_dup_rows
0,5,10.0



Out-of-order events: 0


Daily volume series + simple plots (saved under reports/)

SPARSITY ANALYSIS

In [17]:
print("\n" + "="*70)
print("SPARSITY ANALYSIS")
print("="*70)

sparsity = con.execute(f"""
    WITH stats AS (
      SELECT 
        COUNT(*) AS n_events,
        COUNT(DISTINCT user_id) AS n_users,
        COUNT(DISTINCT item_id) AS n_items
      FROM {CANON_VIEW}
    )
    SELECT
      n_events,
      n_users,
      n_items,
      (n_users * n_items) AS matrix_size,
      ROUND(100.0 * n_events / NULLIF((n_users * n_items), 0), 6) AS density_pct,
      ROUND(100.0 - (100.0 * n_events / NULLIF((n_users * n_items), 0)), 6) AS sparsity_pct
    FROM stats
""").df()

print("\nUser-Item matrix sparsity:")
display(sparsity)

row = sparsity.iloc[0]
print(f"\n  Matrix size: {int(row['n_users']):,} users × {int(row['n_items']):,} items = {int(row['matrix_size']):,} cells")
print(f"  Observed: {int(row['n_events']):,} events")
print(f"  Density: {float(row['density_pct']):.6f}%")
print(f"  Sparsity: {float(row['sparsity_pct']):.6f}%")



SPARSITY ANALYSIS

User-Item matrix sparsity:


Unnamed: 0,n_events,n_users,n_items,matrix_size,density_pct,sparsity_pct
0,25212,3007,958,2880706,0.875202,99.124798



  Matrix size: 3,007 users × 958 items = 2,880,706 cells
  Observed: 25,212 events
  Density: 0.875202%
  Sparsity: 99.124798%


DATA PROVENANCE & LIMITATIONS (text only)

In [None]:
print("\n" + "="*70)
print("DATA PROVENANCE & LIMITATIONS")
print("="*70)

provenance = f"""
Dataset: MARS (Target MOOC)
Source: [CITATION TBD]
License: [TBD]

Known characteristics from this run:
  - Time period: {counts.iloc[0]['min_ts']} to {counts.iloc[0]['max_ts']}
  - Signals: explicit_rating + implicit_watch
  - Schema: (user_id, item_id(course), timestamp, signal_type, value)

Limitations (as observed / structural):
  1) Event-level interactions (not session-labeled at source)
  2) No user demographics/context features in these CSVs
  3) No course metadata used here (items_en.csv exists but not joined in EDA)
  4) Sparsity: see SPARSITY ANALYSIS cell output
  5) Sequence lengths: see USER SEQUENCE CHARACTERISTICS output

TODO before publication:
  - [ ] Find and cite original MARS dataset source
  - [ ] Verify license for academic use
  - [ ] Confirm rating scale and watch_percentage semantics
"""
print(provenance)

# Save provenance text for the run
prov_path = RUN_REPORT_DIR / "provenance_and_limitations.txt"
prov_path.write_text(provenance.strip() + "\n", encoding="utf-8")
print("[02-08E] Saved:", prov_path)


In [18]:
# [CELL 02-09] Daily volume series + plots

daily = con.execute(
    f"""
    SELECT
      date_trunc('day', timestamp) AS day,
      COUNT(*) AS n
    FROM {CANON_VIEW}
    GROUP BY 1
    ORDER BY 1;
    """
).df()

print("[02-09] Daily rows:", daily.shape)
display(daily.head())

# Save tables
save_df_csv(RUN_REPORT_DIR / "counts_overview.csv", counts)
save_df_csv(RUN_REPORT_DIR / "signal_type_dist.csv", signal_dist)
save_df_csv(RUN_REPORT_DIR / "events_per_user.csv", events_per_user)
save_df_csv(RUN_REPORT_DIR / "value_stats.csv", value_stats)
save_df_csv(RUN_REPORT_DIR / "daily_event_volume.csv", daily)

# Plot (matplotlib only; no fixed colors)
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 4))
plt.plot(pd.to_datetime(daily["day"]), daily["n"])
plt.title("MARS target daily event volume")
plt.xlabel("Day")
plt.ylabel("Events")
plt.tight_layout()

plot_path = RUN_REPORT_DIR / "daily_event_volume.png"
plt.savefig(plot_path, dpi=150)
plt.close()
print("[02-09] Saved plot:", plot_path)


[02-09] Daily rows: (1392, 2)


Unnamed: 0,day,n
0,2016-10-28,20
1,2016-10-29,3
2,2016-10-31,29
3,2016-11-01,11
4,2016-11-02,48


[02-02] Wrote CSV: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\reports\02_eda_target_mooc\20251229_115541\counts_overview.csv | shape: (1, 5)
[02-02] Wrote CSV: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\reports\02_eda_target_mooc\20251229_115541\signal_type_dist.csv | shape: (2, 2)
[02-02] Wrote CSV: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\reports\02_eda_target_mooc\20251229_115541\events_per_user.csv | shape: (1, 4)
[02-02] Wrote CSV: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\reports\02_eda_target_mooc\20251229_115541\value_stats.csv | shape: (2, 5)
[02-02] Wrote CSV: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\reports\02_eda_target_mooc\20251229_115541\daily_event_volume.csv | shape: (1392, 2)
[02-09] Saved plot: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\reports\02_eda_target_mooc\20251229_115541\daily_event_volume.png


In [19]:
save_df_csv(RUN_REPORT_DIR / "user_items_per_user.csv", items_per_user)
save_df_csv(RUN_REPORT_DIR / "user_sufficient_history.csv", sufficient_history)
save_df_csv(RUN_REPORT_DIR / "item_events_per_item.csv", events_per_item)
save_df_csv(RUN_REPORT_DIR / "item_top_10.csv", top_items)
save_df_csv(RUN_REPORT_DIR / "timestamp_future_check.csv", future_check)
save_df_csv(RUN_REPORT_DIR / "timestamp_dup_check.csv", dup_check)
save_df_csv(RUN_REPORT_DIR / "timestamp_order_check.csv", order_check)
save_df_csv(RUN_REPORT_DIR / "sparsity.csv", sparsity)


[02-02] Wrote CSV: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\reports\02_eda_target_mooc\20251229_115541\user_items_per_user.csv | shape: (1, 6)
[02-02] Wrote CSV: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\reports\02_eda_target_mooc\20251229_115541\user_sufficient_history.csv | shape: (1, 5)
[02-02] Wrote CSV: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\reports\02_eda_target_mooc\20251229_115541\item_events_per_item.csv | shape: (1, 8)
[02-02] Wrote CSV: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\reports\02_eda_target_mooc\20251229_115541\item_top_10.csv | shape: (10, 4)
[02-02] Wrote CSV: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\reports\02_eda_target_mooc\20251229_115541\timestamp_future_check.csv | shape: (1, 1)
[02-02] Wrote CSV: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\reports\02_eda_target_mooc\20251229_115541\timestamp_dup_check.csv | shape: (1, 2)
[02-02] Wrote CSV: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\reports\02_eda_tar

Export canonical parquet for downstream normalization (Notebook 03)

In [11]:
# [CELL 02-10] Export canonical target events parquet (hybrid: explicit + implicit)

OUT_EVENTS = TARGET_PROCESSED_DIR / "mars_events_canonical.parquet"

con.execute(
    f"""
    COPY (
      SELECT user_id, item_id, timestamp, signal_type, value
      FROM {CANON_VIEW}
    )
    TO '{str(OUT_EVENTS)}'
    (FORMAT PARQUET);
    """
)

print("[02-10] Wrote canonical target events parquet:", OUT_EVENTS)


[02-10] Wrote canonical target events parquet: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\data\processed\mars\mars_events_canonical.parquet


Optional: define KEEP_ACTIONS for target + export filtered interactions

In [12]:
# [CELL 02-11] Optional ablation exports (explicit-only / implicit-only)

OUT_EXPLICIT = TARGET_PROCESSED_DIR / "mars_events_explicit_only.parquet"
OUT_IMPLICIT = TARGET_PROCESSED_DIR / "mars_events_implicit_only.parquet"

con.execute(
    f"""
    COPY (
      SELECT user_id, item_id, timestamp, signal_type, value
      FROM {CANON_VIEW}
      WHERE signal_type = 'explicit_rating'
    )
    TO '{str(OUT_EXPLICIT)}'
    (FORMAT PARQUET);
    """
)
print("[02-11] Wrote:", OUT_EXPLICIT)

con.execute(
    f"""
    COPY (
      SELECT user_id, item_id, timestamp, signal_type, value
      FROM {CANON_VIEW}
      WHERE signal_type = 'implicit_watch'
    )
    TO '{str(OUT_IMPLICIT)}'
    (FORMAT PARQUET);
    """
)
print("[02-11] Wrote:", OUT_IMPLICIT)


[02-11] Wrote: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\data\processed\mars\mars_events_explicit_only.parquet
[02-11] Wrote: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\data\processed\mars\mars_events_implicit_only.parquet


Save dataset metadata for reproducibility

In [22]:
# [CELL 02-12] Save dataset metadata for reproducibility (JSON-safe)

import numpy as np

def json_safe(x):
    """Convert numpy/pandas types into JSON-serializable Python types."""
    # numpy scalar -> python scalar
    if isinstance(x, (np.integer, np.int64, np.int32)):
        return int(x)
    if isinstance(x, (np.floating, np.float64, np.float32)):
        return float(x)
    if isinstance(x, (np.bool_,)):
        return bool(x)

    # numpy array -> list
    if isinstance(x, np.ndarray):
        return x.tolist()

    # pandas Timestamp / datetime -> str
    if isinstance(x, (pd.Timestamp, datetime)):
        return str(x)

    # dict -> recurse
    if isinstance(x, dict):
        return {k: json_safe(v) for k, v in x.items()}

    # list/tuple -> recurse
    if isinstance(x, (list, tuple)):
        return [json_safe(v) for v in x]

    # fallback: keep primitives / stringify unknowns
    if x is None or isinstance(x, (str, int, float, bool)):
        return x
    return str(x)


counts_row = counts.iloc[0].to_dict()

meta = {
    "dataset": "MARS (target)",
    "run_tag": RUN_TAG,
    "canonical_schema": ["user_id", "item_id", "timestamp", "signal_type", "value"],
    "notes": [
        "In MARS, item_id is treated as course_id (items are courses).",
        "Hybrid signals are normalized: rating/10, watch_percentage/100.",
        "No session analysis or sessionization in this notebook."
    ],
    "raw_files": {
        "explicit_ratings_en.csv": str(explicit_csv),
        "implicit_ratings_en.csv": str(implicit_csv),
        "items_en.csv": str(items_csv) if "items_csv" in globals() else None,
        "users_en.csv": str(users_csv) if "users_csv" in globals() else None,
    },
    "mapping": {
        "COL_USER": COL_USER,
        "COL_ITEM": COL_ITEM,
        "COL_TIME": COL_TIME,
        "COL_RATING": COL_RATING,
        "COL_WATCH": COL_WATCH,
    },
    "counts": {
        "n_events": counts_row["n_events"],
        "n_users": counts_row["n_users"],
        "n_items": counts_row["n_items"],
        "min_ts": counts_row["min_ts"],
        "max_ts": counts_row["max_ts"],
    },
    "exports": {
        "mars_events_canonical": str(OUT_EVENTS),
        "mars_events_explicit_only": str(OUT_EXPLICIT) if "OUT_EXPLICIT" in globals() else None,
        "mars_events_implicit_only": str(OUT_IMPLICIT) if "OUT_IMPLICIT" in globals() else None,
    },
    "reports_dir": str(RUN_REPORT_DIR),
}

# ---------- Extra EDA (only if cells exist) ----------
extra_eda = {}

if "items_per_user" in globals():
    extra_eda["items_per_user"] = items_per_user.to_dict(orient="records")[0]

if "sufficient_history" in globals():
    extra_eda["sufficient_history"] = sufficient_history.to_dict(orient="records")[0]

if "events_per_item" in globals():
    extra_eda["events_per_item"] = events_per_item.to_dict(orient="records")[0]

if "top_items" in globals():
    extra_eda["top_items"] = top_items.to_dict(orient="records")

if "future_check" in globals() or "dup_check" in globals() or "order_check" in globals():
    extra_eda["timestamp_quality"] = {
        "n_future": future_check.iloc[0]["n_future"] if "future_check" in globals() else None,
        "dup_check": dup_check.to_dict(orient="records")[0] if "dup_check" in globals() else None,
        "n_out_of_order": order_check.iloc[0]["n_out_of_order"] if "order_check" in globals() else None,
    }

if "sparsity" in globals():
    extra_eda["sparsity"] = sparsity.to_dict(orient="records")[0]

meta["extra_eda"] = extra_eda

# ---------- Provenance / limitations ----------
meta["provenance"] = {
    "dataset_name": "MARS",
    "citation": "[CITATION TBD]",
    "license": "[TBD]",
    "limitations": [
        "Event-level interactions (not session-labeled at source).",
        "No user demographics/context features in these CSVs.",
        "No course metadata joined in this notebook (items_en.csv exists but not joined).",
        "Sparse user-item matrix (see sparsity in extra_eda).",
        "Short sequences for many users (see sufficient_history in extra_eda).",
    ],
    "notes": "See reports/.../provenance_and_limitations.txt for the run text if generated.",
}

# Make everything JSON serializable
meta = json_safe(meta)

# Save into processed (so Notebook 03 can rely on it) and also into the run report folder
save_json(TARGET_PROCESSED_DIR / "dataset_metadata.json", meta)
save_json(RUN_REPORT_DIR / "dataset_metadata.json", meta)

print("[02-12] Done. Next notebook is 03_schema_normalization.ipynb")


[02-02] Wrote JSON: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\data\processed\mars\dataset_metadata.json
[02-02] Wrote JSON: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\reports\02_eda_target_mooc\20251229_115541\dataset_metadata.json
[02-12] Done. Next notebook is 03_schema_normalization.ipynb
