We normalize Source (XuetangX) + Target (MARS) into the minimal unified schema:

user_id, item_id, timestamp, signal_type, value

Target (MARS): already in this schema from Notebook 02.

Source (XuetangX): we will map:

user_id → user_id

item_id → item_id (already holds the XuetangX course_id; items = courses, to match the target)

timestamp → ts

signal_type → from action (or constant if needed)

value → 1.0 (implicit signal)

Header + imports + version logging

In [1]:
# [CELL 03-00] Notebook 03 — Schema Normalization (Source + Target)
# Output minimal schema:
#   user_id, item_id, timestamp, signal_type, value
# Guardrail: NO session-gap analysis, NO sessionization, NO prefix->label samples.

import os
import sys
import json
import platform
from pathlib import Path
from datetime import datetime

import pandas as pd

# duckdb is a hard requirement of this notebook. Only a failed import is
# translated into the "please install" hint; any other exception raised while
# importing is left to propagate unchanged so its real cause stays visible.
try:
    import duckdb
except ImportError as e:
    raise RuntimeError("duckdb is required for this notebook. Please install duckdb.") from e

# Log interpreter, platform and library versions plus the working directory so
# the run can be reproduced from the captured output.
print("[03-00] Python:", sys.version)
print("[03-00] Platform:", platform.platform())
print("[03-00] pandas:", pd.__version__)
print("[03-00] duckdb:", duckdb.__version__)
print("[03-00] CWD:", Path.cwd().resolve())


[03-00] Python: 3.11.14 | packaged by Anaconda, Inc. | (main, Oct 21 2025, 18:30:03) [MSC v.1929 64 bit (AMD64)]
[03-00] Platform: Windows-10-10.0.22621-SP0
[03-00] pandas: 2.3.3
[03-00] duckdb: 1.4.3
[03-00] CWD: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\notebooks


Locate repo root (Windows-safe) + dirs

In [2]:
# [CELL 03-01] Bootstrap: locate repo root reliably (Windows-safe)

# Absolute working directory; used as the starting point of the repo-root search.
CWD = Path.cwd().resolve()
print("[03-01] Initial CWD:", CWD)

def find_repo_root(start: Path) -> Path:
    """Return the nearest directory at or above *start* that marks the repo root.

    A directory containing ``PROJECT_STATE.md`` wins over one containing
    ``.git``: the whole ancestor chain is scanned for the first marker before
    the second marker is considered at all. The matching directory is printed
    before it is returned.

    Raises:
        FileNotFoundError: if neither marker exists in *start* or any parent.
    """
    lineage = (start, *start.parents)
    for marker in ("PROJECT_STATE.md", ".git"):
        hit = next((folder for folder in lineage if (folder / marker).exists()), None)
        if hit is not None:
            print(f"[03-01] Found {marker} in: {hit}")
            return hit
    raise FileNotFoundError("Could not locate repo root (PROJECT_STATE.md or .git not found).")

# Anchor every project path on the detected repository root rather than on the
# directory the notebook happens to be launched from.
REPO_ROOT = find_repo_root(CWD)

DATA_DIR      = REPO_ROOT / "data"
RAW_DIR       = DATA_DIR / "raw"
PROCESSED_DIR = DATA_DIR / "processed"
REPORTS_DIR   = REPO_ROOT / "reports"

# Create the folder layout up front; existing folders are left as they are.
for d in [DATA_DIR, RAW_DIR, PROCESSED_DIR, REPORTS_DIR]:
    d.mkdir(parents=True, exist_ok=True)

print("[03-01] REPO_ROOT:", REPO_ROOT)
print("[03-01] PROCESSED_DIR:", PROCESSED_DIR)
print("[03-01] REPORTS_DIR:", REPORTS_DIR)


[03-01] Initial CWD: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\notebooks
[03-01] Found PROJECT_STATE.md in: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta
[03-01] REPO_ROOT: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta
[03-01] PROCESSED_DIR: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\data\processed
[03-01] REPORTS_DIR: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\reports


RUN_TAG + output folders + helpers

In [3]:
# [CELL 03-02] Run tag + output folders + helpers

# Local-time tag with one-second resolution; it names this run's report folder.
RUN_TAG = datetime.now().strftime("%Y%m%d_%H%M%S")
RUN_REPORT_DIR = REPORTS_DIR / "03_schema_normalization" / RUN_TAG
RUN_REPORT_DIR.mkdir(parents=True, exist_ok=True)

# Destination of the normalized event files; not tagged per run, so the same
# folder is reused on every execution.
NORM_DIR = PROCESSED_DIR / "normalized_events"
NORM_DIR.mkdir(parents=True, exist_ok=True)

print("[03-02] RUN_TAG:", RUN_TAG)
print("[03-02] RUN_REPORT_DIR:", RUN_REPORT_DIR)
print("[03-02] NORM_DIR:", NORM_DIR)

def save_json(path: Path, obj: dict):
    """Write *obj* to *path* as 2-space-indented UTF-8 JSON and log the path.

    Missing parent folders are created first. Non-ASCII characters are written
    as-is (not ``\\u``-escaped). Serialization happens before the file is
    opened, so a serialization error leaves any existing file untouched.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(obj, ensure_ascii=False, indent=2)
    with path.open("w", encoding="utf-8") as handle:
        handle.write(payload)
    print("[03-02] Wrote JSON:", path)

def save_df_csv(path: Path, df: pd.DataFrame, index=False):
    """Write *df* to *path* as CSV and log the path together with ``df.shape``.

    Missing parent folders are created first. *index* is forwarded to
    ``DataFrame.to_csv`` and controls whether the row index is written.
    """
    out_dir = path.parent
    out_dir.mkdir(parents=True, exist_ok=True)
    df.to_csv(path_or_buf=path, index=index)
    print("[03-02] Wrote CSV:", path, "| shape:", df.shape)

# Provenance record for this run (paths and tool versions), written into the
# run's report folder as run_meta.json.
run_meta = {
    "run_tag": RUN_TAG,
    "notebook": "03_schema_normalization.ipynb",
    # Taken from a second datetime.now() call, so it may differ slightly from RUN_TAG.
    "created_at": datetime.now().isoformat(timespec="seconds"),
    "repo_root": str(REPO_ROOT),
    "processed_dir": str(PROCESSED_DIR),
    "norm_dir": str(NORM_DIR),
    "python": sys.version,
    "pandas": pd.__version__,
    "duckdb": duckdb.__version__,
    "platform": platform.platform(),
}
save_json(RUN_REPORT_DIR / "run_meta.json", run_meta)


[03-02] RUN_TAG: 20251229_131421
[03-02] RUN_REPORT_DIR: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\reports\03_schema_normalization\20251229_131421
[03-02] NORM_DIR: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\data\processed\normalized_events
[03-02] Wrote JSON: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\reports\03_schema_normalization\20251229_131421\run_meta.json


Input paths + existence checks

In [4]:
# [CELL 03-03] Inputs

# Source: from Notebook 01 output
SOURCE_INTERACTIONS = PROCESSED_DIR / "xuetangx_interactions_course_filtered.parquet"

# Target: from Notebook 02 output (agreed minimal schema)
TARGET_EVENTS_CANON = PROCESSED_DIR / "mars" / "mars_events_canonical.parquet"

print("[03-03] SOURCE_INTERACTIONS:", SOURCE_INTERACTIONS)
print("[03-03] TARGET_EVENTS_CANON:", TARGET_EVENTS_CANON)

# Fail fast if an upstream notebook has not produced its output yet.
# Explicit raises are used instead of `assert`, which is stripped when Python
# runs with -O and would let a missing input slip through to the SQL stage.
if not SOURCE_INTERACTIONS.exists():
    raise FileNotFoundError(f"Missing source interactions parquet: {SOURCE_INTERACTIONS}")
if not TARGET_EVENTS_CANON.exists():
    raise FileNotFoundError(f"Missing target events parquet: {TARGET_EVENTS_CANON}")


[03-03] SOURCE_INTERACTIONS: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\data\processed\xuetangx_interactions_course_filtered.parquet
[03-03] TARGET_EVENTS_CANON: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\data\processed\mars\mars_events_canonical.parquet


DuckDB connect + load views + print schemas (CHECKPOINT)

In [5]:
# [CELL 03-04] DuckDB connect + create raw views + schema inspection (CHECKPOINT)

# In-memory DuckDB session; nothing is persisted except the files written explicitly.
con = duckdb.connect(database=":memory:")
con.execute("PRAGMA threads=8;")

# The parquet paths are embedded in SQL string literals. Double any single
# quote so a path such as ...\O'Brien\... cannot terminate the literal early.
_src_path_sql = str(SOURCE_INTERACTIONS).replace("'", "''")
_tgt_path_sql = str(TARGET_EVENTS_CANON).replace("'", "''")

# Raw views over the two input files; later cells build the normalized views on top.
con.execute(f"CREATE OR REPLACE VIEW src_raw AS SELECT * FROM read_parquet('{_src_path_sql}');")
con.execute(f"CREATE OR REPLACE VIEW tgt_raw AS SELECT * FROM read_parquet('{_tgt_path_sql}');")

# CHECKPOINT: column names and types as DuckDB sees them.
src_schema = con.execute("DESCRIBE SELECT * FROM src_raw;").df()
tgt_schema = con.execute("DESCRIBE SELECT * FROM tgt_raw;").df()

print("[03-04] src_raw schema:")
display(src_schema)

print("[03-04] tgt_raw schema:")
display(tgt_schema)

# small samples
print("[03-04] src_raw sample:")
display(con.execute("SELECT * FROM src_raw LIMIT 5;").df())

print("[03-04] tgt_raw sample:")
display(con.execute("SELECT * FROM tgt_raw LIMIT 5;").df())

# Save schemas
save_df_csv(RUN_REPORT_DIR / "src_raw_schema.csv", src_schema)
save_df_csv(RUN_REPORT_DIR / "tgt_raw_schema.csv", tgt_schema)


[03-04] src_raw schema:


Unnamed: 0,column_name,column_type,null,key,default,extra
0,user_id,VARCHAR,YES,,,
1,item_id,VARCHAR,YES,,,
2,ts,TIMESTAMP_NS,YES,,,
3,action,VARCHAR,YES,,,


[03-04] tgt_raw schema:


Unnamed: 0,column_name,column_type,null,key,default,extra
0,user_id,VARCHAR,YES,,,
1,item_id,VARCHAR,YES,,,
2,timestamp,TIMESTAMP,YES,,,
3,signal_type,VARCHAR,YES,,,
4,value,DOUBLE,YES,,,


[03-04] src_raw sample:


Unnamed: 0,user_id,item_id,ts,action
0,1482755,course-v1:BIT+PHY1701601+2015_T2,2015-10-26 11:49:16,click_about
1,1482755,course-v1:BIT+PHY1701601+2015_T2,2015-10-26 11:49:17,click_about
2,1482755,course-v1:BIT+PHY1701601+2015_T2,2015-10-26 11:49:20,click_about
3,1482755,course-v1:BIT+PHY1701601+2015_T2,2015-10-26 11:49:24,click_info
4,1482755,course-v1:BIT+PHY1701601+2015_T2,2015-10-26 11:49:26,click_info


[03-04] tgt_raw sample:


Unnamed: 0,user_id,item_id,timestamp,signal_type,value
0,224557,510,2018-09-28 16:18:29,explicit_rating,1.0
1,224557,615,2018-09-28 16:22:22,explicit_rating,1.0
2,224557,7680,2018-09-28 16:23:34,explicit_rating,1.0
3,224293,510,2018-09-28 17:20:30,explicit_rating,1.0
4,224293,515,2018-09-28 17:40:02,explicit_rating,1.0


[03-02] Wrote CSV: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\reports\03_schema_normalization\20251229_131421\src_raw_schema.csv | shape: (4, 6)
[03-02] Wrote CSV: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\reports\03_schema_normalization\20251229_131421\tgt_raw_schema.csv | shape: (5, 6)


Explicit mapping + normalized views (src_norm, tgt_norm)

In [6]:
# [CELL 03-05] Create normalized views (src_norm, tgt_norm) to the agreed minimal schema
# Final minimal columns:
#   user_id, item_id, timestamp, signal_type, value
# Optional debug column:
#   domain ∈ {source, target}

# Builds the two normalized views (src_norm, tgt_norm). Both expose the same
# columns: domain, user_id, item_id, timestamp, signal_type, value.
print("\n" + "="*70)
print("03-05 NORMALIZATION MAPPING")
print("="*70)

# Source schema confirmed:
# user_id, item_id, ts, action
SRC_USER = "user_id"
SRC_ITEM = "item_id"      # In XuetangX this is course_id already (course-v1:...)
SRC_TIME = "ts"
SRC_ACTION = "action"

# Target schema confirmed:
# user_id, item_id, timestamp, signal_type, value
TGT_USER = "user_id"
TGT_ITEM = "item_id"      # In MARS this is course_id (your agreed interpretation)
TGT_TIME = "timestamp"
TGT_SIGNAL = "signal_type"
TGT_VALUE = "value"

print("[03-05] SRC columns:", {"user_id": SRC_USER, "item_id": SRC_ITEM, "timestamp": SRC_TIME, "action": SRC_ACTION})
print("[03-05] TGT columns:", {"user_id": TGT_USER, "item_id": TGT_ITEM, "timestamp": TGT_TIME, "signal_type": TGT_SIGNAL, "value": TGT_VALUE})

# Create normalized source view:
# - signal_type: use action (more informative than a constant)
# - value: 1.0 for all implicit interactions (simple and consistent)
# - rows with a NULL user, item or time are dropped; rows with a NULL action
#   are kept, so signal_type can be NULL in src_norm
# NOTE(review): the DESCRIBE output reports ts as TIMESTAMP_NS; the cast to
# TIMESTAMP presumably reduces precision - confirm this is acceptable.
con.execute(f"""
CREATE OR REPLACE VIEW src_norm AS
SELECT
  'source'::VARCHAR AS domain,
  CAST({SRC_USER} AS VARCHAR) AS user_id,
  CAST({SRC_ITEM} AS VARCHAR) AS item_id,
  CAST({SRC_TIME} AS TIMESTAMP) AS timestamp,
  CAST({SRC_ACTION} AS VARCHAR) AS signal_type,
  1.0::DOUBLE AS value
FROM src_raw
WHERE {SRC_USER} IS NOT NULL
  AND {SRC_ITEM} IS NOT NULL
  AND {SRC_TIME} IS NOT NULL;
""")

# Create normalized target view:
# - keep signal_type/value as produced in Notebook 02
# - same NULL filter as the source: only user, item and time are required
con.execute(f"""
CREATE OR REPLACE VIEW tgt_norm AS
SELECT
  'target'::VARCHAR AS domain,
  CAST({TGT_USER} AS VARCHAR) AS user_id,
  CAST({TGT_ITEM} AS VARCHAR) AS item_id,
  CAST({TGT_TIME} AS TIMESTAMP) AS timestamp,
  CAST({TGT_SIGNAL} AS VARCHAR) AS signal_type,
  CAST({TGT_VALUE} AS DOUBLE) AS value
FROM tgt_raw
WHERE {TGT_USER} IS NOT NULL
  AND {TGT_ITEM} IS NOT NULL
  AND {TGT_TIME} IS NOT NULL;
""")

# Eyeball a few rows of each view to confirm the column layout.
print("[03-05] src_norm sample:")
display(con.execute("SELECT * FROM src_norm LIMIT 5;").df())

print("[03-05] tgt_norm sample:")
display(con.execute("SELECT * FROM tgt_norm LIMIT 5;").df())

# San



03-05 NORMALIZATION MAPPING
[03-05] SRC columns: {'user_id': 'user_id', 'item_id': 'item_id', 'timestamp': 'ts', 'action': 'action'}
[03-05] TGT columns: {'user_id': 'user_id', 'item_id': 'item_id', 'timestamp': 'timestamp', 'signal_type': 'signal_type', 'value': 'value'}
[03-05] src_norm sample:


Unnamed: 0,domain,user_id,item_id,timestamp,signal_type,value
0,source,1482755,course-v1:BIT+PHY1701601+2015_T2,2015-10-26 11:49:16,click_about,1.0
1,source,1482755,course-v1:BIT+PHY1701601+2015_T2,2015-10-26 11:49:17,click_about,1.0
2,source,1482755,course-v1:BIT+PHY1701601+2015_T2,2015-10-26 11:49:20,click_about,1.0
3,source,1482755,course-v1:BIT+PHY1701601+2015_T2,2015-10-26 11:49:24,click_info,1.0
4,source,1482755,course-v1:BIT+PHY1701601+2015_T2,2015-10-26 11:49:26,click_info,1.0


[03-05] tgt_norm sample:


Unnamed: 0,domain,user_id,item_id,timestamp,signal_type,value
0,target,224557,510,2018-09-28 16:18:29,explicit_rating,1.0
1,target,224557,615,2018-09-28 16:22:22,explicit_rating,1.0
2,target,224557,7680,2018-09-28 16:23:34,explicit_rating,1.0
3,target,224293,510,2018-09-28 17:20:30,explicit_rating,1.0
4,target,224293,515,2018-09-28 17:40:02,explicit_rating,1.0


Validation (counts, time range, duplicates, ordering, future dates)

In [7]:
# [CELL 03-06] Validation checks (no sessionization)
# Read-only checks on src_norm / tgt_norm: counts, time range, duplicate
# tuples, per-user ordering and future timestamps. Nothing is modified here.

print("\n" + "="*70)
print("03-06 VALIDATION CHECKS")
print("="*70)

def summary(view: str):
    """Return a one-row DataFrame of headline statistics for *view*.

    Columns: n_events, n_users, n_items (distinct counts), min_ts, max_ts.
    *view* is interpolated into the SQL verbatim, so it must be the name of a
    relation on the module-level ``con`` connection that has ``user_id``,
    ``item_id`` and ``timestamp`` columns.
    """
    query = f"""
        SELECT
          COUNT(*) AS n_events,
          COUNT(DISTINCT user_id) AS n_users,
          COUNT(DISTINCT item_id) AS n_items,
          MIN(timestamp) AS min_ts,
          MAX(timestamp) AS max_ts
        FROM {view};
    """
    return con.execute(query).df()

# Headline statistics of both normalized views (before any deduplication).
src_sum = summary("src_norm")
tgt_sum = summary("tgt_norm")

print("[03-06] src_norm summary:")
display(src_sum)
print("[03-06] tgt_norm summary:")
display(tgt_sum)

# Duplicates: same (user_id, item_id, timestamp)
# n_dup_groups = number of tuples occurring more than once;
# n_dup_rows   = total rows belonging to those tuples (all copies counted).
# NOTE: signal_type is NOT part of this key, whereas the dedup step in cell
# 03-07 partitions by (user_id, item_id, timestamp, signal_type); the numbers
# reported here are therefore not the number of rows that dedup removes.
src_dups = con.execute("""
WITH d AS (
  SELECT user_id, item_id, timestamp, COUNT(*) AS c
  FROM src_norm
  GROUP BY 1,2,3
  HAVING COUNT(*) > 1
)
SELECT COUNT(*) AS n_dup_groups, COALESCE(SUM(c),0) AS n_dup_rows
FROM d;
""").df()

tgt_dups = con.execute("""
WITH d AS (
  SELECT user_id, item_id, timestamp, COUNT(*) AS c
  FROM tgt_norm
  GROUP BY 1,2,3
  HAVING COUNT(*) > 1
)
SELECT COUNT(*) AS n_dup_groups, COALESCE(SUM(c),0) AS n_dup_rows
FROM d;
""").df()

print("[03-06] src duplicate tuples:")
display(src_dups)
print("[03-06] tgt duplicate tuples:")
display(tgt_dups)

# Timestamp ordering within users
# NOTE(review): LAG is evaluated over each user's rows already ORDERed BY
# timestamp, so prev_ts can never be greater than timestamp and the count below
# is 0 by construction. The query does not test the physical row order of the
# input files; confirm which ordering property was meant to be verified.
src_order_bad = con.execute("""
WITH w AS (
  SELECT user_id, timestamp,
         LAG(timestamp) OVER (PARTITION BY user_id ORDER BY timestamp) AS prev_ts
  FROM src_norm
)
SELECT COUNT(*) AS n_out_of_order
FROM w
WHERE prev_ts IS NOT NULL AND timestamp < prev_ts;
""").df()

tgt_order_bad = con.execute("""
WITH w AS (
  SELECT user_id, timestamp,
         LAG(timestamp) OVER (PARTITION BY user_id ORDER BY timestamp) AS prev_ts
  FROM tgt_norm
)
SELECT COUNT(*) AS n_out_of_order
FROM w
WHERE prev_ts IS NOT NULL AND timestamp < prev_ts;
""").df()

print("[03-06] src out-of-order:", int(src_order_bad.iloc[0]["n_out_of_order"]))
print("[03-06] tgt out-of-order:", int(tgt_order_bad.iloc[0]["n_out_of_order"]))

# Future timestamps (should be 0)
# NOTE(review): the event timestamps are cast to TIMESTAMP in the normalized
# views while CURRENT_TIMESTAMP comes from the DuckDB session; presumably the
# comparison depends on the session time zone - confirm if exact cut-offs matter.
src_future = con.execute("SELECT COUNT(*) AS n_future FROM src_norm WHERE timestamp > CURRENT_TIMESTAMP;").df()
tgt_future = con.execute("SELECT COUNT(*) AS n_future FROM tgt_norm WHERE timestamp > CURRENT_TIMESTAMP;").df()
print("[03-06] src future:", int(src_future.iloc[0]["n_future"]), "| tgt future:", int(tgt_future.iloc[0]["n_future"]))

# Save validation outputs
# One CSV per check, all written into this run's report folder.
save_df_csv(RUN_REPORT_DIR / "src_norm_summary.csv", src_sum)
save_df_csv(RUN_REPORT_DIR / "tgt_norm_summary.csv", tgt_sum)
save_df_csv(RUN_REPORT_DIR / "src_dups.csv", src_dups)
save_df_csv(RUN_REPORT_DIR / "tgt_dups.csv", tgt_dups)
save_df_csv(RUN_REPORT_DIR / "src_order_bad.csv", src_order_bad)
save_df_csv(RUN_REPORT_DIR / "tgt_order_bad.csv", tgt_order_bad)
save_df_csv(RUN_REPORT_DIR / "src_future.csv", src_future)
save_df_csv(RUN_REPORT_DIR / "tgt_future.csv", tgt_future)



03-06 VALIDATION CHECKS


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[03-06] src_norm summary:


Unnamed: 0,n_events,n_users,n_items,min_ts,max_ts
0,167818548,770283,1628,2015-07-31 23:59:15,2017-07-31 23:59:09


[03-06] tgt_norm summary:


Unnamed: 0,n_events,n_users,n_items,min_ts,max_ts
0,3659,822,776,2018-09-28 14:38:15,2021-09-20 16:26:06


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[03-06] src duplicate tuples:


Unnamed: 0,n_dup_groups,n_dup_rows
0,12583173,33248396.0


[03-06] tgt duplicate tuples:


Unnamed: 0,n_dup_groups,n_dup_rows
0,4,8.0


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[03-06] src out-of-order: 0
[03-06] tgt out-of-order: 0


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[03-06] src future: 0 | tgt future: 0
[03-02] Wrote CSV: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\reports\03_schema_normalization\20251229_131421\src_norm_summary.csv | shape: (1, 5)
[03-02] Wrote CSV: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\reports\03_schema_normalization\20251229_131421\tgt_norm_summary.csv | shape: (1, 5)
[03-02] Wrote CSV: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\reports\03_schema_normalization\20251229_131421\src_dups.csv | shape: (1, 2)
[03-02] Wrote CSV: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\reports\03_schema_normalization\20251229_131421\tgt_dups.csv | shape: (1, 2)
[03-02] Wrote CSV: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\reports\03_schema_normalization\20251229_131421\src_order_bad.csv | shape: (1, 1)
[03-02] Wrote CSV: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\reports\03_schema_normalization\20251229_131421\tgt_order_bad.csv | shape: (1, 1)
[03-02] Wrote CSV: D:\00_DS-ML-Workspace\mooc-coldstart-sess

Create deduped views + export Parquet files

In [8]:
# [CELL 03-07] Deduplicate then export normalized Parquets

print("\n" + "="*70)
print("03-07 DEDUP + EXPORT")
print("="*70)

# Dedup source:
# Key: user_id, item_id, timestamp, signal_type
# value is constant 1.0 -> safe to keep 1 row
# NOTE: this key includes signal_type, unlike the duplicate report in cell
# 03-06 (user_id, item_id, timestamp). Events sharing a timestamp but differing
# in signal_type are kept as separate rows.
# The ORDER BY column is part of the partition key, so it does not rank rows
# within a partition; which copy gets rn = 1 is arbitrary, but only the key
# columns and a constant are projected, so every copy yields the same output row.
con.execute("""
CREATE OR REPLACE VIEW src_norm_dedup AS
SELECT
  domain, user_id, item_id, timestamp, signal_type,
  1.0::DOUBLE AS value
FROM (
  SELECT
    domain, user_id, item_id, timestamp, signal_type,
    ROW_NUMBER() OVER (
      PARTITION BY user_id, item_id, timestamp, signal_type
      ORDER BY timestamp
    ) AS rn
  FROM src_norm
)
WHERE rn = 1;
""")

# Dedup target:
# Key: user_id, item_id, timestamp, signal_type
# value may vary (rating/watch); keep MAX(value) to be deterministic
# (domain is also a GROUP BY column, but it is the constant 'target' in tgt_norm.)
con.execute("""
CREATE OR REPLACE VIEW tgt_norm_dedup AS
SELECT
  domain, user_id, item_id, timestamp, signal_type,
  MAX(value) AS value
FROM tgt_norm
GROUP BY 1,2,3,4,5;
""")

# Quick counts: before/after
# Each value is the single cell of a one-row result; the metadata cell reuses
# these four names.
# NOTE(review): the *_dedup objects are created as views, so each COUNT here
# presumably re-runs the underlying dedup query - confirm the cost is acceptable.
src_before = con.execute("SELECT COUNT(*) AS n FROM src_norm;").df().iloc[0]["n"]
src_after  = con.execute("SELECT COUNT(*) AS n FROM src_norm_dedup;").df().iloc[0]["n"]
tgt_before = con.execute("SELECT COUNT(*) AS n FROM tgt_norm;").df().iloc[0]["n"]
tgt_after  = con.execute("SELECT COUNT(*) AS n FROM tgt_norm_dedup;").df().iloc[0]["n"]

print(f"[03-07] src before: {int(src_before):,} | after dedup: {int(src_after):,} | removed: {int(src_before - src_after):,}")
print(f"[03-07] tgt before: {int(tgt_before):,} | after dedup: {int(tgt_after):,} | removed: {int(tgt_before - tgt_after):,}")

OUT_SRC = NORM_DIR / "events_source_norm.parquet"
OUT_TGT = NORM_DIR / "events_target_norm.parquet"
OUT_ALL = NORM_DIR / "events_all_norm.parquet"

# The output paths are embedded in SQL string literals. Double any single
# quote so a path containing an apostrophe cannot terminate the literal early.
_out_src_sql = str(OUT_SRC).replace("'", "''")
_out_tgt_sql = str(OUT_TGT).replace("'", "''")
_out_all_sql = str(OUT_ALL).replace("'", "''")

# Export source (minimal schema only; the debug `domain` column is dropped)
con.execute(f"""
COPY (
  SELECT user_id, item_id, timestamp, signal_type, value
  FROM src_norm_dedup
)
TO '{_out_src_sql}' (FORMAT PARQUET);
""")
print("[03-07] Wrote:", OUT_SRC)

# Export target (minimal schema only)
con.execute(f"""
COPY (
  SELECT user_id, item_id, timestamp, signal_type, value
  FROM tgt_norm_dedup
)
TO '{_out_tgt_sql}' (FORMAT PARQUET);
""")
print("[03-07] Wrote:", OUT_TGT)

# Export combined (still no sessions; just union)
# The combined file keeps a `domain` column so rows stay attributable.
# NOTE(review): src_norm_dedup / tgt_norm_dedup are views, so this statement
# presumably recomputes both dedup queries - confirm the cost is acceptable.
con.execute(f"""
COPY (
  SELECT 'source' AS domain, user_id, item_id, timestamp, signal_type, value
  FROM src_norm_dedup
  UNION ALL
  SELECT 'target' AS domain, user_id, item_id, timestamp, signal_type, value
  FROM tgt_norm_dedup
)
TO '{_out_all_sql}' (FORMAT PARQUET);
""")
print("[03-07] Wrote:", OUT_ALL)



03-07 DEDUP + EXPORT


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[03-07] src before: 167,818,548 | after dedup: 154,817,413 | removed: 13,001,135
[03-07] tgt before: 3,659 | after dedup: 3,655 | removed: 4


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[03-07] Wrote: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\data\processed\normalized_events\events_source_norm.parquet
[03-07] Wrote: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\data\processed\normalized_events\events_target_norm.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[03-07] Wrote: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\data\processed\normalized_events\events_all_norm.parquet


Save normalization metadata JSON (JSON-safe)

In [9]:
# [CELL 03-08] Save normalization metadata (JSON-safe)

import numpy as np

def _json_safe_key(key):
    """Return a dict key that ``json.dumps`` accepts (str, int, float, bool or None)."""
    if isinstance(key, (np.integer, np.floating, np.bool_)):
        # NumPy scalars become the matching built-in scalar.
        return json_safe(key)
    if key is None or isinstance(key, (str, int, float, bool)):
        return key
    # Anything else (tuples, timestamps, ...) is keyed by its string form.
    return str(key)


def json_safe(x):
    """Recursively convert *x* into a structure that ``json.dumps`` can serialize.

    Conversions:
      * NumPy integer / floating / bool scalars -> ``int`` / ``float`` / ``bool``
      * ``np.ndarray`` -> nested list (via ``tolist()``)
      * ``pd.Timestamp`` / ``datetime`` -> ``str(x)``
      * ``dict`` -> dict with sanitized keys and values
      * ``list`` / ``tuple`` -> list with sanitized elements
      * ``None``, ``str``, ``int``, ``float``, ``bool`` -> returned unchanged
      * anything else -> ``str(x)``

    Dict keys are sanitized as well as values: a NumPy-scalar or tuple key
    would otherwise make ``json.dumps`` raise ``TypeError``.
    """
    if isinstance(x, (np.integer,)):
        return int(x)
    if isinstance(x, (np.floating,)):
        return float(x)
    if isinstance(x, (np.bool_,)):
        return bool(x)
    if isinstance(x, np.ndarray):
        return x.tolist()
    if isinstance(x, (pd.Timestamp, datetime)):
        return str(x)
    if isinstance(x, dict):
        return {_json_safe_key(k): json_safe(v) for k, v in x.items()}
    if isinstance(x, (list, tuple)):
        return [json_safe(v) for v in x]
    if x is None or isinstance(x, (str, int, float, bool)):
        return x
    # Last resort: represent unknown objects by their string form.
    return str(x)

# Dataset-level metadata: inputs, column mappings, the validation results from
# cell 03-06, the row counts from cell 03-07 and the exported file paths.
meta = {
    "run_tag": RUN_TAG,
    "notebook": "03_schema_normalization.ipynb",
    "canonical_schema": ["user_id", "item_id", "timestamp", "signal_type", "value"],
    "inputs": {
        "source_interactions": str(SOURCE_INTERACTIONS),
        "target_events_canonical": str(TARGET_EVENTS_CANON),
    },
    "source": {
        "raw_schema": src_schema.to_dict(orient="records"),
        "mapping": {"user_id": "user_id", "item_id": "item_id(course_id)", "timestamp": "ts", "signal_type": "action", "value": "1.0"},
        "summary_before_dedup": src_sum.to_dict(orient="records")[0],
        # Counted on (user_id, item_id, timestamp); the dedup key also includes signal_type.
        "duplicates": src_dups.to_dict(orient="records")[0],
        "out_of_order": int(src_order_bad.iloc[0]["n_out_of_order"]),
        "future": int(src_future.iloc[0]["n_future"]),
        "n_before": int(src_before),
        "n_after_dedup": int(src_after),
    },
    "target": {
        "raw_schema": tgt_schema.to_dict(orient="records"),
        "mapping": {"user_id": "user_id", "item_id": "item_id(course_id)", "timestamp": "timestamp", "signal_type": "signal_type", "value": "value"},
        "summary_before_dedup": tgt_sum.to_dict(orient="records")[0],
        # Counted on (user_id, item_id, timestamp); the dedup key also includes signal_type.
        "duplicates": tgt_dups.to_dict(orient="records")[0],
        "out_of_order": int(tgt_order_bad.iloc[0]["n_out_of_order"]),
        "future": int(tgt_future.iloc[0]["n_future"]),
        "n_before": int(tgt_before),
        "n_after_dedup": int(tgt_after),
    },
    "exports": {
        "events_source_norm": str(OUT_SRC),
        "events_target_norm": str(OUT_TGT),
        "events_all_norm": str(OUT_ALL),
    },
    "reports_dir": str(RUN_REPORT_DIR),
}

# Convert NumPy / pandas scalars (e.g. the summary timestamps) to plain Python
# values before serializing.
meta = json_safe(meta)

# Written twice: next to the exported data (fixed file name, so it is replaced
# on every run) and into this run's report folder.
save_json(NORM_DIR / "dataset_metadata.json", meta)
save_json(RUN_REPORT_DIR / "dataset_metadata.json", meta)

print("[03-08] Done. Next notebook: 04_session_gap_and_timeline_analysis.ipynb")


[03-02] Wrote JSON: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\data\processed\normalized_events\dataset_metadata.json
[03-02] Wrote JSON: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\reports\03_schema_normalization\20251229_131421\dataset_metadata.json
[03-08] Done. Next notebook: 04_session_gap_and_timeline_analysis.ipynb
