ETL

In [None]:
import json, textwrap, traceback
from io import StringIO
from pathlib import Path
import pathlib
from unidecode import unidecode

import os
from dotenv import load_dotenv

import numpy as np
import pandas as pd
from openai import OpenAI
import math, re

# === ADDED ===
from datetime import datetime
import hashlib
# === /ADDED ===

# ───────── Config ─────────
# Target directory for the cleaned CSV and the invalid-row reports.
SAVE_DIR = Path("/Users/jakob/ba_etl/adaptive/cleaned")     
# Raw input file; its file name also determines the dataset tag / ID column.
DATA_PATH   = "/Users/jakob/ba_etl/adaptive/data/raw/imdb_data.csv"
# Chat-completion model used for both the transformation and the validator.
MODEL_NAME  = "o4-mini"
# Number of rows sampled into the prompt context, and the seed for sampling.
SAMPLE_SIZE = 5
RNG_STATE   = 42

# Load variables from a .env file (if any) before reading the API key.
load_dotenv()                     
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# ───────── Artifact folder and helpers ─────────
# Root folder for saved prompts, generated code, logs and manifests.
ARTIFACT_DIR = Path("/Users/jakob/ba_etl/adaptive/run_artifacts")

# === ADDED ===
# Timestamp fixed once at import time; identifies the per-run artifact folder.
RUN_ID = datetime.now().strftime("%Y%m%d_%H%M%S")

def run_dir_for(dataset_path: str) -> Path:
    """Return the artifact folder of the current run, creating it if needed.

    The folder is ``ARTIFACT_DIR / "<stem>_<RUN_ID>"`` where ``<stem>`` is the
    file name of *dataset_path* without its extension.
    """
    run_folder = ARTIFACT_DIR / f"{Path(dataset_path).stem}_{RUN_ID}"
    run_folder.mkdir(parents=True, exist_ok=True)
    return run_folder

def append_log(msg: str, dataset_path: str, fname: str = "run.log"):
    """Append *msg* as one entry to the log file inside the run folder.

    Trailing whitespace of *msg* is removed and a single newline is added.
    """
    log_file = run_dir_for(dataset_path) / fname
    with log_file.open("a", encoding="utf-8") as handle:
        handle.write(f"{msg.rstrip()}\n")

def save_artifact_copy(text: str, stage: str, kind: str,
                       dataset_path: str, attempt: int = 0, add_hash: bool = False) -> str:
    """
    Write an *additional* copy of an artifact into the per-run folder.

    stage    : 'transform' | 'validator'
    kind     : 'prompt' (saved as .txt) | anything else, e.g. 'code' (saved as .py)
    attempt  : truthy values add the '_retry' suffix to the file name
    add_hash : when True, the first 12 hex digits of the SHA-256 of *text*
               are appended to the file name

    Returns the path of the written file as a string.
    """
    folder = run_dir_for(dataset_path)

    name_parts = [f"{stage}_{kind}"]
    if attempt:
        name_parts.append("_retry")
    if add_hash:
        sha = hashlib.sha256(text.encode("utf-8")).hexdigest()
        name_parts.append(f"_{sha[:12]}")

    extension = "py" if kind != "prompt" else "txt"
    destination = folder / f"{''.join(name_parts)}.{extension}"
    destination.write_text(text, encoding="utf-8")
    return str(destination)

def write_manifest(dataset_path: str, meta: dict) -> str:
    """Serialise *meta* as indented JSON to ``manifest.json`` in the run folder.

    Returns the path of the manifest file as a string.
    """
    manifest_file = run_dir_for(dataset_path) / "manifest.json"
    payload = json.dumps(meta, indent=2)
    manifest_file.write_text(payload, encoding="utf-8")
    return str(manifest_file)
# === /ADDED ===

# ───────── Load schema ─────────
# The schema file is resolved relative to the current working directory.
SCHEMA_PATH = Path("schema.json")
with SCHEMA_PATH.open(encoding="utf-8") as _schema_file:
    SCHEMA_SPEC = json.load(_schema_file)

# ───────── Helper ─────────
def schema_as_text(spec):
    """Render the schema as one bullet line per field.

    Each entry of *spec* must provide the keys ``field``, ``type`` and
    ``rule``; the lines are joined with newlines.
    """
    rendered = []
    for entry in spec:
        rendered.append(
            f"- {entry['field']}: (type: {entry['type']}) {entry['rule']}"
        )
    return "\n".join(rendered)

def infer_dataset_tag(path: str) -> str:
    """Derive the dataset tag (used for the ID column name) from a file name.

    Only the final path component is inspected, case-insensitively. The
    checks run in a fixed order and the first hit wins:
    IMDB, MOVIELENS, METACRITIC, RT; anything else yields "DATA".

    "rt" is only accepted as a standalone token (delimited by the start/end
    of the name or by a non-alphanumeric character such as "_" or "."), so
    names that merely contain the letters - "export.csv", "cartoons.csv" -
    are no longer classified as Rotten Tomatoes.
    """
    name = Path(path).name.lower()
    for needle, tag in (
        ("imdb", "IMDB"),
        ("movielens", "MOVIELENS"),
        ("metacritic", "METACRITIC"),
        ("rotten", "RT"),
    ):
        if needle in name:
            return tag
    if re.search(r"(?<![a-z0-9])rt(?![a-z0-9])", name):
        return "RT"
    return "DATA"

# Text form of the schema, embedded into the transformation prompt.
SCHEMA_TEXT = schema_as_text(SCHEMA_SPEC)

# Dataset tag derived from the input file name, and the matching ID column.
DATASET_TAG = infer_dataset_tag(DATA_PATH)
ID_COL = "ID_" + DATASET_TAG

def load_df(path=DATA_PATH):
    """Read the CSV at *path* into a DataFrame, skipping malformed lines."""
    frame = pd.read_csv(path, on_bad_lines="skip")
    return frame

def make_context_and_meta(df):
    """Build the two text blocks that describe *df* inside the prompt.

    Returns a tuple ``(ctx, meta)``:
      ctx  - a reproducible random sample of rows rendered with ``to_string``
      meta - the output of ``df.info()`` captured as text

    The sample size is capped at ``len(df)``: sampling more rows than exist
    (without replacement) raises ``ValueError`` in pandas, which would abort
    the run for very small input files.
    """
    n_rows = min(SAMPLE_SIZE, len(df))
    ctx = df.sample(n_rows, random_state=RNG_STATE).to_string(index=False)

    with StringIO() as buf:
        df.info(buf=buf)
        meta = buf.getvalue()
    return ctx, meta

def get_completion(prompt: str):
    """Send *prompt* as a single user message and return the reply text.

    A new client is created on every call; the content of the first choice
    of the chat completion is returned.
    """
    client = OpenAI(api_key=OPENAI_API_KEY)
    user_message = {"role": "user", "content": prompt}
    response = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[user_message],
    )
    first_choice = response.choices[0]
    return first_choice.message.content

def exec_generated_code(code: str, g: dict):
    """Execute the last ``` fenced block of *code* in the namespace *g*.

    Block selection:
      * no fence at all          -> the whole text is executed
      * last fence is closed     -> the text between the last fence pair
      * last fence is not closed -> everything after the last fence
        (e.g. a truncated reply)

    Only a language tag directly after the opening fence is removed.
    Previously the first occurrence of the word "python" was deleted
    wherever it appeared, which corrupted code containing that word when
    the fence carried no tag (or when there was no fence at all).

    SECURITY: this runs model-generated code with ``exec`` and no sandbox.
    Only use it in an environment where that is acceptable.
    """
    if "```" in code:
        parts = code.split("```")
        # An even number of parts means an odd number of fences, i.e. the
        # last block was opened but never closed.
        code = parts[-1] if len(parts) % 2 == 0 else parts[-2]
        # Drop the info string: either a first line that holds only a tag,
        # or a leading "python" that is followed by code on the same line.
        code = re.sub(
            r"\A[ \t]*(?:[\w+-]*[ \t]*\n|python3?\b[ \t]*)",
            "",
            code,
            count=1,
            flags=re.IGNORECASE,
        )
    clean = code.strip()
    exec(clean, g)

def error_snippet(exc, max_lines=20, max_chars=1500):
    """Return a compact text rendering of *exc* for inclusion in a retry prompt.

    The formatted traceback is reduced to its first 5 and last *max_lines*
    chunks. Short tracebacks are used as-is; previously the head and tail
    slices overlapped, so the same chunks appeared twice in the snippet.

    For an ``AssertionError`` whose message mentions ``invalid_entries``,
    the message (cut to 500 characters) is placed in front of the traceback
    so that the examples it carries are not lost by the final truncation.

    The result is cut to *max_chars* characters, followed by " …" when
    something was cut.
    """
    head_count = 5
    tb = traceback.TracebackException.from_exception(exc, capture_locals=False)
    chunks = list(tb.format())

    if len(chunks) <= head_count + max_lines:
        selected = chunks
    else:
        tail = chunks[-max_lines:] if max_lines > 0 else []
        selected = chunks[:head_count] + tail
    txt = "".join(selected).strip()

    # AssertionError: put the message with its examples first
    if isinstance(exc, AssertionError) and "invalid_entries" in str(exc):
        msg = str(exc)
        truncated = msg[:500] + (" …" if len(msg) > 500 else "")
        txt = truncated + "\n\n" + txt

    return (txt[:max_chars] + " …") if len(txt) > max_chars else txt

def extract_code_block(md: str) -> str:
    """Return the code of the last ``` fenced block in *md*, without fence or tag.

    Block selection:
      * no fence at all          -> the whole text
      * last fence is closed     -> the text between the last fence pair
      * last fence is not closed -> everything after the last fence

    Only a language tag directly after the opening fence is removed.
    Previously the first occurrence of the word "python" was deleted
    wherever it appeared, which altered code containing that word when the
    fence carried no tag (or when there was no fence at all).
    """
    if "```" not in md:
        return md.strip()

    parts = md.split("```")
    # An even number of parts means the last block was never closed.
    block = parts[-1] if len(parts) % 2 == 0 else parts[-2]
    # Drop the info string: either a first line that holds only a tag, or a
    # leading "python" that is followed by code on the same line.
    block = re.sub(
        r"\A[ \t]*(?:[\w+-]*[ \t]*\n|python3?\b[ \t]*)",
        "",
        block,
        count=1,
        flags=re.IGNORECASE,
    )
    return block.strip()

def save_artifact(text: str, stage: str, kind: str,
                  dataset_path: str, attempt: int = 0):
    """
    Store a prompt or generated code under ``ARTIFACT_DIR/<dataset stem>/``.

    stage  : 'transform' | 'validator'
    kind   : 'prompt' (saved as .txt) | anything else, e.g. 'code' (saved as .py)
    attempt: 0 = first attempt, truthy = retry (adds the '_retry' suffix)

    An existing file with the same name is overwritten.
    """
    folder = ARTIFACT_DIR / Path(dataset_path).stem
    folder.mkdir(parents=True, exist_ok=True)

    extension = "txt" if kind == "prompt" else "py"
    retry_tag = "_retry" if attempt else ""
    destination = folder / f"{stage}_{kind}{retry_tag}.{extension}"
    destination.write_text(text, encoding="utf-8")

def completeness_report(df0, out, v_llm, v_det, inv_det, savedir):
    """Print the row-count summary and persist it together with invalid rows.

    df0     : frame as loaded (row count before the transformation)
    out     : rows produced by the transformation
    v_llm   : rows accepted by the LLM validator
    v_det   : accepted for interface compatibility; not used by this function
    inv_det : list of invalid records; when non-empty, its first rows are
              printed and the full list is written to
              ``<savedir>/<dataset stem>_invalid_records.csv``
    savedir : target directory for that CSV; it is created when missing
              (previously a missing directory made ``to_csv`` fail)

    The summary is additionally written to ``completeness.txt`` in the run
    folder. It is built once and used for both the console and the file.
    """
    lines = [
        "— Completeness Report —",
        f"Original rows:            {len(df0):>6}",
        f"After transformation:     {len(out):>6}   (-{len(df0)-len(out)})",
        f"LLM validator valid:      {len(v_llm):>6}   (-{len(out)-len(v_llm)})",
    ]
    print("\n" + "\n".join(lines))

    if inv_det:
        bad = pd.DataFrame(inv_det)
        print("\nBeispiele ungültiger Datensätze (dynamic):")
        print(bad.head().to_string(index=False))
        target_dir = Path(savedir)
        target_dir.mkdir(parents=True, exist_ok=True)
        rep = target_dir / f"{Path(DATA_PATH).stem}_invalid_records.csv"
        bad.to_csv(rep, index=False)
        print(f"\n❌  Fehlerliste gespeichert unter: {rep}")

    # Keep a copy of the summary next to the other artifacts of this run.
    (run_dir_for(DATA_PATH) / "completeness.txt").write_text("\n".join(lines), encoding="utf-8")

# ───────── Prompt-Builder ─────────
def build_transformation_prompt(ctx, meta):
    """Assemble the prompt that asks the model for the transformation code.

    ctx  : sample rows of the dataset as text
    meta : ``df.info()`` output as text

    The template is dedented *before* the values are inserted. Dedenting
    the finished text has no effect as soon as an inserted multi-line value
    (schema, metadata) contains a line starting at column 0, which left the
    whole prompt indented by the source-code indentation.

    Compared with the earlier template, the two-digit-year instruction is
    listed once instead of twice and the spelling of "fulfil" is corrected.
    """
    template = textwrap.dedent("""
    [ROLE]
    You are an expert data scientist specialised in cleansing and standardising film datasets.
    Try to interpret the schema and the context and the metadata to understand the data.

    [PROCESS]
      (a) [PLAN] Outline your high-level approach.
      (b) [THINK] Write Pandas code step by step to fulfil the {SCHEMA_TEXT}.
      (c) [CHECK] Self-verify logic.
      (d) [ANSWER] return one ```python``` block that:
          • starts with  output = []
          • Select the most complete numeric rating, prefer original ratings, fallback if empty.
          • preserve the column {ID_COL} unchanged in every output row as the ID column
          • interprets two-digit years so that the final four-digit year
            falls in the realistic range 1900-2025
          • genres: list of strings
          • title: normalize exactly: ascii+lower; remove one trailing "(YYYY)" then one trailing "(...)"; punctuation→space; collapse spaces+trim; drop trailing "the"; dedup tokens (keep order).
          • release_year: could also be "streaming_release_year", if necessary extract it
          • builds an additional list  invalid_entries
            – append a dict whenever a row is skipped
              {{ "row": <original_row_as_dict>, "reason": "<short text>" }}


    [SCHEMA]
    {SCHEMA_TEXT}

    [CONTEXT]
    {ctx}

    [METADATA]
    {meta}
    """)
    # Inserted values are not re-parsed by str.format, so braces inside the
    # sample rows or the metadata are safe.
    return template.format(
        SCHEMA_TEXT=SCHEMA_TEXT,
        ID_COL=ID_COL,
        ctx=ctx,
        meta=meta,
    ).strip()

def build_validator_prompt():
    """Assemble the prompt that asks the model for the validation code.

    The schema is embedded as pretty-printed JSON. The template is dedented
    *before* the JSON is inserted: the JSON contains lines starting at
    column 0 (e.g. the closing bracket), so dedenting the finished text had
    no effect and the prompt kept the source-code indentation.
    """
    schema_json = json.dumps(SCHEMA_SPEC, indent=2, ensure_ascii=False)
    template = textwrap.dedent("""
    [ROLE]
    Validation stage of the ETL pipeline.

    [SCHEMA_JSON]
    {schema_json}

    [PROCESS]
      (a) [PLAN] outline checks
      (b) [THINK] derive assertions
      (c) [CHECK] ensure logic
      (d) [ANSWER] one ```python``` block that
          • iterates **over `output` (list of dicts)**
          • builds valid_output / invalid_entries / duplicate_count
          • raises AssertionError if invalid_entries is non-empty.

    [FORMAT] Use [PLAN] [THINK] [CHECK] [ANSWER].
    [OUTPUT] show only code inside the block.
    """)
    return template.format(schema_json=schema_json).strip()

# ───────── Main ─────────
def _persist_artifact(manifest_paths, text, stage, kind, attempt=0):
    """Save *text* in both artifact locations and record the run-folder path."""
    save_artifact(text, stage, kind, DATA_PATH, attempt)
    # Code copies carry a content hash in their name, prompts do not.
    path = save_artifact_copy(text, stage, kind, DATA_PATH, attempt,
                              add_hash=(kind == "code"))
    manifest_paths[stage][kind].append(path)


def _generate_code(prompt, stage, attempt, manifest_paths, title):
    """Persist *prompt*, query the model, persist and print the reply; return the reply."""
    _persist_artifact(manifest_paths, prompt, stage, "prompt", attempt)
    reply = get_completion(prompt)
    _persist_artifact(manifest_paths, extract_code_block(reply), stage, "code", attempt)
    print(f"\n— {title} —\n", reply)
    return reply


def main():
    """Run the adaptive ETL: transform, validate, report and persist.

    Steps:
      1. Load the dataset, add the running ID column and build the prompt
         context.
      2. Let the model generate transformation code and execute it; on
         failure retry once with the error snippet appended to the prompt.
      3. Persist rows the transformation reported as skipped.
      4. Let the model generate validator code and execute it; on failure
         regenerate and rerun both the transformation and the validator
         once. A second failure is fatal.
      5. Write the completeness report, the cleaned CSV and the manifest.

    Generated code runs in this module's global namespace and communicates
    through the globals ``output``, ``valid_output``, ``invalid_entries``
    and ``duplicate_count``.

    Any exception is printed and logged as [FATAL]; it is not re-raised.

    Changes compared with the earlier version:
      * leftovers of those globals from a previous run in the same
        interpreter are removed up front, so they cannot be mistaken for
        results of this run;
      * the invalid-records report goes to SAVE_DIR like every other output
        instead of the relative folder "cleaned";
      * a validator that defines no ``valid_output`` yields an explicit
        RuntimeError instead of a NameError.
    """
    try:
        print("Running with pandas", pd.__version__)
        append_log(f"RUN {RUN_ID} started with pandas {pd.__version__}", DATA_PATH)

        # Remove results of an earlier run (relevant in notebooks).
        for var in ("output", "valid_output", "invalid_entries", "duplicate_count"):
            globals().pop(var, None)

        df = load_df(); globals()["df"] = df
        df[ID_COL] = range(1, len(df)+1)   # 1..N per loaded file
        globals()["ID_COL"] = ID_COL
        ctx, meta = make_context_and_meta(df)

        # Paths of all artifacts written to the run folder, for the manifest.
        manifest_paths = {
            "transform": {"prompt": [], "code": []},
            "validator": {"prompt": [], "code": []},
        }

        # 1) Transformation --------------------------------------------------
        snippet = ""
        for attempt in range(2):          # 0 = first attempt, 1 = retry
            try:
                prompt = build_transformation_prompt(ctx, meta)
                if attempt == 1:          # the retry carries the error snippet
                    prompt += f"\n[TRANSFORMATION_ERROR]\n{snippet}"

                t_code = _generate_code(prompt, "transform", attempt,
                                        manifest_paths, "Transformation Code")

                exec_generated_code(t_code, globals())
                if "output" not in globals():
                    raise RuntimeError("LLM produced no `output`")

                append_log(f"[TRANSFORM][attempt={attempt}] OK, rows={len(globals()['output'])}", DATA_PATH)
                break                     # success → leave the loop

            except Exception as exc:
                snippet = error_snippet(exc)
                print("⚠️  transformation crashed – retrying …\n", snippet)
                append_log(f"[TRANSFORM][attempt={attempt}] ERROR\n{traceback.format_exc()}", DATA_PATH)
                globals().pop("output", None)     # clean up
                if attempt == 1:                  # second failure → give up
                    raise

        # -------- persist rows dropped by the transformation ----------------
        trans_invalid = globals().get("invalid_entries", [])
        trans_invalid_count = 0
        if trans_invalid:
            inv_df = pd.DataFrame(trans_invalid)
            SAVE_DIR.mkdir(parents=True, exist_ok=True)
            tr_inv_path = SAVE_DIR / f"{Path(DATA_PATH).stem}_invalid.csv"
            inv_df.to_csv(tr_inv_path, index=False)
            trans_invalid_count = len(inv_df)
            print(f"\n❌  Transform dropped {trans_invalid_count} rows  →  {tr_inv_path}")
            append_log(f"[TRANSFORM] dropped {trans_invalid_count} rows → {tr_inv_path}", DATA_PATH)
        # Remove the list so the validator's own invalid_entries is not
        # confused with the transformation's.
        globals().pop("invalid_entries", None)

        # 2) LLM validator ---------------------------------------------------
        # The model call happens outside the try block: only failures of the
        # generated validator code trigger the retry below.
        v_code = _generate_code(build_validator_prompt(), "validator", 0,
                                manifest_paths, "Validator Code")

        try:
            exec_generated_code(v_code, globals())      # sets valid_output, invalid_entries …
            append_log(f"[VALIDATOR] OK, valid_output={len(globals().get('valid_output', []))}", DATA_PATH)
        except Exception as exc:
            # ─ Retry --------------------------------------------------------
            snippet = error_snippet(exc)
            print("⚠️  validator crashed – retrying …\n", snippet)
            append_log(f"[VALIDATOR] ERROR\n{traceback.format_exc()}", DATA_PATH)

            for var in ("output", "valid_output", "invalid_entries", "duplicate_count"):
                globals().pop(var, None)

            retry_prompt = (build_transformation_prompt(ctx, meta)
                            + f"\n[VALIDATION_ERROR]\n{snippet}")
            t_code = _generate_code(retry_prompt, "transform", 1,
                                    manifest_paths, "Retry Transformation Code")

            exec_generated_code(t_code, globals())
            if "output" not in globals():
                raise RuntimeError("Retry produced no `output`")

            v_code = _generate_code(build_validator_prompt(), "validator", 1,
                                    manifest_paths, "Retry Validator Code")

            exec_generated_code(v_code, globals())
            append_log(f"[VALIDATOR][retry] OK, valid_output={len(globals().get('valid_output', []))}", DATA_PATH)

        # Persist the validator's list of invalid rows.
        val_invalid = globals().get("invalid_entries", [])
        val_invalid_count = 0
        if val_invalid:
            SAVE_DIR.mkdir(parents=True, exist_ok=True)
            val_inv_path = SAVE_DIR / f"{Path(DATA_PATH).stem}_invalid_validator.csv"
            pd.DataFrame(val_invalid).to_csv(val_inv_path, index=False)
            val_invalid_count = len(val_invalid)
            print(f"❌  Validator flagged {val_invalid_count} rows  →  {val_inv_path}")
            append_log(f"[VALIDATOR] invalid rows: {val_invalid_count} → {val_inv_path}", DATA_PATH)

        # 3) Report + persist  -------------------------------------------------
        output_rows = globals().get("output")
        valid_rows = globals().get("valid_output")
        if output_rows is None:
            raise RuntimeError("No `output` available after validation")
        if valid_rows is None:
            raise RuntimeError("Validator produced no `valid_output`")

        completeness_report(df, output_rows, valid_rows,
                            valid_rows, val_invalid,
                            SAVE_DIR)

        SAVE_DIR.mkdir(parents=True, exist_ok=True)
        out_path = SAVE_DIR / Path(DATA_PATH).name
        pd.DataFrame(valid_rows).to_csv(out_path, index=False)
        print(f"\n✅ Bereinigter Datensatz gespeichert unter: {out_path}")
        append_log(f"[OUTPUT] written → {out_path}", DATA_PATH)

        # Write the manifest.
        manifest = {
            "run_id": RUN_ID,
            "dataset": str(DATA_PATH),
            "model": MODEL_NAME,
            "sample_size": SAMPLE_SIZE,
            "rng_state": RNG_STATE,
            "outputs": {
                "cleaned_csv": str(out_path),
                "transform_invalid_count": trans_invalid_count,
                "validator_invalid_count": val_invalid_count
            },
            "artifacts": manifest_paths
        }
        man_path = write_manifest(DATA_PATH, manifest)
        append_log(f"[MANIFEST] {man_path}", DATA_PATH)

    except Exception:
        # Top-level boundary: report and log, but do not propagate.
        traceback.print_exc()
        append_log(f"[FATAL] {traceback.format_exc()}", DATA_PATH)

# Run the pipeline only when executed as a script/cell, not on import.
if __name__ == "__main__":
    main()

Merge

In [None]:
import pandas as pd
from pathlib import Path
import numpy as np
import re, textwrap, datetime as dt

try:
    from unidecode import unidecode   # Akzente → ASCII
except ImportError:
    unidecode = lambda s: s

# Input / output locations (relative to the current working directory)
CLEAN_DIR = Path("cleaned")
MERGE_OUT = Path("merged/all_movies_wide_fuzzy.csv")
DUPL_OUT  = Path("merged/all_movies_fuzzy_duplicates.csv")

# Canonical rating column per source (key = file stem of the cleaned CSV)
CANON = {
    "imdb_data":              "rating_imdb",
    "movielens_aggregated":   "rating_movielens",
    "metacritic_movies":      "rating_metacritic",
    "rotten_tomatoes_movies": "rating_rt_audience",
}

# Canonical ID column per source (key = file stem of the cleaned CSV)
ID_MAP = {
    "imdb_data":              "ID_IMDB",
    "movielens_aggregated":   "ID_MOVIELENS",
    "metacritic_movies":      "ID_METACRITIC",
    "rotten_tomatoes_movies": "ID_RT",
}

def standardize_id_column(df: pd.DataFrame, src: str) -> pd.DataFrame:
    """Make sure the ID column of *df* carries the canonical name for *src*.

    The canonical name comes from ``ID_MAP``; unknown sources fall back to
    ``ID_<SRC>``. If that column is missing, a column named ``ID`` - or
    otherwise the first column whose name starts with ``ID_`` (case
    insensitive) - is renamed to it. When the canonical column exists
    afterwards, its values are coerced to nullable integers (unparseable
    values become missing). Without any ID column the frame is returned as is.
    """
    target = ID_MAP.get(src, f"ID_{src.upper()}")

    if target not in df.columns:
        if "ID" in df.columns:
            df = df.rename(columns={"ID": target})
        else:
            fallback = next(
                (col for col in df.columns if str(col).upper().startswith("ID_")),
                None,
            )
            if fallback is not None:
                df = df.rename(columns={fallback: target})

    if target in df.columns:
        numeric_ids = pd.to_numeric(df[target], errors="coerce")
        df[target] = numeric_ids.astype("Int64")
    return df

# 1) Collect the cleaned CSVs
frames = []
# Sorted for a deterministic source order: the "first valid value"
# aggregations further down depend on the row order of the combined frame.
for csv in sorted(CLEAN_DIR.glob("*.csv")):
    name = csv.name
    # Skip merge results and every report file. "_invalid" also covers
    # "<stem>_invalid.csv" and "<stem>_invalid_validator.csv", which the ETL
    # stage writes next to the cleaned data and which are not movie sources.
    if (
        name.endswith(("merged.csv", "all_movies_wide.csv", "all_movies_wide_fuzzy.csv"))
        or "_invalid"    in name
        or "_duplicates" in name
    ):
        continue

    src = csv.stem  # imdb_data, movielens_aggregated, …
    df  = pd.read_csv(csv)

    # Unify the year column
    if "year" in df.columns and "release_year" not in df.columns:
        df = df.rename(columns={"year": "release_year"})

    # Rename the rating to the canonical column (if necessary)
    rating_col = CANON.get(src, f"rating_{src}")
    if "rating" in df.columns and rating_col not in df.columns:
        df = df.rename(columns={"rating": rating_col})

    # Standardise the ID column
    df = standardize_id_column(df, src)

    df["source"] = src
    frames.append(df)

if not frames:
    raise RuntimeError(f"Keine geeigneten Clean-CSVs in {CLEAN_DIR}")

# 2) Title normalisation
# `norm_title` is used below but not defined in this cell. If it is not
# provided elsewhere in the session, fall back to an implementation of the
# title rule stated in the ETL transformation prompt.
# NOTE(review): confirm this matches the helper originally intended here.
try:
    norm_title
except NameError:
    def norm_title(title):
        """Return the normalised merge key for *title* (NaN for non-strings)."""
        if not isinstance(title, str):
            return np.nan
        text = unidecode(title).lower().strip()
        text = re.sub(r"\s*\(\d{4}\)\s*$", "", text, count=1)    # trailing "(YYYY)"
        text = re.sub(r"\s*\([^()]*\)\s*$", "", text, count=1)   # trailing "(...)"
        tokens = re.sub(r"[^a-z0-9]+", " ", text).split()        # punctuation → space
        if len(tokens) > 1 and tokens[-1] == "the":              # "matrix, the"
            tokens.pop()
        return " ".join(dict.fromkeys(tokens))                   # de-duplicate, keep order

long_df = pd.concat(frames, ignore_index=True)
long_df["norm_title"]   = long_df["title"].map(norm_title)
long_df["release_year"] = pd.to_numeric(long_df["release_year"], errors="coerce").astype("Int64")

# 3) Jahr-Cluster (±1) – wie statisch
def year_cluster(sub: pd.DataFrame) -> pd.Series:
    """Assign a cluster number to every row of *sub* based on ``release_year``.

    The distinct years are walked in ascending order; a year that is at most
    1 away from the previous one stays in the same cluster, otherwise a new
    cluster starts (numbering begins at 1). Rows without a year all receive
    the number following the last cluster.
    """
    distinct_years = sorted(set(sub["release_year"].dropna()))

    mapping = {}
    cid = 0
    previous = None
    for year in distinct_years:
        # Years are ascending, so the previous year is the closest member
        # of the current cluster.
        if previous is None or abs(year - previous) > 1:
            cid += 1
        mapping[year] = cid
        previous = year

    return sub["release_year"].map(mapping).fillna(cid + 1).astype(int)

# Cluster the years per normalised title. Only `release_year` is handed to
# the function: applying over the grouping column as well is deprecated in
# recent pandas versions, and `year_cluster` needs nothing else.
long_df["year_cluster"] = (
    long_df.groupby("norm_title", group_keys=False)[["release_year"]]
           .apply(year_cluster)
)

# 4) Helper für gruppierte Aggregation
def first_valid(s: pd.Series):
    """Return the first non-missing value of *s*, or NaN if there is none."""
    present = s[s.notna()]
    if present.empty:
        return np.nan
    return present.iloc[0]

def first_non_empty(series: pd.Series):
    """Return the first non-empty list or non-blank string in *series*.

    Values of any other type are ignored; NaN is returned when nothing
    qualifies.
    """
    def _has_content(val):
        if isinstance(val, list):
            return bool(val)
        if isinstance(val, str):
            return bool(val.strip())
        return False

    return next((val for val in series if _has_content(val)), np.nan)

# Grouping key of one movie, plus the rating and ID columns found in the data
group_cols = ["norm_title", "year_cluster"]
rating_cols = [c for c in long_df.columns if c.startswith("rating_")]
id_cols     = [c for c in long_df.columns if str(c).startswith("ID_")]

# 5) Ratings per movie (first valid value per column)
if rating_cols:
    ratings_wide = long_df.groupby(group_cols, as_index=False)[rating_cols].agg(first_valid)
else:
    ratings_wide = long_df[group_cols].drop_duplicates()

# IDs per movie (first valid value per column)
if id_cols:
    ids_map = long_df.groupby(group_cols, as_index=False)[id_cols].agg(first_valid)
else:
    ids_map = long_df[group_cols].drop_duplicates()

# 6) Representative title (first after sorting by source and title) and
#    smallest year
meta = (
    long_df.sort_values(["source", "title"])
           .groupby(group_cols, as_index=False)
           .agg(title=("title", "first"), release_year=("release_year", "min"))
)

# 7) Genres: first non-empty entry after sorting by source
genres_map = (
    long_df.sort_values("source")
           .groupby(group_cols)["genres"]
           .apply(first_non_empty)
           .reset_index(name="genres")
)

# 8) Combine everything into the wide frame
wide = (
    ratings_wide.merge(meta, on=group_cols, how="left")
                .merge(genres_map, on=group_cols, how="left")
                .merge(ids_map, on=group_cols, how="left")
)

# 9) Save the unfiltered result
UNFILTERED_OUT = Path("merged/all_movies.csv")
MERGE_OUT.parent.mkdir(parents=True, exist_ok=True)
wide.to_csv(UNFILTERED_OUT, index=False)
print(f"💾 Ungefilterter Wide-Frame gespeichert: {UNFILTERED_OUT}")

# 10) Keep only movies with at least two ratings
if rating_cols:
    present_rating_mask = wide[rating_cols].notna().sum(axis=1)
else:
    present_rating_mask = pd.Series(0, index=wide.index)
wide_filtered = wide.loc[present_rating_mask >= 2].copy()

# 11) Final column order (IDs first); columns that do not exist are skipped
ordered_cols = [
    c
    for c in id_cols + ["title", "release_year", "genres"] + rating_cols
    if c in wide_filtered.columns
]
wide_filtered = wide_filtered[ordered_cols]

# 12) Fuzzy duplicates: every row sharing (title, release_year) with another
dup_mask   = wide_filtered.duplicated(subset=["title", "release_year"], keep=False)
uniques    = wide_filtered[~dup_mask].copy()
duplicates = wide_filtered[dup_mask].copy()

# 13) Save the final results
uniques.to_csv(MERGE_OUT, index=False)
duplicates.to_csv(DUPL_OUT, index=False)

print(textwrap.dedent(f"""
  — Fuzzy-Merge abgeschlossen ({dt.date.today()}) —
  Eingelesene Quellen : {len(frames)}
  Long-Records        : {len(long_df)}
  Filme mit ≥2 Votes  : {len(uniques)}
  Fuzzy-Duplikate     : {len(duplicates)}
  👉 Gemergt  : {MERGE_OUT}
  👉 Duplikate: {DUPL_OUT}
""").strip())


Superscore

In [None]:
# ─── Superscore (0-10 Skala, ungewichtet – wie static_pipeline) ──────────
import pandas as pd
import numpy as np
import textwrap, datetime as dt
from pathlib import Path

# Input (result of the merge step), output file and rating threshold
MERGED      = Path("merged/all_movies_wide_fuzzy.csv")
OUT_CSV     = Path("merged/all_movies_superscore_adaptive.csv")
MIN_RATINGS = 2

df = pd.read_csv(MERGED)

# Detect the ID columns ("ID" or "ID_*") and store them as nullable integers;
# they are carried through to the output.
id_cols = [c for c in df.columns if str(c).startswith("ID_") or c == "ID"]
for c in id_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce").astype("Int64")

# Detect the rating columns; without any there is nothing to score.
rating_cols = [c for c in df.columns if c.startswith("rating_")]
if len(rating_cols) == 0:
    raise RuntimeError("Keine rating_*-Spalten gefunden.")

# 1. Quelle → 0-10-Normierung (heuristisch wie in static_pipeline)
def to_0_10(series):
    """Rescale a rating column to a 0-10 scale based on its maximum.

    Heuristic on the largest value in the column:
      max <= 5.5 -> multiplied by 2  (treated as a 0-5 scale)
      max > 10   -> divided by 10    (treated as a 0-100 scale)
      otherwise  -> returned unchanged
    A column without any value is returned unchanged.
    """
    if not series.notna().any():
        return series

    peak = series.max()
    if peak <= 5.5:
        scaled = series * 2
    elif peak > 10:
        scaled = series / 10
    else:
        scaled = series
    return scaled

# Normalised counterpart of every rating column: "rating_x" → "x_norm"
norm_cols = [col.replace("rating_", "") + "_norm" for col in rating_cols]
for col, ncol in zip(rating_cols, norm_cols):
    df[ncol] = to_0_10(df[col])

# 2. Number of normalised ratings available per movie
df["num_available_ratings"] = df[norm_cols].notna().sum(axis=1)

# 3. Superscore, only for movies with at least MIN_RATINGS normalised ratings
mask = df["num_available_ratings"] >= MIN_RATINGS
scored = df.loc[mask, norm_cols]
df.loc[mask, "superscore_mean_0_10"]   = scored.mean(axis=1).round(1)
df.loc[mask, "superscore_median_0_10"] = scored.median(axis=1).round(1)

# 4. Output with the ID columns first; columns that do not exist are skipped
base_cols = [c for c in ["title", "release_year", "genres"] if c in df.columns]
tail_cols = ["num_available_ratings", "superscore_mean_0_10", "superscore_median_0_10"]
cols_out  = [
    c
    for c in id_cols + base_cols + rating_cols + norm_cols + tail_cols
    if c in df.columns
]

df[cols_out].to_csv(OUT_CSV, index=False)

print(textwrap.dedent(f"""
  — Superscore-Static abgeschlossen ({dt.date.today()}) —
  Rating-Spalten erkannt : {rating_cols}
  Normierte Spalten      : {norm_cols}
  Filme mit ≥{MIN_RATINGS} Ratings : {int(mask.sum())} / {len(df)}
  ID-Spalten im Output   : {id_cols}
  👉 Ergebnis : {OUT_CSV}
""").strip())