In [1]:
# === Cell M1: Train metadata-only models using labels from all_signals_combined.csv ===
import pandas as pd, numpy as np, re, ast, joblib, warnings
from pathlib import Path

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, f1_score, mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import SGDRegressor

warnings.filterwarnings("ignore")

# ---- Input locations (edit these if the CSVs live somewhere else) ----
VIDEOS_PATH = Path("/kaggle/input/datathon-loreal/videos.csv")
ALL_SIG_PATH = Path("/kaggle/input/fusion-engine/all_signals_combined.csv")

# Stop immediately, before any parsing work, if either input file is absent.
for _required in (VIDEOS_PATH, ALL_SIG_PATH):
    assert _required.exists(), f"Missing {_required}"

# ---- Read both tables (videos first, then signals) ----
vid, sig = (pd.read_csv(_csv) for _csv in (VIDEOS_PATH, ALL_SIG_PATH))
print("Loaded:", vid.shape, sig.shape)

# ---- Minimal columns & parsing helpers ----
def parse_iso_duration_to_seconds(s):
    """Convert an ISO-8601 duration string to a number of seconds.

    Handles the ``PT#H#M#S`` form and, additionally, an optional day
    component (``P#DT#H#M#S``, e.g. ``"P1DT2H"``). Strings that are not ISO
    durations are tried as plain numbers (e.g. ``"30"``). Leading and
    trailing whitespace is ignored.

    Args:
        s: Raw duration value. Anything that is not a ``str`` is treated as
            missing.

    Returns:
        Total seconds (``int`` for ISO durations, ``float`` for plain numeric
        strings), or ``np.nan`` when the value cannot be interpreted.
    """
    if not isinstance(s, str):
        return np.nan
    text = s.strip()
    # `(?!$)` rejects a lone "P"; "PT" with no components still yields 0.
    m = re.match(r"^P(?!$)(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?$", text)
    if not m:
        # Allow plain seconds (e.g., "30"); float() on a str only raises ValueError.
        try:
            return float(text)
        except ValueError:
            return np.nan
    days, hours, minutes, seconds = (int(g) if g else 0 for g in m.groups())
    return days * 86400 + hours * 3600 + minutes * 60 + seconds

def tags_to_text(x):
    """Flatten a tags cell into a single space-separated string.

    A string that looks like a list literal (starts with ``[`` and ends with
    ``]``) is parsed with ``ast.literal_eval``; if that yields a list, its
    items are stringified and joined with spaces. Any other string is
    returned as-is, including a bracketed one that fails to parse.
    Non-string input (e.g. NaN) becomes ``""``.
    """
    if not isinstance(x, str):
        return ""
    looks_like_list = x.startswith("[") and x.endswith("]")
    if looks_like_list:
        # Best effort: a malformed literal falls through to the raw string.
        try:
            parsed = ast.literal_eval(x)
            if isinstance(parsed, list):
                return " ".join(str(item) for item in parsed)
        except Exception:
            pass
    return str(x)

# Make sure every column referenced below is present; a missing one is added as all-NaN.
for required_col in ["videoId","title","description","tags","contentDuration","viewCount","likeCount","commentCount"]:
    if required_col not in vid.columns:
        vid[required_col] = np.nan

# Derived fields: duration in seconds and tags flattened to plain text.
vid["duration_seconds"] = vid["contentDuration"].apply(parse_iso_duration_to_seconds)
vid["tags_text"] = vid["tags"].apply(tags_to_text)

# Replace missing text with "" so concatenation and length features never see NaN.
for text_col in ["title","description","tags_text"]:
    vid[text_col] = vid[text_col].fillna("")

# One combined document per video, then the character length of each part.
combined_text = vid["title"] + " " + vid["description"] + " " + vid["tags_text"]
vid["text_all"] = combined_text.str.strip()
for source_col, length_col in [("title", "title_len"), ("description", "desc_len"), ("tags_text", "tags_len")]:
    vid[length_col] = vid[source_col].str.len()

# Feature table: identifier, combined text and the numeric metadata columns.
meta_cols = ["videoId","text_all","duration_seconds","viewCount","likeCount","commentCount","title_len","desc_len","tags_len"]
meta = vid.loc[:, meta_cols].copy()

# ---- Choose which signal columns the labels are derived from ----
# Labels come from `sig` only; the feature table `meta` is built from `vid` alone.
# Emerging label source: composite_score when present, otherwise predicted_future_score.
if "composite_score" in sig.columns:
    source_for_emerging = "composite_score"
else:
    source_for_emerging = "predicted_future_score"
assert source_for_emerging in sig.columns, "Need composite_score or predicted_future_score in all_signals."

# Columns feeding the stage heuristic, matched case-insensitively by name:
# momentum = names containing "hawkes" or "tbi_", decay = names containing "decay".
_momentum_pattern = re.compile(r'hawkes|tbi_', re.IGNORECASE)
_decay_pattern = re.compile(r'decay', re.IGNORECASE)
mom_cols = [name for name in sig.columns if _momentum_pattern.search(name)]
dec_cols = [name for name in sig.columns if _decay_pattern.search(name)]

def stage_from_signals_row(row):
    """Classify one signals row as "Rising", "Decaying" or "Peaking".

    The NaN-ignoring mean of the momentum columns (``mom_cols``) is compared
    with that of the decay columns (``dec_cols``). Whichever exceeds the other
    by more than 10% decides the stage; otherwise, or when either mean is
    NaN (including when a column group is empty), the row is "Peaking".
    """
    known_cols = sig.columns

    def _nan_mean(columns):
        # Mean over the listed columns present in `sig`, NaN if none apply.
        picked = [row[name] for name in columns if name in known_cols]
        return np.nanmean(picked) if picked else np.nan

    mom = _nan_mean(mom_cols)
    dec = _nan_mean(dec_cols)

    if not (np.isnan(mom) or np.isnan(dec)):
        if mom > dec * 1.10:
            return "Rising"
        if dec > mom * 1.10:
            return "Decaying"
    return "Peaking"

# ---- Build the label frame: exactly one row per row of `sig`, same order ----
# Every label below is assigned by row position instead of being merged or
# reindexed on videoId. A repeated videoId in `sig` would otherwise multiply
# rows in a merge and make `reindex` raise on the duplicated index.
lab = sig[["videoId", source_for_emerging]].copy()

# Emerging = score at or above the 80th percentile. NaN scores are ignored when
# computing the cut-off and compare False against it, so they get label 0.
emerging_cutoff = np.nanpercentile(lab[source_for_emerging], 80)
lab["label_emerging"] = (lab[source_for_emerging] >= emerging_cutoff).astype(int)

if len(mom_cols) or len(dec_cols):
    # dict.fromkeys drops columns matched by both patterns while keeping a
    # deterministic order (a set would not).
    stage_cols = list(dict.fromkeys(mom_cols + dec_cols))
    tmp = sig[["videoId"] + stage_cols].copy()
    tmp["label_stage"] = tmp.apply(stage_from_signals_row, axis=1)
    lab["label_stage"] = tmp["label_stage"].values
else:
    # No momentum/decay signals available: every row gets the neutral stage.
    lab["label_stage"] = "Peaking"

# Volume target = predicted_future_score if present, else composite_score
vol_target = "predicted_future_score" if "predicted_future_score" in sig.columns else "composite_score"
lab["target_volume_reg"] = sig[vol_target].values

# ---- Attach labels to the metadata features ----
# The inner join keeps only videos present in both tables; rows lacking the
# combined text or a parsed duration are then discarded.
df = meta.merge(lab, on="videoId", how="inner")
df = df.dropna(subset=["text_all","duration_seconds"])
print("Trainable rows:", df.shape)

# ---- Hold out 20% for evaluation (random split, ignores time order) ----
# Stratified on the Emerging label only; the seed is fixed for repeatability.
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(
    df,
    test_size=0.20,
    random_state=42,
    stratify=df["label_emerging"],
)

# Feature columns consumed by the preprocessor.
TEXT_COL = "text_all"
NUM_COLS = ["duration_seconds","viewCount","likeCount","commentCount","title_len","desc_len","tags_len"]

# Preprocessor: TF-IDF on text + scaled numeric
def _make_preprocessor():
    """Return a new, unfitted preprocessor for a single pipeline.

    Pipeline fits the step objects it is given in place rather than cloning
    them, so each model needs its own instance. With one shared transformer,
    refitting any pipeline would overwrite the vocabulary and scaling the
    other pipelines' coefficients were trained against.
    """
    return ColumnTransformer([
        # Unigrams + bigrams over the combined text, capped at 12k features.
        ("text", TfidfVectorizer(max_features=12000, ngram_range=(1,2)), TEXT_COL),
        # Median-impute missing numerics, then scale to unit variance without centering.
        ("num",  Pipeline([("imp", SimpleImputer(strategy="median")),
                           ("scl", StandardScaler(with_mean=False))]),
         NUM_COLS)
    ])

# `ct` keeps its original name and is owned by the volume pipeline only.
# That pipeline is the last one fitted in this cell, so `ct` ends up in the
# same fitted state it had when it was shared by all three.
ct = _make_preprocessor()

# Models: LogisticRegression(saga) for Emerging/Stage, SGDRegressor on log1p(target) for Volume
clf_em = Pipeline([
    ("prep", _make_preprocessor()),
    ("clf", LogisticRegression(
        solver="saga", penalty="l2",
        max_iter=5000, class_weight="balanced", n_jobs=-1, random_state=42
    ))
])

# `multi_class="multinomial"` is no longer passed: the parameter is deprecated
# in recent scikit-learn releases, and the default already fits a multinomial
# model for a multi-class target with the saga solver.
clf_st = Pipeline([
    ("prep", _make_preprocessor()),
    ("clf", LogisticRegression(
        solver="saga", penalty="l2",
        max_iter=5000, class_weight="balanced", n_jobs=-1, random_state=42
    ))
])

# The regressor is trained on log1p(y); predictions are mapped back with expm1.
reg_vo = Pipeline([
    ("prep", ct),
    ("reg", TransformedTargetRegressor(
        regressor=SGDRegressor(loss="squared_error", penalty="l2",
                               alpha=1e-4, max_iter=3000, tol=1e-4, random_state=42),
        func=np.log1p, inverse_func=np.expm1
    ))
])

# ---- Fit the three pipelines on the training split ----
feature_cols = [TEXT_COL] + NUM_COLS
X_train = train_df[feature_cols]
X_test = test_df[feature_cols]

clf_em.fit(X_train, train_df["label_emerging"].astype(int))
clf_st.fit(X_train, train_df["label_stage"].astype(str))
reg_vo.fit(X_train, train_df["target_volume_reg"].astype(float))

# ---- Quick hold-out evaluation ----
from sklearn.metrics import accuracy_score

# Emerging: binary F1 on the held-out rows.
pred_em = clf_em.predict(X_test)
print("\n=== Emerging (metadata-only) ===")
emerging_f1 = f1_score(test_df["label_emerging"], pred_em)
print("F1:", round(emerging_f1, 3))

# Stage: per-class precision/recall; classes with no predictions report 0.
pred_st = clf_st.predict(X_test)
print("\n=== Stage (metadata-only) ===")
print(classification_report(test_df["label_stage"], pred_st, zero_division=0))

# Volume: root-mean-squared error on the original target scale.
pred_vo = reg_vo.predict(X_test)
mse = mean_squared_error(test_df["target_volume_reg"], pred_vo)
rmse = float(np.sqrt(mse))
print("\n=== Volume (metadata-only) ===")
print("RMSE:", round(rmse, 3))

# ---- Persist the fitted pipelines (metadata-only) ----
MODEL_DIR = Path("/kaggle/working")
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# One .pkl per model, written in the same order they were trained.
for _filename, _model in (
    ("meta_emerging.pkl", clf_em),
    ("meta_stage.pkl", clf_st),
    ("meta_volume.pkl", reg_vo),
):
    joblib.dump(_model, str(MODEL_DIR / _filename))

print("\nSaved metadata-only models to:", MODEL_DIR)

Loaded: (92759, 15) (39938, 13)
Trainable rows: (39421, 13)

=== Emerging (metadata-only) ===
F1: 0.81

=== Stage (metadata-only) ===
              precision    recall  f1-score   support

    Decaying       0.99      0.95      0.97      7730
     Peaking       0.12      0.37      0.18        68
      Rising       0.09      0.32      0.14        87

    accuracy                           0.93      7885
   macro avg       0.40      0.54      0.43      7885
weighted avg       0.97      0.93      0.95      7885


=== Volume (metadata-only) ===
RMSE: 0.114

Saved metadata-only models to: /kaggle/working
