In [2]:
import numpy as np
import pandas as pd

from scipy.stats import (
    pointbiserialr, spearmanr, mannwhitneyu,
    chi2_contingency, fisher_exact
)
from sklearn.metrics import roc_auc_score
import statsmodels.api as sm

# -------------------------------------------------------------------
# 1. Small utilities
# -------------------------------------------------------------------

def _safe_series(df: pd.DataFrame, name: str) -> bool:
    """Check if column exists in df."""
    return name in df.columns


def _group_topk(s: pd.Series, k: int) -> pd.Series:
    """
    Keep top-k most frequent categories, map the rest to "Other".
    Works on any hashable dtype (string, int, etc.).
    """
    vc = s.value_counts(dropna=False)
    keep = set(vc.head(k).index)
    return s.where(s.isin(keep), other="Other")


# -------------------------------------------------------------------
# 2. Dataset summary helpers
# -------------------------------------------------------------------

def summarize_dataset(df: pd.DataFrame, label_col: str = "label") -> None:
    """
    Print basic info about the dataset: shape, label distribution,
    missingness and some type/cardinality info.
    """
    print("=== DATASET SUMMARY ===")
    print("Shape:", df.shape)

    if label_col in df.columns:
        y = df[label_col]
        print("\nLabel distribution:")
        print(y.value_counts(dropna=False))
        print("Pos rate:", y.mean())
    else:
        print(f"\n[Warning] Label column '{label_col}' not found in df.")

    print("\nTop 30 columns by missingness:")
    miss = df.isna().mean().sort_values(ascending=False)
    print(miss.head(30))

    print("\nColumn types and cardinality (first 40):")
    info_rows = []
    for col in df.columns[:40]:
        ser = df[col]
        info_rows.append({
            "col": col,
            "dtype": str(ser.dtype),
            "n_unique": ser.nunique(dropna=True),
            "missing_frac": ser.isna().mean(),
        })
    info_df = pd.DataFrame(info_rows)
    display(info_df)


# -------------------------------------------------------------------
# 3. Automatic feature detection (numeric / categorical)
# -------------------------------------------------------------------

def auto_detect_features(
    df: pd.DataFrame,
    label_col: str = "label",
    min_non_na_frac: float = 0.3,
    max_cat_cardinality: int = 50,
    verbose: bool = True,
):
    """
    Automatically propose numeric and categorical features based on:
      - dtype
      - fraction of non-missing values
      - cardinality for categoricals

    Returns:
      numeric_feats, categorical_feats
    """
    if label_col not in df.columns:
        raise ValueError(f"Label column '{label_col}' not in df.")

    numeric_feats = []
    categorical_feats = []

    n = len(df)
    for col in df.columns:
        if col == label_col:
            continue

        ser = df[col]
        non_na_frac = 1.0 - ser.isna().mean()
        if non_na_frac < min_non_na_frac:
            # too sparse
            continue

        # Identify numeric/light bool columns
        if pd.api.types.is_numeric_dtype(ser) or pd.api.types.is_bool_dtype(ser):
            numeric_feats.append(col)
        else:
            # treat as categorical if not too many unique values
            nunique = ser.nunique(dropna=True)
            if 1 < nunique <= max_cat_cardinality:
                categorical_feats.append(col)

    if verbose:
        print("=== AUTO-DETECTED FEATURES ===")
        print(f"Numeric ({len(numeric_feats)}):", numeric_feats[:20],
              "..." if len(numeric_feats) > 20 else "")
        print(f"Categorical ({len(categorical_feats)}):", categorical_feats[:20],
              "..." if len(categorical_feats) > 20 else "")

    return numeric_feats, categorical_feats


# -------------------------------------------------------------------
# 4. Main analysis function (extended version of yours)
# -------------------------------------------------------------------

def analyze_influencer_correlations(
    df: pd.DataFrame,
    label_col: str = "label",
    numeric_feats=None,
    categorical_feats=None,
    topk_map=None,
    auto_topk_default: int = 20,
):
    """
    Run univariate association analysis between each feature and the binary label:
      - For numeric features:
          * Means/std by class
          * Point-biserial r + p
          * Spearman rho + p
          * Mann-Whitney U + p
          * 1-feature logistic coef p-value
          * AUC (using raw feature as score)
      - For categorical features:
          * Chi-square (or Fisher for 2x2)
          * Cramér's V
          * Top levels by positive rate

    If numeric_feats / categorical_feats are None, they are auto-detected.
    topk_map: dict col_name -> K for _group_topk, else auto_topk_default.
    """
    assert label_col in df.columns, f"'{label_col}' not in df"

    # Ensure binary ints
    y = df[label_col].astype(int)

    # Feature lists: auto-detect if not provided
    if numeric_feats is None or categorical_feats is None:
        auto_num, auto_cat = auto_detect_features(
            df, label_col=label_col, verbose=False
        )
        if numeric_feats is None:
            numeric_feats = auto_num
        if categorical_feats is None:
            categorical_feats = auto_cat

    if topk_map is None:
        topk_map = {}

    # ---------- NUMERIC FEATURES ----------
    num_rows = []
    for feat in numeric_feats:
        if not _safe_series(df, feat):
            continue

        x = df[feat]
        # drop missing and non-finite
        mask = x.notna() & np.isfinite(x) & y.notna()
        x = x[mask].astype(float)
        y_ = y[mask].astype(int)

        if len(x) < 30 or y_.nunique() < 2:
            continue

        g0 = x[y_ == 0]
        g1 = x[y_ == 1]
        mean0, mean1 = g0.mean(), g1.mean()
        std0, std1   = g0.std(ddof=1), g1.std(ddof=1)

        # Correlations
        try:
            pb_r, pb_p = pointbiserialr(y_, x)
        except Exception:
            pb_r, pb_p = np.nan, np.nan

        try:
            sp_rho, sp_p = spearmanr(y_, x)
        except Exception:
            sp_rho, sp_p = np.nan, np.nan

        # Mann-Whitney
        try:
            mw_u, mw_p = mannwhitneyu(g0, g1, alternative="two-sided")
        except Exception:
            mw_u, mw_p = np.nan, np.nan

        # 1-feature logistic + AUC
        try:
            X = sm.add_constant(x.values, has_constant="add")
            model = sm.Logit(y_.values, X, missing="drop").fit(disp=False)
            logit_p = model.pvalues[1] if len(model.pvalues) > 1 else np.nan
            auc = roc_auc_score(y_, x)
        except Exception:
            logit_p, auc = np.nan, np.nan

        num_rows.append({
            "feature": feat,
            "n": int(len(x)),
            "mean_0": mean0, "mean_1": mean1,
            "std_0": std0,   "std_1": std1,
            "pointbiserial_r": pb_r, "pointbiserial_p": pb_p,
            "spearman_rho": sp_rho,  "spearman_p": sp_p,
            "mannwhitney_U": mw_u,   "mannwhitney_p": mw_p,
            "logit_p": logit_p,
            "auc_univariate": auc,
        })

    numeric_results = pd.DataFrame(num_rows)
    if len(numeric_results) > 0:
        numeric_results = numeric_results.sort_values(
            ["logit_p", "pointbiserial_p", "mannwhitney_p"],
            na_position="last"
        )

    # ---------- CATEGORICAL FEATURES ----------
    cat_rows = []
    for feat in categorical_feats:
        if not _safe_series(df, feat):
            continue

        s = df[feat]

        # Group high-cardinality
        k = topk_map.get(feat, auto_topk_default)
        s = s.astype(object).fillna("Unknown")
        if s.nunique(dropna=False) > k:
            s = _group_topk(s, k)

        # Drop missing label / feature
        mask = y.notna() & s.notna()
        s = s[mask]
        y_ = y[mask].astype(int)
        if len(s) < 30 or y_.nunique() < 2:
            continue

        ct = pd.crosstab(y_, s)
        if ct.shape[1] < 2:  # only one category after grouping
            continue

        n = ct.values.sum()
        r, c = ct.shape

        if r == 2 and c == 2:
            odds, p_fisher = fisher_exact(ct.values)
            chi2, p_chi2, dof, _ = chi2_contingency(ct, correction=False)
            cramers_v = np.sqrt((chi2 / n) / (min(r - 1, c - 1)))
            p_value = p_fisher
            test_name = "Fisher (2x2) + Chi2"
        else:
            chi2, p_value, dof, _ = chi2_contingency(ct, correction=False)
            cramers_v = np.sqrt((chi2 / n) / (min(r - 1, c - 1)))
            test_name = "Chi-square"

        # Positive rate by category
        pos_rates = (ct.loc[1] / ct.sum(axis=0)).sort_values(ascending=False)

        cat_rows.append({
            "feature": feat,
            "k_levels": c,
            "n": int(n),
            "test": test_name,
            "chi2": chi2,
            "df": dof,
            "p_value": p_value,
            "cramers_v": cramers_v,
            "top_levels_by_rate": pos_rates.head(5).round(3).to_dict(),
        })

    categorical_results = pd.DataFrame(cat_rows)
    if len(categorical_results) > 0:
        categorical_results = categorical_results.sort_values(
            ["p_value"], na_position="last"
        )

    return numeric_results, categorical_results


# -------------------------------------------------------------------
# 5. Convenience wrapper to run everything on a df
# -------------------------------------------------------------------

def full_influencer_feature_analysis(
    df: pd.DataFrame,
    label_col: str = "label",
    numeric_override=None,
    categorical_override=None,
    topk_map=None,
):
    """
    High-level function:
      1. Summarizes dataset.
      2. Auto-detects feature types (unless overridden).
      3. Runs univariate analysis.
      4. Displays top features.

    Returns:
      numeric_results, categorical_results
    """
    summarize_dataset(df, label_col=label_col)

    if numeric_override is not None or categorical_override is not None:
        numeric_feats = numeric_override
        categorical_feats = categorical_override
    else:
        numeric_feats, categorical_feats = auto_detect_features(df, label_col=label_col)

    print("\n=== RUNNING UNIVARIATE ANALYSIS ===")
    num_res, cat_res = analyze_influencer_correlations(
        df,
        label_col=label_col,
        numeric_feats=numeric_feats,
        categorical_feats=categorical_feats,
        topk_map=topk_map,
    )

    print("\n=== NUMERIC FEATURES (sorted by significance) ===")
    display(num_res.head(50))

    print("\n=== CATEGORICAL / BOOLEAN FEATURES (sorted by p-value) ===")
    display(cat_res.head(50))

    return num_res, cat_res

In [7]:
import pandas as pd
import numpy as np
import re, ast

def parse_tweets(path, expect_label=True):
    # Load & flatten
    df = pd.read_json(path, lines=True)
    df = pd.json_normalize(df.to_dict(orient="records"), sep=".")

    # Ensure expected nested columns exist
    for col in [
        "text", "extended_tweet.full_text", "source",
        "entities.hashtags", "entities.user_mentions", "entities.urls",
        "extended_entities.media",
    ]:
        if col not in df.columns:
            df[col] = np.nan

    # Full text (vectorized, avoids apply/axis=1)
    df["full_text"] = df["extended_tweet.full_text"].fillna(df["text"]).fillna("")

    # Engagement (create if missing)
    for col in ["retweet_count", "favorite_count", "reply_count", "quote_count"]:
        if col not in df.columns:
            df[col] = 0

    # Safe length for list-like fields (sometimes lists, sometimes stringified)
    def safe_len(x):
        if isinstance(x, list):
            return len(x)
        if isinstance(x, str):
            try:
                v = ast.literal_eval(x)
                return len(v) if isinstance(v, (list, tuple)) else 1
            except Exception:
                return 0
        return 0

    df["n_hashtags"] = df["entities.hashtags"].apply(safe_len)
    df["n_mentions"] = df["entities.user_mentions"].apply(safe_len)
    df["n_urls"]     = df["entities.urls"].apply(safe_len)

    # Media presence
    df["has_media"] = df["extended_entities.media"].apply(lambda x: safe_len(x) > 0)

    # Source app (extract readable name from HTML anchor)
    def extract_source(x):
        if not isinstance(x, str):
            return "Unknown"
        m = re.search(r'>([^<]+)<', x)
        return m.group(1) if m else x

    df["source_app"] = df["source"].apply(extract_source)

    # User fields (create if missing)
    for col in [
        "user.description", "user.location",
        "user.favourites_count", "user.statuses_count", "user.listed_count",
        "user.default_profile", "user.default_profile_image",
        "user.geo_enabled", "user.protected",
        "user.lang", "user.time_zone",
        "user.created_at",
    ]:
        if col not in df.columns:
            df[col] = np.nan
    df["user.description"] = df["user.description"].fillna("")

    # --- simple structural flags ---
    df["is_reply"] = (
        df["in_reply_to_status_id"].notna() |
        df["in_reply_to_user_id"].notna()
    )

    # approximate retweet flag (no retweeted_status in the schema you pasted)
    df["is_retweet"] = df["text"].fillna("").str.startswith("RT @")

    if "is_quote_status" not in df.columns:
        df["is_quote_status"] = False
    df["is_quote_status"] = df["is_quote_status"].fillna(False)

    if "possibly_sensitive" not in df.columns:
        df["possibly_sensitive"] = False
    df["possibly_sensitive"] = df["possibly_sensitive"].fillna(False)

    # Place / geo
    for col in [
        "place.country_code", "place.place_type", "place.full_name", "place.id"
    ]:
        if col not in df.columns:
            df[col] = np.nan

    df["has_place"] = df["place.id"].notna()

    # quoted-status flags (minimal)
    if "quoted_status_id" not in df.columns:
        df["quoted_status_id"] = np.nan
    df["has_quoted_status"] = df["quoted_status_id"].notna()

    if "quoted_status.extended_entities.media" not in df.columns:
        df["quoted_status.extended_entities.media"] = np.nan
    df["quoted_has_media"] = df["quoted_status.extended_entities.media"].apply(
        lambda x: safe_len(x) > 0
    )

    if "quoted_status.possibly_sensitive" not in df.columns:
        df["quoted_status.possibly_sensitive"] = np.nan
    df["quoted_is_sensitive"] = df["quoted_status.possibly_sensitive"].fillna(False)

    if "quoted_status.user.verified" not in df.columns:
        df["quoted_status.user.verified"] = np.nan
    df["quoted_user_verified"] = df["quoted_status.user.verified"].fillna(False)

    # --- simple text length features ---
    df["tweet_len_chars"] = df["full_text"].fillna("").str.len()
    df["tweet_len_words"] = df["full_text"].fillna("").str.split().str.len()

    # Keep relevant columns (only those that exist)
    # (you can still engineer logs/clips later in make_transformations)
    keep_cols = [
        # IDs / bookkeeping
        "id_str",
        "challenge_id",          # if present
        # text
        "full_text",
        "user.description",
        # counts + structure
        "n_hashtags", "n_mentions", "n_urls", "has_media",
        "tweet_len_chars", "tweet_len_words",
        "retweet_count", "favorite_count", "reply_count", "quote_count",
        # interaction type
        "is_reply", "is_retweet", "is_quote_status",
        "possibly_sensitive",
        # source
        "source_app",
        # user info
        "user.location",
        "user.favourites_count",
        "user.statuses_count",
        "user.listed_count",
        "user.default_profile",
        "user.default_profile_image",
        "user.geo_enabled",
        "user.protected",
        "user.lang",
        "user.time_zone",
        "user.created_at",
        # time / place
        "created_at",
        "timestamp_ms",
        "place.country_code",
        "place.place_type",
        "place.full_name",
        "has_place",
        # quoted status summary
        "has_quoted_status",
        "quoted_has_media",
        "quoted_is_sensitive",
        "quoted_user_verified",
    ]
    existing = [c for c in keep_cols if c in df.columns]
    out = df[existing].copy()

    # Attach label if expected and available
    if expect_label and "label" in df.columns:
        out["label"] = df["label"]
    elif expect_label and "label" not in df.columns:
        print("Warning: 'label' not found in this file; returning features only.")

    missing = sorted(set(keep_cols) - set(existing))
    if missing:
        print("Note: some expected columns not present in raw json:", missing)

    return out

# Usage
train_clean = parse_tweets("../../train.jsonl", expect_label=True)

  df["quoted_is_sensitive"] = df["quoted_status.possibly_sensitive"].fillna(False)
  df["quoted_user_verified"] = df["quoted_status.user.verified"].fillna(False)


In [8]:
# 1) Quick full analysis using auto-detected features
num_res, cat_res = full_influencer_feature_analysis(train_clean, label_col="label")

=== DATASET SUMMARY ===
Shape: (154914, 41)

Label distribution:
label
0    82674
1    72240
Name: count, dtype: int64
Pos rate: 0.46632325031953215

Top 30 columns by missingness:
user.lang                     1.000000
user.time_zone                1.000000
place.full_name               0.981396
place.place_type              0.981396
place.country_code            0.981396
user.location                 0.341635
id_str                        0.000000
user.created_at               0.000000
user.default_profile          0.000000
user.default_profile_image    0.000000
user.geo_enabled              0.000000
user.protected                0.000000
timestamp_ms                  0.000000
created_at                    0.000000
user.statuses_count           0.000000
has_place                     0.000000
has_quoted_status             0.000000
quoted_has_media              0.000000
quoted_is_sensitive           0.000000
quoted_user_verified          0.000000
user.listed_count             0.000000


Unnamed: 0,col,dtype,n_unique,missing_frac
0,id_str,int64,154812,0.0
1,challenge_id,int64,154914,0.0
2,full_text,object,154091,0.0
3,user.description,object,41235,0.0
4,n_hashtags,int64,15,0.0
5,n_mentions,int64,12,0.0
6,n_urls,int64,5,0.0
7,has_media,bool,2,0.0
8,tweet_len_chars,int64,496,0.0
9,tweet_len_words,int64,79,0.0


=== AUTO-DETECTED FEATURES ===
Numeric (27): ['id_str', 'challenge_id', 'n_hashtags', 'n_mentions', 'n_urls', 'has_media', 'tweet_len_chars', 'tweet_len_words', 'retweet_count', 'favorite_count', 'reply_count', 'quote_count', 'is_reply', 'is_retweet', 'is_quote_status', 'user.favourites_count', 'user.statuses_count', 'user.listed_count', 'user.default_profile', 'user.default_profile_image'] ...
Categorical (1): ['possibly_sensitive'] 

=== RUNNING UNIVARIATE ANALYSIS ===


  rpb, prob = pearsonr(x, y)
  sp_rho, sp_p = spearmanr(y_, x)
  rpb, prob = pearsonr(x, y)
  sp_rho, sp_p = spearmanr(y_, x)
  rpb, prob = pearsonr(x, y)
  sp_rho, sp_p = spearmanr(y_, x)
  rpb, prob = pearsonr(x, y)
  sp_rho, sp_p = spearmanr(y_, x)
  rpb, prob = pearsonr(x, y)
  sp_rho, sp_p = spearmanr(y_, x)
  rpb, prob = pearsonr(x, y)
  sp_rho, sp_p = spearmanr(y_, x)



=== NUMERIC FEATURES (sorted by significance) ===


Unnamed: 0,feature,n,mean_0,mean_1,std_0,std_1,pointbiserial_r,pointbiserial_p,spearman_rho,spearman_p,mannwhitney_U,mannwhitney_p,logit_p,auc_univariate
3,n_mentions,154914,0.8612139,0.3971346,1.298493,0.8895418,-0.201323,0.0,-0.227116,0.0,3657434000.0,0.0,0.0,0.387607
12,is_reply,154914,0.4105644,0.2010382,0.4919392,0.4007793,-0.225437,0.0,-0.225437,0.0,3611869000.0,0.0,0.0,0.395237
15,user.favourites_count,154914,10700.28,19938.5,23183.24,38248.25,0.146453,0.0,0.115545,0.0,2586861000.0,0.0,0.0,0.566862
16,user.statuses_count,154914,10693.37,43771.45,29118.89,76408.9,0.28105,0.0,0.45556,0.0,1411770000.0,0.0,0.0,0.763616
18,user.default_profile,154914,0.744309,0.4253599,0.4362516,0.4944009,-0.324203,0.0,-0.324203,0.0,3938626000.0,0.0,0.0,0.340525
20,user.geo_enabled,154914,0.2499214,0.5396041,0.4329699,0.4984325,0.296986,0.0,0.296986,0.0,2121139000.0,0.0,0.0,0.644841
17,user.listed_count,154914,6.79101,132.7866,29.50026,1167.234,0.078584,1.075484e-210,0.615137,0.0,896907400.0,0.0,0.0,0.849824
2,n_hashtags,154914,0.2480224,0.3730482,0.8155625,0.8820414,0.073421,4.1373389999999996e-184,0.111075,0.0,2737765000.0,0.0,6.3959860000000005e-177,0.541595
5,has_media,154914,0.02255848,0.04595792,0.148492,0.2093953,0.064901,3.189894e-144,0.064901,3.189894e-144,2916310000.0,6.34033e-144,2.891025e-138,0.5117
7,tweet_len_words,154914,24.29524,23.34453,14.83649,13.87369,-0.032928,1.963951e-38,-0.025501,1.032606e-23,3074294000.0,1.0493160000000001e-23,2.1897059999999997e-38,0.485247



=== CATEGORICAL / BOOLEAN FEATURES (sorted by p-value) ===


Unnamed: 0,feature,k_levels,n,test,chi2,df,p_value,cramers_v,top_levels_by_rate
0,possibly_sensitive,2,154914,Fisher (2x2) + Chi2,53.178889,1,1.973608e-13,0.018528,"{False: 0.467, 1.0: 0.348}"


In [13]:
import numpy as np
import pandas as pd
import re, ast, json, hashlib
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score


# ============================================================
# 1. Parsing + author_pseudo_id + basic metadata columns
# ============================================================

def parse_tweets_meta(path, expect_label=True):
    """
    Parse the raw train.jsonl and build a flat DataFrame with:
    - author_pseudo_id
    - basic tweet/user fields
    - simple structural flags (is_reply, has_media, etc.)
    """
    path = Path(path)
    df = pd.read_json(path, lines=True)
    df = pd.json_normalize(df.to_dict(orient="records"), sep=".")

    # Ensure some nested columns exist
    for col in [
        "text", "extended_tweet.full_text", "source",
        "entities.hashtags", "entities.user_mentions", "entities.urls",
        "extended_entities.media",
        "user.created_at", "user.description", "user.url", "user.location",
        "user.favourites_count", "user.statuses_count", "user.listed_count",
        "user.default_profile", "user.geo_enabled",
        "in_reply_to_status_id", "in_reply_to_user_id",
    ]:
        if col not in df.columns:
            df[col] = np.nan

    # -------- full text ----------
    df["full_text"] = df["extended_tweet.full_text"].fillna(df["text"]).fillna("")

    df["text_len"] = df["full_text"].str.len()

    # -------- counts / list lengths ----------
    def safe_len(x):
        if isinstance(x, list):
            return len(x)
        if isinstance(x, str):
            try:
                v = ast.literal_eval(x)
                return len(v) if isinstance(v, (list, tuple)) else 1
            except Exception:
                return 0
        return 0

    df["n_hashtags"] = df["entities.hashtags"].apply(safe_len)
    df["n_mentions"] = df["entities.user_mentions"].apply(safe_len)
    df["n_urls"]     = df["entities.urls"].apply(safe_len)

    # media flag
    df["has_media"] = df["extended_entities.media"].apply(lambda x: safe_len(x) > 0)

    # -------- author_pseudo_id (same logic you used) ----------
    def make_user_key(row):
        key = (
            str(row.get("user.created_at", "")) + "|" +
            str(row.get("user.description", "")) + "|" +
            str(row.get("user.url", "")) + "|" +
            str(row.get("user.location", ""))
        )
        return hashlib.md5(key.encode("utf-8")).hexdigest()

    df["author_pseudo_id"] = df.apply(make_user_key, axis=1)

    # -------- structural flags ----------
    df["is_reply"] = (
        df["in_reply_to_status_id"].notna() |
        df["in_reply_to_user_id"].notna()
    )

    # note: in your real raw json you might have a better retweet flag;
    # here we approximate using RT prefix
    df["is_retweet"] = df["text"].fillna("").str.startswith("RT @")

    # ensure boolean columns exist (even if missing)
    for col in ["user.default_profile", "user.geo_enabled"]:
        if col not in df.columns:
            df[col] = np.nan

    # -------- source_app (HTML → readable name) ----------
    def extract_source(x):
        if not isinstance(x, str):
            return "Unknown"
        m = re.search(r'>([^<]+)<', x)
        return m.group(1) if m else x

    df["source_app"] = df["source"].apply(extract_source)

    # keep only metadata we care about:
    keep_cols = [
        "author_pseudo_id", "full_text",
        "challenge_id" if "challenge_id" in df.columns else None,
        "n_hashtags", "n_mentions", "n_urls", "has_media",
        "user.favourites_count", "user.statuses_count", "user.listed_count",
        "user.default_profile", "user.geo_enabled",
        "is_reply", "text_len", "source_app"
    ]
    keep_cols = [c for c in keep_cols if c is not None]

    out = df[keep_cols].copy()

    # attach label if present
    if expect_label and "label" in df.columns:
        out["label"] = df["label"]
    elif expect_label and "label" not in df.columns:
        print("Warning: 'label' not found in this file; returning features only.")

    return out


# ============================================================
# 2. Feature engineering for metadata-only baselines
# ============================================================

def build_meta_features(df, fit_stats=None, src2idx=None, K=15):
    """
    Build metadata features:
      - log_status (from user.statuses_count)
      - log_listed (from user.listed_count)
      - log_fav   (from user.favourites_count)
      - n_mentions, n_hashtags (not capped here, but can be)
      - booleans as 0/1
      - source_idx: bucketized from source_app with top-K on TRAIN
    fit_stats: dict with p99s, learned from TRAIN
    src2idx: dict source_app -> index, learned from TRAIN
    """
    df = df.copy()

    # ensure numeric columns exist
    for col in ["user.statuses_count", "user.listed_count", "user.favourites_count"]:
        if col not in df.columns:
            df[col] = 0

    # fit_stats = dict with p99s, learned from train set only
    if fit_stats is None:
        fit_stats = {}
        for col in ["user.statuses_count", "user.listed_count", "user.favourites_count"]:
            fit_stats[f"{col}_p99"] = float(df[col].quantile(0.995))

    # log transforms with clipping
    df["log_status"] = np.log1p(
        np.clip(df["user.statuses_count"].fillna(0), 0, fit_stats["user.statuses_count_p99"])
    )
    df["log_listed"] = np.log1p(
        np.clip(df["user.listed_count"].fillna(0), 0, fit_stats["user.listed_count_p99"])
    )
    df["log_fav"] = np.log1p(
        np.clip(df["user.favourites_count"].fillna(0), 0, fit_stats["user.favourites_count_p99"])
    )

    # counts: small cap to reduce outliers
    df["n_mentions"] = df["n_mentions"].fillna(0).astype(int)#.clip(0, 10)
    df["n_hashtags"] = df["n_hashtags"].fillna(0).astype(int)#.clip(0, 10)

    # booleans → 0/1
    for bcol in ["has_media", "is_reply", "user.default_profile", "user.geo_enabled"]:
        if bcol not in df.columns:
            df[bcol] = False
        df[bcol] = df[bcol].fillna(False).astype(int)

    # ---- source_idx from source_app ----
    if "source_app" not in df.columns:
        df["source_app"] = "Unknown"

    if src2idx is None:
        top_src = df["source_app"].fillna("Unknown").value_counts().head(K).index.tolist()
        src2idx = {s: i+1 for i, s in enumerate(top_src)}  # 0 reserved for "Other"

    df["source_idx"] = (
        df["source_app"]
        .fillna("Unknown")
        .map(src2idx)
        .fillna(0)
        .astype(int)
    )

    return df, fit_stats, src2idx


# ============================================================
# 3. Author-based split (same spirit as your TweetsDataModule)
# ============================================================

def author_based_split(df, val_size=0.1, random_state=42):
    """
    Split df into train/val such that authors (author_pseudo_id)
    do not overlap between train and val.
    """
    assert "author_pseudo_id" in df.columns, "author_pseudo_id column missing"

    user_ids = df["author_pseudo_id"].astype(str)
    unique_users = user_ids.unique()

    train_users, val_users = train_test_split(
        unique_users, test_size=val_size, random_state=random_state
    )

    train_mask = user_ids.isin(train_users)
    val_mask   = user_ids.isin(val_users)

    train_df = df[train_mask].reset_index(drop=True)
    val_df   = df[val_mask].reset_index(drop=True)

    print("User-level split:")
    print("  #train tweets:", len(train_df))
    print("  #val tweets:  ", len(val_df))
    print("  #unique users train:", len(np.unique(user_ids[train_mask])))
    print("  #unique users val:  ", len(np.unique(user_ids[val_mask])))

    return train_df, val_df


# ============================================================
# 4. Run LogisticRegression baselines for multiple subsets
# ============================================================

def run_metadata_ablations(train_df, val_df):
    """
    Train a LogisticRegression on various subsets of metadata
    and evaluate on author-based validation set.
    """

    # All meta features we care about
    ALL_META_FEATURES_LOG = [
        "log_status",
        "log_listed",
        "log_fav",
        "user.default_profile",
        "user.geo_enabled",
        "is_reply",
        "n_mentions",
        "n_hashtags",
        "has_media",

    ]

    ALL_META_FEATURES = [
        "user.statuses_count",
        "user.listed_count",
        "user.favourites_count",
        "user.default_profile",
        "user.geo_enabled",
        "is_reply",
        "n_mentions",
        "n_hashtags",
        "has_media",

    ]

    # Define ablation sets: you can add/remove sets here easily
    ablation_sets = {
        "all_9_meta": ALL_META_FEATURES_LOG,
        "all_plus_source_idx": ALL_META_FEATURES_LOG + ["source_idx"],
        "all_plus_source_app": ALL_META_FEATURES_LOG + ["source_app"],
        "all_9_meta_non_log": ALL_META_FEATURES,
        "all_with_len": ALL_META_FEATURES_LOG + ["text_len"],
        "all_minus_fav": [
            "log_status",
            "log_listed",
            "user.default_profile",
            "user.geo_enabled",
            "is_reply",
            "n_mentions",
            "n_hashtags",
            "has_media"],
        "all_minus_hashs": [
            "log_status",
            "log_listed",
            "user.default_profile",
            "user.geo_enabled",
            "is_reply",
            "n_mentions",
            "log_fav",
            "has_media"],
        "all_minus_hashs_media": [
            "log_status",
            "log_listed",
            "user.default_profile",
            "user.geo_enabled",
            "is_reply",
            "n_mentions",
            "log_fav"],
        "all_minus_hashs_media_with_src": [
            "log_status",
            "log_listed",
            "user.default_profile",
            "user.geo_enabled",
            "is_reply",
            "n_mentions",
            "log_fav", "source_idx"],

        "best_with_nine":[
            "user.statuses_count",
            "user.listed_count",
            "user.favourites_count",
            "user.default_profile",
            "is_reply",
            "n_mentions",
            "n_hashtags",
            "has_media",
            "source_idx"
        ],
        "best_with_eight":[
            "user.statuses_count",
            "user.listed_count",
            "user.favourites_count",
            "user.default_profile",
            "is_reply",
            "n_mentions",
            "n_hashtags",
            "source_idx"
        ],
        "best_with_seven": [
            "log_status",
            "log_listed",
            "user.default_profile",
            "is_reply",
            "n_mentions",
            "log_fav", "source_idx"],
        
        "best_with_six": [
            "log_status",
            "log_listed",
            "user.default_profile",
            "is_reply",
            #"n_mentions",
            "log_fav", "source_idx"],

        "second_best_with_six": [
            "log_status",
            "log_listed",
            "user.default_profile",
            #"is_reply",
            "n_mentions",
            "log_fav", "source_idx"],
        "best_with_five": [
            "log_status",
            "log_listed",
            "user.default_profile",
            "is_reply",
            #"n_mentions",
            #"log_fav", 
            "source_idx"],
        "current": ["log_status", "log_listed", "n_mentions"],
        "current_with_src": ["log_status", "log_listed", "n_mentions", "source_idx"],
        "cur_plus1": ["log_status", "log_listed", "n_mentions", "user.default_profile"],
        "user_stats_3": ["log_status", "log_listed", "log_fav"],
        "user_stats_2": ["log_status", "log_listed"],
        "log_status_only": ["log_status"],
        "log_listed_only": ["log_listed"],
        "interaction_only": ["is_reply", "n_mentions", "n_hashtags", "has_media"],
        "reply_mentions_only": ["is_reply", "n_mentions"],
        "hashtags_media_only": ["n_hashtags", "has_media"],
    }

    prev_best_res = ALL_META_FEATURES_LOG.copy()
    prev_best_res += ["source_idx"]
    #prev_best_res.remove("user.geo_enabled")
    #prev_best_res.remove("has_media")
    
    # to_rmv = ["log_status",
    #         "log_listed",
    #         "user.default_profile",
    #         "user.geo_enabled",
    #         "is_reply",
    #         "n_mentions",
    #         "log_fav"]
    to_rmv = prev_best_res.copy()
    for col in to_rmv:
        cur = prev_best_res.copy()
        if col in cur:
            cur.remove(col)
        else:
            continue
        ablation_sets[f"prev_best_minus_{col}"] = cur
    # also: each single feature as its own baseline
    for feat in ALL_META_FEATURES:
        ablation_sets.setdefault(f"single_{feat}", [feat])

    y_train = train_df["label"].astype(int).values
    y_val   = val_df["label"].astype(int).values

    results = []

    for name, cols in ablation_sets.items():
        # some columns might be missing if anything went wrong, so check:
        missing = [c for c in cols if c not in train_df.columns]
        if missing:
            print(f"[{name}] Skipping: missing columns {missing}")
            continue

        # split into numeric vs categorical (only source_idx is categorical here)
        cat_cols = [c for c in cols if c in ["source_idx", "source_app"]]
        num_cols = [c for c in cols if c not in ["source_idx", "source_app"]]

        # prepare ColumnTransformer
        transformers = []
        if num_cols:
            transformers.append(("num", StandardScaler(), num_cols))
        if cat_cols:
            transformers.append(("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols))

        if not transformers:
            print(f"[{name}] Skipping: no valid columns after split.")
            continue

        pre = ColumnTransformer(transformers=transformers, remainder="drop")

        clf = Pipeline(
            steps=[
                ("pre", pre),
                ("logreg", LogisticRegression(max_iter=2000, class_weight="balanced")),
            ]
        )

        X_train = train_df[cols]
        X_val   = val_df[cols]

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_val)
        y_prob = clf.predict_proba(X_val)[:, 1]

        acc = accuracy_score(y_val, y_pred)
        auc = roc_auc_score(y_val, y_prob)

        print(f"[{name}] acc={acc:.4f}, AUC={auc:.4f}, n_feats={len(cols)}")

        results.append({
            "subset": name,
            "n_features": len(cols),
            "accuracy": acc,
            "auc": auc,
        })

    results_df = pd.DataFrame(results).sort_values(
        ["accuracy", "auc"], ascending=False
    ).reset_index(drop=True)

    print("\n=== Summary (sorted by accuracy, then AUC) ===")
    display(results_df)

    return results_df


# ============================================================
# 5. Main: load data, build features, split, run ablations
# ============================================================

# ---- adjust path to your train.jsonl ----
train_path = "../../train.jsonl"   # <-- change if needed

# 1) parse with author_pseudo_id + raw meta
raw_df = parse_tweets_meta(train_path, expect_label=True)

# 2) author-based split BEFORE computing stats (important!)
train_raw, val_raw = author_based_split(raw_df, val_size=0.1, random_state=42)

# 3) build metadata features, with p99 stats fitted on TRAIN only
train_meta, stats, src2idx = build_meta_features(train_raw, fit_stats=None, src2idx=None, K=15)
val_meta, _, _             = build_meta_features(val_raw, fit_stats=stats, src2idx=src2idx, K=15)

# 4) sanity check: label distribution
print("\nTrain label distribution:")
print(train_meta["label"].value_counts(normalize=True))
print("\nVal label distribution:")
print(val_meta["label"].value_counts(normalize=True))

# 5) run ablations
results_df = run_metadata_ablations(train_meta, val_meta)

User-level split:
  #train tweets: 139426
  #val tweets:   15488
  #unique users train: 44158
  #unique users val:   4907

Train label distribution:
label
0    0.534556
1    0.465444
Name: proportion, dtype: float64

Val label distribution:
label
0    0.525762
1    0.474238
Name: proportion, dtype: float64
[all_9_meta] acc=0.8032, AUC=0.8740, n_feats=9
[all_plus_source_idx] acc=0.8085, AUC=0.8787, n_feats=10
[all_plus_source_app] acc=0.8077, AUC=0.8789, n_feats=10
[all_9_meta_non_log] acc=0.7788, AUC=0.8542, n_feats=9
[all_with_len] acc=0.8017, AUC=0.8741, n_feats=10
[all_minus_fav] acc=0.7994, AUC=0.8690, n_feats=8
[all_minus_hashs] acc=0.8029, AUC=0.8741, n_feats=8
[all_minus_hashs_media] acc=0.8028, AUC=0.8738, n_feats=7
[all_minus_hashs_media_with_src] acc=0.8096, AUC=0.8786, n_feats=8
[best_with_nine] acc=0.7898, AUC=0.8624, n_feats=9
[best_with_eight] acc=0.7890, AUC=0.8621, n_feats=8
[best_with_seven] acc=0.8119, AUC=0.8785, n_feats=7
[best_with_six] acc=0.8111, AUC=0.8784, n_fe

Unnamed: 0,subset,n_features,accuracy,auc
0,best_with_seven,7,0.811919,0.878462
1,prev_best_minus_user.geo_enabled,9,0.81108,0.878673
2,best_with_six,6,0.81108,0.87841
3,second_best_with_six,6,0.81095,0.878136
4,all_minus_hashs_media_with_src,8,0.809595,0.878615
5,prev_best_minus_has_media,9,0.809465,0.878563
6,prev_best_minus_user.default_profile,9,0.809401,0.878559
7,prev_best_minus_is_reply,9,0.808949,0.878471
8,best_with_five,5,0.808884,0.875568
9,prev_best_minus_n_hashtags,9,0.80882,0.878783


In [7]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, CamembertTokenizer
from sklearn.metrics import accuracy_score, roc_auc_score


class InfluencerTextOnly(nn.Module):
    def __init__(
        self,
        base_model: str = "cmarkea/distilcamembert-base",
        head_hidden_dim: int = 256,
        head_dropout: float = 0.15,
        max_len: int = 128,
    ):
        super().__init__()
        self.tok = CamembertTokenizer.from_pretrained(base_model)
        self.enc = AutoModel.from_pretrained(base_model)
        dim = self.enc.config.hidden_size
        self.head = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Dropout(head_dropout),
            nn.Linear(dim, head_hidden_dim),
            nn.ReLU(),
            nn.Dropout(head_dropout),
            nn.Linear(head_hidden_dim, 2),
            nn.LogSoftmax(dim=1),
        )
        self.max_len = max_len

    def _dev(self):
        return next(self.parameters()).device

    def forward(self, batch):
        tok = self.tok(
            batch["full_text"],          # list of strings
            padding=True,
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        ).to(self._dev())
        out = self.enc(**tok).last_hidden_state[:, 0]  # CLS / first token
        return self.head(out)                          # log-probs (N, 2)


# ============================================================
# 6. Dataset + DataLoaders (reusing parse_tweets_meta + split)
# ============================================================

class TweetTextDataset(Dataset):
    """
    Simple dataset wrapping a DataFrame that has:
      - 'full_text' column (already built by parse_tweets_meta)
      - 'label' column
    """
    def __init__(self, df):
        self.texts = df["full_text"].astype(str).tolist()
        self.labels = df["label"].astype(int).tolist()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return {
            "full_text": self.texts[idx],
            "label": self.labels[idx],
        }


def text_collate_fn(batch):
    """
    Custom collate so that:
      - model gets a list of strings under batch["full_text"]
      - labels become a LongTensor
    """
    texts = [b["full_text"] for b in batch]
    labels = torch.tensor([b["label"] for b in batch], dtype=torch.long)
    return {"full_text": texts, "label": labels}


def make_text_dataloaders(train_df, val_df, batch_size=16):
    train_ds = TweetTextDataset(train_df)
    val_ds   = TweetTextDataset(val_df)

    train_loader = DataLoader(
        train_ds, batch_size=batch_size, shuffle=True,
        num_workers=0, collate_fn=text_collate_fn
    )
    val_loader = DataLoader(
        val_ds, batch_size=batch_size, shuffle=False,
        num_workers=0, collate_fn=text_collate_fn
    )
    return train_loader, val_loader


# ============================================================
# 7. Train / eval loops for InfluencerTextOnly
# ============================================================

def evaluate_text_model(model, data_loader, device):
    model.eval()
    all_labels = []
    all_logits = []

    with torch.no_grad():
        for batch in data_loader:
            labels = batch["label"].to(device)
            logits_logp = model(batch)          # (N, 2) log-probs
            logits = logits_logp                # already log-softmax output

            all_labels.append(labels.cpu())
            all_logits.append(logits.cpu())

    all_labels = torch.cat(all_labels).numpy()
    all_logits = torch.cat(all_logits, dim=0).numpy()    # log-probs

    # predicted class
    y_pred = all_logits.argmax(axis=1)
    # proba for class 1 (exp of log-prob)
    y_prob = np.exp(all_logits)[:, 1]

    acc = accuracy_score(all_labels, y_pred)
    auc = roc_auc_score(all_labels, y_prob)

    return acc, auc


def train_text_only_model(
    train_df,
    val_df,
    base_model: str = "cmarkea/distilcamembert-base",
    max_len: int = 128,
    head_hidden_dim: int = 256,
    head_dropout: float = 0.15,
    batch_size: int = 16,
    epochs: int = 3,
    # LRs
    head_lr: float = 2e-5,
    backbone_lr: float = 1e-5,
    weight_decay: float = 0.01,
    freeze_backbone: bool = True,
    device: str = None,
):
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device)

    # DataLoaders (reusing author-based split output)
    train_loader, val_loader = make_text_dataloaders(train_df, val_df, batch_size=batch_size)

    # Model
    model = InfluencerTextOnly(
        base_model=base_model,
        head_hidden_dim=head_hidden_dim,
        head_dropout=head_dropout,
        max_len=max_len,
    ).to(device)

    # -----------------------------
    #  Optimizer + freezing logic
    # -----------------------------
    criterion = nn.NLLLoss()

    if freeze_backbone:
        # Freeze encoder, only train head
        for p in model.enc.parameters():
            p.requires_grad = False

        optimizer = torch.optim.AdamW(
            model.head.parameters(),
            lr=head_lr,
            weight_decay=weight_decay,
        )
        print(">> Training with BACKBONE FROZEN (head only).")
    else:
        # Train both encoder and head with different LRs
        for p in model.enc.parameters():
            p.requires_grad = True

        param_groups = [
            {"params": model.enc.parameters(),  "lr": backbone_lr, "weight_decay": weight_decay},
            {"params": model.head.parameters(), "lr": head_lr,     "weight_decay": weight_decay},
        ]
        optimizer = torch.optim.AdamW(param_groups)
        print(">> Training with BACKBONE UNFROZEN (separate LRs for backbone/head).")

    best_val_auc = 0.0
    best_val_acc = 0.0

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        n_samples = 0

        for batch in train_loader:
            labels = batch["label"].to(device)
            optimizer.zero_grad()

            logp = model(batch)                 # (N, 2) log-probs
            loss = criterion(logp, labels)

            loss.backward()
            optimizer.step()

            bs = labels.size(0)
            running_loss += loss.item() * bs
            n_samples += bs

        train_loss = running_loss / max(1, n_samples)
        val_acc, val_auc = evaluate_text_model(model, val_loader, device)

        print(
            f"[TextOnly][Epoch {epoch}] "
            f"train_loss={train_loss:.4f}, "
            f"val_acc={val_acc:.4f}, val_auc={val_auc:.4f}"
        )

        if val_auc > best_val_auc:
            best_val_auc = val_auc
            best_val_acc = val_acc

    print(f"\nBest TextOnly val_acc={best_val_acc:.4f}, val_auc={best_val_auc:.4f}")
    return model, {"best_val_acc": best_val_acc, "best_val_auc": best_val_auc}


# ============================================================
# 8. End-to-end: reuse parse_tweets_meta + author_based_split
# ============================================================

def run_text_only_experiment(
    train_path="../../train.jsonl",
    val_size=0.1,
    random_state=42,
    **train_kwargs,
):
    """
    End-to-end run:
      - reuse parse_tweets_meta to build full_text + label + author_pseudo_id
      - reuse author_based_split for user-level split
      - train + evaluate InfluencerTextOnly
    """
    raw_df = parse_tweets_meta(train_path, expect_label=True)
    train_raw, val_raw = author_based_split(raw_df, val_size=val_size, random_state=random_state)

    print("\nTrain label distribution:")
    print(train_raw["label"].value_counts(normalize=True))
    print("\nVal label distribution:")
    print(val_raw["label"].value_counts(normalize=True))

    model, scores = train_text_only_model(train_raw, val_raw, **train_kwargs)
    return model, scores


# Example usage (if you want a direct script entry):
if __name__ == "__main__":
    train_path = "../../train.jsonl"
    model, scores = run_text_only_experiment(
        train_path="../../train.jsonl",
        val_size=0.1,
        random_state=42,
        epochs=3,
        batch_size=16,
        head_lr=2e-5,
        freeze_backbone=True,   # <- default, but explicit
    )

    # model, scores = run_text_only_experiment(
    #     train_path="../../train.jsonl",
    #     val_size=0.1,
    #     random_state=42,
    #     epochs=3,
    #     batch_size=16,
    #     freeze_backbone=False,
    #     head_lr=2e-5,
    #     backbone_lr=1e-5,
    # )

User-level split:
  #train tweets: 139426
  #val tweets:   15488
  #unique users train: 44158
  #unique users val:   4907

Train label distribution:
label
0    0.534556
1    0.465444
Name: proportion, dtype: float64

Val label distribution:
label
0    0.525762
1    0.474238
Name: proportion, dtype: float64


Some weights of CamembertModel were not initialized from the model checkpoint at cmarkea/distilcamembert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


>> Training with BACKBONE FROZEN (head only).
[TextOnly][Epoch 0] train_loss=0.6457, val_acc=0.6581, val_auc=0.7135
[TextOnly][Epoch 1] train_loss=0.6270, val_acc=0.6607, val_auc=0.7233
[TextOnly][Epoch 2] train_loss=0.6222, val_acc=0.6650, val_auc=0.7269

Best TextOnly val_acc=0.6650, val_auc=0.7269
