In [4]:
"""
Unify Reddit, Twitter, and YouTube datasets to a common schema (Twitter/YouTube style)
and save both per-platform standardized files and one combined file.

Target schema/column order:
['Platform','URL','Engagement','Text','Matched Keywords',
 'toxicity_score','Label','clean_text','word_count','negative_word_count']
"""

import pandas as pd
import numpy as np

# ---------- INPUT PATHS (adjust if needed) ----------
# One labeled CSV per platform; these are the files passed to pd.read_csv.
P_REDDIT = "reddit_labeled.csv"
P_TWITTER = "twitter_labeled (1).csv"
P_YT = "youtube_cleaned_balanced.csv"

# ---------- OUTPUT PATHS ----------
# Per-platform standardized files plus the single combined file.
OUT_REDDIT = "reddit_standardized.csv"
OUT_TWITTER = "twitter_standardized.csv"
OUT_YT = "youtube_standardized.csv"
OUT_ALL = "all_platforms_standardized.csv"

# Columns (and their order) of every standardized output frame.
TARGET_COLS = [
    "Platform",
    "URL",
    "Engagement",
    "Text",
    "Matched Keywords",
    "toxicity_score",
    "Label",
    "clean_text",
    "word_count",
    "negative_word_count",
]

# ---------- helpers ----------
def ensure_list(x):
    """Normalize one 'Matched Keywords' cell to a list of keyword strings.

    Accepted inputs:
      * a list (returned unchanged) or a tuple (converted to a list);
      * a missing value (None / NaN) -> [];
      * a stringified list such as "['a', 'b']" or "[a, b]";
      * a comma-separated string such as "a, b";
      * a single token, returned as a one-element list.

    The placeholder tokens "", "none" and "nan" are matched
    case-insensitively and yield an empty list.
    """
    import ast  # local import so the helper does not rely on a file-level import

    if isinstance(x, list):
        return x
    if isinstance(x, tuple):
        # pd.isna() on a tuple returns an array, which cannot be truth-tested.
        return list(x)
    if pd.isna(x):
        return []

    s = str(x).strip()
    if s.startswith("[") and s.endswith("]"):
        # Prefer a real literal parse so quoted keywords that contain commas
        # (e.g. "['a, b', 'c']") stay intact.
        try:
            parsed = ast.literal_eval(s)
        except (ValueError, SyntaxError):
            parsed = None
        if isinstance(parsed, list):
            items = [str(p).strip() for p in parsed]
        else:
            # Not a valid Python literal (e.g. "[a, b]"): fall back to a
            # plain comma split with surrounding quotes removed.
            items = [p.strip().strip("'\"") for p in s[1:-1].split(",")]
        return [p for p in items if p]

    if "," in s:
        return [t.strip() for t in s.split(",") if t.strip()]

    # Case-insensitive so "None" / "NaN" are not kept as keywords.
    return [] if s.lower() in {"", "none", "nan"} else [s]

def finalize(df):
    """Coerce dtypes, fill derived columns, de-duplicate and order columns.

    Expects ``df`` to contain at least 'Platform', 'URL', 'Engagement',
    'Matched Keywords', 'toxicity_score', 'clean_text' and 'Text'; 'Label',
    'word_count' and 'negative_word_count' are created when absent.

    Works on a copy, so the caller's frame is left untouched. Returns a new
    frame restricted to ``TARGET_COLS`` in that order, with duplicate
    (URL, clean_text) rows removed (first occurrence kept).
    """
    df = df.copy()

    # fillna before astype(str): astype(str) alone turns a missing value
    # into the literal string "nan".
    df["Platform"] = df["Platform"].fillna("").astype(str)
    df["URL"] = df["URL"].fillna("").astype(str)

    # Engagement numeric; unparseable or missing values become 0.
    df["Engagement"] = pd.to_numeric(df["Engagement"], errors="coerce").fillna(0)

    # Matched Keywords as Python lists (written to CSV as their string repr).
    df["Matched Keywords"] = df["Matched Keywords"].apply(ensure_list)

    # toxicity_score float; unparseable values become NaN.
    df["toxicity_score"] = pd.to_numeric(df["toxicity_score"], errors="coerce")

    # Label: keep existing values; derive missing ones with threshold 0.5.
    # NOTE: a NaN toxicity_score compares False against 0.5, so rows missing
    # both Label and toxicity_score end up with Label 0.
    derived_label = (df["toxicity_score"] >= 0.5).astype(int)
    if "Label" not in df.columns:
        df["Label"] = derived_label
    else:
        miss = df["Label"].isna()
        if miss.any():
            df.loc[miss, "Label"] = derived_label[miss]

    # word_count: whitespace token count of clean_text wherever the provided
    # value is absent or non-numeric. Missing clean_text counts as 0 words
    # (without fillna it would be stringified to "nan" and counted as 1).
    computed_wc = (
        df["clean_text"].fillna("").astype(str).str.split().map(len).astype(int)
    )
    if "word_count" not in df.columns:
        df["word_count"] = computed_wc
    else:
        wc = pd.to_numeric(df["word_count"], errors="coerce")
        df["word_count"] = wc.where(wc.notna(), computed_wc).astype(int)

    # negative_word_count: default to 0 when absent or unparseable.
    if "negative_word_count" not in df.columns:
        df["negative_word_count"] = 0
    else:
        df["negative_word_count"] = (
            pd.to_numeric(df["negative_word_count"], errors="coerce")
            .fillna(0)
            .astype(int)
        )

    # Minimal de-duplication on (URL, clean_text).
    df = df.drop_duplicates(subset=["URL", "clean_text"], keep="first")

    # Reorder columns to the target schema.
    return df[TARGET_COLS]

# ---------- standardizers ----------
def standardize_reddit(df):
    """Map a Reddit frame onto the target schema and run it through finalize.

    Source columns 'Toxicity Score', 'Clean Text' and 'Word Count' are
    renamed to 'toxicity_score', 'clean_text' and 'word_count'. Any column
    missing from ``df`` falls back to a default value.
    """
    # Seed the frame with the source index so scalar defaults broadcast to
    # every row. On an index-less pd.DataFrame() a scalar assigned first
    # creates a zero-row column, which turns into NaN as soon as the first
    # Series column is added.
    out = pd.DataFrame(index=df.index)
    out["Platform"] = df.get("Platform", "Reddit")
    out["URL"] = df.get("URL", "")
    out["Engagement"] = df.get("Engagement", 0)

    # Reddit file may not have raw 'Text'; fall back to 'Clean Text'.
    out["Text"] = df["Text"] if "Text" in df.columns else df.get("Clean Text", "")

    out["Matched Keywords"] = df.get("Matched Keywords", "[]")
    out["toxicity_score"] = df.get("Toxicity Score", np.nan)  # map to target name
    out["Label"] = df.get("Label", np.nan)
    out["clean_text"] = df.get("Clean Text", "")
    out["word_count"] = df.get("Word Count", np.nan)
    # negative_word_count is always set to 0 for Reddit rows.
    out["negative_word_count"] = 0
    return finalize(out)

def standardize_twitter(df):
    """Map a Twitter frame onto the target schema and run it through finalize.

    Twitter columns are read under their target names; any column missing
    from ``df`` falls back to a default value.
    """
    # Seed the frame with the source index so scalar defaults broadcast to
    # every row. On an index-less pd.DataFrame() a scalar assigned first
    # creates a zero-row column, which turns into NaN as soon as the first
    # Series column is added.
    out = pd.DataFrame(index=df.index)
    out["Platform"] = df.get("Platform", "Twitter")
    out["URL"] = df.get("URL", "")
    out["Engagement"] = df.get("Engagement", 0)
    out["Text"] = df.get("Text", "")
    out["Matched Keywords"] = df.get("Matched Keywords", "[]")
    out["toxicity_score"] = df.get("toxicity_score", np.nan)
    out["Label"] = df.get("Label", np.nan)
    out["clean_text"] = df.get("clean_text", "")
    out["word_count"] = df.get("word_count", np.nan)
    out["negative_word_count"] = df.get("negative_word_count", 0)
    return finalize(out)

def standardize_youtube(df):
    """Map a YouTube frame onto the target schema and run it through finalize.

    'Video URL' is used when 'URL' is absent, and 'Comment' supplies 'Text'
    (falling back to an existing 'Text' column). Any other column missing
    from ``df`` falls back to a default value.
    """
    # Seed the frame with the source index so scalar defaults broadcast to
    # every row. On an index-less pd.DataFrame() a scalar assigned first
    # creates a zero-row column, which turns into NaN as soon as the first
    # Series column is added.
    out = pd.DataFrame(index=df.index)
    out["Platform"] = df.get("Platform", "YouTube")
    # Reddit/Twitter use "URL"; YouTube has "Video URL".
    out["URL"] = df.get("URL", df.get("Video URL", ""))
    out["Engagement"] = df.get("Engagement", 0)
    # Prefer "Comment"; accept an already-present "Text" column otherwise.
    out["Text"] = df.get("Comment", df.get("Text", ""))
    out["Matched Keywords"] = df.get("Matched Keywords", "[]")
    out["toxicity_score"] = df.get("toxicity_score", np.nan)
    out["Label"] = df.get("Label", np.nan)
    out["clean_text"] = df.get("clean_text", "")
    out["word_count"] = df.get("word_count", np.nan)
    out["negative_word_count"] = df.get("negative_word_count", 0)
    return finalize(out)

# ---------- load, standardize, save ----------
# Read the three raw inputs (in order: Reddit, Twitter, YouTube).
reddit, twitter, yt = (
    pd.read_csv(src) for src in (P_REDDIT, P_TWITTER, P_YT)
)

# Bring each platform onto the shared schema.
reddit_std = standardize_reddit(reddit)
twitter_std = standardize_twitter(twitter)
yt_std = standardize_youtube(yt)

# Write the per-platform files without the index column.
reddit_std.to_csv(OUT_REDDIT, index=False)
twitter_std.to_csv(OUT_TWITTER, index=False)
yt_std.to_csv(OUT_YT, index=False)

# Stack all platforms with a fresh 0..n-1 index and write the combined file.
all_std = pd.concat(
    [reddit_std, twitter_std, yt_std],
    ignore_index=True,
)
all_std.to_csv(OUT_ALL, index=False)

# Report each output path together with its (rows, columns) shape.
print(
    "\n".join(
        [
            "Saved:",
            f"  - {OUT_REDDIT}  ({reddit_std.shape})",
            f"  - {OUT_TWITTER} ({twitter_std.shape})",
            f"  - {OUT_YT}      ({yt_std.shape})",
            f"  - {OUT_ALL}     ({all_std.shape})",
        ]
    )
)


Saved:
  - reddit_standardized.csv  ((135, 10))
  - twitter_standardized.csv ((90, 10))
  - youtube_standardized.csv      ((100, 10))
  - all_platforms_standardized.csv     ((325, 10))
