In [6]:
import pandas as pd
import numpy as np
import ast

# -------------------------------------------------------------------
# 0. Config
# -------------------------------------------------------------------
# Input CSV. The code below reads the columns "date", "entity" and, for every
# source, "<source>_softmax" and "<source>_article_count".
INPUT_PATH = "../data/scores_three_sources.csv"  # or "/mnt/data/scores_three_sources.csv"
# Output CSVs, one per combination method.
OUTPUT_EQ   = "combined_equal_softmax.csv"          # equal-weight average
OUTPUT_W    = "combined_news_weighted_softmax.csv"  # weights = log(1 + article count)
OUTPUT_CONF = "combined_conf_weighted_softmax.csv"  # weights = max prob * log(1 + article count)

# Column-name prefixes of the sources that get combined.
sources = ["Guard", "Ellen", "GNews"]
# Class labels looked up in every softmax dict; the order is also the output order.
classes = ["positive", "neutral", "negative"]

# -------------------------------------------------------------------
# 1. Load data
# -------------------------------------------------------------------
df = pd.read_csv(INPUT_PATH)
# Keep only columns whose name does not start with "Unnamed"
# (presumably a stray index column written by an earlier export).
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]  # drop any unnamed index cols

# -------------------------------------------------------------------
# 2. Parse softmax strings -> separate columns per class
# -------------------------------------------------------------------
def parse_softmax_to_dict(s):
    """
    Parse one serialized softmax cell into a dict of class probabilities.

    Parameters
    ----------
    s : str, dict or missing value
        Normally the text of a Python dict literal, e.g.
        "{'positive': 0.7, 'neutral': 0.2, 'negative': 0.1}".
        An already-parsed dict is accepted as well.

    Returns
    -------
    dict or None
        {"positive": float, "neutral": float, "negative": float}; a class
        that is absent from the input defaults to 0.0.  None is returned
        when the cell is missing, is not a dict literal, or holds a value
        that cannot be converted to float.
    """
    if isinstance(s, dict):
        # Already parsed (e.g. the frame was built in memory rather than read from CSV).
        d = s
    elif pd.isna(s):
        return None
    else:
        try:
            # literal_eval only evaluates literals, so cell text is never executed.
            d = ast.literal_eval(s)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return None
        if not isinstance(d, dict):
            return None

    try:
        return {
            "positive": float(d.get("positive", 0.0)),
            "neutral":  float(d.get("neutral", 0.0)),
            "negative": float(d.get("negative", 0.0)),
        }
    except (TypeError, ValueError, OverflowError):
        # A value such as None or a non-numeric string: treat the cell as unusable.
        return None

for src in sources:
    # One dict per row; None where the cell was missing or could not be parsed
    parsed = df[f"{src}_softmax"].apply(parse_softmax_to_dict)

    # Spread the dicts into "<source>_<class>" columns, NaN where there is no dict
    for c in classes:
        df[f"{src}_{c}"] = parsed.apply(
            lambda d: np.nan if d is None else d[c]
        )

    # Count column used for weighting: a missing article count becomes 0
    df[f"{src}_cnt"] = df[f"{src}_article_count"].fillna(0.0)

# Row-wise sum of the per-source counts (written to the output files)
df["total_article_count"] = df[[f"{src}_cnt" for src in sources]].sum(axis="columns")

# -------------------------------------------------------------------
# 3. Helper: combine probabilities row-wise
# -------------------------------------------------------------------
def combine_softmax_equal(row):
    """
    Combine the sources with equal weights.

    For every class, the non-missing "<source>_<class>" values found in
    `row` are averaged; a class with no value at all becomes NaN.  The
    non-NaN averages are then divided by their sum when that sum is
    positive, so that they add up to 1.

    Returns a dict mapping each class to its combined probability.
    """
    combined = {}
    for label in classes:
        available = []
        for name in sources:
            p = row[f"{name}_{label}"]
            if pd.notna(p):
                available.append(p)
        combined[label] = float(np.mean(available)) if available else np.nan

    # Rescale only when there is a positive mass to divide by
    total = sum(p for p in combined.values() if pd.notna(p))
    if np.isnan(total) or total <= 0:
        return combined
    return {label: p / total for label, p in combined.items()}

def combine_softmax_news_weighted(row):
    """
    Combine the sources weighted by article volume.

    Each source gets the weight log(1 + "<source>_cnt").  Per class, the
    weighted mean is taken over the sources that have a non-missing
    probability and a positive weight; with no such source the class is
    NaN.  The non-NaN results are divided by their sum when it is positive.

    Returns a dict mapping each class to its combined probability.
    """
    log_weights = {name: np.log1p(row[f"{name}_cnt"]) for name in sources}

    combined = {}
    for label in classes:
        weighted_sum = 0.0
        weight_sum = 0.0
        for name in sources:
            p = row[f"{name}_{label}"]
            w = log_weights[name]
            # Skip sources without a probability or without any weight
            if pd.isna(p) or not w > 0:
                continue
            weighted_sum += w * p
            weight_sum += w
        combined[label] = float(weighted_sum / weight_sum) if weight_sum != 0 else np.nan

    # Rescale only when there is a positive mass to divide by
    total = sum(p for p in combined.values() if pd.notna(p))
    if np.isnan(total) or total <= 0:
        return combined
    return {label: p / total for label, p in combined.items()}

def combine_softmax_conf_weighted(row):
    """
    Combine the sources weighted by confidence and article volume.

    A source's weight is (largest of its class probabilities) *
    log(1 + "<source>_cnt"); it is 0 when the source has no probability
    at all or its count is not positive.  Per class, the weighted mean is
    taken over the sources with a non-missing probability and a positive
    weight; with no such source the class is NaN.  The non-NaN results
    are divided by their sum when it is positive.

    Returns a dict mapping each class to its combined probability.
    """
    source_weights = {}
    for name in sources:
        class_probs = [row[f"{name}_{label}"] for label in classes]
        has_prob = not all(pd.isna(p) for p in class_probs)
        peak = np.nanmax(class_probs) if has_prob else np.nan
        article_count = row[f"{name}_cnt"]
        if pd.isna(peak) or not article_count > 0:
            source_weights[name] = 0.0
        else:
            source_weights[name] = float(peak * np.log1p(article_count))

    combined = {}
    for label in classes:
        weighted_sum = 0.0
        weight_sum = 0.0
        for name in sources:
            p = row[f"{name}_{label}"]
            w = source_weights[name]
            # Skip sources without a probability or without any weight
            if pd.isna(p) or not w > 0:
                continue
            weighted_sum += w * p
            weight_sum += w
        combined[label] = float(weighted_sum / weight_sum) if weight_sum != 0 else np.nan

    # Rescale only when there is a positive mass to divide by
    total = sum(p for p in combined.values() if pd.notna(p))
    if np.isnan(total) or total <= 0:
        return combined
    return {label: p / total for label, p in combined.items()}

def probs_to_str(d):
    """
    Serialize a probability dict for the CSV "softmax" column.

    The result is the str() of a dict with the keys positive, neutral,
    negative (in that order) and plain float values.  NaN is returned
    when `d` is None or any of its values is missing.
    """
    if d is None or any(pd.isna(p) for p in d.values()):
        return np.nan
    # Rebuild in a fixed key order with builtin floats so the text is stable
    ordered = {label: float(d[label]) for label in ("positive", "neutral", "negative")}
    return str(ordered)

def dict_to_class(d):
    """
    Return the label with the highest probability in `d`.

    Parameters
    ----------
    d : dict or None
        Mapping of class label -> probability.

    Returns
    -------
    str or float
        The key whose value is largest; on a tie the key that comes first
        in the dict wins.  NaN is returned when `d` is None, is empty, or
        contains a missing value.
    """
    # None and {} both mean "nothing to classify"; max() would raise on {}.
    if not d:
        return np.nan
    if any(pd.isna(v) for v in d.values()):
        return np.nan
    return max(d, key=d.get)  # argmax over the labels

# -------------------------------------------------------------------
# 4. Build combined softmax for each method
# -------------------------------------------------------------------
# Each result holds one {class: probability} dict per row of df
eq_soft = df.apply(combine_softmax_equal, axis="columns")
w_soft = df.apply(combine_softmax_news_weighted, axis="columns")
conf_soft = df.apply(combine_softmax_conf_weighted, axis="columns")

# -------------------------------------------------------------------
# 5. Build output DataFrames (one row per date-entity pair)
# -------------------------------------------------------------------
def _build_output(soft):
    """Assemble the output table for one series of combined softmax dicts."""
    return pd.DataFrame({
        "date": df["date"],
        "entity": df["entity"],
        "classification": soft.apply(dict_to_class),
        "softmax": soft.apply(probs_to_str),
        "article_count": df["total_article_count"],
    })

out_eq = _build_output(eq_soft)
out_w = _build_output(w_soft)
out_conf = _build_output(conf_soft)

# -------------------------------------------------------------------
# 6. Save to CSV
# -------------------------------------------------------------------
for frame, saved_path in ((out_eq, OUTPUT_EQ), (out_w, OUTPUT_W), (out_conf, OUTPUT_CONF)):
    frame.to_csv(saved_path, index=False)

print("Saved:")
for saved_path in (OUTPUT_EQ, OUTPUT_W, OUTPUT_CONF):
    print("  ", saved_path)


Saved:
   combined_equal_softmax.csv
   combined_news_weighted_softmax.csv
   combined_conf_weighted_softmax.csv
