In [8]:
# --- CONFIG ---
# Tunable settings for the keyword-ranking cell.
MIN_FREQ = 10            # keep keywords that appear in at least this many articles
OUT_RANKS_CSV = "keyword_ranks.csv"  # path the ranking table is written to and later downloaded from


import io, json, math, ast, re
import numpy as np
import pandas as pd
from google.colab import files
from sklearn.feature_selection import mutual_info_regression

# ---------- Upload ----------
# Ask for a file via the Colab upload helper and load the first one as a CSV.
uploaded = files.upload()
assert uploaded, "No file uploaded."
csv_name = next(iter(uploaded))  # name of the first uploaded file
csv_bytes = uploaded[csv_name]
df = pd.read_csv(io.BytesIO(csv_bytes))
print("Loaded:", csv_name, "rows:", len(df))

# ---------- Helpers ----------
def _safe_list_from_keywords(x):
    """Robustly parse the keywords column which may be:
    - list already
    - stringified list like "['A', 'B']"
    - semicolon/comma-separated string
    - NaN/None
    Returns a list[str] (lowercased & stripped), de-duplicated per article.
    """
    if x is None or (isinstance(x, float) and math.isnan(x)):
        return []
    if isinstance(x, list):
        vals = [str(t).strip() for t in x if str(t).strip()]
    else:
        s = str(x).strip()
        if not s:
            return []
        # try to parse as literal list first
        if (s.startswith("[") and s.endswith("]")) or (s.startswith("(") and s.endswith(")")):
            try:
                maybe = ast.literal_eval(s)
                if isinstance(maybe, (list, tuple)):
                    vals = [str(t).strip() for t in maybe if str(t).strip()]
                else:
                    vals = [s]
            except Exception:
                vals = re.split(r"[;,\|]\s*", s)
        else:
            vals = re.split(r"[;,\|]\s*", s)
    # normalize: lowercase, basic dedup within article
    vals = [v.lower() for v in vals if v]
    return sorted(set(vals))

def _zscore(x):
    x = pd.Series(x).astype(float)
    return (x - x.mean()) / (x.std(ddof=0) + 1e-9)

# ---------- Basic checks ----------
# Fail fast if the loaded table lacks any column the analysis reads.
for col in ("keywords", "comments_total", "sentiment_mean_textblob"):
    assert col in df.columns, f"Missing required column: {col}"

# Parse keywords into one cleaned list per article
kw_lists = df["keywords"].map(_safe_list_from_keywords)
df["_kw_list"] = kw_lists

# Explode to one row per (article, keyword); rows without a keyword are dropped
exploded = (
    df[["comments_total", "sentiment_mean_textblob", "_kw_list"]]
    .explode("_kw_list")
    .rename(columns={"_kw_list": "keyword"})
    .dropna(subset=["keyword"])
)
exploded["keyword"] = exploded["keyword"].astype(str)

# Frequency per keyword, then keep only keywords with at least MIN_FREQ rows
kw_counts = exploded.groupby("keyword", as_index=False).size()
kw_counts = kw_counts.rename(columns={"size": "n_articles"})
is_frequent = kw_counts["n_articles"] >= MIN_FREQ
freq = kw_counts[is_frequent].sort_values("n_articles", ascending=False)

# Restrict the exploded rows to the surviving keywords
exploded_f = exploded.merge(freq[["keyword"]], on="keyword", how="inner")

# Per-keyword summary: row count plus mean/std of both targets
agg = (
    exploded_f
    .groupby("keyword")
    .agg(
        n_articles=("keyword", "size"),
        mean_comments=("comments_total", "mean"),
        std_comments=("comments_total", "std"),
        mean_sent=("sentiment_mean_textblob", "mean"),
        std_sent=("sentiment_mean_textblob", "std"),
    )
    .reset_index()
)


keep_keywords = set(freq["keyword"].tolist())
def to_indicator_row(lst):
    """Return a 0/1 list marking which kept keywords occur in `lst`.

    Position j corresponds to the j-th entry of sorted(keep_keywords).
    Iterating the set directly would give an arbitrary column order that
    need not match the sorted keyword order used for the feature matrix.
    """
    present = set(lst)
    return [1 if k in present else 0 for k in sorted(keep_keywords)]


kw_order = sorted(keep_keywords)
# Column position of each kept keyword in the indicator matrix.
kw_index = {k: j for j, k in enumerate(kw_order)}

# Article x keyword 0/1 indicator matrix. Preallocating keeps the shape 2-D
# even when there are no rows, and the index lookup touches only the keywords
# an article actually has instead of scanning every kept keyword per row.
X = np.zeros((len(df), len(kw_order)), dtype=np.uint8)
for i, ks in enumerate(df["_kw_list"]):
    for k in ks:
        j = kw_index.get(k)
        if j is not None:  # None means the keyword was removed by the MIN_FREQ filter
            X[i, j] = 1

# Raw targets, one value per article (same row order as X).
# NOTE(review): NaN targets are not filtered here and would propagate into
# the correlations computed from these arrays -- confirm the inputs are complete.
y_vol = df["comments_total"].astype(float).values
y_sent = df["sentiment_mean_textblob"].astype(float).values


# Float copy of the indicators and standardized targets for the correlations.
Xf = X.astype(float)
y_vol_z = _zscore(y_vol)
y_sent_z = _zscore(y_sent)

def pearson_with_y(Xz, yz):
    """Column-wise correlation of a 2-D array with a target vector.

    Each column of `Xz` is standardized (population std plus a small epsilon,
    so constant columns give 0), multiplied element-wise by `yz`, and averaged
    over rows. The result is a Pearson r only if `yz` is already z-scored.

    Returns a 1-D array with one value per column of `Xz`.
    """
    target_col = np.asarray(yz).reshape(-1, 1)
    col_mean = Xz.mean(axis=0)
    col_std = Xz.std(axis=0) + 1e-9
    standardized = (Xz - col_mean) / col_std
    return (standardized * target_col).mean(axis=0)

# Correlation of every keyword indicator column with the z-scored
# comment volume and the z-scored mean sentiment, respectively.
r_vol = pearson_with_y(Xf, y_vol_z)
r_sent = pearson_with_y(Xf, y_sent_z)


def rankdata(a):
    """Return the 1-based ranks of `a` as an array; ties get their average rank."""
    ranked = pd.Series(a).rank(method="average")
    return ranked.values
# Rank-based correlation: correlate each indicator column with the
# standardized ranks of the target instead of its raw values.
y_vol_rank_z = _zscore(rankdata(y_vol))
y_sent_rank_z = _zscore(rankdata(y_sent))
r_vol_spearman = pearson_with_y(Xf, y_vol_rank_z)
r_sent_spearman = pearson_with_y(Xf, y_sent_rank_z)


# Mutual information between each indicator column and the raw targets
mi_kwargs = dict(discrete_features=True, random_state=0)
mi_vol = mutual_info_regression(Xf, y_vol, **mi_kwargs)
mi_sent = mutual_info_regression(Xf, y_sent, **mi_kwargs)

# Build ranking table: one row per kept keyword, in kw_order
n_articles_by_kw = freq.set_index("keyword")["n_articles"].reindex(kw_order)
rank_df = pd.DataFrame(
    {
        "keyword": kw_order,
        "n_articles": n_articles_by_kw.values,
        "pearson_comments": r_vol,
        "spearman_comments": r_vol_spearman,
        "mi_comments": mi_vol,
        "pearson_sentiment": r_sent,
        "spearman_sentiment": r_sent_spearman,
        "mi_sentiment": mi_sent,
    }
)


def rank_composite(df, cols, prefix):
    """Add a `<prefix>_rank` column holding the mean rank across `cols`.

    Each column in `cols` is ranked by absolute value, largest first (rank 1
    is the strongest), and the per-column ranks are averaged row-wise.
    `df` is modified in place and also returned.
    """
    per_metric = {}
    for name in cols:
        # Absolute value so strong negative correlations rank as high as positive ones
        per_metric[name] = df[name].abs().rank(ascending=False)
    df[f"{prefix}_rank"] = pd.DataFrame(per_metric).mean(axis=1)
    return df

# Composite rank per target: volume first, then sentiment
rank_df = (
    rank_df
    .pipe(rank_composite, ["pearson_comments", "spearman_comments", "mi_comments"], "volume")
    .pipe(rank_composite, ["pearson_sentiment", "spearman_sentiment", "mi_sentiment"], "sentiment")
)

# Overall score: mean of volume_rank and sentiment_rank (lower = stronger)
rank_df["overall_rank"] = (rank_df["volume_rank"] + rank_df["sentiment_rank"]) / 2.0
rank_df = (
    rank_df
    .sort_values(["overall_rank", "n_articles"], ascending=[True, False])
    .reset_index(drop=True)
)

# Save and preview
rank_df.to_csv(OUT_RANKS_CSV, index=False)
display(rank_df.head(20))



Upload your 'article_level_targets_textblob.csv' …


Saving article_level_targets_textblob (1).csv to article_level_targets_textblob (1) (2).csv
Loaded: article_level_targets_textblob (1) (2).csv rows: 950
Saved keyword ranks -> keyword_ranks.csv


Unnamed: 0,keyword,n_articles,pearson_comments,spearman_comments,mi_comments,pearson_sentiment,spearman_sentiment,mi_sentiment,volume_rank,sentiment_rank,overall_rank
0,"trump, donald j",528,0.312967,0.400871,0.079427,-0.293847,-0.351515,0.09051,2.0,1.0,1.5
1,presidential election of 2016,243,0.261424,0.274053,0.062703,-0.180097,-0.222385,0.030774,3.0,7.0,5.0
2,united states politics and government,531,0.341392,0.408227,0.083744,-0.171239,-0.162037,0.061817,1.0,10.333333,5.666667
3,russia,115,0.211664,0.231745,0.035684,-0.152098,-0.216747,0.043379,4.666667,8.666667,6.666667
4,federal bureau of investigation,90,0.168298,0.181104,0.022225,-0.207345,-0.273067,0.037207,9.0,4.666667,6.833333
5,cyberwarfare and defense,63,0.220522,0.205798,0.024955,-0.129461,-0.193596,0.034325,6.0,12.333333,9.166667
6,justice department,58,0.159344,0.169324,0.016331,-0.1538,-0.209546,0.029039,11.666667,11.333333,11.5
7,espionage and intelligence services,36,0.206878,0.172387,0.031183,-0.118534,-0.178065,0.030877,7.333333,16.333333,11.833333
8,"comey, james b",51,0.225878,0.202669,0.033802,-0.120545,-0.1709,0.011304,5.0,25.666667,15.333333
9,russian interference in 2016 us elections and ...,202,0.050841,0.126388,0.017499,-0.240376,-0.313751,0.030616,27.0,4.666667,15.833333


In [9]:
from google.colab import files
# Hand the ranking CSV written by the previous cell to Colab's download helper.
files.download(OUT_RANKS_CSV)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [12]:
# Reload the ranking table from the configured output path so this cell
# follows OUT_RANKS_CSV instead of a duplicated filename literal.
kw = pd.read_csv(OUT_RANKS_CSV)

MIN_ARTICLES = 25  # a keyword must appear in at least this many articles
MAX_RANK = 200     # ...and have a volume or sentiment composite rank at most this

is_frequent = kw["n_articles"] >= MIN_ARTICLES
is_top_ranked = (kw["volume_rank"] <= MAX_RANK) | (kw["sentiment_rank"] <= MAX_RANK)
selected = kw[is_frequent & is_top_ranked]

# Rich display keeps all columns readable; print() wraps and truncates them.
display(selected)


                                               keyword  n_articles  \
0                                      trump, donald j         528   
1                        presidential election of 2016         243   
2                united states politics and government         531   
3                                               russia         115   
4                      federal bureau of investigation          90   
5                             cyberwarfare and defense          63   
6                                   justice department          58   
7                  espionage and intelligence services          36   
8                                       comey, james b          51   
9    russian interference in 2016 us elections and ...         202   
10                                    sanders, bernard          58   
11           special prosecutors (independent counsel)          69   
12                            house of representatives          67   
13                  

In [13]:
# Number of keywords that passed the selection filter
print(len(selected))

44


In [14]:
# Write the selected keywords to disk, then hand the file to Colab's download helper.
selected_csv = "selected.csv"
selected.to_csv(selected_csv, index=False)
from google.colab import files
files.download(selected_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>