In [62]:
import pandas as pd
from pathlib import Path
import glob

DATA_FOLDER = Path("../data/cleaned")   # folder where your CSVs are
output_csv = Path("../results")

dfs = []

for csv_path in DATA_FOLDER.glob("*.csv"):
    outlet_name = csv_path.stem
    df = pd.read_csv(csv_path)
    
    df["outlet"] = outlet_name                    # add outlet name

    dfs.append(df)

data = pd.concat(dfs, ignore_index=True)

# Rename the outlets from filename to media outlet name
rename_map = {
    "zelensky_ap_filtered": "Associated Press",
    "zelensky_cbc_filtered": "CBC News",
    "zelensky_ctv_filtered": "CTV News",
    "zelensky_foxnews_filtered": "Fox News",
    "zelensky_globalnews_filtered": "Global News",
    "zelensky_nbc_filtered": "NBC News",
    "zelensky_npr_filtered": "NPR",
    "zelensky_nytimes_filtered": "NY Times",
    "zelensky_wsj_filtered": "The Wall Street Journal News",
    "zelensky_wsjopinions_filtered": "The Wall Street Journal Opinions"
}

data["outlet"] = data["outlet"].replace(rename_map)

In [64]:
data = data.dropna(subset=["label", "short_opening"])

In [66]:
topic_docs = data.groupby("label")["short_opening"].apply(lambda x: " ".join(x)).reset_index()
# Normalize U.S. to US
topic_docs["short_opening"] = (
    topic_docs["short_opening"]
        .str.replace(r"\bU\.S\.\b", "US", regex=True)
        .str.replace(r"\bU\.S\b", "US", regex=True)
        .str.replace(r"\bU\. S\.\b", "US", regex=True)
)

In [68]:
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS

custom_stopwords = {
    "zelensky", "zelenskyy", "volodymyr",
    "ukraine", "ukrainian", "president"
}

stop_words = ENGLISH_STOP_WORDS.union(custom_stopwords)
stop_words = list(stop_words)   # convert frozenset â†’ list

vectorizer = TfidfVectorizer(
    stop_words=stop_words,
    lowercase=True,
    token_pattern=r"[a-zA-Z']+",
    max_features=5000,
)

tfidf_matrix = vectorizer.fit_transform(topic_docs["short_opening"])
feature_names = vectorizer.get_feature_names_out()


In [70]:
top_words_per_topic = {}

for i, topic in enumerate(topic_docs["label"]):
    row = tfidf_matrix[i].toarray().flatten()
    top_indices = row.argsort()[::-1][:10]  # top 10
    top_words = [(feature_names[i], row[i]) for i in top_indices]
    top_words_per_topic[topic] = top_words

In [72]:
for topic, words in top_words_per_topic.items():
    print(f"\n=== {topic} ===")
    for w, score in words:
        print(f"{w:20s}  {score:.4f}")


=== Aid Policies ===
aid                   0.2697
congress              0.2172
russia                0.1801
house                 0.1737
military              0.1501
ukraine's             0.1351
zone                  0.1201
war                   0.1201
leader                0.1201
weapons               0.1180

=== Diplomatic Engagements ===
trump                 0.3833
talks                 0.2088
peace                 0.1711
met                   0.1676
russia                0.1645
said                  0.1633
meeting               0.1624
leaders               0.1506
war                   0.1402
putin                 0.1341

=== Domestic Governance ===
said                  0.1975
dismissal             0.1966
corruption            0.1648
zaluzhny              0.1475
commander             0.1475
gen                   0.1475
valery                0.1475
anti                  0.1475
independence          0.1236
country's             0.1103

=== Economic Structure ===
advised            

In [79]:
import numpy as np
import pandas as pd

rows = []

for i, topic in enumerate(topic_docs["label"]):
    row = tfidf_matrix[i].toarray().ravel()
    sorted_idx = np.argsort(row)[::-1]  # indices from highest to lowest tf-idf

    count = 0
    for idx in sorted_idx:
        word = feature_names[idx]
        score = row[idx]

        rows.append({
            "label": topic,
            "rank": count + 1,
            "word": word,
            "tfidf": score
        })
        count += 1
        if count == 10:   # top 10 per topic
            break

# Make a DataFrame and save
df_top = pd.DataFrame(rows)
df_top.to_csv(output_csv / f"top10_tfidf_per_topic.csv", index=False)

df_top.head()

Unnamed: 0,label,rank,word,tfidf
0,Aid Policies,1,aid,0.269679
1,Aid Policies,2,congress,0.217174
2,Aid Policies,3,russia,0.180076
3,Aid Policies,4,house,0.17374
4,Aid Policies,5,military,0.150063
