# Data Analysis
### 2025

<br>

```
1. Zavtra
data_eastview/zavtra_soros_final_01102025.csv
Link: https://drive.google.com/file/d/1wPyHPmYYfPkKeXa7eS6D5fnp0snbjgwd/view?usp=drive_link

2. Tsargrad
data_eastview/
tsargrad_soros_translated_01102025_final.csv
Link: https://drive.google.com/file/d/1KWhHCw3KUyTVJ5MivGWSDOcyzcWDXHuo/view?usp=drive_link

3. Eastlink
data_eastview/
eastlink_soros_final_01102025.csv
Link: https://drive.google.com/file/d/1coS3_R5uHDVHlJbs-LdD3DnG9h7O5bjW/view?usp=drive_link
Note: Eastlink contains 11 publications.

4. RT data
`data/final_dataset_updated.csv `
Link: https://drive.google.com/file/d/1-DJTaPm1QJ-orCQ5RPRfm9sJ1wFfMVFq/view?usp=drive_link
Note: This is from previous work
Renamed here as `final_dataset_updated_rt.csv`
```


### Publications (3. Eastlink)
data_eastview/
eastlink_soros_final_01102025.csv
```
Publications = [
    "Kommersant",
    "Literaturnaia gazeta",
    "Nezavisimaia gazeta",
    "Novaia gazeta",
    "Pravda",
    "Slovo",
    "Sovetskaia Rossiia",
    "Trud",
    "Vedomosti",
    "Время MH",
    "Общая газета"
]
```

<br>

### Sentiment Analysis
NER models: en_core_web_sm (spaCy) and dslim/bert-large-NER (Transformers)
https://github.com/fhamborg/NewsMTSC/blob/main/READMEpypi.md
https://aclanthology.org/2021.eacl-main.142.pdf
<br>




# Set Up (1) Jupyter Notebook or (2) Google Notebook

In [None]:
# Import libraries for all (1) and (2)
import csv
import requests
import xml.etree.ElementTree as ET
import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import pandas.api.types as ptypes
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import pandas.api.types as ptypes
from matplotlib.ticker import MaxNLocator
import re

import warnings
warnings.filterwarnings("ignore", message="Could not infer format, .*")

---

-- Set Up (1) Jupyter Notebook (uncomment to use)

In [None]:
%pip install -q scikit-learn

import sklearn
import sys
print("sklearn:", sklearn.__version__)
print("python exe:", sys.executable)

## Load Data

In [None]:
# Load data
# Import data (locally)
# All four CSVs are read from the current working directory; "utf-8-sig"
# transparently drops a leading byte-order mark if a file has one.
# NOTE(review): the dataset list at the top of this notebook and the Colab
# loader both name the Tsargrad file
# "tsargrad_soros_translated_01102025_final.csv" — confirm that the file name
# used below is the intended one.
tsargrad_data = pd.read_csv("tsargrad_soros_final_01102025.csv", encoding="utf-8-sig")
zavtra_data   = pd.read_csv("zavtra_soros_final_01102025.csv",   encoding="utf-8-sig")
eastview_data = pd.read_csv("eastlink_soros_final_01102025.csv", encoding="utf-8-sig")
eir_data = pd.read_csv("combined_eir_output_data.csv", encoding="utf-8-sig")

# Quick sanity check: (rows, columns) of each frame, in load order.
print(tsargrad_data.shape, zavtra_data.shape, eastview_data.shape, eir_data.shape)

### Remove 'WARNING' Statement at start of `eastview_data`
1. Check lengths
2. View ArticleTextEnglish
3. Check rows before and after clean

In [None]:
# Length statistics for the English article text (missing text counts as "").
s = eastview_data["ArticleTextEnglish"].astype("string")
text_filled = s.fillna("")
text_lengths = text_filled.str.len()

print("rows:", len(s))
print("non-empty:", text_filled.str.strip().ne("").sum())
print("median length:", int(text_lengths.median()))
print("max length:", int(text_lengths.max()))

# Very short texts are candidates for "boilerplate only" rows.
print("very short (<120 chars):", int(text_lengths.lt(120).sum()))


In [None]:
cols_meta = [c for c in ["ArticleID","ArticleTitle","Publication","Year","Month","ArticleLink"]
             if c in eastview_data.columns]

def sample_preview(df, n=5, seed=42, text_col="ArticleTextEnglish", head_chars=1000):
    """Print a reproducible random sample of rows for eyeballing article text.

    For each sampled row this prints the row index, a one-line summary of the
    metadata columns that exist in ``df``, and the first ``head_chars``
    characters of ``text_col``.

    Args:
        df: Frame to sample from.
        n: Number of rows to show; capped at ``len(df)`` so small frames do
            not raise.
        seed: Random state passed to ``DataFrame.sample``.
        text_col: Column holding the article text.
        head_chars: Maximum number of characters of text to print per row.
    """
    # Resolve metadata columns against the frame actually being previewed,
    # so the helper also works for frames other than eastview_data.
    meta_cols = [
        c
        for c in ["ArticleID", "ArticleTitle", "Publication", "Year", "Month", "ArticleLink"]
        if c in df.columns
    ]
    sub = df.sample(n=min(n, len(df)), random_state=seed)
    for i, (_, r) in enumerate(sub.iterrows(), 1):
        print(f"\n#{i} — index={r.name}")
        print(" | ".join([f"{c}={r.get(c, '')}" for c in meta_cols]))
        value = r[text_col]
        # NaN is truthy and pd.NA has no truth value, so `value or ""` cannot
        # be used to blank out missing text; test for missingness explicitly.
        txt = "" if pd.isna(value) else str(value)
        print("-" * 60)
        print(txt[:head_chars].strip())
        if len(txt) > head_chars:
            print("… [truncated]")
        print("-" * 60)

sample_preview(eastview_data, n=5, seed=7)


In [None]:
import re

# Three progressively looser patterns for the East View disclaimer that
# precedes some article texts. They are tried in this order.

# 1) Preferred: from ATTENTION…COPYRIGHT up to *and including* "Welcome To East View"
EV_PREFIX_TO_WELCOME_RE = re.compile(
    r"""^\s*ATTENTION:.*?COPYRIGHT      # ATTENTION … COPYRIGHT …
        [\s\S]*?                        # anything (non-greedy)
        Welcome\W+To\W+East\W*View\s+   # … up to & including 'Welcome To East View'
    """,
    flags=re.IGNORECASE | re.VERBOSE
)

# 2) Smart alternative endings when 'Welcome...' is absent
EV_PREFIX_SMART_RE = re.compile(
    r"""^\s*ATTENTION:.*?COPYRIGHT.*?        # ATTENTION … COPYRIGHT …
        (?:East\W*View.*?\.\s+               # … up to a sentence ending that mentions East View
         |https?://[^\s)]+[^.]*\.\s+         # … or a URL sentence end
        )
    """,
    flags=re.IGNORECASE | re.DOTALL | re.VERBOSE,
)

# 3) Fallback: to the first period that follows COPYRIGHT
EV_PREFIX_FALLBACK_RE = re.compile(
    r"""^\s*ATTENTION:.*?COPYRIGHT.*?\.\s+""",
    flags=re.IGNORECASE | re.DOTALL,
)

# Non-breaking / figure / narrow spaces become plain spaces and every line
# ending becomes "\n"; the order matters ("\r\n" before a lone "\r").
_EV_WHITESPACE_FIXES = (
    ("\u00A0", " "),
    ("\u2007", " "),
    ("\u202F", " "),
    ("\r\n", "\n"),
    ("\r", "\n"),
)


def strip_ev_prefix_inline(text: str) -> str:
    """Remove a leading East View ATTENTION/COPYRIGHT disclaimer from *text*.

    Non-string input is returned unchanged. String input always has its odd
    spaces and line endings normalised. The disclaimer is only removed when
    both "attention:" and "copyright" occur (case-insensitively) within the
    first 1000 characters, in which case the result is also left-stripped.
    """
    if not isinstance(text, str):
        return text

    normalized = text
    for old, new in _EV_WHITESPACE_FIXES:
        normalized = normalized.replace(old, new)

    opening = normalized[:1000].lower()
    if "attention:" not in opening or "copyright" not in opening:
        return normalized

    # Stop at the first pattern that actually removes something.
    for pattern in (EV_PREFIX_TO_WELCOME_RE, EV_PREFIX_SMART_RE, EV_PREFIX_FALLBACK_RE):
        candidate = pattern.sub("", normalized, count=1)
        if candidate != normalized:
            return candidate.lstrip()
    return normalized.lstrip()


### Check before cleaning, then run the cleaning function

In [None]:
# Count rows that start with the East View disclaimer, then strip it in place.
raw_text = eastview_data["ArticleTextEnglish"]
# Use the nullable "string" dtype for the check: plain astype(str) would turn
# missing articles into the literal text "nan".
s = raw_text.astype("string")
starts_with_disclaimer = (
    s.str.match(r"^\s*ATTENTION:.*COPYRIGHT", case=False).fillna(False).astype(bool)
)
print("Rows starting with EV preface:", int(starts_with_disclaimer.sum()))
# Clean only the rows that have text; missing values stay missing instead of
# being written back as the string "nan".
eastview_data["ArticleTextEnglish"] = raw_text.where(
    raw_text.isna(), raw_text.astype(str).map(strip_ev_prefix_inline)
)

### Check after function clean

In [None]:
s2 = eastview_data["ArticleTextEnglish"].astype(str)
still_has_prefix = s2.str.match(r"^\s*ATTENTION:.*COPYRIGHT", case=False)
print("Rows still starting with preface after clean:", int(still_has_prefix.sum()))

### Check same indices before

In [None]:
def preview_idx(idx, col="ArticleTextEnglish", head=500):
    """Print the metadata and the first *head* characters of *col* for one row.

    The row is looked up by label in the module-level ``eastview_data`` frame.
    """
    row = eastview_data.loc[idx]
    meta_parts = []
    for k in ["ArticleID","ArticleTitle","Publication","Year","Month","ArticleLink"]:
        if k in eastview_data.columns:
            meta_parts.append(f"{k}={row.get(k,'')}")
    meta = " | ".join(meta_parts)
    body = str(row[col])

    print(f"\nindex={idx} :: {meta}\n" + "-"*70)
    print(body[:head])
    if len(body) > head:
        print("… [truncated]")

for i in [1930, 2360, 1225, 640, 1347]:
    preview_idx(i)


In [None]:
problem_idx = s2[still_has_prefix].head(5).index.tolist()
for i in problem_idx:
    print("\n--- stubborn start @ index", i, "---")
    print(s2.loc[i][:600])

---

-- Set Up (2) Google Notebook (uncomment to use)

In [None]:
# import os
# from google.colab import drive
# from google.colab import files

# from google.colab import drive
# drive.mount('/content/drive', force_remount=True)

# from psutil import virtual_memory
# ram_gb = virtual_memory().total / 1e9
# print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# if ram_gb < 20:
#   print('Not using a high-RAM runtime')
# else:
#   print('You are using a high-RAM runtime!')


# SVD imports for Google Colab
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.decomposition import TruncatedSVD

In [None]:
# # Google drive load data
# # Load EIR Data (eir_data)
# eir_path = '/content/drive/MyDrive/george_soros/notebooks/data/combined_eir_output_data.csv'
# eir_data = pd.read_csv(eir_path)
# # eir_data.head()

# # ------------
# # Load Eastview
# eastview_path = '/content/drive/MyDrive/george_soros/notebooks/data_eastview/eastlink_soros_final_01102025.csv'
# eastview_data = pd.read_csv(eastview_path)
# # eastview_data.head()

# # ------------
# # Load Zavtra
# # data_eastview/zavtra_soros_final_01102025.csv
# zavtra_path = '/content/drive/MyDrive/george_soros/notebooks/data_eastview/zavtra_soros_final_01102025.csv'
# zavtra_data = pd.read_csv(zavtra_path)
# # zavtra_data.head()

# # ------------
# # Load Tsargrad
# tsargrad_path = '/content/drive/MyDrive/george_soros/notebooks/data_eastview/tsargrad_soros_translated_01102025_final.csv'
# tsargrad_data = pd.read_csv(tsargrad_path)
# # tsargrad_data.head()


-- END OF SET UP --

# Data Cleaning

In [None]:
# Load RT data
# Reads the combined sentence-level dataset and keeps only RT rows that have a
# positive sentence score; the result is `rt_data`.
# Set nullable string dtype
STR_COLS = ["content", "soros_sentence", "uid", "source"]
dtype_map = {c: "string" for c in STR_COLS}

combined_df = pd.read_csv(
    "final_dataset_updated_rt.csv",
    encoding="utf-8-sig",
    dtype=dtype_map,   # avoid mixed types in these
    low_memory=False
)

# Coerce numeric columns (anything non-numeric becomes NaN)
for c in ["score_soros_sentence", "soros", "year"]:
    if c in combined_df.columns:
        combined_df[c] = pd.to_numeric(combined_df[c], errors="coerce")

# Parse date if present (looks like YYYYMMDD as int/string)
if "date" in combined_df.columns:
    # keep original in case you need it
    combined_df["_date_raw"] = combined_df["date"].astype("string")
    # NOTE(review): if "date" was read as float (e.g. because of blank cells)
    # its string form is like "20220701.0", which "%Y%m%d" will not parse and
    # errors="coerce" will turn into NaT — confirm the column's dtype.
    combined_df["date"] = pd.to_datetime(combined_df["_date_raw"], format="%Y%m%d", errors="coerce")

# Normalize source to lowercase for reliable filtering
if "source" in combined_df.columns:
    combined_df["source"] = combined_df["source"].str.strip().str.lower()

# keep only rows with a positive score
# NOTE(review): unlike the guarded steps above, the two masks below index
# "score_soros_sentence" and "source" directly and raise KeyError if either
# column is absent.
mask_scored = combined_df["score_soros_sentence"].notna() & (combined_df["score_soros_sentence"] > 0)

# source == 'rt' (case-insensitive because we normalized)
mask_rt = combined_df["source"].eq("rt")

# .copy() so later column assignments on rt_data do not touch combined_df
rt_data = combined_df.loc[mask_scored & mask_rt].copy()

print(rt_data.shape)
# `display` is not defined in this file; presumably the IPython/Jupyter builtin.
display(rt_data.head())

In [None]:
# Column-name hints used by the audit helpers below.
CRITICAL_KEYS = ["ArticleID", "ArticleLink", "uid"]
TEXT_COL_HINTS = ["ArticleTextEnglish", "article_text", "translated_article_excerpt", "content", "soros_sentence"]
DATE_COL_HINTS = ["PublicationDate", "PublishedDate", "publication_date", "translated_date", "Date", "date"]

def first_present(df, cols):
    """Return the first name in *cols* that is a column of *df*, else None."""
    for c in cols:
        if c in df.columns:
            return c
    return None

def _parse_date_column(values):
    """Parse a column to datetimes, reading 8-digit YYYYMMDD values explicitly.

    Generic ``pd.to_datetime`` interprets a number such as 20220701 as
    nanoseconds since the epoch (1970-01-01). Values that look like YYYYMMDD
    are therefore parsed with ``format="%Y%m%d"``; everything else keeps the
    generic, coercing parse.
    """
    if pd.api.types.is_datetime64_any_dtype(values):
        return values
    generic = pd.to_datetime(values, errors="coerce")
    if pd.api.types.is_numeric_dtype(values) and not pd.api.types.is_bool_dtype(values):
        numbers = pd.to_numeric(values, errors="coerce")
        whole = numbers.notna() & (numbers % 1 == 0)
        # Int64 keeps missing values while dropping a trailing ".0"
        digits = numbers.where(whole).astype("Int64").astype("string")
    else:
        digits = values.astype("string").str.strip()
    compact = digits.str.fullmatch(r"\d{8}").fillna(False).astype(bool)
    if not compact.any():
        return generic
    fixed = pd.to_datetime(digits.where(compact), format="%Y%m%d", errors="coerce")
    return fixed.where(compact, generic)

def parse_best_date(df):
    """Try to parse a reasonable date column to datetime; return (colname, parsed_series).

    Columns named in DATE_COL_HINTS are tried first, in that order, then every
    object/string column. The first column that yields at least one non-null
    datetime wins. Returns ``(None, all-NaT Series)`` when nothing parses.
    """
    # prioritize our hints
    for col in [c for c in DATE_COL_HINTS if c in df.columns]:
        dt = _parse_date_column(df[col])
        if dt.notna().any():
            return col, dt
    # otherwise try any object-like column
    for col in df.select_dtypes(include=["object", "string"]).columns:
        dt = _parse_date_column(df[col])
        if dt.notna().any():
            return col, dt
    return None, pd.Series(pd.NaT, index=df.index)

def audit(df: pd.DataFrame, name: str) -> None:
    """Print and display a quick data-quality overview of *df*.

    Sections, in order: shape and first rows, dtypes, missing values, unique
    counts, duplicate checks on CRITICAL_KEYS, text-length summary, numeric
    ranges for a few known columns, and the date range of the best parseable
    date column. Nothing is returned and *df* is not modified.

    Args:
        df: Frame to inspect.
        name: Label used in the printed header.
    """
    # `display` is not defined in this file; presumably the IPython/Jupyter builtin.
    print(f"\n=== {name}: shape {df.shape} ===")
    display(df.head(3))

    # dtypes (compact)
    dtypes = df.dtypes.to_frame("dtype")
    display(dtypes.T)

    # missingness
    miss = df.isna().sum().to_frame("missing")
    miss["missing_pct"] = (miss["missing"] / len(df) * 100).round(2)
    miss_sorted = miss.sort_values("missing_pct", ascending=False)
    print("Missing values (top 10):")
    display(miss_sorted.head(10))

    # uniques
    nunique = df.nunique(dropna=True).sort_values(ascending=False)
    print("Top uniques (top 10):")
    display(nunique.head(10).to_frame("nunique"))

    # duplicates on common keys
    for key in CRITICAL_KEYS:
        if key in df.columns:
            # counts every row after the first occurrence of a key value
            dup = df.duplicated(subset=[key]).sum()
            if dup:
                print(f"WARNING: {dup} duplicate rows on key '{key}'")
    
    # text length quick peek
    # only the first matching column from TEXT_COL_HINTS is summarised
    txt_col = first_present(df, TEXT_COL_HINTS)
    if txt_col:
        lens = df[txt_col].dropna().astype(str).str.len()
        print(f"Text column: '{txt_col}' | non-null: {lens.size} | len[min/median/max]:",
              int(lens.min()) if not lens.empty else None,
              int(lens.median()) if not lens.empty else None,
              int(lens.max()) if not lens.empty else None)

    # numeric range checks commonly used here
    for col in ["probability", "score_soros_sentence", "soros_count", "year"]:
        if col in df.columns:
            # coerce so that stray non-numeric cells are ignored rather than raising
            coln = pd.to_numeric(df[col], errors="coerce")
            valid = coln.notna()
            if valid.any():
                print(f"Numeric '{col}': min={coln[valid].min()}, median={coln[valid].median()}, max={coln[valid].max()}")

    # date coverage
    date_col, dt = parse_best_date(df)
    if date_col:
        dt_nm = dt.dropna()
        if not dt_nm.empty:
            print(f"Date column used: '{date_col}' | range: {dt_nm.min().date()} → {dt_nm.max().date()}")
        else:
            print(f"Date column candidate '{date_col}' could not be parsed to any non-null datetimes.")
    else:
        print("No parseable date column found.")

    print("===")

In [None]:
audit(tsargrad_data, "Tsargrad")
audit(zavtra_data,   "Zavtra")
audit(eastview_data, "Eastlink/EastView")
audit(eir_data,      "EIR")
audit(rt_data,       "RT (filtered)")

# Review notes
- Zavtra: dates come from PublicationDate / ArticleDate, which contain Russian month names (e.g., “29 марта 2025”) and/or English ones (“23 July 2024”).
- EastView: build dates from Year + Month (or ExtractedDate) instead of “Date collected”.
- EIR: dates such as 20220701 must be parsed with a fixed format (%Y%m%d); generic parsing turns them into 1970-01-01.
- Tsargrad: translated_date already parses fine.
- Drop the all-NaN “Unnamed” columns and standardise a year_month column.

In [None]:
# for name, df in [("Tsargrad", tsargrad_data), ("Zavtra", zavtra_data), ("EastView", eastview_data), ("EIR", eir_data)]:
#     s = df["year_month"].value_counts().sort_index()
#     print(f"\n{name} months: {len(s)} distinct")
#     display(s.tail(5))


In [None]:
# Save
tsargrad_data.to_csv("tsargrad_soros_final_01102025_clean.csv", index=False)
zavtra_data.to_csv("zavtra_soros_final_01102025_clean.csv", index=False)
eastview_data.to_csv("eastlink_soros_final_01102025_clean.csv", index=False)
eir_data.to_csv("eir_clean.csv", index=False)

In [None]:
zavtra_data.head()
# zavtra_data.columns.tolist()
# zavtra_data.info()

In [None]:
tsargrad_data.head()
# tsargrad_data.columns.tolist()
# tsargrad_data.info()

In [None]:
eastview_data.head()
# eastview_data.columns.tolist()
# eastview_data.info()

In [None]:
eir_data.head()
# eir_data.columns.tolist()
# eir_data.info()

In [None]:
rt_data.head()
# rt_data.columns.tolist()
# rt_data.info()

# Data Analysis Set up

---

## Set Colors & publication maps
- https://colorbrewer2.org/#type=sequential&scheme=BuGn&n=3
- https://www.simplifiedsciencepublishing.com/resources/best-color-palettes-for-scientific-figures-and-data-visualizations

In [None]:
import numpy as np
import pandas as pd

# Colour per EastView publication, keyed by the normalised publication key.
custom_palette = {
    "kommersant": "#ff7f0e",            # vivid orange
    "literaturnaia_gazeta": "#1b9e77",  # teal-green
    "nezavisimaia_gazeta": "#d62728",   # red
    "novaia_gazeta": "#9467bd",         # purple
    "pravda": "#8c564b",                # brown
    "slovo": "#e377c2",                 # pink
    "sovetskaia_rossiia": "#7f7f7f",    # grey
    "trud": "#bcbd22",                  # yellow-green
    "vedomosti": "#17becf",             # cyan
    "b_mh": "#377eb8",                  # blue
    "о_г": "#ffae00",                   # yellow
}

# Single-source palettes (one colour each).
eir_colors = dict(eir="#1f77b4")        # blue
rt_colors = dict(rt="#4daf4a")          # green
ztra = dict(Zavtra="#984ea3")           # violet
tsar_colors = dict(Tsargrad="#a65628")  # chestnut brown

# Master palette used in plots: later dicts win on a key clash, in the same
# order as the individual palettes are listed here.
master_palette = {
    **custom_palette,
    **eir_colors,
    **rt_colors,
    **ztra,
    **tsar_colors,
}

# EastView publication name -> palette key. The transliterated titles map to
# their lowercase, underscore-joined form; the Cyrillic titles are explicit.
eastview_map = {
    title: title.lower().replace(" ", "_")
    for title in (
        "Kommersant",
        "Literaturnaia gazeta",
        "Nezavisimaia gazeta",
        "Novaia gazeta",
        "Pravda",
        "Slovo",
        "Sovetskaia Rossiia",
        "Trud",
        "Vedomosti",
    )
}
eastview_map["Время MH"] = "b_mh"   # latin H
eastview_map["Время МН"] = "b_mh"   # cyrillic Н (just in case)
eastview_map["Общая газета"] = "о_г"


## Helpers

In [None]:
def derive_neg_prob(df):
    """Derive a per-row negativity probability from the label and its probability.

    Reads the ``score`` (label) and ``probability`` columns. With ``p`` the
    numeric probability: "negative" gives ``p``, "positive" gives ``1 - p``,
    "neutral" gives 0.5, and any other label gives NaN. Labels are compared
    case-insensitively. If either column is missing, an all-NaN Series is
    returned. The result shares ``df``'s index.
    """
    if not {"probability", "score"}.issubset(df.columns):
        return pd.Series([np.nan]*len(df), index=df.index)

    prob = pd.to_numeric(df["probability"], errors="coerce")
    label = df["score"].astype(str).str.lower()

    conditions = [
        label.eq("negative").to_numpy(),
        label.eq("positive").to_numpy(),
        label.eq("neutral").to_numpy(),
    ]
    choices = [prob.to_numpy(), (1 - prob).to_numpy(), 0.5]
    values = np.select(conditions, choices, default=np.nan)
    return pd.Series(values, index=df.index)

def _coerce_dates(values):
    """Parse a column to datetimes, reading 8-digit YYYYMMDD values explicitly.

    Generic ``pd.to_datetime`` interprets a number such as 20220701 as
    nanoseconds since the epoch (1970-01-01). Values that look like YYYYMMDD
    are therefore parsed with ``format="%Y%m%d"``; everything else keeps the
    generic, coercing parse.
    """
    if pd.api.types.is_datetime64_any_dtype(values):
        return values
    generic = pd.to_datetime(values, errors="coerce")
    if pd.api.types.is_numeric_dtype(values) and not pd.api.types.is_bool_dtype(values):
        numbers = pd.to_numeric(values, errors="coerce")
        whole = numbers.notna() & (numbers % 1 == 0)
        # Int64 keeps missing values while dropping a trailing ".0"
        digits = numbers.where(whole).astype("Int64").astype("string")
    else:
        digits = values.astype("string").str.strip()
    compact = digits.str.fullmatch(r"\d{8}").fillna(False).astype(bool)
    if not compact.any():
        return generic
    fixed = pd.to_datetime(digits.where(compact), format="%Y%m%d", errors="coerce")
    return fixed.where(compact, generic)

def ensure_year_month(df):
    """Return a month-start datetime Series for *df*.

    If ``df`` already has a ``year_month`` column it is parsed and returned
    as-is. Otherwise the first candidate date column that yields at least one
    valid datetime is used, truncated to the first day of its month. Returns
    an all-NaT Series (indexed like ``df``) when no candidate parses.
    """
    if "year_month" in df.columns:
        return pd.to_datetime(df["year_month"], errors="coerce")
    # cheap fallback for one-off cases
    for c in ["PublicationDate","PublishedDate","publication_date","translated_date","ExtractedDate","Date","date"]:
        if c in df.columns:
            dt = _coerce_dates(df[c])
            if dt.notna().any():
                return dt.dt.to_period("M").dt.to_timestamp(how="start")
    return pd.Series(pd.NaT, index=df.index)


## Build article-level frames for each source

In [None]:
def _mention_counts(frame, column):
    """Return integer mention counts from *column*, or all zeros if it is absent.

    ``pd.to_numeric(frame.get(column, 0))`` returns a bare scalar when the
    column is missing, and a scalar has no ``.fillna``; building the zero
    Series explicitly keeps the "missing column means 0 mentions" intent.
    """
    if column not in frame.columns:
        return pd.Series(0, index=frame.index, dtype=int)
    return pd.to_numeric(frame[column], errors="coerce").fillna(0).astype(int)


# ---- EastView (11 pubs) ----
ev = eastview_data.copy()

# publication key for colors + a nice display name
# (publications not listed in eastview_map fall back to lowercase_with_underscores)
ev["Publication_key"] = ev["Publication"].map(eastview_map).fillna(
    ev["Publication"].astype(str).str.lower().str.replace(" ", "_", regex=False)
)
ev["Publication_display"] = ev["Publication"]  # keep original pretty label

# mentions and negativity
ev["mentions"] = _mention_counts(ev, "soros_count")
ev["neg_prob"] = derive_neg_prob(ev)

# time
ev["year_month"] = ensure_year_month(ev)
ev["year"] = ev["year_month"].dt.year

# Use ArticleID if present, else ArticleLink as article key
ev["article_key"] = ev["ArticleID"] if "ArticleID" in ev.columns else ev["ArticleLink"]
ev_articles = ev.dropna(subset=["article_key"]).copy()

# ---- Zavtra ----
zv = zavtra_data.copy()
zv["Publication_key"] = "Zavtra"          # use display label as key (palette has it)
zv["Publication_display"] = "Zavtra"
zv["mentions"] = _mention_counts(zv, "soros_count")
zv["neg_prob"] = derive_neg_prob(zv)
zv["year_month"] = ensure_year_month(zv)
zv["year"] = zv["year_month"].dt.year
zv["article_key"] = zv["ArticleLink"] if "ArticleLink" in zv.columns else zv.index
zv_articles = zv.dropna(subset=["article_key"]).copy()

# ---- Tsargrad ----
ts = tsargrad_data.copy()
ts["Publication_key"] = "Tsargrad"
ts["Publication_display"] = "Tsargrad"
ts["mentions"] = _mention_counts(ts, "soros_count")
ts["neg_prob"] = derive_neg_prob(ts)
ts["year_month"] = ensure_year_month(ts)
ts["year"] = ts["year_month"].dt.year
ts["article_key"] = ts["url"] if "url" in ts.columns else ts.index
ts_articles = ts.dropna(subset=["article_key"]).copy()

# ---- RT (group sentences by article uid) ----
rt = rt_data.copy()
rt["Publication_key"] = "rt"
rt["Publication_display"] = "RT"
rt["year_month"] = ensure_year_month(rt)
rt["year"] = rt["year_month"].dt.year

# NOTE(review): groupby drops rows whose group keys are missing (pandas
# default dropna=True), so RT sentences with no uid or no parseable date are
# excluded here, whereas the other sources keep undated articles — confirm
# this asymmetry is intended.
rt_grouped = (
    rt.groupby(["uid","Publication_key","Publication_display","year_month","year"], as_index=False)
      .agg(mentions=("soros", "sum"),
           rt_prob=("score_soros_sentence","mean"))   # sentence-level average per article
)
rt_grouped["article_key"] = rt_grouped["uid"]

# ---- EIR (one row = one article) ----
eir = eir_data.copy()
eir["Publication_key"] = "eir"
eir["Publication_display"] = "EIR"
eir["mentions"] = _mention_counts(eir, "soros")
eir["year_month"] = ensure_year_month(eir)
eir["year"] = eir["year_month"].dt.year
# set rt_prob/neg_prob to NaN (no sentiment model here)
eir["neg_prob"] = np.nan
eir["rt_prob"] = np.nan
eir["article_key"] = eir.index
eir_articles = eir[["Publication_key","Publication_display","year_month","year","mentions","neg_prob","rt_prob","article_key"]].copy()

# Harmonize eastview/zavtra/tsargrad columns to match
keep_cols = ["Publication_key","Publication_display","year_month","year","mentions","neg_prob","article_key"]
ev_articles  = ev_articles[keep_cols].copy()
zv_articles  = zv_articles[keep_cols].copy()
ts_articles  = ts_articles[keep_cols].copy()
rt_articles  = rt_grouped.rename(columns={"rt_prob":"neg_prob"})[["Publication_key","Publication_display","year_month","year","mentions","neg_prob","article_key"]].copy()
# NOTE: we relabeled RT mean sentence score into 'neg_prob' column to reuse code below,
# but we will *not* mix RT with others in the same "negativity" chart; we'll plot RT separately.

# Combine everything
articles_all = pd.concat([ev_articles, zv_articles, ts_articles, rt_articles, eir_articles], ignore_index=True)
# Clean time
articles_all["year_month"] = pd.to_datetime(articles_all["year_month"], errors="coerce")
articles_all["year"] = pd.to_numeric(articles_all["year"], errors="coerce")


In [None]:
# Build a palette keyed by the *display* names visible in legends
def palette_by_display(df, master_palette):
    """Map each display name in *df* to its colour in *master_palette*.

    *df* must have ``Publication_display`` and ``Publication_key`` columns.
    Keys missing from *master_palette* get the neutral grey "#999999".
    """
    pairs = df[["Publication_display","Publication_key"]].drop_duplicates()
    return {
        disp: master_palette.get(key, "#999999")
        for disp, key in zip(pairs["Publication_display"], pairs["Publication_key"])
    }

def rotate_year_ticks(ax, rotation=45):
    """Put one major x tick per year, labelled "%Y", and rotate the tick labels."""
    x_axis = ax.xaxis
    x_axis.set_major_locator(mdates.YearLocator(base=1))
    x_axis.set_major_formatter(mdates.DateFormatter("%Y"))
    tick_labels = ax.get_xticklabels()
    plt.setp(tick_labels, rotation=rotation, ha="right")

# These rely on your prebuilt articles_all with:
# ['Publication_display','Publication_key','year_month','year','article_key','mentions','neg_prob']
# If any are missing, run your previous “assembly” cell first.
assert {"Publication_display","Publication_key","year_month","year","article_key","mentions"}.issubset(articles_all.columns)


---

## Average number of 'soros' mentions per article for each publication
- mentions_per_article = total_mentions / number_of_articles
- Both total_mentions and number_of_articles are integers, but their ratio is an average and is generally not a whole number

In [None]:
# Mentions per article
# One row per publication: distinct article keys, summed mentions, and their
# ratio, sorted from most to fewest mentions per article.
per_pub_mentions = (
    articles_all
    .groupby(["Publication_display","Publication_key"], as_index=False)
    .agg(articles=("article_key","nunique"),   # nunique ignores missing keys
         total_mentions=("mentions","sum"))
    .assign(mentions_per_article=lambda d: d["total_mentions"] / d["articles"])
    .sort_values("mentions_per_article", ascending=False)
)

# Negativity (exclude sources without comparable model, i.e., keep EastView/Zavtra/Tsargrad)
# EastView rows are recognised by their key being one of custom_palette's keys.
comp_sources = articles_all["Publication_display"].isin(["Zavtra","Tsargrad"]) | articles_all["Publication_key"].isin(custom_palette.keys())
per_pub_neg = (
    # only rows that actually have a negativity value enter the mean
    articles_all.loc[comp_sources & articles_all["neg_prob"].notna()]
    .groupby(["Publication_display","Publication_key"], as_index=False)
    .agg(mean_neg_prob=("neg_prob","mean"),
         n_articles=("article_key","nunique"))
    .sort_values("mean_neg_prob", ascending=False)
)



In [None]:
# see all rows, sorted
pd.set_option("display.max_rows", 200)

display(
    per_pub_mentions
      .sort_values(["mentions_per_article","Publication_display"], ascending=[False, True])
      .reset_index(drop=True)
)

display(
    per_pub_neg
      .sort_values(["mean_neg_prob","Publication_display"], ascending=[False, True])
      .reset_index(drop=True)
)


In [None]:
# EastView only (11 pubs)
ev_mentions = per_pub_mentions[per_pub_mentions["Publication_key"].isin(custom_palette.keys())]
ev_neg      = per_pub_neg[per_pub_neg["Publication_key"].isin(custom_palette.keys())]

# Singles (Zavtra, Tsargrad, RT, EIR)
singles_mentions = per_pub_mentions[per_pub_mentions["Publication_display"].isin(["Zavtra","Tsargrad","RT","EIR"])]
singles_neg      = per_pub_neg[per_pub_neg["Publication_display"].isin(["Zavtra","Tsargrad"])]

print("EastView mentions (all 11):")
display(ev_mentions.sort_values("mentions_per_article", ascending=False))

print("Singles mentions (Zavtra, Tsargrad, RT, EIR):")
display(singles_mentions.sort_values("mentions_per_article", ascending=False))

print("EastView negativity (all with sentiment):")
display(ev_neg.sort_values("mean_neg_prob", ascending=False))

print("Singles negativity (comparable sentiment only):")
display(singles_neg.sort_values("mean_neg_prob", ascending=False))


---

## Coverage — number of records per year

In [None]:
# Recompute yearly counts
yearly_counts = (
    articles_all.dropna(subset=["year"])
      .assign(year=lambda d: d["year"].astype(int))
      .groupby(["Publication_display","Publication_key","year"], as_index=False)
      .agg(articles=("article_key","nunique"))
)

# --- Palette: Publication_display -> color from your master palette ---
def palette_by_display(df, master_palette):
    """Return {Publication_display: colour} for the publications present in *df*.

    The colour is looked up in *master_palette* by ``Publication_key``;
    unknown keys fall back to grey "#999999".
    """
    unique_pairs = df[["Publication_display","Publication_key"]].drop_duplicates()
    return {
        disp: master_palette.get(key, "#999999")
        for disp, key in unique_pairs.itertuples(index=False, name=None)
    }

pal = palette_by_display(yearly_counts, master_palette)

# ---- Build a complete grid (years × pubs) so missing combos are zero ----
def complete_grid(df, pubs, year_order):
    """Return article counts for every (year, publication) pair; missing pairs are 0.

    Args:
        df: Frame with ``year``, ``Publication_display`` and ``articles``.
        pubs: Publications to include, in output order within each year.
        year_order: Year labels (strings) to include, in output order.

    Returns:
        Frame with ``year_str`` (ordered categorical over *year_order*),
        ``Publication_display`` and ``articles``, one row per combination.
    """
    # Whole-number years are labelled without a decimal part, so a float
    # column (2000.0) still matches the "2000" labels in year_order instead
    # of silently producing an all-zero grid. Other values keep their str().
    raw_labels = df["year"].astype(str)
    numeric_years = pd.to_numeric(df["year"], errors="coerce")
    whole = numeric_years.notna() & (numeric_years % 1 == 0)
    int_labels = numeric_years.where(whole, 0).astype(int).astype(str)
    year_labels = raw_labels.where(~whole, int_labels)

    # collapse to year_str + pub
    tmp = (df.assign(year_str=year_labels)
             .groupby(["year_str","Publication_display"], as_index=False)["articles"].sum())

    # full cartesian product of all years × pubs
    grid = pd.MultiIndex.from_product([year_order, pubs],
                                      names=["year_str","Publication_display"])

    out = (tmp.set_index(["year_str","Publication_display"])
              .reindex(grid, fill_value=0)   # fill missing with zero
              .reset_index())

    # keep ordered categorical for x
    out["year_str"] = pd.Categorical(out["year_str"],
                                     categories=year_order, ordered=True)
    return out



## Coverage — number of records per year EIR 1984 to 1995 

In [None]:
# --- Window & x-axis setup (EVERY year tick 1984..1995) ---
start_year = 1984
end_year   = 1995 
year_order = [str(y) for y in range(start_year, end_year + 1)]  # inclusive of end_year

# rows of yearly_counts inside the window (both bounds inclusive)
df_win = yearly_counts[
    (yearly_counts["year"] >= start_year) & (yearly_counts["year"] <= end_year)
].copy()

# pubs to show
pubs_all = sorted(df_win["Publication_display"].unique().tolist())

# completed grid for ALL pubs (for consistent y-limit)
df_all_full = complete_grid(df_win, pubs_all, year_order)

# Shared y-limit: the largest single (year, publication) count plus 10% headroom
ymax = int(np.ceil(df_all_full["articles"].max() * 1.10)) if not df_all_full.empty else 1

# --- Global font sizing (bigger) ---
# rcParams changes are global: they affect every figure drawn after this cell.
plt.rcParams.update({
    "font.size": 13,
    "axes.titlesize": 22,
    "axes.labelsize": 16,
    "xtick.labelsize": 11,
    "ytick.labelsize": 13,
    "legend.fontsize": 12
})

def plot_grouped_bars_bottom_legend(df, pubs_to_show, title, palette_map, year_order, ymax, width=0.9):
    """Draw and show a grouped bar chart of articles per year, one bar per publication.

    Args:
        df: Frame accepted by ``complete_grid`` (year, Publication_display, articles).
        pubs_to_show: Publications to plot; others in *df* are left out.
        title: Axes title.
        palette_map: Mapping of display name to colour; names not found are grey.
        year_order: Year labels for the x axis, in display order.
        ymax: Upper y-axis limit.
        width: Bar-group width passed to ``sns.barplot``.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    # complete grid for the chosen pubs
    df_full = complete_grid(df, pubs_to_show, year_order)

    # build palette only for pubs shown
    pal_local = {p: palette_map.get(p, "#999999") for p in pubs_to_show}

    fig, ax = plt.subplots(figsize=(28, 12))

    sns.barplot(
        data=df_full.sort_values(["year_str","Publication_display"]),
        x="year_str", y="articles",
        hue="Publication_display",
        palette=pal_local,
        width=width,
        ax=ax
    )

    ax.set_ylim(0, ymax)
    ax.set_xlabel("Year")
    ax.set_ylabel("Articles")
    ax.set_title(title)

    # FORCE every year label (do NOT call MaxNLocator afterwards)
    ax.set_xticks(np.arange(len(year_order)), labels=year_order)
    plt.setp(ax.get_xticklabels(), rotation=90, ha="center")

    # Legend below the chart (centered)
    # roughly three rows of entries, clamped to between 2 and 8 columns
    ncols = min(8, max(2, int(np.ceil(len(pubs_to_show)/3))))
    ax.legend(
        title="Publication",
        bbox_to_anchor=(0.5, -0.18),
        loc="upper center",
        ncol=ncols,
        frameon=False
    )

    # tight_layout first, then widen the bottom margin it computed
    plt.tight_layout()
    plt.subplots_adjust(bottom=0.24)  # extra space for legend
    plt.show()


# 1984–1995 (uses the window, palette and y-limit computed above)
plot_grouped_bars_bottom_legend(
    df=df_win,
    pubs_to_show=pubs_all,
    title="Number of articles per year (all publications): 1984–1995",
    palette_map=pal,
    year_order=year_order,
    ymax=ymax,
    width=0.9
)

In [None]:
# Window & “complete grid” helper
start_year = 1995
end_year   = 2025   # keep 2025 so labels show through end even if last data < 2025
year_order = [str(y) for y in range(start_year, end_year + 1)]

df_win = yearly_counts[
    (yearly_counts["year"] >= start_year) & (yearly_counts["year"] <= end_year)
].copy()

def complete_grid(df, pubs, year_order):
    """Return a DataFrame with all (year, pub) combos; missing → 0.

    Args:
        df: Frame with ``year``, ``Publication_display`` and ``articles``.
        pubs: Publications to include, in output order within each year.
        year_order: Year labels (strings) to include, in output order.

    Returns:
        Frame with ``year_str`` (ordered categorical over *year_order*),
        ``Publication_display`` and ``articles``, one row per combination.
    """
    # Label whole-number years without a decimal part so a float "year"
    # column (2000.0) still lines up with the "2000" labels in year_order
    # rather than silently yielding zeros everywhere.
    numeric_years = pd.to_numeric(df["year"], errors="coerce")
    whole = numeric_years.notna() & (numeric_years % 1 == 0)
    year_labels = df["year"].astype(str).where(
        ~whole, numeric_years.where(whole, 0).astype(int).astype(str)
    )
    tmp = (df.assign(year_str=year_labels)
             .groupby(["year_str","Publication_display"], as_index=False)["articles"].sum())
    grid = pd.MultiIndex.from_product([year_order, pubs], names=["year_str","Publication_display"])
    out = (tmp.set_index(["year_str","Publication_display"])
              .reindex(grid, fill_value=0)
              .reset_index())
    out["year_str"] = pd.Categorical(out["year_str"], categories=year_order, ordered=True)
    return out

pubs_all = sorted(df_win["Publication_display"].unique().tolist())
df_all_full = complete_grid(df_win, pubs_all, year_order)

# y-limit for line charts (non-stacked): use max single-series value
ymax_lines = int(np.ceil(df_all_full["articles"].max() * 1.10)) if not df_all_full.empty else 1

# 2) Styling — larger fonts
plt.rcParams.update({
    "font.size": 13,
    "axes.titlesize": 22,
    "axes.labelsize": 16,
    "xtick.labelsize": 11,
    "ytick.labelsize": 13,
    "legend.fontsize": 12
})

# Helper: multi-series LINE plot with legend below
def plot_grouped_lines_bottom_legend(df, pubs_to_show, title, palette_map, year_order, ymax, linewidth=2.5, markersize=6):
    """Draw and show a line chart of articles per year, one line per publication.

    Args:
        df: Frame accepted by ``complete_grid`` (year, Publication_display, articles).
        pubs_to_show: Publications to plot; others in *df* are left out.
        title: Axes title.
        palette_map: Mapping of display name to colour; names not found are grey.
        year_order: Year labels for the x axis, in display order.
        ymax: Upper y-axis limit.
        linewidth: Line width passed to ``sns.lineplot``.
        markersize: Marker size passed to ``sns.lineplot``.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    # zero-filled grid, so years without articles are plotted as 0
    df_full = complete_grid(df, pubs_to_show, year_order)
    pal_local = {p: palette_map.get(p, "#999999") for p in pubs_to_show}

    fig, ax = plt.subplots(figsize=(28, 22))
    sns.lineplot(
        data=df_full.sort_values(["Publication_display","year_str"]),
        x="year_str", y="articles",
        hue="Publication_display",
        palette=pal_local,
        marker="o",
        linewidth=linewidth,
        markersize=markersize,
        ax=ax
    )

    ax.set_ylim(0, ymax)
    ax.set_xlabel("Year")
    ax.set_ylabel("Articles")
    ax.set_title(title)

    # force every year label
    ax.set_xticks(np.arange(len(year_order)))
    ax.set_xticklabels(year_order, rotation=90, ha="center")

    # legend below chart
    # roughly three rows of entries, clamped to between 2 and 8 columns
    ncols = min(8, max(2, int(np.ceil(len(pubs_to_show)/3))))
    ax.legend(
        title="Publication",
        bbox_to_anchor=(0.5, -0.15),
        loc="upper center",
        ncol=ncols,
        frameon=False,
        fontsize=16,           # legend text
        title_fontsize=16      # legend title
    )

    # tight_layout first, then widen the bottom margin it computed
    plt.tight_layout()
    plt.subplots_adjust(bottom=0.20)
    plt.show()


## Coverage — number of records per year (collated) and Top N publications 1995-2025

In [None]:
# ALL publications
plot_grouped_lines_bottom_legend(
    df=df_win,
    pubs_to_show=pubs_all,
    title="Number of articles per year (all publications): 1995–2025",
    palette_map=pal,
    year_order=year_order,
    ymax=ymax_lines,
    linewidth=2.8,
    markersize=6.5
)

# TOP N publications
TOP_N = 6
top_pubs = (
    df_win.groupby("Publication_display")["articles"]
          .sum().sort_values(ascending=False)
          .head(TOP_N).index.tolist()
)

plot_grouped_lines_bottom_legend(
    df=df_win,
    pubs_to_show=top_pubs,
    title=f"Number of articles per year (top {TOP_N} publications): 1995–2025",
    palette_map=pal,
    year_order=year_order,
    ymax=ymax_lines,
    linewidth=3.0,
    markersize=7
)

## Coverage — number of records per year (each year view) 1995-2025
- Produces 30+ graphs; uncomment to run

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# ---------- 1) Build monthly averages with year & month ----------
# Grouping keys shared by both monthly aggregations and by the merge below.
_month_keys = ["Publication_display", "Publication_key", "year", "month"]

# Parse year_month, discard rows whose value is missing or unparseable,
# then split the timestamp into integer year / month columns.
monthly_base = articles_all.dropna(subset=["year_month"])
monthly_base = monthly_base.assign(
    year_month=pd.to_datetime(monthly_base["year_month"], errors="coerce")
)
monthly_base = monthly_base.dropna(subset=["year_month"])
monthly_base = monthly_base.assign(
    year=monthly_base["year_month"].dt.year.astype(int),
    month=monthly_base["year_month"].dt.month.astype(int),
)

# Distinct articles per publication and month.
monthly_articles = monthly_base.groupby(_month_keys, as_index=False).agg(
    articles=("article_key", "nunique")
)

# Summed mentions per publication and month.
monthly_mentions = monthly_base.groupby(_month_keys, as_index=False).agg(
    mentions=("mentions", "sum")
)

# Join the two tables; a key missing from one side counts as 0 there.
monthly_avg = pd.merge(
    monthly_articles, monthly_mentions, on=_month_keys, how="outer"
).fillna({"articles": 0, "mentions": 0})

# Average is mentions / articles; NaN (not 0) where a month has no articles.
monthly_avg = monthly_avg.assign(
    avg_mentions_per_article=np.where(
        monthly_avg["articles"] > 0,
        monthly_avg["mentions"] / monthly_avg["articles"],
        np.nan,
    )
)

# ---------- 2) Palette helper ----------
def palette_by_display(df, master_palette):
    """Build a {Publication_display: colour} dict for the rows of *df*.

    The colour is looked up in *master_palette* by the row's
    Publication_key; keys missing from the palette get grey "#999999".
    If one display label appears with several keys, the last pair wins.
    """
    pairs = df[["Publication_display", "Publication_key"]].drop_duplicates()
    return {
        display: master_palette.get(key, "#999999")
        for display, key in zip(pairs["Publication_display"], pairs["Publication_key"])
    }

pal_all = palette_by_display(monthly_avg, master_palette)

# Years (ascending, unique ints) and publication labels to iterate over.
years_in_data = monthly_avg["year"].dropna().astype(int).sort_values().unique().tolist()
pubs_all = sorted(monthly_avg["Publication_display"].unique())

# ---------- 3) Per-year peak (for dynamic height) ----------
# Largest monthly average in each year; NaN when a year has no values.
per_year_peak = monthly_avg.groupby("year")["avg_mentions_per_article"].max(min_count=1)

# Cap at the 95th percentile of the yearly peaks so a single outlier year
# does not squash every other chart; fall back to 1.0 when there is no data.
if per_year_peak.notna().any():
    peak_cap = np.nanpercentile(per_year_peak, 95)
else:
    peak_cap = 1.0

# ---------- 4) Grid builder for a given year (keep NaN gaps) ----------
def monthly_avg_grid_for_year(df, year, pubs):
    """Return a full (publication x month 1-12) table of averages for *year*.

    Rows of *df* whose "year" equals *year* are re-indexed onto every
    combination of *pubs* and the twelve months. Combinations without a
    row stay NaN, so no zero is invented. The value column is returned
    under the name "avg".
    """
    index_cols = ["Publication_display", "month"]
    year_rows = df.loc[df["year"] == year, index_cols + ["avg_mentions_per_article"]]
    full_index = pd.MultiIndex.from_product([pubs, range(1, 13)], names=index_cols)
    return (
        year_rows.set_index(index_cols)
        .reindex(full_index)
        .rename(columns={"avg_mentions_per_article": "avg"})
        .reset_index()
    )

# ---------- 5) Plot per-year with dynamic height ----------
# Seaborn context and matplotlib font sizes used by the charts below.
sns.set_context("talk", font_scale=1.15)
plt.rcParams.update({
    "axes.titlesize": 22,
    "axes.labelsize": 16,
    "xtick.labelsize": 11,
    "ytick.labelsize": 13,
    "legend.fontsize": 12
})

# Figure geometry and line styling shared by every per-year chart.
FIGWIDTH   = 28
MIN_HEIGHT = 8
MAX_HEIGHT = 16
LINEWIDTH  = 2.8
MARKERSZ   = 6.5
month_labels = ["Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"]

# One figure per year found in monthly_avg.
for y in years_in_data:
    # Full (publication x 12 months) table for this year; NaN marks "no articles".
    df_y = monthly_avg_grid_for_year(monthly_avg, y, pubs_all)

    # Keep only publications that have any non-NaN avg in this year
    keep_pubs = (
        df_y.groupby("Publication_display")["avg"]
            .apply(lambda s: s.notna().any())
    )
    keep_pubs = keep_pubs[keep_pubs].index.tolist()
    if not keep_pubs:
        print(f"No data for {y}; skipping.")
        continue
    df_y = df_y[df_y["Publication_display"].isin(keep_pubs)]

    # Per-year dynamic limits & height
    yr_peak = float(per_year_peak.get(y, np.nan))
    if not np.isfinite(yr_peak) or yr_peak <= 0:
        # Missing or non-positive peak: use 1.0 so the limits stay valid.
        yr_peak = 1.0
    yr_ylim = yr_peak * 1.15  # 15% headroom above the year's peak

    # min() clips the peak at peak_cap; the 1e-9 floor avoids division by zero.
    scale = min(yr_peak, peak_cap) / max(peak_cap, 1e-9)  # normalize to [0,1]
    fig_h = MIN_HEIGHT + (MAX_HEIGHT - MIN_HEIGHT) * scale

    # Plot
    plt.figure(figsize=(FIGWIDTH, fig_h))
    ax = sns.lineplot(
        data=df_y.sort_values(["Publication_display", "month"]),
        x="month", y="avg",
        hue="Publication_display",
        palette={p: pal_all.get(p, "#999999") for p in keep_pubs},
        marker="o",
        linewidth=LINEWIDTH,
        markersize=MARKERSZ,
    )

    # Month numbers 1-12 on the x axis, labelled with month names.
    ax.set_xlim(1, 12)
    ax.set_xticks(range(1, 13))
    ax.set_xticklabels(month_labels, rotation=0)
    ax.set_ylim(0, yr_ylim)     # per-year y-limit
    ax.grid(axis="y", linestyle=":", alpha=.35)

    ax.set_title(f"Average Soros mentions per article — {y} (all publications)")
    ax.set_xlabel("Month")
    ax.set_ylabel("Avg mentions per article")

    # Legend below the chart
    # Column count is ceil(n_pubs / 3), clamped to the range 2..8.
    ncols = min(8, max(2, int(np.ceil(len(keep_pubs)/3))))
    ax.legend(
        title="Publication",
        bbox_to_anchor=(0.5, -0.18),
        loc="upper center",
        ncol=ncols,
        frameon=False
    )

    plt.tight_layout()
    # Applied after tight_layout to reserve room at the bottom for the legend.
    plt.subplots_adjust(bottom=0.22)
    plt.show()


## Coverage Comparison Graphs
- Soros mentions per year (all publications) 1995-2025
- Average sentiment probability per year (EastView pubs + Zavtra + Tsargrad only)

In [None]:
# Setup: inclusive analysis window and its year labels as strings.
start_year, end_year = 1995, 2025
year_order = list(map(str, range(start_year, end_year + 1)))

def complete_grid_counts(df, value_col, pubs, year_order):
    """Sum *value_col* per (year, publication) on a complete grid.

    Every combination of *year_order* and *pubs* appears in the result;
    combinations absent from *df* get 0. "year_str" is returned as an
    ordered categorical following *year_order*.
    """
    keys = ["year_str", "Publication_display"]
    labelled = df.assign(year_str=df["year"].astype(str))
    totals = labelled.groupby(keys, as_index=False)[value_col].sum()

    full_index = pd.MultiIndex.from_product([year_order, pubs], names=keys)
    completed = totals.set_index(keys).reindex(full_index, fill_value=0).reset_index()
    completed["year_str"] = pd.Categorical(
        completed["year_str"], categories=year_order, ordered=True
    )
    return completed

def complete_grid_probs(df, pubs, year_order):
    """Average "probability" per (year, publication) on a complete grid.

    Every combination of *year_order* and *pubs* appears in the result;
    combinations absent from *df* stay NaN rather than being filled.
    "year_str" is returned as an ordered categorical following *year_order*.
    """
    keys = ["year_str", "Publication_display"]
    labelled = df.assign(year_str=df["year"].astype(str))
    means = labelled.groupby(keys, as_index=False)["probability"].mean()

    full_index = pd.MultiIndex.from_product([year_order, pubs], names=keys)
    completed = means.set_index(keys).reindex(full_index).reset_index()
    completed["year_str"] = pd.Categorical(
        completed["year_str"], categories=year_order, ordered=True
    )
    return completed

def parse_any_year(df, candidates):
    """Return a Series of years parsed from the first usable date column.

    Parameters:
        df: DataFrame to read from.
        candidates: column names to try, in priority order. Names that are
            not columns of *df* are skipped.

    Returns:
        The ``.dt.year`` of the first candidate column that yields at least
        one valid date (unparseable entries become NaN), aligned to
        ``df.index``. If no candidate works, an all-NaN float64 Series on
        ``df.index`` is returned, including for an empty frame.
    """
    for name in candidates:
        if name not in df.columns:
            continue
        col = df[name]
        # Columns that are already datetime are used as-is; anything else is
        # parsed leniently so bad values become NaT instead of raising.
        if pd.api.types.is_datetime64_any_dtype(col):
            parsed = col
        else:
            parsed = pd.to_datetime(col, errors="coerce")
        if parsed.notna().any():
            return parsed.dt.year
    # Explicit dtype keeps the fallback float64 even when df has no rows.
    return pd.Series(np.nan, index=df.index, dtype="float64")

# Build yearly base metrics: distinct articles and summed mentions per
# publication and (integer) year, ignoring rows without a year.
_with_year = articles_all.dropna(subset=["year"])
_with_year = _with_year.assign(year=_with_year["year"].astype(int))
yearly_counts = _with_year.groupby(
    ["Publication_display", "Publication_key", "year"], as_index=False
).agg(
    articles=("article_key", "nunique"),
    mentions=("mentions", "sum"),
)

pal_all = palette_by_display(yearly_counts, master_palette)
pubs_all = sorted(yearly_counts["Publication_display"].unique().tolist())

# Complete (year x publication) grids; missing combinations count as 0.
df_articles = complete_grid_counts(yearly_counts, "articles", pubs_all, year_order)
df_mentions = complete_grid_counts(yearly_counts, "mentions", pubs_all, year_order)

# Y-axis ceilings: 10% above the largest value, or 1 for an empty grid.
if len(df_articles):
    ymax_articles = int(np.ceil(df_articles["articles"].max() * 1.10))
else:
    ymax_articles = 1
if len(df_mentions):
    ymax_mentions = int(np.ceil(df_mentions["mentions"].max() * 1.10))
else:
    ymax_mentions = 1


# Styling (full-width figs)
sns.set_context("talk", font_scale=1.15)
plt.rcParams.update(
    {
        "axes.titlesize": 22,
        "axes.labelsize": 16,
        "xtick.labelsize": 11,
        "ytick.labelsize": 13,
        "legend.fontsize": 12,
    }
)

In [None]:
# Articles / year



# --- helper: legend below the axes, one entry per label ---
def legend_below_no_dupes(ax, title="Publication", ncol=8, y=-0.12, fs=16):
    """Redraw the legend of *ax* below the plot without repeated labels.

    Empty labels and the literal "Publication_display" entry are skipped;
    for every other label only its first handle is kept. *y* is the
    vertical anchor in axes coordinates and *fs* is used for both the
    entry text and the legend title.
    """
    handles, labels = ax.get_legend_handles_labels()

    first_handle = {}
    for handle, label in zip(handles, labels):
        if not label or label == "Publication_display":
            continue
        first_handle.setdefault(label, handle)

    ax.legend(
        list(first_handle.values()),
        list(first_handle.keys()),
        title=title,
        ncol=ncol,
        loc="upper center",
        bbox_to_anchor=(0.5, y),
        frameon=False,
        fontsize=fs,
        title_fontsize=fs,
    )

# Ensure we use the same publication order everywhere
hue_order = sorted(df_articles["Publication_display"].unique().tolist())

# ---------------------------
# Articles / year
# ---------------------------
fig1, ax1 = plt.subplots(figsize=(32, 22))
sns.lineplot(
    data=df_articles.sort_values(["Publication_display","year_str"]),
    x="year_str", y="articles",
    hue="Publication_display",
    hue_order=hue_order,             # <-- enforce order (EIR included)
    palette=pal_all,
    marker="o", linewidth=3.0, markersize=7, ax=ax1
)
ax1.set_ylim(0, ymax_articles)
ax1.set_xlabel("Year"); ax1.set_ylabel("Articles")
ax1.set_title("Number of articles per year (all publications): 1995–2025")
# One tick per entry of year_order so every year is labelled.
ax1.set_xticks(np.arange(len(year_order)))
ax1.set_xticklabels(year_order, rotation=90, ha="center")
ax1.grid(axis="y", linestyle=":", alpha=.35)
legend_below_no_dupes(ax1, title="Publication", ncol=8, y=-0.12, fs=16)  # <-- fixed legend
# subplots_adjust runs after tight_layout to reserve room for the legend below.
plt.tight_layout(); plt.subplots_adjust(bottom=0.20); plt.show()

# ---------------------------
# Mentions / year
# ---------------------------
fig2, ax2 = plt.subplots(figsize=(32, 22))
sns.lineplot(
    data=df_mentions.sort_values(["Publication_display","year_str"]),
    x="year_str", y="mentions",
    hue="Publication_display",
    hue_order=hue_order,             # <-- enforce same order
    palette=pal_all,
    marker="o", linewidth=3.0, markersize=7, ax=ax2
)
ax2.set_ylim(0, ymax_mentions)
ax2.set_xlabel("Year"); ax2.set_ylabel("Soros mentions")
ax2.set_title("Soros mentions per year not averaged (all publications): 1995–2025")
# One tick per entry of year_order so every year is labelled.
ax2.set_xticks(np.arange(len(year_order)))
ax2.set_xticklabels(year_order, rotation=90, ha="center")
ax2.grid(axis="y", linestyle=":", alpha=.35)
legend_below_no_dupes(ax2, title="Publication", ncol=8, y=-0.10, fs=16)  # <-- fixed legend
plt.tight_layout(); plt.subplots_adjust(bottom=0.20); plt.show()


## Coverage - Comparison Graphs Soros mentions per year Averaged
- The average is computed as `avg_mentions_per_article = mentions / articles` (NaN for months with no articles).

In [None]:
# Continuous month-start axis covering the whole analysis window.
start_year = 1995
end_year = 2025
all_months = pd.date_range(
    start=f"{start_year}-01-01",
    end=f"{end_year}-12-01",
    freq="MS",
)

# Palette helper, in case no per-publication palette exists yet.
def palette_by_display(df, master_palette):
    """Build a {Publication_display: colour} dict for the rows of *df*.

    The colour is looked up in *master_palette* by the row's
    Publication_key; keys missing from the palette get grey "#999999".
    If one display label appears with several keys, the last pair wins.
    """
    pairs = df[["Publication_display", "Publication_key"]].drop_duplicates()
    return {
        display: master_palette.get(key, "#999999")
        for display, key in zip(pairs["Publication_display"], pairs["Publication_key"])
    }

# 1) Give monthly_avg a year_month timestamp (1st of each month) and drop
#    rows where year/month do not form a valid date.
monthly_avg_time = monthly_avg.assign(
    year_month=pd.to_datetime(
        {"year": monthly_avg["year"], "month": monthly_avg["month"], "day": 1},
        errors="coerce",
    )
).dropna(subset=["year_month"])

# 2) Complete grid (months × pubs); values stay NaN where a publication had
#    no articles, so the plotted lines show gaps instead of fake zeros.
pubs_all = sorted(monthly_avg_time["Publication_display"].unique().tolist())
grid = pd.MultiIndex.from_product(
    [all_months, pubs_all], names=["year_month", "Publication_display"]
)

_avg_by_month_pub = monthly_avg_time.set_index(
    ["year_month", "Publication_display"]
)["avg_mentions_per_article"]
monthly_full = _avg_by_month_pub.reindex(grid).rename("avg").reset_index()

# 3) Palette restricted to the publications present
pal_all = palette_by_display(monthly_avg_time, master_palette)

# 4) Plot: one big multi-series line chart
# Seaborn context and matplotlib font sizes used by this chart.
sns.set_context("talk", font_scale=1.15)
plt.rcParams.update({
    "axes.titlesize": 22,
    "axes.labelsize": 16,
    "xtick.labelsize": 11,
    "ytick.labelsize": 13,
    "legend.fontsize": 12
})

fig, ax = plt.subplots(figsize=(32, 22))

# One line per publication; publications missing from pal_all are drawn grey.
sns.lineplot(
    data=monthly_full.sort_values(["Publication_display","year_month"]),
    x="year_month", y="avg",
    hue="Publication_display",
    palette={p: pal_all.get(p, "#999999") for p in pubs_all},
    marker="o", linewidth=3.0, markersize=7,
    ax=ax
)

# Y range
# 15% headroom above the largest average; 1.0 when there is no positive value.
ymax_all = monthly_full["avg"].max(skipna=True)
ymax_all = float(ymax_all * 1.15) if pd.notna(ymax_all) and ymax_all > 0 else 1.0
ax.set_ylim(0, ymax_all)

# X axis: show EVERY year label
years = pd.date_range(f"{start_year}-01-01", f"{end_year}-01-01", freq="YS")
ax.set_xticks(years)
ax.set_xticklabels([str(d.year) for d in years], rotation=90, ha="center")

ax.set_xlabel("Year")
ax.set_ylabel("Avg mentions per article")
ax.set_title("Average Soros mentions per article — all publications (1995–2025)")
ax.grid(axis="y", linestyle=":", alpha=.35)

# Legend below (drop redundant first handle if present)
# The first entry is removed only when it is the hue variable's own name,
# so a real publication is never dropped from the legend.
handles, labels = ax.get_legend_handles_labels()
if labels and labels[0].lower() == "publication_display":
    handles, labels = handles[1:], labels[1:]
ax.legend(handles=handles, labels=labels, title="Publication",
    ncol=min(8, max(2, int(np.ceil(len(pubs_all)/3)))),
    loc="upper center", bbox_to_anchor=(0.5, -0.15),
    frameon=False,
    fontsize=16,           # legend text
    title_fontsize=16      # legend title
)

plt.tight_layout()
# Applied after tight_layout to reserve room at the bottom for the legend.
plt.subplots_adjust(bottom=0.20)
plt.show()

## Coverage - Comparison Graphs Soros mentions per year (each year view)
- Produces 30+ graphs (one per year); uncomment to run
- The average is computed as `avg_mentions_per_article = mentions / articles`

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# --- per-year peak: the largest monthly avg mentions/article in each year ---
# min_count=1 leaves NaN for a year that has no values at all.
per_year_peak = monthly_avg.groupby("year")["avg_mentions_per_article"].max(min_count=1)

# Cap at the 95th percentile of the yearly peaks so one outlier year doesn't
# make all the other charts tiny; fall back to 1.0 when there is no data.
if per_year_peak.notna().any():
    peak_cap = np.nanpercentile(per_year_peak, 95)
else:
    peak_cap = 1.0

# Figure-size and line-style controls (tweak as needed)
FIGWIDTH, MIN_HEIGHT, MAX_HEIGHT = 28, 8, 16
LINEWIDTH, MARKERSZ = 2.8, 7

# One figure per year in years_in_data, showing monthly averages per publication.
for y in years_in_data:
    # grid for year y (you already have this function)
    df_y = monthly_avg_grid_for_year(monthly_avg, y, pubs_all)

    # Keep only pubs that actually have any non-NaN avg that year
    keep_pubs = (
        df_y.groupby("Publication_display")["avg"]
            .apply(lambda s: s.notna().any())
    )
    keep_pubs = keep_pubs[keep_pubs].index.tolist()
    if not keep_pubs:
        print(f"No data for {y}; skipping.")
        continue
    df_y = df_y[df_y["Publication_display"].isin(keep_pubs)]

    # ----- dynamic limits + dynamic figure height -----
    yr_peak = float(per_year_peak.get(y, np.nan))
    if not np.isfinite(yr_peak) or yr_peak <= 0:
        # fall back to a small default to avoid errors
        yr_peak = 1.0

    # y-axis limit = a bit above the year's peak
    yr_ylim = yr_peak * 1.15

    # height scaled to the (capped) year peak so low-variance years aren't tiny
    # min() clips the peak at peak_cap; the 1e-9 floor avoids division by zero.
    scale = min(yr_peak, peak_cap) / max(peak_cap, 1e-9)  # normalize to [0,1]
    fig_h = MIN_HEIGHT + (MAX_HEIGHT - MIN_HEIGHT) * scale

    # ----- plot -----
    plt.figure(figsize=(FIGWIDTH, fig_h))
    ax = sns.lineplot(
        data=df_y.sort_values(["Publication_display", "month"]),
        x="month", y="avg",
        hue="Publication_display",
        palette={p: pal_all.get(p, "#999999") for p in keep_pubs},
        marker="o",
        linewidth=LINEWIDTH,
        markersize=MARKERSZ,
    )

    # Month numbers 1-12 on the x axis, labelled with month names.
    ax.set_xlim(1, 12)
    ax.set_xticks(range(1, 13))
    ax.set_xticklabels(["Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"])
    ax.set_ylim(0, yr_ylim)                  # <-- per-year y-limit
    ax.grid(axis="y", linestyle=":", alpha=.35)

    ax.set_title(f"Average Soros mentions per article — {y} (all publications)")
    ax.set_xlabel("Month")
    ax.set_ylabel("Avg mentions per article")

    # Legend below the chart
    # Column count is ceil(n_pubs / 3), clamped to the range 2..8.
    ncols = min(8, max(2, int(np.ceil(len(keep_pubs)/3))))
    ax.legend(
        title="Publication",
        bbox_to_anchor=(0.5, -0.18),
        loc="upper center",
        ncol=ncols,
        frameon=False
    )

    plt.tight_layout()
    # Applied after tight_layout to reserve room at the bottom for the legend.
    plt.subplots_adjust(bottom=0.22)
    plt.show()


## Sentiment analysis
## Average negativity probability over time (0–1 scale)
- Per-article negativity score: for every article we turn the model’s score + probability into “probability the article is negative”:
    - if score == "negative" → neg_prob = probability
    - if score == "positive" → neg_prob = 1 - probability
    - if score == "neutral" → neg_prob = 0.5
    - otherwise → NaN (dropped from averages)
- Averaging: for each publication and each time bucket (month or year), we take the simple arithmetic mean of neg_prob across all its articles in that bucket. Months with no articles stay NaN (shown as gaps, not zeros).
- Publications: all EastView outlets (the 11 titles), Zavtra, and Tsargrad—they all have the compatible score/probability fields.

In [None]:
# Articles that have both a month and a negativity score, with year_month
# parsed to timestamps (rows that fail to parse are dropped).
neg_time = (
    articles_all.dropna(subset=["year_month", "neg_prob"])
    .assign(year_month=lambda d: pd.to_datetime(d["year_month"], errors="coerce"))
    .dropna(subset=["year_month"])
)

# Mean negativity per publication and month, ordered by publication then time.
_neg_keys = ["Publication_display", "Publication_key", "year_month"]
monthly_neg = (
    neg_time.groupby(_neg_keys, as_index=False)
    .agg(mean_neg_prob=("neg_prob", "mean"))
    .sort_values(["Publication_display", "year_month"])
)

# Palette helper (colours come from master_palette via Publication_key)
def palette_by_display(df, master_palette):
    """Build a {Publication_display: colour} dict for the rows of *df*.

    The colour is looked up in *master_palette* by the row's
    Publication_key; keys missing from the palette get grey "#999999".
    If one display label appears with several keys, the last pair wins.
    """
    pairs = df[["Publication_display", "Publication_key"]].drop_duplicates()
    return {
        display: master_palette.get(key, "#999999")
        for display, key in zip(pairs["Publication_display"], pairs["Publication_key"])
    }

pal = palette_by_display(monthly_neg, master_palette)

# Month-start axis for the whole analysis window.
start_year, end_year = 1995, 2025
all_months = pd.date_range(
    start=f"{start_year}-01-01", end=f"{end_year}-12-01", freq="MS"
)

pubs = sorted(monthly_neg["Publication_display"].unique().tolist())

# Complete (month x publication) grid. Missing means stay NaN so the plotted
# lines show gaps rather than invented data.
grid = pd.MultiIndex.from_product(
    [all_months, pubs], names=["year_month", "Publication_display"]
)
_neg_by_month_pub = monthly_neg.set_index(
    ["year_month", "Publication_display"]
)["mean_neg_prob"]
monthly_full = _neg_by_month_pub.reindex(grid).rename("neg_mean").reset_index()

# Styling (full-width charts)
sns.set_context("talk", font_scale=1.15)
plt.rcParams.update({
    "axes.titlesize": 22,
    "axes.labelsize": 16,
    "xtick.labelsize": 11,
    "ytick.labelsize": 13,
    "legend.fontsize": 12
})

# ALL publications, full-width, legend below
fig, ax = plt.subplots(figsize=(32, 22))

# One line per publication; NaN months in monthly_full appear as gaps.
sns.lineplot(
    data=monthly_full.sort_values(["Publication_display","year_month"]),
    x="year_month", y="neg_mean",
    hue="Publication_display",
    palette=pal,
    marker="o",
    linewidth=3.0,
    markersize=7,
    ax=ax
)

# Fixed 0-1 range, matching the probability scale.
ax.set_ylim(0, 1)
ax.set_ylabel("Negativity probability (0–1)")
ax.set_xlabel("Month")
ax.set_title("Average negativity probability over time")

# Year ticks on every year; labels for the year only
ax.xaxis.set_major_locator(mdates.YearLocator(base=1))
ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y"))
plt.setp(ax.get_xticklabels(), rotation=90, ha="center")

ax.grid(axis="y", linestyle=":", alpha=.35)

# Legend underneath.
# Drop the first entry only when it is the hue variable's own name. Slicing
# unconditionally would remove a real publication whenever seaborn does not
# emit that title entry.
handles, labels = ax.get_legend_handles_labels()
if labels and labels[0].lower() == "publication_display":
    handles, labels = handles[1:], labels[1:]
ax.legend(handles=handles, labels=labels,
          title="Publication",
          ncol=min(8, max(2, int(np.ceil(len(pubs)/3)))),
          loc="upper center",
          bbox_to_anchor=(0.5, -0.15),
          frameon=False,
          fontsize=16,           # legend text
          title_fontsize=16)      # legend title

plt.tight_layout()
# Applied after tight_layout to reserve room at the bottom for the legend.
plt.subplots_adjust(bottom=0.20)
plt.show()

# One publication per graph
import matplotlib.ticker as mticker

# Facet geometry: HEIGHT per facet (was ~3.8), width = ASPECT * HEIGHT, and
# HSPACE for extra vertical breathing room between facets.
HEIGHT, ASPECT, HSPACE = 7.2, 2.0, 0.60


def facet_line_single(data, **kws):
    """Draw one publication's monthly negativity line on the current axes.

    *data* is the facet's rows; the publication is read from its first row
    and its colour from the module-level ``pal`` (grey "#999999" if absent).
    Extra keyword arguments in *kws* are accepted but not used.
    """
    axis = plt.gca()
    publication = data["Publication_display"].iloc[0]
    ordered = data.sort_values("year_month")
    line_colour = pal.get(publication, "#999999")

    sns.lineplot(
        data=ordered,
        x="year_month",
        y="neg_mean",
        marker="o",
        linewidth=3.0,
        markersize=6,
        color=line_colour,
        ax=axis,
    )

    # Dashed reference line at 0.5, drawn behind the data (zorder=0).
    axis.axhline(0.5, linestyle="--", linewidth=1.6, color="#555555", alpha=0.6, zorder=0)

    axis.set_ylim(0, 1)
    axis.grid(axis="y", linestyle=":", alpha=.35)

    # One major tick per year, labelled with the four-digit year.
    axis.xaxis.set_major_locator(mdates.YearLocator(base=1))
    axis.xaxis.set_major_formatter(mdates.DateFormatter("%Y"))
    axis.tick_params(axis="x", rotation=90, labelbottom=True)


# One facet per publication, stacked in a single column. Rows without a
# negativity value are removed before faceting.
g = sns.FacetGrid(
    monthly_full.dropna(subset=["neg_mean"]),
    col="Publication_display",
    col_wrap=1,               # stacked single column
    height=HEIGHT,
    aspect=ASPECT,
    sharey=True, sharex=True,
    margin_titles=True
)
# facet_line_single receives each facet's rows and draws on the current axes.
g.map_dataframe(facet_line_single)
g.set_axis_labels("Month", "Negativity probability (0–1)")
g.set_titles("{col_name}")
g.set(ylim=(0, 1))

# Force y-ticks and add some padding/breathing room
for ax in g.axes.flat:
    ax.yaxis.set_major_locator(mticker.FixedLocator([0, .25, .5, .75, 1]))
    ax.yaxis.set_major_formatter(mticker.FixedFormatter(["0", "0.25", "0.5", "0.75", "1"]))
    ax.tick_params(axis="y", labelleft=True)
    ax.tick_params(axis="x", rotation=90, pad=6)  # extra space under x labels
    ax.margins(x=0.01, y=0.06)                    # a bit of space inside each axis

# More space around the entire figure and between rows
g.fig.subplots_adjust(top=0.96, bottom=0.07, left=0.09, right=0.99, hspace=HSPACE)
plt.show()



## Average negativity probability over time (top 6 publications by total article count (1995–2025))

In [None]:
# Average negativity probability over time (top 6 publications by total article count (1995–2025))
# Articles that have both a month and a negativity score, with year_month
# parsed to timestamps (rows that fail to parse are dropped).
neg_time = (
    articles_all.dropna(subset=["year_month", "neg_prob"])
    .assign(year_month=lambda d: pd.to_datetime(d["year_month"], errors="coerce"))
    .dropna(subset=["year_month"])
)

# Mean negativity per publication and month, ordered by publication then time.
_neg_keys = ["Publication_display", "Publication_key", "year_month"]
monthly_neg = (
    neg_time.groupby(_neg_keys, as_index=False)
    .agg(mean_neg_prob=("neg_prob", "mean"))
    .sort_values(["Publication_display", "year_month"])
)

# Palette helper (colours come from master_palette via Publication_key)
def palette_by_display(df, master_palette):
    """Build a {Publication_display: colour} dict for the rows of *df*.

    The colour is looked up in *master_palette* by the row's
    Publication_key; keys missing from the palette get grey "#999999".
    If one display label appears with several keys, the last pair wins.
    """
    pairs = df[["Publication_display", "Publication_key"]].drop_duplicates()
    return {
        display: master_palette.get(key, "#999999")
        for display, key in zip(pairs["Publication_display"], pairs["Publication_key"])
    }

pal = palette_by_display(monthly_neg, master_palette)

# Month-start axis for the whole analysis window.
start_year, end_year = 1995, 2025
all_months = pd.date_range(
    start=f"{start_year}-01-01", end=f"{end_year}-12-01", freq="MS"
)

pubs = sorted(monthly_neg["Publication_display"].unique().tolist())

# Complete (month x publication) grid. Missing means stay NaN so the plotted
# lines show gaps rather than invented data.
grid = pd.MultiIndex.from_product(
    [all_months, pubs], names=["year_month", "Publication_display"]
)
_neg_by_month_pub = monthly_neg.set_index(
    ["year_month", "Publication_display"]
)["mean_neg_prob"]
monthly_full = _neg_by_month_pub.reindex(grid).rename("neg_mean").reset_index()

sns.set_context("talk", font_scale=1.15)
plt.rcParams.update(
    {
        "axes.titlesize": 22,
        "axes.labelsize": 16,
        "xtick.labelsize": 11,
        "ytick.labelsize": 13,
        "legend.fontsize": 12,
    }
)

# TOP 6 publications (by total articles, 1995–2025) WITH neg_prob data
# Rows of yearly_counts whose year lies inside the window (both ends included).
win_mask = yearly_counts["year"].between(1995, 2025)

# Publications that actually have negativity data
pubs_with_neg = monthly_neg["Publication_display"].unique()

_eligible = yearly_counts.loc[
    win_mask & yearly_counts["Publication_display"].isin(pubs_with_neg)
]
_articles_per_pub = _eligible.groupby("Publication_display")["articles"].sum()
top6 = _articles_per_pub.sort_values(ascending=False).head(6).index.tolist()

# Restrict the completed monthly grid to those publications
monthly_full_top = monthly_full[monthly_full["Publication_display"].isin(top6)].copy()

# Palette for exactly these publications (keeps their assigned colours)
pal_top = {p: pal.get(p, "#999999") for p in top6}

# Full-width line chart, legend underneath, every year labeled
fig, ax = plt.subplots(figsize=(32, 22))

# One line per top-6 publication; NaN months in the grid appear as gaps.
sns.lineplot(
    data=monthly_full_top.sort_values(["Publication_display","year_month"]),
    x="year_month", y="neg_mean",
    hue="Publication_display",
    palette=pal_top,
    marker="o",
    linewidth=3.0,
    markersize=7,
    ax=ax
)

# Fixed 0-1 range, matching the probability scale.
ax.set_ylim(0, 1)
ax.set_ylabel("Negativity probability (0–1)")
ax.set_xlabel("Month")
ax.set_title("Average negativity probability over time — Top 6 publications (1995–2025)")

# Show every year on the x-axis
ax.xaxis.set_major_locator(mdates.YearLocator(base=1))
ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y"))
plt.setp(ax.get_xticklabels(), rotation=90, ha="center")

ax.grid(axis="y", linestyle=":", alpha=.35)

# Legend underneath.
# Drop the first entry only when it is the hue variable's own name. Slicing
# unconditionally would remove a real publication whenever seaborn does not
# emit that title entry.
handles, labels = ax.get_legend_handles_labels()
if labels and labels[0].lower() == "publication_display":
    handles, labels = handles[1:], labels[1:]
ax.legend(
    handles=handles, labels=labels,
    title="Publication",
    ncol=min(6, len(top6)),
    loc="upper center", bbox_to_anchor=(0.5, -0.10), frameon=False,
    fontsize=16,           # legend text
    title_fontsize=16      # legend title
)

plt.tight_layout()
plt.show()


## Average negativity probability over time (Per year)
- Produces 30+ graphs (one per year); uncomment to run
- This loop creates one line chart per year showing the average negativity probability (0–1) by month for each publication.
- For every year, it builds a complete 12-month grid per outlet (leaving NaN gaps where an outlet had no data so lines don’t fake zeros), filters to outlets that actually have values that year, and can optionally keep only the top-N outlets.
- It then plots the month-by-month lines with your publication colors, forces clear month labels, fixes the y-axis to 0–1 for comparability, and places a wrapped legend below the figure so the plot area stays uncluttered.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Config
ONLY_TOP_N = None   # int (e.g. 6) caps the publications drawn per year; None keeps all
FIGSIZE = (28, 12)
LINEWIDTH, MARKERSIZE = 3.0, 7

# Distinct calendar years present in monthly_neg, ascending.
_neg_years = monthly_neg["year_month"].dt.year.dropna().astype(int)
years_in_data = _neg_years.sort_values().unique().tolist()
pubs_all = sorted(monthly_neg["Publication_display"].unique())

def monthly_neg_grid_for_year(df, year, pubs):
    """Return a full (publication x month 1-12) negativity table for *year*.

    Rows of *df* whose "year_month" falls in *year* are re-indexed onto
    every combination of *pubs* and the twelve months. Combinations without
    a row stay NaN so that plots show gaps. The value column is returned
    under the name "neg_mean".
    """
    index_cols = ["Publication_display", "month"]
    in_year = df.loc[df["year_month"].dt.year == year]
    slim = in_year.assign(month=in_year["year_month"].dt.month)[
        index_cols + ["mean_neg_prob"]
    ]
    full_index = pd.MultiIndex.from_product([pubs, range(1, 13)], names=index_cols)
    return (
        slim.set_index(index_cols)
        .reindex(full_index)
        .rename(columns={"mean_neg_prob": "neg_mean"})
        .reset_index()
    )


# One figure per year in years_in_data: monthly mean negativity per publication.
for y in years_in_data:
    # 1) Full (publication x 12 months) table for this year; NaN marks gaps.
    df_y = monthly_neg_grid_for_year(monthly_neg, y, pubs_all)

    # 2) Keep only pubs with at least one real value this year
    keep_pubs = (
        df_y.groupby("Publication_display")["neg_mean"]
            .apply(lambda s: s.notna().any())
    )
    keep_pubs = keep_pubs[keep_pubs].index.tolist()
    if not keep_pubs:
        # Nothing to draw for this year.
        continue

    # ALWAYS filter df_y to the pubs we will actually color
    df_y = df_y[df_y["Publication_display"].isin(keep_pubs)]

    # 3) Optionally limit to top-N by months-with-data in that year
    if ONLY_TOP_N is not None and len(keep_pubs) > ONLY_TOP_N:
        month_counts = (
            df_y.groupby("Publication_display")["neg_mean"]
                .apply(lambda s: s.notna().sum())
                .sort_values(ascending=False)
        )
        keep_pubs = month_counts.head(ONLY_TOP_N).index.tolist()
        df_y = df_y[df_y["Publication_display"].isin(keep_pubs)]

    # 4) Build palette exactly for the pubs we are plotting, and set hue_order
    hue_order = sorted(keep_pubs)
    pal_local = {p: pal.get(p, "#999999") for p in hue_order}

    # 5) Plot
    plt.figure(figsize=FIGSIZE)
    ax = sns.lineplot(
        data=df_y.sort_values(["Publication_display","month"]),
        x="month", y="neg_mean",
        hue="Publication_display",
        hue_order=hue_order,
        palette=pal_local,
        marker="o",
        linewidth=LINEWIDTH,
        markersize=MARKERSIZE,
    )

    # Month numbers 1-12 on the x axis, labelled with month names; the y axis
    # is fixed to 0-1 so the yearly charts are comparable.
    ax.set_xlim(1, 12)
    ax.set_xticks(range(1, 13))
    ax.set_xticklabels(["Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"])
    ax.set_ylim(0, 1)
    ax.set_yticks([0, .25, .5, .75, 1])
    ax.grid(axis="y", linestyle=":", alpha=.35)

    ax.set_title(f"Average negativity probability by month — {y}")
    ax.set_xlabel("Month")
    ax.set_ylabel("Negativity probability (0–1)")

    # Legend below the chart; column count is ceil(n_pubs / 3), clamped to 2..8.
    ncols = min(8, max(2, int(np.ceil(len(hue_order)/3))))
    ax.legend(
        title="Publication",
        bbox_to_anchor=(0.5, -0.18),
        loc="upper center",
        ncol=ncols,
        frameon=False
    )

    plt.tight_layout()
    # Applied after tight_layout to reserve room at the bottom for the legend.
    plt.subplots_adjust(bottom=0.22)
    plt.show()


# Cross-source comparisons: Soros mentions per article 

## (1) Cross-source comparisons: Soros mentions per article  “all increased / all decreased” shading
- Big event detection & synchronized surges: Code 1 with METRIC="articles" (coverage) or "mentions" (total chatter).
- No average applied
- Each series is smoothed with a rolling mean (`SMOOTH_WINDOW`, default 3 months).
- Best to view for major agenda spikes across outlets. If all four outlets publish more Soros pieces, you’ll see synchronized “all increased” months. It’s ideal for spotting event-driven coverage waves.

### Overview:
- mentions: total Soros mentions in a month. Spikes when a newsroom publishes more pieces or writes pieces with lots of mentions. It’s a volume pulse of the conversation.
- articles: number of Soros-related articles in a month. Pure coverage volume (how many pieces), regardless of how intensely each article mentions Soros.

In [None]:
# Monthly per-source table
# One boolean row mask per source, keyed by the label used in the charts.
src_map = {
    name: articles_all["Publication_display"].eq(name)
    for name in ("EIR", "RT", "Zavtra", "Tsargrad")
}

monthly = []
for label, mask in src_map.items():
    dated = articles_all.loc[mask].dropna(subset=["year_month"])
    # Snap every timestamp to the first day of its month.
    month_start = (
        pd.to_datetime(dated["year_month"], errors="coerce")
        .dt.to_period("M")
        .dt.to_timestamp(how="start")
    )
    # Summed mentions and distinct articles per month, tagged with the source.
    tmp = (
        dated.assign(year_month=month_start)
        .groupby("year_month", as_index=False)
        .agg(mentions=("mentions", "sum"), articles=("article_key", "nunique"))
        .assign(source=label)
    )
    monthly.append(tmp)

monthly = pd.concat(monthly, ignore_index=True)

# Continuous monthly index 1995–2025
start = pd.Timestamp("1995-01-01")
end = pd.Timestamp("2025-12-01")
all_months = pd.date_range(start, end, freq="MS")

# 2) Metric selection & smoothing configuration
METRIC = "mentions"    # one of "avg" | "mentions" | "articles"
MIN_ARTICLES = 1       # "avg" is blanked for months where a source had < N articles
SMOOTH_WINDOW = 3      # rolling window in months

def make_wide_counts(col):
    """Pivot column `col` of `monthly` to month x source, aligned on `all_months`.

    Months missing from `monthly` are kept as NaN rows.
    """
    by_source = monthly.pivot(index="year_month", columns="source", values=col)
    return by_source.reindex(all_months)

def make_wide_metric(metric):
    """Return the wide (month x source) table for the requested metric.

    "articles" and "mentions" return the counts with missing months filled
    with 0. Any other value returns mentions divided by articles, set to NaN
    wherever the article count is below MIN_ARTICLES.
    """
    if metric in ("articles", "mentions"):
        return make_wide_counts(metric).fillna(0)

    # Average mentions per article; low-volume months are masked out
    mentions_wide = make_wide_counts("mentions")
    articles_wide = make_wide_counts("articles")
    ratio = mentions_wide / articles_wide
    return ratio.where(articles_wide >= MIN_ARTICLES)

wide = make_wide_metric(METRIC)

# Rolling mean per source; min_periods=1 keeps the first months of the window
wide_smooth = wide.rolling(SMOOTH_WINDOW, min_periods=1).mean()

# Month-over-month change of the smoothed series
diff = wide_smooth.diff(periods=1)

# A month is flagged only when every column moved the same way; a NaN change
# compares as False, so a month with any missing change is never flagged.
all_up = (diff > 0).all(axis="columns")
all_down = (diff < 0).all(axis="columns")

# Plot: one line per source, with shading where all sources moved together
color_map = dict(
    EIR=eir_colors.get("eir", "#3066a8"),
    RT=rt_colors.get("rt", "#31a354"),
    Zavtra=master_palette.get("Zavtra", "#5b31a3"),
    Tsargrad=master_palette.get("Tsargrad", "#a35b31"),
)

# (title, y-label) per metric, split into the two lookup tables used below
metric_text = {
    "articles": ("Total Soros-related articles per month (1995–2025)", "Articles"),
    "mentions": ("Total Soros mentions per month (1995–2025)", "Mentions"),
    "avg": ("Average Soros mentions per article per month (1995–2025)", "Avg mentions per article"),
}
title_map = {key: text[0] for key, text in metric_text.items()}
ylabel_map = {key: text[1] for key, text in metric_text.items()}

sns.set_context("talk", font_scale=1.1)
plt.rcParams.update({
    "axes.titlesize": 22,
    "axes.labelsize": 16,
    "xtick.labelsize": 11,
    "ytick.labelsize": 13,
    "legend.fontsize": 13,
})

fig, ax = plt.subplots(figsize=(32, 16))

for col in ("EIR", "RT", "Zavtra", "Tsargrad"):
    if col not in wide_smooth.columns:
        continue
    ax.plot(
        wide_smooth.index, wide_smooth[col],
        marker="o", linewidth=3.0, markersize=6.5,
        label=col, color=color_map[col],
    )

# Shade flagged months; capture the y-limits first and restore them afterwards
ymin, ymax = ax.get_ylim()
half_month = pd.offsets.Day(15)
for x, inc, dec in zip(wide_smooth.index, all_up, all_down):
    if not (bool(inc) or bool(dec)):
        continue
    shade = "green" if inc else "red"
    ax.axvspan(x - half_month, x + half_month, color=shade, alpha=0.15)
ax.set_ylim(ymin, ymax)

# X axis: one tick per year, labels rotated vertically
ax.set_xlim(start, end)
ax.xaxis.set_major_locator(mdates.YearLocator(1))
ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y"))
plt.setp(ax.get_xticklabels(), rotation=90, ha="center")

ax.set_title(title_map[METRIC])
ax.set_xlabel("Month")
ax.set_ylabel(ylabel_map[METRIC])
ax.grid(axis="y", linestyle=":", alpha=.35)

# Legend sits below the axes; the bottom margin is widened to make room
ax.legend(
    title="Source", ncol=4, loc="upper center",
    bbox_to_anchor=(0.5, -0.10), frameon=False,
)

plt.tight_layout()
plt.subplots_adjust(bottom=0.16)
plt.show()


# (2) Cross-source comparisons: Soros Average mentions per article  “all increased / all decreased” shading
- Bias/intensity shifts inside articles: Code 2 (avg) with sensible MIN_ARTICLES, MIN_SOURCES, EPS, and smoothing.
- Average applied
- avg = mentions / articles, but only where articles ≥ MIN_ARTICLES; otherwise set to NaN. Then we smooth with a rolling mean (SMOOTH_WINDOW, default 3 months).
- Best for when you care about narrative intensity rather than just volume. If an outlet keeps publishing the same number of pieces but starts stuffing each with more Soros references, avg climbs while articles stays flat.
    - Intensity may genuinely be stable even as volume changes—so avg won’t move while mentions / articles do.
### Overview:
- avg: mentions per article = mentions / articles. This normalizes for output and shows intensity per piece (“how Soros-heavy is the average article?”).

In [None]:
# Boolean row selectors, one per single-title source
src_map = {
    name: articles_all["Publication_display"].eq(name)
    for name in ("EIR", "RT", "Zavtra", "Tsargrad")
}

monthly = []
for label, mask in src_map.items():
    rows = articles_all.loc[mask].dropna(subset=["year_month"])
    # Normalise dates to month start so each row lands in its calendar-month bucket
    rows = rows.assign(
        year_month=pd.to_datetime(rows["year_month"], errors="coerce")
        .dt.to_period("M")
        .dt.to_timestamp(how="start")
    )
    tmp = rows.groupby("year_month", as_index=False).agg(
        mentions=("mentions", "sum"),
        articles=("article_key", "nunique"),
    )
    tmp = tmp.assign(source=label)
    monthly.append(tmp)
monthly = pd.concat(monthly, ignore_index=True)

# Continuous monthly index 1995–2025 (month-start timestamps)
start, end = pd.Timestamp("1995-01-01"), pd.Timestamp("2025-12-01")
all_months = pd.date_range(start, end, freq="MS")

# Parameters for the average-mentions-per-article metric and the shading flags
MIN_ARTICLES   = 1     # was 3; loosen so more months survive
SMOOTH_WINDOW  = 3     # 3-month rolling mean
EPS            = 0.01  # ignore tiny wiggles
MIN_SOURCES    = 3     # require at least K sources present that month

def wide_counts(col):
    """Pivot column `col` of `monthly` to month x source on the `all_months` index."""
    pivoted = monthly.pivot(index="year_month", columns="source", values=col)
    return pivoted.reindex(all_months)

w_m = wide_counts("mentions")
w_a = wide_counts("articles")

# Mentions per article; NaN wherever the month has fewer than MIN_ARTICLES articles
avg = w_m.div(w_a).where(w_a >= MIN_ARTICLES)
avg_smooth = avg.rolling(SMOOTH_WINDOW, min_periods=1).mean()

# Tolerant "all up / all down": changes within +/-EPS count as flat
d = avg_smooth.diff()

present = d.count(axis=1)              # number of sources with a valid change
inc_any = d.gt(EPS).any(axis=1)        # at least one source rose
dec_any = d.lt(-EPS).any(axis=1)       # at least one source fell
has_mix = inc_any & dec_any            # movement in both directions

# Flag when some source moved, none moved the other way, and enough are present
enough_sources = present >= MIN_SOURCES
all_up   = inc_any & ~has_mix & enough_sources
all_down = dec_any & ~has_mix & enough_sources

# Line colours: project palettes first, hard-coded hex as fallback
color_map = dict(
    EIR=eir_colors.get("eir", "#3066a8"),
    RT=rt_colors.get("rt", "#31a354"),
    Zavtra=master_palette.get("Zavtra", "#5b31a3"),
    Tsargrad=master_palette.get("Tsargrad", "#a35b31"),
)

sns.set_context("talk", font_scale=1.1)
plt.rcParams.update({
    "axes.titlesize": 22,
    "axes.labelsize": 16,
    "xtick.labelsize": 11,
    "ytick.labelsize": 13,
    "legend.fontsize": 13,
})

fig, ax = plt.subplots(figsize=(32, 16))

for src in ("EIR", "RT", "Zavtra", "Tsargrad"):
    if src not in avg_smooth.columns:
        continue
    ax.plot(
        avg_smooth.index, avg_smooth[src],
        marker="o", linewidth=3.0, markersize=6.5,
        label=src, color=color_map[src],
    )

# Shade flagged months; y-limits are captured first and restored afterwards
ymin, ymax = ax.get_ylim()
half_month = pd.offsets.Day(15)
for x, up, down in zip(avg_smooth.index, all_up, all_down):
    if not (bool(up) or bool(down)):
        continue
    shade = "green" if up else "red"
    ax.axvspan(x - half_month, x + half_month, color=shade, alpha=0.15)
ax.set_ylim(ymin, ymax)

# X axis: one tick per year
ax.set_xlim(start, end)
ax.xaxis.set_major_locator(mdates.YearLocator(1))
ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y"))
plt.setp(ax.get_xticklabels(), rotation=90, ha="center")

ax.set_title("Average Soros mentions per article per month (smoothed) — 1995–2025")
ax.set_xlabel("Month")
ax.set_ylabel("Avg mentions per article")
ax.grid(axis="y", linestyle=":", alpha=.35)

# Legend below the axes
ax.legend(
    title="Source", ncol=4, loc="upper center",
    bbox_to_anchor=(0.5, -0.10), frameon=False,
)

plt.tight_layout()
plt.subplots_adjust(bottom=0.16)
plt.show()

# Sanity check: how many months were flagged in each direction
n_up, n_down = int(all_up.sum()), int(all_down.sum())
print("Flagged months — all up:", n_up, "| all down:", n_down)


In [None]:
# Publications in scope:
#   - the EastView titles = rows whose Publication_key is a custom_palette key
#   - plus the four single sources (EIR, RT, Zavtra, Tsargrad)
ev_mask   = articles_all["Publication_key"].isin(custom_palette.keys())
ev_display = articles_all.loc[ev_mask, "Publication_display"].dropna()
ev_titles = ev_display.sort_values().unique().tolist()

singles   = ["EIR", "RT", "Zavtra", "Tsargrad"]
pubs_scope = singles + ev_titles

# Monthly per-publication table (mentions & distinct articles)
in_scope = articles_all["Publication_display"].isin(pubs_scope)
monthly = articles_all.loc[in_scope].dropna(subset=["year_month"])
monthly = monthly.assign(
    # snap to month start so rows group per calendar month
    year_month=pd.to_datetime(monthly["year_month"], errors="coerce")
    .dt.to_period("M")
    .dt.to_timestamp(how="start")
)
monthly = monthly.groupby(
    ["year_month", "Publication_display", "Publication_key"], as_index=False
).agg(
    mentions=("mentions", "sum"),
    articles=("article_key", "nunique"),
)

# Continuous monthly index 1995–2025
start = pd.Timestamp("1995-01-01")
end = pd.Timestamp("2025-12-01")
all_months  = pd.date_range(start, end, freq="MS")

# Parameters for the average metric and the shading flags
MIN_ARTICLES   = 1     # keep months where a pub has at least this many articles
SMOOTH_WINDOW  = 3     # rolling window, in months
EPS            = 0.01  # changes within +/-EPS count as flat
MIN_SOURCES    = 3     # pubs with a valid change needed before a month can be flagged

def wide_counts(df, col):
    """Return `col` of `df` as a month x Publication_display table on `all_months`."""
    return df.pivot(
        index="year_month", columns="Publication_display", values=col
    ).reindex(all_months)

w_m = wide_counts(monthly, "mentions")
w_a = wide_counts(monthly, "articles")

# Mentions per article; NaN wherever a pub has fewer than MIN_ARTICLES that month
avg = w_m.div(w_a).where(w_a >= MIN_ARTICLES)
avg_smooth = avg.rolling(SMOOTH_WINDOW, min_periods=1).mean()

# “All up / all down” flags across every series in scope
d = avg_smooth.diff()

present = d.count(axis=1)            # pubs with a valid month-over-month change
inc_any = d.gt(EPS).any(axis=1)
dec_any = d.lt(-EPS).any(axis=1)
has_mix = inc_any & dec_any

# Tolerant rule: some pub moved, none moved the opposite way, enough pubs present.
# For the literal "every series moved the same way" rule, use .all(axis=1) instead.
enough_pubs = present >= MIN_SOURCES
all_up   = inc_any & ~has_mix & enough_pubs
all_down = dec_any & ~has_mix & enough_pubs

# Colors for every publication we’ll plot
def palette_by_display(df, master_palette):
    """Map each Publication_display in `df` to a colour.

    The colour is looked up in `master_palette` by the row's Publication_key;
    keys not in the palette get the grey "#999999".
    """
    pairs = df[["Publication_display", "Publication_key"]].drop_duplicates()
    return {
        display: master_palette.get(key, "#999999")
        for display, key in zip(pairs["Publication_display"], pairs["Publication_key"])
    }

pal = palette_by_display(monthly, master_palette)

sns.set_context("talk", font_scale=1.1)
plt.rcParams.update({
    "axes.titlesize": 22,
    "axes.labelsize": 16,
    "xtick.labelsize": 11,
    "ytick.labelsize": 13,
    "legend.fontsize": 13,
})

fig, ax = plt.subplots(figsize=(32, 16))

# Plot only publications that have a column (some may have no data after filtering)
pubs_to_plot = [name for name in pubs_scope if name in avg_smooth.columns]
for p in pubs_to_plot:
    line_color = pal.get(p, "#999999")
    ax.plot(
        avg_smooth.index, avg_smooth[p],
        marker="o", linewidth=3.0, markersize=6.5,
        label=p, color=line_color,
    )

# Shade flagged months; y-limits are captured first and restored afterwards
ymin, ymax = ax.get_ylim()
half_month = pd.offsets.Day(15)
for x, up, down in zip(avg_smooth.index, all_up, all_down):
    if not (bool(up) or bool(down)):
        continue
    shade = "green" if up else "red"
    ax.axvspan(x - half_month, x + half_month, color=shade, alpha=0.15)
ax.set_ylim(ymin, ymax)

# X axis: yearly ticks 1995–2025
ax.set_xlim(start, end)
ax.xaxis.set_major_locator(mdates.YearLocator(1))
ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y"))
plt.setp(ax.get_xticklabels(), rotation=90, ha="center")

ax.set_title("Average Soros mentions per article per month (smoothed) — EV 11 + EIR + RT + Zavtra + Tsargrad")
ax.set_xlabel("Month")
ax.set_ylabel("Avg mentions per article")
ax.grid(axis="y", linestyle=":", alpha=.35)

# Legend below: between 3 and 8 columns depending on how many pubs are drawn
ncols = int(np.ceil(len(pubs_to_plot) / 4))
ncols = min(8, max(3, ncols))
ax.legend(
    title="Publication", ncol=ncols, loc="upper center",
    bbox_to_anchor=(0.5, -0.10), frameon=False,
)

plt.tight_layout()
plt.subplots_adjust(bottom=0.16)
plt.show()

n_up, n_down = int(all_up.sum()), int(all_down.sum())
print("Flagged months — all up:", n_up, "| all down:", n_down)


# (2) Cross-source comparisons: Soros Average mentions per article  “all increased / all decreased” shading (each year view)
- Produces 30+ graphs (one per year); uncomment to run

In [None]:
# Per-year charts of smoothed average mentions per article for the four
# single sources (EIR, RT, Zavtra, Tsargrad).
#
# This cell reuses `avg_smooth`, EPS and MIN_SOURCES from an earlier cell.
# `avg_smooth` may hold more publications than the four drawn here, so the
# y-scale, the "years with data" filter and the all-up / all-down shading are
# derived from the plotted sources only. Otherwise the shading would describe
# series that are not on the chart.
from datetime import datetime

plot_sources = [s for s in ["EIR", "RT", "Zavtra", "Tsargrad"] if s in avg_smooth.columns]
avg_src = avg_smooth[plot_sources]

# Years (within 1995–2025) in which at least one plotted source has a value
years_in_data = sorted(
    int(yr_num)
    for yr_num in avg_src.dropna(how="all").index.year.unique()
    if 1995 <= yr_num <= 2025
)

# Tolerant flags on the plotted sources: some source moved by more than EPS,
# none moved the other way, and at least MIN_SOURCES have a valid change.
d_src = avg_src.diff()
present_src = d_src.notna().sum(axis=1)
inc_src = (d_src > EPS).any(axis=1)
dec_src = (d_src < -EPS).any(axis=1)
all_up_src = inc_src & ~dec_src & (present_src >= MIN_SOURCES)
all_down_src = dec_src & ~inc_src & (present_src >= MIN_SOURCES)

# Colors (same mapping as the full-timeline charts)
color_map = {
    "EIR":      eir_colors.get("eir", "#3066a8"),
    "RT":       rt_colors.get("rt", "#31a354"),
    "Zavtra":   master_palette.get("Zavtra", "#5b31a3"),
    "Tsargrad": master_palette.get("Tsargrad", "#a35b31"),
}

# Style
sns.set_context("talk", font_scale=1.1)
plt.rcParams.update({
    "axes.titlesize": 22, "axes.labelsize": 16,
    "xtick.labelsize": 12, "ytick.labelsize": 13,
    "legend.fontsize": 13
})

# --- Dynamic scaling controls ---
HEADROOM = 1.20     # multiply the year's peak by this for top of y-axis
MIN_TOP  = 0.30     # floor for the top of the y-axis in very quiet years

for y in years_in_data:
    # Slice the smoothed monthly averages of the plotted sources for this year
    mask = avg_src.index.year == y
    df_y = avg_src.loc[mask]
    if df_y.dropna(how="all").empty:
        continue

    # Year-specific y-top from the peak of the plotted series
    yr_peak = float(np.nanmax(df_y.values))
    if not np.isfinite(yr_peak) or yr_peak <= 0:
        yr_peak = MIN_TOP
    y_top = max(yr_peak * HEADROOM, MIN_TOP)

    fig, ax = plt.subplots(figsize=(30, 12))

    # One line per source
    for src in plot_sources:
        ax.plot(
            df_y.index, df_y[src],
            marker="o", linewidth=3.0, markersize=6.5,
            label=src, color=color_map.get(src, "#999999")
        )

    # Shade months where the plotted sources moved the same way; the lower
    # y-limit is captured before shading and reapplied with the dynamic top.
    ymin, _ = ax.get_ylim()
    for x in df_y.index:
        up = bool(all_up_src.get(x, False))
        down = bool(all_down_src.get(x, False))
        if up or down:
            ax.axvspan(x - pd.offsets.Day(15), x + pd.offsets.Day(15),
                       color=("green" if up else "red"), alpha=0.15)
    ax.set_ylim(ymin, y_top)

    # X axis: month names for this year
    ax.set_xlim(datetime(y, 1, 1), datetime(y, 12, 31))
    ax.xaxis.set_major_locator(mdates.MonthLocator(interval=1))
    ax.xaxis.set_major_formatter(mdates.DateFormatter("%b"))
    plt.setp(ax.get_xticklabels(), rotation=0)

    ax.set_title(f"Average Soros mentions per article — {y} (dynamic y-scale)")
    ax.set_xlabel("Month")
    ax.set_ylabel("Avg mentions per article")
    ax.grid(axis="y", linestyle=":", alpha=.35)

    # Legend below
    ax.legend(
        title="Source", ncol=4, loc="upper center",
        bbox_to_anchor=(0.5, -0.10), frameon=False
    )

    plt.tight_layout()
    plt.subplots_adjust(bottom=0.16)
    plt.show()


## (2) Cross-source comparisons: Soros Average mentions per article  “all increased / all decreased” shading (each year view) - All Eastview Publications
- Produces 30+ graphs (one per year); uncomment to run

In [None]:
# Window and thresholds for the per-year charts
START_YEAR, END_YEAR = 1995, 2025
MIN_ARTICLES  = 1      # months where a pub has fewer articles get a NaN average
SMOOTH_WINDOW = 3      # rolling months for smoothing (1 = raw)
EPS           = 0.01   # month-over-month changes within +/-EPS count as flat
MIN_SOURCES   = 3      # pubs with a valid change needed to evaluate "ALL ↑/↓"

# Publication scope: EastView titles (custom_palette keys) + the four singles
ev_mask   = articles_all["Publication_key"].isin(custom_palette.keys())
ev_display = articles_all.loc[ev_mask, "Publication_display"].dropna()
ev_titles = ev_display.sort_values().unique().tolist()
singles   = ["EIR", "RT", "Zavtra", "Tsargrad"]
pubs_scope = singles + ev_titles

# Monthly table per pub (mentions, distinct articles)
in_scope = articles_all["Publication_display"].isin(pubs_scope)
monthly = articles_all.loc[in_scope].dropna(subset=["year_month"])
monthly = monthly.assign(
    # snap to month start so rows group per calendar month
    year_month=pd.to_datetime(monthly["year_month"], errors="coerce")
    .dt.to_period("M")
    .dt.to_timestamp(how="start")
)
monthly = monthly.groupby(
    ["year_month", "Publication_display", "Publication_key"], as_index=False
).agg(
    mentions=("mentions", "sum"),
    articles=("article_key", "nunique"),
)

# Continuous monthly index for the whole window
start = pd.Timestamp(f"{START_YEAR}-01-01")
end = pd.Timestamp(f"{END_YEAR}-12-01")
all_months = pd.date_range(start, end, freq="MS")

def wide_counts(df, col):
    """Pivot `col` of `df` to month x Publication_display, aligned on `all_months`."""
    pivoted = df.pivot(index="year_month", columns="Publication_display", values=col)
    return pivoted.reindex(all_months)

w_m = wide_counts(monthly, "mentions")
w_a = wide_counts(monthly, "articles")

# Mentions per article, masked to NaN where a pub has fewer than MIN_ARTICLES;
# then a rolling mean (min_periods=1 keeps the leading months of each window)
avg = w_m.div(w_a).where(w_a >= MIN_ARTICLES)
avg_smooth = avg.rolling(SMOOTH_WINDOW, min_periods=1).mean()

# Palette {Publication_display -> color}
def palette_by_display(df, master_palette):
    """Build {Publication_display: colour} from the display/key pairs in `df`.

    Colours come from `master_palette` keyed by Publication_key, falling back
    to grey ("#999999") for keys the palette does not contain.
    """
    unique_pairs = df[["Publication_display", "Publication_key"]].drop_duplicates()
    displays = unique_pairs["Publication_display"].tolist()
    keys = unique_pairs["Publication_key"].tolist()
    return {name: master_palette.get(key, "#999999") for name, key in zip(displays, keys)}
pal = palette_by_display(monthly, master_palette)

# Years inside the configured window that have at least one non-NaN value
years_with_data = set(avg_smooth.dropna(how="all").index.year)
years_in_data = [y for y in range(START_YEAR, END_YEAR + 1) if y in years_with_data]

# Plot styling
sns.set_context("talk", font_scale=1.1)
plt.rcParams.update({
    "axes.titlesize": 22, "axes.labelsize": 16,
    "xtick.labelsize": 12, "ytick.labelsize": 13,
    "legend.fontsize": 13
})
month_labels = ["Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"]

# Month-over-month change on the FULL timeline. Differencing each year's slice
# separately would leave January without a predecessor (always NaN), so
# January could never be shaded and the December->January move would be lost.
mom_change = avg_smooth.diff()

for y in years_in_data:
    # ----- slice to this year -----
    in_year = avg_smooth.index.year == y
    yr = avg_smooth.loc[in_year]
    if yr.empty or yr.isna().all().all():
        continue

    # Pubs with any data this year
    pubs_y = [c for c in yr.columns if yr[c].notna().any()]
    if not pubs_y:
        continue

    # Month numbers for x axis
    mnums = yr.index.month
    # Tidy frame for seaborn (NaNs are kept so gaps appear in the lines)
    df_y = yr.copy()
    df_y["month"] = mnums
    long_y = df_y.melt(id_vars="month", var_name="Publication", value_name="avg")

    # ----- "ALL ↑ / ALL ↓" for this year's months, on the smoothed series -----
    # Tolerant rule: some pub moved by more than EPS, none moved the opposite
    # way, and at least MIN_SOURCES pubs have a valid change.
    d = mom_change.loc[in_year]
    present = d.notna().sum(axis=1)
    inc_any = (d >  EPS).any(axis=1)
    dec_any = (d < -EPS).any(axis=1)
    has_mix = inc_any & dec_any
    all_up_y   = inc_any & ~has_mix & (present >= MIN_SOURCES)
    all_down_y = dec_any & ~has_mix & (present >= MIN_SOURCES)

    # Dynamic y-limit (scale to this year's peak so small years aren't squished)
    yr_peak = float(yr.max(skipna=True).max())
    yr_ylim = (yr_peak * 1.15) if np.isfinite(yr_peak) and yr_peak > 0 else 1.0

    # ----- plot -----
    fig, ax = plt.subplots(figsize=(28, 12))
    sns.lineplot(
        data=long_y.sort_values(["Publication","month"]),
        x="month", y="avg",
        hue="Publication",
        hue_order=sorted(pubs_y),
        palette={p: pal.get(p, "#999999") for p in pubs_y},
        marker="o", linewidth=3.0, markersize=6.5, ax=ax
    )

    ax.set_xlim(1, 12)
    ax.set_xticks(range(1,13))
    ax.set_xticklabels(month_labels)
    ax.set_ylim(0, yr_ylim)
    ax.grid(axis="y", linestyle=":", alpha=.35)

    ax.set_title(f"Average Soros mentions per article per month (smoothed) — {y}")
    ax.set_xlabel("Month")
    ax.set_ylabel("Avg mentions per article")

    # Shade months where all moved the same direction (flats tolerated);
    # y-limits are captured first and restored after shading.
    ymin, ymax = ax.get_ylim()
    for month_num, up, down in zip(mnums, all_up_y, all_down_y):
        if bool(up) or bool(down):
            ax.axvspan(month_num - 0.5, month_num + 0.5,
                       color=("green" if up else "red"), alpha=0.15)
    ax.set_ylim(ymin, ymax)

    # Legend below
    ncols = min(8, max(2, int(np.ceil(len(pubs_y)/4))))
    ax.legend(
        title="Publication",
        ncol=ncols,
        loc="upper center",
        bbox_to_anchor=(0.5, -0.12),
        frameon=False
    )

    plt.tight_layout()
    plt.subplots_adjust(bottom=0.18)
    plt.show()

# Cross-source comparisons: Soros negative probability per article “all increased / all decreased” shading

In [None]:
# CONFIG: smoothing, flag thresholds and the time window for this cell
SMOOTH_WINDOW = 3   # rolling window in months; set to 1 for no smoothing
EPS           = 0.005  # ignore tiny wiggles (month-over-month change smaller than this)
MIN_PUBS      = 3   # require at least this many pubs present to evaluate "ALL ↑/↓"
START_YEAR, END_YEAR = 1995, 2025  # x-axis window

# Monthly average negativity probability
#    neg_prob is read directly from articles_all (derived earlier from score+probability)
#    Only comparable sources are kept: EastView 11 + Zavtra + Tsargrad

# Expected: custom_palette holds the 11 EV keys (e.g., "kommersant", "pravda", ...),
# and master_palette also includes "Zavtra" and "Tsargrad".
def palette_by_display(df, master_palette):
    """Return {Publication_display: colour} for every display/key pair in `df`.

    Each colour is `master_palette[Publication_key]`, or grey ("#999999") when
    the key is not in the palette.
    """
    pairs = df[["Publication_display", "Publication_key"]].drop_duplicates()
    return {
        display: master_palette.get(key, "#999999")
        for display, key in pairs.itertuples(index=False, name=None)
    }

# Article rows that have everything needed for the negativity time series
neg_time = (
    articles_all
      .dropna(subset=["year_month", "neg_prob", "Publication_display", "Publication_key"])
      .copy()
)
# Snap dates to month start, as the other monthly cells do. Without this, any
# year_month value that is not already a month-start timestamp would not match
# the month-start index used by reindex(all_months) below and would be dropped.
neg_time["year_month"] = (
    pd.to_datetime(neg_time["year_month"], errors="coerce")
      .dt.to_period("M").dt.to_timestamp(how="start")
)
neg_time = neg_time.dropna(subset=["year_month"])

# Filter to comparable sources (EV 11 via palette keys) + Zavtra + Tsargrad
keep_mask = (
    neg_time["Publication_key"].isin(custom_palette.keys()) |
    neg_time["Publication_display"].isin(["Zavtra", "Tsargrad"])
)
neg_time = neg_time.loc[keep_mask].copy()

# Monthly mean neg_prob per publication
monthly_neg = (
    neg_time
    .groupby(["Publication_display","Publication_key","year_month"], as_index=False)
    .agg(mean_neg_prob=("neg_prob","mean"))
    .sort_values(["Publication_display","year_month"])
)

# Build palette {Publication_display -> color}
pal = palette_by_display(monthly_neg, master_palette)

# Wide matrix (rows=month, cols=publication) on a continuous monthly index
start, end = pd.Timestamp(f"{START_YEAR}-01-01"), pd.Timestamp(f"{END_YEAR}-12-01")
all_months = pd.date_range(start, end, freq="MS")
pubs = sorted(monthly_neg["Publication_display"].unique().tolist())

wide = (
    monthly_neg
      .pivot(index="year_month", columns="Publication_display", values="mean_neg_prob")
      .reindex(all_months)   # keep NaN where a pub had no articles that month
)

# Smooth, then take month-over-month changes
wide_smooth = wide.rolling(window=SMOOTH_WINDOW, min_periods=1).mean()
d = wide_smooth.diff()

# How many pubs have a valid month-over-month change
present = d.notna().sum(axis=1)

# "Some up" / "some down" using EPS; flats (|diff| <= EPS) are neutral
some_up   = (d >  EPS).any(axis=1)
some_down = (d < -EPS).any(axis=1)

# Tolerant definitions: at least one pub moved in that direction, none moved
# the opposite way, and at least MIN_PUBS pubs have a valid change
all_up   = some_up   & ~some_down & (present >= MIN_PUBS)
all_down = some_down & ~some_up   & (present >= MIN_PUBS)

print("Flagged months — ALL UP:", int(all_up.sum()), " | ALL DOWN:", int(all_down.sum()))

# Plot: full-width figure, yearly ticks, green/red shading, legend underneath
sns.set_context("talk", font_scale=1.1)
plt.rcParams.update({
    "axes.titlesize": 22,
    "axes.labelsize": 16,
    "xtick.labelsize": 11,
    "ytick.labelsize": 13,
    "legend.fontsize": 14,
})

fig, ax = plt.subplots(figsize=(32, 16))

# One line per publication
for pub in pubs:
    line_color = pal.get(pub, "#999999")
    ax.plot(
        wide_smooth.index, wide_smooth[pub],
        marker="o", linewidth=3.0, markersize=6.5,
        label=pub, color=line_color,
    )

# X axis: full window, one tick per year
ax.set_xlim(start, end)
ax.xaxis.set_major_locator(mdates.YearLocator(1))
ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y"))
plt.setp(ax.get_xticklabels(), rotation=90, ha="center")

# Shade flagged months; y-limits are captured first and restored afterwards
ymin, ymax = ax.get_ylim()
half_month = pd.offsets.Day(15)
for x, up, down in zip(wide_smooth.index, all_up, all_down):
    if not (bool(up) or bool(down)):
        continue
    shade = "green" if up else "red"
    ax.axvspan(x - half_month, x + half_month, color=shade, alpha=0.15)
ax.set_ylim(ymin, ymax)

ax.set_title("Average negativity probability per month (smoothed) — EV 11 + Zavtra + Tsargrad")
ax.set_xlabel("Month")
ax.set_ylabel("Negativity probability (0–1)")
ax.set_ylim(0, 1)  # final y-range: the natural 0–1 scale for probabilities
ax.grid(axis="y", linestyle=":", alpha=.35)

# Legend centred below the plot: between 2 and 8 columns
ncols = int(np.ceil(len(pubs) / 3))
ncols = min(8, max(2, ncols))
ax.legend(
    title="Publication", ncol=ncols, loc="upper center",
    bbox_to_anchor=(0.5, -0.10), frameon=False,
)

plt.tight_layout()
plt.subplots_adjust(bottom=0.16)
plt.show()

# Cross-source comparisons: Soros negative probability per article “all increased / all decreased” shading (each year view)
- Produces 30+ graphs (one per year); uncomment to run

In [None]:
# CONFIG: smoothing, flag thresholds and the range of years to plot
SMOOTH_WINDOW = 3    # rolling window in months; set to 1 for no smoothing
EPS           = 0.005  # ignore tiny wiggles (month-over-month change smaller than this)
MIN_PUBS      = 3    # require at least this many pubs present to evaluate "ALL ↑/↓"
START_YEAR, END_YEAR = 1995, 2025

# Monthly average negativity probability (EV11 + Zavtra + Tsargrad)
def palette_by_display(df, master_palette):
    """Map Publication_display values in `df` to colours.

    The colour for a display name is `master_palette` looked up by the row's
    Publication_key; missing keys fall back to grey ("#999999").
    """
    pairs = df[["Publication_display", "Publication_key"]].drop_duplicates()
    colours = [master_palette.get(key, "#999999") for key in pairs["Publication_key"]]
    return dict(zip(pairs["Publication_display"], colours))

# Article rows that have everything needed for the negativity time series
neg_time = (
    articles_all
      .dropna(subset=["year_month", "neg_prob", "Publication_display", "Publication_key"])
      .copy()
)
# Snap dates to month start, as the other monthly cells do. Without this, any
# year_month value that is not already a month-start timestamp would not match
# the month-start index used by reindex(all_months) below and would be dropped.
neg_time["year_month"] = (
    pd.to_datetime(neg_time["year_month"], errors="coerce")
      .dt.to_period("M").dt.to_timestamp(how="start")
)
neg_time = neg_time.dropna(subset=["year_month"])

# Keep ONLY comparable sources (EV11 via custom_palette keys) + Zavtra + Tsargrad
keep_mask = (
    neg_time["Publication_key"].isin(custom_palette.keys()) |
    neg_time["Publication_display"].isin(["Zavtra", "Tsargrad"])
)
neg_time = neg_time.loc[keep_mask].copy()

# Monthly mean neg_prob per publication
monthly_neg = (
    neg_time
    .groupby(["Publication_display","Publication_key","year_month"], as_index=False)
    .agg(mean_neg_prob=("neg_prob","mean"))
    .sort_values(["Publication_display","year_month"])
)

# Colors
pal = palette_by_display(monthly_neg, master_palette)

# Continuous monthly index; wide matrix (rows=month, cols=publication)
start, end = pd.Timestamp(f"{START_YEAR}-01-01"), pd.Timestamp(f"{END_YEAR}-12-01")
all_months = pd.date_range(start, end, freq="MS")
pubs_all   = sorted(monthly_neg["Publication_display"].unique().tolist())

wide = (
    monthly_neg
      .pivot(index="year_month", columns="Publication_display", values="mean_neg_prob")
      .reindex(all_months)    # keep NaN where a pub had no articles that month
)

# Smooth the monthly series before month-over-month changes are computed
wide_smooth = wide.rolling(window=SMOOTH_WINDOW, min_periods=1).mean()

# Years to plot (only those with any data)
years_in_data = [y for y in range(START_YEAR, END_YEAR+1)
                 if wide_smooth.loc[wide_smooth.index.year == y].notna().any().any()]

# Per-year plots with ALL↑/ALL↓ shading
sns.set_context("talk", font_scale=1.1)
plt.rcParams.update({
    "axes.titlesize": 22, "axes.labelsize": 16,
    "xtick.labelsize": 12, "ytick.labelsize": 13,
    "legend.fontsize": 13
})
month_labels = ["Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"]

# Month-over-month change on the FULL timeline. Differencing each year's slice
# separately would leave January without a predecessor (always NaN), so
# January could never be shaded and the December->January move would be lost.
mom_change = wide_smooth.diff()

for y in years_in_data:
    in_year = wide_smooth.index.year == y
    yr = wide_smooth.loc[in_year, pubs_all]
    if yr.empty or yr.isna().all().all():
        continue

    # Keep only pubs with any data this year
    pubs_y = [c for c in yr.columns if yr[c].notna().any()]
    if not pubs_y:
        continue

    # Tolerant ALL↑ / ALL↓ for this year's months: some pub moved by more than
    # EPS, none moved the opposite way, and at least MIN_PUBS have a valid change
    d = mom_change.loc[in_year, pubs_y]
    present = d.notna().sum(axis=1)
    inc_any = (d >  EPS).any(axis=1)
    dec_any = (d < -EPS).any(axis=1)
    has_mix = inc_any & dec_any
    all_up_y   = inc_any & ~has_mix & (present >= MIN_PUBS)
    all_down_y = dec_any & ~has_mix & (present >= MIN_PUBS)

    # Tidy frame for seaborn (NaNs are kept so gaps appear in the lines)
    df_y = yr[pubs_y].copy()
    df_y["month"] = df_y.index.month
    long_y = df_y.melt(id_vars="month", var_name="Publication", value_name="neg_mean")

    # Plot (full width; fixed 0–1 y-scale for probabilities)
    fig, ax = plt.subplots(figsize=(28, 14))
    sns.lineplot(
        data=long_y.sort_values(["Publication","month"]),
        x="month", y="neg_mean",
        hue="Publication",
        hue_order=sorted(pubs_y),
        palette={p: pal.get(p, "#999999") for p in pubs_y},
        marker="o", linewidth=3.0, markersize=6.5, ax=ax
    )

    ax.set_xlim(1, 12)
    ax.set_xticks(range(1,13))
    ax.set_xticklabels(month_labels)
    ax.set_ylim(0, 1)
    ax.set_yticks([0,.25,.5,.75,1])
    ax.grid(axis="y", linestyle=":", alpha=.35)

    ax.set_title(f"Average negativity probability by month — {y} (EV11 + Zavtra + Tsargrad)")
    ax.set_xlabel("Month")
    ax.set_ylabel("Negativity probability (0–1)")

    # Shade months where "all" moved the same direction (flats allowed);
    # y-limits are captured first and restored after shading.
    ymin, ymax = ax.get_ylim()
    months_in_year = yr.index.month
    for m, up, down in zip(months_in_year, all_up_y, all_down_y):
        if bool(up) or bool(down):
            ax.axvspan(m - 0.5, m + 0.5, color=("green" if up else "red"), alpha=0.15)
    ax.set_ylim(ymin, ymax)

    # Legend centered below
    ncols = min(8, max(2, int(np.ceil(len(pubs_y)/4))))
    ax.legend(title="Publication", ncol=ncols, loc="upper center",
              bbox_to_anchor=(0.5, -0.12), frameon=False)

    plt.tight_layout()
    plt.subplots_adjust(bottom=0.18)
    plt.show()


## Heatmaps (mentions & sentiment)

In [None]:
# Heatmap of total monthly Soros mentions per publication.
# Dates are snapped to month start and the pivot is laid onto a continuous
# 1995–2025 monthly index (same window as the other two heatmaps), so every
# column is one calendar month and the January ticks line up with the years.
hm_df = (
    articles_all.dropna(subset=["year_month"])
      .assign(year_month=lambda frame: pd.to_datetime(frame["year_month"], errors="coerce")
                                         .dt.to_period("M").dt.to_timestamp(how="start"))
      .dropna(subset=["year_month"])
      .groupby(["Publication_display","Publication_key","year_month"], as_index=False)
      .agg(mentions=("mentions","sum"))
)
# aggfunc="sum": the chart shows totals. pivot_table's default (mean) would
# average the rows if one display name ever appears under several keys.
hm_pivot = hm_df.pivot_table(index="year_month", columns="Publication_display",
                             values="mentions", aggfunc="sum", fill_value=0)
hm_months = pd.date_range("1995-01-01", "2025-12-01", freq="MS")
hm_pivot = hm_pivot.reindex(hm_months, fill_value=0).sort_index()

plt.figure(figsize=(28, 10))  # big
ax = sns.heatmap(hm_pivot.T, cmap="YlOrRd", cbar_kws={"label":"Mentions"})
ax.set_title("Heatmap — Total Monthly Soros mentions", fontsize=20)
ax.set_ylabel("Publication", fontsize=14)

# Format x-axis: one label per year (at each January), vertical
idx = hm_pivot.index
jan_locs = [i for i, d in enumerate(idx) if d.month == 1]
year_labels = [idx[i].strftime("%Y") for i in jan_locs]

ax.set_xlabel("Year", fontsize=14)
ax.set_xticks(jan_locs)
ax.set_xticklabels(year_labels, rotation=90, fontsize=12)

plt.tight_layout()
plt.show()


In [None]:
# Heatmap of monthly average Soros mentions per article, per publication
d = articles_all.dropna(subset=["year_month"])
d = d.assign(
    # snap to month start so rows group per calendar month
    year_month=pd.to_datetime(d["year_month"], errors="coerce")
    .dt.to_period("M")
    .dt.to_timestamp(how="start")
)

monthly_avg = d.groupby(
    ["Publication_display", "Publication_key", "year_month"], as_index=False
).agg(
    total_mentions=("mentions", "sum"),
    n_articles=("article_key", "nunique"),
)
# Mentions per article; NaN if a group has no articles
monthly_avg = monthly_avg.assign(
    avg_mentions=monthly_avg["total_mentions"]
    .div(monthly_avg["n_articles"])
    .where(monthly_avg["n_articles"] > 0)
)

avg_pivot = monthly_avg.pivot_table(
    index="year_month",
    columns="Publication_display",
    values="avg_mentions",
).sort_index()

# Continuous monthly index so the year ticks line up; months without data stay NaN
start, end = "1995-01-01", "2025-12-01"
idx = pd.date_range(start, end, freq="MS")
avg_pivot_cont = avg_pivot.reindex(idx).sort_index()

# Cap the colour scale at the 95th percentile so one outlier doesn't wash it out;
# fall back to 1.0 when the percentile is not finite or not positive
vals = avg_pivot_cont.to_numpy().astype(float)
p95 = np.nanpercentile(vals, 95)
vmax95 = p95 if np.isfinite(p95) else None
vmax = float(vmax95) if (vmax95 is not None and vmax95 > 0) else 1.0

plt.figure(figsize=(28, 10))
by_pub = avg_pivot_cont.T  # publications on y, months on x
ax = sns.heatmap(
    by_pub,
    cmap="YlGnBu",
    mask=by_pub.isna(),  # hide months with no articles
    vmin=0,
    vmax=vmax,
    cbar_kws={"label": "Avg mentions per article"},
)

ax.set_title("Heatmap — Monthly average Soros mentions per article", fontsize=20)
ax.set_ylabel("Publication", fontsize=14)

# X axis: label only the Januaries, with the year
jan_locs = np.flatnonzero(avg_pivot_cont.index.month == 1).tolist()
year_labels = avg_pivot_cont.index[jan_locs].strftime("%Y").tolist()
ax.set_xlabel("Year", fontsize=14)
ax.set_xticks(jan_locs)
ax.set_xticklabels(year_labels, rotation=90, fontsize=12)

plt.tight_layout()
plt.show()


In [None]:
# Sources whose sentiment scores are comparable: EV11 + Zavtra + Tsargrad
is_ev = articles_all["Publication_key"].isin(custom_palette.keys())
is_single = articles_all["Publication_display"].isin(["Zavtra", "Tsargrad"])
keep_mask = is_ev | is_single

hs_df = articles_all.loc[keep_mask].dropna(subset=["year_month", "neg_prob"])
hs_df = hs_df.assign(
    # snap to month start; unparseable dates become NaT and are dropped next
    year_month=pd.to_datetime(hs_df["year_month"], errors="coerce")
    .dt.to_period("M")
    .dt.to_timestamp(how="start")
)
hs_df = hs_df.dropna(subset=["year_month"])
hs_df = hs_df.groupby(
    ["Publication_display", "Publication_key", "year_month"], as_index=False
).agg(mean_neg=("neg_prob", "mean"))

# Pivot onto a continuous monthly timeline (1995–2025)
months = pd.date_range("1995-01-01", "2025-12-01", freq="MS")
hs_pivot = hs_df.pivot_table(
    index="year_month", columns="Publication_display", values="mean_neg"
)
hs_pivot = hs_pivot.reindex(months).sort_index()

plt.figure(figsize=(28, 10))
neg_by_pub = hs_pivot.T  # publications on y, months on x
ax = sns.heatmap(
    neg_by_pub,
    vmin=0,
    vmax=1,
    cmap="coolwarm_r",
    mask=neg_by_pub.isna(),
    cbar_kws={"label": "Negativity (0–1)"},
)
ax.set_title("Heatmap — Monthly average negativity probability", fontsize=20)
ax.set_ylabel("Publication", fontsize=14)

# Year ticks at each January
jan_locs = np.flatnonzero(hs_pivot.index.month == 1).tolist()
year_labels = hs_pivot.index[jan_locs].strftime("%Y").tolist()
ax.set_xlabel("Year", fontsize=14)
ax.set_xticks(jan_locs)
ax.set_xticklabels(year_labels, rotation=90, fontsize=12)

plt.tight_layout()
plt.show()


# Mentions per 1,000 words (where full text is available)

In [None]:
# Best-effort word counts from common text columns
def word_count_series(df):
    """Return a per-row word count taken from the first known text column.

    The candidate columns are tried in order and the first one present in
    ``df`` is used. Words are whitespace-separated tokens.

    Rows whose text is missing get NaN rather than a count, so callers can
    drop them with ``dropna``. Previously the missing value was stringified
    to ``"nan"`` and counted as one word. Empty strings count as 0 words.

    Returns an all-NaN Series aligned to ``df.index`` when no candidate
    column exists.
    """
    candidates = [
        "ArticleTextEnglish", "ArticleTextEnglish1", "ArticleTextEnglish2",
        "ArticleTextEnglish3", "ArticleTextEnglish4", "ArticleTextEnglish5",
        "ArticleTextEnglish6", "content",
    ]
    for c in candidates:
        if c in df.columns:
            text = df[c]
            counts = text.astype(str).str.split().str.len()
            # Mask rows with no text so they are NaN, not "1 word".
            return counts.where(text.notna()).astype(float)
    return pd.Series(np.nan, index=df.index)

# Build per-publication table with total words + mentions
wc_frames = []
for label, df in [("EastView", eastview_data), ("Zavtra", zavtra_data), ("Tsargrad", tsargrad_data), ("RT", rt_data), ("EIR", eir_data)]:
    if df is None:
        continue

    # Give the frame the source's index up front. Assigning a scalar to a
    # frame with no rows creates a zero-length column, which pandas fills
    # with NaN once the first Series is assigned. The single-outlet sources
    # then have NaN group keys and vanish from the groupby below.
    tmp = pd.DataFrame(index=df.index)

    if label == "EastView":
        # EastView bundles several titles: keep each row's own publication
        # name and derive its key from eastview_map (falling back to a
        # lower-cased, underscore-joined name).
        tmp["Publication_display"] = df["Publication"]
        tmp["Publication_key"] = df["Publication"].map(eastview_map).fillna(
            df["Publication"].astype(str).str.lower().str.replace(" ","_", regex=False)
        )
    else:
        tmp["Publication_display"] = label
        tmp["Publication_key"] = label.lower()

    # Mention counts: prefer "soros_count", then "soros". A source with
    # neither column contributes 0 mentions instead of failing on a scalar.
    mention_col = next((c for c in ("soros_count", "soros") if c in df.columns), None)
    if mention_col is None:
        tmp["mentions"] = 0
    else:
        tmp["mentions"] = pd.to_numeric(df[mention_col], errors="coerce").fillna(0).astype(int)

    tmp["words"] = word_count_series(df)
    wc_frames.append(tmp[["Publication_display","Publication_key","mentions","words"]])

# Rows without a word count are excluded from the rate.
wc = pd.concat(wc_frames, ignore_index=True).dropna(subset=["words"])
per_pub_1000w = (
    wc.groupby(["Publication_display","Publication_key"], as_index=False)
      .agg(total_mentions=("mentions","sum"), total_words=("words","sum"))
      .assign(mentions_per_1000w=lambda d: 1000*d["total_mentions"]/d["total_words"])
      .sort_values("mentions_per_1000w", ascending=False)
)
display(per_pub_1000w)


## Coverage rate vs negativity (who talks a lot & is negative?)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

# --- per-article summary: any mention at all (0/1) and mean negativity ---
def _has_any_mention(counts):
    """Return 1 when at least one value in the group is positive, else 0."""
    return int((counts > 0).any())

article_keys = ["Publication_display","Publication_key","article_key"]
article_level = articles_all.groupby(article_keys, as_index=False).agg(
    has_mention=("mentions", _has_any_mention),
    neg_prob=("neg_prob", "mean"),
)

# --- per-publication summary: share of articles with a mention, mean
# negativity and the number of distinct articles ---
per_pub_cov = article_level.groupby(
    ["Publication_display","Publication_key"], as_index=False
).agg(
    coverage_rate=("has_mention","mean"),
    mean_neg=("neg_prob","mean"),
    n_articles=("article_key","nunique"),
)

# --- palette ---
pal = palette_by_display(per_pub_cov, master_palette)

# --- bubble area (points^2) grows linearly with the article count ---
size_base = 40
size_k    = 3.0
sizes = size_base + size_k * per_pub_cov["n_articles"].to_numpy()

fig, ax = plt.subplots(figsize=(22, 12))

# One colour per row; publications missing from the palette are drawn grey.
point_colors = [pal.get(name, "#999999") for name in per_pub_cov["Publication_display"]]
ax.scatter(
    per_pub_cov["coverage_rate"],
    per_pub_cov["mean_neg"],
    s=sizes,
    c=point_colors,
    alpha=0.9,
    linewidths=0.5,
    edgecolors="white",
)

ax.set_xlabel("Coverage rate (share with ≥1 mention)")
ax.set_ylabel("Mean negativity (0–1)")
ax.set_title("Coverage vs negativity by publication (bubble size = #articles)")
ax.grid(True, linestyle=":", alpha=.5)

# -----------------------------
# Legend below the axes
# -----------------------------

# Colour entries: one fixed-size swatch per publication.
pubs = per_pub_cov["Publication_display"].tolist()
color_handles = []
for p in pubs:
    swatch = Line2D(
        [0], [0], marker='o', linestyle='',
        markerfacecolor=pal.get(p, "#999999"), markeredgecolor="none",
        markersize=10, label=p,
    )
    color_handles.append(swatch)

# Size entries: example bubbles at the 20th/50th/80th percentile article
# counts (rounded, duplicates collapsed).
size_quantiles = np.quantile(per_pub_cov["n_articles"], [0.2, 0.5, 0.8])
ex_values = np.unique(np.round(size_quantiles).astype(int))
size_handles = []
for v in ex_values:
    example = plt.scatter(
        [], [], s=size_base + size_k * v, color="#777", alpha=.35,
        edgecolor="none", label=f"{v} articles",
    )
    size_handles.append(example)

# Single combined legend: colour swatches first, size examples after them.
handles = color_handles + size_handles
ncols_color = min(6, max(2, int(np.ceil(len(color_handles)/3))))
ncols_size  = len(size_handles)

leg = ax.legend(
    handles=handles,
    loc="upper center",
    bbox_to_anchor=(0.5, -0.14),   # anchor below the axes
    frameon=False,
    ncol=max(ncols_color, ncols_size),
    title="Publication  •  Bubble size = #articles"
)

plt.tight_layout()
plt.subplots_adjust(bottom=0.22)     # leave room for the legend under the plot
plt.show()


## Word clouds & top keywords per publication

In [None]:
import subprocess
import sys

# Install the word-cloud dependencies with the interpreter that runs this
# notebook (sys.executable), so the packages land in the active environment.
pip_install_cmd = [sys.executable, "-m", "pip", "install", "nltk", "wordcloud"]
subprocess.check_call(pip_install_cmd)


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from nltk.corpus import stopwords
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation

In [None]:
# WORD CLOUDS (one per publication, using assigned colors)

# Ensure stopwords are available
nltk.download('stopwords')

# --- Stopword sets (EN + RU + your custom adds) ---
stop_en = set(stopwords.words('english'))
try:
    stop_ru = set(stopwords.words('russian'))
except (LookupError, OSError):
    # Best effort: carry on without Russian stop words when the corpus or
    # its Russian word list is unavailable. Only these lookup/file errors
    # are swallowed; a bare `except:` would also hide KeyboardInterrupt
    # and genuine bugs.
    stop_ru = set()
custom_stops = {
    # add/remove as you like
    "soros","soroses","sorosomania","sorosites",
    "said","say","says","also","one","two","new","would","could",
    "в","и","на","с","что","по","это","как","из","за","для","к","от","не","его","ее","их","мы","они","он","она"
}
# Union used by the word-cloud token filter.
STOP = stop_en | stop_ru | custom_stops

# --- Display name -> palette key ---
# Single-outlet sources first; the EastView titles are layered on top, so an
# eastview_map entry wins if a name appears in both.
disp_to_key = {
    "RT": "rt",
    "EIR": "eir",
    "Zavtra": "Zavtra",
    "Tsargrad": "Tsargrad",
}
disp_to_key.update(eastview_map)

def pub_color(pub_display):
    """Look up the hex colour for a publication display name in master_palette.

    The name is resolved to a palette key by trying, in order, the exact
    display name, the whitespace-stripped name, and finally the lower-cased
    name itself. Keys absent from master_palette yield "#444444".
    """
    stripped_or_lower = disp_to_key.get(pub_display.strip(), pub_display.lower())
    key = disp_to_key.get(pub_display, stripped_or_lower)
    return master_palette.get(key, "#444444")

# --- Helper: get best-available text columns from a DataFrame ---
# Ordered by preference; "soros_sentence" is short and only a last resort.
TEXT_CANDIDATES = [
    "ArticleTextEnglish", "ArticleText", "article_text",
    "translated_article_excerpt", "content", "text",
    "soros_sentence"  # last resort (short)
]

def collect_text_from_df(df):
    """Concatenate the text of the best available column into one string.

    Candidate columns are tried in TEXT_CANDIDATES order. A column that is
    present but holds no non-null values is skipped, so a frame concatenated
    from several sources (where the preferred column exists but is empty for
    this subset) still yields text from a later candidate.

    Returns "" when ``df`` is None, has no rows, or no candidate column
    contains any non-null value. Values are joined with single spaces.
    """
    if df is None or len(df) == 0:
        return ""
    for col in TEXT_CANDIDATES:
        if col not in df.columns:
            continue
        values = df[col].dropna()
        if values.empty:
            # Column exists but carries nothing for these rows; try the next one.
            continue
        return " ".join(values.astype(str).tolist())
    return ""

# --- {Publication_display: concatenated text}, built from the source frames ---
pub_text = {}

# Single-outlet sources: looked up by global name, so a frame that was never
# loaded is passed on as None.
single_sources = [
    ("Zavtra", "zavtra_data"),
    ("Tsargrad", "tsargrad_data"),
    ("RT", "rt_data"),
    ("EIR", "eir_data"),
]
for label, frame_name in single_sources:
    pub_text[label] = collect_text_from_df(globals().get(frame_name))

# EastView: one entry per value of its "Publication" column.
ev_df = globals().get("eastview_data")
if ev_df is not None and "Publication" in ev_df.columns:
    pub_text.update(
        (name, collect_text_from_df(sub))
        for name, sub in ev_df.groupby("Publication", dropna=True)
    )

# Publications that are still empty get a second chance from articles_all.
missing_pubs = [p for p, t in pub_text.items() if not t]
for p in missing_pubs:
    txt = collect_text_from_df(articles_all.loc[articles_all["Publication_display"] == p])
    if txt:
        pub_text[p] = txt

# --- Single-colour word clouds: every word is painted with the same hex ---
def solid_color_func(hex_color):
    """Build a colour callback that ignores its arguments and returns hex_color."""
    return lambda *args, **kwargs: hex_color

# Rendering options passed to every WordCloud (tune for looks/legibility).
WC_KW = {
    "width": 1400,
    "height": 800,
    "background_color": "white",
    "max_words": 300,
    "collocations": False,      # do not merge frequent bigrams into one entry
    "prefer_horizontal": 0.9,
}

for pub, txt in pub_text.items():
    # Nothing to draw for missing, non-string or whitespace-only text.
    if not isinstance(txt, str) or not txt.strip():
        continue

    # Keep purely alphabetic tokens that are not stop words (case-insensitive).
    tokens = [word for word in txt.split() if word.isalpha() and word.lower() not in STOP]
    if not tokens:
        continue

    wc = WordCloud(**WC_KW).generate(" ".join(tokens))
    color = pub_color(pub)
    recolored = wc.recolor(color_func=solid_color_func(color))

    plt.figure(figsize=(12, 6))
    plt.imshow(recolored, interpolation="bilinear")
    plt.title(f"Word cloud — {pub}", fontsize=18)
    plt.axis("off")
    plt.tight_layout()
    plt.show()

# Sanity check: how many characters of text each publication contributed.
for p, t in pub_text.items():
    print(f"{p:15s} | characters: {len(t):>8d}")


## TF-IDF keywords, SVD scatter, and LDA topics (per publication)

In [None]:
def get_text_series(df):
    """Return the first available text column of ``df`` as strings.

    Candidate columns are tried in order. Missing values become "" rather
    than the literal string "nan": the fill must happen before the string
    conversion, otherwise there is nothing left to fill.

    Returns an empty Series when ``df`` has none of the candidate columns.
    """
    for c in ["ArticleTextEnglish","article_text","translated_article_excerpt","content"]:
        if c in df.columns:
            return df[c].fillna("").astype(str)
    return pd.Series([], dtype=str)


#----------
# ------ PUBLICATIONS CHANGE HERE ------
# pick 6 focal publications
focals = ["Zavtra","Tsargrad","RT","EIR","Kommersant","Vedomosti"]
#----------

# docs and labels are parallel lists: labels[i] is the publication of docs[i].
docs = []
labels = []
for pub in focals:
    if pub == "Zavtra":
        s = get_text_series(zavtra_data)
    elif pub == "Tsargrad":
        s = get_text_series(tsargrad_data)
    # For the columns read directly, fill missing text BEFORE converting to
    # str; the other order leaves the literal token "nan" in the corpus.
    elif pub == "RT":
        s = rt_data["content"].fillna("").astype(str)
    elif pub == "EIR":
        s = eir_data["content"].fillna("").astype(str)
    else:
        # Any other name is treated as an EastView title.
        s = eastview_data.loc[eastview_data["Publication"]==pub, "ArticleTextEnglish"].fillna("").astype(str)
    if len(s) == 0:
        continue
    docs.extend(s.tolist())
    labels.extend([pub]*len(s))

# TF-IDF over the pooled focal corpus
vec = TfidfVectorizer(max_features=10000, stop_words="english")
X = vec.fit_transform(docs)

# Top TF-IDF terms per publication, from a dense documents-by-terms table
# tagged with each document's publication.
# NOTE(review): if "pub" is itself a vocabulary term, the label column below
# replaces that term's score column.
feature_names = vec.get_feature_names_out()
tfidf_df = pd.DataFrame(X.toarray(), columns=feature_names)
tfidf_df["pub"] = labels
top_terms = {}
for pub, g in tfidf_df.groupby("pub"):
    per_term_mean = g.drop(columns=["pub"]).mean()
    mean_scores = per_term_mean.sort_values(ascending=False).head(20)
    top_terms[pub] = mean_scores
    print(f"\nTop TF-IDF terms — {pub}\n", mean_scores.head(10))

# SVD to 2D (document scatter)
svd = TruncatedSVD(n_components=2, random_state=42)
XY = svd.fit_transform(X)
plt.figure(figsize=(12, 14))
labels_arr = np.asarray(labels)
for pub in np.unique(labels):
    idx = np.flatnonzero(labels_arr == pub)
    # Colour lookup: EastView titles map through eastview_map, other names
    # are tried lower-cased (e.g. "rt"). If that key is absent, fall back to
    # the display name itself before giving up on grey.
    # NOTE(review): other cells in this notebook key Zavtra/Tsargrad by their
    # capitalised names; confirm against the master_palette definition.
    palette_key = eastview_map.get(pub, pub.lower())
    color = master_palette.get(palette_key, master_palette.get(str(pub), "#999999"))
    plt.scatter(XY[idx,0], XY[idx,1], s=8, alpha=0.5, label=pub, color=color)
plt.legend(markerscale=3, bbox_to_anchor=(1.02, 1), loc="upper left", frameon=False)
plt.title("SVD (LSA) document scatter by publication")
plt.xlabel("Component 1"); plt.ylabel("Component 2")
plt.tight_layout(); plt.show()

# LDA topics (example K=8 on whole corpus), fitted on the TF-IDF matrix X
lda = LatentDirichletAllocation(n_components=8, random_state=42, learning_method="batch")
lda_Z = lda.fit_transform(X)
terms = vec.get_feature_names_out()

def top_words(topic_vec, n=10):
    """Return the n terms with the largest weights in topic_vec, heaviest first."""
    heaviest = np.argsort(topic_vec)[-n:]
    return [terms[i] for i in reversed(heaviest)]

for k, comp in enumerate(lda.components_):
    words = top_words(comp, n=12)
    print(f"Topic {k+1}: ", ", ".join(words))


# SVD (LSA) scatterplot for the six focal publications (extended stop-word list)

In [None]:
def get_text_series(df):
    """Return the first available text column of ``df`` as strings.

    Candidate columns are tried in order. Missing values become "" rather
    than the literal string "nan": the fill must happen before the string
    conversion, otherwise there is nothing left to fill.

    Returns an empty Series when ``df`` has none of the candidate columns.
    """
    for c in ["ArticleTextEnglish","article_text","translated_article_excerpt","content"]:
        if c in df.columns:
            return df[c].fillna("").astype(str)
    return pd.Series([], dtype=str)
    
# Corpus-specific stop words layered on top of scikit-learn's English list.
EXTRA_STOPS = {
    # reporting verbs and honorifics
    "said","say","says","mr","ms","mrs",
    # relative-time and calendar words
    "today","yesterday","tomorrow","week","weeks","month","months","year","years","daily",
    # newsroom / web boilerplate
    "news","report","reports","reported","according","update","live","video",
    "photo","photos","twitter","facebook","gov","govt","via","amp",
    # typographic punctuation
    "—","–","’","“","”",
}

# Sorted list of the combined stop words.
STOPWORDS = sorted(EXTRA_STOPS.union(ENGLISH_STOP_WORDS))

# A token is a run of three or more ASCII letters, so 1–2 letter words and
# numbers never match.
TOKEN_PATTERN = r"(?u)\b[a-zA-Z]{3,}\b"

# Collect data
focals = ["Zavtra","Tsargrad","RT","EIR","Kommersant","Vedomosti"]

# docs and labels are parallel lists: labels[i] is the publication of docs[i].
docs = []
labels = []
for pub in focals:
    if pub == "Zavtra":
        s = get_text_series(zavtra_data)
    elif pub == "Tsargrad":
        s = get_text_series(tsargrad_data)
    # For the columns read directly, fill missing text BEFORE converting to
    # str; the other order leaves the literal token "nan" in the corpus.
    elif pub == "RT":
        s = rt_data["content"].fillna("").astype(str)
    elif pub == "EIR":
        s = eir_data["content"].fillna("").astype(str)
    else:
        # Any other name is treated as an EastView title.
        s = eastview_data.loc[eastview_data["Publication"]==pub, "ArticleTextEnglish"].fillna("").astype(str)
    if len(s) == 0:
        continue
    docs.extend(s.tolist())
    labels.extend([pub] * len(s))

# TF-IDF
vec = TfidfVectorizer(
    max_features=10000,
    stop_words=STOPWORDS,
    token_pattern=TOKEN_PATTERN,   # reuse the constant instead of repeating the regex
    lowercase=True,
    max_df=0.90,
    min_df=2,
    strip_accents="unicode"        # optional: normalize accented chars
)
X = vec.fit_transform(docs)


terms = vec.get_feature_names_out()

# Top TF-IDF terms per publication
top_terms = {}
labels_arr = np.array(labels)
for pub in sorted(set(labels)):
    idx = np.where(labels_arr == pub)[0]
    if len(idx) == 0:
        continue
    # Column means over this publication's rows, computed on the sparse
    # matrix. np.asarray(...).ravel() flattens the result whether the mean
    # comes back as an np.matrix or as a plain ndarray (`.A1` exists only
    # on np.matrix).
    mean_vec = np.asarray(X[idx].mean(axis=0)).ravel()
    top_idx = mean_vec.argsort()[-20:][::-1]      # 20 largest means, descending
    top_terms[pub] = pd.Series(mean_vec[top_idx], index=terms[top_idx])
    print(f"\nTop TF-IDF terms — {pub}\n", top_terms[pub].head(10))

# SVD (LSA) scatter
svd = TruncatedSVD(n_components=2, random_state=42)
XY = svd.fit_transform(X)

plt.figure(figsize=(12, 14), dpi=120)
# The palette objects come from earlier cells; when they are not in scope
# every publication is drawn grey. Checked once, not per publication.
have_palette = 'master_palette' in globals() and 'eastview_map' in globals()
for pub in sorted(set(labels)):
    idx = np.where(labels_arr == pub)[0]
    if have_palette:
        # EastView titles map through eastview_map, other names are tried
        # lower-cased (e.g. "rt"). If that key is absent, fall back to the
        # display name itself before giving up on grey.
        # NOTE(review): other cells in this notebook key Zavtra/Tsargrad by
        # their capitalised names; confirm against master_palette.
        palette_key = eastview_map.get(pub, pub.lower())
        color = master_palette.get(palette_key, master_palette.get(pub, "#999999"))
    else:
        color = "#999999"
    plt.scatter(
        XY[idx, 0], XY[idx, 1],
        s=14, alpha=0.55, label=pub, color=color, edgecolors="none"
    )

plt.title("SVD (LSA) document scatter by publication", fontsize=22, pad=12)
plt.xlabel("Component 1", fontsize=16)
plt.ylabel("Component 2", fontsize=16)
plt.grid(ls=":", alpha=.25)

# Legend is anchored below the axes; the bottom margin is widened for it.
plt.legend(
    title="Publication",
    ncol=min(6, len(set(labels))),
    loc="upper center",
    bbox_to_anchor=(0.5, -0.08),
    frameon=False,
    fontsize=14,
    title_fontsize=14
)
plt.tight_layout()
plt.subplots_adjust(bottom=0.12)
plt.show()

# LDA topics (K=8), fitted on the TF-IDF matrix X
lda = LatentDirichletAllocation(n_components=8, random_state=42, learning_method="batch")
lda_Z = lda.fit_transform(X)

def top_words(topic_vec, n=12):
    """Return the n terms with the largest weights in topic_vec, heaviest first."""
    heaviest = np.argsort(topic_vec)[-n:]
    return [terms[i] for i in reversed(heaviest)]

for k, comp in enumerate(lda.components_):
    words = top_words(comp)
    print(f"Topic {k+1}: ", ", ".join(words))

## SVD (LSA) scatterplot for all publications over time

In [None]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.decomposition import TruncatedSVD
from collections import defaultdict

# ========== CONFIG ==========
# Time span and bucketing
START_YEAR, END_YEAR = 1995, 2025
WINDOW_YEARS = 5            # width of each bucket, in years

# Minimum sample sizes
MIN_DOCS_WINDOW = 60        # a window with fewer documents in total is skipped
MIN_DOCS_PER_PUB = 8        # a publication needs at least this many docs in a window

# Speed cap and reproducibility
MAX_DOCS_PER_PUB = 2000     # per-publication cap; set None to disable
RANDOM_SEED = 42

# Plot appearance
POINT_SIZE = 24
POINT_ALPHA = 0.45
CENTROID_SIZE = 180
LABEL_FONTSIZE = 11

# Stop words: scikit-learn's English list plus corpus-specific additions
EXTRA_STOPS = {
    # reporting verbs and honorifics
    "said","say","says","mr","ms","mrs",
    # relative-time and calendar words
    "today","yesterday","tomorrow","week","weeks","month","months","year","years","daily",
    # newsroom / web boilerplate
    "news","report","reports","reported","according","update","live","video",
    "photo","photos","twitter","facebook","gov","govt","via","amp",
    # typographic punctuation
    "—","–","’","“","”",
}
STOPWORDS = sorted(EXTRA_STOPS.union(ENGLISH_STOP_WORDS))

# A token is a run of three or more ASCII letters.
TOKEN_PATTERN = r"(?u)\b[a-zA-Z]{3,}\b"

# Shared generator used when subsampling documents.
rng = np.random.default_rng(RANDOM_SEED)

# ---------- Helpers ----------
def get_text_series(df):
    """Return the first available text column of ``df`` as strings.

    Candidate columns are tried in order. Missing values become "" rather
    than the literal string "nan": the fill must happen before the string
    conversion, otherwise there is nothing left to fill.

    Returns an empty Series when ``df`` has none of the candidate columns.
    """
    for c in ["ArticleTextEnglish","article_text","translated_article_excerpt","content","ArticleText"]:
        if c in df.columns:
            return df[c].fillna("").astype(str)
    return pd.Series([], dtype=str)

def has_year_col(df):
    """Tell whether ``df`` has a "year_month" column with a datetime dtype."""
    if "year_month" not in df.columns:
        return False
    return pd.api.types.is_datetime64_any_dtype(df["year_month"])

def texts_for_pub_range(pub, y0, y1):
    """Return the list of texts one publication has for the years y0..y1 inclusive.

    ``pub`` selects the source frame: "Zavtra", "Tsargrad", "RT" and "EIR"
    use their own frames; any other name is treated as an EastView title and
    matched against eastview_data["Publication"].

    Returns [] when the frame is missing or empty, has no datetime
    "year_month" column, has no recognised text column, or (for EastView)
    has no "Publication" column.

    When more than MAX_DOCS_PER_PUB texts fall in the range, a random subset
    of that size is drawn without replacement from the shared ``rng``.
    Because the generator is shared, two calls with the same arguments can
    return different subsets.
    """
    if pub == "Zavtra":
        df = zavtra_data
    elif pub == "Tsargrad":
        df = tsargrad_data
    elif pub == "RT":
        df = rt_data
    elif pub == "EIR":
        df = eir_data
    else:
        # EastView sub-publication. Without the column there is nothing to
        # match on; return early instead of indexing with a scalar False.
        if "Publication" not in eastview_data.columns:
            return []
        df = eastview_data.loc[eastview_data["Publication"] == pub]

    if df is None or df.empty or not has_year_col(df):
        return []

    s = get_text_series(df)
    if s.empty:
        # No recognised text column: the empty fallback Series does not share
        # df's index, so it cannot be filtered with the year mask below.
        return []

    y = df["year_month"].dt.year
    sel = s[(y >= y0) & (y <= y1)]
    if MAX_DOCS_PER_PUB and len(sel) > MAX_DOCS_PER_PUB:
        idx = rng.choice(len(sel), size=MAX_DOCS_PER_PUB, replace=False)
        sel = sel.iloc[idx]
    return sel.tolist()

def build_palette_from_mapping(names, master_palette, eastview_map=None):
    """Map each display name to a colour taken from master_palette.

    A name is first translated through eastview_map (when one is given and
    non-empty); the resulting key is looked up as-is, then lower-cased.
    Names that resolve to nothing get "#999999".
    """
    def _resolve(name):
        key = eastview_map.get(name, name) if eastview_map else name
        exact = master_palette.get(key)
        if exact:
            return exact
        return master_palette.get(str(key).lower(), "#999999")

    return {name: _resolve(name) for name in names}

# ---------- Publication universe (EastView titles + single-outlet sources) ----------
if "Publication" in eastview_data.columns:
    ev_pubs = sorted(eastview_data["Publication"].dropna().astype(str).unique().tolist())
else:
    ev_pubs = []
pubs_all = ev_pubs + ["Zavtra","Tsargrad","RT","EIR"]

# ---------- Windows: consecutive (first_year, last_year) pairs ----------
# Each window spans WINDOW_YEARS years; the last one is clipped to END_YEAR.
windows = [
    (first_year, min(first_year + WINDOW_YEARS - 1, END_YEAR))
    for first_year in range(START_YEAR, END_YEAR + 1, WINDOW_YEARS)
]

# ---------- (1) One TF-IDF + SVD basis shared by every window ----------
# Pool the documents of every (window, publication) pair that reaches the
# per-publication minimum.
all_docs_for_basis = []
for y0, y1 in windows:
    for pub in pubs_all:
        texts = texts_for_pub_range(pub, y0, y1)
        if len(texts) < MIN_DOCS_PER_PUB:
            continue
        all_docs_for_basis.extend(texts)
print(f"Global basis corpus size: {len(all_docs_for_basis):,} docs")

# Optional speed-up for very large corpora: downsample before fitting.
# if len(all_docs_for_basis) > 200_000:
#     idx = rng.choice(len(all_docs_for_basis), size=200_000, replace=False)
#     all_docs_for_basis = [all_docs_for_basis[i] for i in idx]

tfidf_options = dict(
    max_features=30000,
    stop_words=STOPWORDS,
    token_pattern=TOKEN_PATTERN,
    lowercase=True,
    max_df=0.90,
    min_df=2,
    strip_accents="unicode",
)
vec = TfidfVectorizer(**tfidf_options)
X_global = vec.fit_transform(all_docs_for_basis)

# Fit the 2-D projection once so every window is plotted on the same axes.
svd = TruncatedSVD(n_components=2, random_state=RANDOM_SEED)
svd.fit(X_global)

# ---------- (2) One scatter plot per window ----------
sns.set_context("talk", font_scale=1.1)

for (y0, y1) in windows:
    # Gather this window's documents; publications below the per-publication
    # minimum are left out.
    docs, labels = [], []
    pub_counts = defaultdict(int)
    for pub in pubs_all:
        texts = texts_for_pub_range(pub, y0, y1)
        if len(texts) < MIN_DOCS_PER_PUB:
            continue
        docs.extend(texts)
        labels.extend([pub]*len(texts))
        pub_counts[pub] = len(texts)

    total_docs = len(docs)
    if total_docs < MIN_DOCS_WINDOW:
        print(f"[{y0}–{y1}] skipped — only {total_docs} docs (need ≥{MIN_DOCS_WINDOW}).")
        continue

    # Project with the already-fitted vectorizer and SVD (transform, not fit)
    # so the axes mean the same thing in every window.
    X = vec.transform(docs)
    XY = svd.transform(X)

    unique_labels = sorted(set(labels))
    pal = build_palette_from_mapping(unique_labels, master_palette, eastview_map)

    # ---- Figure ----
    fig, ax = plt.subplots(figsize=(24, 16), dpi=110)
    labels_arr = np.array(labels)
    # Row positions of each publication's documents within XY.
    rows_by_pub = {name: np.where(labels_arr == name)[0] for name in unique_labels}

    # Document points, one series per publication; the legend entry carries
    # the document count.
    for pub, idx in rows_by_pub.items():
        ax.scatter(
            XY[idx, 0], XY[idx, 1],
            s=POINT_SIZE, alpha=POINT_ALPHA,
            label=f"{pub} (n={pub_counts.get(pub,0)})",
            color=pal.get(pub, "#999999"),
            edgecolors="white", linewidths=0.2
        )

    # Centroid marker and name per publication, drawn above the points.
    for pub, idx in rows_by_pub.items():
        cx = np.nanmean(XY[idx, 0])
        cy = np.nanmean(XY[idx, 1])
        ax.scatter(
            cx, cy, s=CENTROID_SIZE, marker="X", color=pal[pub],
            alpha=0.95, edgecolors="black", linewidths=0.6, zorder=5
        )
        ax.text(
            cx, cy, f"  {pub}", va="center", ha="left",
            fontsize=LABEL_FONTSIZE, weight="bold", color=pal[pub], zorder=6
        )

    ax.set_title(f"SVD (LSA) document scatter by publication — {y0}–{y1}", pad=10)
    ax.set_xlabel("Component 1"); ax.set_ylabel("Component 2")
    ax.grid(ls=":", alpha=.25)

    # Legend below the axes: between 2 and 8 columns.
    ncols = min(8, max(2, int(np.ceil(len(unique_labels)/3))))
    ax.legend(
        title="Publication",
        ncol=ncols, loc="upper center", bbox_to_anchor=(0.5, -0.08),
        frameon=False, fontsize=13, title_fontsize=13
    )

    plt.tight_layout()
    plt.subplots_adjust(bottom=0.15)
    plt.show()


In [None]:
# # No labels
# # ==============================
# # SVD (LSA) per 5-year window — EV11 + Zavtra + Tsargrad + RT + EIR
# # ==============================
# import re
# import numpy as np
# import pandas as pd
# import matplotlib.pyplot as plt
# from matplotlib.lines import Line2D
# from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
# from sklearn.decomposition import TruncatedSVD

# # -------------------------------------------------
# # 0) CONFIG
# # -------------------------------------------------
# START_YEAR = 1995
# END_YEAR   = 2025
# WINDOW     = 5                # 5-year windows: 95–99, 00–04, ...
# MIN_DOCS_TOTAL     = 50       # skip window if fewer docs overall
# MIN_DOCS_PER_PUB   = 10       # keep pubs with at least this many docs in the window
# MIN_CHARS_PER_DOC  = 200      # optional: filter very short texts

# POINT_SIZE         = 18
# POINT_ALPHA        = 0.45
# CENTROID_SIZE      = 140

# SHOW_CENTROIDS        = True
# SHOW_CENTROID_LABELS  = False    # <-- toggle centroid text labels here
# SHOW_LEGEND           = True

# TITLE_PREFIX = "SVD (LSA) — content similarity by publication"

# # -------------------------------------------------
# # 1) Utilities
# # -------------------------------------------------
# def first_present(df, cols):
#     for c in cols:
#         if c in df.columns:
#             return c
#     return None

# def pick_first_date(df, candidates):
#     for c in candidates:
#         if c in df.columns:
#             dt = pd.to_datetime(df[c], errors="coerce")
#             if dt.notna().any():
#                 return dt
#     return pd.Series(pd.NaT, index=df.index)

# def to_month_start(dt_ser):
#     dt = pd.to_datetime(dt_ser, errors="coerce")
#     return pd.to_datetime({"year": dt.dt.year, "month": dt.dt.month, "day": 1})

# # If you already have eastview_map/master_palette in memory, we’ll use them.
# # Otherwise, we’ll define safe fallbacks for color lookup.
# eastview_map = globals().get("eastview_map", {
#     "Vedomosti": "vedomosti",
#     "Nezavisimaia gazeta": "nezavisimaia_gazeta",
#     "Trud": "trud",
#     "Время МН": "b_mh",
#     "Kommersant": "kommersant",
#     "Общая газета": "о_г",
#     "Sovetskaia Rossiia": "sovetskaia_rossiia",
#     "Novaia gazeta": "novaia_gazeta",
#     "Slovo": "slovo",
#     "Literaturnaia gazeta": "literaturnaia_gazeta",
#     "Pravda": "pravda",
# })
# master_palette = globals().get("master_palette", {
#     # If your session already has your full palette, it will be used instead.
#     # Add the common singles for safety:
#     "rt":  "#31a354",
#     "eir": "#3066a8",
#     "Zavtra":   "#5b31a3",
#     "Tsargrad": "#a35b31",
# })

# def build_palette(names, master_palette, eastview_map=None):
#     """Map display names to colors using your palette and fallback to gray."""
#     pal = {}
#     for n in names:
#         key = eastview_map.get(n, n) if eastview_map else n
#         col = (master_palette.get(key) or
#                master_palette.get(str(key).lower()) or
#                "#999999")
#         pal[n] = col
#     return pal

# # Stronger stopwords
# EXTRA_STOPS = {
#     "said","say","says","mr","ms","mrs","today","yesterday","tomorrow",
#     "news","report","reports","reported","according","week","weeks","month","months",
#     "year","years","daily","update","live","video","photo","photos","twitter","facebook",
#     "gov","govt","via","—","–","’","“","”","amp"
# }
# STOPWORDS = sorted(set(ENGLISH_STOP_WORDS) | EXTRA_STOPS)
# TOKEN_PATTERN = r"(?u)\b[a-zA-Z]{3,}\b"

# # -------------------------------------------------
# # 2) Assemble one text corpus with year
# #     Inputs expected in scope:
# #     - eastview_data, zavtra_data, tsargrad_data, rt_data, eir_data
# #     Required fields: text-ish columns and a parseable date column.
# # -------------------------------------------------
# def assemble_all_articles(eastview_data, zavtra_data, tsargrad_data, rt_data=None, eir_data=None):
#     frames = []

#     # ---------- EastView (multi-pub) ----------
#     if eastview_data is not None and len(eastview_data):
#         ev = eastview_data.copy()

#         pub_col = first_present(ev, ["Publication", "publication"])
#         if pub_col is None:
#             ev["Publication_clean"] = "unknown"
#         else:
#             ev["Publication_clean"] = ev[pub_col].map(eastview_map).fillna(
#                 ev[pub_col].astype(str).str.lower().str.replace(" ", "_", regex=False)
#             )

#         # year_month
#         if {"Year", "Month"}.issubset(ev.columns):
#             y = pd.to_numeric(ev["Year"], errors="coerce")
#             m = pd.to_numeric(ev["Month"], errors="coerce")
#             ev["year_month"] = pd.to_datetime({"year": y, "month": m, "day": 1}, errors="coerce")
#         else:
#             ev["year_month"] = pick_first_date(
#                 ev, ["PublishedDate","PublicationDate","publication_date","translated_date","Date","date"]
#             )
#             ev["year_month"] = ev["year_month"].dt.to_period("M").dt.to_timestamp(how="start")

#         text_col = first_present(ev, ["ArticleTextEnglish","ArticleText","article_text","translated_article_excerpt","content"])
#         if text_col is None:
#             ev["__text"] = pd.NA
#             text_col = "__text"

#         frames.append(pd.DataFrame({
#             "Publication_clean": ev["Publication_clean"],
#             "year_month": ev["year_month"],
#             "text": ev[text_col].astype(str)
#         }))

#     # ---------- Zavtra ----------
#     if zavtra_data is not None and len(zavtra_data):
#         zv = zavtra_data.copy()
#         zv["Publication_clean"] = "Zavtra"
#         zv_date = pick_first_date(zv, ["PublicationDate","publication_date","translated_date","PublishedDate","Date","date"])
#         zv["year_month"] = zv_date.dt.to_period("M").dt.to_timestamp(how="start")
#         text_col = first_present(zv, ["ArticleTextEnglish","article_text","translated_article_excerpt","content"])
#         if text_col is None:
#             zv["__text"] = pd.NA
#             text_col = "__text"
#         frames.append(pd.DataFrame({
#             "Publication_clean": "Zavtra",
#             "year_month": zv["year_month"],
#             "text": zv[text_col].astype(str)
#         }))

#     # ---------- Tsargrad ----------
#     if tsargrad_data is not None and len(tsargrad_data):
#         ts = tsargrad_data.copy()
#         ts["Publication_clean"] = "Tsargrad"
#         ts_date = pick_first_date(ts, ["PublicationDate","publication_date","translated_date","PublishedDate","Date","date"])
#         ts["year_month"] = ts_date.dt.to_period("M").dt.to_timestamp(how="start")
#         text_col = first_present(ts, ["ArticleTextEnglish","article_text","translated_article_excerpt","content"])
#         if text_col is None:
#             ts["__text"] = pd.NA
#             text_col = "__text"
#         frames.append(pd.DataFrame({
#             "Publication_clean": "Tsargrad",
#             "year_month": ts["year_month"],
#             "text": ts[text_col].astype(str)
#         }))

#     # ---------- RT ----------
#     if rt_data is not None and len(rt_data):
#         rt = rt_data.copy()
#         rt["Publication_clean"] = "RT"
#         rt_date = pick_first_date(rt, ["published_at","PublicationDate","publication_date","translated_date","PublishedDate","Date","date"])
#         rt["year_month"] = rt_date.dt.to_period("M").dt.to_timestamp(how="start")
#         text_col = first_present(rt, ["content","ArticleTextEnglish","article_text","translated_article_excerpt"])
#         if text_col is None:
#             rt["__text"] = pd.NA
#             text_col = "__text"
#         frames.append(pd.DataFrame({
#             "Publication_clean": "RT",
#             "year_month": rt["year_month"],
#             "text": rt[text_col].astype(str)
#         }))

#     # ---------- EIR ----------
#     if eir_data is not None and len(eir_data):
#         ei = eir_data.copy()
#         ei["Publication_clean"] = "EIR"
#         # 'date' often numeric YYYYMMDD
#         if "date" in ei.columns:
#             ei_dt = pd.to_datetime(ei["date"].astype("string"), format="%Y%m%d", errors="coerce")
#         else:
#             ei_dt = pick_first_date(ei, ["published_at","PublicationDate","publication_date","translated_date","PublishedDate","Date","date"])
#         ei["year_month"] = ei_dt.dt.to_period("M").dt.to_timestamp(how="start")
#         text_col = first_present(ei, ["content","ArticleTextEnglish","article_text","translated_article_excerpt"])
#         if text_col is None:
#             ei["__text"] = pd.NA
#             text_col = "__text"
#         frames.append(pd.DataFrame({
#             "Publication_clean": "EIR",
#             "year_month": ei["year_month"],
#             "text": ei[text_col].astype(str)
#         }))

#     all_articles = pd.concat(frames, ignore_index=True)
#     all_articles["year"] = pd.to_datetime(all_articles["year_month"], errors="coerce").dt.year
#     # optional: filter short texts
#     all_articles["text_len"] = all_articles["text"].astype(str).str.len()
#     all_articles = all_articles[all_articles["text_len"] >= MIN_CHARS_PER_DOC].dropna(subset=["year"])
#     return all_articles[["Publication_clean","year","text"]]

# # -------------------------------------------------
# # 3) Plot SVD for a given 5-year window
# # -------------------------------------------------
# def plot_svd_window(df_window, title_suffix, palette):
#     """df_window has columns: Publication_clean, text"""
#     docs   = df_window["text"].astype(str).tolist()
#     labels = df_window["Publication_clean"].astype(str).tolist()
#     labels_arr = np.array(labels)

#     # TF-IDF
#     vec = TfidfVectorizer(
#         max_features=12000,
#         stop_words=STOPWORDS,
#         token_pattern=TOKEN_PATTERN,
#         lowercase=True,
#         max_df=0.92,
#         min_df=2,
#         strip_accents="unicode"
#     )
#     X = vec.fit_transform(docs)

#     # SVD → 2D
#     svd = TruncatedSVD(n_components=2, random_state=42)
#     XY = svd.fit_transform(X)

#     unique_labels = sorted(pd.unique(labels))
#     pal = {k: palette.get(k, "#999999") for k in unique_labels}

#     # --- Figure ---
#     fig, ax = plt.subplots(figsize=(24, 16), dpi=110)

#     # scatter points
#     for pub in unique_labels:
#         idx = np.where(labels_arr == pub)[0]
#         ax.scatter(
#             XY[idx, 0], XY[idx, 1],
#             s=POINT_SIZE, alpha=POINT_ALPHA,
#             label=pub if SHOW_LEGEND else None,
#             color=pal.get(pub, "#999999"),
#             edgecolors="white", linewidths=0.2
#         )

#     # centroids
#     if SHOW_CENTROIDS:
#         for pub in unique_labels:
#             idx = np.where(labels_arr == pub)[0]
#             cx, cy = np.nanmean(XY[idx, 0]), np.nanmean(XY[idx, 1])
#             ax.scatter(cx, cy, s=CENTROID_SIZE, marker="X",
#                        color=pal[pub], alpha=0.95,
#                        edgecolors="black", linewidths=0.6, zorder=5)
#             if SHOW_CENTROID_LABELS:
#                 ax.text(cx, cy, f"  {pub}", va="center", ha="left",
#                         fontsize=14, weight="bold", color=pal[pub], zorder=6)

#     ax.set_title(f"{TITLE_PREFIX} — {title_suffix}", fontsize=22, pad=10)
#     ax.set_xlabel("Component 1", fontsize=15)
#     ax.set_ylabel("Component 2", fontsize=15)
#     ax.grid(ls=":", alpha=.25)

#     # Legend at bottom (manual, stable)
#     if SHOW_LEGEND:
#         handles = [Line2D([0],[0], marker="o", linestyle="",
#                           markersize=8, markerfacecolor=pal.get(n,"#999999"),
#                           markeredgecolor="white", label=n)
#                    for n in unique_labels]
#         ncols = min(8, max(2, int(np.ceil(len(unique_labels)/3))))
#         leg = ax.legend(handles=handles, title="Publication",
#                         ncol=ncols, loc="upper center",
#                         bbox_to_anchor=(0.5, -0.08), frameon=False,
#                         fontsize=13, title_fontsize=13)
#     else:
#         leg = ax.get_legend()
#         if leg: leg.remove()

#     plt.tight_layout()
#     plt.subplots_adjust(bottom=0.12)
#     plt.show()

# # -------------------------------------------------
# # 4) Run: assemble corpus, then loop 5-year windows
# # -------------------------------------------------
# # Expect these DataFrames in memory:
# # eastview_data, zavtra_data, tsargrad_data, rt_data, eir_data
# all_articles_text = assemble_all_articles(
#     eastview_data, zavtra_data, tsargrad_data, rt_data, eir_data
# )

# # Build palette for possible names we’ll see
# all_names = sorted(all_articles_text["Publication_clean"].unique().tolist())
# global_palette = build_palette(all_names, master_palette, eastview_map)

# # Loop windows
# for s in range(START_YEAR, END_YEAR+1, WINDOW):
#     e = min(s + WINDOW - 1, END_YEAR)
#     mask = (all_articles_text["year"] >= s) & (all_articles_text["year"] <= e)
#     df_w = all_articles_text.loc[mask].copy()

#     # filter pubs with enough docs in window
#     counts = df_w["Publication_clean"].value_counts()
#     keep_pubs = counts[counts >= MIN_DOCS_PER_PUB].index.tolist()
#     df_w = df_w[df_w["Publication_clean"].isin(keep_pubs)]

#     n_docs = len(df_w)
#     if n_docs < MIN_DOCS_TOTAL or len(keep_pubs) < 2:
#         print(f"[{s}–{e}] skipped — docs={n_docs}, pubs>={MIN_DOCS_PER_PUB}={len(keep_pubs)}")
#         continue

#     print(f"[{s}–{e}] plotting — docs={n_docs}, pubs={len(keep_pubs)}")
#     plot_svd_window(df_w, f"{s}–{e}", global_palette)


## SVD before 2011 and after 2011

In [None]:
# =============================================
# SVD comparison: BEFORE 2011 (≤2010) vs AFTER 2011 (≥2011)
# - EV 11 + Zavtra + Tsargrad + RT + EIR
# - Shared TF-IDF + SVD space for apples-to-apples
# - Two period plots + centroid shift plot
# =============================================
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.decomposition import TruncatedSVD

# -------------------------
# CONFIG
# -------------------------
# Module-level settings read by the assembling/plotting functions and the
# driver code further down in this cell.
SPLIT_YEAR           = 2011     # compare ≤2010 vs ≥2011
MIN_DOCS_TOTAL       = 100      # skip ALL plots if pre+post docs (kept pubs only) fall below this
MIN_DOCS_PER_PUB_PRE = 10       # require at least this many docs for a pub pre-2011
MIN_DOCS_PER_PUB_POST= 10       # and post-2011
MIN_CHARS_PER_DOC    = 200      # filter super-short texts

# Marker styling: POINT_* apply to document points, CENTROID_SZ to centroid markers.
POINT_SIZE   = 18
POINT_ALPHA  = 0.45
CENTROID_SZ  = 140

SHOW_CENTROIDS        = True      # draw per-publication centroid markers on the period scatters
SHOW_CENTROID_LABELS  = False     # toggle labels on centroid markers
SHOW_LEGEND           = True      # draw the publication legend under the period scatters

# Prefix shared by every figure title produced in this cell.
TITLE_PREFIX = "SVD (LSA) — content similarity by publication"

# -------------------------
# Helpers (palette + parsing)
# -------------------------
def first_present(df, cols):
    """Return the first name in ``cols`` that is a column of ``df``, or ``None``.

    Candidates are checked in the order given, so earlier names take priority.
    """
    available = df.columns
    return next((name for name in cols if name in available), None)

def pick_first_date(df, candidates):
    """Parse the first candidate column that yields at least one valid date.

    Each name in ``candidates`` that exists in ``df`` is parsed with
    ``pd.to_datetime(..., errors="coerce")``; the first result containing any
    non-NaT value is returned in full (unparseable rows stay NaT).

    Returns:
        A Series on ``df.index``. When no candidate produces a valid date the
        Series is entirely NaT.
    """
    for name in candidates:
        if name not in df.columns:
            continue
        parsed = pd.to_datetime(df[name], errors="coerce")
        if not parsed.isna().all():
            return parsed
    return pd.Series(pd.NaT, index=df.index)

# Use your mappings if they exist; otherwise, safe defaults:
# `globals().get(name, default)` keeps an `eastview_map` / `master_palette`
# already defined by an earlier cell and only falls back to the literals below
# when the name is not yet bound.
#
# eastview_map: EastView publication display name -> palette key.
# NOTE(review): the keys must match the values in the data's Publication column
# character-for-character. Confirm the "Время МН" key uses the same script
# (Cyrillic vs Latin letters) as the dataset; a mismatch is not an error, the
# lookup just misses and callers fall back to a derived key or grey.
eastview_map = globals().get("eastview_map", {
    "Vedomosti": "vedomosti",
    "Nezavisimaia gazeta": "nezavisimaia_gazeta",
    "Trud": "trud",
    "Время МН": "b_mh",
    "Kommersant": "kommersant",
    "Общая газета": "о_г",
    "Sovetskaia Rossiia": "sovetskaia_rossiia",
    "Novaia gazeta": "novaia_gazeta",
    "Slovo": "slovo",
    "Literaturnaia gazeta": "literaturnaia_gazeta",
    "Pravda": "pravda",
})
# master_palette: palette key -> hex color. The fallback only covers the four
# single-title sources, so EastView titles resolve to grey in build_palette
# unless an earlier cell supplied a fuller palette.
master_palette = globals().get("master_palette", {
    "rt":  "#31a354",
    "eir": "#3066a8",
    "Zavtra":   "#5b31a3",
    "Tsargrad": "#a35b31",
})

def build_palette(names, master_palette, eastview_map=None):
    """Map each publication name to a color.

    For every name the palette key is ``eastview_map[name]`` when a mapping is
    supplied and contains the name, otherwise the name itself. The color is
    looked up by that key, then by its lowercase form, and defaults to grey
    (``"#999999"``) when neither is found.

    Returns:
        dict of ``name -> color`` in the order the names were given.
    """
    fallback = "#999999"

    def _color_for(name):
        key = eastview_map.get(name, name) if eastview_map else name
        return (
            master_palette.get(key)
            or master_palette.get(str(key).lower())
            or fallback
        )

    return {name: _color_for(name) for name in names}

# Stronger stopword list
# EXTRA_STOPS adds newsroom boilerplate on top of scikit-learn's English list.
# NOTE: TOKEN_PATTERN below only emits runs of 3+ ASCII letters, so the
# two-letter entries ("mr", "ms") and the punctuation entries can never appear
# as tokens; they are harmless but have no effect with this pattern.
EXTRA_STOPS = {
    "said","say","says","mr","ms","mrs","today","yesterday","tomorrow",
    "news","report","reports","reported","according","week","weeks","month","months",
    "year","years","daily","update","live","video","photo","photos","twitter","facebook",
    "gov","govt","via","—","–","’","“","”","amp"
}
# Sorted list (not a set) so the vectorizer receives the stop words in a stable order.
STOPWORDS = sorted(set(ENGLISH_STOP_WORDS) | EXTRA_STOPS)
# Tokens are whole words made of at least three ASCII letters; digits and
# non-Latin scripts are dropped.
TOKEN_PATTERN = r"(?u)\b[a-zA-Z]{3,}\b"

# -------------------------
# Assemble articles (EV 11 + Zavtra + Tsargrad + RT + EIR)
# Expect these to exist: eastview_data, zavtra_data, tsargrad_data, rt_data, eir_data
# -------------------------
def assemble_all_articles(eastview_data, zavtra_data, tsargrad_data, rt_data=None, eir_data=None):
    """Stack every source into one frame of ``Publication_clean``, ``year``, ``text``.

    Any source that is ``None`` or has no rows is skipped. Rows are dropped when
    their text is shorter than ``MIN_CHARS_PER_DOC`` characters or when no year
    could be derived from the source's date columns.

    Args:
        eastview_data: EastView frame. Publications are mapped through the
            module-level ``eastview_map``; unmapped names fall back to their
            lowercased, underscore-joined form. Dates come from ``Year`` +
            ``Month`` when both exist, otherwise from the first usable date column.
        zavtra_data, tsargrad_data: single-title frames labelled "Zavtra" / "Tsargrad".
        rt_data, eir_data: optional single-title frames labelled "RT" / "EIR".
            An EIR ``date`` column, when present, is parsed as ``YYYYMMDD``.

    Returns:
        DataFrame with columns ``Publication_clean``, ``year``, ``text``. When no
        source contributes rows, an empty frame with those columns is returned
        instead of letting ``pd.concat`` raise on an empty list.

    Note:
        A later cell in this notebook defines another function with this same
        name and a different return schema; whichever cell ran last wins.
    """
    out_cols = ["Publication_clean", "year", "text"]
    press_dates = ["PublicationDate", "publication_date", "translated_date",
                   "PublishedDate", "Date", "date"]
    feed_dates = ["published_at"] + press_dates
    press_text = ["ArticleTextEnglish", "article_text",
                  "translated_article_excerpt", "content"]
    feed_text = ["content", "ArticleTextEnglish", "article_text",
                 "translated_article_excerpt"]

    def _has_rows(data):
        """True when the source exists and contains at least one row."""
        return data is not None and len(data) > 0

    def _month_start(dates):
        """Snap parsed dates to the first day of their month."""
        return dates.dt.to_period("M").dt.to_timestamp(how="start")

    def _text(df, candidates):
        """Text of the first available candidate column; missing values become ''."""
        col = first_present(df, candidates)
        if col is None:
            return pd.Series("", index=df.index, dtype=object)
        raw = df[col]
        # Blank out missing values *before* stringifying so they do not turn
        # into the literal strings "nan" / "<NA>".
        return raw.astype(object).where(raw.notna(), "").astype(str)

    def _frame(publication, year_month, text):
        """Standard three-column frame for one source."""
        return pd.DataFrame({
            "Publication_clean": publication,
            "year_month": year_month,
            "text": text,
        })

    frames = []

    # EastView (many publications in one frame)
    if _has_rows(eastview_data):
        pub_col = first_present(eastview_data, ["Publication", "publication"])
        if pub_col is None:
            publication = "unknown"
        else:
            raw_pub = eastview_data[pub_col]
            publication = raw_pub.map(eastview_map).fillna(
                raw_pub.astype(str).str.lower().str.replace(" ", "_", regex=False)
            )
        if {"Year", "Month"}.issubset(eastview_data.columns):
            year_month = pd.to_datetime(
                {
                    "year": pd.to_numeric(eastview_data["Year"], errors="coerce"),
                    "month": pd.to_numeric(eastview_data["Month"], errors="coerce"),
                    "day": 1,
                },
                errors="coerce",
            )
        else:
            year_month = _month_start(pick_first_date(
                eastview_data,
                ["PublishedDate", "PublicationDate", "publication_date",
                 "translated_date", "Date", "date"],
            ))
        ev_text = _text(
            eastview_data,
            ["ArticleTextEnglish", "ArticleText", "article_text",
             "translated_article_excerpt", "content"],
        )
        frames.append(_frame(publication, year_month, ev_text))

    # Single-title sources that share the same date handling
    for name, data, date_cols, text_cols in [
        ("Zavtra", zavtra_data, press_dates, press_text),
        ("Tsargrad", tsargrad_data, press_dates, press_text),
        ("RT", rt_data, feed_dates, feed_text),
    ]:
        if _has_rows(data):
            frames.append(_frame(
                name,
                _month_start(pick_first_date(data, date_cols)),
                _text(data, text_cols),
            ))

    # EIR: a "date" column is stored as YYYYMMDD and needs an explicit format
    if _has_rows(eir_data):
        if "date" in eir_data.columns:
            eir_dates = pd.to_datetime(
                eir_data["date"].astype("string"), format="%Y%m%d", errors="coerce"
            )
        else:
            eir_dates = pick_first_date(eir_data, feed_dates)
        frames.append(_frame("EIR", _month_start(eir_dates), _text(eir_data, feed_text)))

    if not frames:
        return pd.DataFrame(columns=out_cols)

    all_articles = pd.concat(frames, ignore_index=True)
    all_articles["year"] = pd.to_datetime(all_articles["year_month"], errors="coerce").dt.year
    long_enough = all_articles["text"].str.len() >= MIN_CHARS_PER_DOC
    return all_articles.loc[long_enough & all_articles["year"].notna(), out_cols]

# -------------------------
# Plotters
# -------------------------
def plot_period_scatter(XY, labels_arr, mask, title_suffix, palette):
    """Scatter one period's documents in the shared SVD plane, plus centroids.

    Args:
        XY: 2-D array of embedding coordinates; columns 0 and 1 are plotted.
        labels_arr: array of publication names, one per row of ``XY``.
        mask: boolean selector over the rows of ``XY`` for the period to draw.
        title_suffix: text appended to ``TITLE_PREFIX`` in the figure title.
        palette: mapping of publication name to color; unknown names are grey.

    Prints a notice and returns without plotting when ``mask`` selects no rows.
    Centroids, centroid labels and the legend follow the module-level
    ``SHOW_CENTROIDS`` / ``SHOW_CENTROID_LABELS`` / ``SHOW_LEGEND`` switches.
    """
    grey = "#999999"
    idx = np.where(mask)[0]
    if idx.size == 0:
        print(f"[{title_suffix}] no docs — skipping")
        return

    period_labels = labels_arr[idx]
    pubs = sorted(pd.unique(period_labels))
    colors = {pub: palette.get(pub, grey) for pub in pubs}
    # Row positions (into XY) of each publication's documents in this period.
    rows_by_pub = {pub: idx[period_labels == pub] for pub in pubs}

    fig, ax = plt.subplots(figsize=(24, 16), dpi=110)

    # Document points first, so the centroid markers are drawn on top of them.
    for pub, rows in rows_by_pub.items():
        ax.scatter(
            XY[rows, 0], XY[rows, 1],
            s=POINT_SIZE, alpha=POINT_ALPHA,
            label=pub if SHOW_LEGEND else None,
            color=colors[pub],
            edgecolors="white", linewidths=0.2
        )

    if SHOW_CENTROIDS:
        for pub, rows in rows_by_pub.items():
            cx = np.nanmean(XY[rows, 0])
            cy = np.nanmean(XY[rows, 1])
            ax.scatter(cx, cy, s=CENTROID_SZ, marker="X",
                       color=colors[pub], alpha=0.95,
                       edgecolors="black", linewidths=0.6, zorder=5)
            if not SHOW_CENTROID_LABELS:
                continue
            ax.text(cx, cy, f"  {pub}", va="center", ha="left",
                    fontsize=14, weight="bold", color=colors[pub], zorder=6)

    ax.set_title(f"{TITLE_PREFIX} — {title_suffix}", fontsize=22, pad=10)
    ax.set_xlabel("Component 1", fontsize=15)
    ax.set_ylabel("Component 2", fontsize=15)
    ax.grid(ls=":", alpha=.25)

    if SHOW_LEGEND:
        # Proxy handles give every publication a uniform, fully opaque marker.
        handles = []
        for pub in pubs:
            handles.append(Line2D([0], [0], marker="o", linestyle="",
                                  markersize=8, markerfacecolor=colors[pub],
                                  markeredgecolor="white", label=pub))
        n_legend_cols = int(np.ceil(len(pubs) / 3))
        n_legend_cols = min(8, max(2, n_legend_cols))
        ax.legend(handles=handles, title="Publication", ncol=n_legend_cols,
                  loc="upper center", bbox_to_anchor=(0.5, -0.08), frameon=False,
                  fontsize=13, title_fontsize=13)

    plt.tight_layout()
    plt.subplots_adjust(bottom=0.12)
    plt.show()

def plot_centroid_shift(XY, labels_arr, pre_mask, post_mask, palette,
                        min_docs_pre=MIN_DOCS_PER_PUB_PRE, min_docs_post=MIN_DOCS_PER_PUB_POST):
    """Draw an arrow from each publication's pre-period centroid to its post-period one.

    Args:
        XY: 2-D array of embedding coordinates (two columns are unpacked per centroid).
        labels_arr: array of publication names, one per row of ``XY``.
        pre_mask, post_mask: boolean selectors over the rows of ``XY``.
        palette: mapping of publication name to color; unknown names are grey.
        min_docs_pre, min_docs_post: minimum document counts a publication needs
            in the respective period to be drawn. The defaults are bound when
            the function is defined.

    Prints a notice and returns without plotting when either period is empty or
    no publication meets both thresholds.
    """
    pre_idx = np.where(pre_mask)[0]
    post_idx = np.where(post_mask)[0]
    if min(pre_idx.size, post_idx.size) == 0:
        print("[Centroid shift] no docs in one of the periods — skipping")
        return

    pre_labs, post_labs = labels_arr[pre_idx], labels_arr[post_idx]

    def _enough_docs(pub):
        """True when ``pub`` meets the minimum count in BOTH periods."""
        return ((pre_labs == pub).sum() >= min_docs_pre
                and (post_labs == pub).sum() >= min_docs_post)

    keep = [pub for pub in sorted(set(pre_labs) | set(post_labs)) if _enough_docs(pub)]
    if not keep:
        print("[Centroid shift] no publications pass min doc thresholds — skipping")
        return

    colors = {pub: palette.get(pub, "#999999") for pub in keep}

    # Centroids are computed before the figure is created.
    start_at = {pub: XY[pre_idx[pre_labs == pub]].mean(axis=0) for pub in keep}
    end_at = {pub: XY[post_idx[post_labs == pub]].mean(axis=0) for pub in keep}

    fig, ax = plt.subplots(figsize=(24, 16), dpi=110)

    outline = dict(edgecolors="black", linewidths=0.6)
    for pub in keep:
        x0, y0 = start_at[pub]
        x1, y1 = end_at[pub]
        # circle = pre-period centroid, X = post-period centroid
        ax.scatter(x0, y0, s=CENTROID_SZ, marker="o", color=colors[pub],
                   zorder=4, **outline)
        ax.scatter(x1, y1, s=CENTROID_SZ, marker="X", color=colors[pub],
                   zorder=5, **outline)
        ax.annotate("",
                    xy=(x1, y1), xytext=(x0, y0),
                    arrowprops=dict(arrowstyle="->", lw=2, color=colors[pub], alpha=0.9))

        if SHOW_CENTROID_LABELS:
            ax.text(x1, y1, f"  {pub}", va="center", ha="left",
                    fontsize=14, weight="bold", color=colors[pub], zorder=6)

    ax.set_title(f"{TITLE_PREFIX} — centroid shift (≤{SPLIT_YEAR-1} → ≥{SPLIT_YEAR})", fontsize=22, pad=10)
    ax.set_xlabel("Component 1", fontsize=15)
    ax.set_ylabel("Component 2", fontsize=15)
    ax.grid(ls=":", alpha=.25)

    # Legend: two neutral marker keys, then one colored line per publication.
    marker_keys = [
        Line2D([0],[0], marker="o", linestyle="", markersize=9, markerfacecolor="#777777",
               markeredgecolor="black", label="Pre (≤{}) centroid".format(SPLIT_YEAR-1)),
        Line2D([0],[0], marker="X", linestyle="", markersize=9, markerfacecolor="#777777",
               markeredgecolor="black", label="Post (≥{}) centroid".format(SPLIT_YEAR)),
    ]
    pub_keys = [Line2D([0],[0], color=colors[pub], lw=3, label=pub) for pub in keep]

    n_legend_cols = min(8, max(2, int(np.ceil(len(keep)/3))))
    ax.legend(handles=marker_keys + pub_keys, title="Shift & Publications",
              ncol=n_legend_cols, loc="upper center", bbox_to_anchor=(0.5, -0.10),
              frameon=False, fontsize=13, title_fontsize=13)

    plt.tight_layout()
    plt.subplots_adjust(bottom=0.14)
    plt.show()

# -------------------------
# Build corpus → shared TF-IDF + SVD → plot
# -------------------------
# Expect these in memory: eastview_data, zavtra_data, tsargrad_data, rt_data, eir_data
all_articles_text = assemble_all_articles(
    eastview_data, zavtra_data, tsargrad_data, rt_data, eir_data
)

# palette across all seen names
all_names = sorted(all_articles_text["Publication_clean"].unique().tolist())
global_palette = build_palette(all_names, master_palette, eastview_map)

# split masks
# (boolean Series aligned to all_articles_text; every row with a year falls in exactly one)
pre_mask_all  = all_articles_text["year"] <= (SPLIT_YEAR - 1)
post_mask_all = all_articles_text["year"] >= SPLIT_YEAR

# keep publications with enough docs per period
# The set intersection means a publication must meet BOTH thresholds; titles
# that exist in only one period are excluded from every plot below.
counts_pre  = all_articles_text.loc[pre_mask_all,  "Publication_clean"].value_counts()
counts_post = all_articles_text.loc[post_mask_all, "Publication_clean"].value_counts()
keep_pubs = sorted(set(counts_pre[counts_pre >= MIN_DOCS_PER_PUB_PRE].index)
                   & set(counts_post[counts_post >= MIN_DOCS_PER_PUB_POST].index))
if not keep_pubs:
    # Informational only: execution continues and the size check below skips plotting.
    print("No publications pass the per-period minimum doc thresholds—lower MIN_DOCS_PER_PUB_* or widen the years.")

# filter to those pubs
mask_keep = all_articles_text["Publication_clean"].isin(keep_pubs)
df = all_articles_text.loc[mask_keep].copy()

# sanity totals
# (document counts per period, restricted to the kept publications)
n_pre  = pre_mask_all[mask_keep].sum()
n_post = post_mask_all[mask_keep].sum()
print(f"PRE (≤{SPLIT_YEAR-1}): {n_pre} docs across {len(keep_pubs)} pubs")
print(f"POST(≥{SPLIT_YEAR}): {n_post} docs across {len(keep_pubs)} pubs")

if (n_pre + n_post) >= MIN_DOCS_TOTAL and len(keep_pubs) >= 2:
    # vectorize ALL docs together -> shared space
    docs   = df["text"].astype(str).tolist()
    labels = df["Publication_clean"].astype(str).tolist()
    years  = df["year"].astype(int).values

    # One vocabulary fitted on both periods, so pre and post coordinates are comparable.
    vec = TfidfVectorizer(
        max_features=12000,
        stop_words=STOPWORDS,
        token_pattern=TOKEN_PATTERN,
        lowercase=True,
        max_df=0.92,
        min_df=2,
        strip_accents="unicode"
    )
    X = vec.fit_transform(docs)

    # 2-component LSA projection; random_state is fixed so reruns use the same seed.
    svd = TruncatedSVD(n_components=2, random_state=42)
    XY  = svd.fit_transform(X)

    # Masks are rebuilt from `years` so they are positional arrays matching XY's rows
    # (the *_mask_all Series above are indexed on the unfiltered frame).
    labels_arr = np.array(labels)
    pre_mask   = years <= (SPLIT_YEAR - 1)
    post_mask  = years >= SPLIT_YEAR

    # (A) pre-2011 plot
    plot_period_scatter(XY, labels_arr, pre_mask,
                        title_suffix=f"Pre {SPLIT_YEAR} (≤{SPLIT_YEAR-1})",
                        palette=global_palette)

    # (B) post-2011 plot
    plot_period_scatter(XY, labels_arr, post_mask,
                        title_suffix=f"Post {SPLIT_YEAR} (≥{SPLIT_YEAR})",
                        palette=global_palette)

    # (C) centroid shift (same space)
    plot_centroid_shift(XY, labels_arr, pre_mask, post_mask, global_palette)
else:
    print("Not enough total docs or publications—adjust thresholds or ranges and re-run.")


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from matplotlib.lines import Line2D
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.decomposition import TruncatedSVD

# ---------- helpers ----------
def get_text_series(df):
    """Return the article text of ``df`` as strings, with missing values as ``""``.

    The first of ``ArticleTextEnglish``, ``ArticleText``, ``article_text``,
    ``translated_article_excerpt``, ``content`` that exists in ``df`` is used.

    Returns:
        A string Series on ``df.index``; an empty Series when ``df`` has none of
        the candidate columns.
    """
    for c in ["ArticleTextEnglish","ArticleText","article_text","translated_article_excerpt","content"]:
        if c in df.columns:
            raw = df[c]
            # Blank out missing values BEFORE stringifying: astype(str) turns
            # NaN/None into the literal "nan"/"None", after which fillna("") has
            # nothing left to fill and the placeholder would count as a document.
            return raw.astype(object).where(raw.notna(), "").astype(str)
    return pd.Series([], dtype=str)

def pick_first_present(df, cols):
    """Return the earliest entry of ``cols`` that names a column of ``df``.

    Returns ``None`` when none of the candidates is present.
    """
    present = [name for name in cols if name in df.columns]
    return present[0] if present else None

def build_palette_from_mapping(levels, master_palette, eastview_map=None):
    """Resolve a color for every entry of ``levels``.

    Each level is translated through ``eastview_map`` when it has an entry there
    and otherwise used as-is. The resulting key is looked up in
    ``master_palette``, then its lowercase form; grey (``"#999999"``) is used
    when the lowercase key is absent too.

    Returns:
        dict of ``level -> color``.
    """
    translate = eastview_map or {}
    pal = {}
    for name in levels:
        key = translate.get(name, name)
        exact = master_palette.get(key)
        if exact:
            pal[name] = exact
        else:
            pal[name] = master_palette.get(str(key).lower(), "#999999")
    return pal

def ensure_bottom_legend(ax, names, palette, title="Publication", ncols=6, y_offset=-0.10, fontsize=13):
    """Attach a hand-built legend below ``ax`` with one round marker per name.

    The handles are constructed from ``names`` rather than from plotted artists,
    so every publication shows even if some have very few points. Names missing
    from ``palette`` are drawn in grey.

    Returns:
        The legend object created on ``ax``.
    """
    proxies = []
    for name in names:
        swatch = palette.get(name, "#999999")
        proxies.append(Line2D([0], [0], marker='o', lw=0, ms=8, color=swatch, label=name))

    legend = ax.legend(
        handles=proxies,
        title=title,
        ncol=ncols,
        loc="upper center",
        bbox_to_anchor=(0.5, y_offset),
        frameon=False,
        fontsize=fontsize,
    )
    # Keep the legend title at the same size as the entries.
    legend.get_title().set_fontsize(fontsize)
    return legend

# Your EV display->palette key mapping (extend as needed)
# NOTE: this assignment unconditionally rebinds `eastview_map`, replacing any
# mapping defined by an earlier cell.
# NOTE(review): keys must match the Publication values in the data exactly.
# Confirm the "Время МН" key uses the same script (Cyrillic vs Latin letters)
# as the dataset; a mismatch silently misses the lookup.
eastview_map = {
    "Vedomosti": "vedomosti",
    "Nezavisimaia gazeta": "nezavisimaia_gazeta",
    "Trud": "trud",
    "Время МН": "b_mh",
    "Kommersant": "kommersant",
    "Общая газета": "о_г",
    "Sovetskaia Rossiia": "sovetskaia_rossiia",
    "Novaia gazeta": "novaia_gazeta",
    "Slovo": "slovo",
    "Literaturnaia gazeta": "literaturnaia_gazeta",
    "Pravda": "pravda",
}

# ---------- build a full corpus: EV subs + Zavtra + Tsargrad + RT + EIR ----------
def assemble_corpus_for_svd(eastview_data, zavtra_data, tsargrad_data, rt_data, eir_data):
    """Build a two-column corpus (``pub``, ``text``) from all sources.

    EastView contributes one ``pub`` label per distinct value of its
    ``Publication`` column (in order of first appearance); the other sources
    are labelled "Zavtra", "Tsargrad", "RT" and "EIR". Any source that is
    ``None`` or empty is skipped, as is an EastView frame without a
    ``Publication`` column. Text is taken via ``get_text_series``.

    Returns:
        DataFrame with columns ``pub`` and ``text``, re-indexed from 0, with
        whitespace-stripped text and empty documents removed. An empty frame
        with those columns is returned when no source contributes rows.
    """
    frames = []

    # EastView: one "pub" per Publication value
    if eastview_data is not None and "Publication" in eastview_data.columns:
        for pub in eastview_data["Publication"].dropna().unique():
            txt = get_text_series(eastview_data.loc[eastview_data["Publication"] == pub])
            if len(txt):
                frames.append(pd.DataFrame({"pub": pub, "text": txt}))

    # Single-title sources
    single_title_sources = [
        ("Zavtra", zavtra_data),
        ("Tsargrad", tsargrad_data),
        ("RT", rt_data),
        ("EIR", eir_data),
    ]
    for pub, data in single_title_sources:
        if data is not None and len(data):
            frames.append(pd.DataFrame({"pub": pub, "text": get_text_series(data)}))

    if not frames:
        return pd.DataFrame(columns=["pub", "text"])

    corpus = pd.concat(frames, ignore_index=True)
    # fillna must come before astype(str); in the other order missing values
    # have already become strings and there is nothing left to fill.
    corpus["text"] = corpus["text"].fillna("").astype(str).str.strip()
    return corpus[corpus["text"].str.len() > 0].reset_index(drop=True)

# Build the corpus from the source frames expected to be in memory from earlier cells.
corpus = assemble_corpus_for_svd(eastview_data, zavtra_data, tsargrad_data, rt_data, eir_data)
print("Docs by publication:\n", corpus["pub"].value_counts())

# ---------- TF-IDF (custom stopwords) ----------
# NOTE: with TOKEN_PATTERN below (3+ ASCII letters) the two-letter and
# punctuation entries in EXTRA_STOPS can never occur as tokens.
EXTRA_STOPS = {
    "said","say","says","mr","ms","mrs","today","yesterday","tomorrow",
    "news","report","reports","reported","according","week","weeks","month","months",
    "year","years","daily","update","live","video","photo","photos","twitter","facebook",
    "gov","govt","via","—","–","’","“","”","amp"
}
STOPWORDS = sorted(set(ENGLISH_STOP_WORDS) | EXTRA_STOPS)
TOKEN_PATTERN = r"(?u)\b[a-zA-Z]{3,}\b"

# NOTE: `vec`, `X`, `svd`, `XY` and `labels` are module-level names; this cell
# rebinds them, replacing values left by any earlier cell that used the same names.
vec = TfidfVectorizer(
    max_features=20000,
    stop_words=STOPWORDS,
    token_pattern=TOKEN_PATTERN,
    lowercase=True,
    max_df=0.95,
    min_df=2,
    strip_accents="unicode"
)
X = vec.fit_transform(corpus["text"])
labels = corpus["pub"].to_numpy()          # one publication label per row of X
terms = vec.get_feature_names_out()        # vocabulary, in X's column order

# ---------- SVD (2D embedding) ----------
# random_state is fixed so reruns use the same seed.
svd = TruncatedSVD(n_components=2, random_state=42)
XY = svd.fit_transform(X)

# Colors from your master palette (+ eastview_map for EV subs)
# `master_palette` is not defined in this cell — presumably set by an earlier cell; verify.
all_levels = sorted(pd.unique(labels))
pal_all = build_palette_from_mapping(all_levels, master_palette, eastview_map)

# ---------- (A) ONE PLOT: all publications together ----------
plt.figure(figsize=(12, 14), dpi=120)
for pub in all_levels:
    idx = np.where(labels == pub)[0]   # row positions of this publication's documents
    if len(idx) == 0: 
        continue
    plt.scatter(XY[idx, 0], XY[idx, 1],
                s=12, alpha=0.55, label=pub,
                color=pal_all.get(pub, "#999999"), edgecolors="none")
plt.title("SVD (LSA) document scatter — ALL publications", fontsize=22, pad=12)
plt.xlabel("Component 1", fontsize=16)
plt.ylabel("Component 2", fontsize=16)
plt.grid(ls=":", alpha=.25)

# bottom legend, multi-column
# (between 3 and 8 columns, aiming for roughly four entries per column)
ncols = min(8, max(3, int(np.ceil(len(all_levels) / 4))))
ensure_bottom_legend(plt.gca(), all_levels, pal_all, ncols=ncols, y_offset=-0.08, fontsize=14)

plt.tight_layout()
plt.subplots_adjust(bottom=0.12)   # leave room under the axes for the legend
plt.show()

# ---------- (B) MANY PLOTS: one per publication (same embedding & axes) ----------
# Keep global axis limits so small-multiples are comparable
# Limits span the full embedding plus a margin of `pad` (2%) of the range on each side.
pad = 0.02
x_min, x_max = XY[:,0].min(), XY[:,0].max()
y_min, y_max = XY[:,1].min(), XY[:,1].max()
xr = x_max - x_min; yr = y_max - y_min
xlim = (x_min - xr*pad, x_max + xr*pad)
ylim = (y_min - yr*pad, y_max + yr*pad)

for pub in all_levels:
    idx = np.where(labels == pub)[0]
    if len(idx) == 0:
        continue

    plt.figure(figsize=(12, 9), dpi=120)
    # faint background of all docs for context
    plt.scatter(XY[:, 0], XY[:, 1], s=6, alpha=0.10, color="#CCCCCC", edgecolors="none")
    # highlight this publication
    plt.scatter(XY[idx, 0], XY[idx, 1],
                s=16, alpha=0.70,
                color=pal_all.get(pub, "#999999"), edgecolors="none", label=pub)

    plt.xlim(xlim); plt.ylim(ylim)
    plt.title(f"SVD (LSA) — {pub}", fontsize=18, pad=10)
    plt.xlabel("Component 1"); plt.ylabel("Component 2")
    plt.grid(ls=":", alpha=.25)

    # single legend (just the pub), under the chart
    ensure_bottom_legend(plt.gca(), [pub], pal_all, ncols=1, y_offset=-0.10, fontsize=13)
    plt.tight_layout()
    plt.subplots_adjust(bottom=0.12)
    plt.show()

# ---------- (optional) Top TF-IDF terms per publication ----------
# For each publication: average the TF-IDF rows of its documents, keep the 20
# highest-scoring terms (descending) in `top_terms`, and print the first 10.
top_terms = {}
for pub in all_levels:
    idx = np.where(labels == pub)[0]
    if len(idx) == 0:
        continue
    # Column means over this publication's rows, flattened to a 1-D array via .A1.
    mean_vec = X[idx].mean(axis=0).A1
    # argsort is ascending: take the last 20 positions and reverse for descending order.
    top_idx = mean_vec.argsort()[-20:][::-1]
    top_terms[pub] = pd.Series(mean_vec[top_idx], index=terms[top_idx])
    print(f"\nTop TF-IDF terms — {pub}\n", top_terms[pub].head(10))


### `probability` is the Soros-bias score (0–1); `ArticleTextEnglish` is the full article text
(a) filter/compare by bias
(b) see distributions
(c) use the bias in SVD scatter

In [None]:
from matplotlib.lines import Line2D
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

def assemble_all_articles(eastview_data, zavtra_data, tsargrad_data, rt_data, eir_data):
    """Stack all sources into one article-level frame that carries the bias score.

    Each source that is not ``None`` and has rows is reduced to the columns
    ``Publication_clean``, ``year_month``, ``probability``, ``text``, ``title``
    and ``ArticleID``. Rows whose ``probability`` is missing or non-numeric are
    dropped. Columns a source does not provide are filled with ``pd.NA``.

    Args:
        eastview_data: EastView frame. Publications are mapped through the
            module-level ``eastview_map``; unmapped names fall back to their
            lowercased, underscore-joined form. Dates come from ``Year`` +
            ``Month`` when both exist, otherwise from the first usable date column.
        zavtra_data, tsargrad_data, rt_data, eir_data: single-title frames
            labelled "Zavtra", "Tsargrad", "RT" and "EIR".

    Returns:
        DataFrame with the six columns above plus ``bias_bin``, a categorical
        with levels low / mid / high cut at 0.33 and 0.66. A probability of
        exactly 0 falls in "low". When no source contributes rows an empty
        frame with the same columns is returned.

    Note:
        An earlier cell in this notebook defines another function with this
        same name that returns only publication/year/text; running this cell
        rebinds the name.
    """
    out_cols = ["Publication_clean", "year_month", "probability", "text", "title", "ArticleID"]
    press_dates = ["PublicationDate", "publication_date", "translated_date",
                   "PublishedDate", "Date", "date"]
    feed_dates = ["published_at"] + press_dates
    press_text = ["ArticleTextEnglish", "article_text",
                  "translated_article_excerpt", "content"]
    feed_text = ["content", "ArticleTextEnglish", "article_text",
                 "translated_article_excerpt"]
    press_title = ["title", "translated_title", "headline"]
    feed_title = ["title", "headline"]

    def _has_rows(data):
        """True when the source exists and contains at least one row."""
        return data is not None and len(data) > 0

    def _month_start(dates):
        """Snap parsed dates to the first day of their month."""
        return dates.dt.to_period("M").dt.to_timestamp(how="start")

    def _column_or_na(df, candidates):
        """First available candidate column, else an all-NA column on df's index."""
        col = pick_first_present(df, candidates)
        if col is None:
            # Built on df.index (not a fresh RangeIndex) so the DataFrame
            # constructor below cannot misalign it and invent extra rows.
            return pd.Series(pd.NA, index=df.index, dtype=object)
        return df[col]

    def _standardize(df, publication, year_month, text_cols, title_cols):
        """Reduce one source to the shared schema and drop rows without a score."""
        if "probability" in df.columns:
            probability = pd.to_numeric(df["probability"], errors="coerce")
        else:
            probability = pd.Series(np.nan, index=df.index)
        out = pd.DataFrame({
            "Publication_clean": publication,
            "year_month": year_month,
            "probability": probability,
            "text": _column_or_na(df, text_cols),
            "title": _column_or_na(df, title_cols),
            "ArticleID": _column_or_na(df, ["ArticleID"]),
        })
        return out.dropna(subset=["probability"])

    frames = []

    # ---- EastView (many pubs) ----
    if _has_rows(eastview_data):
        pub_col = pick_first_present(eastview_data, ["Publication", "publication"])
        if pub_col is None:
            publication = "unknown"
        else:
            raw_pub = eastview_data[pub_col]
            publication = raw_pub.map(eastview_map).fillna(
                raw_pub.astype(str).str.lower().str.replace(" ", "_", regex=False)
            )
        if {"Year", "Month"}.issubset(eastview_data.columns):
            ev_dates = pd.to_datetime(
                {
                    "year": pd.to_numeric(eastview_data["Year"], errors="coerce"),
                    "month": pd.to_numeric(eastview_data["Month"], errors="coerce"),
                    "day": 1,
                },
                errors="coerce",
            )
        else:
            ev_dates = _month_start(pick_first_date(
                eastview_data,
                ["PublishedDate", "PublicationDate", "publication_date",
                 "translated_date", "Date", "date"],
            ))
        frames.append(_standardize(
            eastview_data, publication, ev_dates,
            ["ArticleTextEnglish", "ArticleText", "article_text",
             "translated_article_excerpt", "content"],
            ["ArticleTitle", "title", "translated_title", "headline"],
        ))

    # ---- Zavtra / Tsargrad / RT (same handling, different candidate columns) ----
    for name, data, date_cols, text_cols, title_cols in [
        ("Zavtra", zavtra_data, press_dates, press_text, press_title),
        ("Tsargrad", tsargrad_data, press_dates, press_text, press_title),
        ("RT", rt_data, feed_dates, feed_text, feed_title),
    ]:
        if _has_rows(data):
            frames.append(_standardize(
                data, name, _month_start(pick_first_date(data, date_cols)),
                text_cols, title_cols,
            ))

    # ---- EIR ----
    if _has_rows(eir_data):
        # Same column priority as the other feeds, with "date" as the last resort.
        eir_dates = pick_first_date(eir_data, feed_dates[:-1])
        if eir_dates.isna().all() and "date" in eir_data.columns:
            # The earlier text-only assembler in this notebook reads EIR's "date"
            # as YYYYMMDD. Parsing such values without a format would treat
            # integers as epoch nanoseconds, so try the explicit format first.
            # NOTE(review): confirm the EIR "date" column really is YYYYMMDD.
            compact = pd.to_datetime(
                eir_data["date"].astype("string"), format="%Y%m%d", errors="coerce"
            )
            eir_dates = compact if compact.notna().any() else pick_first_date(eir_data, ["date"])
        frames.append(_standardize(
            eir_data, "EIR", _month_start(eir_dates), feed_text, feed_title,
        ))

    if frames:
        all_articles = pd.concat(frames, ignore_index=True)
    else:
        # Typed empty frame so the binning below still works.
        all_articles = pd.DataFrame({
            col: pd.Series(dtype="float64" if col == "probability" else "object")
            for col in out_cols
        })

    # 3-bin bucket for bias (optional)
    # include_lowest=True keeps probability == 0 in "low"; without it the first
    # bin is open on the left and exact zeros are left unbinned.
    all_articles["bias_bin"] = pd.cut(
        all_articles["probability"],
        bins=[0, 0.33, 0.66, 1.000001],
        labels=["low", "mid", "high"],
        include_lowest=True,
    )
    return all_articles

# Build the dataset FIRST
# (`all_articles` is read by the plotting cell that follows)
all_articles = assemble_all_articles(eastview_data, zavtra_data, tsargrad_data, rt_data, eir_data)

# Sanity check for RT/EIR presence
# `kde_df` here only feeds the optional print below; the next cell rebuilds it before plotting.
kde_df = all_articles.dropna(subset=["probability","Publication_clean"])
# print(kde_df["Publication_clean"].value_counts().reindex(["RT","EIR"]))




In [None]:
def build_palette_from_mapping(levels, master_palette, eastview_map=None):
    """Pick a color for each level from ``master_palette``.

    A level is first translated through ``eastview_map`` (when given and it
    contains the level). The color is the palette entry for that key, or
    failing that for the lowercased key (which handles RT/EIR -> "rt"/"eir"),
    or grey ``"#999999"`` when the lowercased key is missing as well.
    """
    def _resolve(name):
        key = eastview_map.get(name, name) if eastview_map else name
        # try exact then lowercase
        return master_palette.get(key) or master_palette.get(str(key).lower(), "#999999")

    return {name: _resolve(name) for name in levels}

def ensure_bottom_legend(ax, names, palette, title="Publication", ncols=6, y_offset=-0.12, fontsize=14):
    """Place a legend centered below ``ax`` and return it.

    When ``ax`` already has a legend, it is relocated with seaborn's
    ``move_legend`` and its entries and title are resized to ``fontsize``. If
    there is no legend, or relocating fails for any reason, a new legend is
    built from ``names`` with one colored line per name (grey for names missing
    from ``palette``).

    The relocation stays best-effort, but a failure is reported as a
    ``RuntimeWarning`` rather than being swallowed silently, so it is visible
    why the fallback legend was drawn.
    """
    import warnings

    if ax.get_legend() is not None:
        try:
            from seaborn import move_legend
            move_legend(ax, "upper center", bbox_to_anchor=(0.5, y_offset),
                        ncol=ncols, frameon=False, title=title)
            leg = ax.get_legend()
            for t in leg.get_texts():
                t.set_fontsize(fontsize)
            leg.get_title().set_fontsize(fontsize)
            return leg
        except Exception as exc:  # deliberate best-effort: fall through to the manual legend
            warnings.warn(
                f"Could not move the existing legend ({exc!r}); drawing a manual legend instead.",
                RuntimeWarning,
                stacklevel=2,
            )

    handles = [Line2D([0],[0], color=palette.get(n, "#999999"), lw=3, label=n) for n in names]
    leg = ax.legend(handles=handles, title=title, ncol=ncols,
                    loc="upper center", bbox_to_anchor=(0.5, y_offset),
                    frameon=False, fontsize=fontsize)
    leg.get_title().set_fontsize(fontsize)
    return leg

# KDE of bias probability by publication (with RT & EIR)
# Rows without a probability or publication label are excluded from the plot.
kde_df  = all_articles.dropna(subset=["probability","Publication_clean"]).copy()
levels  = sorted(kde_df["Publication_clean"].unique().tolist())
pal_kde = build_palette_from_mapping(levels, master_palette, eastview_map)

plt.figure(figsize=(28, 12))
# common_norm=False: each publication's density is normalized on its own, so
# curves are comparable in shape regardless of how many articles each title has.
ax = sns.kdeplot(
    data=kde_df,
    x="probability",
    hue="Publication_clean",
    hue_order=levels,
    palette=pal_kde,
    common_norm=False,
    bw_adjust=1.0,
    linewidth=2.5,
    fill=False,
    legend=True
)
ax.set_title("Bias score (probability) distribution by publication")
ax.set_xlabel("Soros bias probability"); ax.set_ylabel("Density")
ax.set_xlim(0, 1)   # clip the view to the score's 0–1 range
ax.grid(axis="y", linestyle=":", alpha=.35)

# Legend goes under the axes, using between 2 and 8 columns.
ncols = min(8, max(2, int(np.ceil(len(levels)/3))))
ensure_bottom_legend(ax, levels, pal_kde, ncols=ncols, y_offset=-0.12, fontsize=14)
plt.tight_layout(); plt.subplots_adjust(bottom=0.18); plt.show()
