In [3]:
# Mount Google Drive at /content/drive; the CSV paths configured in the next cell
# (DRIVE_DIR) live under this mount point. google.colab is only importable inside Colab.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [4]:
# ==== Per-article targets with TextBlob (chunked, low-memory) ====
# Outputs: /content/drive/MyDrive/nyt_outputs/article_level_targets_textblob.csv

!pip -q install textblob

import os, math, gc
import pandas as pd
from collections import defaultdict
from textblob import TextBlob

# ---------- CONFIG ----------
# Inputs and the output CSV all live in the same Drive folder.
DRIVE_DIR = "/content/drive/MyDrive/nyt_outputs"

# Candidate inputs: one combined file, or the two per-period files.
COMBINED_STRICT = os.path.join(DRIVE_DIR, "us_elections_combined_strict.csv")
STRICT_1718 = os.path.join(DRIVE_DIR, "us_elections_2017_2018_strict.csv")
STRICT_2020 = os.path.join(DRIVE_DIR, "us_elections_2020_strict.csv")

# Destination of the per-article table.
OUT_CSV = os.path.join(DRIVE_DIR, "article_level_targets_textblob.csv")

# Prefer the combined file; otherwise use whichever per-period files exist.
if os.path.exists(COMBINED_STRICT):
    INPUTS = [COMBINED_STRICT]
else:
    INPUTS = list(filter(os.path.exists, (STRICT_1718, STRICT_2020)))
assert INPUTS, "No strict CSVs found in Drive. Copy *_strict.csv files to /content/drive/MyDrive/nyt_outputs first."

# Article-level metadata columns carried through to the output.
META_COLS = [
    "url",
    "pub_date",
    "section",
    "subsection",
    "headline",
    "abstract",
    "news_desk",
    "type_of_material",
    "keywords",
]
# Full set of columns requested from each strict CSV.
NEEDED_COLS = ["article_id", "comment", *META_COLS]

# ---------- helpers ----------
def safe_polarity(text):
    """Return the TextBlob sentiment polarity of ``text`` as a float.

    Missing input (``None`` or a float NaN), text that is empty after
    stripping whitespace, and any exception raised while scoring all yield
    the neutral value ``0.0``. Any other input is converted with ``str()``
    before scoring.
    """
    is_missing = text is None or (isinstance(text, float) and math.isnan(text))
    cleaned = "" if is_missing else str(text).strip()
    if cleaned == "":
        return 0.0
    try:
        score = TextBlob(cleaned).sentiment.polarity  # [-1,1]
    except Exception:
        # Best-effort: a comment that cannot be scored is treated as neutral.
        score = 0.0
    return score

# Aggregation holders (filled by the chunked loop below, read when building the output)
# numeric: running sum of polarity and number of comments per article
agg_sum = defaultdict(float)   # article_id -> sum polarity
agg_cnt = defaultdict(int)     # article_id -> num comments
# metadata: for every column, keep the first non-null value we see
agg_meta = {}  # article_id -> dict of META_COLS

# ---------- main loop (chunked) ----------
# Each input CSV is streamed in fixed-size chunks so only one chunk of comments
# is held in memory at a time; per-article totals accumulate in the dicts above.
for path in INPUTS:
    print(f"Processing: {os.path.basename(path)}")
    reader = pd.read_csv(
        path,
        low_memory=False,
        chunksize=100_000,
        usecols=lambda c: c in NEEDED_COLS,   # callable form tolerates absent columns
        dtype={"article_id": str},
    )
    for i, chunk in enumerate(reader):
        # Sentiment per comment. The raw column is passed straight to
        # safe_polarity (which str()-converts and handles None/NaN itself);
        # casting with astype(str) first would turn missing comments into the
        # literal text "nan" and bypass that guard. Missing comments still
        # count toward the article's total, with polarity 0.0.
        polarity = chunk["comment"].map(safe_polarity)

        # Numeric aggregation per chunk: one groupby pass gives both sum and count.
        # Rows whose article_id is missing are dropped by groupby.
        stats = polarity.groupby(chunk["article_id"]).agg(["sum", "count"])
        for aid, pol_sum, n_comments in stats.itertuples(name=None):
            agg_sum[aid] += float(pol_sum)
            agg_cnt[aid] += int(n_comments)

        # Metadata: only touch columns that actually exist in this file, so a
        # CSV lacking one of META_COLS does not raise KeyError.
        present = [c for c in META_COLS if c in chunk.columns]
        if present:
            # GroupBy.first() returns the first NON-NULL value per column and
            # group, unlike drop_duplicates which keeps the first row as-is.
            meta_first = chunk.groupby("article_id", sort=False)[present].first()
            for aid, *values in meta_first.itertuples(name=None):
                stored = agg_meta.setdefault(aid, dict.fromkeys(META_COLS))
                for key, value in zip(present, values):
                    # Back-fill fields that were still empty from earlier chunks.
                    if stored[key] is None and not pd.isna(value):
                        stored[key] = value

        if (i+1) % 20 == 0:
            print(f"  chunks processed: {i+1}, distinct articles so far: {len(agg_cnt):,}")
        # Drop the chunk explicitly before reading the next one to keep peak memory low.
        del chunk, polarity, stats
        gc.collect()

print("Finalizing…")

# ---------- build output frame ----------
# One record per article that received at least one counted comment:
# id, metadata columns (None when unknown), comment count, mean polarity.
rows = [
    {
        "article_id": article_id,
        **{col: agg_meta.get(article_id, {}).get(col) for col in META_COLS},
        "comments_total": n_comments,
        "sentiment_mean_textblob": (agg_sum[article_id] / n_comments) if n_comments else 0.0,
    }
    for article_id, n_comments in agg_cnt.items()
]
out_df = pd.DataFrame(rows)

# Optional ordering: pub_date ascending, then most-commented first.
# pub_date is compared as stored (possibly plain strings); no datetime coercion.
sort_keys = ["pub_date", "comments_total"]
sort_ascending = [True, False]
if "pub_date" in out_df.columns:
    out_df = out_df.sort_values(by=sort_keys, ascending=sort_ascending, ignore_index=True)

# Persist the per-article table to Drive.
out_df.to_csv(OUT_CSV, index=False)
print("Wrote ->", OUT_CSV, "rows:", len(out_df))


Processing: us_elections_2017_2018_strict.csv
Processing: us_elections_2020_strict.csv
Finalizing…
Wrote -> /content/drive/MyDrive/nyt_outputs/article_level_targets_textblob.csv rows: 950
