In [None]:
pip install kagglehub



In [None]:
# it took a few times to get this to work without crashing, but this is the version we ended up working with:
import os, re, glob, math, gc
from typing import List, Optional, Dict, Tuple
import pandas as pd

# ------------------- Helpers -------------------
def _clean_url(u):
    if pd.isna(u): return u
    return str(u).strip().split('?')[0].rstrip('/').lower()

def _to_lower_str(val):
    if val is None or val is pd.NA: return ""
    try:
        if isinstance(val, float) and math.isnan(val): return ""
    except Exception:
        pass
    if hasattr(pd, "isna") and pd.isna(val): return ""
    return str(val).strip().lower()

# Output columns, in the order they are written to the filtered CSVs.
OUT_COLS = [
    # fields taken from the comment rows
    "comment_id", "parent_id", "user", "comment",
    "recommendations", "reply_count", "editors_selection", "date",
    # article reference carried on the comment row
    "article_id", "article_url",
    # article metadata attached by the join
    "url", "pub_date", "section", "subsection", "headline",
    "abstract", "news_desk", "type_of_material", "keywords",
]

# ------------------- Column maps -------------------
# Articles: for each normalized field, the source column names to try, in
# priority order (the first one present in the frame wins). uniqueID is
# preferred for the article id.
ART_MAP = dict(
    article_id=[
        "uniqueID", "uniqueid", "uniqueId", "UniqueID",
        "articleID", "asset_id", "assetId", "id", "article_id",
    ],
    url=["articleURL", "url", "web_url", "link"],
    pub_date=["pub_date", "pubDate", "date", "published_date", "PublicationDate"],
    # news_desk is the last-resort stand-in when no section column exists
    section=["section", "section_name", "newsSection", "news_desk"],
    subsection=["subsection", "subsection_name", "sub_section"],
    headline=["headline", "title", "main_headline", "headline.main", "Title"],
    abstract=["abstract", "snippet", "abstract_text", "lead_paragraph", "Summary"],
    news_desk=["news_desk", "desk"],
    type_of_material=["type_of_material", "type", "material_type_facet"],
    keywords=["keywords", "descriptors", "subject", "keywords_list"],
)

# Comments: for each normalized field, the source column names to try, in
# priority order (the first one present in the frame wins). articleID is
# preferred for the article reference.
COM_MAP = dict(
    comment_id=["commentID", "comment_id", "id"],
    parent_id=["parentID", "parent_id", "inReplyTo"],
    user=["userDisplayName", "user", "display_name", "author"],
    comment=["commentBody", "comment", "text", "body", "content"],
    recommendations=[
        "recommendations", "recommendCount", "recommendedCount", "recommendationCount",
    ],
    reply_count=["replyCount", "replies", "numReplies"],
    editors_selection=["editorsSelection", "editors_selection"],
    date=["createDate", "date", "timestamp", "createdAt"],
    article_url=[
        "articleURL", "url", "web_url", "articleUrl", "story_url",
        "storyUrl", "article_link", "articleLink", "link",
    ],
    article_id=[
        "articleID", "articleid", "asset_id", "assetId",
        "article_id", "story_id", "storyId",
    ],
)

# ------------------- Election filtering -------------------
# Word-boundary patterns; a hit in keywords, headline or abstract marks a row
# as election-related.
ELECTION_KEYWORD_PATTERNS = [
    # contests
    r"\belection(s)?\b",
    r"\bprimary\b",
    r"\bcaucus(es)?\b",
    r"\brunoff\b",
    r"\bmidterm(s)?\b",
    r"\bgeneral election\b",
    # ballots and ways of voting
    r"\bballot(s)?\b",
    r"\babsentee\b",
    r"\bmail[- ]in\b",
    r"\bearly voting\b",
    # voters and turnout
    r"\bvoter(s)?\b",
    r"\bvoter registration\b",
    r"\bregistered voter(s)?\b",
    r"\bvoting\b",
    r"\bpoll(s|ing place)?\b",
    r"\bturnout\b",
    # campaigns and money
    r"\bcampaign(s|ing)?\b",
    r"\bcandidate(s)?\b",
    r"\bpac(s)?\b",
    r"\bsuper pac(s)?\b",
    r"\bfec\b",
    # debates and endorsements
    r"\bdebate(s)?\b",
    r"\bendorse(ment|ments)?\b",
]
# One case-insensitive alternation over every pattern above.
ELECTION_REGEX = re.compile("|".join(ELECTION_KEYWORD_PATTERNS), flags=re.IGNORECASE)

# URL fragments tested with plain substring containment.
# NOTE(review): "/politics/" already covers "/us/politics/" and
# "/politics/elections", and "/live/20" matches any URL containing it --
# confirm this breadth is intended.
ELECTION_URL_SEGMENTS = [
    "/politics/elections",
    "/elections/",
    "/interactive/us/elections",
    "/politics/",
    "/us/politics/",
    "/live/20",
    "/news-event/2020-election",
    "/news-event/2022-midterms",
    "/news-event/2024-election",
]

# Exact (lowercased) section names that count as election coverage.
ELECTION_SECTIONS = {"politics"}
# Substrings looked for inside the lowercased subsection name.
ELECTION_SUBSTR = ["election", "campaign", "midterm", "primary", "vote"]

def _is_us_election_row(section, subsection, headline, abstract, keywords, url_hint=None, url_hint2=None):
    """Decide whether one joined comment/article row is election-related.

    Checks run in this order and the first hit returns ``True``:
      1. the lowercased section is in ``ELECTION_SECTIONS``;
      2. the lowercased subsection contains any ``ELECTION_SUBSTR`` entry;
      3. ``ELECTION_REGEX`` matches the keywords, then the headline, then the
         abstract (a ``list`` of keywords is joined with spaces first);
      4. either URL hint contains an ``ELECTION_URL_SEGMENTS`` entry or one of
         the bare URL tokens below.
    Returns ``False`` when nothing matches.
    """
    url_tokens = (
        "election", "primary", "caucus", "midterm", "ballot", "campaign",
        "voting", "turnout", "polls",
    )

    def url_says_election(u):
        link = _to_lower_str(u)
        if not link:
            return False
        if any(seg in link for seg in ELECTION_URL_SEGMENTS):
            return True
        return any(tok in link for tok in url_tokens)

    # Section / subsection
    if _to_lower_str(section) in ELECTION_SECTIONS:
        return True
    sub_text = _to_lower_str(subsection)
    if any(fragment in sub_text for fragment in ELECTION_SUBSTR):
        return True

    # Keywords first, then headline and abstract (converted lazily, in order)
    if isinstance(keywords, list):
        kw_text = " ".join(_to_lower_str(k) for k in keywords if k is not None)
    else:
        kw_text = _to_lower_str(keywords)
    if ELECTION_REGEX.search(kw_text or ""):
        return True
    for field in (headline, abstract):
        if ELECTION_REGEX.search(_to_lower_str(field) or ""):
            return True

    # URL fallback
    return url_says_election(url_hint) or url_says_election(url_hint2)

# ------------------- Normalizers -------------------
def _first_present(df, candidates, default=pd.NA):
    for c in candidates:
        if c in df.columns:
            return df[c]
    return pd.Series([default]*len(df))

def _normalize_articles_small(df: pd.DataFrame) -> pd.DataFrame:
    """Project a raw articles frame onto the normalized article columns.

    Each output column is the first source column listed for it in ``ART_MAP``
    that exists in ``df`` (a placeholder column when none does). ``url`` is
    additionally passed through ``_clean_url``.
    """
    fields = (
        "article_id", "url", "pub_date", "section", "subsection",
        "headline", "abstract", "news_desk", "type_of_material", "keywords",
    )
    picked = {name: _first_present(df, ART_MAP[name]) for name in fields}
    picked["url"] = picked["url"].map(_clean_url)
    return pd.DataFrame(picked)

def _normalize_comments_small(df: pd.DataFrame) -> pd.DataFrame:
    """Project a raw comments frame (or chunk) onto the normalized comment columns.

    Each output column is the first source column listed for it in ``COM_MAP``
    that exists in ``df``; when none exists the column is filled with
    ``pd.NA``. ``article_url`` is additionally passed through ``_clean_url``.

    The result has exactly one row per input row and keeps ``df.index``.
    """
    def _first_present_local(cands, default=pd.NA):
        for c in cands:
            if c in df.columns:
                return df[c]
        # The placeholder must share df's index. Chunks from
        # read_csv(chunksize=...) are indexed by file offset, so a 0..n-1
        # placeholder would not align with the real columns and the frame
        # built below would gain n extra all-NA rows (index union).
        return pd.Series([default] * len(df), index=df.index)

    fields = (
        "comment_id", "parent_id", "user", "comment", "recommendations",
        "reply_count", "editors_selection", "date", "article_url", "article_id",
    )
    picked = {name: _first_present_local(COM_MAP[name]) for name in fields}
    picked["article_url"] = picked["article_url"].map(_clean_url)
    return pd.DataFrame(picked)

# ------------------- Loader functions -------------------
def _find(paths_patterns: List[str]) -> List[str]:
    s = set()
    for pat in paths_patterns:
        s.update(glob.glob(pat))
    return sorted(s)

def load_articles_build_maps(dirpath: str):
    """Load every article CSV under ``dirpath`` and build the join lookups.

    Returns
    -------
    (by_id, by_url)
        ``by_id`` maps each non-missing article id, both as read and as
        ``str``, to a tuple of article fields; ``by_url`` maps each
        non-missing cleaned URL to the same kind of tuple. Tuple order is
        headline, abstract, pub_date, section, subsection, news_desk,
        type_of_material, keywords, url.

    Raises
    ------
    FileNotFoundError
        If no article CSV matches any of the known file name patterns.
    """
    file_patterns = (
        "Articles*.csv",
        "*articles*.csv",
        "articles*.csv",
        "nyt-articles-2020.csv",
    )
    a_paths = _find([os.path.join(dirpath, pattern) for pattern in file_patterns])
    if not a_paths:
        raise FileNotFoundError(f"No article CSVs in {dirpath}")

    frames = [pd.read_csv(p, low_memory=False) for p in a_paths]
    arts = _normalize_articles_small(pd.concat(frames, ignore_index=True))

    # Deduplicate on (article_id, url), keeping the earliest pub_date.
    arts = arts.sort_values("pub_date").drop_duplicates(subset=["article_id","url"], keep="first")

    cols = ["headline","abstract","pub_date","section","subsection","news_desk","type_of_material","keywords","url"]
    records = list(arts[cols].itertuples(index=False, name=None))

    # Raw ids first, then their string forms, so either spelling can be looked up.
    by_id: Dict[object, Tuple] = {
        aid: rec for aid, rec in zip(arts["article_id"], records) if not pd.isna(aid)
    }
    by_id.update({
        str(aid): rec for aid, rec in zip(arts["article_id"], records) if not pd.isna(aid)
    })

    by_url: Dict[str, Tuple] = {
        str(u): rec for u, rec in zip(arts["url"], records) if not pd.isna(u)
    }

    return by_id, by_url

def comment_files(dirpath: str) -> List[str]:
    """Return the sorted comment CSV paths found directly under ``dirpath``.

    Raises
    ------
    FileNotFoundError
        If no file matches any of the known comment file name patterns.
    """
    file_patterns = (
        "Comments*.csv",
        "*comments*.csv",
        "comments*.csv",
        "nyt-comments-2020*.csv",
    )
    c_paths = _find([os.path.join(dirpath, pattern) for pattern in file_patterns])
    if c_paths:
        return c_paths
    raise FileNotFoundError(f"No comment CSVs in {dirpath}")

# ------------------- Streaming join & election filtering -------------------
def _fill_article_fields(c, got, cols):
    """Copy looked-up article tuples in ``got`` into columns ``cols`` of ``c``; return the hit count."""
    hit = got.notna().to_numpy()
    n_hits = int(hit.sum())
    if n_hits:
        rows = got.index[hit]
        filled = pd.DataFrame(got[hit].tolist(), index=rows, columns=cols)
        for col in cols:
            # Label-based assignment; relies on c having a unique index.
            c.loc[rows, col] = filled[col].values
    return n_hits


def process_comment_stream(by_id, by_url, cpaths: List[str], out_filtered_csv: str, chunksize: int = 25_000):
    """Stream comment CSVs, attach article metadata, and append election rows to a CSV.

    For every chunk of every file in ``cpaths`` the comments are normalized,
    joined to their article (by raw article id, then by stringified id, then
    by cleaned URL), filtered with ``_is_us_election_row`` and appended to
    ``out_filtered_csv`` using the ``OUT_COLS`` layout.

    Parameters
    ----------
    by_id, by_url : dict
        Lookups from article id / cleaned URL to a tuple of article fields,
        as returned by ``load_articles_build_maps``.
    cpaths : list of str
        Comment CSV files to process, in order.
    out_filtered_csv : str
        Output path. An existing file is deleted first; the file is only
        created once at least one row is kept.
    chunksize : int
        Number of comment rows read per chunk.
    """
    out_dir = os.path.dirname(out_filtered_csv)
    if out_dir:
        # A bare file name has no directory part, and os.makedirs("") raises.
        os.makedirs(out_dir, exist_ok=True)
    if os.path.exists(out_filtered_csv):
        os.remove(out_filtered_csv)

    # Article fields, in the same order as the tuples stored in by_id / by_url.
    cols = ["headline","abstract","pub_date","section","subsection","news_desk","type_of_material","keywords","url"]

    for p in cpaths:
        print(f"Processing: {os.path.basename(p)}")
        for i, raw in enumerate(pd.read_csv(p, low_memory=False, chunksize=chunksize)):
            # Chunks after the first are indexed by file offset. Resetting to
            # 0..n-1 keeps every column built by the normalizer aligned, so
            # the chunk is not padded with all-NA rows, and guarantees the
            # unique index the label-based fills rely on.
            c = _normalize_comments_small(raw.reset_index(drop=True))

            for col in cols:
                c[col] = pd.NA

            # --- ID join (raw, then str) ---
            aid_raw = c["article_id"]
            id_hits_raw = _fill_article_fields(c, aid_raw.map(by_id), cols)

            # Retry rows that are still unmatched using the stringified id.
            mask_need_id = c["headline"].isna()
            id_hits_str = _fill_article_fields(c, aid_raw[mask_need_id].astype(str).map(by_id), cols)

            # --- URL fallback ---
            mask_need_url = c["headline"].isna()
            url_clean = c.loc[mask_need_url, "article_url"].map(_clean_url)
            url_hits = _fill_article_fields(c, url_clean.map(by_url), cols)

            # --- Election-only filter ---
            # Iterating the needed columns in parallel avoids building one
            # Series per row (DataFrame.apply(axis=1)) and also yields a
            # proper boolean mask when the chunk is empty.
            keep = [
                _is_us_election_row(sec, sub, head, abst, kws, url_hint=u, url_hint2=au)
                for sec, sub, head, abst, kws, u, au in zip(
                    c["section"], c["subsection"], c["headline"], c["abstract"],
                    c["keywords"], c["url"], c["article_url"],
                )
            ]
            keep_mask = pd.Series(keep, index=c.index, dtype=bool)
            out = c.loc[keep_mask, OUT_COLS].copy()

            if len(out):
                # Write the header only on the first append.
                out.to_csv(out_filtered_csv, mode="a", header=not os.path.exists(out_filtered_csv), index=False)

            print(f"  chunk {i+1}: total={len(c):,}, id_hits={id_hits_raw + id_hits_str:,}, url_hits={url_hits:,}, kept={len(out):,}")

            # Release chunk-sized objects before reading the next chunk.
            del raw, c, out, aid_raw, url_clean, keep, keep_mask
            gc.collect()

# ------------------- Master runner -------------------
def build_dataset_ultra(path_2017_2018: str, path_2020: str, out_dir="/content/nyt_outputs", chunksize=25_000):
    """Build the election-filtered comment datasets and a combined CSV.

    For each source directory the article lookups are built, the comment
    files are streamed through ``process_comment_stream``, and the kept rows
    are written to ``out_dir``:

    * ``us_elections_2017_2018.csv`` from ``path_2017_2018``
    * ``us_elections_2020.csv`` from ``path_2020``
    * ``us_elections_combined.csv``, the two files above concatenated

    Parameters
    ----------
    path_2017_2018, path_2020 : str
        Directories holding the article and comment CSVs of each dataset.
    out_dir : str
        Directory for the output files (created if missing).
    chunksize : int
        Rows per chunk, used both when streaming comments and when copying
        the filtered files into the combined CSV.
    """
    os.makedirs(out_dir, exist_ok=True)

    periods = [
        (path_2017_2018, "us_elections_2017_2018.csv"),
        (path_2020, "us_elections_2020.csv"),
    ]
    filtered_paths = []
    for src_dir, out_name in periods:
        by_id, by_url = load_articles_build_maps(src_dir)
        filtered = os.path.join(out_dir, out_name)
        process_comment_stream(by_id, by_url, comment_files(src_dir), filtered, chunksize=chunksize)
        filtered_paths.append(filtered)
        # Drop this period's lookups before the next ones are built.
        del by_id, by_url

    # combine
    combo = os.path.join(out_dir, "us_elections_combined.csv")
    if os.path.exists(combo):
        os.remove(combo)
    write_header = True
    for f in filtered_paths:
        # A filtered file is absent when no row of that period was kept.
        if not os.path.exists(f):
            continue
        # Copy in chunks, as plain strings: this never loads a whole filtered
        # file, and values are not re-typed on the way (e.g. ints turned
        # into floats by dtype inference).
        for part in pd.read_csv(f, dtype=str, keep_default_na=False, chunksize=chunksize):
            part.to_csv(combo, mode="a", header=write_header, index=False)
            write_header = False

    print("Done. Outputs in:", out_dir)


In [None]:
import kagglehub

# Fetch the two Kaggle datasets. The value returned by dataset_download is
# passed below as the directory that is searched for article/comment CSVs.
path_17_18 = kagglehub.dataset_download("aashita/nyt-comments")
path_2020  = kagglehub.dataset_download("benjaminawd/new-york-times-articles-comments-2020")

# Run the full pipeline: writes one filtered CSV per dataset plus a combined
# CSV into out_dir, reading the comment files 25,000 rows at a time.
build_dataset_ultra(
    path_2017_2018=path_17_18,
    path_2020=path_2020,
    out_dir="/content/nyt_outputs",
    chunksize=25_000
)


Using Colab cache for faster access to the 'nyt-comments' dataset.
Using Colab cache for faster access to the 'new-york-times-articles-comments-2020' dataset.
Processing: CommentsApril2017.csv
  chunk 1: total=25,000, id_hits=25,000, url_hits=0, kept=4,244
  chunk 2: total=50,000, id_hits=25,000, url_hits=0, kept=6,716
  chunk 3: total=50,000, id_hits=25,000, url_hits=0, kept=2,114
  chunk 4: total=50,000, id_hits=25,000, url_hits=0, kept=1,513
  chunk 5: total=50,000, id_hits=25,000, url_hits=0, kept=2,671
  chunk 6: total=50,000, id_hits=25,000, url_hits=0, kept=2,140
  chunk 7: total=50,000, id_hits=25,000, url_hits=0, kept=5,981
  chunk 8: total=50,000, id_hits=25,000, url_hits=0, kept=749
  chunk 9: total=50,000, id_hits=25,000, url_hits=0, kept=3,639
  chunk 10: total=37,664, id_hits=18,832, url_hits=0, kept=4,657
Processing: CommentsApril2018.csv
  chunk 1: total=25,000, id_hits=25,000, url_hits=0, kept=8,532
  chunk 2: total=50,000, id_hits=25,000, url_hits=0, kept=8,177
  chun

KeyboardInterrupt: 