In [9]:
# download
# !pip install feedparser pyarrow requests

# necessary imports
import json, os, csv
from datetime import timezone
import requests
import feedparser
from dateutil import parser as dtparse
import pandas as pd

import requests
from bs4 import BeautifulSoup

import time, random, re
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

import google.generativeai as genai

In [11]:
# RSS feed that is polled for news releases.
FEED_URL = "https://jpmorganchaseco.gcs-web.com/rss/news-releases.xml"

# On-disk artifacts: the JSON file tracking already-processed GUIDs and the
# CSV that newly seen entries are appended to.
STATE_JSON = "data/jpm_rss_state.json"
OUT_CSV = "data/jpm_press_releases.csv"

# One shared HTTP session so every request carries the same User-Agent.
SESSION = requests.Session()
SESSION.headers["User-Agent"] = "BNYCapstone/1.0"

In [13]:
def fetch_feed_bytes(url: str, timeout: float = 30) -> bytes:
    """Download a feed document and return its raw body.

    Args:
        url: Address of the feed to fetch.
        timeout: Seconds passed to the HTTP request as its timeout.
            Defaults to 30, the value that was previously hard-coded.

    Returns:
        The undecoded response body.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
    """
    response = SESSION.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of handing an error page to the parser.
    response.raise_for_status()
    return response.content

def to_iso_utc(dt_str: str | None) -> str:
    if not dt_str:
        return ""
    try:
        dt = dtparse.parse(dt_str)
        if not dt.tzinfo:
            dt = dt.replace(tzinfo=timezone.utc)
        return dt.astimezone(timezone.utc).isoformat()
    except Exception:
        return ""

def parse_feed_to_df(feed_bytes: bytes, source_name: str = "JPMorganChase") -> pd.DataFrame:
    """Parse a raw feed document into a DataFrame with one row per entry.

    Args:
        feed_bytes: Raw feed content handed to ``feedparser.parse``.
        source_name: Value written to the ``source`` column of every row.

    Returns:
        A DataFrame with the columns ``source``, ``guid``, ``title``,
        ``link``, ``published_utc`` and ``summary``, in that order. The
        columns are present even when the feed has no entries, so callers
        can index ``df["guid"]`` unconditionally.

    Notes:
        Entry tags/categories are not exported. They were previously
        collected but never written to the rows; emitting them would add a
        column to the frame and change the schema of any CSV it is
        appended to.
    """
    columns = ["source", "guid", "title", "link", "published_utc", "summary"]

    parsed = feedparser.parse(feed_bytes)
    rows = []
    for e in parsed.entries:
        # Prefer an explicit id, then the link, and only as a last resort
        # synthesise an identifier from title + publication date.
        guid = (e.get("id") or e.get("guid") or e.get("link") or
                f"{e.get('title','')}-{e.get('published','')}")
        summary = e.get("summary") or e.get("description") or ""
        rows.append({
            "source": source_name,
            "guid": guid.strip(),
            "title": (e.get("title") or "").strip(),
            "link": (e.get("link") or "").strip(),
            "published_utc": to_iso_utc(e.get("published") or e.get("updated")),
            # Collapse runs of whitespace (including newlines) to single spaces.
            "summary": " ".join(summary.split()),
        })
    # Passing ``columns`` keeps the schema stable for an empty ``rows`` list.
    return pd.DataFrame(rows, columns=columns)

def load_seen(state_path: str = STATE_JSON) -> set[str]:
    """Load the set of already-processed entry ids from the JSON state file.

    Args:
        state_path: Path of the JSON state file; it is expected to hold an
            object with a ``seen_ids`` list.

    Returns:
        The ids stored under ``seen_ids``. An empty set is returned when the
        file does not exist, or when reading/decoding it raises any
        exception.
    """
    known: set[str] = set()
    if not os.path.exists(state_path):
        return known
    try:
        with open(state_path, "r", encoding="utf-8") as handle:
            payload = json.load(handle)
        known = set(payload.get("seen_ids", []))
    except Exception:
        # Best effort: an unreadable state file is treated as "nothing seen".
        known = set()
    return known

def save_seen(seen: set[str], state_path: str = STATE_JSON) -> None:
    """Persist the set of processed entry ids to the JSON state file.

    The ids are written sorted, as ``{"seen_ids": [...]}``.

    Args:
        seen: Ids to store.
        state_path: Destination path of the JSON state file.

    Raises:
        OSError: If the directory or file cannot be created or written.
    """
    # A fresh checkout may not have the target directory yet.
    parent = os.path.dirname(state_path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    # Write to a sibling temp file and swap it in, so an interrupted run
    # cannot leave a truncated state file behind (load_seen would read a
    # corrupt file as "nothing seen" and every entry would be re-processed).
    tmp_path = f"{state_path}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump({"seen_ids": sorted(seen)}, f, indent=2)
    os.replace(tmp_path, state_path)

In [15]:
# MAIN SCRIPT
# Fetch the feed, keep only entries not seen on a previous run, ask Gemini for
# a short summary of each new article, append the new rows to the CSV and
# record their GUIDs in the JSON state file.

# make sure the output locations exist before anything is written
for _path in (OUT_CSV, STATE_JSON):
    os.makedirs(os.path.dirname(_path) or ".", exist_ok=True)

# fetch data
feed_bytes = fetch_feed_bytes(FEED_URL)
df = parse_feed_to_df(feed_bytes)

# filter for new data
seen = load_seen(STATE_JSON)
if df.empty:
    # A feed without entries may produce a frame with no "guid" column.
    df_new = df.copy()
else:
    is_new = ~df["guid"].isin(seen)
    df_new = df[is_new].copy()

if not df_new.empty:
    # find accurate summaries
    # The API key is read from the environment; never commit it to the
    # notebook. The key that used to be hard-coded here was exposed and
    # should be revoked/rotated.
    api_key = os.environ.get("GEMINI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "Set the GEMINI_API_KEY environment variable before running this cell."
        )
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel("models/gemini-2.5-flash")
    # Prompt text (including its internal spacing) is kept exactly as before.
    df_new["summary"] = df_new["link"].apply(
        lambda x: model.generate_content(
            f"For the article in this link, {x}, "
            "    provide me a summary of the article. 2-3 sentences."
        ).text
    )

    # store feed data in csv
    # Emit the header row only when starting a new (or empty) file, so the
    # first run produces a self-describing CSV and later runs just append.
    write_header = not os.path.exists(OUT_CSV) or os.path.getsize(OUT_CSV) == 0
    df_new.to_csv(OUT_CSV, mode="a", header=write_header, index=False, quoting=csv.QUOTE_MINIMAL)
print(f'{df_new.shape[0]} rows added to csv file in: {OUT_CSV}')

# update json state (metadata)
if not df_new.empty:
    seen.update(df_new["guid"].tolist())
save_seen(seen, STATE_JSON)
print(f'{len(seen)} guids in to metadata file in: {STATE_JSON}')

0 rows added to csv file in: data/jpm_press_releases.csv
10 guids added to metadata file in: data/jpm_rss_state.json
