In [178]:
# download
# !pip install feedparser pyarrow requests

# necessary imports
import json, os, csv
from datetime import timezone
import requests
import feedparser
from dateutil import parser as dtparse
import pandas as pd

import requests
from bs4 import BeautifulSoup

import time, random, re
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

import google.generativeai as genai

## JP MORGAN

In [199]:
# RSS feed that lists JPMorgan Chase news releases.
FEED_URL = "https://jpmorganchaseco.gcs-web.com/rss/news-releases.xml"

# Local artefacts: the CSV that accumulates releases and the JSON file of seen GUIDs.
OUT_CSV = "data/jpm_press_releases.csv"
STATE_JSON = "data/jpm_rss_state.json"

# Shared HTTP session (used by fetch_feed_bytes) that identifies this client.
SESSION = requests.Session()
SESSION.headers["User-Agent"] = "BNYCapstone/1.0"

In [259]:
def fetch_feed_bytes(url: str) -> bytes:
    """Download ``url`` through the shared ``SESSION`` and return the raw body.

    The request uses a 30 second timeout. ``raise_for_status`` is called on
    the response, so an HTTP error status propagates as an exception.
    """
    response = SESSION.get(url, timeout=30)
    response.raise_for_status()
    return response.content

def to_iso_utc(dt_str: str | None) -> str:
    if not dt_str:
        return ""
    try:
        dt = dtparse.parse(dt_str)
        if not dt.tzinfo:
            dt = dt.replace(tzinfo=timezone.utc)
        return dt.astimezone(timezone.utc).isoformat()
    except Exception:
        return ""

def parse_feed_to_df(feed_bytes: bytes, source_name: str = "JPMorganChase") -> pd.DataFrame:
    """Parse raw RSS/Atom bytes into a DataFrame with one row per feed entry.

    Parameters
    ----------
    feed_bytes : raw feed document, passed to ``feedparser.parse``.
    source_name : label written to the ``source`` column of every row.

    Returns
    -------
    DataFrame with the columns source, guid, title, link, published_utc and
    summary, in that order. The columns are present even when the feed has
    no entries, so callers can index ``df["guid"]`` without a KeyError.
    Entry tags are not part of the output.
    """
    columns = ["source", "guid", "title", "link", "published_utc", "summary"]
    parsed = feedparser.parse(feed_bytes)
    rows = []
    for e in parsed.entries:
        # Prefer the feed-supplied identifier, then the link, and finally a
        # title/date composite so that every entry gets a non-empty guid.
        guid = (e.get("id") or e.get("guid") or e.get("link") or
                f"{e.get('title','')}-{e.get('published','')}")
        summary = e.get("summary") or e.get("description") or ""
        rows.append({
            "source": source_name,
            "guid": guid.strip(),
            "title": (e.get("title") or "").strip(),
            "link": (e.get("link") or "").strip(),
            "published_utc": to_iso_utc(e.get("published") or e.get("updated")),
            # split()/join collapses every whitespace run (newlines included) to one space.
            "summary": " ".join(summary.split()),
        })
    # Passing the column list keeps the schema stable for an empty feed.
    return pd.DataFrame(rows, columns=columns)

In [185]:
def load_seen(state_path: str = STATE_JSON) -> set[str]:
    """Return the GUIDs stored under "seen_ids" in the JSON state file.

    A missing file, an unreadable file, or JSON of an unexpected shape all
    produce an empty set instead of an error.
    """
    if not os.path.exists(state_path):
        return set()
    try:
        with open(state_path, "r", encoding="utf-8") as fh:
            seen_ids = json.load(fh).get("seen_ids", [])
        return set(seen_ids)
    except Exception:
        # Best-effort read: any failure is treated as "nothing seen yet".
        return set()

def save_seen(seen: set[str], state_path: str = STATE_JSON) -> None:
    """Overwrite the JSON state file with the GUIDs, sorted, under "seen_ids"."""
    with open(state_path, "w", encoding="utf-8") as fh:
        payload = {"seen_ids": sorted(seen)}
        json.dump(payload, fh, indent=2)


In [207]:
# MAIN SCRIPT

# fetch data
feed_bytes = fetch_feed_bytes(FEED_URL)
df = parse_feed_to_df(feed_bytes)

# filter for new data
seen = load_seen(STATE_JSON)
is_new = ~df["guid"].isin(seen)
df_new = df[is_new].copy()

# find accurate summaries
# Prompt template for the per-article summary request. It is built from adjacent
# literals so that source indentation cannot leak into the prompt text.
SUMMARY_PROMPT = (
    "For the article in this link, {link}, "
    "provide me a summary of the article. 2-3 sentences."
)
if not df_new.empty:
    # The API key comes from the environment and is never stored in the notebook;
    # a KeyError here means GOOGLE_API_KEY is not set. The key that used to be
    # hardcoded in this cell should be treated as leaked and rotated.
    genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
    model = genai.GenerativeModel("models/gemini-2.5-flash")
    # NOTE(review): the model is given only the URL, not the article text.
    # Confirm it actually retrieves the page rather than guessing from the link.
    df_new["summary"] = df_new["link"].apply(
        lambda link: model.generate_content(SUMMARY_PROMPT.format(link=link)).text
    )

# store feed data in csv
# Create the output directory on first run, and write the header row only when
# the file is new or empty, so later appends do not repeat it.
os.makedirs(os.path.dirname(OUT_CSV) or ".", exist_ok=True)
write_header = not os.path.exists(OUT_CSV) or os.path.getsize(OUT_CSV) == 0
df_new.to_csv(OUT_CSV, mode="a", header=write_header, index=False, quoting=csv.QUOTE_MINIMAL)
print(f'{df_new.shape[0]} rows added to csv file in: {OUT_CSV}')

# update json state (metadata)
seen.update(df_new["guid"].tolist())
save_seen(seen, STATE_JSON)
# Note: len(seen) is the total number of tracked guids, not only those added in this run.
print(f'{len(seen)} guids added to metadata file in: {STATE_JSON}')

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): jpmorganchaseco.gcs-web.com:443
DEBUG:urllib3.connectionpool:https://jpmorganchaseco.gcs-web.com:443 "GET /rss/news-releases.xml HTTP/11" 200 7709
10 rows added to csv file in: data/jpm_press_releases.csv
10 guids added to metadata file in: data/jpm_rss_state.json


## NewsWire

In [104]:
def parse_feed_to_df(feed_bytes: bytes, source_name: str) -> pd.DataFrame:
    """Parse raw RSS/Atom bytes into a DataFrame with one row per feed entry.

    NOTE(review): this redefines ``parse_feed_to_df`` from the JP Morgan
    section. This version requires ``source_name`` and adds a ``categories``
    column, so whichever definition ran last decides the output schema.

    Parameters
    ----------
    feed_bytes : raw feed document, passed to ``feedparser.parse``.
    source_name : label written to the ``source`` column of every row.

    Returns
    -------
    DataFrame with the columns source, guid, title, link, published_utc,
    categories and summary, in that order. The columns are present even when
    the feed has no entries, so callers can index ``df["guid"]`` safely.
    """
    columns = ["source", "guid", "title", "link", "published_utc", "categories", "summary"]
    parsed = feedparser.parse(feed_bytes)
    rows = []
    for e in parsed.entries:
        # Prefer the feed-supplied identifier, then the link, then a title/date composite.
        guid = (e.get("id") or e.get("guid") or e.get("link") or
                f"{e.get('title','')}-{e.get('published','')}")
        # Collect tag labels, taking "term" first and "label" as the fallback.
        tags = []
        if isinstance(e.get("tags"), list):
            for t in e["tags"]:
                label = t.get("term") or t.get("label")
                if label:
                    tags.append(str(label).strip())
        summary = e.get("summary") or e.get("description") or ""
        rows.append({
            "source": source_name,
            "guid": (guid or "").strip(),
            "title": (e.get("title") or "").strip(),
            "link": (e.get("link") or "").strip(),
            "published_utc": to_iso_utc(e.get("published") or e.get("updated")),
            "categories": "; ".join(tags),
            # split()/join collapses every whitespace run (newlines included) to one space.
            "summary": " ".join(summary.split()),
        })
    # Passing the column list keeps the schema stable for an empty feed.
    return pd.DataFrame(rows, columns=columns)

def ingest_feed(feed_url: str, source_name: str, csv_path: str, parquet_path: str, state_path: str):
    """Fetch a feed and return the entries whose guid is not in the state file.

    ``csv_path`` and ``parquet_path`` are accepted but currently unused, and
    the state file is only read. The persistence step (appending to
    CSV/Parquet and saving the updated guid set) is disabled, so this
    function has no effect on disk.

    Returns a copy of the unseen rows as a DataFrame.
    """
    raw = fetch_feed_bytes(feed_url)
    entries = parse_feed_to_df(raw, source_name)
    already_seen = load_seen(state_path)
    unseen_mask = ~entries["guid"].isin(already_seen)
    return entries[unseen_mask].copy()


In [115]:
# New (unseen) JPMorgan Chase releases.
df_jp = ingest_feed(
    feed_url="https://jpmorganchaseco.gcs-web.com/rss/news-releases.xml",
    source_name="JPMorganChase",
    csv_path="jpm_press_releases.csv",
    parquet_path="jpm_press_releases.parquet",
    state_path="jpm_rss_state.json",
)

# New (unseen) PR Newswire releases.
df_newswire = ingest_feed(
    feed_url="https://www.prnewswire.com/rss/news-releases-list.rss",
    source_name="PRNewswire",
    csv_path="prnewswire_press_releases.csv",
    parquet_path="prnewswire_press_releases.parquet",
    state_path="prnewswire_rss_state.json",
)

In [116]:
# Peek at the first PRNewswire summary; the recorded output shows it still carries the feed's HTML <p> wrapper.
df_newswire.summary.loc[0]

'<p>Amazing Key Facts: Short Getaways: Jamaica, Bahamas, Barbados, and Turks & Caicos rise in winter bookings for quick escapes. Long Stays: Jeju, Bali, Mauritius, and Fiji cater to digital nomads and extended stays. Luxury Wellness: Sicily, Bora Bora, Maui, and Hilton Head lead in wellness...</p>'

In [176]:
# Display the PRNewswire frame.
# NOTE(review): the recorded output has an extra "Unnamed: 0" column, which suggests the frame
# was reloaded from a CSV in a cell that is not shown here — confirm before relying on this output.
df_newswire

Unnamed: 0,source,guid,title,link,published_utc,categories,summary
0,PRNewswire,https://www.prnewswire.com/news-releases/top-3...,"Top 30 Island Escapes for Winter Travel 2025, ...",https://www.prnewswire.com/news-releases/top-3...,2025-10-15T21:15:00+00:00,SVY,"<p>Amazing Key Facts: Short Getaways: Jamaica,..."
1,PRNewswire,https://www.prnewswire.com/news-releases/cptn-...,CPTN Investors have Opportunity to Lead Cepton...,https://www.prnewswire.com/news-releases/cptn-...,2025-10-15T21:15:00+00:00,ATY,"<p>NEW YORK, Oct. 15, 2025 /PRNewswire/ -- Why..."
2,PRNewswire,https://www.prnewswire.com/news-releases/jeju-...,Jeju Air Crash Families Sue Boeing for Deadly ...,https://www.prnewswire.com/news-releases/jeju-...,2025-10-15T21:15:00+00:00,LAW,<p>Lawsuit alleges catastrophic failures of el...
3,PRNewswire,https://www.prnewswire.com/news-releases/south...,Southwire CEO Rich Stinson Announces Retiremen...,https://www.prnewswire.com/news-releases/south...,2025-10-15T21:15:00+00:00,PER,<p>Stinson celebrates more than 40 years of in...
4,PRNewswire,https://www.prnewswire.com/news-releases/resid...,Resideo To Release Third Quarter 2025 Financia...,https://www.prnewswire.com/news-releases/resid...,2025-10-15T21:06:00+00:00,CCA; FVT,"<p>SCOTTSDALE, Ariz., Oct. 15, 2025 /PRNewswir..."
5,PRNewswire,https://www.prnewswire.com/news-releases/silve...,Silvercorp Reports Operational Results and Fin...,https://www.prnewswire.com/news-releases/silve...,2025-10-15T21:05:00+00:00,,<p>Trading Symbol: TSX/NYSE American: SVM VANC...
6,PRNewswire,https://www.prnewswire.com/news-releases/share...,SHAREHOLDER RIGHTS ALERT: Halper Sadeh LLC Inv...,https://www.prnewswire.com/news-releases/share...,2025-10-15T21:04:00+00:00,ATY,"<p>NEW YORK, Oct. 15, 2025 /PRNewswire/ -- Hal..."
7,PRNewswire,https://www.prnewswire.com/news-releases/willi...,WILLIAM SHATNER CALLS ON GLOBAL HEALTH ADVOCAT...,https://www.prnewswire.com/news-releases/willi...,2025-10-15T21:03:00+00:00,PDT,"<p>CLEVELAND , Oct. 15, 2025 /PRNewswire/ -- L..."
8,PRNewswire,https://www.prnewswire.com/news-releases/diego...,Diego Gonzalez Named Sales Leader at Clear Tec...,https://www.prnewswire.com/news-releases/diego...,2025-10-15T21:02:00+00:00,PER,<p>Executive's Strategic Vision and Team-Build...
9,PRNewswire,https://www.prnewswire.com/news-releases/nevad...,NEVADA KING GRADUATES TO TIER 1 OF THE TSX VEN...,https://www.prnewswire.com/news-releases/nevad...,2025-10-15T21:00:00+00:00,,"<p>VANCOUVER, BC, Oct. 15, 2025 /PRNewswire/ -..."


# Scrape Article from Link

In [141]:
def get_full_article_text(url: str) -> str:
    """Fetch ``url`` and return the release body as whitespace-normalised text.

    The body is the first element matching one of the known article
    containers. Returns "" when no container matches or when anything fails;
    failures are printed rather than raised.
    """
    # PR Newswire uses <div id="release-body"> or similar
    container_selector = "#release-body, .article-body, .article-content, .news-release-body"
    try:
        response = requests.get(url, timeout=50, headers={"User-Agent": "BNYCapstone/1.0"})
        response.raise_for_status()
        page = BeautifulSoup(response.text, "html.parser")

        body = page.select_one(container_selector)
        if not body:
            return ""

        # Remove non-content elements before extracting text.
        for unwanted in body.find_all(["script", "style"]):
            unwanted.decompose()
        raw_text = body.get_text(separator=" ", strip=True)
        return " ".join(raw_text.split())
    except Exception as err:
        print(f"Error fetching {url}: {err}")
        return ""


In [143]:
# Link of the first JPMorgan release, used to try the scraper.
test_link = df_jp["link"].loc[0]

In [145]:
# Try the scraper on one JPMorgan release; the recorded output shows this attempt ended in a read timeout (50s).
text = get_full_article_text(test_link)

Error fetching https://jpmorganchaseco.gcs-web.com/news-releases/news-release-details/jpmorganchase-declares-preferred-stock-dividends-13: HTTPSConnectionPool(host='jpmorganchaseco.gcs-web.com', port=443): Read timed out. (read timeout=50)


In [163]:
import time, random, logging, sys
import requests
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

# --- Verbose connection logging: INFO by default, DEBUG for urllib3, all written to stdout ---
logging.basicConfig(level=logging.INFO, stream=sys.stdout)
urllib3_logger = logging.getLogger("urllib3")
urllib3_logger.setLevel(logging.DEBUG)

# Press-release page that the diagnostics below are run against.
TARGET = "https://jpmorganchaseco.gcs-web.com/news-releases/news-release-details/jpmorganchase-declares-preferred-stock-dividends-13"

def make_session():
    """Build a requests Session with browser-like headers and automatic retries.

    The same retrying adapter is mounted for both https:// and http://.
    Returns the configured session.
    """
    browser_headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/119.0.0.0 Safari/537.36"
        ),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Connection": "keep-alive",
    }

    # Up to 4 retries overall (and per connect/read error), for GET and HEAD only.
    # backoff_factor scales the pause between consecutive retries.
    # raise_on_status=False returns the last response instead of raising once
    # retries on the listed status codes are exhausted.
    retry_policy = Retry(
        total=4,
        connect=4,
        read=4,
        backoff_factor=1.5,
        status_forcelist=(429, 500, 502, 503, 504),
        allowed_methods=frozenset(["GET", "HEAD"]),
        raise_on_status=False,
    )
    retrying_adapter = HTTPAdapter(max_retries=retry_policy, pool_connections=10, pool_maxsize=10)

    session = requests.Session()
    session.headers.update(browser_headers)
    for scheme in ("https://", "http://"):
        session.mount(scheme, retrying_adapter)
    return session

def probe(session, url, connect_timeout=10, read_timeout=25):
    """GET ``url`` once with ``session`` and print a diagnostic report.

    Prints either a one-line failure reason (connect timeout, read timeout,
    SSL error, other request error) or the status, elapsed time, redirect
    chain, final URL, body size and a few response headers. Redirects are
    followed. Always returns None.
    """
    print("\n" + "=" * 80)
    print(f"URL: {url}")
    started = time.time()
    try:
        resp = session.get(url, timeout=(connect_timeout, read_timeout), allow_redirects=True)
    except requests.exceptions.ConnectTimeout:
        print("❌ Connect timeout (could not establish TCP/TLS).")
    except requests.exceptions.ReadTimeout:
        print("❌ Read timeout (server accepted connection but no body returned in time).")
    except requests.exceptions.SSLError as err:
        print(f"❌ SSL error: {err}")
    except requests.exceptions.RequestException as err:
        print(f"❌ Request error: {err}")
    else:
        elapsed = time.time() - started
        headers = resp.headers

        print(f"✅ Status: {resp.status_code} in {elapsed:.2f}s")
        if resp.history:
            print("↪ Redirect chain:")
            for hop in resp.history:
                print(f"  {hop.status_code} -> {hop.headers.get('Location')!r}")
        print(f"Final URL: {resp.url}")
        print(f"Content-Length header: {headers.get('Content-Length')}")
        print(f"Downloaded bytes: {len(resp.content)}")
        # Either cache header, when present, is reported as the CDN status.
        cdn_status = headers.get('CF-Cache-Status') or headers.get('X-Cache')
        print(f"Server: {headers.get('Server')}, CDN: {cdn_status}")
        header_subset = {}
        for key in ("Content-Type", "Cache-Control", "Set-Cookie"):
            header_subset[key] = headers.get(key)
        print("Some headers:", header_subset)

# --- Run diagnostics ---
session = make_session()

# 0) Cheap request to the same host first, to separate general connectivity/SSL/CDN
#    problems from problems specific to the article page
probe(session, "https://jpmorganchaseco.gcs-web.com/robots.txt", 5, 10)

# 1) The article URL exactly as given
probe(session, TARGET, 10, 50)

# 2) Same URL with a trailing slash (original author's note: Q4 pages often prefer this)
if TARGET[-1:] != "/":
    probe(session, f"{TARGET}/", 10, 60)

# 3) HEAD request (sometimes allowed even if GET is slow); shows whether the page is reachable
print(f"\n{'=' * 80}\nHEAD check:")
try:
    r_head = session.head(TARGET, timeout=(5, 10), allow_redirects=True)
    print(f"HEAD status: {r_head.status_code}, final URL: {r_head.url}")
    print(
        "HEAD headers (subset):",
        {k: r_head.headers.get(k) for k in ["Content-Type", "Content-Length", "Server"]},
    )
except Exception as err:
    print("HEAD error:", err)



URL: https://jpmorganchaseco.gcs-web.com/robots.txt
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): jpmorganchaseco.gcs-web.com:443
DEBUG:urllib3.util.retry:Incremented Retry for (url='/robots.txt'): Retry(total=3, connect=4, read=3, redirect=None, status=None)
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (2): jpmorganchaseco.gcs-web.com:443
DEBUG:urllib3.util.retry:Incremented Retry for (url='/robots.txt'): Retry(total=2, connect=4, read=2, redirect=None, status=None)
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (3): jpmorganchaseco.gcs-web.com:443
DEBUG:urllib3.util.retry:Incremented Retry for (url='/robots.txt'): Retry(total=1, connect=4, read=1, redirect=None, status=None)
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (4): jpmorganchaseco.gcs-web.com:443
DEBUG:urllib3.util.retry:Incremented Retry for (url='/robots.txt'): Retry(total=0, connect=4, read=0, redirect=None, status=None)
DEBUG:urllib3.connectionpool:Starting new H

In [161]:
# NOTE(review): `r` is not assigned at notebook scope in any visible cell (only inside functions),
# so this display depends on leftover kernel state from a cell not shown — confirm or remove.
r

<Response [404]>

In [172]:
# Show the URL that the scraper and the diagnostics were run against.
test_link

'https://jpmorganchaseco.gcs-web.com/news-releases/news-release-details/jpmorganchase-declares-preferred-stock-dividends-13'