In [9]:
import requests
import pandas as pd
import os
import time
from yahoo_fin import stock_info as si
import requests
from bs4 import BeautifulSoup



In [13]:
def fetch_news(start_date, end_date, output_csv_path, companies=None):
    """
    Scrape Yahoo Finance news for a set of tickers and save the result as a CSV.

    Args:
        start_date: Start of the requested date range; passed unchanged to
            ``scrape_yahoo_finance``.
        end_date: End of the requested date range; passed unchanged to
            ``scrape_yahoo_finance``.
        output_csv_path: Destination CSV path. Missing parent directories are
            created; a bare file name with no directory part is also accepted.
        companies: Optional ticker symbol or iterable of ticker symbols.
            Defaults to ``["JPM"]`` (JPMorgan Chase).

    Returns:
        None. The collected articles are written to ``output_csv_path``.
    """
    if companies is None:
        # Default run covers JPMorgan Chase only. Other tickers considered for
        # this notebook: GS (Goldman Sachs), PFE (Pfizer), MRNA (Moderna),
        # AAPL (Apple), MSFT (Microsoft), TSLA (Tesla), NVDA (Nvidia).
        companies = ["JPM"]
    elif isinstance(companies, str):
        # A lone ticker string would otherwise be iterated character by character.
        companies = [companies]

    all_articles = []
    for company in companies:
        print(f"Scraping news for: {company}")
        all_articles.extend(scrape_yahoo_finance(company, start_date, end_date))

    # Save the data to a CSV
    df = pd.DataFrame(all_articles)

    # os.path.dirname() returns "" for a bare file name and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when one is named.
    output_dir = os.path.dirname(output_csv_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    df.to_csv(output_csv_path, index=False)
    print(f"News data saved to: {output_csv_path}")

In [14]:
def scrape_yahoo_finance(company, start_date, end_date, max_pages=50, delay=1.0):
    """
    Scrapes Yahoo Finance news headlines and article text for one ticker.

    Listing pages are requested one after another until a request fails, a
    page contains no ``li.js-stream-content`` items, a page contributes no
    article URL that was not already collected, or ``max_pages`` pages have
    been read.

    Args:
        company: Ticker symbol used to build the news listing URL.
        start_date: Accepted for interface compatibility; currently unused,
            no date filtering is applied to the scraped articles.
        end_date: Accepted for interface compatibility; currently unused,
            no date filtering is applied to the scraped articles.
        max_pages: Upper bound on the number of listing pages requested.
        delay: Seconds to sleep between listing pages.

    Returns:
        list[dict]: One dict per article with the keys ``company``, ``title``,
        ``url`` and ``content``. Items without a headline link are skipped.
    """
    site_root = "https://finance.yahoo.com"
    base_url = f"{site_root}/quote/{company}/news?p={company}&count=100"
    # NOTE(review): the recorded run of this notebook got HTTP 404 for JPM.
    # Yahoo is reported to reject the default python-requests User-Agent, so an
    # explicit one is sent -- confirm that this resolves the 404.
    headers = {"User-Agent": "Mozilla/5.0 (compatible; financial-news-notebook/1.0)"}

    all_articles = []
    seen_urls = set()

    # Bounded loop: if the server ignores the `page` parameter it keeps
    # returning the same items, which would otherwise never terminate.
    for page in range(1, max_pages + 1):
        # Construct URL with pagination
        url = f"{base_url}&page={page}"
        try:
            response = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException as e:
            print(f"Failed to fetch news for {company}: {e}")
            break

        if response.status_code != 200:
            print(f"Failed to fetch news for {company}: {response.status_code}")
            break

        soup = BeautifulSoup(response.content, "html.parser")
        articles = soup.find_all("li", class_="js-stream-content")

        if not articles:  # If no articles are found, exit pagination
            break

        new_on_page = 0
        for article in articles:
            try:
                title_element = article.find("h3")
                anchor = title_element.find("a") if title_element else None
                href = anchor.get("href") if anchor else None
                if not href:
                    # Nothing to link to or fetch (e.g. a non-article list
                    # item); skip it instead of recording an all-"N/A" row.
                    continue

                # Only prefix the site root for relative links; prefixing an
                # absolute URL would produce a broken address.
                if href.startswith(("http://", "https://")):
                    link = href
                else:
                    link = site_root + href

                if link in seen_urls:
                    continue
                seen_urls.add(link)
                new_on_page += 1

                all_articles.append({
                    "company": company,
                    "title": title_element.text,
                    "url": link,
                    # Fetch the article content
                    "content": scrape_full_content(link),
                })
            except Exception as e:
                print(f"Error processing article: {e}")
                continue

        if new_on_page == 0:  # Same items again: pagination is exhausted
            break

        time.sleep(delay)  # Delay to avoid overwhelming the server

    return all_articles

def scrape_full_content(url, timeout=10):
    """
    Scrapes the full content of a news article given its URL.

    Args:
        url: Absolute ``http://`` or ``https://`` article URL. Anything else
            (for example the ``"N/A"`` placeholder, an empty string or
            ``None``) is not fetched.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the whole scrape.

    Returns:
        str: The text of every ``<p>`` element joined with single spaces, or
        ``"N/A"`` when the URL is not fetchable or the request/parsing fails.
    """
    # Placeholder or malformed URLs cannot be requested; skip the network call.
    if not isinstance(url, str) or not url.startswith(("http://", "https://")):
        return "N/A"

    # NOTE(review): an explicit User-Agent is sent because Yahoo is reported to
    # reject the default python-requests one -- confirm against a live run.
    headers = {"User-Agent": "Mozilla/5.0 (compatible; financial-news-notebook/1.0)"}

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, "html.parser")
        return " ".join(p.get_text() for p in soup.find_all("p"))
    except Exception as e:
        # Best effort: one unreadable article must not abort the whole run.
        print(f"Failed to scrape article content from {url}: {e}")
        return "N/A"

In [15]:
# Parameters for this run: the date window handed to the scraper and the CSV
# file the collected articles are written to.
start_date, end_date = "2024-09-27", "2024-11-01"
output_csv_path = "test_data/financial_news.csv"

# Scrape the news and write the CSV.
fetch_news(
    start_date=start_date,
    end_date=end_date,
    output_csv_path=output_csv_path,
)

Scraping news for: JPM
Failed to fetch news for JPM: 404
News data saved to: test_data/financial_news.csv
