In [97]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from datetime import datetime
import random

url = "https://finance.yahoo.com"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

In [98]:
def get_article_links():
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    articles = []
    clamp_titles = soup.find_all(class_='clamp')

    for title_elem in clamp_titles:
        link_elem = title_elem.find_parent('a', href=True)
        if link_elem and '/news/' in link_elem['href']:
            article = {
                'title': title_elem.text.strip(),
                'url': url + link_elem['href'] if link_elem['href'].startswith('/') else link_elem['href']
            }
            articles.append(article)
    print(articles)
    return articles

In [99]:
def scrape_article_content(url):
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    content_elements = soup.find_all(['p', 'h2'])
    content = ' '.join([elem.text.strip() for elem in content_elements if elem.text.strip()])
    return content

In [100]:
print("Getting articles from main page...")
articles_data = get_article_links()
print(f"Found {len(articles_data)} articles")


Getting articles from main page...
[{'title': 'Nasdaq leads stocks higher as Netflix soars', 'url': 'https://finance.yahoo.com/news/live/stock-market-today-nasdaq-jumps-to-lead-sp-dow-higher-as-netflix-soars-and-ai-buzz-returns-120536015.html'}, {'title': "Netflix hits record high as Wall Street cheers 'near flawless' earnings", 'url': 'https://finance.yahoo.com/news/netflix-stock-reaches-all-time-high-as-wall-street-cheers-near-flawless-earnings-214816664.html'}, {'title': 'Nvidia, tech stocks rally after Trump announces $500B AI project', 'url': 'https://finance.yahoo.com/news/nvidia-tech-stocks-rally-after-trump-announces-500-billion-stargate-ai-project-153919780.html'}, {'title': "Dimon argues tariffs are good for US security: 'Get over it'", 'url': 'https://finance.yahoo.com/news/jpmorgans-dimon-argues-tariffs-are-good-for-us-security-even-if-inflationary-get-over-it-153641129.html'}, {'title': "Microsoft relaxes grip on OpenAI amid $500B 'Stargate' venture", 'url': 'https://finan

In [101]:
for i, article in enumerate(articles_data, 1):
    print(f"Scraping article {i}/{len(articles_data)}: {article['title'][:50]}...")
    content = scrape_article_content(article['url'])
    article['content'] = content
    time.sleep(random.uniform(1, 3))

Scraping article 1/21: Nasdaq leads stocks higher as Netflix soars...
Scraping article 2/21: Netflix hits record high as Wall Street cheers 'ne...
Scraping article 3/21: Nvidia, tech stocks rally after Trump announces $5...
Scraping article 4/21: Dimon argues tariffs are good for US security: 'Ge...
Scraping article 5/21: Microsoft relaxes grip on OpenAI amid $500B 'Starg...
Scraping article 6/21: Head of Norway's $1.8T fund sees a contrarian bet ...
Scraping article 7/21: Johnson & Johnson stock down despite beating expec...
Scraping article 8/21: JPMorgan's Dimon: US stock prices are 'kind of inf...
Scraping article 9/21: Hindenburg's closure highlights 'wear and tear' of...
Scraping article 10/21: Venture Global Slashes IPO Price Range by More Tha...
Scraping article 11/21: Goldman Sachs CEO says important for US, China to ...
Scraping article 12/21: Herald investors vote against Boaz Weinstein's rev...
Scraping article 13/21: Caffeine fix? Coffee wholesaler sues brokerage ove...
Sc

In [102]:
df = pd.DataFrame(articles_data)
output_file = f'yahoo_finance_articles.csv'
df.to_csv(output_file, index=False, encoding='utf-8')