In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import datetime

# Scrape article cards from the Yahoo Finance news page, collect one record per
# article (headline, link, date, source, category), print a preview and save
# the result to 'news_data.csv'.
from urllib.parse import urljoin  # stdlib; resolves relative article links

# Define the URL for Yahoo Finance news
url = "https://finance.yahoo.com/news"

# Seconds to wait for the server. Without an explicit timeout, requests.get
# can block indefinitely on an unresponsive host.
REQUEST_TIMEOUT = 10

# Make an HTTP request to the website. Network-level failures (DNS, refused
# connection, timeout) are reported the same way as a bad status code instead
# of surfacing as an unhandled traceback.
try:
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
except requests.RequestException as exc:
    response = None
    print(f"Failed to retrieve the page. Error: {exc}")

# Check if the request was successful (status code 200)
if response is not None and response.status_code == 200:
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all the <div> containers that hold the article headlines and links.
    # NOTE(review): the 'yf-…' class suffixes look auto-generated and may change
    # when the site is redeployed — verify the selectors if nothing is found.
    articles = soup.find_all('div', class_='content yf-1qcp8cc')

    # Fallback date used when an article carries no date element; computed once
    # so every row of a single run gets the same value.
    today = datetime.datetime.now().strftime('%Y-%m-%d')

    # Extract headlines, links, publication dates, source, and category
    news_data = []
    for article in articles:
        # Try to extract the headline using the h3 tag
        headline_tag = article.find('h3', class_='clamp tw-line-clamp-3 sm:tw-line-clamp-2 yf-1qcp8cc')
        if headline_tag is None:
            continue  # Skip if no headline is found
        headline = headline_tag.text.strip()

        # Extract link from the <a> tag (if exists)
        link_tag = article.find('a', class_='subtle-link fin-size-small titles noUnderline yf-1e4diqp')
        if link_tag is None or 'href' not in link_tag.attrs:
            continue  # Skip if no link is found
        # urljoin leaves absolute URLs untouched and resolves relative ones
        # against the page URL (including links without a leading slash and
        # protocol-relative '//host/path' links).
        link = urljoin(url, link_tag['href'])

        # Extract publication date from article metadata (if available).
        # NOTE(review): in the recorded run every row shows the fallback values
        # for date, source and category, so the three selectors below may no
        # longer match the page markup — confirm against the live HTML.
        date_tag = article.find('span', class_='Fw(b)')
        date = date_tag.text.strip() if date_tag is not None else today

        # Extract source (if available); default to Yahoo Finance
        source_tag = article.find('span', class_='C($tertiaryColor)')
        source = source_tag.text.strip() if source_tag is not None else 'Yahoo Finance'

        # Extract category (if available); default to 'General'
        category_tag = article.find('div', class_='C(#fff)')
        category = category_tag.text.strip() if category_tag is not None else 'General'

        # Append the data for each article
        news_data.append({
            "Headline": headline,
            "Link": link,
            "Date": date,
            "Source": source,
            "Category": category
        })

    if not news_data:
        # Make an empty result visible instead of silently writing an empty file.
        print("Warning: no articles matched the selectors; the saved CSV will be empty.")

    # Convert to a Pandas DataFrame
    news_df = pd.DataFrame(news_data)

    # Display the first few rows
    print(news_df.head())

    # Save to a CSV file for later use
    news_df.to_csv('news_data.csv', index=False)

elif response is not None:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")

                                            Headline  \
0  Lowe's beats its quarterly estimates, but nega...   
1  Hilton plans to expand footprint in India amid...   
2  Starting a small business is hard. Exiting can...   
3  Factbox-What new taxes could help raise money ...   
4  Google’s Chrome to Fetch Up to $20 Billion If ...   

                                                Link        Date  \
0  https://finance.yahoo.com/news/lowes-beats-its...  2024-11-19   
1  https://finance.yahoo.com/news/hilton-plans-ex...  2024-11-19   
2  https://finance.yahoo.com/news/starting-small-...  2024-11-19   
3  https://finance.yahoo.com/news/factbox-taxes-c...  2024-11-19   
4  https://finance.yahoo.com/news/doj-push-google...  2024-11-19   

          Source Category  
0  Yahoo Finance  General  
1  Yahoo Finance  General  
2  Yahoo Finance  General  
3  Yahoo Finance  General  
4  Yahoo Finance  General  
