In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Function to get the HTML content of a webpage
def get_html(url, timeout=10):
    """Download a web page and return its raw body.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on a stalled host.

    Returns:
        The response body as bytes.

    Raises:
        requests.HTTPError: If the server replies with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error pages instead of handing their HTML to the parsers.
    response.raise_for_status()
    return response.content

# Function to parse the BBC homepage and extract article links
def parse_homepage(html):
    """Collect the news article links found on the BBC homepage.

    Args:
        html: Homepage markup (str or bytes) as accepted by BeautifulSoup.

    Returns:
        A list of absolute ``https://www.bbc.com/news/...`` URLs, one entry
        per distinct link, in the order the links first appear in the page.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # dict.fromkeys drops duplicates while keeping first-seen order, so the
    # same HTML always yields the same list (a set would not guarantee that).
    unique_urls = dict.fromkeys(
        'https://www.bbc.com' + anchor['href']
        for anchor in soup.find_all('a', href=True)
        # Only site-relative links under /news/ are treated as articles.
        if anchor['href'].startswith('/news/')
    )

    return list(unique_urls)

# Function to parse individual article page
def parse_article(html):
    """Pull the headline and body text out of an article page.

    Args:
        html: Article markup (str or bytes) as accepted by BeautifulSoup.

    Returns:
        A ``(title, content)`` tuple. ``title`` is the text of the first
        ``<h1>`` element, or ``'No title'`` when the page has none.
        ``content`` is the text of every ``<p>`` element joined with single
        spaces.
    """
    document = BeautifulSoup(html, 'html.parser')

    heading = document.find('h1')
    if heading:
        title = heading.text
    else:
        title = 'No title'

    content = ' '.join(p.text for p in document.find_all('p'))

    return title, content

# Main script
if __name__ == "__main__":
    # Fetch the homepage and discover the article URLs to crawl.
    homepage_url = 'https://www.bbc.com'
    html = get_html(homepage_url)
    article_links = parse_homepage(html)

    articles_data = []

    for link in article_links:
        # Best-effort crawl: a single failing article is reported and
        # skipped so it cannot abort the whole run.
        try:
            article_html = get_html(link)
            title, content = parse_article(article_html)
        except Exception as e:
            print(f"Failed to scrape {link}: {e}")
        else:
            articles_data.append({'Title': title, 'Content': content, 'URL': link})

    # Save to CSV. The columns are given explicitly so the file still gets a
    # header row when no article could be scraped.
    df = pd.DataFrame(articles_data, columns=['Title', 'Content', 'URL'])
    df.to_csv('bbc_articles.csv', index=False)

    print("Scraping completed and data saved to 'bbc_articles.csv'")


Scraping completed and data saved to 'bbc_articles.csv'
