In [1]:
from datetime import datetime
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Scrape headline links from several BBC News section pages, fetch the body
# text of each linked page, and save one row per unique headline to a CSV.

# Define BBC sections to scrape
sections = {
    "World": "https://www.bbc.com/news/world",
    "Business": "https://www.bbc.com/news/business",
    "Technology": "https://www.bbc.com/news/technology"
}

# Shared request settings: every HTTP call gets the same User-Agent and a
# timeout so a stalled connection cannot hang the whole run.
BASE_URL = "https://www.bbc.com"
REQUEST_HEADERS = {"User-Agent": "Mozilla/5.0"}
REQUEST_TIMEOUT = 10  # seconds

# Column order of the output table; passed explicitly so the DataFrame has
# the expected columns even when nothing could be scraped.
COLUMNS = ["Date", "Headline", "Article", "Genre"]

all_data = []

# Headlines already recorded. The final table keeps only the first row per
# headline, so skipping repeats here avoids fetching pages whose rows would
# be discarded anyway.
seen_headlines = set()

# One date stamp for the whole run keeps rows consistent across midnight.
scrape_date = datetime.now().strftime("%Y-%m-%d")

for genre, url in sections.items():
    print(f" Scraping BBC section: {genre} ({url})")

    try:
        response = requests.get(
            url, headers=REQUEST_HEADERS, timeout=REQUEST_TIMEOUT
        )
        response.raise_for_status()
    except requests.RequestException as exc:
        # A failing section should not abort the remaining sections.
        print(f" Skipping section {genre}: {exc}")
        continue

    soup = BeautifulSoup(response.text, "html.parser")

    # Select article links
    articles = soup.select("a[href*='/news/']")

    for a in articles:
        headline = a.get_text(strip=True)
        link = a.get("href")

        # Skip invalid links or non-articles
        if not headline or not link or "/av/" in link or "/live/" in link:
            continue

        # Skip headlines that were already captured (see seen_headlines).
        if headline in seen_headlines:
            continue
        seen_headlines.add(headline)

        # Resolve relative and protocol-relative hrefs against the site root;
        # absolute URLs pass through unchanged.
        link = urljoin(BASE_URL, link)

        # Fetch the article text (optional but included for 'Article' field)
        try:
            article_response = requests.get(
                link, headers=REQUEST_HEADERS, timeout=REQUEST_TIMEOUT
            )
            article_response.raise_for_status()
        except requests.RequestException:
            # Best effort: keep the headline row with a placeholder body.
            article_text = "Unable to retrieve article text."
        else:
            article_soup = BeautifulSoup(article_response.text, "html.parser")
            paragraphs = article_soup.select("article p")
            article_text = " ".join(p.get_text(strip=True) for p in paragraphs)

        all_data.append({
            "Date": scrape_date,
            "Headline": headline,
            "Article": article_text,
            "Genre": genre
        })

# Create DataFrame and save
df = pd.DataFrame(all_data, columns=COLUMNS).drop_duplicates(subset=["Headline"])
df.to_csv("bbc_news_multigenre.csv", index=False, encoding="utf-8-sig")

print("\n BBC News data saved to 'bbc_news_multigenre.csv'")
print(df.head())


 Scraping BBC section: World (https://www.bbc.com/news/world)
 Scraping BBC section: Business (https://www.bbc.com/news/business)
 Scraping BBC section: Technology (https://www.bbc.com/news/technology)

 BBC News data saved to 'bbc_news_multigenre.csv'
         Date         Headline  \
0  2025-11-02  Israel-Gaza War   
1  2025-11-02   War in Ukraine   
2  2025-11-02      US & Canada   
3  2025-11-02               UK   
4  2025-11-02      UK Politics   

                                             Article  Genre  
0  The bodies were identified as those of Amiram ...  World  
1  Oleksandr Syrskyi says special forces have bee...  World  
2  From hip-hop musician to the brink of running ...  World  
3  Ten people are injured while police say the in...  World  
4  The poor state of military housing has been a ...  World  
