In [6]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import re

In [54]:
# initialize the empty dataframe
df = pd.DataFrame(columns=['URL', 'Heading', 'SubHeadings', 'Content'])

In [7]:
# https://www.bbc.com/news
res = requests.get('https://www.bbc.com/news')

# Check if the request was successful
if res.status_code == 200:
    # Parse the HTML content of the page
    soup_data = BeautifulSoup(res.text, 'html.parser')

    # Find the parent div with class 'nw-c-top-stories--international'
    top_stories_div = soup_data.find('div', class_='nw-c-top-stories--international')

    # Check if the 'nw-c-top-stories--international' div was found
    if top_stories_div:
        # Find all <a> tags within the 'top_stories_div' with a class containing 'gs-c-promo-heading'
        promo_headings = top_stories_div.find_all('a', class_=lambda x: x and 'gs-c-promo-heading' in x)

        # Extract the links from the promo_headings
        links = [heading['href'] for heading in promo_headings]

        # Print the links
        for link in links:
            print(link)
    else:
        print("The 'nw-c-top-stories--international' div was not found on the page.")
else:
    print("Failed to retrieve the page. Status code:", res.status_code)


/news/live/world-middle-east-67339462
/news/live/world-middle-east-67339462
/news/world-middle-east-67339008
/news/world-europe-67342367
/news/science-environment-67343374
/news/world-europe-67346780
/news/world-us-canada-67346904
/sport/football/67343050
/news/world-middle-east-67321241
/sport/athletics/67336536
/news/world-africa-67342882
/news/world-africa-67342714
/news/world-africa-67342882
/news/world-africa-67342714
/news/world-europe-67343002
/news/world-asia-china-67341196
/news/business-67343893


In [8]:
len(links)

17

In [55]:
# drop duplicates
links = list(set(links))
links = [link for link in links if 'live' not in link]
links

['/news/world-europe-67342367',
 '/news/world-africa-67342714',
 '/sport/football/67343050',
 '/news/world-europe-67343002',
 '/news/world-europe-67346780',
 '/news/world-middle-east-67321241',
 '/news/science-environment-67343374',
 '/news/world-africa-67342882',
 '/sport/athletics/67336536',
 '/news/world-asia-china-67341196',
 '/news/business-67343893',
 '/news/world-middle-east-67339008',
 '/news/world-us-canada-67346904']

In [56]:
# assign the links to the dataframe
df['URL'] = links
df.head()

Unnamed: 0,URL,Heading,SubHeadings,Content
0,/news/world-europe-67342367,,,
1,/news/world-africa-67342714,,,
2,/sport/football/67343050,,,
3,/news/world-europe-67343002,,,
4,/news/world-europe-67346780,,,


In [57]:
MAIN_URL = 'https://www.bbc.com'

In [121]:
def requestArticle(idx, url):
    url_to_fetch = MAIN_URL + url

    # Send an HTTP GET request to the specified URL
    res = requests.get(url_to_fetch)

    # Check if the request was successful
    if res.status_code == 200:
        # Parse the HTML content of the page
        soup_data = BeautifulSoup(res.text, 'html.parser')

        # Find the <article> tag and get all elements inside it
        article = soup_data.find('article')

        if article:
            # Find all elements (tags) inside the <article> tag
            article_elements = article.find_all()
            
            # Extract the heading
            heading = article.find('h1').text
            # print("Heading")
            # print(heading)

            # Extract the subheadings
            # get headlines with data-component="subheadline-block"
            subheadings = article.find_all('div', attrs={'data-component': 'subheadline-block'})
            subheadings = [subheading for subheading in subheadings if 'You may also be interested in:' not in subheading.text]
            subheadings = [subheading.text for subheading in subheadings]
            subheadings = (', ').join(subheadings)
            # print("SubHeading")
            # print(subheadings)


            # Extract the content
            content = '\n'.join([element.text for element in article_elements if element.name == 'p'])
            # print("Content")
            # print(content)

            # Update the dataframe
            df.loc[idx, 'Heading'] = heading
            df.loc[idx, 'SubHeadings'] = subheadings
            df.loc[idx, 'Content'] = content

        else:
            print("No <article> tag found on the page.")

    else:
        print("Failed to retrieve the page. Status code:", res.status_code)


In [122]:
# Loop through the links and request each article
for idx, link in enumerate(links):
    requestArticle(idx, link)

In [126]:
df.head()

Unnamed: 0,URL,Heading,SubHeadings,Content
0,/news/world-europe-67342367,Ukraine war: Grenade birthday gift kills army ...,,A grenade given as a birthday present has blow...
1,/news/world-africa-67342714,Mr Ibu: Nigerian Nollywood actor's leg amputat...,,Family members of popular Nigerian comic actor...
2,/sport/football/67343050,Ukraine: Dnipro-1 and FC Oleksandriya play ‘lo...,,Last updated on 3 hours ago3 hours ago.From th...
3,/news/world-europe-67343002,Germany agrees to consider UK-style plan on pr...,,"German Chancellor Olaf Scholz has pledged to ""..."
4,/news/world-europe-67346780,Portuguese PM AntÃ³nio Costa resigns over lith...,,Portuguese Prime Minister AntÃ³nio Costa says ...
