In [None]:
# Write a function to get and parse the HTML content of a Wikipedia page

In [3]:

import requests
from bs4 import BeautifulSoup

def get_and_parse_wikipedia_page(url, timeout=10):
    """Download a page and parse it into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection, which would hang the notebook cell.

    Returns:
        The parsed ``BeautifulSoup`` document, or ``None`` when the request
        fails (any ``requests.exceptions.RequestException``, which includes
        timeouts and the HTTP error raised by ``raise_for_status``). The
        failure is reported on stdout.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx answers into an exception instead of parsing an error page.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None

    return BeautifulSoup(response.content, 'html.parser')

if __name__ == "__main__":
    # Demo: fetch one page and show its <title> text and first paragraph.
    url = "https://en.wikipedia.org/wiki/Peter_Talbot"
    soup = get_and_parse_wikipedia_page(url)

    # The helper returns None when the download failed.
    if soup is not None:
        # Example: Print the page title (soup.title is None without a <title> tag)
        if soup.title is not None:
            print(soup.title.string)

        # Example: Extract the first paragraph (find() returns None without any <p>)
        first_paragraph = soup.find('p')
        if first_paragraph is not None:
            print(first_paragraph.get_text())


Peter Talbot - Wikipedia
Peter Talbot may refer to:



In [None]:
# Write a function to extract the article title

In [4]:
import requests
from bs4 import BeautifulSoup

def extract_wikipedia_title(url, timeout=10):
    """Fetch a page and return the text of its ``<h1 id="firstHeading">``.

    Args:
        url: Address of the article to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        The heading text as returned by ``get_text()``, or ``None`` when the
        request fails or the page has no ``h1`` element with id
        ``firstHeading``. Either failure is reported on stdout.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    try:
        # find() returns None when the heading is missing, so get_text()
        # raises AttributeError; that is reported below instead of crashing.
        return soup.find('h1', {'id': 'firstHeading'}).get_text()
    except AttributeError as e:
        print(f"An error occurred while parsing the title: {e}")
        return None

if __name__ == "__main__":
    # Demo run against a fixed article.
    url = "https://en.wikipedia.org/wiki/Peter_Talbot"
    title = extract_wikipedia_title(url)

    if title:  # a falsy result (None after a failure, or "") prints nothing
        print(f"Article Title: {title}")


Article Title: Peter Talbot


In [None]:
# Write a function to extract the article text, paragraph by paragraph, together with the heading each paragraph belongs to.
# Return a dictionary that maps each heading to its paragraphs.

In [5]:
import requests
from bs4 import BeautifulSoup

def extract_article_content(url, timeout=10):
    """Fetch a page and group its paragraph text by section heading.

    Every ``<h2>``/``<h3>`` starts a new section; each ``<p>`` is attached to
    the most recent heading. Paragraphs that appear before the first heading
    are filed under ``"Introduction"``.

    Args:
        url: Address of the article to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        A dict mapping heading text to the section's paragraphs joined with
        single spaces (an empty string for a heading without paragraphs), in
        document order, or ``None`` when the request or parsing fails (the
        failure is reported on stdout).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None

    try:
        soup = BeautifulSoup(response.content, 'html.parser')

        sections = {"Introduction": []}
        current_heading = "Introduction"

        for tag in soup.find_all(['h2', 'h3', 'p']):
            text = tag.get_text().strip()
            if tag.name == 'p':
                # Skip empty paragraphs so they do not add stray spaces to the
                # joined section text.
                if text:
                    sections[current_heading].append(text)
            else:
                current_heading = text
                # setdefault (not assignment): a heading text that occurs twice
                # must keep the paragraphs already collected under it.
                sections.setdefault(current_heading, [])

        return {
            heading: ' '.join(paragraphs)
            for heading, paragraphs in sections.items()
        }

    except AttributeError as e:
        print(f"An error occurred while parsing the content: {e}")
        return None

if __name__ == "__main__":
    # Demo: print every section of a fixed article, heading first.
    url = "https://en.wikipedia.org/wiki/Peter_Talbot"
    content = extract_article_content(url)

    # None (the failure result) is skipped; a dict is walked in insertion order.
    if content:
        for heading in content:
            paragraphs = content[heading]
            print(f"Heading: {heading}\n")
            print(f"Content: {paragraphs}\n")


Heading: Introduction

Content: Peter Talbot may refer to:



In [None]:
# Write a function to collect every link that points to another Wikipedia page

In [6]:
import requests
from bs4 import BeautifulSoup

def collect_wikipedia_links(url, timeout=10):
    """Fetch a page and list the absolute URLs of its ``/wiki/`` links.

    A link is kept when its ``href`` starts with ``/wiki/`` and contains no
    colon (the colon test presumably skips namespaced pages such as
    ``File:`` or ``Special:`` -- confirm if other hrefs must be kept).
    Duplicates are kept and the order is document order.

    Args:
        url: Address of the article to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        A list of absolute URLs, or ``None`` when the request fails (the
        failure is reported on stdout).
    """
    from urllib.parse import urlsplit

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # Build absolute links from the origin that actually served the page, so
    # that e.g. a de.wikipedia.org article does not yield en.wikipedia.org URLs.
    served_from = urlsplit(response.url)
    origin = f"{served_from.scheme}://{served_from.netloc}"

    links = []
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href']
        if href.startswith('/wiki/') and ':' not in href:
            links.append(f"{origin}{href}")

    return links

if __name__ == "__main__":
    # Demo: list the article links found on a fixed page, one per line.
    url = "https://en.wikipedia.org/wiki/Web_scraping"
    links = collect_wikipedia_links(url)

    # Both None (the failure result) and an empty list print nothing.
    for link in links or []:
        print(link)


https://en.wikipedia.org/wiki/Main_Page
https://en.wikipedia.org/wiki/Main_Page
https://en.wikipedia.org/wiki/Web_scraping
https://en.wikipedia.org/wiki/Web_scraping
https://en.wikipedia.org/wiki/Web_scraping
https://en.wikipedia.org/wiki/Data_scraping
https://en.wikipedia.org/wiki/Scraper_site
https://en.wikipedia.org/wiki/Data_scraping
https://en.wikipedia.org/wiki/Data_extraction
https://en.wikipedia.org/wiki/Website
https://en.wikipedia.org/wiki/World_Wide_Web
https://en.wikipedia.org/wiki/Hypertext_Transfer_Protocol
https://en.wikipedia.org/wiki/Internet_bot
https://en.wikipedia.org/wiki/Web_crawler
https://en.wikipedia.org/wiki/Database
https://en.wikipedia.org/wiki/Data_retrieval
https://en.wikipedia.org/wiki/Data_analysis
https://en.wikipedia.org/wiki/Parsing
https://en.wikipedia.org/wiki/Contact_scraping
https://en.wikipedia.org/wiki/Web_indexing
https://en.wikipedia.org/wiki/Web_mining
https://en.wikipedia.org/wiki/Data_mining
https://en.wikipedia.org/wiki/Comparison_shopping

In [None]:
# Wrap all the previous functions into a single function that takes a Wikipedia link as its parameter

In [7]:
import requests
from bs4 import BeautifulSoup

def _extract_title(soup):
    """Return the text of ``<h1 id="firstHeading">``, or None if it is missing."""
    try:
        # find() returns None for a missing heading, so get_text() raises
        # AttributeError; that is reported below instead of crashing.
        return soup.find('h1', {'id': 'firstHeading'}).get_text()
    except AttributeError as e:
        print(f"An error occurred while parsing the title: {e}")
        return None


def _extract_sections(soup):
    """Map each h2/h3 heading text to the space-joined paragraphs below it."""
    sections = {"Introduction": []}
    current_heading = "Introduction"

    for tag in soup.find_all(['h2', 'h3', 'p']):
        text = tag.get_text().strip()
        if tag.name == 'p':
            # Skip empty paragraphs so they do not add stray spaces to the
            # joined section text.
            if text:
                sections[current_heading].append(text)
        else:
            current_heading = text
            # setdefault (not assignment): a heading text that occurs twice
            # must keep the paragraphs already collected under it.
            sections.setdefault(current_heading, [])

    return {
        heading: ' '.join(paragraphs)
        for heading, paragraphs in sections.items()
    }


def _collect_article_links(soup, page_url):
    """List absolute URLs for hrefs that start with /wiki/ and hold no colon."""
    from urllib.parse import urlsplit

    # Use the origin of the page itself so non-English editions keep their host.
    served_from = urlsplit(page_url)
    origin = f"{served_from.scheme}://{served_from.netloc}"

    return [
        f"{origin}{a_tag['href']}"
        for a_tag in soup.find_all('a', href=True)
        if a_tag['href'].startswith('/wiki/') and ':' not in a_tag['href']
    ]


def extract_wikipedia_data(url, timeout=10):
    """Fetch a page once and extract its title, sectioned text and article links.

    Args:
        url: Address of the article to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        A dict with three keys, or ``None`` when the request fails (the
        failure is reported on stdout):

        * ``"title"``: text of ``<h1 id="firstHeading">``, or ``None`` when
          that element is missing.
        * ``"content"``: dict mapping heading text to the section's paragraphs
          joined with spaces; text before the first heading is stored under
          ``"Introduction"``.
        * ``"links"``: absolute URLs of every ``/wiki/`` href without a colon,
          in document order, duplicates included.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    return {
        "title": _extract_title(soup),
        "content": _extract_sections(soup),
        "links": _collect_article_links(soup, response.url)
    }

if __name__ == "__main__":
    # Demo: run the combined extractor on a fixed article and print each part.
    url = "https://en.wikipedia.org/wiki/Peter_Talbot"
    data = extract_wikipedia_data(url)

    # A failed fetch yields None, in which case nothing is printed here.
    if data:
        print(f"Title: {data['title']}\n")

        print("Content:")
        for heading in data['content']:
            paragraphs = data['content'][heading]
            print(f"Heading: {heading}\n")
            print(f"Content: {paragraphs}\n")

        print("Links:")
        for link in data['links']:
            print(link)


Title: Peter Talbot

Content:
Heading: Introduction

Content: Peter Talbot may refer to:

Links:
https://en.wikipedia.org/wiki/Main_Page
https://en.wikipedia.org/wiki/Main_Page
https://en.wikipedia.org/wiki/Peter_Talbot
https://en.wikipedia.org/wiki/Peter_Talbot
https://en.wikipedia.org/wiki/Peter_Talbot
https://en.wikipedia.org/wiki/Peter_Talbot_(bishop)
https://en.wikipedia.org/wiki/Peter_Talbot_(politician)
