In [1]:
from bs4 import BeautifulSoup # this module helps in web scrapping.
import requests  # this module helps us to download a web page

In [29]:
url = 'https://en.wikipedia.org/wiki/Economy_of_Nigeria'

In [31]:
data = requests.get(url).content

In [33]:
soup = BeautifulSoup(data, "html.parser") 

In [62]:
 #Write a function to Get and parse html content from a Wikipedia page

def get_soup_from_url(url):
    """
    Takes a URL, sends a GET request, and returns a BeautifulSoup object
    for HTML parsing.
    
    Parameters:
    - url (str): The URL of the page to scrape.

    Returns:
    - BeautifulSoup object
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    return soup


In [39]:
#Write a function to Extract article title

def extract_article_title(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Extract title tag content
    title = soup.title.string if soup.title else "No title found"
    
    return title
    
title_tag = soup.find('title')
print(title_tag.string)

Economy of Nigeria - Wikipedia


In [55]:
#Write a function to Extract article text for each paragraph with their respective headings. Map those headings to their respective paragraphs in the dictionary.

def extract_article_structure(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    article_dict = {}
    current_heading = None

    # Target headings and paragraphs
    for tag in soup.find_all(['h1', 'h2', 'h3', 'h4', 'p']):
        if tag.name in ['h1', 'h2', 'h3', 'h4']:
            current_heading = tag.get_text(strip=True)
            article_dict[current_heading] = []
        elif tag.name == 'p' and current_heading:
            text = tag.get_text(strip=True)
            if text:  # avoid empty paragraphs
                article_dict[current_heading].append(text)

    return article_dict


In [57]:
extract_article_structure(url)

{'Contents': [],
 'Economy of Nigeria': ['Theeconomy of Nigeriais a middle-income,mixed economyandemerging market[27][28]with expanding manufacturing, financial, service, communications, technology, and entertainment sectors.[29][30]It is ranked as the53rd-largest economy in the world in terms of nominal GDP, the sixth largest in Africa and the27th-largest in terms of purchasing power parity.',
  "The country's re-emergent manufacturing sector became the largest on the continent in 2013, and it produces a large proportion of goods and services for the region ofWest Africa.[31]Nigeria'sdebt-to-GDP ratiowas 36.63% in 2021 according to theIMF.[21]",
  "Although oil revenues contributed 2/3 of state revenues,[32]oil only contributes about 9% to the GDP. Nigeria produces about 2.7% of theworld's oil supply. Although the petroleum sector is important, as Nigeria's government revenues still heavily rely on this sector, it remains a small part of the country's overall economy. The largely subs

In [74]:
#Write a function to collect every link that redirects to another Wikipedia page

def get_internal_wikipedia_links(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    internal_links = set()

    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href']
        # Filter internal Wikipedia article links (ignores special pages, files, etc.)
        if href.startswith('/wiki/') and ':' not in href:
            full_url = 'https://en.wikipedia.org' + href
            internal_links.add(full_url)

    return list(internal_links)


In [76]:
get_internal_wikipedia_links(url)

['https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)',
 'https://en.wikipedia.org/wiki/Economy_of_Kyrgyzstan',
 'https://en.wikipedia.org/wiki/Economy_of_Chad',
 'https://en.wikipedia.org/wiki/Education_in_Nigeria',
 'https://en.wikipedia.org/wiki/BUA_Cement',
 'https://en.wikipedia.org/wiki/Doha_Development_Round',
 'https://en.wikipedia.org/wiki/Bali_Package',
 'https://en.wikipedia.org/wiki/United_Nations_Economic_Commission_for_Africa',
 'https://en.wikipedia.org/wiki/Refugees_of_Nigeria',
 'https://en.wikipedia.org/wiki/Petroleum',
 'https://en.wikipedia.org/wiki/Economy_of_Saint_Kitts_and_Nevis',
 'https://en.wikipedia.org/wiki/Foreign_exchange_market',
 'https://en.wikipedia.org/wiki/Hadejia',
 'https://en.wikipedia.org/wiki/Economy_of_Uruguay',
 'https://en.wikipedia.org/wiki/Cocoa_bean',
 'https://en.wikipedia.org/wiki/List_of_ecoregions_in_Nigeria',
 'https://en.wikipedia.org/wiki/Economy_of_Haiti',
 'https://en.wikipedia.org/wiki/Rainforest',
 'https://en.wikip

In [None]:
#Wrap all the previous functions into a single function that takes as parameters a Wikipedia link

def extract_wikipedia_article_info(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract the article title
    title = soup.title.string if soup.title else "No title found"

    # Extract structure: headings and associated paragraphs
    article_structure = {}
    current_heading = None
    for tag in soup.find_all(['h1', 'h2', 'h3', 'h4', 'p']):
        if tag.name in ['h1', 'h2', 'h3', 'h4']:
            current_heading = tag.get_text(strip=True)
            article_structure[current_heading] = []
        elif tag.name == 'p' and current_heading:
            paragraph = tag.get_text(strip=True)
            if paragraph:
                article_structure[current_heading].append(paragraph)

    # Extract internal Wikipedia links
    internal_links = set()
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href']
        if href.startswith('/wiki/') and ':' not in href:
            full_url = 'https://en.wikipedia.org' + href
            internal_links.add(full_url)

    return {
        "title": title,
        "structure": article_structure,
        "internal_links": list(internal_links)
    }
