In [27]:
import requests
from bs4 import BeautifulSoup

In [28]:
def get_html_content(url, timeout=10):
    """Download a page and parse it into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``; without it a stalled connection
            would block the notebook cell indefinitely.

    Returns:
        A ``BeautifulSoup`` object built from the response body with the
        built-in ``'html.parser'`` backend.

    Raises:
        Exception: If the server answers with any status code other than 200.
        requests.exceptions.RequestException: If the request itself fails
            (connection error, timeout, ...).
    """
    response = requests.get(url, timeout=timeout)

    # Guard clause: anything but a plain 200 is treated as a failure.
    if response.status_code != 200:
        raise Exception(f"Failed to retrieve the page, status code: {response.status_code}")

    return BeautifulSoup(response.content, 'html.parser')

In [29]:
def extract_title(soup):
    """Return the text of the page's ``<h1 id="firstHeading">`` element.

    Args:
        soup: Parsed page; only its ``find`` method is used.

    Returns:
        The heading's ``.text`` exactly as parsed (no stripping), or ``None``
        when the page has no ``h1`` with id ``firstHeading``.
    """
    heading = soup.find('h1', {'id': 'firstHeading'})

    # find() yields None when nothing matches; report "no title" instead of
    # failing with an AttributeError on None.text.
    if heading is None:
        return None

    return heading.text

In [30]:
def extract_text_with_headings(soup):
    """Group paragraph text under the heading that precedes it.

    Walks every ``h2``-``h6`` and ``p`` element in the order ``find_all``
    returns them. Each heading opens a section; each following paragraph is
    appended to the most recently seen heading.

    Args:
        soup: Parsed page; only its ``find_all`` method is used.

    Returns:
        A dict mapping stripped heading text to a list of stripped paragraph
        strings. Paragraphs that appear before the first heading are not
        included. When the same heading text occurs more than once, the
        paragraphs of all occurrences are collected under the one key.
    """
    content = {}
    current_heading = None

    for element in soup.find_all(['h2', 'h3', 'h4', 'h5', 'h6', 'p']):
        text = element.text.strip()

        if element.name.startswith('h'):
            current_heading = text
            # setdefault keeps paragraphs already gathered under an identical
            # heading instead of resetting that section to an empty list.
            content.setdefault(current_heading, [])
        elif current_heading is not None:
            # Compare against None explicitly: a heading whose text is empty
            # is still a heading, and its paragraphs must not be dropped.
            content[current_heading].append(text)

    return content

In [34]:
def collect_wikipedia_links(soup):
    """Build absolute English-Wikipedia URLs for the page's internal links.

    Every ``<a>`` carrying an ``href`` is inspected. Hrefs that begin with
    ``/wiki/`` are kept unless they begin with ``/wiki/Special:``; each kept
    href is prefixed with the ``https://en.wikipedia.org`` origin.

    Args:
        soup: Parsed page; only its ``find_all`` method is used.

    Returns:
        A list of URL strings in document order. Duplicates are preserved.
    """
    hrefs = (anchor['href'] for anchor in soup.find_all('a', href=True))

    return [
        f"https://en.wikipedia.org{path}"
        for path in hrefs
        if path.startswith('/wiki/') and not path.startswith('/wiki/Special:')
    ]

In [35]:
def scrape_wikipedia_page(url):
    """Fetch one page and bundle its title, sectioned text and links.

    Args:
        url: Address of the page to scrape.

    Returns:
        A dict with three keys: ``'title'`` (from ``extract_title``),
        ``'content'`` (from ``extract_text_with_headings``) and ``'links'``
        (from ``collect_wikipedia_links``).
    """
    page = get_html_content(url)

    # All three extractors read from the same parsed tree.
    return {
        'title': extract_title(page),
        'content': extract_text_with_headings(page),
        'links': collect_wikipedia_links(page),
    }

In [36]:
if __name__ == "__main__":
    # Demo run: scrape a single article and print everything that was found.
    url = 'https://en.wikipedia.org/wiki/Python_(programming_language)'
    data = scrape_wikipedia_page(url)

    # Title first, then each section heading followed by its paragraphs.
    print(f"Article Title: {data['title']}")
    print("\nArticle Content:")
    for heading in data['content']:
        paragraphs = data['content'][heading]
        print(f"\n{heading}")
        for paragraph in paragraphs:
            print(paragraph)

    # Finally every collected link, one per line.
    print("\nLinks to other Wikipedia pages:")
    for link in data['links']:
        print(link)

Article Title: Python (programming language)

Article Content:

Contents

Python is a high-level, general-purpose programming language. Its design philosophy emphasizes code readability with the use of significant indentation.[33]
Python is dynamically typed and garbage-collected. It supports multiple programming paradigms, including structured (particularly procedural), object-oriented and functional programming. It is often described as a "batteries included" language due to its comprehensive standard library.[34][35]
Guido van Rossum began working on Python in the late 1980s as a successor to the ABC programming language and first released it in 1991 as Python 0.9.0.[36] Python 2.0 was released in 2000. Python 3.0, released in 2008, was a major revision not completely backward-compatible with earlier versions. Python 2.7.18, released in 2020, was the last release of Python 2.[37]
Python consistently ranks as one of the most popular programming languages, and has gained widespread us