Check if scraping is allowed using robots.txt

In [1]:
from urllib import robotparser

robots_url = "https://en.wikipedia.org/robots.txt"
rp = robotparser.RobotFileParser()
rp.set_url(robots_url)
rp.read()

In [2]:
target = "https://en.wikipedia.org/wiki/Artificial_intelligence"
print("Allowed to scrape?", rp.can_fetch("*", target))

Allowed to scrape? True


Sending an HTTP GET request

In [1]:
import requests

In [2]:
url = "https://en.wikipedia.org/wiki/Artificial_intelligence"
headers = None

resp = requests.get(url, headers=headers, timeout=5)

In [3]:
print(f"Status code : {resp.status_code}")
print(f"Content-Type : {resp.headers.get('Content-Type')}")
print(f"First 100 char of HTML page : {resp.text[:100]}")

Status code : 200
Content-Type : text/html; charset=UTF-8
First 100 char of HTML page : <!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-la


Parsing HTML with BeautifulSoup

In [4]:
from bs4 import BeautifulSoup

In [6]:
# soup = BeautifulSoup(resp.text, "lxml")
soup = BeautifulSoup(resp.text, "html.parser")

In [8]:
soup.title.get_text()

'Artificial intelligence - Wikipedia'

In [9]:
heading = soup.find("h1", id="firstHeading").get_text(strip=True)
print("Main heading : ", heading)

Main heading :  Artificial intelligence


In [11]:
content_div = soup.find("div", id="mw-content-text")

first_paragraph = ""

for p in content_div.find_all("p"):
    text = p.get_text(strip=True)
    if len(text) > 50:
        first_paragraph = text
        break

print(f"First paragraph : {first_paragraph}")

First paragraph : Artificial intelligence(AI) is the capability ofcomputational systemsto perform tasks typically associated withhuman intelligence, such aslearning,reasoning,problem-solving,perception, anddecision-making. It is afield of researchincomputer sciencethat develops and studies methods andsoftwarethat enable machines toperceive their environmentand uselearningandintelligenceto take actions that maximize their chances of achieving defined goals.[1]


In [12]:
from urllib.parse import urljoin

base = "https://en.wikipedia.org"

internal_links = set()

for a in soup.select("#mw-content-text a[href^='/wiki/']"):
    href = a.get('href')
    if href and ':' not in href:
        full = urljoin(base, href)
        internal_links.add(full)

print("Internal wiki links found: ", len(internal_links))
print("Sample links: ")
for link in internal_links:
    print(link)

Internal wiki links found:  1785
Sample links: 
https://en.wikipedia.org/wiki/Recraft
https://en.wikipedia.org/wiki/Emergent_algorithm
https://en.wikipedia.org/wiki/The_Guardian
https://en.wikipedia.org/wiki/Murder
https://en.wikipedia.org/wiki/KUKA
https://en.wikipedia.org/wiki/Free_will
https://en.wikipedia.org/wiki/Short-circuit_evaluation
https://en.wikipedia.org/wiki/General-purpose_programming_language
https://en.wikipedia.org/wiki/Glossary_of_astronomy
https://en.wikipedia.org/wiki/Conditionally_independent
https://en.wikipedia.org/wiki/OpenAI_o1
https://en.wikipedia.org/wiki/ISBN_(identifier)
https://en.wikipedia.org/wiki/Midjourney
https://en.wikipedia.org/wiki/Academic_research
https://en.wikipedia.org/wiki/Activation_function
https://en.wikipedia.org/wiki/Software_design
https://en.wikipedia.org/wiki/Continuum_robot
https://en.wikipedia.org/wiki/Glossary_of_chemistry_terms
https://en.wikipedia.org/wiki/Alva_No%C3%AB
https://en.wikipedia.org/wiki/Network_security
https://en.w

In [13]:
import csv
out_file = "ai_wikipedia_summary.csv"

with open(out_file, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["field", "value"])
    writer.writerow(["heading", heading])
    writer.writerow(["first_paragraph", first_paragraph])
    writer.writerow(["top_internal_links_count", len(internal_links)])

print("Saved summary to", out_file)

Saved summary to ai_wikipedia_summary.csv
