In [11]:
import csv

import requests
from bs4 import BeautifulSoup

# Article whose first sufficiently large table is exported to CSV.
WIKIPEDIA_PAGE_URL = "https://en.wikipedia.org/wiki/Machine_learning"

# Headers sent with the page request. The User-Agent identifies this client
# to the server; it is identification, not authorization.
REQUEST_HEADERS = {"User-Agent": "UCalgary-Student-Assignment/1.0 (educational-use)"}

def has_3_rows_at_least(table):
    """
    Tell whether ``table`` holds three or more data rows.

    A row counts as a data row when a ``td`` lookup on that ``tr`` finds
    something. Scanning stops as soon as the third data row is seen.
    """
    still_needed = 3
    for tr in table.find_all("tr"):
        if not tr.find("td"):
            # No data cell in this row (e.g. header-only); ignore it.
            continue
        still_needed -= 1
        if still_needed == 0:
            return True
    return False

def clean_text(text):
    """
    Normalize the whitespace of a cell's text.

    Non-breaking spaces become ordinary spaces, then every run of
    whitespace (spaces, tabs, newlines) is reduced to a single space and
    leading/trailing whitespace is dropped.
    """
    without_nbsp = text.replace("\xa0", " ")
    words = without_nbsp.split()
    return " ".join(words)


# Download the article. An explicit timeout is passed because requests does
# not apply one by default, so a stalled server would hang the script.
wikipedia_page_html = requests.get(
    WIKIPEDIA_PAGE_URL, headers=REQUEST_HEADERS, timeout=30
)
wikipedia_page_html.raise_for_status()  # Fail on 4xx/5xx instead of parsing an error page

parsed_html_document = BeautifulSoup(wikipedia_page_html.text, 'html.parser')

div_content = parsed_html_document.find("div", id="mw-content-text")
if div_content is None:
    # Nothing to search in; stop with a clear message instead of an AttributeError.
    raise SystemExit("Could not find the 'mw-content-text' container in the page")

# Find the FIRST table that has at least 3 data rows
target_table = next(
    (table for table in div_content.find_all("table") if has_3_rows_at_least(table)),
    None,
)

if target_table is None:
    # Everything below dereferences target_table, so stop here.
    raise SystemExit('No matching table found')

# Extract table: a data row is any <tr> that contains a <td>
tr_contents = target_table.find_all("tr")
data_in_trs = [tr_content for tr_content in tr_contents if tr_content.find("td")]

data_rows = []
for tr_content in data_in_trs:
    cells = tr_content.find_all(["th", "td"])
    data_rows.append([clean_text(cell.get_text(" ", strip=True)) for cell in cells])

# Width of the widest data row; every row is padded to this width below.
max_columns = max((len(row) for row in data_rows), default=0)

if max_columns == 0:
    raise SystemExit("Found the table but couldn't extract any cells")

# Header: use a <tr> of only <th> before the first data row if it matches width.
# The first data row is located by scanning for a <td> directly, which avoids
# comparing row objects with each other.
first_data_index = next(
    index for index, tr_content in enumerate(tr_contents) if tr_content.find("td")
)
header = None

for tr_content in tr_contents[:first_data_index]:
    # Rows before the first data row contain no <td>, so only the <th> count
    # needs checking (max_columns >= 1 here, so a match is never empty).
    th_contents = tr_content.find_all("th")
    if len(th_contents) == max_columns:
        header = [clean_text(th_content.get_text(" ", strip=True)) for th_content in th_contents]
        break

# If no header, create col1...colN
if header is None:
    header = [f"col{i}" for i in range(1, max_columns + 1)]

# Pad short rows with "" (no row can be longer than max_columns)
normalized = [row + [""] * (max_columns - len(row)) for row in data_rows]

# Save CSV. csv.writer quotes cells containing commas, quotes or newlines,
# which a plain ",".join would write out as a corrupted row.
# lineterminator="\n" keeps the same line endings the file had before.
with open("wiki_table.csv", "w", encoding="utf-8", newline="") as file:
    writer = csv.writer(file, lineterminator="\n")
    writer.writerow(header)
    writer.writerows(normalized)

print("Saved wiki_table.csv")

Saved wiki_table.csv
