In [1]:
import requests
from bs4 import *
import string
import re
from functools import reduce
import pandas as pd

letters = string.ascii_letters

In [2]:
url = "https://en.wikipedia.org/w/index.php?title=List_of_presidents_of_the_United_States&oldid=1312863317"

# the Wikimedia policy requires setting up a user agent
# https://wikitech.wikimedia.org/wiki/Robot_policy
# https://foundation.wikimedia.org/wiki/Policy:Wikimedia_Foundation_User-Agent_Policy
# https://phabricator.wikimedia.org/T400119

headers = {
    "User-Agent": "Beautiful Soup Scraper 30.09.2025 ; kirillovdm2002@gmail.com)"
}

response = requests.get(url, headers=headers)
content = response.content
soup = BeautifulSoup(content, "html.parser")
# print(soup.title)
# print(soup.title.get_text())
# print(soup.body)
# print(response.status_code)

table = soup.find_all("tr")

print(soup)
print(table)

# print(tables[2])

cont = BeautifulSoup(str(table[3]), "html.parser")

<!DOCTYPE html>

<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-sticky-header-enabled vector-toc-available" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>List of presidents of the United States - Wikipedia</title>
<script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-

In [17]:
# for td:
# 0 - portrait - not needed for scraping
# 1 - Name and (birth-death)
# 2 - Term(s)
# 3 - Background colour - not needed for scraping
# 4 - Party
# 5 - Election
# 6 - Vice president

# The table has a very complex structure... it's hard to scrape


def iterate_table(table):
    lst = list()
    for it in table[1:48]:
        #        print(it)
        cont = BeautifulSoup(str(it), "html.parser")
        lst.append(get_values(cont))
    return lst


# a function to get terms (complex formatting makes it very hard to get in a straight way)
def get_terms(arg):
    words = reduce(lambda x, y: x + y, arg)
    # delete bad substrings
    words = re.sub(r"\[\w\]", "", string=words)
    words = re.sub(r"\n", "", string=words)
    return words


def get_values(cont):
    strings = list()
    tds = cont.find_all("td")
    for val in tds:
        print(list(val.strings))
        strings.append(list(val.strings))
#        print(strings)
    dct = dict()
    bad_substrings=("\n","[","]",*letters)
    dct["Name"] = strings[1][0]
    dct["Dates of birth and death"] = reduce(lambda x, y: x + y, strings[1][1])
    #    dct['Term'] = reduce(lambda x,y : x + y , strings[2][0:3])
    dct["Term"] = get_terms(strings[2])
    dct["Party"] = reduce(lambda x, y: x + " " + y,
                          filter(lambda x: x not in bad_substrings,strings[4])
                          )
    dct["Election"] = reduce(
        lambda x, y: x + " ; " + y, filter(lambda x: x!="\n", strings[5])
    )
    dct["Vice President"] = reduce(
        lambda x, y: x + "; " + y,
        filter(lambda x: not x in bad_substrings, strings[6]),
    )
    return dct

In [18]:
results = iterate_table(table)

print(results)

['\n']
['George Washington', '(1732–1799)', '[', '19', ']', '\n']
['April 30, 1789', '–', 'March 4, 1797', '\n']
['\n']
['Unaffiliated', '\n']
['1788–89', '1792', '\n']
['John Adams', '[', 'c', ']', '\n']
['\n']
['John Adams', '(1735–1826)', '[', '21', ']', '\n']
['March 4, 1797', '–', 'March 4, 1801', '\n']
['\n']
['Federalist', '\n']
['1796', '\n']
['Thomas Jefferson', '[', 'd', ']', '\n']
['\n']
['Thomas Jefferson', '(1743–1826)', '[', '23', ']', '\n']
['March 4, 1801', '–', 'March 4, 1809', '\n']
['\n']
['Democratic-', 'Republican', '\n']
['1800', '1804', '\n']
['Aaron Burr', 'George Clinton', '\n']
['\n']
['James Madison', '(1751–1836)', '[', '24', ']', '\n']
['March 4, 1809', '–', 'March 4, 1817', '\n']
['\n']
['Democratic-', 'Republican', '\n']
['1808', '1812', '\n']
['George Clinton', '[', 'e', ']', 'Vacant\xa0after', 'April 20, 1812', 'Elbridge Gerry', '[', 'e', ']', 'Vacant\xa0after', 'November 23, 1814', '\n']
['\n']
['James Monroe', '(1758–1831)', '[', '26', ']', '\n']
['Ma

In [19]:
# "DATE OF BIRTH AND DEATH" FOR LIVING PEOPLE IS STILL SCRAPED INCORRECTLY DUE TO COMPLEXITIES WITH FORMATTING

df = pd.DataFrame(data=results)

print(df)

                      Name Dates of birth and death  \
0        George Washington              (1732–1799)   
1               John Adams              (1735–1826)   
2         Thomas Jefferson              (1743–1826)   
3            James Madison              (1751–1836)   
4             James Monroe              (1758–1831)   
5        John Quincy Adams              (1767–1848)   
6           Andrew Jackson              (1767–1845)   
7         Martin Van Buren              (1782–1862)   
8   William Henry Harrison              (1773–1841)   
9               John Tyler              (1790–1862)   
10           James K. Polk              (1795–1849)   
11          Zachary Taylor              (1784–1850)   
12        Millard Fillmore              (1800–1874)   
13         Franklin Pierce              (1804–1869)   
14          James Buchanan              (1791–1868)   
15         Abraham Lincoln              (1809–1865)   
16          Andrew Johnson              (1808–1875)   
17        

In [6]:
# Saving to parquet doesn't work for some reason (I installed pyarrow)
df.to_csv("dataset.csv", index=False)