In [49]:
# One-time setup: install (or upgrade to) the newest gazpacho release using the %pip magic.
# NOTE(review): no version is pinned, so a later re-run may pull a different release.
%pip install gazpacho --upgrade

Note: you may need to restart the kernel to use updated packages.


In [50]:
# Address of the Wikipedia page that is downloaded and scraped in the cells below.
URL = "https://en.wikipedia.org/wiki/List_of_world_records_in_swimming"

In [51]:
import gazpacho

In [52]:
# Fetch the page at URL; the following cells slice and search the result as a string of HTML.
html = gazpacho.get(URL)

In [53]:
# Length of the downloaded page.
len(html)

603647

In [54]:
# Peek at the first 500 characters of the page.
html[:500]

'<!DOCTYPE html>\n<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vect'

In [55]:
# Peek at the last 500 characters of the page.
html[-500:]

'anization","name":"Contributors to Wikimedia projects"},"publisher":{"@type":"Organization","name":"Wikimedia Foundation, Inc.","logo":{"@type":"ImageObject","url":"https:\\/\\/www.wikimedia.org\\/static\\/images\\/wmf-hor-googpub.png"}},"datePublished":"2007-03-15T21:20:10Z","dateModified":"2025-01-22T02:12:42Z","image":"https:\\/\\/upload.wikimedia.org\\/wikipedia\\/commons\\/a\\/ae\\/Caeleb_Dressel_before_winning_100_fly_%2842769914221%29.jpg","headline":"Wikimedia list article"}</script>\n</body>\n</html>'

In [56]:
# Locate the first "<table" in the raw HTML and preview the 500 characters
# starting at that position.
from_where = html.find("<table")
preview = slice(from_where, from_where + 500)
html[preview]

'<table class="wikitable sortable" style="font-size: 95%;">\n<caption>\n</caption>\n<tbody><tr>\n<th>Event\n</th>\n<th style="width:4em" class="unsortable">Time\n</th>\n<th class="unsortable">\n</th>\n<th>Name</th>\n<th>Nationality</th>\n<th>Date</th>\n<th>Meet</th>\n<th>Location\n</th>\n<th style="width:2em" class="unsortable">Ref\n</th></tr>\n\n<tr>\n<td><span data-sort-value="01&#160;!"><a href="/wiki/World_record_progression_50_metres_freestyle" title="World record progression 50 metres freestyle">50m freestyle<'

In [57]:
# Wrap the raw HTML in a gazpacho Soup object so it can be searched by tag.
soup = gazpacho.Soup(html)

In [58]:
# Check what kind of object was created from the HTML.
type(soup)

gazpacho.soup.Soup

In [59]:
# List the names available on the Soup object.
print(dir(soup))

['attrs', 'find', 'get', 'html', 'strip', 'tag', 'text']


In [60]:
# Collect every <table> element on the page. mode="all" makes find() always
# return a list (possibly empty). With the default mode the result type depends
# on the number of matches (None for no match, a bare Soup for exactly one),
# which would break the len(), indexing and iteration done on `tables` later.
tables = soup.find("table", mode="all")

In [61]:
# Check what kind of object the table search produced.
type(tables)

list

In [62]:
# How many tables were found on the page?
len(tables)

12

In [63]:
# Check the type of the first table in the list.
type(tables[0])

gazpacho.soup.Soup

In [64]:
# Check the type of the last table in the list.
type(tables[-1])

gazpacho.soup.Soup

In [65]:
# Grab every <tr> (table row) element found in the first table.
rows = tables[0].find("tr", mode="all")

In [66]:
# Check the type of the row collection.
type(rows)

list

In [67]:
# Number of <tr> rows in the first table; this count includes the heading row.
len(rows)

21

In [68]:
# Display the first row of the first table.
rows[0]

<tr>
  <th>Event
</th>
  <th style="width:4em" class="unsortable">Time
</th>
  <th class="unsortable">
</th>
  <th>Name</th>
  <th>Nationality</th>
  <th>Date</th>
  <th>Meet</th>
  <th>Location
</th>
  <th style="width:2em" class="unsortable">Ref
</th>
</tr>

In [69]:
# Report how many <tr> rows each table on the page contains, keyed by the
# table's position in `tables`.
for n, table in enumerate(tables):
    rows = table.find("tr", mode="all")
    print(f"{n} -> {len(rows)} rows")

0 -> 21 rows
1 -> 1 rows
2 -> 21 rows
3 -> 3 rows
4 -> 25 rows
5 -> 26 rows
6 -> 4 rows
7 -> 26 rows
8 -> 1 rows
9 -> 7 rows
10 -> 5 rows
11 -> 8 rows


In [70]:
# For each table, report its row count plus the number of <td> cells in its
# final row (used here as the table's column count).
for n, table in enumerate(tables):
    rows = table.find("tr", mode="all")
    last_row = rows[-1]
    cols = last_row.find("td", mode="all")
    print(f"{n} -> {len(rows)} rows, {len(cols)} columns")

0 -> 21 rows, 9 columns
1 -> 1 rows, 2 columns
2 -> 21 rows, 9 columns
3 -> 3 rows, 9 columns
4 -> 25 rows, 9 columns
5 -> 26 rows, 9 columns
6 -> 4 rows, 9 columns
7 -> 26 rows, 11 columns
8 -> 1 rows, 2 columns
9 -> 7 rows, 1 columns
10 -> 5 rows, 1 columns
11 -> 8 rows, 1 columns


In [71]:
# Print "event -> time" for every data row of the first table.
table = tables[0]
data_rows = table.find("tr", mode="all")[1:]  # [1:] skips the heading row
for row in data_rows:
    columns = row.find("td", mode="all")
    event = columns[0].text  # first cell: event name
    time = columns[1].text  # second cell: record time
    print(f"{event} -> {time}")

50m freestyle -> 20.91
100m freestyle -> 46.40
200m freestyle -> 1:42.00
400m freestyle -> 3:40.07
800m freestyle -> 7:32.12
1500m freestyle -> 14:30.67
50m backstroke -> 23.55
100m backstroke -> 51.60
200m backstroke -> 1:51.92
50m breaststroke -> 25.95
100m breaststroke -> 56.88
200m breaststroke -> 2:05.48
50m butterfly -> 22.27
100m butterfly -> 49.45
200m butterfly -> 1:50.34
200m individual medley -> 1:54.00
400m individual medley -> 4:02.50
4 × 100 m freestyle relay -> 3:08.24
4 × 200 m freestyle relay -> 6:58.55
4 × 100 m medley relay -> 3:26.78


In [72]:
# Each pair is (index into `tables`, course label). zip() transposes the pairs
# into two parallel tuples: RECORDS holds the indexes, COURSES the labels.
RECORDS, COURSES = zip(
    (0, "LC Men"),
    (2, "LC Women"),
    (4, "SC Men"),
    (5, "SC Women"),
)

In [73]:
# Preview how zip() pairs each table index with its course label.
list(zip(RECORDS, COURSES))

[(0, 'LC Men'), (2, 'LC Women'), (4, 'SC Men'), (5, 'SC Women')]

In [74]:
RECORDS = (0, 2, 4, 5)
COURSES = ("LC Men", "LC Women", "SC Men", "SC Women")

# For each (table index, course label) pair, print the course heading followed
# by one indented "event -> time" line per data row, then a blank separator.
for table, course in zip(RECORDS, COURSES):
    print(f"{course}:")
    data_rows = tables[table].find("tr", mode="all")[1:]  # skip heading row
    for row in data_rows:
        columns = row.find("td", mode="all")
        event = columns[0].text
        time = columns[1].text
        print(f"\t{event} -> {time}")
    print()

LC Men:
	50m freestyle -> 20.91
	100m freestyle -> 46.40
	200m freestyle -> 1:42.00
	400m freestyle -> 3:40.07
	800m freestyle -> 7:32.12
	1500m freestyle -> 14:30.67
	50m backstroke -> 23.55
	100m backstroke -> 51.60
	200m backstroke -> 1:51.92
	50m breaststroke -> 25.95
	100m breaststroke -> 56.88
	200m breaststroke -> 2:05.48
	50m butterfly -> 22.27
	100m butterfly -> 49.45
	200m butterfly -> 1:50.34
	200m individual medley -> 1:54.00
	400m individual medley -> 4:02.50
	4 × 100 m freestyle relay -> 3:08.24
	4 × 200 m freestyle relay -> 6:58.55
	4 × 100 m medley relay -> 3:26.78

LC Women:
	50m freestyle -> 23.61
	100m freestyle -> 51.71
	200m freestyle -> 1:52.23
	400m freestyle -> 3:55.38
	800m freestyle -> 8:04.79
	1500m freestyle -> 15:20.48
	50m backstroke -> 26.86
	100m backstroke -> 57.13
	200m backstroke -> 2:03.14
	50m breaststroke -> 29.16
	100m breaststroke -> 1:04.13
	200m breaststroke -> 2:17.55
	50m butterfly -> 24.43
	100m butterfly -> 55.18
	200m butterfly -> 2:01.81


In [75]:
RECORDS = (0, 2, 4, 5)
COURSES = ("LC Men", "LC Women", "SC Men", "SC Women")

# Build a two-level lookup: records[course][event] -> record time (as text).
records = {}
for table, course in zip(RECORDS, COURSES):
    by_event = {}
    records[course] = by_event
    data_rows = tables[table].find("tr", mode="all")[1:]  # skip heading row
    for row in data_rows:
        columns = row.find("td", mode="all")
        event = columns[0].text
        time = columns[1].text
        by_event[event] = time

In [76]:
# The top-level keys of `records` are the course labels.
records.keys()

dict_keys(['LC Men', 'LC Women', 'SC Men', 'SC Women'])

In [77]:
# Spot-check one record: index by course label first, then by event name.
records["SC Women"]["100m backstroke"]

'54.02'

In [78]:
# Spot-check another record, again by course label and then event name.
records["LC Men"]["50m freestyle"]

'20.91'

In [79]:
# Show the 100m butterfly record for each of the four courses, one per line.
for course in ("LC Men", "LC Women", "SC Men", "SC Women"):
    print(records[course]["100m butterfly"])

49.45
55.18
47.71
52.71


In [80]:
# Maps the short stroke codes to the full stroke names; the full names are the
# ones that appear in the scraped event names (e.g. "100m butterfly").
conversions = dict(
    [
        ("Free", "freestyle"),
        ("Back", "backstroke"),
        ("Breast", "breaststroke"),
        ("Fly", "butterfly"),
        ("IM", "individual medley"),
    ]
)

In [81]:
# Derive a `records` event key from a hyphen-separated filename whose last two
# fields are the distance and the stroke code.
event = "Mike-15-100m-Fly.txt"
parts = event.removesuffix(".txt").split("-")
*_, distance, stroke = parts  # only the final two fields are needed
lookup = f"{distance} {conversions[stroke]}"
lookup

'100m butterfly'

In [82]:
# Print the record stored under `lookup` for every course in `records`.
for course in records:
    print(records[course][lookup])

49.45
55.18
47.71
52.71


In [83]:
# Inspect every event stored for one course; the relay events are still included here.
records["LC Men"]

{'50m freestyle': '20.91',
 '100m freestyle': '46.40',
 '200m freestyle': '1:42.00',
 '400m freestyle': '3:40.07',
 '800m freestyle': '7:32.12',
 '1500m freestyle': '14:30.67',
 '50m backstroke': '23.55',
 '100m backstroke': '51.60',
 '200m backstroke': '1:51.92',
 '50m breaststroke': '25.95',
 '100m breaststroke': '56.88',
 '200m breaststroke': '2:05.48',
 '50m butterfly': '22.27',
 '100m butterfly': '49.45',
 '200m butterfly': '1:50.34',
 '200m individual medley': '1:54.00',
 '400m individual medley': '4:02.50',
 '4 × 100 m freestyle relay': '3:08.24',
 '4 × 200 m freestyle relay': '6:58.55',
 '4 × 100 m medley relay': '3:26.78'}

In [84]:
# Rebuild records[course][event] -> time from scratch, this time leaving out
# any event whose name contains "relay".
records = {}
for table, course in zip(RECORDS, COURSES):
    by_event = {}
    records[course] = by_event
    data_rows = tables[table].find("tr", mode="all")[1:]  # skip heading row
    for row in data_rows:
        columns = row.find("td", mode="all")
        event = columns[0].text
        time = columns[1].text
        if "relay" in event:
            continue
        by_event[event] = time

In [85]:
# Inspect the same course again to confirm the relay events were filtered out.
records["LC Men"]

{'50m freestyle': '20.91',
 '100m freestyle': '46.40',
 '200m freestyle': '1:42.00',
 '400m freestyle': '3:40.07',
 '800m freestyle': '7:32.12',
 '1500m freestyle': '14:30.67',
 '50m backstroke': '23.55',
 '100m backstroke': '51.60',
 '200m backstroke': '1:51.92',
 '50m breaststroke': '25.95',
 '100m breaststroke': '56.88',
 '200m breaststroke': '2:05.48',
 '50m butterfly': '22.27',
 '100m butterfly': '49.45',
 '200m butterfly': '1:50.34',
 '200m individual medley': '1:54.00',
 '400m individual medley': '4:02.50'}

In [86]:
import json

In [87]:
# Persist the nested records dictionary as JSON in the working directory.
with open("records.json", mode="w") as jf:
    jf.write(json.dumps(records))