### Convert the data into a dictionary

In [2]:
import gazpacho

# web address of the Wikipedia page that lists the swimming world records
URL = "https://en.wikipedia.org/wiki/List_of_world_records_in_swimming"
# use `gazpacho` to retrieve raw HTML from URL web address (this performs a network request)
html = gazpacho.get(URL)
# passing the raw HTML string into the "Soup" constructor creates a parsed representation of the HTML
soup = gazpacho.Soup(html)
# search for tags of interest: every <table> on the page (mode="all" asks for all matches);
# later cells pick individual tables out of this collection by position
tables = soup.find("table", mode="all")

In [3]:
# Positions within `tables` of the four tables to read, paired one-to-one
# (by order) with the label each table is stored under in `records`.
RECORDS = (0, 2, 4, 5)
COURSES = ("LC Men", "LC Women", "SC Men", "SC Women")

# Build a nested dictionary: course label -> {event name -> record time}.
records = {}
for index, name in zip(RECORDS, COURSES):
    # drop the first <tr> of each table (presumably the header row — it is skipped unconditionally)
    body_rows = tables[index].find("tr", mode="all")[1:]
    # first <td> holds the event name, second <td> holds the time
    records[name] = {
        cells[0].text: cells[1].text
        for cells in (tr.find("td", mode="all") for tr in body_rows)
    }

In [5]:
# display the whole nested dictionary: course -> event -> record time
records

{'LC Men': {'50m freestyle': '20.91',
  '100m freestyle': '46.86',
  '200m freestyle': '1:42.00',
  '400m freestyle': '3:40.07',
  '800m freestyle': '7:32.12',
  '1500m freestyle': '14:31.02',
  '50m backstroke': '23.55',
  '100m backstroke': '51.60',
  '200m backstroke': '1:51.92',
  '50m breaststroke': '25.95',
  '100m breaststroke': '56.88',
  '200m breaststroke': '2:05.48',
  '50m butterfly': '22.27',
  '100m butterfly': '49.45',
  '200m butterfly': '1:50.34',
  '200m individual medley': '1:54.00',
  '400m individual medley': '4:02.50',
  '4 × 100 m freestyle relay': '3:08.24',
  '4 × 200 m freestyle relay': '6:58.55',
  '4 × 100 m medley relay': '3:26.78'},
 'LC Women': {'50m freestyle': '23.61',
  '100m freestyle': '51.71',
  '200m freestyle': '1:52.85',
  '400m freestyle': '3:55.38',
  '800m freestyle': '8:04.79',
  '1500m freestyle': '15:20.48',
  '50m backstroke': '26.98',
  '100m backstroke': '57.45',
  '200m backstroke': '2:03.14',
  '50m breaststroke': '29.16',
  '100m brea

In [4]:
# the top-level keys are the four course labels taken from COURSES
records.keys()

dict_keys(['LC Men', 'LC Women', 'SC Men', 'SC Women'])

In [6]:
# nested lookup: select the course first, then the event within it
records["LC Women"]["100m backstroke"]

'57.45'

In [7]:
# Stroke abbreviations (as they appear in swimmer filenames) mapped to the
# stroke names used in the event keys of `records`.
conversions = dict(
    Free="freestyle",
    Back="backstroke",
    Breast="breaststroke",
    Fly="butterfly",
    IM="individual medley",
)

In [8]:
# sample filename: "-"-separated pieces whose last two are the distance and the stroke abbreviation
event = "Mike-15-100m-Fly.txt"
# strip the ".txt" suffix, split on "-", and keep only the final two pieces;
# the starred `_` absorbs everything that comes before them
*_, distance, stroke = event.removesuffix(".txt").split("-")
# assemble the key in the same "<distance> <stroke name>" form used by the event keys in `records`
lookup = f"{distance} {conversions[stroke]}"
lookup

'100m butterfly'

In [9]:
# print the record time for the `lookup` event in every course, in dictionary order
for times in records.values():
    print(f"{times[lookup]}")

49.45
55.48
47.78
54.05


In [10]:
# inspect a single course; at this point the relay events are still included
records["LC Men"]

{'50m freestyle': '20.91',
 '100m freestyle': '46.86',
 '200m freestyle': '1:42.00',
 '400m freestyle': '3:40.07',
 '800m freestyle': '7:32.12',
 '1500m freestyle': '14:31.02',
 '50m backstroke': '23.55',
 '100m backstroke': '51.60',
 '200m backstroke': '1:51.92',
 '50m breaststroke': '25.95',
 '100m breaststroke': '56.88',
 '200m breaststroke': '2:05.48',
 '50m butterfly': '22.27',
 '100m butterfly': '49.45',
 '200m butterfly': '1:50.34',
 '200m individual medley': '1:54.00',
 '400m individual medley': '4:02.50',
 '4 × 100 m freestyle relay': '3:08.24',
 '4 × 200 m freestyle relay': '6:58.55',
 '4 × 100 m medley relay': '3:26.78'}

In [11]:
# Rebuild `records` from the same tables, this time leaving out relay events.
records = {}
for index, name in zip(RECORDS, COURSES):
    per_course = records.setdefault(name, {})
    # the first <tr> of each table is skipped unconditionally (presumably the header row)
    for tr in tables[index].find("tr", mode="all")[1:]:
        cells = tr.find("td", mode="all")
        label, mark = cells[0].text, cells[1].text
        # filter the "relay" data
        if "relay" in label:
            continue
        per_course[label] = mark

In [12]:
# same course again after the filtered rebuild: the relay events are no longer present
records["LC Men"]

{'50m freestyle': '20.91',
 '100m freestyle': '46.86',
 '200m freestyle': '1:42.00',
 '400m freestyle': '3:40.07',
 '800m freestyle': '7:32.12',
 '1500m freestyle': '14:31.02',
 '50m backstroke': '23.55',
 '100m backstroke': '51.60',
 '200m backstroke': '1:51.92',
 '50m breaststroke': '25.95',
 '100m breaststroke': '56.88',
 '200m breaststroke': '2:05.48',
 '50m butterfly': '22.27',
 '100m butterfly': '49.45',
 '200m butterfly': '1:50.34',
 '200m individual medley': '1:54.00',
 '400m individual medley': '4:02.50'}

### JSON-formatted file

In [15]:
# folder (relative to the project root resolved by `here`) that holds the records file
FD = "data/swimrecord/"

import json
from pyprojroot import here

# serialise the nested `records` dictionary and write it out as JSON text
with open(here(FD + "records.json"), "w") as out:
    out.write(json.dumps(records))

In [16]:
# Helper: turn the filename of any swimmer's data file into the matching
# event key used in the records dictionary.
def event_lookup(filename):
    """Convert a swimmer data filename into a records lookup key.

    The filename, with a trailing ".txt" removed if present, is split on "-".
    The last two pieces are taken as the distance and the stroke abbreviation,
    and the abbreviation is expanded to the full stroke name, e.g.
    "Darius-13-50m-Back.txt" -> "50m backstroke".

    Raises:
        KeyError: the stroke abbreviation is not Free, Back, Breast, Fly or IM.
        ValueError: the name has fewer than two "-"-separated pieces.
    """
    stroke_names = dict(
        Free="freestyle",
        Back="backstroke",
        Breast="breaststroke",
        Fly="butterfly",
        IM="individual medley",
    )

    pieces = filename.removesuffix(".txt").split("-")
    # only the final two pieces matter; anything before them is ignored
    distance, abbreviation = pieces[-2:]
    return f"{distance} {stroke_names[abbreviation]}"

In [17]:
# quick check of the helper with a sample filename
event_lookup("Darius-13-50m-Back.txt")

'50m backstroke'

In [18]:
# read the previously saved records back in from the JSON file
import json
from pyprojroot import here

# folder (relative to the project root resolved by `here`) that holds the records file
FD = "data/swimrecord/"

with open(here(FD + "records.json")) as src:
    records = json.loads(src.read())

In [19]:
# use the helper to build the event key, then fetch that event's time from the reloaded records
records["LC Men"][event_lookup("Darius-13-100m-Fly.txt")]

'49.45'