### `pandas`: data analysis library

In [3]:
import pandas as pd

# Wikipedia page holding the swimming world-record tables.
URL = "https://en.wikipedia.org/wiki/List_of_world_records_in_swimming"
# `read_html()` fetches the page and returns a list of dataframes, one per
# HTML table it finds.
# NOTE(review): the page is live, so table positions and values may differ
# between runs — confirm the indices used below still point at the same tables.
tables = pd.read_html(URL)

In [4]:
# preview the first rows of the first table parsed from the page
tables[0].head()

Unnamed: 0,Event,Time,Unnamed: 2,Name,Nationality,Date,Meet,Location,Ref
0,50m freestyle,20.91,ss,César Cielo,Brazil,18 December 2009,Brazilian Championships,"São Paulo, Brazil",[9][10][11][12]
1,100m freestyle,46.86,,David Popovici,Romania,13 August 2022,European Championships,"Rome, Italy",[13][14]
2,200m freestyle,1:42.00,ss,Paul Biedermann,Germany,28 July 2009,World Championships,"Rome, Italy",[15][16][17]
3,400m freestyle,3:40.07,ss,Paul Biedermann,Germany,26 July 2009,World Championships,"Rome, Italy",[18][19][20]
4,800m freestyle,7:32.12,ss,Zhang Lin,China,29 July 2009,World Championships,"Rome, Italy",[21][22]


In [5]:
# keep only the two columns of interest from the first table
df = tables[0].loc[:, ["Event", "Time"]]

In [6]:
# preview the reduced dataframe (only `Event` and `Time` remain)
df.head()

Unnamed: 0,Event,Time
0,50m freestyle,20.91
1,100m freestyle,46.86
2,200m freestyle,1:42.00
3,400m freestyle,3:40.07
4,800m freestyle,7:32.12


In [7]:
# with no arguments, `to_dict()` nests by column: {column: {row label: value}}
df.to_dict()

{'Event': {0: '50m freestyle',
  1: '100m freestyle',
  2: '200m freestyle',
  3: '400m freestyle',
  4: '800m freestyle',
  5: '1500m freestyle',
  6: '50m backstroke',
  7: '50m backstroke',
  8: '100m backstroke',
  9: '200m backstroke',
  10: '50m breaststroke',
  11: '100m breaststroke',
  12: '200m breaststroke',
  13: '50m butterfly',
  14: '100m butterfly',
  15: '200m butterfly',
  16: '200m individual medley',
  17: '400m individual medley',
  18: '4 × 100 m freestyle relay',
  19: '4 × 200 m freestyle relay',
  20: '4 × 100 m medley relay'},
 'Time': {0: '20.91',
  1: '46.86',
  2: '1:42.00',
  3: '3:40.07',
  4: '7:32.12',
  5: '14:31.02',
  6: '23.71',
  7: '23.55',
  8: '51.60',
  9: '1:51.92',
  10: '25.95',
  11: '56.88',
  12: '2:05.48',
  13: '22.27',
  14: '49.45',
  15: '1:50.34',
  16: '1:54.00',
  17: '4:02.50',
  18: '3:08.24',
  19: '6:58.55',
  20: '3:26.78'}}

In [9]:
# select rows with a boolean mask:
# `.str` exposes string methods on the `Event` column, and `contains()`
# flags every row whose event name includes "relay"
df.loc[df["Event"].str.contains("relay")]

Unnamed: 0,Event,Time
18,4 × 100 m freestyle relay,3:08.24
19,4 × 200 m freestyle relay,6:58.55
20,4 × 100 m medley relay,3:26.78


In [10]:
# the mask on its own: one boolean per row, True where `Event` contains "relay"
df["Event"].str.contains("relay")

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18     True
19     True
20     True
Name: Event, dtype: bool

In [12]:
# `~` inverts a boolean mask in `pandas` (element-wise NOT),
# so this selects every row that is *not* a relay
df.loc[~df["Event"].str.contains("relay")]

Unnamed: 0,Event,Time
0,50m freestyle,20.91
1,100m freestyle,46.86
2,200m freestyle,1:42.00
3,400m freestyle,3:40.07
4,800m freestyle,7:32.12
5,1500m freestyle,14:31.02
6,50m backstroke,23.71
7,50m backstroke,23.55
8,100m backstroke,51.60
9,200m backstroke,1:51.92


In [13]:
# rebind `df` to the relay-free rows, replacing the previous dataframe
df = df.loc[~df["Event"].str.contains("relay")]

In [14]:
# "records" orientation: a list with one {column: value} dictionary per row
df.to_dict(orient="records")

[{'Event': '50m freestyle', 'Time': '20.91'},
 {'Event': '100m freestyle', 'Time': '46.86'},
 {'Event': '200m freestyle', 'Time': '1:42.00'},
 {'Event': '400m freestyle', 'Time': '3:40.07'},
 {'Event': '800m freestyle', 'Time': '7:32.12'},
 {'Event': '1500m freestyle', 'Time': '14:31.02'},
 {'Event': '50m backstroke', 'Time': '23.71'},
 {'Event': '50m backstroke', 'Time': '23.55'},
 {'Event': '100m backstroke', 'Time': '51.60'},
 {'Event': '200m backstroke', 'Time': '1:51.92'},
 {'Event': '50m breaststroke', 'Time': '25.95'},
 {'Event': '100m breaststroke', 'Time': '56.88'},
 {'Event': '200m breaststroke', 'Time': '2:05.48'},
 {'Event': '50m butterfly', 'Time': '22.27'},
 {'Event': '100m butterfly', 'Time': '49.45'},
 {'Event': '200m butterfly', 'Time': '1:50.34'},
 {'Event': '200m individual medley', 'Time': '1:54.00'},
 {'Event': '400m individual medley', 'Time': '4:02.50'}]

In [15]:
# `set_index()`: reshape an existing dataframe to use an identified column as its index.
# Guarded so the cell can be re-run: once `Event` is the index it is no longer
# a column, and calling `set_index("Event")` a second time raises a KeyError.
if df.index.name != "Event":
    df = df.set_index("Event")
df.head()

Unnamed: 0_level_0,Time
Event,Unnamed: 1_level_1
50m freestyle,20.91
100m freestyle,46.86
200m freestyle,1:42.00
400m freestyle,3:40.07
800m freestyle,7:32.12


In [16]:
# the inner dictionary is now keyed by event name instead of row number.
# NOTE: "50m backstroke" occurs twice in the dataframe (23.71 and 23.55);
# a dictionary cannot hold a duplicated key, so only the later row (23.55)
# appears in the result.
df.to_dict()

{'Time': {'50m freestyle': '20.91',
  '100m freestyle': '46.86',
  '200m freestyle': '1:42.00',
  '400m freestyle': '3:40.07',
  '800m freestyle': '7:32.12',
  '1500m freestyle': '14:31.02',
  '50m backstroke': '23.55',
  '100m backstroke': '51.60',
  '200m backstroke': '1:51.92',
  '50m breaststroke': '25.95',
  '100m breaststroke': '56.88',
  '200m breaststroke': '2:05.48',
  '50m butterfly': '22.27',
  '100m butterfly': '49.45',
  '200m butterfly': '1:50.34',
  '200m individual medley': '1:54.00',
  '400m individual medley': '4:02.50'}}

In [17]:
# build out the records dictionary of dictionaries, one course at a time:
# start it with a single entry whose "inner" dictionary maps each event
# (the index of `df`) to the value in the `Time` column
records = {"LC Men": df["Time"].to_dict()}

In [18]:
# inspect the nested result: {course: {event: time}}
records

{'LC Men': {'50m freestyle': '20.91',
  '100m freestyle': '46.86',
  '200m freestyle': '1:42.00',
  '400m freestyle': '3:40.07',
  '800m freestyle': '7:32.12',
  '1500m freestyle': '14:31.02',
  '50m backstroke': '23.55',
  '100m backstroke': '51.60',
  '200m backstroke': '1:51.92',
  '50m breaststroke': '25.95',
  '100m breaststroke': '56.88',
  '200m breaststroke': '2:05.48',
  '50m butterfly': '22.27',
  '100m butterfly': '49.45',
  '200m butterfly': '1:50.34',
  '200m individual medley': '1:54.00',
  '400m individual medley': '4:02.50'}}

In [21]:
# Positions within `tables` of the tables to convert, paired by position
# with the label under which each one is stored in `records`.
RECORDS = (0, 2, 4, 5)
COURSES = ("LC Men", "LC Women", "SC Men", "SC Women")

# Build {course: {event: time}} by repeating the single-table steps above.
records = {}
for table, course in zip(RECORDS, COURSES):
    df = tables[table][["Event", "Time"]]
    # drop the relay rows
    df = df[~df["Event"].str.contains("relay")]
    # An event can be listed more than once (e.g. "50m backstroke" in the
    # first table). Converting a duplicated index to a dictionary silently
    # keeps only the last row, so make that choice explicit instead of
    # relying on it happening by accident.
    df = df.drop_duplicates(subset="Event", keep="last")
    df = df.set_index("Event")
    # convert just the `Time` column: {event: time}
    records[course] = df["Time"].to_dict()

In [23]:
import json
import pyprojroot as pyroot

# Load the reference records stored in the project's data directory
# (`pyroot.here()` resolves the path relative to the project root).
# Decode as UTF-8 explicitly: without `encoding=`, `open()` falls back to the
# platform's locale encoding, which is not guaranteed to be UTF-8.
with open(pyroot.here("data/swimrecord/records.json"), encoding="utf-8") as jf:
    gazpacho_records = json.load(jf)

In [24]:
# check that the dictionary built with `pandas` equals the one loaded from records.json
records == gazpacho_records

True

In [25]:
# spot-check a single entry: course first, then event
records["SC Women"]["100m breaststroke"]

'1:02.36'

In [26]:
# the same lookup against the records loaded from records.json
gazpacho_records["SC Women"]["100m breaststroke"]

'1:02.36'