In this notebook:
- Convert the IAAF points tables to a tidy pandas DataFrame.
- Will use this to add a view of the comparative strengths of the various joggling records.
- Note that this will not fill the gap of comparing records across different props (e.g. 3b vs. 5b).

In [1]:
import pandas as pd
import re
import numpy as np

## PyPDF2

In [7]:
from PyPDF2 import PdfReader
# Open the points-table PDF; the path is relative, so the file is looked up
# from the notebook's current working directory.
reader = PdfReader('points_data_pdf.pdf')

In [226]:
# Total page count of the PDF. Not displayed here: only the last expression
# of a cell is shown, which is the sample page text at the bottom.
len(reader.pages)

# Index ranges into reader.pages for each table group (both ends inclusive;
# the sprint-parsing loop uses range(189, 217) for the W Sprints range):
# M Sprints 9-36
# M Middle Distance 39-66
# M Long Distance 69-96
# M Road Running 99-126

# W Sprints 189-216
# W Middle Distance 219-246
# W Long Distance 249-276
# W Road Running 279-306

# Example String: raw extracted text of the first W Sprints page.
reader.pages[189].extract_text()

'WOMEN’S SPRINTS, HURDLES AND RELAYS / FEMMES SPRINTS, HAIES ET RELAIS \n182  Points 100m 100mH 200m 300m 400m 400mH 4x100m 4x200m 4x400m  1400 10.12 11.24 20.51 32.27 45.35 48.07 38.04 1:19.29 3:00.61  1399  - 11.25 20.52 32.29 45.37 48.09 38.06 1:19.34 3:00.72  1398  -  -  - 32.31 45.40 48.12 38.08 1:19.39 3:00.83  1397 10.13 11.26 20.53 32.32 45.42 48.15 38.11 1:19.43 3:00.94  1396  - 11.27 20.54 32.34 45.44 48.18 38.13 1:19.48 3:01.04  1395 10.14  - 20.55 32.35 45.46 48.21 38.15 1:19.53 3:01.15  1394  - 11.28 20.56 32.37 45.49 48.24 38.17 1:19.58 3:01.26  1393  - 11.29 20.57 32.39 45.51 48.27 38.19 1:19.62 3:01.36  1392 10.15  - 20.58 32.40 45.53 48.30 38.21 1:19.67 3:01.47  1391  - 11.30 20.59 32.42 45.56 48.33 38.24 1:19.72 3:01.58  1390 10.16 11.31 20.60 32.43 45.58 48.36 38.26 1:19.77 3:01.69  1389  -  -  - 32.45 45.60 48.39 38.28 1:19.81 3:01.79  1388 10.17 11.32 20.61 32.47 45.63 48.42 38.30 1:19.86 3:01.90  1387  - 11.33 20.62 32.48 45.65 48.45 38.32 1:19.91 3:02.01  1386  -

### Sprints

In [176]:
# Column labels applied, in left-to-right order, to every parsed row of a
# sprints / hurdles / relays page.
# NOTE(review): the women's sample page text labels the second column '100mH',
# while this list labels it '110mH' (presumably the men's event) - confirm the
# label is intended when parsing the women's pages.
sprints = [
    '100m', '110mH', '200m',
    '300m', '400m', '400mH',
    '4x100m', '4x200m', '4x400m',
]

In [192]:
def _parse_sprint_page(extracted_text, sprints):
    """Turn the extracted text of one sprints page into a DataFrame.

    The second line of ``extracted_text`` is treated as the whole table: it is
    cut after the ``Points`` header marker, then split into rows wherever a
    points value (1-4 digits surrounded by single spaces) appears. The points
    values themselves are consumed by the split and are NOT kept in the result.

    Parameters
    ----------
    extracted_text : str
        Text of one page, as returned by ``page.extract_text()``.
    sprints : list of str
        Column labels, one per event column.

    Returns
    -------
    pandas.DataFrame
        One row per chunk between points values (the first chunk is the
        event-header row of the page), columns ``sprints``, with ``'-'``
        placeholders replaced by NaN. Values are left as strings.

    Raises
    ------
    IndexError
        If the text has fewer than two lines or no ``' Points '`` marker.
    """
    table_line = extracted_text.splitlines()[1]
    # Normalise the marker to exactly two trailing spaces so one split
    # delimiter works whether the page used one space or two.
    body = table_line.replace(' Points ', ' Points  ').split(' Points  ')[1]
    # Collapse the double space in front of '-' placeholders so that every
    # points value is followed by exactly one space, as the regex expects.
    body = body.replace(' - ', '- ')
    chunks = re.split(r' [0-9]{1,4} ', body)

    short_row = len(sprints) - 1
    rows = []
    for chunk in chunks:
        cells = chunk.split()
        # A row one cell short is padded on the left with the missing-value
        # placeholder (the first column is assumed to be the one dropped).
        if len(cells) == short_row:
            cells = ['-'] + cells
        rows.append(cells)

    return pd.DataFrame(columns=sprints, data=rows).replace('-', np.nan)


def even_sprint_formatting(extracted_text, sprints):
    """Parse an even-numbered sprints page; see ``_parse_sprint_page``.

    Even and odd pages currently share the same parsing logic; the separate
    entry point is kept so callers can dispatch on page parity.
    """
    return _parse_sprint_page(extracted_text, sprints)


def odd_sprint_formatting(extracted_text,sprints):
    """Parse an odd-numbered sprints page; see ``_parse_sprint_page``.

    Even and odd pages currently share the same parsing logic; the separate
    entry point is kept so callers can dispatch on page parity.
    """
    return _parse_sprint_page(extracted_text, sprints)


In [227]:
# Parse every page in the selected range and stack the per-page frames.
# Frames are collected in a list and concatenated once, rather than growing
# a DataFrame inside the loop (which re-copies all earlier rows each time).
page_frames = []

for i in range(189,217):  # M:(9,37) , W:(189,217)
    print(f'Formatting page {i}')
    page = reader.pages[i]
    extracted_text = page.extract_text()
    if i % 2 == 0:  # even
        points_df = even_sprint_formatting(extracted_text, sprints)
    else:
        points_df = odd_sprint_formatting(extracted_text, sprints)
    page_frames.append(points_df)

full_df = pd.concat(page_frames)

# Remove filler rows: the repeated event-header row of each page, and rows
# in which every column is missing. `.copy()` makes the result an independent
# frame so that adding a column below does not warn about writing to a slice.
is_header_row = full_df['100m'] == '100m'
is_blank_row = full_df.isnull().all(axis=1)
full_df = full_df[~is_header_row & ~is_blank_row].copy()

# The split in the formatting helpers discards the points values, so they are
# re-attached positionally: the rows are expected to run from 1400 down to 1.
N_POINT_ROWS = 1400
if len(full_df) != N_POINT_ROWS:
    raise ValueError(
        f'Expected {N_POINT_ROWS} data rows after cleaning, got {len(full_df)}; '
        'a page was probably parsed incorrectly.'
    )
full_df['Points'] = np.arange(N_POINT_ROWS, 0, -1)
full_df

Formatting page 189
Formatting page 190
Formatting page 191
Formatting page 192
Formatting page 193
Formatting page 194
Formatting page 195
Formatting page 196
Formatting page 197
Formatting page 198
Formatting page 199
Formatting page 200
Formatting page 201
Formatting page 202
Formatting page 203
Formatting page 204
Formatting page 205
Formatting page 206
Formatting page 207
Formatting page 208
Formatting page 209
Formatting page 210
Formatting page 211
Formatting page 212
Formatting page 213
Formatting page 214
Formatting page 215
Formatting page 216


Unnamed: 0,100m,110mH,200m,300m,400m,400mH,4x100m,4x200m,4x400m,Points
1,10.12,11.24,20.51,32.27,45.35,48.07,38.04,1:19.29,3:00.61,1400
2,,11.25,20.52,32.29,45.37,48.09,38.06,1:19.34,3:00.72,1399
3,,,,32.31,45.40,48.12,38.08,1:19.39,3:00.83,1398
4,10.13,11.26,20.53,32.32,45.42,48.15,38.11,1:19.43,3:00.94,1397
5,,11.27,20.54,32.34,45.44,48.18,38.13,1:19.48,3:01.04,1396
...,...,...,...,...,...,...,...,...,...,...
45,21.29,28.87,44.00,1:14.32,1:46.13,2:05.10,1:34.41,3:24.06,7:42.10,5
46,21.36,28.99,44.16,1:14.60,1:46.54,2:05.62,1:34.79,3:24.90,7:43.99,4
47,21.45,29.13,44.34,1:14.92,1:47.00,2:06.20,1:35.22,3:25.85,7:46.14,3
48,21.55,29.29,44.55,1:15.30,1:47.55,2:06.90,1:35.73,3:26.98,7:48.68,2


In [228]:
# full_df.to_csv('IAAF_W_Sprints.csv',index=False)
# df = pd.read_csv('IAAF_W_Sprints.csv')
# df

## Middle Distance

## Long Distance

## Road Running

In [None]:
# Load one road-running page to prototype the parsing on.
# Index 100 is an even page; swap in 99 to try an odd page instead.
page = reader.pages[100] # 99
extracted_text = page.extract_text()

# Display the raw text to inspect the layout.
extracted_text

In [124]:
# Road-running event labels, in the order used for the table columns.
road_events = [
    '10km', '15km', '10 Miles', '20km', 'Half Marathon',
    '25km', '30km', 'Marathon', '100km',
]

# The odd-page column list puts 'points' first; the even-page list puts it last.
events_odd = ['points'] + road_events
events_even = road_events + ['points']

In [127]:
# For 'even' page numbers
points = extracted_text.splitlines()[1].split(' Points  ')[1].replace(' - ','- ')
points = re.split('  ',points)
points

points_df = pd.DataFrame(columns=events_even, data=[row.split() for row in points[0:]]).replace('-',np.nan)
points_df

Unnamed: 0,10km,15km,10 Miles,20km,Half Marathon,25km,30km,Marathon,100km,points
0,,39:29,42:30,53:18,56:21,1:07:48,1:22:42,1:59:52,5:49:03,1350
1,25:52,39:30,42:31,53:19,56:23,1:07:50,1:22:44,1:59:55,5:49:13,1349
2,,39:31,42:32,53:20,56:24,1:07:52,1:22:46,1:59:59,5:49:24,1348
3,25:53,,42:33,53:22,56:26,1:07:53,1:22:48,2:00:02,5:49:34,1347
4,,39:32,42:34,53:23,56:27,1:07:55,1:22:50,2:00:05,5:49:44,1346
5,25:54,39:33,42:35,53:24,56:28,1:07:57,1:22:52,2:00:08,5:49:54,1345
6,25:55,39:34,42:36,53:25,56:30,1:07:58,1:22:54,2:00:11,5:50:05,1344
7,,39:35,42:37,53:27,56:31,1:08:00,1:22:56,2:00:14,5:50:15,1343
8,25:56,39:36,42:38,53:28,56:32,1:08:02,1:22:58,2:00:17,5:50:25,1342
9,,39:37,42:39,53:29,56:34,1:08:03,1:23:00,2:00:20,5:50:36,1341


In [128]:
# For odd page numbers
points = extracted_text.splitlines()[1].split(' 100 km  ')[1].replace(' - ','- ') # This works for 'odd' pages
points = re.split('  ',points)
points

points_df = pd.DataFrame(columns=events_odd, data=[row.split() for row in points[0:]]).replace('-',np.nan)
points_df

IndexError: list index out of range

In [16]:
# Total number of pages in the loaded PDF (368 in the recorded run).
len(reader.pages)

368