In [16]:
import requests
from pyquery import PyQuery as pq
import pandas as pd
import re
import tqdm
import dask.bag as db
from dask.distributed import Client
import json
import numpy as np

In [3]:
# Root of the 2021 results site; list pages and runner detail pages both hang off it.
BASE_URL = "https://results.chicagomarathon.com/2021/"

# Query-string template for one page of the results list.
# Fill in {page} (1-based page number) and {sex} (the site's sex filter code).
PATH = (
    "?page={page}&event=MAR&lang=EN_CAP&num_results=1000&pid=list"
    "&search%5Bsex%5D={sex}&search%5Bage_class%5D=%25"
)

In [4]:
def parse_page(base_url, path, gender, timeout=30):
    """Scrape one page of the results list into a list of runner records.

    Parameters
    ----------
    base_url : str
        Root URL of the results site. Also used as the prefix of every
        runner's detail-page URL.
    path : str
        Query string appended to ``base_url`` to select the list page.
    gender : str
        Label stored unchanged under the ``"gender"`` key of each record.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout ``requests`` may block indefinitely.

    Returns
    -------
    list of dict
        One dict per runner row with the keys ``name``, ``gender``,
        ``country``, ``age_class``, ``half_time``, ``finish_time`` and
        ``details_url``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not answer within ``timeout`` seconds.
    """
    resp = requests.get(base_url + path, timeout=timeout)
    # Fail loudly on an error response instead of silently parsing an error
    # page into an empty list of runners.
    resp.raise_for_status()
    d = pq(resp.content)
    # find first name field and navigate up to overarching row
    all_runners = d(".list-field.type-fullname a").closest(".list-group-item .row")
    all_runners_parsed = []
    for runner in all_runners.items():
        # The name link carries both the display text and the runner id (href).
        name_link = runner.find(".type-fullname a")
        name_country = name_link.text()
        # The id is whatever sits between "idp=" and the next "&" in the href.
        idp = re.search("(?<=idp=)[A-Z0-9_.-]*(?=&)", name_link.attr['href']).group(0)
        details_url = base_url + "?content=detail&idp=" + idp
        data = {
            # Assumes the link text ends with " (XXX)", a three-letter country
            # code in parentheses: strip those 6 chars for the name and keep
            # the 3 letters for the country.
            "name": name_country[:-6],
            "gender": gender,
            "country": name_country[-4:-1],
            # Each of these fields' text spans several lines; the value is
            # taken from the second line (presumably the first is the label).
            "age_class": runner.find(".type-age_class").text().split("\n")[1],
            "half_time": runner.find(".type-time").eq(0).text().split("\n")[1],
            "finish_time": runner.find(".type-time").eq(1).text().split("\n")[1],
            "details_url": details_url,
        }
        all_runners_parsed.append(data)

    return all_runners_parsed

def get_details(details_url):
    """Load a runner's detail page and extract bib, city/state and split times.

    ``details_url`` is handed straight to PyQuery, which is expected to load
    the page itself. Every value is the text of the elements matched by a CSS
    selector, so a selector that matches nothing yields an empty string.

    Returns a dict with the keys ``bib``, ``city_state`` and ``splits``.
    ``splits`` maps each checkpoint label (``start``, ``5km`` ... ``40km``,
    ``half``, ``finish``) to a dict with ``time_of_day`` and ``time``.
    """
    page = pq(details_url)

    def text_of(selector):
        # Text content of whatever the selector matches on the detail page.
        return page.find(selector).text()

    # (checkpoint label, CSS class of the element holding that split)
    checkpoint_rows = (
        ("5km", ".f-time_05"),
        ("10km", ".f-time_10"),
        ("15km", ".f-time_15"),
        ("20km", ".f-time_20"),
        ("half", ".f-time_52"),
        ("25km", ".f-time_25"),
        ("30km", ".f-time_30"),
        ("35km", ".f-time_35"),
        ("40km", ".f-time_40"),
        ("finish", ".f-time_finish_netto"),
    )

    # The start has no elapsed-time cell of its own, so its elapsed time is
    # fixed at zero and only the time of day is read from the page.
    splits = {
        "start": {
            "time_of_day": text_of(".f-starttime_net.last"),
            "time": "00:00:00",
        },
    }
    for label, row_class in checkpoint_rows:
        splits[label] = {
            "time_of_day": text_of(row_class + " .time_day"),
            "time": text_of(row_class + " .time"),
        }

    return {
        "bib": text_of(".f-start_no_text.last"),
        "city_state": text_of(".f-__city_state.last"),
        "splits": splits,
    }

In [5]:
# Scrape every results page for both sexes into one flat list of runner dicts.
# Each entry: (site's sex filter code, label stored on the record, number of pages).
all_runners = []

for sex_code, gender_label, page_count in (("M", "man", 15), ("W", "woman", 12)):
    # One progress bar per sex; pages are numbered from 1.
    for page in tqdm.tqdm(range(1, page_count + 1)):
        page_path = PATH.format(page=page, sex=sex_code)
        all_runners.extend(parse_page(BASE_URL, page_path, gender=gender_label))

100%|████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:32<00:00,  2.18s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:24<00:00,  2.03s/it]


In [6]:
# Cache the scraped list on disk so it can be reloaded without hitting the site again.
# index=False (the documented boolean) keeps the positional row index out of the file.
pd.DataFrame(all_runners).to_csv("data/runners.csv", index=False)

In [7]:
# Reload the cached runner list from disk as a list of dicts, one per CSV row.
all_runners = (
    pd.read_csv("data/runners.csv")
    .to_dict(orient="records")
)

In [8]:
# Start a Dask distributed client with default arguments. The repr captured
# below reports a distributed.LocalCluster, i.e. the workers run on this machine.
client = Client()  # set up local cluster on your laptop
# Bare expression so the notebook renders the cluster summary.
client

Perhaps you already have a cluster running?
Hosting the HTTP server on port 56466 instead


0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:56466/status,

0,1
Dashboard: http://127.0.0.1:56466/status,Workers: 5
Total threads: 10,Total memory: 32.00 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:56467,Workers: 5
Dashboard: http://127.0.0.1:56466/status,Total threads: 10
Started: Just now,Total memory: 32.00 GiB

0,1
Comm: tcp://127.0.0.1:56485,Total threads: 2
Dashboard: http://127.0.0.1:56491/status,Memory: 6.40 GiB
Nanny: tcp://127.0.0.1:56471,
Local directory: /Users/kanter/Documents/marathon_analysis/dask-worker-space/worker-s73wciuu,Local directory: /Users/kanter/Documents/marathon_analysis/dask-worker-space/worker-s73wciuu

0,1
Comm: tcp://127.0.0.1:56487,Total threads: 2
Dashboard: http://127.0.0.1:56494/status,Memory: 6.40 GiB
Nanny: tcp://127.0.0.1:56470,
Local directory: /Users/kanter/Documents/marathon_analysis/dask-worker-space/worker-u8t7xvb8,Local directory: /Users/kanter/Documents/marathon_analysis/dask-worker-space/worker-u8t7xvb8

0,1
Comm: tcp://127.0.0.1:56486,Total threads: 2
Dashboard: http://127.0.0.1:56492/status,Memory: 6.40 GiB
Nanny: tcp://127.0.0.1:56472,
Local directory: /Users/kanter/Documents/marathon_analysis/dask-worker-space/worker-zhhbdjlk,Local directory: /Users/kanter/Documents/marathon_analysis/dask-worker-space/worker-zhhbdjlk

0,1
Comm: tcp://127.0.0.1:56484,Total threads: 2
Dashboard: http://127.0.0.1:56489/status,Memory: 6.40 GiB
Nanny: tcp://127.0.0.1:56474,
Local directory: /Users/kanter/Documents/marathon_analysis/dask-worker-space/worker-ky93f3ck,Local directory: /Users/kanter/Documents/marathon_analysis/dask-worker-space/worker-ky93f3ck

0,1
Comm: tcp://127.0.0.1:56488,Total threads: 2
Dashboard: http://127.0.0.1:56493/status,Memory: 6.40 GiB
Nanny: tcp://127.0.0.1:56473,
Local directory: /Users/kanter/Documents/marathon_analysis/dask-worker-space/worker-4yjxvbeg,Local directory: /Users/kanter/Documents/marathon_analysis/dask-worker-space/worker-4yjxvbeg


In [9]:
# Quick check that a Dask bag over the runner list round-trips through the
# cluster: pull the "name" of every record and count the results.
len(
    db.from_sequence(all_runners)
    .map(lambda runner: runner["name"])
    .compute()
)

26078

In [10]:
# Show the first runner record as a quick look at the available fields.
all_runners[0]

{'name': 'Tura Abdiwak, Seifu',
 'gender': 'man',
 'country': 'ETH',
 'age_class': '20-24',
 'half_time': '01:02:29',
 'finish_time': '02:06:12',
 'details_url': 'https://results.chicagomarathon.com/2021/?content=detail&idp=LSMG963824DD5C'}

In [11]:
def add_details(runner):
    """Return a copy of ``runner`` enriched with the data from its detail page.

    Parameters
    ----------
    runner : dict
        Runner record; must contain a ``"details_url"`` key.

    Returns
    -------
    dict
        A new dict holding the original fields plus whatever ``get_details``
        returns. If anything goes wrong the failing record and the error are
        printed and an empty dict is returned, so one bad page does not abort
        the whole batch. The input dict is never modified.
    """
    enriched = runner.copy()
    try:
        enriched.update(get_details(enriched["details_url"]))
    except Exception as exc:
        # Best effort by design, but catch Exception rather than everything so
        # KeyboardInterrupt / SystemExit can still stop a long run, and report
        # the cause instead of discarding it.
        print(runner, repr(exc))
        return {}
    return enriched
    

# Fetch every runner's detail page in parallel through Dask.
# Records whose fetch failed come back from add_details as empty dicts.
all_with_details = (
    db.from_sequence(all_runners, npartitions=10000)
    .map(add_details)
    .compute()
)





In [12]:
# Persist the enriched records (including nested splits) as a single JSON array.
with open('data/runners_with_data.json', 'w') as out_file:
    json.dump(all_with_details, out_file)

In [30]:
# NOTE(review): within the visible notebook `df` is first assigned in the cell
# below, so on a fresh kernel this cell raises NameError unless that cell has
# already run. Consider moving this cell after it and showing `df.head()`
# rather than the whole frame.
df

Unnamed: 0,start.time,5km.time,10km.time,15km.time,20km.time,half.time,25km.time,30km.time,35km.time,40km.time,finish.time
0,00:00:00,00:14:43,00:29:15,00:44:21,00:59:13,01:02:29,01:14:42,01:30:06,01:45:01,01:59:44,02:06:12
1,00:00:00,00:14:43,00:29:25,00:44:23,00:59:24,01:02:40,01:14:44,01:30:07,01:45:02,01:59:53,02:06:35
2,00:00:00,00:14:43,00:29:17,00:44:21,00:59:13,01:02:29,01:14:42,01:30:06,01:45:01,02:00:05,02:06:51
3,00:00:00,00:14:44,00:29:16,00:44:22,00:59:15,01:02:30,01:14:44,01:30:07,01:45:30,02:01:48,02:08:50
4,00:00:00,00:14:36,00:29:15,00:44:06,00:59:10,01:02:29,01:14:43,01:30:08,01:45:59,02:02:16,02:09:39
...,...,...,...,...,...,...,...,...,...,...,...
26073,00:00:00,01:31:52,02:17:27,03:03:53,03:50:07,04:01:03,04:39:12,05:29:52,06:22:39,07:17:44,07:39:35
26074,00:00:00,00:47:22,01:38:19,02:32:04,03:27:30,03:39:30,04:22:17,05:19:36,06:17:51,07:20:33,07:48:05
26075,00:00:00,00:45:41,01:35:30,02:24:48,03:16:57,03:28:36,04:13:40,05:13:16,06:17:47,07:22:44,07:51:14
26076,00:00:00,00:45:46,01:34:30,02:23:44,03:17:35,03:29:42,04:23:49,05:33:07,06:42:26,07:51:45,08:22:10


In [38]:
# Flatten the nested runner records into one column per leaf key, e.g.
# "splits.5km.time". json_normalize already returns a DataFrame, so no extra
# DataFrame() wrap is needed.
runners_flat = pd.json_normalize(all_with_details)

# Everything that is not an elapsed split time. The start split is always
# "00:00:00", so it is dropped as well.
to_drop = ["splits.start.time", "details_url", "city_state", "finish_time", "half_time", "name", "country", "bib", "gender", "age_class"]
# drop time of day for now
to_drop += [c for c in runners_flat.columns if "time_of_day" in c]

df = (
    runners_flat
    .drop(columns=to_drop)
    # handle missing value marker
    .replace("–", np.nan)
    # "splits.5km.time" -> "5km.time"
    .rename(columns=lambda c: c.replace("splits.", ""))
)

# index=False (the documented boolean) keeps the positional row index out of the file.
df.to_csv("data/all_runners_with_splits.csv", index=False)