In [1]:
import requests
from pyquery import PyQuery as pq
import pandas as pd
import re
import tqdm
import dask.bag as db
from dask.distributed import Client
import json



In [2]:
# Root of the 2021 results site; list-page and detail-page query strings are appended to it.
BASE_URL = "https://results.chicagomarathon.com/2021/"

# Query-string template for one page of the result list. `{page}` is filled in
# with str.format; the remaining parameters are fixed. `search%5Bsex%5D=M` is
# the URL-encoded form of `search[sex]=M`, and `%25` is an encoded `%`
# (presumably an "any age class" wildcard -- confirm against the site).
PATH = (
    "?page={page}"
    "&event=MAR"
    "&lang=EN_CAP"
    "&num_results=1000"
    "&pid=list"
    "&search%5Bsex%5D=M"
    "&search%5Bage_class%5D=%25"
)

In [3]:
def _split_name_country(name_country):
    """Split link text of the form "Name (CCC)" into ``(name, country)``.

    When the text does not end in a space plus a three-character code in
    parentheses, the whole text is returned as the name and the country is "".
    """
    match = re.fullmatch(r"(.*) \((.{3})\)", name_country, re.DOTALL)
    if match is None:
        return name_country, ""
    return match.group(1), match.group(2)


def _extract_idp(href):
    """Return the value of the ``idp`` query parameter found in a runner link."""
    # The lookahead accepts either a following "&" or the end of the string, so
    # an href whose last parameter is idp is handled too.
    match = re.search(r"(?<=idp=)[A-Z0-9_.-]*(?=&|$)", href or "")
    if match is None:
        raise ValueError(f"no idp parameter found in runner link: {href!r}")
    return match.group(0)


def parse_page(base_url, path, timeout=30):
    """Download one page of the result list and parse every runner row on it.

    Args:
        base_url: Root URL of the results site; `path` is appended verbatim.
        path: Query string selecting the list page to fetch.
        timeout: Seconds `requests` waits on the server before giving up.

    Returns:
        A list with one dict per runner row, holding the keys "name",
        "country", "age_class", "half_time", "finish_time" and "details_url".

    Raises:
        requests.HTTPError: The server answered with an error status.
        requests.Timeout: The server did not answer within `timeout` seconds.
        ValueError: A runner link carries no `idp` parameter.
    """
    resp = requests.get(base_url + path, timeout=timeout)
    # Fail loudly on an error response instead of parsing the error page into
    # an empty (and silently incomplete) result list.
    resp.raise_for_status()
    d = pq(resp.content)
    # find first name field and navigate up to overarching row
    all_runners = d(".list-field.type-fullname a").closest(".list-group-item .row")
    all_runners_parsed = []
    for runner in all_runners.items():
        name_link = runner.find(".type-fullname a")
        name, country = _split_name_country(name_link.text())
        details_url = base_url + "?content=detail&idp=" + _extract_idp(name_link.attr['href'])
        # Each of these fields is read from the second line of the element's
        # text; the first line is presumably the column label -- confirm
        # against the page markup.
        data = {
            "name": name,
            "country": country,
            "age_class": runner.find(".type-age_class").text().split("\n")[1],
            "half_time": runner.find(".type-time").eq(0).text().split("\n")[1],
            "finish_time": runner.find(".type-time").eq(1).text().split("\n")[1],
            "details_url": details_url,
        }
        all_runners_parsed.append(data)

    return all_runners_parsed

def get_details(details_url):
    """Read bib number, city/state and per-checkpoint splits from a detail page.

    Args:
        details_url: URL of one runner's detail page; it is handed straight to
            PyQuery, which loads the document from it.

    Returns:
        A dict with the keys "bib", "city_state" and "splits". "splits" maps
        each checkpoint label ("start", "5km", ..., "half", ..., "finish") to a
        dict with "time_of_day" and "time" strings. The "start" entry always
        carries the fixed time "00:00:00".
    """
    page = pq(details_url)

    # (label in the output, suffix of the row's `f-time_*` CSS class), in course
    # order. The halfway row uses the suffix "52".
    checkpoints = (
        ("5km", "05"),
        ("10km", "10"),
        ("15km", "15"),
        ("20km", "20"),
        ("half", "52"),
        ("25km", "25"),
        ("30km", "30"),
        ("35km", "35"),
        ("40km", "40"),
        ("finish", "finish_netto"),
    )

    # The start row has its own selector and no elapsed time of its own.
    splits = {
        "start": {
            "time_of_day": page.find(".f-starttime_net.last").text(),
            "time": "00:00:00",
        }
    }
    for label, suffix in checkpoints:
        row_selector = ".f-time_" + suffix
        splits[label] = {
            "time_of_day": page.find(row_selector + " .time_day").text(),
            "time": page.find(row_selector + " .time").text(),
        }

    bib = page.find(".f-start_no_text.last").text()
    city_state = page.find(".f-__city_state.last").text()
    return {"bib": bib, "city_state": city_state, "splits": splits}

In [None]:
# Fetch list pages 1 through 15 one after another and pool every parsed runner
# record into a single flat list; tqdm shows progress over the pages.
all_runners = []
for page in tqdm.tqdm(range(1, 16)):
    page_query = PATH.format(page=page)
    all_runners.extend(parse_page(BASE_URL, page_query))

In [None]:
# Cache the scraped list so the slow scrape above does not have to be repeated.
# `index=False` is the documented way to leave the row index out of the file.
pd.DataFrame(all_runners).to_csv("data/runners.csv", index=False)

In [4]:
# Reload the cached runner list from disk as a list of per-runner dicts
# (one dict per CSV row, keyed by column name).
runners_table = pd.read_csv("data/runners.csv")
all_runners = runners_table.to_dict(orient="records")

In [5]:
# Create a dask.distributed client with default arguments. In the recorded run
# this started a LocalCluster with 4 worker processes (see the cell output).
client = Client()  # set up local cluster on your laptop
# Bare last expression: renders the client / cluster summary as the cell output.
client



0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads: 8,Total memory: 16.00 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:57646,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads: 8
Started: Just now,Total memory: 16.00 GiB

0,1
Comm: tcp://127.0.0.1:57671,Total threads: 2
Dashboard: http://127.0.0.1:57672/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:57652,
Local directory: /Users/max2/Documents/marathon_analysis/dask-worker-space/worker-r6_i5gfb,Local directory: /Users/max2/Documents/marathon_analysis/dask-worker-space/worker-r6_i5gfb

0,1
Comm: tcp://127.0.0.1:57666,Total threads: 2
Dashboard: http://127.0.0.1:57669/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:57651,
Local directory: /Users/max2/Documents/marathon_analysis/dask-worker-space/worker-xf5lihi8,Local directory: /Users/max2/Documents/marathon_analysis/dask-worker-space/worker-xf5lihi8

0,1
Comm: tcp://127.0.0.1:57664,Total threads: 2
Dashboard: http://127.0.0.1:57667/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:57649,
Local directory: /Users/max2/Documents/marathon_analysis/dask-worker-space/worker-nxh8aphq,Local directory: /Users/max2/Documents/marathon_analysis/dask-worker-space/worker-nxh8aphq

0,1
Comm: tcp://127.0.0.1:57662,Total threads: 2
Dashboard: http://127.0.0.1:57663/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:57650,
Local directory: /Users/max2/Documents/marathon_analysis/dask-worker-space/worker-3r820qn8,Local directory: /Users/max2/Documents/marathon_analysis/dask-worker-space/worker-3r820qn8


In [15]:
# Smoke test for the cluster: pull every runner's name through a dask bag and
# count the results.
runner_names = db.from_sequence(all_runners).map(lambda runner: runner["name"]).compute()
len(runner_names)

14201

In [21]:
def add_details(runner):
    """Return a copy of `runner` extended with the fields from its detail page.

    Args:
        runner: Runner record holding at least the key "details_url".

    Returns:
        A shallow copy of `runner` updated with whatever `get_details` returns
        for that URL. The input record itself is left unmodified.
    """
    enriched = runner.copy()
    enriched.update(get_details(runner["details_url"]))
    return enriched

# Fan the per-runner detail fetches out over the dask workers and gather the
# enriched records back into a plain list.
runners_bag = db.from_sequence(all_runners)
all_with_details = runners_bag.map(add_details).compute()

tornado.application - ERROR - Uncaught exception GET /profile/ws?key=add_details (127.0.0.1)
HTTPServerRequest(protocol='http', host='127.0.0.1:8787', method='GET', uri='/profile/ws?key=add_details', version='HTTP/1.1', remote_ip='127.0.0.1')
Traceback (most recent call last):
  File "/Users/max2/.pyenv/versions/3.10.1/envs/marathon/lib/python3.10/site-packages/tornado/websocket.py", line 954, in _accept_connection
    open_result = handler.open(*handler.open_args, **handler.open_kwargs)
  File "/Users/max2/.pyenv/versions/3.10.1/envs/marathon/lib/python3.10/site-packages/tornado/web.py", line 3173, in wrapper
    return method(self, *args, **kwargs)
  File "/Users/max2/.pyenv/versions/3.10.1/envs/marathon/lib/python3.10/site-packages/bokeh/server/views/ws.py", line 149, in open
    raise ProtocolError("Token is expired.")
bokeh.protocol.exceptions.ProtocolError: Token is expired.


In [23]:
# Persist the enriched records, nested splits included, as one JSON document.
with open('data/runners_with_data.json', 'w') as json_file:
    json_file.write(json.dumps(all_with_details))

In [22]:
# Flatten the nested "splits" dicts into one column per leaf value, then write
# the table out. `json_normalize` already returns a DataFrame, so no extra
# `pd.DataFrame(...)` wrapper is needed; `index=False` is the documented way to
# leave the row index out of the file.
details_flat = pd.json_normalize(all_with_details)
details_flat.to_csv("data/runners_with_details.csv", index=False)