In [6]:
import asyncio
import io
import sys
import time
import warnings

import pandas as pd
from plotly.offline import init_notebook_mode
# Suppress all warnings from this point on.
warnings.filterwarnings('ignore')
# Initialise plotly's offline notebook mode.
init_notebook_mode(connected=True)

# Hacky way to use the sibling module: append ../app (resolved relative to the
# current working directory) to the module search path.
# For Pylance to resolve those imports, add ` "python.analysis.extraPaths": ["app/"] `
# to .vscode/settings.json.
sys.path.append('../app')

Define a two-month date range to give us enough requests to benchmark.

In [7]:
# Daily timestamps covering 1 Oct 2022 through 29 Nov 2022 (both ends included).
dates = pd.date_range(start="2022-10-01", end="2022-11-29")

First, try the standard approach: one request at a time.

In [8]:
import requests

# Start the clock here: everything from session creation through parsing is timed.
start_time = time.perf_counter()

# One Session object is passed to every request made in this cell.
sesh = requests.Session()

def scrape_diary(user, date, client):
    """Fetch one user's food-diary page for a single day and return its HTML.

    Args:
        user: Username whose diary is requested.
        date: Day to fetch. Either a ``YYYY-MM-DD`` string or any object with
            a ``strftime`` method (``datetime.date``, ``datetime.datetime``,
            ``pandas.Timestamp``).
        client: Object exposing ``get(url)`` that returns a response with a
            ``.text`` attribute (e.g. a ``requests.Session``).

    Returns:
        The response body as text.
    """
    # A Timestamp/datetime interpolated directly into an f-string renders as
    # "YYYY-MM-DD HH:MM:SS", which puts a space and a time into the query
    # string. Format date-like values explicitly; plain strings pass through.
    date_str = date.strftime("%Y-%m-%d") if hasattr(date, "strftime") else date
    url = f"https://www.myfitnesspal.com/food/diary/{user}?date={date_str}"
    res = client.get(url)
    return res.text
# Fetch the diary pages one after another; each call returns before the next starts.
try:
    diaries = [scrape_diary("ismailmo", date, sesh) for date in dates]
finally:
    # Release the session's connections even if one of the requests fails.
    sesh.close()

# Wrap the HTML in StringIO: passing a raw HTML string to read_html is deprecated
# in recent pandas releases.
# NOTE(review): iloc[-4, 1] presumably selects the calories total from the first
# table on the page — confirm against the diary page layout.
kcals = [
    pd.read_html(io.StringIO(diary), flavor="lxml")[0].iloc[-4, 1]
    for diary in diaries
]

# The measured interval covers both the requests and the HTML parsing above.
elapsed = time.perf_counter() - start_time

print(f"time to scrape data: {elapsed:.2f} seconds")

time to scrape data: 24.75 seconds


In [10]:
from httpx import AsyncClient

# Restart the clock so this cell's timing is independent of the earlier run.
start_time = time.perf_counter()

# One client instance is passed to every coroutine created in this cell.
async_client = AsyncClient()
# NOTE(review): date_param and user are not referenced by the visible cells (the
# request loop passes the literal "ismailmo") — confirm whether they are still needed.
date_param = "2022-11-29"
user = "ismailmo"

async def async_scrape_diary(user, date, client):
    """Asynchronously fetch one user's food-diary page for a single day.

    Args:
        user: Username whose diary is requested.
        date: Day to fetch. Either a ``YYYY-MM-DD`` string or any object with
            a ``strftime`` method (``datetime.date``, ``datetime.datetime``,
            ``pandas.Timestamp``).
        client: Object exposing an awaitable ``get(url)`` that returns a
            response with a ``.text`` attribute (e.g. ``httpx.AsyncClient``).

    Returns:
        A ``(date, html)`` tuple, where ``date`` is the argument exactly as it
        was passed in and ``html`` is the response body as text.
    """
    # A Timestamp/datetime interpolated directly into an f-string renders as
    # "YYYY-MM-DD HH:MM:SS", which puts a space and a time into the query
    # string. Format only the URL copy; the caller gets its original object back.
    date_str = date.strftime("%Y-%m-%d") if hasattr(date, "strftime") else date
    url = f"https://www.myfitnesspal.com/food/diary/{user}?date={date_str}"
    res = await client.get(url)
    return date, res.text
coros = []
async_diaries =[]
for date in dates:
    coros.append(async_scrape_diary("ismailmo", date, async_client))

async_diaries = await asyncio.gather(*coros)

async_kcals = []
for date, diary in async_diaries:
    async_kcals.append(pd.read_html(diary, flavor="lxml")[0].iloc[-4,1])

async_elapsed = time.perf_counter() - start_time

print(f"time to scrape data: {async_elapsed:.2f} seconds")


time to scrape data: 3.04 seconds


In [15]:
# Share of the synchronous wall-clock time that the async run saved, in percent.
time_saved_pct = ((elapsed - async_elapsed) / elapsed) * 100
print(f"Speed up of {time_saved_pct:.2f}%")

Speed up of 87.72%


This is a huge increase in speed, and it becomes more significant as the date range grows (more pages scraped and more requests made). Let's make sure the data we get back is the same:

In [16]:
# List equality: True only when both runs produced equal values in the same order.
async_kcals == kcals

True