In [26]:
# Full scraper with disabled-next-page check
import asyncio

import nest_asyncio
import pandas as pd
from playwright.async_api import TimeoutError as PlaywrightTimeoutError
from playwright.async_api import async_playwright
from tqdm.notebook import tqdm

nest_asyncio.apply()

# Column names, in the order the cells appear in each table row.
_REIMBURSEMENT_COLUMNS = (
    "voucher_number",
    "amount",
    "payment_date",
    "vendor_name",
    "description",
    "department",
)


async def _read_table_rows(page):
    """Return one dict per table row currently rendered on ``page``.

    Rows with fewer cells than there are expected columns (for example a
    header row that has no ``td`` cells) are ignored. Only the leading cells
    are read; any extra cells are left alone.
    """
    records = []
    for row in await page.query_selector_all("tr"):
        cells = await row.query_selector_all("td")
        if len(cells) < len(_REIMBURSEMENT_COLUMNS):
            continue
        record = {}
        for column, cell in zip(_REIMBURSEMENT_COLUMNS, cells):
            record[column] = (await cell.inner_text()).strip()
        records.append(record)
    return records


async def full_scraper_checked(total_pages=1000, csv_filename="employee_reimbursements.csv"):
    """Scrape the Employee Reimbursements table page by page.

    Opens the dataset in headless Firefox, collects the rows of every page,
    and clicks "Next page" until that button is missing, reports
    ``aria-disabled="true"``, or ``total_pages`` pages have been visited.

    Args:
        total_pages: Upper bound on the number of pages to visit.
        csv_filename: Not used by this function; retained so existing calls
            keep working. Nothing is written to disk here.

    Returns:
        pandas.DataFrame with the six reimbursement columns (all strings).
        The columns are present even when no rows were scraped.
    """
    # NOTE(review): "querySELECT" looks like it may be missing a "/" between
    # "query" and the encoded statement — TODO confirm against the live site.
    url = ("https://data.cityofchicago.org/Administration-Finance/Employee-Reimbursements/g5h3-jkgt/explore/querySELECT%0A%20%20%60voucher_number%60%2C%0A%20%20%60amount%60%2C%0A%20%20%60payment_date%60%2C%0A%20%20%60vendor_name%60%2C%0A%20%20%60description%60%2C%0A%20%20%60department%60%0AORDER%20BY%20%60payment_date%60%20DESC%20NULL%20FIRST/page/filter")

    data = []
    async with async_playwright() as p:
        browser = await p.firefox.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(url)

            for page_number in tqdm(range(total_pages), desc="Scraping pages"):
                try:
                    await page.wait_for_selector("td > div", timeout=15000)
                except PlaywrightTimeoutError:
                    # Only a timeout means "this page did not render". Any
                    # other error (including an interrupt) must propagate.
                    print(f"Timeout waiting for page {page_number+1}. Skipping.")
                else:
                    data.extend(await _read_table_rows(page))

                # Always fall through to pagination, even after a timeout, so
                # a stalled page is actually skipped instead of being retried
                # on every remaining iteration.
                next_btn = await page.query_selector("forge-icon-button[aria-label='Next page']")
                if not next_btn:
                    print("No next button found — stopping.")
                    break
                if await next_btn.get_attribute("aria-disabled") == "true":
                    print("Next button disabled — reached last page.")
                    break

                await next_btn.click()
                await page.wait_for_timeout(3000)
                await page.wait_for_load_state("networkidle")
        finally:
            # Release the browser even when scraping fails part-way through.
            await browser.close()

    return pd.DataFrame(data, columns=list(_REIMBURSEMENT_COLUMNS))

# Run the scraper
# NOTE(review): the recorded output of this cell shows repeated
# "RuntimeError: cannot enter context ... is already entered" callbacks and a
# KeyboardInterrupt, so this call did not complete and `df_new` was not
# assigned by that run. Jupyter supports top-level `await`, so
# `df_new = await full_scraper_checked()` may avoid re-entering the running
# event loop — TODO confirm.
df_new = asyncio.get_event_loop().run_until_complete(full_scraper_checked())

Scraping pages:   0%|          | 0/1000 [00:00<?, ?it/s]

Exception in callback Task.__step()
handle: <Handle Task.__step()>
Traceback (most recent call last):
  File "/Users/hgorledeenn/.pyenv/versions/3.13.9/lib/python3.13/asyncio/events.py", line 89, in _run
    self._context.run(self._callback, *self._args)
    ~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: cannot enter context: <_contextvars.Context object at 0x10cd04e80> is already entered
Exception in callback Task.__step()
handle: <Handle Task.__step()>
Traceback (most recent call last):
  File "/Users/hgorledeenn/.pyenv/versions/3.13.9/lib/python3.13/asyncio/events.py", line 89, in _run
    self._context.run(self._callback, *self._args)
    ~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: cannot enter context: <_contextvars.Context object at 0x10cd04e80> is already entered
Exception in callback Task.__step()
handle: <Handle Task.__step()>
Traceback (most recent call last):
  File "/Users/hgorledeenn/.pyenv/versions/3.13.9/lib/python3.13/asyncio/events.py", l

KeyboardInterrupt: 

### Join new data with existing data (preserving original if duplicates exist)

In [None]:
# Load the previously saved reimbursements.
df_orig = pd.read_csv("employee_reimbursements.csv")

# Blank scraped cells are empty strings, but blank CSV fields are read back as
# NaN. Without normalising, a record with any blank key field never matches
# its saved copy and gets appended again on every run.
df_new_normalized = df_new.mask(df_new.eq(""))

## Merge existing with new stuff (existing rows first, so keep='first' below
## preserves the original copy of any duplicate)
df_safe = pd.concat([df_orig, df_new_normalized], ignore_index=True)

## Remove rows that repeat the same voucher number, amount, payment date and description
df_safe = df_safe.drop_duplicates(subset=['voucher_number', 'amount', 'payment_date', 'description'], keep='first')

## Save over the previous existing one
df_safe.to_csv("employee_reimbursements.csv", index=False)

df_safe.head()

Unnamed: 0,voucher_number,amount,payment_date,vendor_name,description,department
0,PV88258801236,$228.89,1/2/26,"BROWN, ALFRED R",2025 SAFETY SHOE REIMBURSEMENT,DEPARTMENT OF WATER MANAGEMENT
1,PV81258102251,$236.49,1/2/26,"TATUM, ANTON D",WORKBOOT REIMBURSEMENT,DEPARTMENT OF STREETS AND SANITATION
2,PV81258102253,$241.66,1/2/26,"MARTINEZ, FERNANDO P",WORKBOOT REIMBURSEMENT,DEPARTMENT OF STREETS AND SANITATION
3,PV88258801281,$250.00,1/2/26,"JIMENEZ, IVAN",2025 SAFETY SHOE REIMBURSEMENT,DEPARTMENT OF WATER MANAGEMENT
4,PV81258102246,$124.06,1/2/26,"MCDANIEL, ERIK",WORKBOOT REIMBURSEMENT,DEPARTMENT OF STREETS AND SANITATION


### Rename columns to all caps for Visualizations

In [None]:
# Upper-case, space-separated column names for the visualizations.
df = df_safe.rename(columns={'voucher_number':'VOUCHER NUMBER',
                             'amount':'AMOUNT',
                             'payment_date':'PAYMENT DATE',
                             'vendor_name':'VENDOR NAME',
                             'description':'DESCRIPTION',
                             'department':'DEPARTMENT'})

## Remove $ and , from amount figures to ensure they're read as numbers by Datawrapper
## regex=False is explicit on purpose: treated as a regular expression, '$' is
## the end-of-string anchor and the dollar sign would be left in place.
df['AMOUNT'] = df['AMOUNT'].str.replace('$', '', regex=False)
df['AMOUNT'] = df['AMOUNT'].str.replace(',', '', regex=False)

df.head()

### Save csvs for reimbursements by department

In [4]:
# Split the reimbursements by department and write one CSV per department.
groups = df.groupby(by="DEPARTMENT")
keys = groups.groups.keys()

for department_name in keys:
    # File name: lower-cased department, spaces -> hyphens, apostrophes removed.
    file_stem = department_name.lower().replace(" ", "-").replace("'", "")
    groups.get_group(department_name).to_csv(
        f"docs/dept-csvs/{file_stem}.csv",
        index=False,
    )

### Make summary DataFrame/CSV by department

In [5]:
## Make the 'AMOUNT' column a float
if df['AMOUNT'].dtype != float:
    # regex=False is explicit on purpose: treated as a regular expression, '$'
    # is the end-of-string anchor, the dollar sign would survive, and the
    # float conversion below would fail.
    df['AMOUNT'] = df['AMOUNT'].str.replace('$', '', regex=False)
    df['AMOUNT'] = df['AMOUNT'].str.replace(',', '', regex=False)
    df['AMOUNT'] = df['AMOUNT'].astype(float)

## Do my grouping and make sure the 'index' (department names) saves as an actual column
## df_dept_sum == df_by_department_summary
## NOTE(review): the AVERAGE column holds the median of AMOUNT, not the mean —
## confirm that is the intended statistic for that label.
df_dept_sum = df.groupby(by='DEPARTMENT').agg(
    TOTAL=('AMOUNT', 'sum'),
    COUNT=('VOUCHER NUMBER', 'size'),
    AVERAGE=('AMOUNT', 'median'))
df_dept_sum = df_dept_sum.reset_index()

## Add formatting so each link in the summary dataset points to its subpage
def make_link(dep):
    """Wrap a department name in an HTML anchor pointing at its subpage.

    The folder part of the URL is the department name lower-cased, with
    spaces turned into hyphens and apostrophes removed. The link text is the
    name exactly as given.
    """
    slug_table = str.maketrans({" ": "-", "'": None})
    folder = dep.lower().translate(slug_table)
    return f'<a href="https://hgorledeenn.github.io/Chicago-Reimbursements-site/{folder}/index.html">{dep}</a>'

## Swap each department name for its HTML link, then write the summary csv
department_links = df_dept_sum['DEPARTMENT'].map(make_link)
df_dept_sum['DEPARTMENT'] = department_links

df_dept_sum.to_csv("docs/by_department_summary.csv", index=False)