In [126]:
# Full scraper with disabled-next-page check
import asyncio
import nest_asyncio
from playwright.async_api import async_playwright
import pandas as pd
from tqdm.notebook import tqdm

# Patch asyncio to allow nested event-loop use: the scraper below is driven with
# loop.run_until_complete(), which a notebook kernel's already-running loop
# would otherwise refuse.
nest_asyncio.apply()

async def full_scraper_checked(total_pages=67, csv_filename="employee_reimbursements.csv"):
    """Scrape the paginated Employee Reimbursements table into a DataFrame.

    Opens the Chicago data portal query page in headless Firefox, reads every
    table row that has at least six cells, and clicks the "Next page" button
    until it is disabled, missing, or ``total_pages`` iterations have run.

    Parameters
    ----------
    total_pages : int
        Upper bound on the number of loop iterations (one per page attempt).
    csv_filename : str
        Not used by this function; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row per scraped table row, with the columns voucher_number,
        amount, payment_date, vendor_name, description and department
        (all stripped strings). Empty if nothing was scraped.
    """
    # Local import: only the timeout error is needed, and only here.
    from playwright.async_api import TimeoutError as PlaywrightTimeoutError

    # Column names in the order the cells appear in each table row.
    columns = ("voucher_number", "amount", "payment_date",
               "vendor_name", "description", "department")
    url = ("https://data.cityofchicago.org/Administration-Finance/Employee-Reimbursements/"
           "g5h3-jkgt/explore/query/SELECT%0A%20%20%60voucher_number%60%2C%0A%20%20%60amount%60%2C%0A"
           "%20%20%60payment_date%60%2C%0A%20%20%60vendor_name%60%2C%0A%20%20%60description%60%2C%0A"
           "%20%20%60department%60%0AORDER%20BY%20%60payment_date%60%20DESC%20NULL%20FIRST/page/filter")

    data = []
    async with async_playwright() as p:
        browser = await p.firefox.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(url)

            for page_number in tqdm(range(total_pages), desc="Scraping pages"):
                try:
                    await page.wait_for_selector("td > div", timeout=15000)
                except PlaywrightTimeoutError:
                    # Only a genuine timeout is tolerated; cancellation,
                    # Ctrl-C and real errors now propagate instead of being
                    # reported as a timeout. No navigation happens here, so the
                    # next iteration waits on the same page again.
                    print(f"Timeout waiting for page {page_number+1}. Skipping.")
                    continue

                rows = await page.query_selector_all("tr")
                for row in rows:
                    cells = await row.query_selector_all("td")
                    # Header rows and other short rows have fewer than six <td> cells.
                    if len(cells) < len(columns):
                        continue
                    texts = [(await cell.inner_text()).strip()
                             for cell in cells[:len(columns)]]
                    data.append(dict(zip(columns, texts)))

                # Check if next button is disabled
                next_btn = await page.query_selector("forge-icon-button[aria-label='Next page']")
                if not next_btn:
                    print("No next button found — stopping.")
                    break
                if await next_btn.get_attribute("aria-disabled") == "true":
                    print("Next button disabled — reached last page.")
                    break

                await next_btn.click()
                # Fixed pause, then wait for network activity to settle before
                # reading the next page of rows.
                await page.wait_for_timeout(3000)
                await page.wait_for_load_state("networkidle")
        finally:
            # Close the browser even when scraping fails part-way through.
            await browser.close()

    return pd.DataFrame(data)

# Drive the scraper coroutine to completion on the current event loop and keep
# the resulting DataFrame of freshly scraped rows.
event_loop = asyncio.get_event_loop()
df_new = event_loop.run_until_complete(full_scraper_checked())

Scraping pages:   0%|          | 0/67 [00:00<?, ?it/s]

Exception in callback Task.__step()
handle: <Handle Task.__step()>
Traceback (most recent call last):
  File "/Users/hgorledeenn/.pyenv/versions/3.13.9/lib/python3.13/asyncio/events.py", line 89, in _run
    self._context.run(self._callback, *self._args)
    ~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: cannot enter context: <_contextvars.Context object at 0x10a7a7200> is already entered
Exception in callback Task.__step()
handle: <Handle Task.__step()>
Traceback (most recent call last):
  File "/Users/hgorledeenn/.pyenv/versions/3.13.9/lib/python3.13/asyncio/events.py", line 89, in _run
    self._context.run(self._callback, *self._args)
    ~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: cannot enter context: <_contextvars.Context object at 0x10a7a7200> is already entered
Exception in callback Task.__step()
handle: <Handle Task.__step()>
Traceback (most recent call last):
  File "/Users/hgorledeenn/.pyenv/versions/3.13.9/lib/python3.13/asyncio/events.py", l

Next button disabled — reached last page.


### Join new data with existing data (preserving original if duplicates exist)

In [138]:
# Existing records saved by earlier runs.
# Read every column as plain text and turn off NaN inference so the saved values
# compare equal to freshly scraped cells: by default read_csv converts empty
# cells and markers such as "N/A" to NaN, which never matches the scraped
# string, so those rows would survive de-duplication and pile up on each run.
df_orig = pd.read_csv("employee_reimbursements.csv", dtype=str, keep_default_na=False)

# Stack existing rows first, then the new scrape, so keep='first' below
# preserves the original copy of any duplicated record.
df_safe = pd.concat([df_orig, df_new], ignore_index=True)

# Drop rows repeating the same voucher number + amount + payment date + description
df_safe = df_safe.drop_duplicates(subset=['voucher_number', 'amount', 'payment_date', 'description'], keep='first')

# Save over the previous existing one
df_safe.to_csv("employee_reimbursements.csv", index=False)


df_safe.head()

Unnamed: 0,voucher_number,amount,payment_date,vendor_name,description,department
0,PV84258411946,$150.00,12/03/2025,"NICOL, MARY",REIMBURSEMENT FOR CONFERENCE REGISTRATION 11/4...,CHICAGO DEPARTMENT OF TRANSPORTATION
1,PV84258411947,$115.00,12/03/2025,"NICOL, MARY",REIMBURSEMENT FOR REGISTRATION CONFERENCE EQUI...,CHICAGO DEPARTMENT OF TRANSPORTATION
2,PV31253102380,$34.00,12/02/2025,"BUNN, MATTHEW H",COURT COST REIMB,DEPARTMENT OF LAW
3,PV41254198289,$846.05,12/02/2025,"ZARAN, EMILY ROSE",Travel Expense October 2025,CHICAGO DEPARTMENT OF PUBLIC HEALTH
4,PV57255700940,"$1,350.00",12/02/2025,ROLLING VIDEO GAMES CHICAGO INC,"Gold Gaming Package Rental on 6/27/25,7/03/25 ...",CHICAGO POLICE DEPARTMENT


### Rename columns to all caps for Visualizations

In [140]:
# Relabel the scraped columns with upper-case, space-separated headings for the
# visualizations; rename() returns a new frame, so df_safe keeps its names.
df = df_safe.rename(
    {
        'voucher_number': 'VOUCHER NUMBER',
        'amount': 'AMOUNT',
        'payment_date': 'PAYMENT DATE',
        'vendor_name': 'VENDOR NAME',
        'description': 'DESCRIPTION',
        'department': 'DEPARTMENT',
    },
    axis='columns',
)

df.head()

Unnamed: 0,VOUCHER NUMBER,AMOUNT,PAYMENT DATE,VENDOR NAME,DESCRIPTION,DEPARTMENT
0,PV84258411946,$150.00,12/03/2025,"NICOL, MARY",REIMBURSEMENT FOR CONFERENCE REGISTRATION 11/4...,CHICAGO DEPARTMENT OF TRANSPORTATION
1,PV84258411947,$115.00,12/03/2025,"NICOL, MARY",REIMBURSEMENT FOR REGISTRATION CONFERENCE EQUI...,CHICAGO DEPARTMENT OF TRANSPORTATION
2,PV31253102380,$34.00,12/02/2025,"BUNN, MATTHEW H",COURT COST REIMB,DEPARTMENT OF LAW
3,PV41254198289,$846.05,12/02/2025,"ZARAN, EMILY ROSE",Travel Expense October 2025,CHICAGO DEPARTMENT OF PUBLIC HEALTH
4,PV57255700940,"$1,350.00",12/02/2025,ROLLING VIDEO GAMES CHICAGO INC,"Gold Gaming Package Rental on 6/27/25,7/03/25 ...",CHICAGO POLICE DEPARTMENT


### Making summary df/csv by org

In [142]:
## Make the 'AMOUNT' column a float
## (skipped when the column is already float, so the cell can be re-run safely)
if df['AMOUNT'].dtype != float:
    # regex=False: '$' must be removed as a literal character. Interpreted as a
    # regular expression it is the end-of-string anchor and would strip nothing.
    # The column is only reassigned once the whole conversion has succeeded.
    df['AMOUNT'] = (
        df['AMOUNT']
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype(float)
    )

## Do my grouping and make sure the 'index' (department names) saves as an actual column
## df_dept_sum == df_by_department_summary
## Named aggregation already produces the final column names used by datawrapper.
## NOTE(review): AVERAGE is computed as the median of AMOUNT, not the mean —
## confirm this is the intended statistic for that label.
df_dept_sum = df.groupby(by='DEPARTMENT').agg(
    TOTAL=('AMOUNT', 'sum'),
    COUNT=('VOUCHER NUMBER', 'size'),
    AVERAGE=('AMOUNT', 'median'))
df_dept_sum = df_dept_sum.reset_index()

## Write csv
df_dept_sum.to_csv("by_department_summary.csv", index=False)

### Make dataframes and save csvs for individual orgs

In [143]:
# Folder that receives one CSV per department. The path string is passed to
# to_csv exactly as before, including the leading "~".
DEPT_CSV_DIR = "~/desktop/chicago-reimbursements-site/dept csvs"

# Departments that get their own CSV, spelled as they appear in df['DEPARTMENT'].
DEPARTMENTS = [
    'CHICAGO POLICE DEPARTMENT',
    'CHICAGO FIRE DEPARTMENT',
    'CHICAGO DEPARTMENT OF PUBLIC HEALTH',
    'DEPARTMENT OF HUMAN RESOURCES',
    'OFFICE OF PUBLIC SAFETY ADMINISTRATION',
    'CHICAGO DEPARTMENT OF AVIATION',
    'DEPARTMENT OF WATER MANAGEMENT',
    'OFFICE OF EMERGENCY MANAGEMENT AND COMMUNICATIONS',
    'CHICAGO DEPARTMENT OF TRANSPORTATION',
    'DEPARTMENT OF STREETS AND SANITATION',
    'OFFICE OF THE MAYOR',
    'CHICAGO PUBLIC LIBRARY',
    'DEPARTMENT OF LAW',
    'OFFICE OF BUDGET & MANAGEMENT',
    'MAYORS OFFICE FOR PEOPLE WITH DISABILITIES',
    'CHICAGO COMMISSION ON HUMAN RELATIONS',
    'DEPARTMENT OF CULTURAL AFFAIRS AND SPECIAL EVENTS',
    'DEPARTMENT OF BUSINESS AFFAIRS AND CONSUMER PROTECTION',
    'DEPARTMENT OF PLANNING AND DEVELOPMENT',
    'DEPARTMENT OF HOUSING',
    'DEPARTMENT OF FINANCE',
    'DEPARTMENT OF BUILDINGS',
    'COMMUNITY COMMISSION FOR PUBLIC SAFETY AND ACCOUNTABILITY',
    "CITY TREASURER'S OFFICE",
    'DEPARTMENT OF FAMILY AND SUPPORT SERVICES',
]


def department_slug(department):
    """Return the snake_case file/variable stem for a department name.

    Lower-cases the name, spells '&' as 'and', drops apostrophes and turns
    spaces into underscores, e.g. "CITY TREASURER'S OFFICE" ->
    "city_treasurers_office".
    """
    return (department.lower()
            .replace("&", "and")
            .replace("'", "")
            .replace(" ", "_"))


def export_department_csv(frame, department, out_dir=DEPT_CSV_DIR):
    """Write the rows of ``frame`` for one department to ``<out_dir>/<slug>.csv``.

    Rows are selected by exact match on the 'DEPARTMENT' column. Returns the
    filtered DataFrame (empty when the department has no rows).
    """
    subset = frame[frame['DEPARTMENT'] == department]
    subset.to_csv(f"{out_dir}/{department_slug(department)}.csv", index=False)
    return subset


# Department name -> its filtered frame, in the order listed above.
department_frames = {
    department: export_department_csv(df, department)
    for department in DEPARTMENTS
}

# Keep the per-department variables (df_chicago_police_department, ...) that
# this cell used to define one by one, so later cells that use them still work.
globals().update({
    f"df_{department_slug(department)}": frame
    for department, frame in department_frames.items()
})
