In [82]:
import os
import polars as pl
from dotenv import load_dotenv
from datetime import date

county_map = {
        "01": "Douglas",
        "02": "Lancaster",
        "59": "Sarpy"
    }

up_to = 5

today = date.today()

load_dotenv()

df = pl.read_database_uri("SELECT DISTINCT CaseID FROM CaseVerbatim WHERE CaseID LIKE '% JV 25 %'", os.environ["database"])

In [72]:
def parse_case_info(case_str, county_map = county_map):

    parts = case_str.split()
    county_code = parts[1]
    year_suffix = parts[3]
    case_number = parts[4]

    year = 2000 + int(year_suffix)  # assuming all years are 2000+
    county = county_map.get(county_code, "Unknown")

    return {
        "CaseYear": year,
        "County": county,
        "CaseNumber": case_number
    }


TimeScraped
CaseYear
Docket
CaseNumber
County
CaseCount

In [73]:
df = df.with_columns(
    pl.col("CaseID").map_elements(lambda x: open("./CaseRecords/" + x + ".html", mode = "r").read()).alias("Docket"),
    pl.col("CaseID").map_elements(parse_case_info).alias("parsed"),
    pl.lit(date.today()).cast(pl.Datetime).alias("TimeScraped")
).unnest("parsed")

In [74]:
df.head()

CaseID,Docket,CaseYear,County,CaseNumber,TimeScraped
str,str,i64,str,str,datetime[μs]
"""D 01 JV 25 000…","""<html lang=""en…",2025,"""Douglas""","""0000454""",2025-04-30 00:00:00
"""D 02 JV 25 000…","""<html lang=""en…",2025,"""Lancaster""","""0000003""",2025-04-30 00:00:00
"""D 02 JV 25 000…","""<html lang=""en…",2025,"""Lancaster""","""0000092""",2025-04-30 00:00:00
"""D 01 JV 25 000…","""<html lang=""en…",2025,"""Douglas""","""0000360""",2025-04-30 00:00:00
"""D 02 JV 25 000…","""<html lang=""en…",2025,"""Lancaster""","""0000006""",2025-04-30 00:00:00


In [75]:
df["CaseID"][0]

'D 01 JV 25 0000454'

In [None]:
from pymongo import MongoClient

# Format the URI
uri = os.environ["mongodb_conn"]

# Connect to the client
client = MongoClient(uri)

# Access your database and collection
db = client["JVCases"]
collection = db["Cases"]

In [46]:
next_batch = []

In [80]:
pipeline = [
    {
        "$group": {
            "_id": {"CaseYear": "$CaseYear", "County": "$County"},
            "MaxCaseNumber": {"$max": {"$toInt": "$CaseNumber"}}
        }
    }
]

checkpoints = list(collection.aggregate(pipeline))

In [81]:
checkpoints

[{'_id': {'CaseYear': 2025, 'County': 'Douglas'}, 'MaxCaseNumber': 487},
 {'_id': {'CaseYear': 2025, 'County': 'Lancaster'}, 'MaxCaseNumber': 358},
 {'_id': {'CaseYear': 2025, 'County': 'Sarpy'}, 'MaxCaseNumber': 185}]

In [57]:
inv_map = {v: k for k, v in county_map.items()}

next_batch = []
for checkpoint in checkpoints:
    year = str(checkpoint["_id"]["CaseYear"] - 2000)
    county = inv_map.get(checkpoint["_id"]["County"])

    new_casenums = [str(checkpoint["MaxCaseNumber"] + i).zfill(7) for i in range(1, up_to)]
    next_batch = next_batch + ["D " + county + " JV " + year + " " + n for n in new_casenums]

In [58]:
df_new = pl.DataFrame({"CaseID": next_batch})

In [59]:
df_new

CaseID
str
"""D 01 JV 25 000…"
"""D 01 JV 25 000…"
"""D 01 JV 25 000…"
"""D 01 JV 25 000…"
"""D 02 JV 25 000…"
"""D 02 JV 25 000…"
"""D 02 JV 25 000…"
"""D 02 JV 25 000…"
"""D 59 JV 25 000…"
"""D 59 JV 25 000…"


In [60]:
df_new.with_columns(
    pl.col("CaseID").map_elements(parse_case_info).alias("parsed"),
    pl.lit(date.today()).cast(pl.Datetime).alias("TimeScraped"),
    pl.lit(None).alias("Docket")
).unnest("parsed")

CaseID,CaseYear,County,CaseNumber,TimeScraped,Docket
str,i64,str,str,datetime[μs],null
"""D 01 JV 25 000…",2025,"""Douglas""","""0000478""",2025-04-30 00:00:00,
"""D 01 JV 25 000…",2025,"""Douglas""","""0000479""",2025-04-30 00:00:00,
"""D 01 JV 25 000…",2025,"""Douglas""","""0000480""",2025-04-30 00:00:00,
"""D 01 JV 25 000…",2025,"""Douglas""","""0000481""",2025-04-30 00:00:00,
"""D 02 JV 25 000…",2025,"""Lancaster""","""0000349""",2025-04-30 00:00:00,
"""D 02 JV 25 000…",2025,"""Lancaster""","""0000350""",2025-04-30 00:00:00,
"""D 02 JV 25 000…",2025,"""Lancaster""","""0000351""",2025-04-30 00:00:00,
"""D 02 JV 25 000…",2025,"""Lancaster""","""0000352""",2025-04-30 00:00:00,
"""D 59 JV 25 000…",2025,"""Sarpy""","""0000176""",2025-04-30 00:00:00,
"""D 59 JV 25 000…",2025,"""Sarpy""","""0000177""",2025-04-30 00:00:00,


In [None]:
from playwright.async_api import async_playwright, TimeoutError
import asyncio

# If your timeout_time was in seconds, multiply by 1000 to get Playwright’s ms-based timeout
timeout_ms = 60_000  # e.g. 60 seconds

async def scrape_case(cases, url=os.environ["case_url"]):
    with async_playwright() as p:
        # launch a headless Chromium; you can switch to firefox or webkit if desired
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context()
        page = await context.new_page()

        # apply timeouts globally on this page
        await page.set_default_timeout(timeout_ms)
        await page.set_default_navigation_timeout(timeout_ms)

        for case in cases:
            try:
                # navigate
                await page.goto(url)

                # fill in the form fields
                await page.select_option("#court_type", "D")
                await page.select_option("#county_num", case["County"])
                await page.select_option("#case_type", "JV")
                await page.fill("#case_year", case["CaseYear"])
                await page.fill("#case_id", case["CaseNumber"])

                # submit
                await page.click("#search")

                # wait until network is idle (no more than 2 connections for at least 500 ms)
                await page.wait_for_load_state("networkidle")

                # grab the HTML and save
                html = await page.content()
                print(html)

            except TimeoutError:
                print("ERROR")

        # clean up
        await context.close()
        await browser.close()

In [69]:
df_new.to_dicts()

[{'CaseID': 'D 01 JV 25 0000478'},
 {'CaseID': 'D 01 JV 25 0000479'},
 {'CaseID': 'D 01 JV 25 0000480'},
 {'CaseID': 'D 01 JV 25 0000481'},
 {'CaseID': 'D 02 JV 25 0000349'},
 {'CaseID': 'D 02 JV 25 0000350'},
 {'CaseID': 'D 02 JV 25 0000351'},
 {'CaseID': 'D 02 JV 25 0000352'},
 {'CaseID': 'D 59 JV 25 0000176'},
 {'CaseID': 'D 59 JV 25 0000177'},
 {'CaseID': 'D 59 JV 25 0000178'},
 {'CaseID': 'D 59 JV 25 0000179'}]

In [70]:
asyncio.run(scrape_case(df_new.to_dicts()))

RuntimeError: asyncio.run() cannot be called from a running event loop