In [1]:
import polars as pl
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from multiprocessing.pool import ThreadPool 
import time
import os
import re
import random

# Timeout that scrape_case assigns to the Selenium HTTP client
# (presumably seconds, i.e. one hour -- confirm against the Selenium version in use).
timeout_time = 60 * 60

# Two-digit county codes as they appear in case IDs, mapped to county names.
county_codes = {
    "01": "Douglas",
    "02": "Lancaster",
    "59": "Sarpy",
}

def case_info(case_id):
    """Split a case ID into its county name, two-digit year, and case number.

    The expected layout is ``D <county code> JV <year> <number>``, for example
    ``"D 01 JV 20 1234567"``. Only the start of the string has to match.

    Args:
        case_id: Case identifier string.

    Returns:
        A ``(county, year, case_num)`` tuple of strings, where ``county`` is
        the name looked up in ``county_codes``; ``None`` when ``case_id`` does
        not match the expected layout.

    Raises:
        KeyError: If the two-digit county code is not a key of ``county_codes``.
    """
    # Raw string: "\d" and "\s" inside a plain literal are invalid escape
    # sequences, which newer Python versions warn about at compile time.
    pattern = re.compile(r'D (\d{2}) JV (\d{2})\s*(\d{7})')

    match = pattern.match(case_id)
    if match is None:
        return None

    county_code, year, case_num = match.groups()
    return county_codes[county_code], year, case_num

def _save_case_page(browser, case_id):
    """Write the browser's current page source to ./CaseRecords/<case_id>.html."""
    # Read the page before opening the file: if reading fails, no empty file is
    # left behind (get_unscraped_cases treats every file in CaseRecords as done).
    html = browser.page_source

    with open("./CaseRecords/" + case_id + '.html', "w") as outfile:
        outfile.write(html)


def scrape_case(cases, url = os.environ["case_url"]):
    """Search the court site for each case and save the resulting page as HTML.

    One Chrome session is started per call and reused for every case in
    ``cases``. For each case the search form at ``url`` is filled in and
    submitted, then the page source is written to
    ``./CaseRecords/<CaseID>.html``. The browser is always shut down before
    returning, including when an unexpected error propagates.

    Args:
        cases: Collection of mappings with the keys ``"County"``, ``"Year"``,
            ``"CaseNum"`` and ``"CaseID"``. An empty collection is a no-op.
        url: Address of the search form. The default is read from the
            ``case_url`` environment variable when this function is defined.

    Returns:
        None.
    """
    # Do not start a browser when there is nothing to scrape.
    if not cases:
        return

    cService = webdriver.ChromeService(executable_path=os.environ["chromedriver"])
    browser = webdriver.Chrome(service = cService)

    try:
        # NOTE(review): relies on private Selenium attributes; may break on upgrade.
        browser.command_executor._client_config._timeout = timeout_time

        for case in cases:

            try:
                browser.get(url)

                browser.find_element(By.ID, "court_type").send_keys("D")
                browser.find_element(By.ID, "county_num").send_keys(case["County"])
                browser.find_element(By.ID, "case_type").send_keys("JV")
                browser.find_element(By.ID, "case_year").send_keys(case["Year"])
                browser.find_element(By.ID, "case_id").send_keys(case["CaseNum"])

                browser.find_element(By.ID, "search").click()

                _save_case_page(browser, case["CaseID"])

            # A tuple is required to catch both types; the previous
            # `A or B` expression evaluated to A alone, so timeouts escaped.
            except (NoSuchElementException, TimeoutException):

                # Pause before capturing whatever the browser currently shows.
                time.sleep(60)

                # NOTE(review): the page saved here may be an error page, yet it
                # will count as scraped on the next pass -- confirm this is intended.
                _save_case_page(browser, case["CaseID"])
    finally:
        # quit() ends the whole driver session; close() only closes the window.
        browser.quit()



In [2]:
## TODO: Before scraping, double-check that the query below selects the cases you want to scrape!
def get_unscraped_cases():
    """Return the queried cases that do not yet have a saved HTML record.

    Case IDs are read from the database named by the ``database`` environment
    variable, split into county / year / case number with ``case_info``, and
    filtered against the file names already present in ``CaseRecords``.

    Returns:
        A list of dicts in random order, one per case, with the keys
        ``"CaseID"``, ``"County"``, ``"Year"`` and ``"CaseNum"``.
    """
    # Create the output directory on first use so listing it cannot fail.
    os.makedirs("CaseRecords", exist_ok=True)

    # A set makes the membership test below constant-time per case.
    scraped_records = {c.replace(".html", "") for c in os.listdir("CaseRecords")}

    ## CHANGE HERE!!!
    df = pl.read_database_uri("SELECT DISTINCT CaseID FROM CaseVerbatim WHERE CaseSummary = \'\'", os.environ["database"])

    # Parse each CaseID once, then spread the parsed parts into separate columns.
    df = df.with_columns(
        pl.col("CaseID").map_elements(case_info).alias("information")
    ).with_columns(
        pl.col("information").map_elements(lambda x: x[0]).alias("County"),
        pl.col("information").map_elements(lambda x: x[1]).alias("Year"),
        pl.col("information").map_elements(lambda x: x[2]).alias("CaseNum")
    ).drop("information")

    cases = [c for c in df.rows(named=True) if c["CaseID"] not in scraped_records]

    # Shuffle so the partitions handed to the workers are a random mix of cases.
    random.shuffle(cases)

    return cases

In [3]:
# Build the initial work list and show how many cases still need to be scraped.
cases = get_unscraped_cases()
print(len(cases))

8


In [4]:
def partition_list(lst, num_partitions):
    """Split ``lst`` into ``num_partitions`` contiguous, near-equal slices.

    Args:
        lst: Sequence to split; it is sliced, not modified.
        num_partitions: Positive integer number of slices to produce.

    Returns:
        A list of exactly ``num_partitions`` slices that concatenate back to
        ``lst`` in order. Some slices are empty when ``lst`` has fewer items
        than ``num_partitions``. An empty ``lst`` yields ``[]``.

    Raises:
        ValueError: If ``num_partitions`` is not positive.
    """
    if num_partitions <= 0:
        raise ValueError("num_partitions must be a positive integer")

    total = len(lst)
    if total == 0:
        return []

    # Integer boundaries: repeatedly adding a float step drifts (ten additions
    # of 0.1 stay below 1.0), which used to yield one partition too many.
    bounds = [total * k // num_partitions for k in range(num_partitions + 1)]

    return [lst[start:end] for start, end in zip(bounds, bounds[1:])]


In [5]:
# Number of worker threads; each worker runs scrape_case on one partition.
pool_size = 8
num_partitions = pool_size

# Stop after this many consecutive rounds that do not reduce the backlog, so a
# persistent failure (e.g. every worker raising) cannot loop forever.
max_stalled_rounds = 3
stalled_rounds = 0

while cases:
    remaining_before = len(cases)

    partitions = partition_list(cases, num_partitions)
    pool = ThreadPool(pool_size)

    # Skip empty partitions so no worker is started without cases to process.
    results = [
        pool.apply_async(scrape_case, (partition,))
        for partition in partitions
        if partition
    ]

    pool.close()
    pool.join()

    # apply_async keeps a worker's exception on its result object; report it
    # here instead of letting it disappear silently.
    for result in results:
        try:
            result.get()
        except Exception as exc:
            print(f"scrape worker failed: {exc!r}")

    # Re-check what is still missing; failed cases are retried next round.
    cases = get_unscraped_cases()
    print(len(cases))

    if len(cases) < remaining_before:
        stalled_rounds = 0
    else:
        stalled_rounds += 1
        if stalled_rounds >= max_stalled_rounds:
            print(f"no progress in {max_stalled_rounds} consecutive rounds; stopping")
            break
  

0
