In [1]:
from bs4 import BeautifulSoup
import re
import os
import sqlite3

# Open the case database; `con` and `cur` are shared by the cells below.
con = sqlite3.connect("JVCase.db")
cur = con.cursor()

# Case IDs already stored in CaseBasicInfo, used later to skip known cases.
existing_cases = [
    row[0]
    for row in cur.execute("SELECT DISTINCT CaseID from CaseBasicInfo")
]

# Names of the entries in the folder holding the saved search-result pages.
files = os.listdir("CaseSearchAll")

In [2]:
# Collect the text of every matching table cell across all saved pages.
results = []

for file in files:
    filename = f"./CaseSearchAll/{file}"

    # os.listdir also returns sub-directories (e.g. .ipynb_checkpoints);
    # opening one of those would raise, so only regular files are read.
    if not os.path.isfile(filename):
        continue

    # Read the contents of the HTML file
    with open(filename, "r", encoding="utf-8") as f:
        html_content = f.read()

    # Parse the HTML
    soup = BeautifulSoup(html_content, "html.parser")

    # Find all <td> elements with style="white-space: nowrap;"
    tds = soup.find_all("td", style="white-space: nowrap;")

    # td.stripped_strings yields every text segment with surrounding
    # whitespace removed; join the segments of one cell with single spaces.
    results.extend(" ".join(td.stripped_strings) for td in tds)

In [3]:
# Compiled once when the cell runs instead of on every call.
_CASE_PATTERN = re.compile(
    r'^\s*'                   # Possible leading space
    r'([^()]+)'               # 1) raw name text (could contain commas)
    r'\(\s*([^)]+)\s*\)'      # 2) party type inside parentheses
    r'(?:.*DOB:\s*(\d{4}))?'  # 3) optional four-digit birth year
    r'\s*'                    # tolerate whitespace before the case number
    r'(C|D) (\d{2}) JV (\d{2})\s*(\d{7})'  # 4-7) case number components
)


def extract_case(input_text):
    """Parse one combined party/case string into its components.

    The text must start with a name, followed by a parenthesised party
    type, an optional ``DOB: YYYY`` fragment, and a case number of the form
    ``C|D nn JV nn nnnnnnn``. Whitespace between the party part and the
    case number is optional, so the pieces may be glued together or
    separated by spaces.

    Args:
        input_text: The string to parse.

    Returns:
        A tuple ``(name, party_type, dob, case)`` where ``name`` has
        surrounding whitespace and commas removed, ``dob`` is the four-digit
        year as a string or ``None`` when absent, and ``case`` is normalised
        to ``"D 01 JV 25 0000139"`` spacing. Returns ``None`` when the text
        does not match the expected layout.
    """
    match = _CASE_PATTERN.match(input_text)
    if match is None:
        return None

    raw_name, party_type, dob = match.group(1, 2, 3)

    # Clean up the raw name by stripping extra commas/spaces,
    # e.g. "NDHHS,," -> "NDHHS"
    name = raw_name.strip().strip(", ")
    case = f"{match.group(4)} {match.group(5)} JV {match.group(6)} {match.group(7)}"

    return name, party_type.strip(), dob, case


In [4]:
# Glue every two consecutive cell texts into one string; a trailing unpaired
# entry is dropped.
# NOTE(review): `results` is rebound here, so running this cell a second time
# pairs the already-paired strings again — re-run the scraping cell first.
results = [first + second for first, second in zip(results[0::2], results[1::2])]

# Parse each combined string and keep only the ones that matched.
extracted = list(
    filter(lambda parsed: parsed is not None, map(extract_case, results))
)

In [5]:
# Keep only records whose case ID (tuple index 3) is not in the database yet.
# Membership is tested against a set so the filter does not rescan the whole
# list of known case IDs for every record.
known_case_ids = set(existing_cases)
extracted = [record for record in extracted if record[3] not in known_case_ids]

In [6]:
# Show the records that are about to be inserted, for a visual check.
# NOTE(review): the rendered output lists party names and birth years,
# including juveniles — clear or truncate it before sharing this notebook.
extracted

[('SIMPSON,BRANDIE', 'MOM', None, 'D 01 JV 25 0000139'),
 ('JUVENILE PROBATION', 'PRB', None, 'D 01 JV 25 0000139'),
 ('BISHOP,JOHNNISHA', 'INT', None, 'D 01 JV 25 0000139'),
 ('BROWN,TERESA', 'MOM', None, 'D 01 JV 25 0000138'),
 ('MASON,DANAIJHA,L', 'JUV', '2007', 'D 01 JV 25 0000138'),
 ('MASON,DANAJIHA', 'JUV', '2007', 'D 01 JV 25 0000138'),
 ('JUVENILE PROBATION', 'PRB', None, 'D 01 JV 25 0000138'),
 ('SKOGERBOE,MICHAELA', 'GAL', None, 'D 01 JV 25 0000138'),
 ('LEW,NYAGOANOR,C', 'JUV', '2009', 'D 01 JV 25 0000137'),
 ('LEW,DIANE', 'MOM', None, 'D 01 JV 25 0000137'),
 ('KELLOGG,RAZYIAH', 'JUV', '2012', 'D 01 JV 25 0000135'),
 ('NELSON,KARLISSA', 'MOM', None, 'D 01 JV 25 0000135'),
 ('COBB,ANDREW,Z', 'JUV', '2009', 'D 01 JV 25 0000134'),
 ('BREWER,TAKELA', 'MOM', None, 'D 01 JV 25 0000134'),
 ('COBB,ANDREW,,SR', 'DAD', None, 'D 01 JV 25 0000134'),
 ('JUVENILE PROBATION', 'PRB', None, 'D 01 JV 25 0000134'),
 ('HOWARD,DEIRAJI', 'JUV', '2013', 'D 01 JV 25 0000133'),
 ('HOWARD,TOSHMA', '

In [7]:
# Insert the newly extracted records into CaseBasicInfo.
#
# Values are bound through "?" placeholders instead of being formatted into
# the SQL text: the names come from scraped HTML, and interpolating them in
# double quotes is both injectable and ambiguous in SQLite (a double-quoted
# token can be resolved as a column name). Because nothing is interpolated,
# quotes inside a name no longer need to be rewritten and are stored as-is.
INSERT_WITH_YEAR_SQL = """
    INSERT INTO CaseBasicInfo
    (Person, Type, YearOfBirth, CaseID)
    VALUES (?, ?, ?, ?)
"""
# YearOfBirth is omitted (rather than bound as NULL) when unknown, so any
# column default keeps applying exactly as before.
INSERT_WITHOUT_YEAR_SQL = """
    INSERT INTO CaseBasicInfo
    (Person, Type, CaseID)
    VALUES (?, ?, ?)
"""

# `with con` wraps all inserts in one transaction: commit once on success,
# roll back if any insert fails, so a re-run never sees a half-loaded batch.
with con:
    for person, relationship, year_of_birth, case_id in extracted:
        if year_of_birth is not None:
            cur.execute(
                INSERT_WITH_YEAR_SQL,
                (person, relationship, year_of_birth, case_id),
            )
        else:
            cur.execute(
                INSERT_WITHOUT_YEAR_SQL,
                (person, relationship, case_id),
            )