In [15]:
import polars as pl
import os
from rapidfuzz import fuzz, process
import pytesseract
from pdf2image import convert_from_path

In [None]:
# Candidate case names, taken from the manually scraped PDFs.
# Only entries ending in ".pdf" are kept and only that trailing extension is
# removed, because the matching step re-appends ".pdf" to rebuild the file
# name: str.replace would also drop ".pdf" from the middle of a name, and
# non-PDF directory entries would become candidates that point at no file.
# Sorting makes tie-breaking between equally scored candidates independent of
# the (arbitrary) directory listing order.
ms_cases = sorted(
    c[: -len(".pdf")]
    for c in os.listdir("Cases/ManuallyScrapedCases/")
    if c.endswith(".pdf")
)
df = pl.read_csv("./Cases/WestLawNotMatch.csv")
df.head()

Title,Court Line,Citation,OpinionURL,CourtListenerCaseName,Match
str,str,str,str,str,i64
"""O'Neil v. Rata…","""United States …","""563 F.Supp.3d …",,,0
"""Furie v. Infow…","""United States …","""401 F.Supp.3d …",,,0
"""Advanta-STAR A…","""United States …","""672 F.Supp.3d …",,,0
"""4DD Holdings, …","""United States …","""143 Fed.Cl. 11…",,,0
"""Apple Inc. v. …","""United States …","""510 F.Supp.3d …",,,0


In [17]:
def best_fuzzy_match(query, choices=None):
    """Return the candidate that most closely resembles ``query``.

    The query and every candidate are lower-cased and stripped of spaces
    before being compared with ``fuzz.ratio``, so differences in case and
    spacing do not influence the score.

    Parameters
    ----------
    query : str
        Text to look up.
    choices : iterable of str, optional
        Candidate strings. When omitted, the notebook-level ``ms_cases`` list
        is used. It is looked up at call time, so re-running the cell that
        builds ``ms_cases`` takes effect without redefining this function.

    Returns
    -------
    tuple
        ``(best_original, score)``: the winning candidate exactly as it
        appears in ``choices``, and its ``fuzz.ratio`` score.

    Raises
    ------
    ValueError
        If there are no candidates to compare against.
    """
    if choices is None:
        choices = ms_cases

    query_cleaned = query.lower().replace(" ", "")  # Normalize query
    # Maps each original candidate to its normalized form.
    choices_cleaned = {c: c.lower().replace(" ", "") for c in choices}

    # Given a mapping, extractOne scores the values and returns the matching
    # key as the third tuple element, so the original string comes back
    # directly instead of being recovered by a second scan over the dict.
    result = process.extractOne(query_cleaned, choices_cleaned, scorer=fuzz.ratio)
    if result is None:
        raise ValueError("best_fuzzy_match needs at least one candidate in `choices`")

    _, score, best_original = result
    return best_original, score

In [18]:
# Fuzzy-match every title exactly once and derive both columns from that
# single result; matching is the expensive step, so it should not be repeated
# per output column. Null titles stay unmatched (null in both columns).
title_matches = [
    best_fuzzy_match(title) if title is not None else (None, None)
    for title in df.get_column("Title")
]

# Explicit dtypes keep the schema stable even when the frame is empty.
df = df.with_columns(
    pl.Series(
        "Matched",
        [None if name is None else name + ".pdf" for name, _ in title_matches],
        dtype=pl.Utf8,
    ),
    pl.Series(
        "MatchedScore",
        [score for _, score in title_matches],
        dtype=pl.Float64,
    ),
).sort("MatchedScore")

In [19]:
def pdf_to_text(pdf_path, output_txt=None, lang='eng', dpi=200):
    """OCR every page of a PDF and return the combined text.

    Each page is rasterised with ``convert_from_path`` and passed to
    ``pytesseract.image_to_string``. Every page's text is preceded by a
    ``--- Page N ---`` marker line (N starts at 1).

    Parameters
    ----------
    pdf_path : str
        Path of the PDF to read.
    output_txt : str, optional
        If given (and non-empty), the combined text is also written to this
        path as UTF-8.
    lang : str, optional
        Language code forwarded to Tesseract.
    dpi : int, optional
        Rasterisation resolution forwarded to ``convert_from_path``. The
        default of 200 is the library's own default, so existing calls behave
        as before.

    Returns
    -------
    str
        The concatenated text of all pages; empty if the PDF yields no pages.
    """
    # Convert PDF pages to images
    images = convert_from_path(pdf_path, dpi=dpi)

    # Collect the per-page chunks and join once instead of growing a string.
    page_chunks = [
        f"\n--- Page {page_no} ---\n{pytesseract.image_to_string(img, lang=lang)}\n"
        for page_no, img in enumerate(images, start=1)
    ]
    text = "".join(page_chunks)

    # Save to file if output path is provided
    if output_txt:
        with open(output_txt, "w", encoding="utf-8") as f:
            f.write(text)

    return text

In [24]:
# Rows whose fuzzy-match score is not strictly above this cutoff are dropped
# before any OCR is attempted.
MATCH_SCORE_CUTOFF = 49

df = df.filter(pl.col("MatchedScore") > MATCH_SCORE_CUTOFF)

# OCR is by far the slowest step, so each distinct PDF is processed once even
# when several rows were matched to the same file.
opinion_by_pdf = {
    pdf_name: pdf_to_text("./Cases/ManuallyScrapedCases/" + pdf_name)
    for pdf_name in df.get_column("Matched").unique(maintain_order=True).to_list()
    if pdf_name is not None
}

df = df.with_columns(
    pl.Series(
        "Opinion",
        [opinion_by_pdf.get(pdf_name) for pdf_name in df.get_column("Matched")],
        dtype=pl.Utf8,
    )
)

In [31]:
# Persist the current contents of `df` as a CSV file.
df.write_csv(file="./Cases/WLNM_Opinion.csv")