# In [3]:
import pdfplumber
import os
import csv
import re
from html import unescape

# Regular expressions used to pull individual fields out of one incident's
# text. They are kept as plain strings (not compiled) because process_pdf
# passes an extra flag to re.search for the charges pattern, which is not
# allowed with a pre-compiled pattern.

# Digits following the "Incident #:" label.
incident_pattern = r"Incident #:\s*(\d+)"
# Timestamp in YYYY-MM-DD HH:MM:SS form; the gap between date and time may be
# missing in the extracted text, hence \s* rather than \s+.
date_pattern = r"Date:\s*(\d{4}-\d{2}-\d{2}\s*\d{2}:\d{2}:\d{2})"
# One or more words (letters, digits, "_" or "/") separated by spaces/tabs.
# The capture never crosses a line break, and a word that is immediately
# followed by ":" is treated as the next field's label and left out, so
# "Type: Theft\nLocation: ..." yields "Theft" rather than "Theft\nLocation".
type_pattern = r"Type:\s*([\w/]+(?!\w*:)(?:[ \t]+[\w/]+(?!\w*:))*)"
# Everything after the label up to the end of that line.
location_pattern = r"Location:\s*(.+?)(?=\n|$)"
# Marker whose mere presence means the incident includes an arrest.
arrest_pattern = r"Arrested:"
# Text between "Name:" and the "Date of Birth:" label on the same line.
name_pattern = r"Name:\s*([^:\n]+?)(?=\s*Date of Birth:)"
# Date of birth in NN/NN/NNNN form.
dob_pattern = r"Date of Birth:\s*(\d{2}/\d{2}/\d{4})"
# All lines after "Charges:" up to (but not including) the next line that
# starts with a "Label:" or the end of the text. [\s\S] is used instead of "."
# so the pattern spans lines with or without re.DOTALL, and \s*\Z lets the
# charges be the very last thing in the text even without a trailing newline.
charges_pattern = r"Charges:\s*([\s\S]*?)(?=\n[ \t]*\w+:|\s*\Z)"

def _parse_incident(incident):
    """Build one output row from the text of a single incident.

    Args:
        incident: Text of one incident section (the text between two
            separator lines).

    Returns:
        A dict keyed by the CSV column names. Incident-level fields that
        cannot be found are "". When an "Arrested:" marker is present,
        "Arrested" is "Yes" and any arrest detail that cannot be found is
        "N/A"; otherwise "Arrested" is "No" and the arrest details are "".
    """

    def field(pattern, default=""):
        # First capture group of the pattern, trimmed, or the default.
        match = re.search(pattern, incident)
        return match.group(1).strip() if match else default

    row = {
        "Incident #": field(incident_pattern),
        "Date": field(date_pattern),
        "Type": field(type_pattern),
        "Location": field(location_pattern),
        "Arrested": "No",
        "Name": "",
        "DOB": "",
        "Charges": "",
    }

    # Arrest details are only looked for when the marker is present.
    if re.search(arrest_pattern, incident):
        row["Arrested"] = "Yes"
        row["Name"] = field(name_pattern, "N/A")
        row["DOB"] = field(dob_pattern, "N/A")

        charges_match = re.search(charges_pattern, incident, re.DOTALL)
        if charges_match:
            # One charge per line in the source; flatten to a single cell.
            charge_lines = (
                line.strip() for line in charges_match.group(1).splitlines()
            )
            row["Charges"] = "; ".join(line for line in charge_lines if line)
        else:
            row["Charges"] = "N/A"

    return row


def process_pdf(file_path):
    """Extract incident rows from one PDF report.

    The text of every page is concatenated, HTML entities are unescaped, and
    the result is split into incidents on runs of ten or more "=" characters.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        A list with one dict per non-blank incident section, keyed by the CSV
        column names ("Incident #", "Date", "Type", "Location", "Arrested",
        "Name", "DOB", "Charges").
    """
    with pdfplumber.open(file_path) as pdf:
        # A page without extractable text may yield None rather than "";
        # fall back to "" so the join cannot fail on such pages.
        text = "\n".join(page.extract_text() or "" for page in pdf.pages)
    text = unescape(text)

    # Splitting on the separator leaves whitespace-only pieces (for example
    # after the last separator); skip them instead of emitting empty rows.
    return [
        _parse_incident(incident)
        for incident in re.split(r"={10,}", text)
        if incident.strip()
    ]

def process_all_pdfs(root_folder, output_csv):
    """Parse every PDF under a folder tree and write the rows to one CSV.

    Args:
        root_folder: Directory that is walked recursively for PDF files.
        output_csv: Path of the CSV file to create (overwritten if present).

    The CSV always contains the header row, even when no PDF is found.
    """
    fieldnames = ["Incident #", "Date", "Type", "Location", "Arrested", "Name", "DOB", "Charges"]

    all_data = []
    for root, dirs, files in os.walk(root_folder):
        # Sorting in place fixes the traversal order, so the row order of the
        # output does not depend on how the filesystem lists entries.
        dirs.sort()
        for file in sorted(files):
            # Compare case-insensitively so "REPORT.PDF" is not skipped.
            if file.lower().endswith(".pdf"):
                file_path = os.path.join(root, file)
                print(f"Processing: {file_path}")
                all_data.extend(process_pdf(file_path))

    # newline="" lets the csv module control line endings; an explicit UTF-8
    # encoding keeps non-ASCII text from failing on platforms whose default
    # encoding cannot represent it.
    with open(output_csv, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(all_data)

# Script entry: parse every PDF under the root folder into a single CSV.
# NOTE(review): the root folder is an empty string here — presumably a
# placeholder; set it to the directory that holds the PDF reports.
output_csv = "output_2018.csv"
process_all_pdfs(root_folder=r"", output_csv=output_csv)