In [6]:
import pdfplumber
import os
import csv
import re
from html import unescape

# Regular expressions for the labelled fields of one incident in the text
# extracted from the police-log PDFs. Each capturing group 1 is the field value.
incident_pattern = r"Incident #:\s*(\d+)"
date_pattern = r"Date:\s*(\d{4}-\d{2}-\d{2}\s*\d{2}:\d{2}:\d{2})"
# The value is one or more words made of word characters and "/", separated by
# horizontal whitespace only. The previous character class [\w\s/] also matched
# newlines, so the capture ran on into the label of the following line
# (e.g. "Theft\nLocation").
type_pattern = r"Type:\s*([\w/]+(?:[^\S\n]+[\w/]+)*)"
location_pattern = r"Location:\s*(.+?)(?=\n|$)"
# Presence of this label marks an incident as having an arrest.
arrest_pattern = r"Arrested:"
# Name is the text between "Name:" and "Date of Birth:" on the same line.
name_pattern = r"Name:\s*([^:\n]+?)(?=\s*Date of Birth:)"
dob_pattern = r"Date of Birth:\s*(\d{2}/\d{2}/\d{4})"
# Captures the text that follows "Charges:"; process_pdf applies it with
# re.DOTALL, so the value may span several lines.
charges_pattern = r"Charges:\s*((?:.+?(\n|$))*?)(?=\n(?:\w+:|$))"

# folder pattern (e.g., "yyyy_law_pd_data")
folder_pattern = re.compile(r"^\d{4}_law_pd_data$")

def _parse_incident(incident):
    """Build one CSV row (a dict) from the text of a single incident segment.

    Fields whose label is not found are left as "". When the segment contains
    "Arrested:", the row is marked "Yes" and Name / DOB / Charges are filled
    in, using "N/A" for any of those three that cannot be matched.
    """
    incident_number = re.search(incident_pattern, incident)
    date = re.search(date_pattern, incident)
    type_field = re.search(type_pattern, incident)
    location = re.search(location_pattern, incident)

    arrested = "No"
    name = ""
    dob = ""
    charges = ""

    # Arrest details are only looked up when the "Arrested:" label is present.
    if re.search(arrest_pattern, incident):
        arrested = "Yes"
        name_match = re.search(name_pattern, incident)
        dob_match = re.search(dob_pattern, incident)
        charges_match = re.search(charges_pattern, incident, re.DOTALL)

        name = name_match.group(1).strip() if name_match else "N/A"
        dob = dob_match.group(1) if dob_match else "N/A"
        if charges_match:
            # One charge per line in the log; flatten to a single "; "-joined cell.
            charges = "; ".join(
                line.strip()
                for line in charges_match.group(1).splitlines()
                if line.strip()
            )
        else:
            charges = "N/A"

    return {
        "Incident #": incident_number.group(1) if incident_number else "",
        "Date": date.group(1) if date else "",
        # Strip so stray trailing whitespace never ends up in the CSV cell.
        "Type": type_field.group(1).strip() if type_field else "",
        "Location": location.group(1).strip() if location else "",
        "Arrested": arrested,
        "Name": name,
        "DOB": dob,
        "Charges": charges,
    }


def process_pdf(file_path):
    """Parse one police-log PDF into a list of incident rows.

    Args:
        file_path: Path of the PDF file to open with pdfplumber.

    Returns:
        A list of dicts, one per non-blank incident segment, with the keys
        "Incident #", "Date", "Type", "Location", "Arrested", "Name", "DOB"
        and "Charges".
    """
    with pdfplumber.open(file_path) as pdf:
        # Call extract_text() once per page (it used to be called twice: once
        # to filter out empty pages and once more to build the joined text).
        page_texts = [page.extract_text() for page in pdf.pages]

    text = unescape("\n".join(page_text for page_text in page_texts if page_text))

    # Incidents are separated by lines of ten or more "=" characters.
    # Whitespace-only segments (before the first / after the last separator,
    # or a PDF with no extractable text) are skipped so they do not turn into
    # blank CSV rows.
    return [
        _parse_incident(segment)
        for segment in re.split(r"={10,}", text)
        if segment.strip()
    ]

def process_all_pdfs(root_folder, output_csv):
    """Parse the PDFs in every "yyyy_law_pd_data" folder and write one CSV.

    Walks ``root_folder`` recursively. Only directories whose own name matches
    ``folder_pattern`` contribute files; PDFs directly inside them are parsed
    with ``process_pdf`` and all rows are written to ``output_csv``.

    Args:
        root_folder: Directory to walk.
        output_csv: Path of the CSV file to (over)write.
    """
    # os.walk silently yields nothing for a missing path, which would otherwise
    # produce a header-only CSV with no hint that the input path was wrong.
    if not os.path.isdir(root_folder):
        print(f"Warning: root folder not found or not a directory: {root_folder}")

    all_data = []
    for current_path, dirs, files in os.walk(root_folder):
        # Sorting dirs in place fixes the order os.walk descends in, so the
        # CSV row order is reproducible across runs and filesystems.
        dirs.sort()

        folder_name = os.path.basename(current_path)

        # skip folders that don't match the pattern "yyyy_law_pd_data"
        if not folder_pattern.match(folder_name):
            continue

        for file in sorted(files):
            # Case-insensitive so files named "*.PDF" are not skipped.
            if file.lower().endswith(".pdf"):
                file_path = os.path.join(current_path, file)
                print(f"Processing: {file_path}")
                all_data.extend(process_pdf(file_path))

    # write the combined data to CSV
    with open(output_csv, "w", newline="", encoding="utf-8") as csvfile:
        fieldnames = ["Incident #", "Date", "Type", "Location", "Arrested", "Name", "DOB", "Charges"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(all_data)

In [7]:
# Root of the data tree. Set the LAW_PD_DATA_ROOT environment variable to run
# the notebook on another machine; the original local path is the default.
root_folder = os.environ.get(
    "LAW_PD_DATA_ROOT",
    r"C:\Users\Indel\OneDrive\Documents\Data Science RA\gatewayinitiative-lawrencepd\data",
)
output_csv = "output_all_years.csv"

# process the PDFs in every "yyyy_law_pd_data" folder found under root_folder
process_all_pdfs(root_folder, output_csv)

print("The combined CSV is at:", output_csv)

The combined CSV is at: output_all_years.csv
