In [2]:
import pdfplumber
import os
import csv
import re
from html import unescape

# Regular expressions for the labelled fields of one incident entry.
# process_pdf applies each of them with re.search() to a single chunk of text.

# Incident number: the digits that follow "Incident #:".
incident_pattern = r"Incident #:\s*(\d+)"
# Timestamp written as "YYYY-MM-DD HH:MM:SS" (whitespace between date and time is optional).
date_pattern = r"Date:\s*(\d{4}-\d{2}-\d{2}\s*\d{2}:\d{2}:\d{2})"
# Incident type: a run of word characters, whitespace and "/".
# NOTE(review): \s also matches newlines, so the capture can continue onto the
# following line until it meets a character outside the class (e.g. ":").
# Confirm against real logs that this does not swallow the next field's label.
type_pattern = r"Type:\s*([\w\s/]+)"
# Location: everything after the label up to the end of the line.
location_pattern = r"Location:\s*(.+?)(?=\n|$)"
# Literal marker; its presence in a chunk is treated as "an arrest was made".
arrest_pattern = r"Arrested:"
# Name: text after "Name:" (no ":" or newline) that is followed by "Date of Birth:".
name_pattern = r"Name:\s*([^:\n]+?)(?=\s*Date of Birth:)"
# Date of birth written as NN/NN/NNNN.
dob_pattern = r"Date of Birth:\s*(\d{2}/\d{2}/\d{4})"
# Charges: the text after "Charges:" up to a newline that is followed either by
# a "word:" label or by the end of the text. process_pdf searches with re.DOTALL.
charges_pattern = r"Charges:\s*((?:.+?(\n|$))*?)(?=\n(?:\w+:|$))"

# folder pattern (e.g., "yyyy_law_pd_data"): exactly four digits followed by "_law_pd_data"
folder_pattern = re.compile(r"^\d{4}_law_pd_data$")

# csv file for logging skipped files and their separators
skipped_csv = "skipped_files.csv"
# One dict per skipped PDF; appended to by process_pdf and written out by process_all_pdfs.
skipped_data = [] 

# function to find alternative separators
def find_alternate_separator(text):
    """
    Return the first run of 8 or more identical non-whitespace characters
    found in ``text``, or the string "Unknown" when no such run exists.

    Intended for files that lack the standard separator line (- or =), so the
    character sequence they use instead can be reported.
    """
    repeated_run = re.search(r"(\S)\1{7,}", text)  # one char, then 7+ repeats of it
    if repeated_run is None:
        return "Unknown"
    return repeated_run.group(0)

# Standard incident separator: a run of 10 or more '=' / '-' characters.
_SEPARATOR_RE = re.compile(r"[=-]{10,}")


def _parse_incident(incident):
    """Build one output row (dict) from the text of a single incident chunk."""
    incident_number = re.search(incident_pattern, incident)
    date = re.search(date_pattern, incident)
    type_field = re.search(type_pattern, incident)
    location = re.search(location_pattern, incident)

    arrested = "No"
    name = ""
    dob = ""
    charges = ""

    # arrest details are only looked up when the "Arrested:" marker is present
    if re.search(arrest_pattern, incident):
        arrested = "Yes"
        name_match = re.search(name_pattern, incident)
        dob_match = re.search(dob_pattern, incident)
        charges_match = re.search(charges_pattern, incident, re.DOTALL)

        name = name_match.group(1).strip() if name_match else "N/A"
        dob = dob_match.group(1) if dob_match else "N/A"
        if charges_match:
            # one charge per line in the log; flatten to a single "; "-joined cell
            charges = "; ".join(
                line.strip()
                for line in charges_match.group(1).splitlines()
                if line.strip()
            )
        else:
            charges = "N/A"

    return {
        "Incident #": incident_number.group(1) if incident_number else "",
        "Date": date.group(1) if date else "",
        # the type character class admits whitespace, so trim what it picked up
        "Type": type_field.group(1).strip() if type_field else "",
        "Location": location.group(1) if location else "",
        "Arrested": arrested,
        "Name": name,
        "DOB": dob,
        "Charges": charges,
    }


def process_pdf(file_path):
    """
    Extract incident rows from one PDF log.

    Parameters:
        file_path: path of the PDF to read.

    Returns:
        A list of dicts keyed by "Incident #", "Date", "Type", "Location",
        "Arrested", "Name", "DOB" and "Charges" (one per non-blank chunk of
        text between separator lines). An empty list is returned when the
        file contains no standard separator.

    Side effects:
        When the file is skipped, a record (file, detected separator, first
        500 characters of text) is appended to the module-level
        ``skipped_data`` list and a message is printed.
    """
    with pdfplumber.open(file_path) as pdf:
        # extract every page exactly once; pages that yield no text are dropped below
        page_texts = [page.extract_text() for page in pdf.pages]
    text = unescape("\n".join(page_text for page_text in page_texts if page_text))

    # check for standard separators ('=' or '-')
    if not _SEPARATOR_RE.search(text):
        # find an alternative separator
        alt_separator = find_alternate_separator(text)

        # store first 500 characters of the extracted text for reference
        extracted_text_preview = text[:500] if text else "No text extracted"

        # log skipped file information
        skipped_data.append({
            "File": file_path,
            "Separator": alt_separator,
            "Extracted Text": extracted_text_preview,
        })

        print(f"This file, {file_path}, has been skipped because this is its separator: {alt_separator}")
        return []  # skip file

    data = []
    # split incidents on the standard separator between incidents
    for incident in _SEPARATOR_RE.split(text):
        # splitting leaves empty / whitespace-only chunks (e.g. after the last
        # separator); they carry no data and would only produce blank CSV rows
        if not incident.strip():
            continue
        data.append(_parse_incident(incident))
    return data

def _write_rows(csv_path, fieldnames, rows):
    """Write ``rows`` (a list of dicts) to ``csv_path`` with a header line."""
    with open(csv_path, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)


def process_all_pdfs(root_folder, output_csv):
    """
    Process every PDF under ``root_folder`` and write the results to CSV.

    Expected layout: ``root_folder/<yyyy>_law_pd_data/<month folder>/<file>.pdf``.
    Top-level entries that are not directories matching ``folder_pattern`` are
    ignored, as are month-level entries that are not directories or are named
    "no_date".

    Parameters:
        root_folder: directory containing the per-year folders.
        output_csv: path of the combined CSV to write.

    Side effects:
        Writes ``output_csv`` and the ``skipped_csv`` report, resets and
        refills the module-level ``skipped_data`` list, and prints progress.
    """
    # start from an empty skip log so that calling this function again in the
    # same kernel does not duplicate entries from a previous run
    skipped_data.clear()
    all_data = []

    # look in each subdirectory of the root folder; os.listdir() returns names
    # in arbitrary order, so sort to make the row order of the CSV reproducible
    for folder_name in sorted(os.listdir(root_folder)):
        folder_path = os.path.join(root_folder, folder_name)

        # ensure it's a directory and matches 'yyyy_law_pd_data'
        if not os.path.isdir(folder_path) or not folder_pattern.match(folder_name):
            continue

        print(f"Entering main folder: {folder_name}")

        # now go inside a yyyy_law_pd_data folder and process each month/day.pdf
        for month_folder in sorted(os.listdir(folder_path)):
            month_path = os.path.join(folder_path, month_folder)

            # skip non-directories and the explicitly excluded names
            if not os.path.isdir(month_path) or month_folder in {"no_date", "failures.csv", ".gitignore"}:
                continue

            print(f"  Entering month folder: {month_folder}")

            # process each PDF file inside the month folder
            for file in sorted(os.listdir(month_path)):
                # compare case-insensitively so "X.PDF" is not silently ignored
                if file.lower().endswith(".pdf"):
                    file_path = os.path.join(month_path, file)
                    print(f"    Processing: {file_path}")
                    file_data = process_pdf(file_path)
                    all_data.extend(file_data)  # empty list when the file was skipped

    # write the combined law pd data to csv
    _write_rows(
        output_csv,
        ["Incident #", "Date", "Type", "Location", "Arrested", "Name", "DOB", "Charges"],
        all_data,
    )

    # write skipped files and their separators to csv
    _write_rows(skipped_csv, ["File", "Separator", "Extracted Text"], skipped_data)

    print("\nProcessing complete.")
    print(f"Skipped files saved in: {skipped_csv}")

In [3]:
# Directory that holds the "yyyy_law_pd_data" year folders.
# NOTE(review): this is an absolute path on one specific machine; it has to be
# edited before the notebook can run anywhere else.
root_folder = r"C:\Users\Indel\OneDrive\Documents\Data Science RA\gatewayinitiative-lawrencepd\data"
# Destination of the combined CSV (relative to the current working directory).
output_csv = "output_all_years.csv"

# Parse every PDF below root_folder, then write the combined CSV and the skipped-files report.
process_all_pdfs(root_folder, output_csv)
print("The combined CSV is at:", output_csv)

Entering main folder: 2018_law_pd_data
  Entering month folder: 2018_april
    Processing: C:\Users\Indel\OneDrive\Documents\Data Science RA\gatewayinitiative-lawrencepd\data\2018_law_pd_data\2018_april\04-01-2018.pdf
    Processing: C:\Users\Indel\OneDrive\Documents\Data Science RA\gatewayinitiative-lawrencepd\data\2018_law_pd_data\2018_april\04-02-2018.pdf
    Processing: C:\Users\Indel\OneDrive\Documents\Data Science RA\gatewayinitiative-lawrencepd\data\2018_law_pd_data\2018_april\04-03-2018.pdf
    Processing: C:\Users\Indel\OneDrive\Documents\Data Science RA\gatewayinitiative-lawrencepd\data\2018_law_pd_data\2018_april\04-04-2018.pdf
    Processing: C:\Users\Indel\OneDrive\Documents\Data Science RA\gatewayinitiative-lawrencepd\data\2018_law_pd_data\2018_april\04-05-2018.pdf
    Processing: C:\Users\Indel\OneDrive\Documents\Data Science RA\gatewayinitiative-lawrencepd\data\2018_law_pd_data\2018_april\04-06-2018.pdf
    Processing: C:\Users\Indel\OneDrive\Documents\Data Science RA\g