In [2]:
import pdfplumber
import os
import csv
import re
from html import unescape

# patterns in log files
# Each pattern is a plain string handed to re.search() by the parsing code;
# group(1) is the captured value where a group exists.
# digits following the literal "Incident #:"
incident_pattern = r"Incident #:\s*(\d+)"
# timestamp shaped like dddd-dd-dd dd:dd:dd (optional whitespace between date and time)
date_pattern = r"Date:\s*(\d{4}-\d{2}-\d{2}\s*\d{2}:\d{2}:\d{2})"
# NOTE(review): the character class includes \s, so a match can continue past a
# line break until it meets a character that is not a word character,
# whitespace or "/" — confirm this is intended for multi-line log entries.
type_pattern = r"Type:\s*([\w\s/]+)"
# remainder of the line after "Location:" (lazy, stops at a newline or the end)
location_pattern = r"Location:\s*(.+?)(?=\n|$)"
# presence-only marker: no capture group, only tested for a match
arrest_pattern = r"Arrested:"
# text without ":" or a newline that is directly followed by "Date of Birth:"
name_pattern = r"Name:\s*([^:\n]+?)(?=\s*Date of Birth:)"
# date shaped like dd/dd/dddd
dob_pattern = r"Date of Birth:\s*(\d{2}/\d{2}/\d{4})"
# Searched with re.DOTALL by process_pdf. Appears intended to capture the lines
# after "Charges:" up to a newline followed by a "word:" label or the end of the
# text — NOTE(review): verify against real logs.
charges_pattern = r"Charges:\s*((?:.+?(\n|$))*?)(?=\n(?:\w+:|$))"

# folder pattern (e.g., "yyyy_law_pd_data"): exactly four digits, then "_law_pd_data"
folder_pattern = re.compile(r"^\d{4}_law_pd_data$")

# csv file for logging skipped files and their separators
skipped_csv = "skipped_files.csv"
# Module-level list shared between functions: process_pdf appends one dict per
# skipped file and process_all_pdfs writes the list to skipped_csv.
skipped_data = [] 

# function to find alternative separators
def find_alternate_separator(text):
    """
    Return the first run of eight or more identical non-whitespace characters
    found in ``text``, for files that lack the standard '-' / '=' separator.

    The whole run is returned (it may be longer than eight characters).
    When ``text`` contains no such run the string "Unknown" is returned.
    """
    # (\S) grabs one non-whitespace character; \1{7,} demands at least seven
    # further copies of it, i.e. a run of eight or more.
    run = re.search(r"(\S)\1{7,}", text)
    if run is None:
        return "Unknown"
    return run.group(0)

def process_pdf(file_path):
    """
    Parse one police-log PDF into a list of incident rows.

    The text of every page is extracted with pdfplumber, HTML entities are
    unescaped, and the text is split on runs of ten or more '=' / '-'
    characters. Every non-blank chunk becomes one dict with the keys
    "Incident #", "Date", "Type", "Location", "Arrested", "Name", "DOB" and
    "Charges". Fields that are not found are "". For chunks containing
    "Arrested:", a missing name, date of birth or charges value is "N/A".

    Files whose text has no standard separator are skipped: a record with the
    file path, the first alternative separator found and a 500-character text
    preview is appended to the module-level ``skipped_data`` list, a message is
    printed, and an empty list is returned.

    Args:
        file_path: path of the PDF file to read.

    Returns:
        list[dict]: one row per non-blank chunk, or [] when the file is skipped.
    """
    with pdfplumber.open(file_path) as pdf:
        # Extract each page exactly once (the extraction is the expensive step);
        # pages that yield no text (None or "") are dropped below.
        page_texts = [page.extract_text() for page in pdf.pages]
    text = unescape("\n".join(page_text for page_text in page_texts if page_text))

    # check for standard separators ('=' or '-')
    if not re.search(r"[=-]{10,}", text):
        # find an alternative separator
        alt_separator = find_alternate_separator(text)

        # store first 500 characters of the extracted text for reference
        extracted_text_preview = text[:500] if text else "No text extracted"

        # log skipped file information
        skipped_data.append({
            "File": file_path,
            "Separator": alt_separator,
            "Extracted Text": extracted_text_preview
        })

        print(f"This file, {file_path}, has been skipped because this is its separator: {alt_separator}")
        return []  # skip file

    data = []
    # split incidents on the standard separator between incidents
    for incident in re.split(r"[=-]{10,}", text):
        # re.split yields empty / whitespace-only chunks when a separator sits at
        # the start or end of the text or two separators are adjacent; those
        # carry no information and would only produce all-blank CSV rows.
        if not incident.strip():
            continue

        incident_number = re.search(incident_pattern, incident)
        date = re.search(date_pattern, incident)
        type_field = re.search(type_pattern, incident)
        location = re.search(location_pattern, incident)

        arrested = "No"
        name = ""
        dob = ""
        charges = ""

        # arrest details are only looked up when "Arrested:" exists in the chunk
        if re.search(arrest_pattern, incident):
            arrested = "Yes"
            name_match = re.search(name_pattern, incident)
            dob_match = re.search(dob_pattern, incident)
            charges_match = re.search(charges_pattern, incident, re.DOTALL)

            name = name_match.group(1).strip() if name_match else "N/A"
            dob = dob_match.group(1) if dob_match else "N/A"
            if charges_match:
                # split charges into separate lines and join with ";"
                charges = "; ".join(
                    line.strip()
                    for line in charges_match.group(1).splitlines()
                    if line.strip()
                )
            else:
                charges = "N/A"

        # add the row/entry
        data.append({
            "Incident #": incident_number.group(1) if incident_number else "",
            "Date": date.group(1) if date else "",
            "Type": type_field.group(1) if type_field else "",
            "Location": location.group(1) if location else "",
            "Arrested": arrested,
            "Name": name,
            "DOB": dob,
            "Charges": charges
        })
    return data

def process_all_pdfs(root_folder, output_csv):
    """
    Parse every PDF under ``root_folder`` and write two CSV files.

    Expected layout: ``root_folder/<yyyy>_law_pd_data/<month folder>/<file>.pdf``.
    Only sub-folders whose name matches ``folder_pattern`` are entered, and the
    month-level entries "no_date", "failures.csv" and ".gitignore" are ignored.

    Args:
        root_folder: directory that contains the yearly folders.
        output_csv: path of the CSV that receives the combined incident rows.

    Side effects:
        Writes ``output_csv`` and the file named by the module-level
        ``skipped_csv``; prints progress for every folder and file.
    """
    # Start every run with an empty skip log. The list is module-level and
    # process_pdf appends to it, so without this a second run in the same kernel
    # would write the earlier run's entries again. clear() keeps the same list
    # object that process_pdf references.
    skipped_data.clear()

    all_data = []

    # look in each subdirectory of the root folder
    # (sorted: os.listdir returns entries in arbitrary order, which would make
    # the row order of the output CSV differ between runs and machines)
    for folder_name in sorted(os.listdir(root_folder)):
        folder_path = os.path.join(root_folder, folder_name)

        # ensure it's a directory and matches 'yyyy_law_pd_data'
        if not os.path.isdir(folder_path) or not folder_pattern.match(folder_name):
            continue

        print(f"Entering main folder: {folder_name}")

        # now go inside a yyyy_law_pd_data folder and process each month/day.pdf
        for month_folder in sorted(os.listdir(folder_path)):
            month_path = os.path.join(folder_path, month_folder)

            # skip non-directories and the explicitly excluded names
            if not os.path.isdir(month_path) or month_folder in {"no_date", "failures.csv", ".gitignore"}:
                continue

            print(f"  Entering month folder: {month_folder}")

            # process each PDF file inside the month folder
            for file in sorted(os.listdir(month_path)):
                # case-insensitive so that files named *.PDF are not silently missed
                if file.lower().endswith(".pdf"):
                    file_path = os.path.join(month_path, file)
                    print(f"    Processing: {file_path}")
                    file_data = process_pdf(file_path)
                    all_data.extend(file_data)  # empty list when the file was skipped

    # write the combined law pd data to csv
    with open(output_csv, "w", newline="", encoding="utf-8") as csvfile:
        fieldnames = ["Incident #", "Date", "Type", "Location", "Arrested", "Name", "DOB", "Charges"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(all_data)

    # write skipped files and their separators to csv
    with open(skipped_csv, "w", newline="", encoding="utf-8") as csvfile:
        fieldnames = ["File", "Separator", "Extracted Text"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(skipped_data)

    print("\nProcessing complete.")
    print(f"Skipped files saved in: {skipped_csv}")

In [3]:
# NOTE(review): absolute, machine-specific path — this cell only runs as-is on
# the original author's machine; consider a path relative to the project.
root_folder = r"C:\Users\Indel\OneDrive\Documents\Data Science RA\gatewayinitiative-lawrencepd\data"
# relative path, so the CSV is created in the current working directory
output_csv = "output_all_years.csv"

# parse every yearly folder under root_folder and write the combined CSV
process_all_pdfs(root_folder, output_csv)
print("The combined CSV is at:", output_csv)

Entering main folder: 2018_law_pd_data
  Entering month folder: 2018_april
    Processing: C:\Users\Indel\OneDrive\Documents\Data Science RA\gatewayinitiative-lawrencepd\data\2018_law_pd_data\2018_april\04-01-2018.pdf
    Processing: C:\Users\Indel\OneDrive\Documents\Data Science RA\gatewayinitiative-lawrencepd\data\2018_law_pd_data\2018_april\04-02-2018.pdf
    Processing: C:\Users\Indel\OneDrive\Documents\Data Science RA\gatewayinitiative-lawrencepd\data\2018_law_pd_data\2018_april\04-03-2018.pdf
    Processing: C:\Users\Indel\OneDrive\Documents\Data Science RA\gatewayinitiative-lawrencepd\data\2018_law_pd_data\2018_april\04-04-2018.pdf
    Processing: C:\Users\Indel\OneDrive\Documents\Data Science RA\gatewayinitiative-lawrencepd\data\2018_law_pd_data\2018_april\04-05-2018.pdf
    Processing: C:\Users\Indel\OneDrive\Documents\Data Science RA\gatewayinitiative-lawrencepd\data\2018_law_pd_data\2018_april\04-06-2018.pdf
    Processing: C:\Users\Indel\OneDrive\Documents\Data Science RA\g

## Parsing the 2023 and 2024 logs

In [2]:
import pdfplumber
import csv
import re
from html import unescape

# Patterns
# Plain regex strings passed to re.search(); group(1) is the captured value
# where a group exists.
# digits following the literal "Incident #:"
incident_pattern = r"Incident #:\s*(\d+)"
# timestamp shaped like dddd-dd-dd dd:dd:dd (optional whitespace between date and time)
date_pattern = r"Date:\s*(\d{4}-\d{2}-\d{2}\s*\d{2}:\d{2}:\d{2})"
# NOTE(review): the class includes \s, so a match can run past a line break
# until a character that is not a word character, whitespace or "/" — confirm.
type_pattern = r"Type:\s*([\w\s/]+)"
# remainder of the line after "Location:" (lazy, stops at a newline or the end)
location_pattern = r"Location:\s*(.+?)(?=\n|$)"
# presence-only marker: no capture group
arrest_pattern = r"Arrested:"
# text without ":" or a newline that is directly followed by "Date of Birth:"
name_pattern = r"Name:\s*([^:\n]+?)(?=\s*Date of Birth:)"
# date shaped like dd/dd/dddd
dob_pattern = r"Date of Birth:\s*(\d{2}/\d{2}/\d{4})"
# Searched with re.DOTALL by the extraction code. Appears intended to capture the
# lines after "Charges:" up to a newline followed by a "word:" label or the end
# of the text — NOTE(review): verify against real logs.
charges_pattern = r"Charges:\s*((?:.+?(\n|$))*?)(?=\n(?:\w+:|$))"

def extract_data_from_pdf(pdf_path):
    """
    Parse one police-log PDF into a list of row dicts.

    Page texts are joined with newlines (a page without text contributes an
    empty string), HTML entities are unescaped, and the text is split on runs
    of ten or more '=' / '-' characters. Every non-blank chunk becomes one dict
    with the keys "Incident #", "Date", "Type", "Location", "Arrested", "Name",
    "DOB" and "Charges". A field whose pattern does not match is "".
    "Arrested" is "Yes" when the chunk contains "Arrested:", otherwise "No".

    Args:
        pdf_path: path of the PDF file to read.

    Returns:
        list[dict]: one row per non-blank chunk, or [] when the text contains no
        standard separator.
    """
    with pdfplumber.open(pdf_path) as pdf:
        text = "\n".join(p.extract_text() or "" for p in pdf.pages)
    text = unescape(text)

    if not re.search(r"[=-]{10,}", text):
        return []

    def first_group(pattern, chunk, flags=0):
        # Run the regex once and return group(1), or "" when nothing matches.
        found = re.search(pattern, chunk, flags)
        return found.group(1) if found else ""

    rows = []
    for entry in re.split(r"[=-]{10,}", text):
        # Chunks that are empty or whitespace-only (separator at the start or end
        # of the text, or two adjacent separators) would only add blank rows.
        if not entry.strip():
            continue

        charges_block = first_group(charges_pattern, entry, re.DOTALL)
        rows.append({
            "Incident #": first_group(incident_pattern, entry),
            "Date": first_group(date_pattern, entry),
            "Type": first_group(type_pattern, entry),
            "Location": first_group(location_pattern, entry),
            "Arrested": "Yes" if re.search(arrest_pattern, entry) else "No",
            "Name": first_group(name_pattern, entry).strip(),
            "DOB": first_group(dob_pattern, entry),
            # one charge per line in the PDF -> "; "-separated in the CSV cell
            "Charges": "; ".join(
                line.strip() for line in charges_block.splitlines() if line.strip()
            ),
        })
    return rows

def parse_all_pdfs_to_csv(input_dir, output_csv):
    """
    Parse every PDF found one level below ``input_dir`` and write one CSV.

    Each sub-directory of ``input_dir`` is scanned for files ending in ".pdf"
    (case-insensitive); entries of ``input_dir`` that are not directories are
    ignored. Rows from all files are written to ``output_csv`` with a header.

    Args:
        input_dir: directory whose sub-directories hold the PDF files.
        output_csv: path of the CSV file to create (overwritten if present).
    """
    all_rows = []
    # sorted: os.listdir returns names in arbitrary order, which would make the
    # row order of the CSV differ between runs and machines
    for folder in sorted(os.listdir(input_dir)):
        month_dir = os.path.join(input_dir, folder)
        if not os.path.isdir(month_dir):
            continue
        for file in sorted(os.listdir(month_dir)):
            if file.lower().endswith(".pdf"):
                path = os.path.join(month_dir, file)
                print(f"📄 Parsing: {file}")
                all_rows.extend(extract_data_from_pdf(path))

    with open(output_csv, "w", newline="", encoding="utf-8") as f:
        fieldnames = ["Incident #", "Date", "Type", "Location", "Arrested", "Name", "DOB", "Charges"]
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(all_rows)
    print(f"\n✅ CSV created at: {output_csv}")


In [1]:
import os

In [3]:
# Base data path, resolved relative to the current working directory.
# os.getcwd() is the kernel's working directory; it equals the notebook's
# folder only when the kernel was started there.
notebook_dir = os.getcwd()
# <cwd>/../data/pdfs — the directory whose sub-folders hold the PDF files
PDF_DATA_DIR = os.path.join(notebook_dir, '..', 'data', 'pdfs')

In [4]:
# The output CSV is placed directly inside the PDF data directory (same path
# components as PDF_DATA_DIR plus the file name). The parser only descends into
# sub-directories of that folder, so the CSV itself is not picked up as input.
parsed_csv_path = os.path.join(notebook_dir, '..', 'data', 'pdfs', 'lawrence_2023_2024.csv')
parse_all_pdfs_to_csv(PDF_DATA_DIR, parsed_csv_path)

📄 Parsing: 04-01-2023.pdf
📄 Parsing: 04-02-2023.pdf
📄 Parsing: 04-03-2023.pdf
📄 Parsing: 04-04-2023.pdf
📄 Parsing: 04-05-2023.pdf
📄 Parsing: 04-06-2023.pdf
📄 Parsing: 04-07-2023.pdf
📄 Parsing: 04-08-2023.pdf
📄 Parsing: 04-10-2023.pdf
📄 Parsing: 04-11-2023.pdf
📄 Parsing: 04-12-2023.pdf
📄 Parsing: 04-13-2023.pdf
📄 Parsing: 04-14-2023.pdf
📄 Parsing: 04-15-2023.pdf
📄 Parsing: 04-17-2023.pdf
📄 Parsing: 04-18-2023.pdf
📄 Parsing: 04-20-2023.pdf
📄 Parsing: 04-21-2023.pdf
📄 Parsing: 04-22-2023.pdf
📄 Parsing: 04-23-2023.pdf
📄 Parsing: 04-24-2023.pdf
📄 Parsing: 04-25-2023.pdf
📄 Parsing: 04-26-2023.pdf
📄 Parsing: 04-27-2023.pdf
📄 Parsing: 04-28-2023.pdf
📄 Parsing: 04-29-2023.pdf
📄 Parsing: 04-30-2023.pdf
📄 Parsing: 08-01-2023.pdf
📄 Parsing: 08-02-2023.pdf
📄 Parsing: 08-03-2023.pdf
📄 Parsing: 08-04-2023.pdf
📄 Parsing: 08-05-2023.pdf
📄 Parsing: 08-06-2023.pdf
📄 Parsing: 08-08-2023.pdf
📄 Parsing: 08-09-2023.pdf
📄 Parsing: 08-10-2023.pdf
📄 Parsing: 08-11-2023.pdf
📄 Parsing: 08-12-2023.pdf
📄 Parsing: 0

In [19]:
import os
import pdfplumber
import pytesseract
from pdf2image import convert_from_path
import csv
import re
from html import unescape

# regex patterns
# Plain regex strings passed to re.search() by extract_entry; group(1) is the
# captured value where a group exists.
# digits following the literal "Incident #:" (exactly one space before "#")
incident_pattern = r"Incident #:\s*(\d+)"
# timestamp shaped like dddd-dd-dd dd:dd:dd (optional whitespace between date and time)
date_pattern = r"Date:\s*(\d{4}-\d{2}-\d{2}\s*\d{2}:\d{2}:\d{2})"
# NOTE(review): the class includes \s, so a match can run past a line break
# until a character that is not a word character, whitespace or "/" — confirm.
type_pattern = r"Type:\s*([\w\s/]+)"
# remainder of the line after "Location:" (lazy, stops at a newline or the end)
location_pattern = r"Location:\s*(.+?)(?=\n|$)"
# presence-only marker: no capture group
arrest_pattern = r"Arrested:"
# text without ":" or a newline that is directly followed by "Date of Birth:"
name_pattern = r"Name:\s*([^:\n]+?)(?=\s*Date of Birth:)"
# date shaped like dd/dd/dddd
dob_pattern = r"Date of Birth:\s*(\d{2}/\d{2}/\d{4})"
# Searched with re.DOTALL by extract_entry. Appears intended to capture the lines
# after "Charges:" up to a newline followed by a "word:" label or the end of the
# text — NOTE(review): verify against real logs.
charges_pattern = r"Charges:\s*((?:.+?(\n|$))*?)(?=\n(?:\w+:|$))"

# helper: check if the text seems valid
def is_meaningful_police_log(text):
    """
    Decide whether extracted text looks like a usable police log.

    True only when all three hold:
      * "Incident" followed by whitespace and "#" occurs at least once,
      * the text contains "Location:", "Type:" or "NOISE ORD",
      * the text contains a run of three or more ASCII letters.
    """
    if not re.findall(r"Incident\s+#", text):
        return False
    if not any(marker in text for marker in ("Location:", "Type:", "NOISE ORD")):
        return False
    return re.search(r"[A-Za-z]{3,}", text) is not None

# extract from pdfplumber format
def extract_data_from_text_pdfplumber(text):
    """
    Split text that uses the standard separator layout into row dicts.

    Args:
        text: full text of one log, as extracted by pdfplumber.

    Returns:
        list[dict]: one ``extract_entry`` result per non-blank chunk between
        runs of ten or more '=' / '-' characters; [] when the text contains no
        such run.
    """
    if not re.search(r"[=-]{10,}", text):
        return []
    # re.split yields empty / whitespace-only chunks when a separator sits at the
    # start or end of the text or two separators are adjacent; skipping them
    # keeps all-blank rows out of the result.
    return [
        extract_entry(entry)
        for entry in re.split(r"[=-]{10,}", text)
        if entry.strip()
    ]

# extract from OCR fallback format
def extract_data_from_text_ocr(text):
    """
    Split OCR text into row dicts, one per incident heading.

    The text is cut in front of every "Incident", optional "#", ":" and number
    (a zero-width lookahead, so the heading stays with its chunk). Splitting on
    a zero-width match requires Python 3.7 or newer.

    Args:
        text: OCR output for one log.

    Returns:
        list[dict]: one ``extract_entry`` result per non-blank chunk.
    """
    # When the text begins with an incident heading the split produces a leading
    # empty chunk; blank chunks are skipped so they do not become all-blank rows.
    return [
        extract_entry(entry)
        for entry in re.split(r"(?=Incident\s+#?:\s*\d+)", text)
        if entry.strip()
    ]

# generic extractor from an entry block
def extract_entry(entry):
    """
    Build one CSV row from the text of a single log entry.

    Args:
        entry: text of one chunk of a police log.

    Returns:
        dict with the keys "Incident #", "Date", "Type", "Location",
        "Arrested", "Name", "DOB" and "Charges". A field whose pattern does not
        match is "". "Arrested" is "Yes" when the chunk contains "Arrested:",
        otherwise "No". Charge lines are stripped and joined with "; ".
    """
    def first_group(pattern, flags=0):
        # Run each regex once (instead of once to test and once to read) and
        # return group(1), or "" when nothing matches.
        found = re.search(pattern, entry, flags)
        return found.group(1) if found else ""

    charge_lines = (
        line.strip() for line in first_group(charges_pattern, re.DOTALL).splitlines()
    )
    return {
        "Incident #": first_group(incident_pattern),
        "Date": first_group(date_pattern),
        "Type": first_group(type_pattern),
        "Location": first_group(location_pattern),
        "Arrested": "Yes" if re.search(arrest_pattern, entry) else "No",
        "Name": first_group(name_pattern).strip(),
        "DOB": first_group(dob_pattern),
        "Charges": "; ".join(line for line in charge_lines if line),
    }

# use pdfplumber first, fallback to OCR
def extract_data_from_pdf_auto(pdf_path):
    """
    Extract incident rows from a PDF, trying the text layer first and OCR second.

    1. pdfplumber text: if it passes ``is_meaningful_police_log`` the result of
       ``extract_data_from_text_pdfplumber`` is returned as-is (which is [] when
       the text has no standard separator; OCR is not attempted in that case).
    2. Otherwise, or when step 1 raised, every page is rendered at 300 dpi and
       run through Tesseract; meaningful OCR text is parsed with
       ``extract_data_from_text_ocr``.

    Exceptions from either step are printed, not raised. Returns [] when neither
    step produced rows.
    """
    # --- attempt 1: embedded text layer ---
    try:
        with pdfplumber.open(pdf_path) as document:
            page_texts = [page.extract_text() or "" for page in document.pages]
            plumber_text = unescape("\n".join(page_texts))

            if not is_meaningful_police_log(plumber_text):
                print(f"⚠️ pdfplumber output unreadable, using OCR fallback: {os.path.basename(pdf_path)}")
            else:
                return extract_data_from_text_pdfplumber(plumber_text)
    except Exception as err:
        print(f"❌ pdfplumber failed on {pdf_path}: {err}")

    # --- attempt 2: OCR on rendered page images ---
    try:
        page_images = convert_from_path(pdf_path, dpi=300)
        ocr_text = "".join(pytesseract.image_to_string(image) for image in page_images)
        if is_meaningful_police_log(ocr_text):
            return extract_data_from_text_ocr(ocr_text)
    except Exception as err:
        print(f"❌ OCR failed for {pdf_path}: {err}")
    return []

# loop through all PDFs
def parse_all_pdfs_to_csv(input_dir, output_csv, max_pdfs=None):
    """
    Parse the PDFs found one level below ``input_dir`` and write one CSV.

    Each sub-directory of ``input_dir`` is scanned for files ending in ".pdf"
    (case-insensitive); entries of ``input_dir`` that are not directories are
    ignored. Every file goes through ``extract_data_from_pdf_auto``.

    Args:
        input_dir: directory whose sub-directories hold the PDF files.
        output_csv: path of the CSV file to create (overwritten if present).
        max_pdfs: optional cap on the number of PDFs to parse; a falsy value
            (None or 0) means no cap.
    """
    # Collect the paths first, in sorted order: os.listdir returns names in
    # arbitrary order, which would make both the CSV row order and the set of
    # files selected by max_pdfs differ between runs and machines.
    pdf_paths = []
    for folder in sorted(os.listdir(input_dir)):
        month_dir = os.path.join(input_dir, folder)
        if not os.path.isdir(month_dir):
            continue
        pdf_paths.extend(
            os.path.join(month_dir, name)
            for name in sorted(os.listdir(month_dir))
            if name.lower().endswith(".pdf")
        )

    all_rows = []
    for pdf_count, path in enumerate(pdf_paths, start=1):
        file = os.path.basename(path)
        print(f"📄 Parsing: {file}")
        rows = extract_data_from_pdf_auto(path)
        if not rows:
            print(f"⚠️ No extractable data in: {file}")
        all_rows.extend(rows)
        # a single flat loop needs only one early-stop check
        if max_pdfs and pdf_count >= max_pdfs:
            print(f"⏸️ Stopping early after {pdf_count} PDFs")
            break

    with open(output_csv, "w", newline="", encoding="utf-8") as f:
        fieldnames = ["Incident #", "Date", "Type", "Location", "Arrested", "Name", "DOB", "Charges"]
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(all_rows)
    print(f"\n✅ CSV created at: {output_csv}")


In [18]:
import os

# os.getcwd() is the kernel's working directory; it equals the notebook's folder
# only when the kernel was started there.
notebook_dir = os.getcwd()
# <cwd>/../data/pdfs — the directory whose sub-folders hold the PDF files
PDF_DATA_DIR = os.path.join(notebook_dir, '..', 'data', 'pdfs')
# the output CSV is written into the same directory that is scanned for input
parsed_csv_path = os.path.join(PDF_DATA_DIR, 'new_lawrence_2023_2024.csv')

# Run for all PDFs. To stop after a fixed number of files (e.g. for a quick
# trial run), pass the optional `max_pdfs` argument as a third parameter.
parse_all_pdfs_to_csv(PDF_DATA_DIR, parsed_csv_path)


📄 Parsing: 04-01-2023.pdf
⚠️ pdfplumber output unreadable, using OCR fallback: 04-01-2023.pdf
📄 Parsing: 04-02-2023.pdf
⚠️ pdfplumber output unreadable, using OCR fallback: 04-02-2023.pdf
📄 Parsing: 04-03-2023.pdf
📄 Parsing: 04-04-2023.pdf
📄 Parsing: 04-05-2023.pdf
📄 Parsing: 04-06-2023.pdf
📄 Parsing: 04-07-2023.pdf
📄 Parsing: 04-08-2023.pdf
📄 Parsing: 04-10-2023.pdf
📄 Parsing: 04-11-2023.pdf
📄 Parsing: 04-12-2023.pdf
📄 Parsing: 04-13-2023.pdf
📄 Parsing: 04-14-2023.pdf
📄 Parsing: 04-15-2023.pdf
📄 Parsing: 04-17-2023.pdf
📄 Parsing: 04-18-2023.pdf
📄 Parsing: 04-20-2023.pdf
📄 Parsing: 04-21-2023.pdf
📄 Parsing: 04-22-2023.pdf
📄 Parsing: 04-23-2023.pdf
📄 Parsing: 04-24-2023.pdf
📄 Parsing: 04-25-2023.pdf
📄 Parsing: 04-26-2023.pdf
📄 Parsing: 04-27-2023.pdf
📄 Parsing: 04-28-2023.pdf
📄 Parsing: 04-29-2023.pdf
📄 Parsing: 04-30-2023.pdf
📄 Parsing: 08-01-2023.pdf
⚠️ pdfplumber output unreadable, using OCR fallback: 08-01-2023.pdf
📄 Parsing: 08-02-2023.pdf
⚠️ pdfplumber output unreadable, using OCR

In [20]:
import os

# os.getcwd() is the kernel's working directory; it equals the notebook's folder
# only when the kernel was started there.
notebook_dir = os.getcwd()
# NOTE(review): this is the same input directory as the 2023-2024 run cell, yet
# the output below is named for 2018-2022 — presumably the contents of
# data/pdfs were swapped between runs; confirm, and consider one input folder
# per year range so the notebook can be re-run from a fresh kernel.
PDF_DATA_DIR = os.path.join(notebook_dir, '..', 'data', 'pdfs')
parsed_csv_path = os.path.join(PDF_DATA_DIR, 'lawrence_2018_to_2022.csv')

# Run for all PDFs. To stop after a fixed number of files (e.g. for a quick
# trial run), pass the optional `max_pdfs` argument as a third parameter.
parse_all_pdfs_to_csv(PDF_DATA_DIR, parsed_csv_path)


📄 Parsing: 04-01-2018.pdf
📄 Parsing: 04-02-2018.pdf
📄 Parsing: 04-03-2018.pdf
📄 Parsing: 04-04-2018.pdf
📄 Parsing: 04-05-2018.pdf
📄 Parsing: 04-06-2018.pdf
📄 Parsing: 04-07-2018.pdf
📄 Parsing: 04-08-2018.pdf
📄 Parsing: 04-09-2018.pdf
📄 Parsing: 04-10-2018.pdf
📄 Parsing: 04-11-2018.pdf
📄 Parsing: 04-12-2018.pdf
📄 Parsing: 04-13-2018.pdf
📄 Parsing: 04-14-2018.pdf
📄 Parsing: 04-15-2018.pdf
📄 Parsing: 04-16-2018.pdf
📄 Parsing: 04-17-2018.pdf
📄 Parsing: 04-19-2018.pdf
📄 Parsing: 04-20-2018.pdf
📄 Parsing: 04-21-2018.pdf
📄 Parsing: 04-22-2018.pdf
📄 Parsing: 04-23-2018.pdf
📄 Parsing: 04-24-2018.pdf
📄 Parsing: 04-25-2018.pdf
📄 Parsing: 04-26-2018.pdf
📄 Parsing: 04-27-2018.pdf
📄 Parsing: 04-28-2018.pdf
📄 Parsing: 04-29-2018.pdf
📄 Parsing: 04-30-2018.pdf
📄 Parsing: 08-01-2018.pdf
📄 Parsing: 08-02-2018.pdf
📄 Parsing: 08-03-2018.pdf
📄 Parsing: 08-04-2018.pdf
📄 Parsing: 08-05-2018.pdf
📄 Parsing: 08-06-2018.pdf
📄 Parsing: 08-07-2018.pdf
📄 Parsing: 08-08-2018.pdf
📄 Parsing: 08-09-2018.pdf
📄 Parsing: 0