In [None]:
# Install into the running kernel's environment (%pip, not !pip), quietly, and
# pinned so a fresh runtime reproduces the same parser behaviour.
%pip install -q pdfplumber==0.11.7

Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-5.0.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.9/67.9 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from google.colab import drive
import os
import re
import pdfplumber
import pandas as pd
from pathlib import Path

# Mount Google Drive at /content/drive so the project folder under MyDrive
# can be addressed with ordinary filesystem paths.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Folder that is scanned for *.pdf police reports; the parsed CSV is written
# into this same folder.
pdf_folder_path = '/content/drive/MyDrive/DSC 190 Project/Police Reports'  # <- your path

def read_pdf_lines(pdf_path: Path):
    """Return the text lines of a PDF in page order, with whitespace collapsed.

    Each page's extracted text is split on line breaks; every line is stripped
    and any run of whitespace inside it is replaced by a single space. Blank
    lines are kept (as empty strings) so that relative line offsets are
    preserved. A page whose text extraction returns nothing contributes no
    lines.

    Args:
        pdf_path: Location of the PDF file to read.

    Returns:
        A flat list of normalized line strings covering all pages.
    """
    whitespace_run = re.compile(r"\s+")
    collected = []
    with pdfplumber.open(pdf_path) as document:
        for page in document.pages:
            page_text = page.extract_text() or ""
            collected.extend(
                whitespace_run.sub(" ", raw_line.strip())
                for raw_line in page_text.splitlines()
            )
    return collected

def parse_records(lines):
    """
    Extract incident records from the text lines of a police log.

    Each entry is found by anchoring on a line of the form
    'Date Reported mm/dd/yyyy' (index i) and reading the fixed block around it:
      i-2: Incident type
      i-1: Location
      i  : Date Reported ...
      i+1: Incident/Case# ...
      i+2: Date Occurred ...
      i+3: Time Occurred ...
      i+4: Summary:
      i+5: Disposition:

    An anchor is skipped (no record is emitted) when the block does not fit
    entirely inside `lines`, or when any of lines i+1..i+5 does not start with
    its expected label.

    Args:
        lines: List of text lines (str) from one document.

    Returns:
        A list of dicts, one per matched entry, with the keys
        "Incident type", "Location", "Date Reported", "Incident/Case#",
        "Date Occurred", "Time Occurred", "Summary" and "Disposition".
        All values are stripped strings.
    """
    anchor_re = re.compile(r"^Date Reported\s+(\d{1,2}/\d{1,2}/\d{4})$")
    # (output key, required label) for lines i+1 .. i+5, in that order.
    labelled_fields = (
        ("Incident/Case#", "Incident/Case# "),
        ("Date Occurred", "Date Occurred "),
        ("Time Occurred", "Time Occurred "),
        ("Summary", "Summary:"),
        ("Disposition", "Disposition:"),
    )

    recs = []
    for i, ln in enumerate(lines):
        m = anchor_re.match(ln)
        if not m:
            continue

        # Check both ends explicitly. Relying on IndexError is not enough for
        # the leading lines: with i < 2, lines[i-2] / lines[i-1] are negative
        # indices that silently wrap around to the END of the list and would
        # yield a record with a bogus incident type and location.
        if i < 2 or i + len(labelled_fields) >= len(lines):
            continue

        following = [s.strip() for s in lines[i + 1 : i + 1 + len(labelled_fields)]]
        if not all(
            text.startswith(label)
            for text, (_, label) in zip(following, labelled_fields)
        ):
            continue

        rec = {
            "Incident type": lines[i - 2].strip(),
            "Location": lines[i - 1].strip(),
            "Date Reported": m.group(1),
        }
        for text, (key, label) in zip(following, labelled_fields):
            # The label is a verified prefix, so slicing it off leaves the value.
            rec[key] = text[len(label):].strip()
        recs.append(rec)
    return recs

# Parse every PDF in the configured folder into one DataFrame and save it as CSV.

pdf_dir = Path(pdf_folder_path)

# Fail fast with an actionable message. Globbing a missing folder yields no
# files without complaint, and the run would otherwise only die at the very
# end inside to_csv with "Cannot save file into a non-existent directory".
if not pdf_dir.is_dir():
    raise FileNotFoundError(
        f"PDF folder not found: '{pdf_dir}'. "
        "Check that Google Drive is mounted and that pdf_folder_path is spelled correctly."
    )

pdf_paths = sorted(pdf_dir.glob("*.pdf"))
if not pdf_paths:
    # Not fatal: an empty CSV with headers is still written, but say so.
    print(f"Warning: no PDF files found in '{pdf_dir}'")

# Collect plain dict records first and build the DataFrame once at the end.
all_rows = []
for p in pdf_paths:
    lines = read_pdf_lines(p)
    all_rows.extend(parse_records(lines))

# Explicit column list fixes the column order and keeps the header row even
# when no records were parsed.
df = pd.DataFrame(all_rows, columns=[
    "Incident type",
    "Location",
    "Date Reported",
    "Incident/Case#",
    "Date Occurred",
    "Time Occurred",
    "Summary",
    "Disposition",
])

# The CSV is written next to the source PDFs.
out_csv = str(pdf_dir / "police_logs_parsed_EXACT.csv")
df.to_csv(out_csv, index=False)

print(f"Parsed {len(df)} entries")
print(f"Saved: {out_csv}")

display(df.head(10))


OSError: Cannot save file into a non-existent directory: '/content/drive/MyDrive/DSC 190 Project/Police Reports'