In [1]:
import requests
import csv
import os
import re
import io
import calendar
from PyPDF2 import PdfReader

# Root folder for everything this cell writes: the downloaded PDFs and the
# failure log.
# NOTE(review): this is an absolute, machine-specific Windows path; anyone else
# running the notebook has to edit it first.
DATA_DIR = r"C:\Users\Indel\OneDrive\Documents\Data Science RA\gatewayinitiative-lawrencepd\data"

# Matches a bare MM-DD-YYYY base name such as "01-20-2018" and captures month,
# day and year as groups 1-3. The pattern contains no ".pdf" part, so the
# extension must be stripped before matching (parse_date_from_filename does so).
DATE_PATTERN = re.compile(r"^(\d{2})-(\d{2})-(\d{4})$", re.IGNORECASE)

def download_pdfs(
    # inclusive range of document IDs to try, the URL template each ID is
    # substituted into, and the name of the failure log inside DATA_DIR
    start_id=2274,
    end_id=49898,
    base_url="https://lawpd.com/DocumentCenter/View/{}",
    failure_csv="failures.csv"
):
    """
    Walk document IDs start_id..end_id (inclusive) and save every PDF found.

    For each ID the URL built from base_url is fetched. Responses that are
    not HTTP 200 or whose Content-Type does not mention "pdf" are recorded
    in the failure CSV and skipped. A PDF whose filename parses as
    MM-DD-YYYY is stored under
    DATA_DIR/<year>_law_pd_data/<year>_<monthname>/; any other PDF goes to
    DATA_DIR/no_date/ named from its ID plus a sanitized first-page
    headline (or "no_headline"). Any exception raised while handling an ID
    is written to the failure CSV and the loop moves on.

    The failure CSV is opened in write mode, so each run starts a new log.
    """
    # the root folder must exist before the log can be created inside it
    os.makedirs(DATA_DIR, exist_ok=True)
    log_path = os.path.join(DATA_DIR, failure_csv)

    with open(log_path, mode="w", newline="", encoding="utf-8") as log_file:
        failures = csv.writer(log_file)
        failures.writerow(["ID", "Error"])  # header row

        for file_id in range(start_id, end_id + 1):
            url = base_url.format(file_id)

            try:
                reply = requests.get(url, timeout=10)  # give up after 10 s

                # anything other than 200 is logged and skipped
                status_code = reply.status_code
                if status_code != 200:
                    failures.writerow([file_id, f"HTTP status {status_code}"])
                    continue

                # only keep responses that declare themselves as PDFs
                content_type = reply.headers.get("Content-Type", "").lower()
                if "pdf" not in content_type:
                    failures.writerow([file_id, f"Not a PDF (content-type: {content_type})"])
                    continue

                # server-supplied name if there is one, otherwise "<id>.pdf"
                disposition = reply.headers.get("Content-Disposition", "")
                pdf_name = get_filename_from_content_disposition(disposition) or f"{file_id}.pdf"
                if not pdf_name.lower().endswith(".pdf"):
                    pdf_name = pdf_name + ".pdf"

                year, month, _day = parse_date_from_filename(pdf_name)

                if year is None:
                    # undated file: name it after its first-page headline
                    headline = extract_pdf_headline(reply.content)
                    if headline:
                        safe_headline = sanitize_filename(headline)
                        target_name = f"{file_id}_{safe_headline}.pdf"
                    else:
                        target_name = f"{file_id}_no_headline.pdf"
                    target_dir = os.path.join(DATA_DIR, "no_date")
                else:
                    # dated file: e.g. 2018_law_pd_data/2018_january/01-20-2018.pdf
                    month_name = calendar.month_name[month].lower()
                    target_dir = os.path.join(
                        DATA_DIR, f"{year}_law_pd_data", f"{year}_{month_name}"
                    )
                    target_name = pdf_name

                os.makedirs(target_dir, exist_ok=True)
                with open(os.path.join(target_dir, target_name), "wb") as pdf_file:
                    pdf_file.write(reply.content)

            except Exception as e:
                # network errors, parse errors, disk errors: log and carry on
                failures.writerow([file_id, str(e)])


def get_filename_from_content_disposition(content_disp):
    """
    Extract the filename= value from a Content-Disposition header.

    The parameter name is matched case-insensitively, surrounding double
    quotes are removed, and anything after the value (for example a
    trailing '; size=123' parameter) is ignored. The extended 'filename*='
    form is not interpreted.

    Args:
        content_disp: raw header value; may be empty or None.

    Returns:
        The filename as a string, or None if the header carries no
        non-empty filename= parameter.

    Example header: 'attachment; filename="01-20-2018.pdf"'
    """
    if not content_disp:
        return None

    # Either a double-quoted value, or a bare token that runs to the next ';'.
    match = re.search(
        r'\bfilename\s*=\s*(?:"([^"]*)"|([^;]*))',
        content_disp,
        re.IGNORECASE,
    )
    if match is None:
        return None

    quoted, bare = match.groups()
    if quoted is not None:
        filename = quoted.strip()
    else:
        # tolerate an unbalanced quote left on a bare value
        filename = bare.strip().strip('"')
    return filename or None


def parse_date_from_filename(filename):
    """
    Interpret a filename such as '01-20-2018.pdf' as MM-DD-YYYY.

    The extension is dropped and the remaining base name is matched
    against DATE_PATTERN. Returns (year, month, day) as ints on success,
    or (None, None, None) when the name does not match or the month/day
    falls outside 1-12 / 1-31.
    """
    no_date = (None, None, None)

    stem, _extension = os.path.splitext(filename)
    found = DATE_PATTERN.match(stem)
    if not found:
        return no_date

    month = int(found.group(1))
    day = int(found.group(2))
    year = int(found.group(3))

    # coarse range check only; it does not know how long each month is
    if month < 1 or month > 12 or day < 1 or day > 31:
        return no_date
    return (year, month, day)


def extract_pdf_headline(pdf_bytes):
    """
    Best-effort headline: the first non-blank line of text on page one.

    Args:
        pdf_bytes: the raw bytes of a PDF document.

    Returns:
        The first line of the first page that is non-empty after stripping
        whitespace, or None if the PDF has no pages, the page yields no
        text, or reading the PDF fails.
    """
    try:
        pdf_reader = PdfReader(io.BytesIO(pdf_bytes))
        if len(pdf_reader.pages) == 0:
            return None
        text = pdf_reader.pages[0].extract_text() or ""
    except Exception:
        # Deliberately best-effort: an unreadable PDF just means "no headline".
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit still stop a long-running crawl.
        return None

    # Skip leading blank lines so the result really is the first non-empty one.
    for line in text.splitlines():
        stripped = line.strip()
        if stripped:
            return stripped
    return None


def sanitize_filename(name):
    """
    Turn arbitrary text into something safe to use inside a filename.

    Every run of characters other than ASCII letters, digits, underscore
    or dash is collapsed into a single underscore, and the result is cut
    to at most 50 characters.
    """
    max_length = 50
    unsafe_run = re.compile(r"[^a-zA-Z0-9_\-]+")
    cleaned = unsafe_run.sub("_", name)
    return cleaned[:max_length]

In [2]:
# Crawl the full default ID range. Files land under DATA_DIR, and IDs that
# fail (bad status, non-PDF response, or an exception) are logged to
# failures.csv in that folder.
download_pdfs()

unknown widths : 
[0, IndirectObject(42, 0, 1902545055680)]
unknown widths : 
[0, IndirectObject(46, 0, 1902545055680)]
unknown widths : 
[0, IndirectObject(50, 0, 1902545055680)]
unknown widths : 
[0, IndirectObject(54, 0, 1902545055680)]
unknown widths : 
[0, IndirectObject(58, 0, 1902545055680)]
unknown widths : 
[0, IndirectObject(62, 0, 1902545055680)]
unknown widths : 
[0, IndirectObject(66, 0, 1902545055680)]
unknown widths : 
[0, IndirectObject(70, 0, 1902545055680)]
unknown widths : 
[0, IndirectObject(74, 0, 1902545055680)]
unknown widths : 
[0, IndirectObject(54, 0, 1902545053216)]
unknown widths : 
[0, IndirectObject(58, 0, 1902545053216)]
unknown widths : 
[0, IndirectObject(62, 0, 1902545053216)]
unknown widths : 
[0, IndirectObject(66, 0, 1902545053216)]
unknown widths : 
[0, IndirectObject(70, 0, 1902545053216)]
unknown widths : 
[0, IndirectObject(74, 0, 1902545053216)]
unknown widths : 
[0, IndirectObject(78, 0, 1902545053216)]
unknown widths : 
[0, IndirectObject(82,

## Operation: download only the 2023 and 2024 PDFs

In [9]:
import os
import requests
import re
import calendar
from datetime import datetime

# Output root for this cell: ../data/pdfs relative to the current working
# directory (assumed to be the notebook's own folder — os.getcwd() returns
# wherever the kernel was started; TODO confirm).
notebook_dir = os.getcwd()
PDF_DATA_DIR = os.path.join(notebook_dir, '..', 'data', 'pdfs')

# Full filename in MM-DD-YYYY.pdf form, limited to the years 2023 and 2024;
# groups 1-3 capture month, day and year. Unlike the DATE_PATTERN defined
# earlier in this notebook, the ".pdf" extension is part of the pattern, and
# this assignment replaces that earlier definition once the cell has run.
DATE_PATTERN = re.compile(r"^(\d{2})-(\d{2})-(2023|2024)\.pdf$", re.IGNORECASE)

def get_filename_from_content_disposition(header):
    """
    Extract the filename= value from a Content-Disposition header.

    The parameter name is matched case-insensitively, surrounding double
    quotes are dropped, and any parameters that follow the value are
    ignored. Returns the filename string, or None when the header is empty
    or has no non-empty filename= parameter.
    """
    if not header:
        return None

    # Either a double-quoted value, or a bare token that runs to the next ';'.
    found = re.search(r'\bfilename\s*=\s*(?:"([^"]*)"|([^;]*))', header, re.IGNORECASE)
    if found is None:
        return None

    quoted, bare = found.groups()
    value = quoted if quoted is not None else bare.strip('"; ')
    value = value.strip()
    return value or None

def parse_date_from_filename(filename):
    """
    Parse a full filename such as '01-20-2023.pdf' as MM-DD-YYYY.

    Returns (year, month, day) as ints, or (None, None, None) when the
    name does not match DATE_PATTERN or the month/day is out of range
    (month outside 1-12, day outside 1-31).
    """
    match = DATE_PATTERN.match(filename)
    if not match:
        return None, None, None

    mm, dd, yyyy = int(match.group(1)), int(match.group(2)), int(match.group(3))

    # The regex accepts any two digits. Month 00 would later index
    # calendar.month_name[0], which is the empty string and gives a folder
    # like "2023_"; month 13 and above would fail that lookup outright.
    # Reject those here, together with impossible day numbers.
    if not (1 <= mm <= 12 and 1 <= dd <= 31):
        return None, None, None
    return yyyy, mm, dd

def download_pdfs_2023_2024_fast(start_id=45274, end_id=55000):
    """
    Download the dated 2023/2024 PDFs from the lawpd.com DocumentCenter.

    Walks document IDs start_id..end_id (inclusive). A response is kept only
    if it is HTTP 200, has a PDF content type, and carries a
    Content-Disposition filename ending in ".pdf" that
    parse_date_from_filename accepts; everything else is skipped without
    comment. Kept files are written to
    PDF_DATA_DIR/<year>_<monthname>/<filename>, replacing any existing file
    with the same name.

    Progress is printed every 500 IDs and once per saved file. An exception
    while handling an ID is printed and the crawl continues with the next ID.
    """
    os.makedirs(PDF_DATA_DIR, exist_ok=True)
    total = end_id - start_id + 1

    # One Session for the whole crawl so the connection to the host is reused
    # instead of being set up again for every ID.
    with requests.Session() as session:
        for count, doc_id in enumerate(range(start_id, end_id + 1), start=1):
            if count % 500 == 0:
                print(f"📦 Progress: {count} / {total} checked")

            url = f"https://lawpd.com/DocumentCenter/View/{doc_id}"
            try:
                res = session.get(url, timeout=10)
                if res.status_code != 200 or "pdf" not in res.headers.get("Content-Type", "").lower():
                    continue

                filename = get_filename_from_content_disposition(res.headers.get("Content-Disposition", ""))
                if not filename or not filename.lower().endswith(".pdf"):
                    continue

                year, month, day = parse_date_from_filename(filename)
                if year is None:
                    continue

                # Build folder path: .../data/pdfs/2023_january
                month_folder = f"{year}_{calendar.month_name[month].lower()}"
                save_dir = os.path.join(PDF_DATA_DIR, month_folder)
                os.makedirs(save_dir, exist_ok=True)

                save_path = os.path.join(save_dir, filename)
                with open(save_path, "wb") as f:
                    f.write(res.content)

                print(f"[{count}/{total}] ✅ Saved: {filename}")

            except Exception as exc:
                # Still best-effort (one bad ID must not stop the crawl), but
                # report the failure so missing files can be traced afterwards.
                print(f"[{count}/{total}] ⚠️ Failed ID {doc_id}: {exc}")
                continue


In [10]:
# Run the 2023/2024 crawl over the function's default ID range (45274-55000).
download_pdfs_2023_2024_fast()

📦 Progress: 500 / 9727 checked
[966/9727] ✅ Saved: 01-01-2023.pdf
[972/9727] ✅ Saved: 01-02-2023.pdf
[973/9727] ✅ Saved: 01-03-2023.pdf
[977/9727] ✅ Saved: 01-04-2023.pdf
[978/9727] ✅ Saved: 01-05-2023.pdf
[981/9727] ✅ Saved: 01-06-2023.pdf
[985/9727] ✅ Saved: 01-07-2023.pdf
[988/9727] ✅ Saved: 01-08-2023.pdf
[989/9727] ✅ Saved: 01-09-2023.pdf
[997/9727] ✅ Saved: 01-10-2023.pdf
📦 Progress: 1000 / 9727 checked
[1001/9727] ✅ Saved: 01-11-2023.pdf
[1010/9727] ✅ Saved: 01-12-2023.pdf
[1016/9727] ✅ Saved: 01-13-2023.pdf
[1017/9727] ✅ Saved: 01-14-2023.pdf
[1019/9727] ✅ Saved: 01-16-2023.pdf
[1020/9727] ✅ Saved: 01-15-2023.pdf
[1031/9727] ✅ Saved: 01-17-2023.pdf
[1037/9727] ✅ Saved: 01-18-2023.pdf
[1038/9727] ✅ Saved: 01-19-2023.pdf
[1050/9727] ✅ Saved: 01-20-2023.pdf
[1054/9727] ✅ Saved: 01-22-2023.pdf
[1055/9727] ✅ Saved: 01-23-2023.pdf
[1060/9727] ✅ Saved: 01-24-2023.pdf
[1067/9727] ✅ Saved: 01-25-2023.pdf
[1072/9727] ✅ Saved: 01-26-2023.pdf
[1073/9727] ✅ Saved: 01-27-2023.pdf
[1074/9727]