In [28]:
import os, io, re, time, random, csv, sys
import requests
import pandas as pd
from bs4 import BeautifulSoup
from pypdf import PdfReader
from dotenv import load_dotenv
from data_loading import build_cap_dataset, court_listener_cleaner
load_dotenv()

ImportError: cannot import name 'court_listener_cleaner' from 'data_loading' (/Users/ilyadavidson/Stanford_Internship/judge_project/data_loading.py)

In [17]:
judge_info = pd.read_csv('data/judge_info.csv')
cases = build_cap_dataset()

Working dir: /Users/ilyadavidson/Stanford_Internship/judge_project
Found 28 parquet files for pattern: data/parquet_files/CAP_data_*.parquet


In [None]:
# Configuration and set-up
###########################################################################
TOKEN               = os.getenv("COURTLISTENER_TOKEN")  
USER_AGENT          = os.getenv("COURTLISTENER_USER_AGENT")  
TOP_N               = 65000               # how many cases to retrieve
PAUSE               = 0.5                 # polite pacing between calls (second between each call)
DEBUG               = False               # set True to print sample keys

SEARCH_URL          = "https://www.courtlistener.com/api/rest/v4/search/"
CLUSTER_URL_TMPL    = "https://www.courtlistener.com/api/rest/v4/clusters/{id}/"
OUTPUT_CSV          = "third_circuit_on_appeal.csv"

if not TOKEN:
    sys.exit("Missing COURTLISTENER_TOKEN in environment (.env).")

session = requests.Session()
session.headers.update({
    "Authorization": f"Token {TOKEN}",
    "User-Agent": USER_AGENT,
})


RETRYABLE = {429, 500, 502, 503, 504}

# In case of error continue parcing
def get_json(url, params=None, timeout=60, max_attempts=7, base_delay=0.8):
    last_err = None
    for attempt in range(1, max_attempts + 1):
        try:
            r = session.get(url, params=params, timeout=timeout)
            if r.status_code in RETRYABLE:
                if r.status_code == 429:
                    ra = r.headers.get("Retry-After")
                    if ra:
                        try:
                            time.sleep(float(ra))
                        except Exception:
                            pass
                raise requests.HTTPError(f"retryable {r.status_code}", response=r)
            r.raise_for_status()
            return r.json()
        except (requests.Timeout, requests.ConnectionError, requests.HTTPError) as e:
            last_err = e
            # stop on non-retryable HTTP codes
            if isinstance(e, requests.HTTPError) and e.response is not None and e.response.status_code not in RETRYABLE:
                raise
            sleep_s = min(base_delay * (2 ** (attempt - 1)) + random.uniform(0, 0.5), 20.0)
            time.sleep(sleep_s)
    raise last_err

def strip_html_to_text(html):
    if not html:
        return ""
    soup = BeautifulSoup(html, "html.parser")
    for bad in soup(["script", "style"]):
        bad.decompose()
    text = soup.get_text("\n")
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()

def pdf_all_pages(pdf_bytes: bytes) -> str:
    reader = PdfReader(io.BytesIO(pdf_bytes))
    out = []
    for i in range(len(reader.pages)):
        try:
            out.append(reader.pages[i].extract_text() or "")
        except Exception:
            out.append("")
    return "\n\f\n".join(out).strip()

def first10_last10_pages_or_all(text: str) -> str:
    """Return first 10 + last 10 pages; if <20 pages, return full text.
       If no page breaks, fallback to first/last 1000 tokens."""
    if not text:
        return ""
    if "\f" in text:
        pages = text.split("\f")
        if len(pages) <= 20:
            return "\n\f\n".join(pages).strip()
        return "\n\f\n".join(pages[:10] + pages[-10:]).strip()
    # Fallback by tokens if no page markers
    toks = re.findall(r"\S+", text)
    if len(toks) <= 2000:
        return " ".join(toks)
    return " ".join(toks[:1000] + ["..."] + toks[-1000:])

# ---------- Field helpers ----------
def resolve_cluster_id(hit: dict) -> str | None:
    if hit.get("cluster_id"):
        return str(hit["cluster_id"])
    cu = hit.get("cluster")
    if isinstance(cu, str) and "/clusters/" in cu:
        return cu.rstrip("/").split("/")[-1]
    return None

def get_docket_number_from_cluster(cluster_json: dict) -> str:
    d_url = cluster_json.get("docket")
    if not d_url:
        return ""
    d = get_json(d_url)
    return d.get("docket_number") or d.get("docket_number_core") or ""

def extract_case_name(cluster_json: dict) -> str:
    for k in ("case_name_full", "case_name"):
        if cluster_json.get(k):
            return (cluster_json[k] or "").strip()
    return ""

def get_combined_text(cluster_json: dict) -> str:
    # Cluster-level plain text first
    for k in ("plain_text", "plain_text_with_citations"):
        if cluster_json.get(k):
            return cluster_json[k]
    # Otherwise first sub-opinion -> text/html/pdf
    sub_ops = cluster_json.get("sub_opinions") or cluster_json.get("opinions") or []
    if sub_ops:
        first = sub_ops[0]
        op_url = first if isinstance(first, str) else first.get("resource_uri") or first.get("id")
        if op_url:
            op = get_json(op_url)
            for k in ("plain_text", "plain_text_with_citations"):
                if op.get(k):
                    return op[k]
            for k in ("html_with_citations", "html"):
                if op.get(k):
                    return strip_html_to_text(op[k])
            pdf_url = op.get("download_url")
            if pdf_url:
                pr = session.get(pdf_url, timeout=120)
                if pr.ok:
                    return pdf_all_pages(pr.content)
    return ""

# ---------- Fetch clusters (paged; supports TOP_N) ----------
def iter_cluster_ids(top_n: int | None):
    params = {
        "court": "ca3",
        "type": "o",
        "q": '"On appeal from"',   # phrase filter; can appear anywhere
        "order_by": "dateFiled desc",
        "page_size": 100,
    }
    url = SEARCH_URL
    yielded = 0
    while url:
        data = get_json(url, params=params)
        hits = data.get("results", [])
        for hit in hits:
            cid = resolve_cluster_id(hit)
            if cid:
                yield cid
                yielded += 1
                if isinstance(top_n, int) and yielded >= top_n:
                    return
            time.sleep(PAUSE)
        url = data.get("next")
        params = None  # only send params on first call

# ---------- Incremental writer / resume support ----------
def read_already_done_ids(path: str) -> set[str]:
    if not os.path.exists(path):
        return set()
    done = set()
    try:
        with open(path, newline="", encoding="utf-8") as f:
            r = csv.DictReader(f)
            if "cluster_id" in r.fieldnames:
                for row in r:
                    done.add(str(row.get("cluster_id", "")).strip())
    except Exception:
        pass
    return done

def append_row(path: str, row: dict, header_fields: list[str]):
    file_exists = os.path.exists(path)
    with open(path, "a", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=header_fields)
        if not file_exists:
            w.writeheader()
        w.writerow(row)

# ---------- Main ----------
def main(top_n: int | None = TOP_N):
    header = ["cluster_id", "case_name", "docket_number", "combined_preview"]
    done_ids = read_already_done_ids(OUTPUT_CSV)
    print(f"[info] resuming; {len(done_ids)} rows already in {OUTPUT_CSV}")

    processed_this_run = 0
    for cid in iter_cluster_ids(top_n):
        if cid in done_ids:
            continue
        try:
            cluster     = get_json(CLUSTER_URL_TMPL.format(id=cid))
            case_name   = extract_case_name(cluster)
            docket_no   = get_docket_number_from_cluster(cluster)
            full_text   = get_combined_text(cluster)

            # OPTIONAL client-side sanity check: ensure phrase exists somewhere
            # (server already filtered, but we double-check)
            if "on appeal from" not in (full_text or "").lower():
                # skip if for some reason phrase isn't in combined text
                continue

            preview = first10_last10_pages_or_all(full_text)

            row = {
                "cluster_id":       cid,
                "case_name":        case_name,
                "docket_number":    docket_no,
                "combined_preview": preview,
            }
            append_row(OUTPUT_CSV, row, header)
            done_ids.add(cid)
            processed_this_run += 1

            if isinstance(top_n, int) and processed_this_run >= top_n:
                break

            time.sleep(PAUSE)
        except Exception as e:
            print(f"[warn] skipping cluster {cid}: {e}")
            time.sleep(PAUSE * 2)
            continue

    print(f"[done] wrote/updated {len(done_ids)} total rows in {OUTPUT_CSV}")

if __name__ == "__main__":
    main(TOP_N)

[info] resuming; 0 rows already in third_circuit_on_appeal.csv
[done] wrote/updated 100 total rows in third_circuit_on_appeal.csv


In [2]:
df = pd.read_csv("third_circuit_on_appeal.csv")

In [3]:
len(df)

3066

In [4]:
df.sample(1)['combined_preview'].iloc[0]

'271 F.3d 140 (3rd Cir. 2001) OXFORD ASSOCIATES; HPC ASSOCIATES; KING OF PRUSSIA ARMS; SUSSEX GWYNEDD LTD. INC.; LAKESIDE APTS. ASSOC.; CURREN PARTNERSHIP; WHITPAIN ASSOCIATES; TOWNE COURTS APARTMENTS; MILL CREEK ASSOCIATES; WYNDON ASSOC.; ONE HUNDRED ONE ASSOC.; LLANBERIS ASSOCIATES; WHITEHALL; HAVERFORD AVENUE ASSOCIATES; 113 CRICKET ASSOCIATES; BRYNWOOD INVESTORS LP; MERION COURT INVESTORS LP; PLACE ONE APARTMENT ASSOCIATES, L.P.; KBF ASSOCIATES L.P., T/A KINGSWOOD APARTMENTS; TIMBERLAKE APARTMENT ASSOCIATES, L.P.; NORRISTOWN ASSOCIATES; CEDARBROOK HOLDINGS; HAROLD MELTZER; EVELYN MELTZER; NOBLE RYDAL ASSOCIATES; VILLAGE GREEN ASSOC. LP; DEKALB ASSOCIATES; GLEN ASSOCIATES; ETON INVESTMENTS LP; VALLEY FORGE TOWERS APARTMENTS NORTH LP; CHELBOURNE PLAZA CONDOMINIUM ASSOCIATION; ELKINS COURT CONDOMINIUM ASSOCIATION; REGENCY TOWERS APARTMENT ASSOCIATES LP; FOXCROFT SQUARE APARTMENT ASSOCIATES LP; GREEN VALLEY MANOR ASSOC.; GUNTRAM WEISSENBERGER; THE WOODS ASSOC.; ROBERT KEENEY; PATRICIA 

In [5]:
import re
import unicodedata
import pandas as pd

# ---------- patterns ----------
_PAT_JUDGE_LINE = re.compile(r'(?is)District\s+Judge:\s*([^\r\n]+)')
_PAT_COURT_LINE = re.compile(r'(?is)On\s+Appeal\s+from.*?\n\s*(for\s+the[^\n(]+)')


# ----- patterns -----
_TITLES = re.compile(r'(?i)^\s*(the\s+honorable|hon\.?|honorable|chief)\s+')
_MARKERS = re.compile(r'[\*\u2020\u2021]')  # * † ‡ anywhere
# parentheticals we consider "junk" (remove the whole (...) group)
_PAREN_JUNK = re.compile(
    r'(?is)\(\s*(?:ret\.?|retired|senior(?:\s+judge)?|emeritus|by\s+designation|pro\s*tem|visiting|acting)[^)]*\)'
)
# trailing suffixes like Jr./Sr./III/etc
_SUFFIXES = re.compile(r'(?i)[,\s]+(jr\.?|sr\.?|junior|senior|ii|iii|iv|v)\s*$')

def _fold(s: str) -> str:
    s = unicodedata.normalize('NFKD', s or '')
    return ''.join(ch for ch in s if not unicodedata.combining(ch))

def _clean_full_name_keep_suffix(s: str) -> str:
    """
    Remove titles, footnote markers, and parenthetical qualifiers like (Ret.),
    keep core name (with middle names/initials), normalize spaces.
    """
    s = (s or '').strip()
    if not s:
        return ''
    s = _TITLES.sub('', s)          # drop Hon., The Honorable, Chief
    s = _MARKERS.sub('', s)         # drop *, †, ‡ anywhere
    s = _PAREN_JUNK.sub('', s)      # drop (Ret.), (Senior Judge), (by designation), etc.
    s = re.sub(r'\s+', ' ', s).strip(' ,;')
    return s

def _is_initial(tok: str) -> bool:
    return bool(re.fullmatch(r"[A-Za-z]\.", tok))

def _clean_tokens(name: str) -> list[str]:
    """
    Split into tokens; keep letters with optional apostrophes/hyphens (O'Neil, Smith-Jones).
    Remove stray punctuation/digits. Fold accents for robustness.
    """
    name = name.replace(',', ' ')
    toks = name.split()
    out = []
    for t in toks:
        t = _fold(t).strip()
        if not t:
            continue
        keep = re.sub(r"[^A-Za-z'\-]", '', t).strip("-'")
        if keep:
            out.append(keep)
    return out

def _last_name_only(full: str) -> str:
    """Robust last name: clean → remove trailing suffixes → last non-initial token (lowercased)."""
    if not isinstance(full, str) or not full.strip():
        return ''
    s = _clean_full_name_keep_suffix(full)
    s = _SUFFIXES.sub('', s).strip()
    toks = _clean_tokens(s)
    for t in reversed(toks):
        if not _is_initial(t) and len(t) >= 2:
            return t.lower()
    return ''

def _first_name_guess(full: str) -> str:
    """Robust first name: clean → remove trailing suffixes → first non-initial token (lowercased)."""
    if not isinstance(full, str) or not full.strip():
        return ''
    s = _clean_full_name_keep_suffix(full)
    s = _SUFFIXES.sub('', s).strip()
    toks = _clean_tokens(s)
    for t in toks:
        if not _is_initial(t) and len(t) >= 2:
            return t.lower()
    return ''
def _extract_court_name(text: str) -> str:
    """Pull the 'for the …' line after the first 'On Appeal from …'."""
    m = _PAT_COURT_LINE.search(text or "")
    if not m:
        return ""
    court = m.group(1).strip()
    return court if court.endswith('.') else court + '.'

def _extract_district_judge_full(text: str) -> str:
    """First 'District Judge:' line → clean full name (keep suffix)."""
    m = _PAT_JUDGE_LINE.search(text or "")
    if not m:
        return ""
    return _clean_full_name_keep_suffix(m.group(1))

def _resolve_judge_id(dj_full: str, court_name: str, judges_info: pd.DataFrame):
    """Map to judges_info using last name → first name → court contains."""
    if judges_info is None or judges_info.empty:
        return pd.NA

    # Build lowercase helper columns once (idempotent)
    ji = judges_info.copy()
    if 'last_name_lc' not in ji.columns:
        ji['last_name_lc']  = ji['last name'].fillna('').str.strip().str.lower()
        ji['first_name_lc'] = ji['first name'].fillna('').str.strip().str.lower()
        ji['court_name_lc'] = ji['court name'].fillna('').str.strip().str.lower()
        # normalize judge id to nullable Int64
        ji['judge id'] = pd.to_numeric(ji['judge id'], errors='coerce').astype('Int64')

    last_lc  = _last_name_only(dj_full)
    first_lc = _first_name_guess(dj_full)
    court_lc = (court_name or '').strip().lower()

    # 1) by last name
    cand = ji[ji['last_name_lc'] == last_lc]
    if len(cand) == 0:
        return pd.NA

    # 2) by first name (if needed)
    if len(cand) > 1 and first_lc:
        cand = cand[cand['first_name_lc'] == first_lc]

    # 3) by court contains (if still ambiguous)
    if len(cand) > 1 and court_lc:
        in_df_to_ji = cand['court_name_lc'].fillna('').str.contains(court_lc, na=False)
        in_ji_to_df = cand['court_name_lc'].fillna('').apply(lambda s: court_lc in s)
        cand = cand[in_df_to_ji | in_ji_to_df]

    if len(cand) == 1:
        return cand['judge id'].iloc[0]
    return pd.NA

def extract_info(text: str, judges_info: pd.DataFrame | None = None) -> dict:
    """
    Parse one opinion's text and return:
      - district_judge (full, suffix kept),
      - district_judge_clean (last name only, lowercase),
      - court_name (the 'for the …' line),
      - judge id (nullable Int64 from judges_info if provided)
    """
    dj_full = _extract_district_judge_full(text)
    dj_clean = _last_name_only(dj_full)
    court = _extract_court_name(text)
    jid = _resolve_judge_id(dj_full, court, judges_info) if judges_info is not None else pd.NA

    return {
        "district_judge": dj_full,
        "district_judge_clean": dj_clean,
        "court_name": court,
        "judge id": jid,
    }

def normalize_judge_id_column(df, col="judge id"):
    # Grab all columns named exactly "judge id"
    block = df.loc[:, df.columns == col]

    if block.shape[1] == 0:
        # Column doesn't exist yet — create an empty nullable Int64 column
        df[col] = pd.Series(pd.array([pd.NA] * len(df), dtype="Int64"), index=df.index)
        return df

    # If multiple "judge id" columns, collapse to a single series by taking first non-null per row
    if block.shape[1] > 1:
        s = block.bfill(axis=1).iloc[:, 0]
        # Optionally, drop the extra duplicate columns, keeping just one
        dup_cols = block.columns.tolist()[1:]
        df.drop(columns=dup_cols, inplace=True)
        df[col] = s
    else:
        # Exactly one column — convert it to a 1-D Series
        df[col] = block.iloc[:, 0]

    # Finally, coerce to nullable Int64 so you get 3455 (not 3455.0)
    df[col] = pd.to_numeric(df[col], errors="coerce").astype("Int64")
    return df

In [6]:
import re, unicodedata, pandas as pd

def _fold(s: str) -> str:
    s = unicodedata.normalize('NFKD', s or '')
    return ''.join(ch for ch in s if not unicodedata.combining(ch))

def _letters_only_key(s: str) -> str:
    """Fold accents, lowercase, strip non-letters → robust key for exact matches."""
    return re.sub(r'[^a-z]', '', _fold(s).lower())

def _first_token_key(s: str) -> str:
    """
    Take only the FIRST real token of the 'first name' field.
    Handles 'Karen M.', 'Karen M', 'Karen Marie' → 'karen'.
    """
    s = (s or '').strip()
    if not s:
        return ''
    # split on whitespace, commas; take first non-empty
    tok = next((t for t in re.split(r'[\s,]+', s) if t), '')
    # drop trailing period if it’s an initial like 'M.'
    tok = re.sub(r'\.$', '', tok)
    return _letters_only_key(tok)

# --- court canonicalization ---
_STOP_PHRASES = [
    r'\bthe\b', r'\bfor\b', r'\bof\b', r'\bthe\b',
    r'united\s+states\s+district\s+court', r'u\.s\.\s+district\s+court',
    r'united\s+states\s+court\s+of\s+appeals', r'court\s+of\s+appeals',
    r'district\s+court', r'circuit\s+court'
]
_STOP_RE = re.compile('|'.join(_STOP_PHRASES), flags=re.I)

def _canon_court(s: str) -> str:
    """
    Canonicalize court strings to make contains-matching reliable:
    - lowercase, fold accents
    - remove common boilerplate ('United States District Court', 'for', 'the', etc.)
    - collapse whitespace/punct
    """
    s = _fold(s).lower()
    s = _STOP_RE.sub(' ', s)
    s = re.sub(r'[\(\)\.,;:]', ' ', s)
    s = re.sub(r'\s+', ' ', s).strip()
    return s  # e.g. 'western district of pennsylvania' → 'western pennsylvania'

def _ensure_helper_cols(ji: pd.DataFrame) -> pd.DataFrame:
    ji = ji.copy()
    # Expect exact columns: "judge id", "last name", "first name", "court name"
    # Build robust matching keys ONCE
    if 'last_name_key' not in ji.columns:
        # normalize nulls
        for col in ("last name", "first name", "court name"):
            if col in ji.columns:
                ji[col] = ji[col].fillna('')
            else:
                ji[col] = ''
        # keys
        ji['last_name_key']  = ji['last name'].map(_letters_only_key)
        ji['first_name_key'] = ji['first name'].map(_first_token_key)
        ji['court_key']      = ji['court name'].map(_canon_court)
        # id dtype
        ji['judge id'] = pd.to_numeric(ji['judge id'], errors='coerce').astype('Int64')
    return ji

def _resolve_judge_id(dj_full: str, court_name: str, judges_info: pd.DataFrame):
    """
    New matching pipeline (more forgiving):
      1) last name exact (letters-only)
      2) if >1 → first name exact on FIRST TOKEN
      3) if >1 → court canonical contains either direction
      4) if 0 at any step → progressively relax (drop first-name filter, then court)
    """
    if judges_info is None or judges_info.empty:
        return pd.NA

    ji = _ensure_helper_cols(judges_info)

    # Build keys from the parsed/cleaned judge full name and court
    last_key  = _letters_only_key(_last_name_only(dj_full))
    first_key = _first_token_key(_first_name_guess(dj_full))
    court_key = _canon_court(court_name or '')

    # 1) last name
    cand = ji[ji['last_name_key'] == last_key]
    if len(cand) == 0:
        return pd.NA

    # 2) first name (first-token match)
    if first_key:
        cand_first = cand[cand['first_name_key'] == first_key]
        if len(cand_first) == 1:
            return cand_first['judge id'].iloc[0]
        # if 0 left after first-name filter, RELAX: skip first-name filter (maybe data has middle initial in 'first name')
        cand2 = cand_first if len(cand_first) > 0 else cand
    else:
        cand2 = cand

    # 3) court contains (either direction) on canonicalized text
    if court_key:
        mask = cand2['court_key'].str.contains(court_key, na=False) | cand2['court_key'].apply(lambda s: court_key in s)
        cand3 = cand2[mask]
        if len(cand3) == 1:
            return cand3['judge id'].iloc[0]
        if len(cand3) > 1:
            # still ambiguous: pick the first deterministically
            return cand3['judge id'].iloc[0]

    # Fallbacks:
    if len(cand2) == 1:
        return cand2['judge id'].iloc[0]
    if len(cand) == 1:
        return cand['judge id'].iloc[0]

    # As a final tie, return NA (or choose the first to force a result)
    return pd.NA

In [9]:
judge_info = pd.read_csv('data/judge_info.csv')

In [11]:
TEXT_COL = "combined_preview"   # change to 'preview_text' if that's your column

# Apply row-wise and expand the dict into columns
extracted = df[TEXT_COL].apply(lambda t: extract_info(t, judge_info))
extracted_df = pd.DataFrame(list(extracted))

# Attach to your df
df = pd.concat([df, extracted_df], axis=1)
df = normalize_judge_id_column(df, col="judge id")

# Make sure judge id is a nice nullable integer
df['judge id'] = pd.to_numeric(df['judge id'], errors='coerce').astype('Int64')

In [13]:
court_listener = df[df['judge id'].notna()]

In [18]:
court_listener

Unnamed: 0,cluster_id,case_name,docket_number,combined_preview,district_judge,district_judge_clean,court_name,judge id
0,10679495,United States v. Natalya Shvets,22-2683,PRECEDENTIAL\n\n UNITED STATES COURT OF A...,Eduardo C. Robreno,robreno,for the Eastern District of Pennsylvania.,2033
1,10678447,Bobrick Washroom Equipment Inc v. Scranton Pro...,23-2577,PRECEDENTIAL\n\n UNITED STATES COURT OF...,Robert D. Mariani,mariani,for the Middle District of Pennsylvania.,3397
2,10675432,Robert Sofaly v. Portfolio Recovery Associates...,24-2639,PRECEDENTIAL\n\n UNITED STATES COURT OF APP...,Cathy Bissoon,bissoon,for the Western District of Pennsylvania.,3396
3,10674376,United States v. Xavier Josey,24-1891,PRECEDENTIAL\n\n UNITED STATES COURT O...,Matthew Brann,brann,for the Middle District of Pennsylvania.,3455
4,10673697,United States v. Ben McCormack,24-2500,PRECEDENTIAL\n\n UNITED STATES COURT OF ...,Matthew W. Brann,brann,for the Middle District of Pennsylvania.,3455
...,...,...,...,...,...,...,...,...
2342,769630,Planned Parenthood of Central New Jersey Herbe...,99-5042,220 F.3d 127 (3rd Cir. 2000) PLANNED PARENTHOO...,Anne E. Thompson[Copyrighted Material Omitted]...,court,,13761867
2349,769159,"Lucien B. Calhoun Robin L. Calhoun, Individual...",99-1378,216 F.3d 338 (3rd Cir. 2000) LUCIEN B. CALHOUN...,Louis H. Pollak[Copyrighted Material Omitted] ...,law,,1351
2374,767848,"In Re: Magic Restaurants, Inc. Magic American ...",99-5113,205 F.3d 108 (3rd Cir. 2000) IN RE: MAGIC REST...,"Joseph J. Farnan, Jr. Stephen P. McCarron, Esq...",paca,,1822
2417,766243,"United States of America, Ex Rel. Erdem I. Can...",98-3552,192 F.3d 402 (3rd Cir. 1999) UNITED STATES OF ...,"Donald E. Ziegler, Chief JudgeRobert L. Potter...",court,,13761867


In [69]:
court_listener = df[df['judge id'].notna()]

In [20]:
cl_dockets = court_listener["docket_number"].dropna().unique().tolist()

def has_overlap(docket):
    return any(docket in cl_d or cl_d in docket for cl_d in cl_dockets)

cases["overlap"] = cases["docket_number"].dropna().apply(has_overlap)
overlap_cases = cases[cases["overlap"]]

print(f"Cases with overlapping dockets: {len(overlap_cases)}")

Cases with overlapping dockets: 1048


In [27]:
overlap_cases

Unnamed: 0,name,docket_number,decision_date,court_name,jurisdiction,cite,cites_to,judges_raw,judges_clean,judges_ids,opinion_type,opinion_author_raw,opinion_author_clean,opinion_author_id,opinion_text,unique_id,is_appellate,overlap,unique_id_str,in_appellate_matches_json
0,"THE DELAWARE NATION, a Federally Recognized In...",No. 04-4593,2006-05-04,United States Court of Appeals for the Third C...,U.S.,446 Federal Reporter 3d Series 410,"542 F.Supp. 797, 524 U.S, 103, 164 F.Supp. 107...","Before ROTH, FUENTES, and GARTH, Circuit Judges.","[roth, fuentes, garth]","[2063, 2855, 823]",majority,"ROTH, Circuit. Judge.",roth,2063.0,"OPINION OF THE COURT\nROTH, Circuit. Judge.\nT...",0,1,True,0,False
4949,"In re Ronald HARLAND, Debtor. KV Pharmaceutica...",Bankruptcy No. 98-19324DAS; Adversary No. 99-0028,1999-07-01,United States District Court for the Eastern D...,U.S.,235 West's Bankruptcy Reporter 769,"233 West's Bankruptcy Reporter 334, 511 United...",,[],[],majority,"DAVID A. SCHOLL, Bankruptcy Judge.",,,"OPINION\nDAVID A. SCHOLL, Bankruptcy Judge.\nA...",4949,0,True,4949,False
5078,"In re MAGIC RESTAURANTS, INC.; Magic American ...",No. 99-5113,2000-03-01,United States Court of Appeals for the Third C...,U.S.,205 Federal Reporter 3d Series 108,"680 Federal Reporter 2d Series 277, 202 Federa...","Before: NYGAARD, RENDELL and ROSENN, Circuit J...","[nygaard, rendell, rosenn]","[1790, 1992, 2052]",majority,"ROSENN, Circuit Judge.",rosenn,2052.0,OPINION ANNOUNCING THE JUDGMENT OF THE COURT\n...,5078,1,True,5078,False
5079,"In re MAGIC RESTAURANTS, INC.; Magic American ...",No. 99-5113,2000-03-01,United States Court of Appeals for the Third C...,U.S.,205 Federal Reporter 3d Series 108,"680 Federal Reporter 2d Series 277, 202 Federa...","Before: NYGAARD, RENDELL and ROSENN, Circuit J...","[nygaard, rendell, rosenn]","[1790, 1992, 2052]",dissent,"RENDELL, Circuit Judge,",rendell,1992.0,"RENDELL, Circuit Judge,\ndissenting:\nI respec...",5079,1,True,5079,False
5545,"In re: Gary M. MILLER, Debtor Gary M. Miller, ...",No. 01-2799,2002-08-06,United States Court of Appeals for the Third C...,U.S.,299 Federal Reporter 3d Series 183,"188 Federal Reporter 3d Series 116, 28 U.S.C. ...","Before: SCIRICA and GREENBERG, Circuit Judges,...","[scirica, greenberg]","[2128, 912]",majority,"GREENBERG, Circuit Judge.",greenberg,912.0,"OPINION OF THE COURT\nGREENBERG, Circuit Judge...",5545,1,True,5545,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87420,"UNITED STATES of America v. Marshaun THOMAS, A...",No. 03-4447,2005-07-20,United States Court of Appeals for the Third C...,U.S.,139 F. App'x 371,"407 Federal Reporter 3d Series 193, 407 Federa...","Before: SLOVITER, VAN ANTWERPEN, and COWEN, Ci...","[sloviter, cowen, van antwerpen]","[2208, 520, 2437]",majority,"SLOVITER, Circuit Judge.",sloviter,2208.0,"OPINION\nSLOVITER, Circuit Judge.\nThis matter...",87420,1,True,87420,False
87955,"Alfred DIGIACOMO, Appellant, v. TEAMSTERS PENS...",No. 04-3510,2005-11-15,United States Court of Appeals for the Third C...,U.S.,154 F. App'x 312,"769 Federal Reporter 2d Series 140, 993 Federa...","Before SCIRICA, Chief Judge, ALITO and GARTH, ...","[scirica, alito, garth]","[2128, 26, 823]",majority,"GARTH, Circuit Judge:",garth,823.0,OPINION OF THE COURT DENYING APPELLANT’S APPLI...,87955,1,True,87955,False
92761,"UNITED STATES of America, v. Artega GREEN, App...",No. 06-2468,2008-12-30,United States Court of Appeals for the Third C...,U.S.,304 F. App'x 981,541 Federal Reporter 3d Series 176,,[],[],majority,"ROBERT E. COWEN, Circuit Judge.",cowen,520.0,"ORDER\nROBERT E. COWEN, Circuit Judge.\nThe pa...",92761,1,True,92761,False
99158,"UNITED STATES of America, Appellant v. Michael...",No. 10-3552,2012-08-16,United States Court of Appeals for the Third C...,U.S.,493 F. App'x 265,"911 Federal Reporter 2d Series 72, 977 Federal...","Before: AMBRO, CHAGARES and ALDISERT, Circuit ...","[ambro, chagares, aldisert]","[2850, 3107, 21]",majority,"ALDISERT, Circuit Judge.",aldisert,21.0,"OPINION OF THE COURT\nALDISERT, Circuit Judge....",99158,1,True,99158,False


In [26]:
cases[cases['docket_number']==overlap_cases['docket_number']]

ValueError: Can only compare identically-labeled Series objects

In [23]:
import json
import pandas as pd
from pathlib import Path

# --- 1) Reuse your overlap flags (safe to recompute if needed) ---
# Assumes you already have `court_listener`, `cases`, and cl_dockets defined as you showed.
# cl_dockets = court_listener["docket_number"].dropna().unique().tolist()
# def has_overlap(docket):
#     return any(docket in cl_d or cl_d in docket for cl_d in cl_dockets)
# cases["overlap"] = cases["docket_number"].dropna().apply(has_overlap)

# --- 2) Filter to just the overlapping rows, then to appellate rows if the column exists ---
overlap_cases = cases[cases["overlap"]].copy()

# Ensure unique_id is string for consistent matching
overlap_cases["unique_id_str"] = overlap_cases["unique_id"].astype(str)

# --- 3) Load every plausible appellate id from appellate_matches.json ---
# This is robust to structures like:
#   {"best_matches": [...], "confirmed": [...]}  OR a flat list  OR a dict of dicts.
def collect_ids_from_json(obj, into: set):
    if isinstance(obj, dict):
        for k, v in obj.items():
            lk = str(k).lower()
            # Common key names we used in this project:
            if lk in {"appellate_id", "appellate_unique_id", "unique_id", "id"}:
                # Store as string for consistent comparison
                try:
                    sval = str(v).strip()
                    if sval:
                        into.add(sval)
                except Exception:
                    pass
            collect_ids_from_json(v, into)
    elif isinstance(obj, list):
        for it in obj:
            collect_ids_from_json(it, into)

json_path = Path("results/appellate_matches.json")
with json_path.open("r", encoding="utf-8") as f:
    data = json.load(f)

json_ids = set()
collect_ids_from_json(data, json_ids)

# --- 4) Mark which overlap appellate cases are present in the JSON set ---
overlap_cases["in_appellate_matches_json"] = overlap_cases["unique_id_str"].isin(json_ids)

# Optional: keep just the essentials
result = overlap_cases[["unique_id_str", "in_appellate_matches_json"]].drop_duplicates()
result = result.rename(columns={"unique_id_str": "unique_id"})

# --- 5) Summary + (optional) save to CSV ---
total_overlap = len(overlap_cases)
total_app_overlap = len(overlap_cases)
mapped = int(result["in_appellate_matches_json"].sum())
unmapped = total_app_overlap - mapped

print(f"Overlapping-docket rows (all): {total_overlap}")
print(f"Overlapping-docket appellate rows: {total_app_overlap}")
print(f"Found in appellate_matches.json: {mapped}")
print(f"Not found in appellate_matches.json: {unmapped}")

# Inspect a few rows
print(result.head(20))

# Save if helpful
out_csv = "overlap_appellate_id_mapping.csv"
result.to_csv(out_csv, index=False)
print(f"Wrote: {out_csv}")

Overlapping-docket rows (all): 1048
Overlapping-docket appellate rows: 1048
Found in appellate_matches.json: 0
Not found in appellate_matches.json: 1048
     unique_id  in_appellate_matches_json
0            0                      False
4949      4949                      False
5078      5078                      False
5079      5079                      False
5545      5545                      False
5560      5560                      False
5666      5666                      False
5675      5675                      False
5680      5680                      False
5722      5722                      False
5725      5725                      False
5744      5744                      False
5778      5778                      False
5820      5820                      False
5875      5875                      False
5879      5879                      False
5965      5965                      False
6118      6118                      False
6119      6119                      False
6120   

In [79]:
cases[(cases['is_appellate']==1) & (cases['opinion_author_id'].notna())]

Unnamed: 0,name,docket_number,decision_date,court_name,jurisdiction,cite,cites_to,judges_raw,judges_clean,judges_ids,opinion_type,opinion_author_raw,opinion_author_clean,opinion_author_id,opinion_text,unique_id,is_appellate,overlap
0,"THE DELAWARE NATION, a Federally Recognized In...",No. 04-4593,2006-05-04,United States Court of Appeals for the Third C...,U.S.,446 Federal Reporter 3d Series 410,"542 F.Supp. 797, 524 U.S, 103, 164 F.Supp. 107...","Before ROTH, FUENTES, and GARTH, Circuit Judges.","[roth, fuentes, garth]","[2063, 2855, 823]",majority,"ROTH, Circuit. Judge.",roth,2063.0,"OPINION OF THE COURT\nROTH, Circuit. Judge.\nT...",0,1,True
39,"In the Matter of BOLLINGER CORPORATION, Bankru...",No. 79-1766,1980-02-08,United States Court of Appeals for the Third C...,U.S.,614 Federal Reporter 2d Series 924,"28 U.S.C. § 1291, 460 Federal Reporter 2d Seri...","Before ADAMS, ROSENN and SLOVI-TER, Circuit Ju...","[adams, rosenn, sloviter]","[8, 2052, 2208]",majority,"ROSENN, Circuit Judge.",rosenn,2052.0,"OPINION OF THE COURT\nROSENN, Circuit Judge.\n...",39,1,False
87,In the Matter of PENN CENTRAL TRANSPORTATION C...,No. 79-2651,1980-07-11,United States Court of Appeals for the Third C...,U.S.,625 Federal Reporter 2d Series 1112,"7 Lab. L.J. 265, 71 Federal Reporter 2d Series...","Before SEITZ, Chief Judge, and GIBBONS and ROS...","[seitz, gibbons, rosenn]","[2139, 844, 2052]",majority,"GIBBONS, Circuit Judge.",gibbons,844.0,"OPINION OF THE COURT\nGIBBONS, Circuit Judge.\...",87,1,False
91,In the Matter of BECKER'S MOTOR TRANSPORTATION...,No. 79-2796,1980-09-04,United States Court of Appeals for the Third C...,U.S.,632 Federal Reporter 2d Series 242,"11 U.S.C. § 35, 11 U.S.C. § 103, 369 United St...","Before SEITZ, Chief Judge, ADAMS, Circuit Judg...","[seitz, adams]","[2139, 8]",majority,"ADAMS, Circuit Judge.",adams,8.0,"OPINION OF THE COURT\nADAMS, Circuit Judge.\nT...",91,1,False
92,In the Matter of BECKER'S MOTOR TRANSPORTATION...,No. 79-2796,1980-09-04,United States Court of Appeals for the Third C...,U.S.,632 Federal Reporter 2d Series 242,"11 U.S.C. § 35, 11 U.S.C. § 103, 369 United St...","Before SEITZ, Chief Judge, ADAMS, Circuit Judg...","[seitz, adams]","[2139, 8]",concurrence,"SEITZ, Chief Judge,",seitz,2139.0,"SEITZ, Chief Judge,\nconcurring.\nI agree with...",92,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104561,"MEGA CONSTRUCTION CORPORATION, a New Jersey co...",No. 16-2639,2017-04-11,United States Court of Appeals for the Third C...,U.S.,684 F. App'x 196,"417 West's Atlantic Reporter, Second Series 48...","Before: CHAGARES, SCIRICA and FISHER, Circuit ...","[chagares, scirica, fisher]","[3107, 2128, 3047]",majority,"FISHER, Circuit Judge.",fisher,3047.0,"OPINION\nFISHER, Circuit Judge.\nMega Construc...",104561,1,False
104562,"Michael BEENICK, Jr., Appellant v. Michael LEF...",No. 16-3855,2017-04-11,United States Court of Appeals for the Third C...,U.S.,684 F. App'x 200,"180 Federal Reporter 3d Series 558, 477 United...","Before: GREENAWAY, JR., SHWARTZ, GREENBERG, Ci...","[greenaway, shwartz, greenberg]","[911, 3467, 912]",majority,"SHWARTZ, Circuit Judge.",shwartz,3467.0,"OPINION\nSHWARTZ, Circuit Judge.\nMichael Been...",104562,1,False
104563,"Vincent Lionel BENJAMIN, et al.; Terri L. Grif...",Nos. 15-1406; Nos. 15-3496 & 15-3497,2017-03-27,United States Court of Appeals for the Third C...,U.S.,684 F. App'x 207,"747 Federal Reporter 2d Series 863, 26 Federal...","Before: FISHER, KRAUSE and ROTH, Circuit Judges","[fisher, krause, roth]","[3047, 3554, 2063]",majority,"ROTH, Circuit Judge",roth,2063.0,"OPINION\nROTH, Circuit Judge\nAttorneys repres...",104563,1,False
104564,"Jeffrey PODESTA; Street Search, LLC, Appellant...",No. 15-3372,2017-03-27,United States Court of Appeals for the Third C...,U.S.,684 F. App'x 213,"134 S.Ct. 568, 26 Federal Reporter 3d Series 3...","Before: FISHER, VANASKIE, and KRAUSE, Circuit ...","[fisher, vanaskie, krause]","[3047, 2451, 3554]",majority,"KRAUSE, Circuit Judge",krause,3554.0,"OPINION\nKRAUSE, Circuit Judge\nJeffrey Podest...",104564,1,False


In [53]:
mask = df['docket_number'].apply(
    lambda x: cases['docket_number'].astype(str).str.contains(str(x)).any()
)

filtered_df = df[mask]

KeyboardInterrupt: 

In [21]:
#!/usr/bin/env python3
"""
Parallel, rate‑limited CourtListener scraper for Third Circuit opinions
containing the phrase "On appeal from". Produces/updates
`third_circuit_on_appeal.csv` with columns:
  - cluster_id
  - case_name
  - docket_number
  - combined_preview (first/last pages or full text if short)

Key features vs. original:
- Polite parallelism via ThreadPoolExecutor (default 6 workers)
- Global token‑bucket rate limiter (~1.5 req/sec by default)
- Robust retries honoring Retry‑After for 429
- Local JSON cache to avoid re-downloading entities
- Idempotent resume: skips cluster_ids already written to CSV
- Avoids PDFs unless necessary; still supported as last resort

Requirements: requests, bs4, PyPDF2
Environment: COURTLISTENER_TOKEN, COURTLISTENER_USER_AGENT
"""
from __future__ import annotations

import csv
import hashlib
import io
import json
import os
import random
import re
import sys
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from typing import Optional

import requests
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader

# ===================== Configuration =====================
TOKEN               = os.getenv("COURTLISTENER_TOKEN")
USER_AGENT          = os.getenv("COURTLISTENER_USER_AGENT")
TOP_N               = 65000               # cap on cases to retrieve (safe if fewer exist)
WORKERS             = int(os.getenv("CL_WORKERS", 6))
REQS_PER_SEC        = float(os.getenv("CL_RPS", 1.5))  # average allowed RPS
BURST_TOKENS        = int(os.getenv("CL_BURST", 3))    # short burst capacity
DEBUG               = False

SEARCH_URL          = "https://www.courtlistener.com/api/rest/v4/search/"
CLUSTER_URL_TMPL    = "https://www.courtlistener.com/api/rest/v4/clusters/{id}/"
OUTPUT_CSV          = "third_circuit_on_appeal.csv"
CACHE_DIR           = Path(".cl_cache")
CACHE_DIR.mkdir(exist_ok=True)

if not TOKEN:
    sys.exit("Missing COURTLISTENER_TOKEN in environment (.env).")

# Reuse connections with a bigger pool
session = requests.Session()
session.headers.update({
    "Authorization": f"Token {TOKEN}",
    "User-Agent": USER_AGENT or "courtlistener-scraper/1.0 (contact unknown)",
})
adapter = requests.adapters.HTTPAdapter(pool_connections=100, pool_maxsize=100)
session.mount("http://", adapter)
session.mount("https://", adapter)

# ================ Polite rate limiting ===================
RETRYABLE = {429, 500, 502, 503, 504}
_tokens = BURST_TOKENS
_last = time.time()
_lock = threading.Lock()

def _acquire_token():
    """Token bucket acquire. Refills at REQS_PER_SEC; caps at BURST_TOKENS."""
    global _tokens, _last
    while True:
        with _lock:
            now = time.time()
            refill = (now - _last) * REQS_PER_SEC
            if refill > 0:
                _tokens = min(BURST_TOKENS, _tokens + refill)
                _last = now
            if _tokens >= 1:
                _tokens -= 1
                return
        time.sleep(0.03)

# ================ Helpers: HTTP + caching =================

def _cache_path_for(url: str) -> Path:
    h = hashlib.sha1(url.encode("utf-8")).hexdigest()
    return CACHE_DIR / f"{h}.json"

def get_json(url: str, params: Optional[dict] = None, timeout: int = 60, max_attempts: int = 7, base_delay: float = 0.8) -> dict:
    """GET JSON with retries, backoff, and 429 Retry-After support (rate‑limited)."""
    last_err: Optional[Exception] = None
    for attempt in range(1, max_attempts + 1):
        try:
            _acquire_token()
            r = session.get(url, params=params, timeout=timeout)
            if r.status_code in RETRYABLE:
                if r.status_code == 429:
                    ra = r.headers.get("Retry-After")
                    if ra:
                        try:
                            time.sleep(float(ra))
                        except Exception:
                            pass
                raise requests.HTTPError(f"retryable {r.status_code}", response=r)
            r.raise_for_status()
            return r.json()
        except (requests.Timeout, requests.ConnectionError, requests.HTTPError) as e:
            last_err = e
            if isinstance(e, requests.HTTPError) and e.response is not None and e.response.status_code not in RETRYABLE:
                raise
            sleep_s = min(base_delay * (2 ** (attempt - 1)) + random.uniform(0, 0.5), 20.0)
            time.sleep(sleep_s)
    assert last_err is not None
    raise last_err

def get_json_cached(url: str, params: Optional[dict] = None, **kw) -> dict:
    """Cache JSON responses by URL (params are only used for first call in search)."""
    if params:
        # Only the first search call passes params; embed params into a reproducible URL key
        # CourtListener returns a `next` URL we follow without params afterward.
        key = requests.Request("GET", url, params=params).prepare().url
    else:
        key = url
    path = _cache_path_for(key)
    if path.exists():
        try:
            return json.loads(path.read_text(encoding="utf-8"))
        except Exception:
            pass
    data = get_json(url, params=params, **kw)
    try:
        path.write_text(json.dumps(data), encoding="utf-8")
    except Exception:
        pass
    return data

# ================== Text extraction ======================

def strip_html_to_text(html: str | None) -> str:
    if not html:
        return ""
    soup = BeautifulSoup(html, "html.parser")
    for bad in soup(["script", "style"]):
        bad.decompose()
    text = soup.get_text("\n")
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()

def pdf_all_pages(pdf_bytes: bytes) -> str:
    reader = PdfReader(io.BytesIO(pdf_bytes))
    out = []
    for i in range(len(reader.pages)):
        try:
            out.append(reader.pages[i].extract_text() or "")
        except Exception:
            out.append("")
    return "\n\f\n".join(out).strip()

def firstN_lastN_pages_or_all(text: str, n: int = 10) -> str:
    """Return first n + last n pages; if <2n, return full text. If no page breaks, fallback by tokens."""
    if not text:
        return ""
    if "\f" in text:
        pages = text.split("\f")
        if len(pages) <= 2 * n:
            return "\n\f\n".join(pages).strip()
        return "\n\f\n".join(pages[:n] + pages[-n:]).strip()
    toks = re.findall(r"\S+", text)
    if len(toks) <= 2000:
        return " ".join(toks)
    return " ".join(toks[:1000] + ["..."] + toks[-1000:])

# ================== Field helpers ========================

def resolve_cluster_id(hit: dict) -> Optional[str]:
    if hit.get("cluster_id"):
        return str(hit["cluster_id"])
    cu = hit.get("cluster")
    if isinstance(cu, str) and "/clusters/" in cu:
        return cu.rstrip("/").split("/")[-1]
    return None

def get_docket_number_from_cluster(cluster_json: dict) -> str:
    d_url = cluster_json.get("docket")
    if not d_url:
        return ""
    d = get_json_cached(d_url)
    return d.get("docket_number") or d.get("docket_number_core") or ""

def extract_case_name(cluster_json: dict) -> str:
    for k in ("case_name_full", "case_name"):
        if cluster_json.get(k):
            return (cluster_json[k] or "").strip()
    return ""

def fetch_opinion_text_from_sub(first_op_url: str) -> str:
    # Opinion JSON
    op = get_json_cached(first_op_url)
    for k in ("plain_text", "plain_text_with_citations"):
        if op.get(k):
            return op[k]
    for k in ("html_with_citations", "html"):
        if op.get(k):
            return strip_html_to_text(op[k])
    pdf_url = op.get("download_url")
    if pdf_url:
        try:
            _acquire_token()
            pr = session.get(pdf_url, timeout=120)
            if pr.ok:
                return pdf_all_pages(pr.content)
        except Exception:
            return ""
    return ""

def get_combined_text(cluster_json: dict) -> str:
    # Prefer cluster-level text if available
    for k in ("plain_text", "plain_text_with_citations"):
        if cluster_json.get(k):
            return cluster_json[k]
    # Otherwise first sub-opinion -> text/html/pdf
    sub_ops = cluster_json.get("sub_opinions") or cluster_json.get("opinions") or []
    if sub_ops:
        first = sub_ops[0]
        op_url = first if isinstance(first, str) else first.get("resource_uri") or first.get("id")
        if op_url:
            return fetch_opinion_text_from_sub(op_url)
    return ""

# ================== Paging: cluster ids ===================

def iter_cluster_ids(top_n: Optional[int]):
    params = {
        "court": "ca3",              # Third Circuit
        "type": "o",                 # opinions
        "q": '"On appeal from"',      # phrase filter
        "order_by": "dateFiled desc",
        "page_size": 100,
    }
    url = SEARCH_URL
    yielded = 0
    while url:
        data = get_json_cached(url, params=params)
        hits = data.get("results", [])
        for hit in hits:
            cid = resolve_cluster_id(hit)
            if cid:
                yield cid
                yielded += 1
                if isinstance(top_n, int) and yielded >= top_n:
                    return
        url = data.get("next")
        params = None  # send params only on first call

# ================== CSV resume helpers ====================

def read_already_done_ids(path: str) -> set[str]:
    if not os.path.exists(path):
        return set()
    done = set()
    try:
        with open(path, newline="", encoding="utf-8") as f:
            r = csv.DictReader(f)
            if "cluster_id" in r.fieldnames:
                for row in r:
                    done.add(str(row.get("cluster_id", "")).strip())
    except Exception:
        pass
    return done

_write_lock = threading.Lock()

def append_row(path: str, row: dict, header_fields: list[str]):
    file_exists = os.path.exists(path)
    with _write_lock:
        with open(path, "a", newline="", encoding="utf-8") as f:
            w = csv.DictWriter(f, fieldnames=header_fields)
            if not file_exists:
                w.writeheader()
            w.writerow(row)

# ================== Worker: one cluster ===================

def process_one_cluster(cid: str) -> Optional[dict]:
    try:
        cluster = get_json_cached(CLUSTER_URL_TMPL.format(id=cid))
        case_name = extract_case_name(cluster)
        docket_no = get_docket_number_from_cluster(cluster)
        full_text = get_combined_text(cluster)

        if "on appeal from" not in (full_text or "").lower():
            # double-check; skip if phrase not present
            return None

        preview = firstN_lastN_pages_or_all(full_text, n=10)
        row = {
            "cluster_id": cid,
            "case_name": case_name,
            "docket_number": docket_no,
            "combined_preview": preview,
        }
        return row
    except Exception as e:
        if DEBUG:
            print(f"[warn] skipping cluster {cid}: {e}")
        return None

# ================== Main ===================

def main(top_n: Optional[int] = TOP_N):
    header = ["cluster_id", "case_name", "docket_number", "combined_preview"]
    done_ids = read_already_done_ids(OUTPUT_CSV)
    print(f"[info] resuming; {len(done_ids)} rows already in {OUTPUT_CSV}")

    # Precollect up to top_n cluster ids so we can submit to pool
    target_ids = []
    for cid in iter_cluster_ids(top_n):
        if cid in done_ids:
            continue
        target_ids.append(cid)
    if not target_ids:
        print(f"[done] wrote/updated {len(done_ids)} total rows in {OUTPUT_CSV}")
        return

    print(f"[info] processing {len(target_ids)} new clusters with {WORKERS} workers, ~{REQS_PER_SEC} rps")

    processed = 0
    with ThreadPoolExecutor(max_workers=WORKERS) as ex:
        futures = [ex.submit(process_one_cluster, cid) for cid in target_ids]
        for fut in as_completed(futures):
            row = fut.result()
            if row:
                append_row(OUTPUT_CSV, row, header)
                processed += 1
                if DEBUG and processed % 100 == 0:
                    print(f"[info] processed {processed}/{len(target_ids)}")

    total = len(read_already_done_ids(OUTPUT_CSV))
    print(f"[done] wrote/updated {total} total rows in {OUTPUT_CSV} (added {processed})")


if __name__ == "__main__":
    main(TOP_N)


[info] resuming; 100 rows already in third_circuit_on_appeal.csv
[info] processing 3247 new clusters with 6 workers, ~1.5 rps



Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor.




  soup = BeautifulSoup(html, "html.parser")


[done] wrote/updated 3066 total rows in third_circuit_on_appeal.csv (added 2966)


In [23]:
appeals = pd.read_csv("third_circuit_on_appeal.csv")
len(appeals)

3066

In [29]:
import requests

TOKEN = "740692acbd4293e250464b9d55fce035a1473753"
BASE_URL = "https://www.courtlistener.com/api/rest/v4/opinions/"

headers = {
    "Authorization": f"Token {TOKEN}",
    "User-Agent": "JudgePromotions/0.1"
}

params = {"q": "on appeal from", "court": "ca3", "page_size": 1}
r = requests.get(BASE_URL, headers=headers, params=params)
r.raise_for_status()
print(r.json()["count"])

https://www.courtlistener.com/api/rest/v4/opinions/?count=on&court=ca3&page_size=1&q=on+appeal+from


In [36]:
import os, requests

TOKEN = os.environ["COURTLISTENER_TOKEN"]
url = "https://www.courtlistener.com/api/rest/v4/search/"
params = {
    "q": '"on appeal from"',  # exact phrase
    "type": "o",              # opinions
    "court": "ca3",           # Third Circuit
    "page_size": 1            # tiny payload; we only need .count
}
headers = {"Authorization": f"Token {TOKEN}", "User-Agent": "JudgePromotions/0.1"}

r = requests.get(url, params=params, headers=headers)
r.raise_for_status()
print(r.json().get("count", 0))

3347


Unnamed: 0,cluster_id,case_name,docket_number,combined_preview
0,10679495,United States v. Natalya Shvets,22-2683,PRECEDENTIAL\n\n UNITED STATES COURT OF A...
1,10678447,Bobrick Washroom Equipment Inc v. Scranton Pro...,23-2577,PRECEDENTIAL\n\n UNITED STATES COURT OF...
2,10675432,Robert Sofaly v. Portfolio Recovery Associates...,24-2639,PRECEDENTIAL\n\n UNITED STATES COURT OF APP...
3,10674376,United States v. Xavier Josey,24-1891,PRECEDENTIAL\n\n UNITED STATES COURT O...
4,10673697,United States v. Ben McCormack,24-2500,PRECEDENTIAL\n\n UNITED STATES COURT OF ...
...,...,...,...,...
3061,8878276,PETTIT v. BOARD OF CHOSEN FREEHOLDERS OF CAMDE...,,"DALLAS, Circuit Judge. The nature of this case..."
3062,8809954,ROESSING-ERNST CO. v. COAL & COKE BY-PRODUCTS CO.,No. 1885,"WOOLLEY, Circuit Judge. This is an appeal from..."
3063,9341210,VAN CAMP PACKING CO. v. CRUIKSHANKS BROS. CO.,No. 11,"BUTLEB, District Judge. The case is here on ap..."
3064,8766202,THE EAGLE POINT,No. 22,"ACHESON, Circuit Judge. This was a suit in adm..."
