In [220]:
import importlib
import data_loading   # your file, e.g. court_listener_cleaner.py
import helper_functions
importlib.reload(data_loading)

<module 'data_loading' from '/Users/ilyadavidson/Stanford_Internship/judge_project/data_loading.py'>

In [1]:
from data_loading import build_cap_dataset, _load_mapping, court_listener_cleaner, promotion_info_judges
import json
import pandas as pd
from typing import Dict, List, Optional
from api_call import load_case_results
import numpy as np

import pyarrow.dataset as ds
import re, pandas as pd, unicodedata
import pyarrow.compute as pc
import tiktoken
import re
import unicodedata
import statsmodels.api as sm
from typing import Optional
from helper_functions import norm_id
from typing import Dict, List, Optional
import re, unicodedata
from functools import lru_cache

In [2]:
judges = pd.read_csv("data/judge_info.csv")
judges = promotion_info_judges(judges)
cap_data = build_cap_dataset()
cl_data = pd.read_csv('third_circuit_on_appeal.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  aj["nomination date"] = pd.to_datetime(aj["nomination date"], errors="coerce")


Working dir: /Users/ilyadavidson/Stanford_Internship/judge_project
Found 28 parquet files for pattern: data/parquet_files/CAP_data_*.parquet


In [6]:
judges.columns

Index(['judge id', 'last name', 'first name', 'gender', 'ethnicity',
       'birth year', 'birth city', 'birth state', 'death year', 'death city',
       'death state', 'court name', 'court type', 'appointment title',
       'appointing president', 'party of appointing president',
       'nomination date', 'termination date', 'termination', 'aba rating',
       'hearing date', 'judiciary committee action', 'committee action date',
       'senate vote type', 'ayes/nays', 'confirmation date', 'commission date',
       'school 1', 'degree 1', 'school 2', 'degree 2', 'professional career',
       'is_promoted', 'promotion_date'],
      dtype='object')

In [27]:
def cap_data_cleaner(
    cap_df:             pd.DataFrame,
    mapping_path:       str = "results/appellate_matches.json",   # appellate_custom_id -> district_unique_id
    api_path:           str = "batch_runs/api_responses.jsonl",
    *,
    id_col:             str = "unique_id",
    judge_name_col:     str = "opinion_author_clean",
    judge_id_col:       str = "opinion_author_id",
    case_name_col:      str = "name",
    docket_col:         str = "docket_number",
    opinion_text_col:   str = "opinion_text",
    decision_date_col:  str = "decision_date",
) -> pd.DataFrame:
    """
    Cleans CAP data and merges in district judge information and API answers.
    """

    app_to_dct             = _load_mapping(mapping_path)
    map_df                 = pd.DataFrame(list(app_to_dct.items()), columns=["custom_id", "district_uid"]).astype(str)

    # Keep only the original appellate cases present in mapping (keep their appellate metadata)
    out                    = cap_df.copy()
    out[id_col]            = out[id_col].astype(str)
    out                    = out[out[id_col].isin(map_df["custom_id"])].copy()

    # Attach appropriate district judge
    district_lookup = (
        cap_df[[id_col, judge_name_col, judge_id_col]]
        .drop_duplicates(subset=[id_col])
        .rename(columns={id_col:            "district_uid",
                         judge_name_col:    "district judge",
                         judge_id_col:      "district judge id"})
    )
    out = (
        out.merge(map_df, left_on=id_col, right_on="custom_id", how="left")
           .merge(district_lookup, on="district_uid", how="left")
    )

    # Get API answers
    api_answers                 = load_case_results(api_path) 
    api_answers                 = api_answers.copy()
    api_keys: List[str] = [c for c in api_answers.columns if c != "custom_id"]

    out                         = out.merge(api_answers, left_on="custom_id", right_on="custom_id", how="left")
    out[id_col]                 = "CAP_" + out["custom_id"].astype(str)

    base_cols                   = [id_col, case_name_col, opinion_text_col, docket_col, "district judge", "district judge id", decision_date_col]

    exclude_cols                = {"lower_judge_first", "lower_judge_last"}
    api_cols                    = [c for c in api_keys if c in out.columns and c not in exclude_cols]

    keep_cols                   = [c for c in base_cols if c in out.columns] + api_cols
    return out[keep_cols].reset_index(drop=True)

In [28]:
cap_data_clean = cap_data_cleaner(cap_data)

In [29]:
cap_data_clean.sample(3)

Unnamed: 0,unique_id,name,opinion_text,docket_number,district judge,district judge id,decision_date,opinion,case_type,error_category,petitioner_type,respondent_type,politicality_score,profile_level
1829,CAP_55866,"Harry VOGELSTEIN, Trading as Baltimore Poster ...",PER CURIAM.\nUpon review of the record we find...,No. 14056,luongo,1440.0,1962-12-20,affirmed,,,,,1.0,low
4445,CAP_73093,"AMERICAN EAGLE OUTFITTERS, Retail Royalty Comp...","NYGAARD, Circuit Judge,\ndissenting in part.\n...",No. 08-4807,,,2009-09-11,reversed and remanded,contract,legal error,company,company,1.0,medium
558,CAP_51053,In re PHILADELPHIA & READING COAL & IRON CO. A...,"MARIS, Circuit Judge.\nIn the reorganization p...",No. 7129,kalodner,1221.0,1939-06-30,reversed and remanded,bankruptcy,legal error,male,company,1.0,medium


In [None]:
def cl_cleaner(cl_data: pd.DataFrame, judges_info: pd.DataFrame) -> pd.DataFrame:
    """Clean CourtListener data, extract district judge last name + robustly map to judge id."""
    # --- helpers ---
    canon           = lambda s: re.sub(r'[^a-z]', '', unicodedata.normalize('NFKD', str(s or '')).lower())
    first_tok       = lambda s: re.sub(r'\.$', '', next((t for t in str(s).split() if t), '')).lower()
    pat_judge       = re.compile(r'(?is)District\s+Judge:\s*([^\r\n]+)')
    strip           = lambda s: re.sub(r'\s+', ' ', re.sub(r'(?is)(the\s+honorable|hon\.?|chief|\(.*?\)|[\*\u2020\u2021])', '', str(s or ''))).strip(' ,;')

    # Extract district judge names 
    out                         = cl_data.copy()
    out["district judge_full"]  = out["combined_preview"].map(lambda t: strip(pat_judge.search(str(t)).group(1)) if pat_judge.search(str(t)) else '')
    out["district judge"]       = out["district judge_full"].str.split().str[-1].str.lower()

    # Canonicalize judges info
    ji                      = judges_info.copy()
    ji["last_key"]          = ji["last name"].map(canon)
    ji["first_key"]         = ji["first name"].map(lambda x: canon(first_tok(x)))
    ji["court_key"]         = ji["court name"].map(canon)
    ji["jid"]               = pd.to_numeric(ji["judge id"], errors="coerce").astype("Int64")

    # First check by last name match
    out["last_key"] = out["district judge"].map(canon)
    merged          = out.merge(ji[["last_key","first_key","court_key","jid"]], on="last_key", how="left", suffixes=("","_j"))

    # If no match, check first name and court
    def resolve_id(row):
        subset = ji[ji["last_key"] == row["last_key"]]
        if len(subset) == 1:
            return subset["jid"].iloc[0]
        first, court = canon(first_tok(row["district judge_full"])), canon(row.get("court_name",""))
        s2 = subset[subset["first_key"] == first]
        if len(s2) == 1: return s2["jid"].iloc[0]
        if len(s2) > 1:
            s3 = s2[s2["court_key"].str.contains(court, na=False) | s2["court_key"].apply(lambda x: court in x)]
            if len(s3): return s3["jid"].iloc[0]
        return pd.NA

    merged["district judge id"] = merged.apply(resolve_id, axis=1).astype("Int64")

    # Finalize
    merged["unique_id"] = "CL_" + merged["cluster_id"].astype(str)
    merged.rename(columns={"case_name":"name","combined_preview":"opinion_text"}, inplace=True)
    cols = ["unique_id","name","opinion_text","docket_number","district judge","district judge id"]
    
    return merged.loc[merged["district judge id"].notna(), cols].reset_index(drop=True)

In [10]:
cl_clean = cl_cleaner(cl_data, judges)

In [11]:
cl_clean

Unnamed: 0,unique_id,name,opinion_text,docket_number,district judge,district judge id
0,CL_10679495,United States v. Natalya Shvets,PRECEDENTIAL\n\n UNITED STATES COURT OF A...,22-2683,robreno,2033
1,CL_10678447,Bobrick Washroom Equipment Inc v. Scranton Pro...,PRECEDENTIAL\n\n UNITED STATES COURT OF...,23-2577,mariani,3397
2,CL_10675432,Robert Sofaly v. Portfolio Recovery Associates...,PRECEDENTIAL\n\n UNITED STATES COURT OF APP...,24-2639,bissoon,3396
3,CL_10674376,United States v. Xavier Josey,PRECEDENTIAL\n\n UNITED STATES COURT O...,24-1891,brann,3455
4,CL_10673697,United States v. Ben McCormack,PRECEDENTIAL\n\n UNITED STATES COURT OF ...,24-2500,brann,3455
...,...,...,...,...,...,...
1516,CL_770720,Michael Malik Allah v. Thomas Seiverling Rober...,229 F.3d 220 (3rd Cir. 2000) MICHAEL MALIK ALL...,97-3627,casey.,2721
1517,CL_769630,Planned Parenthood of Central New Jersey Herbe...,220 F.3d 127 (3rd Cir. 2000) PLANNED PARENTHOO...,99-5042,court.,13761867
1518,CL_769159,"Lucien B. Calhoun Robin L. Calhoun, Individual...",216 F.3d 338 (3rd Cir. 2000) LUCIEN B. CALHOUN...,99-1378,law,1351
1519,CL_766243,"United States of America, Ex Rel. Erdem I. Can...",192 F.3d 402 (3rd Cir. 1999) UNITED STATES OF ...,98-3552,court.,13761867


In [12]:
cl_clean.sample(3)

Unnamed: 0,unique_id,name,opinion_text,docket_number,district judge,district judge id
983,CL_3036273,Lebanon Farms v. Lebanon,Opinions of the United\n2008 Decisions ...,06-3473,kane,2796
1018,CL_3038704,Wilkerson v. New Media Tech,Opinions of the United\n2008 Decisions ...,07-1305,dalzell,556
290,CL_9301926,United States v. Ernest Dyer,PRECEDENTIAL\n\n UNITED STATES COURT OF AP...,21-3087,rambo,1958


In [184]:
cap_data_clean.sample(3)

Unnamed: 0,unique_id,name,opinion_text,docket_number,district judge,district judge id,opinion,case_type,error_category,petitioner_type,respondent_type,politicality_score,profile_level
5417,CAP_95079,"WESTMONT DEVELOPMENT GROUP, Appellant v. TOWNS...","OPINION OF THE COURT\nDAVIS, District Judge.\n...",No. 09-2885,irenas,1150.0,affirmed,contract,insufficient evidence,company,group of individuals,2.0,low
4369,CAP_72204,Ronald C. HELLER; John R. Flinn; Mathew W. Lin...,"WEIS, Circuit Judge.\nThis appeal is from the ...",No. 05-3687,,,reversed and remanded,tort,legal error,male,group of individuals,2.0,low
748,CAP_51881,ETTELSON et al. v. METROPOLITAN LIFE INS. CO.,"GOODRICH, Circuit Judge.\nRichard Ettelson was...",No. 7933,walker,2482.0,reversed and remanded,contract,legal error,group of individuals,company,1.0,high


In [13]:
def merge_cap_and_cl(cap_data_clean: pd.DataFrame, cl_clean: pd.DataFrame, api_path: str) -> pd.DataFrame:
    """Merge CAP + CL datasets efficiently, fill missing judges, and append new CL rows with API fields."""
    from typing import List
    api_answers = load_case_results(api_path).copy()
    api_answers["custom_id"] = api_answers["custom_id"].astype(str)
    api_answers.set_index("custom_id", inplace=True)
    api_keys: List[str] = [c for c in api_answers.columns if c != "custom_id"]

    cap, cl = cap_data_clean.copy(), cl_clean.copy()
    cap_dockets = cap["docket_number"].dropna().astype(str).unique().tolist()

    def find_overlap_docket(docket):
        for cap_d in cap_dockets:
            if str(docket) in cap_d or cap_d in str(docket):
                return cap_d
        return None

    cl["overlap"] = cl["docket_number"].map(find_overlap_docket)

    # --- 1) Fill missing judge info where overlap exists ---
    overl = cl[cl["overlap"].notna()]
    for _, r in overl.iterrows():
        mask = cap["docket_number"].astype(str).eq(r["overlap"])
        if mask.any():
            if cap.loc[mask, "district judge"].isna().any():
                cap.loc[mask, "district judge"] = r["district judge"]
            if cap.loc[mask, "district judge id"].isna().any():
                cap.loc[mask, "district judge id"] = r["district judge id"]

    # --- 2) Append new CL rows (no overlap) ---
    new_rows = cl[cl["overlap"].isna()].copy()
    new_rows = new_rows.reindex(columns=cap.columns)
    new_rows.loc[:, api_keys] = np.nan  # initialize missing cols

    # Map API data to these new CL rows
    new_rows["custom_id"] = new_rows["unique_id"].str.replace("CL_", "", regex=False)
    for k in api_keys:
        new_rows[k] = new_rows["custom_id"].map(api_answers[k]) if k in api_answers else np.nan

    out = pd.concat([cap, new_rows[cap.columns]], ignore_index=True)

    out = out[(out['opinion'].notna()) & (out['district judge id'].notna())] 
    return out.drop(columns=["custom_id"], errors="ignore")

In [14]:
full_data = merge_cap_and_cl(cap_data_clean, cl_clean, "batch_runs/overlap_outputs/overlap_results.jsonl")

In [15]:
len(full_data)

5589

In [None]:
def compute_overturns(judges: pd.DataFrame, full_data: pd.DataFrame) -> pd.DataFrame:
    """Compute overturn stats only for cases decided ≥3 months before promotion_date."""
    df = full_data.copy()
    df["judge id"] = pd.to_numeric(df["district judge id"], errors="coerce").astype("Int64")
    df["opinion"] = df["opinion"].astype(str).str.lower()
    df["decision_date"] = pd.to_datetime(df["decision_date"], errors="coerce")

    res = []
    for _, j in judges.dropna(subset=["promotion_date"]).iterrows():
        cutoff = pd.Timestamp(j.promotion_date) - pd.DateOffset(months=3)
        subset = df[(df["judge id"] == j["judge id"]) & (df["decision_date"] <= cutoff)]
        appealed = len(subset)
        overturned = (subset["opinion"] != "affirmed").sum()
        res.append((appealed, overturned))

    judges[["appealed_cases", "overturned_appealed_cases"]] = pd.DataFrame(res, index=judges.index)
    judges["overturnrate"] = np.where(judges["appealed_cases"] > 0,
                                      judges["overturned_appealed_cases"] / judges["appealed_cases"], np.nan)
    return judges

In [19]:
feature_dataset = compute_overturns(judges, full_data)

KeyError: 'promotion_date'

In [272]:
feature_dataset = feature_dataset[feature_dataset["appealed_cases"] >= 5]

In [273]:
# Regression analysis with gender and ethnicity
y = feature_dataset["is promoted"].astype(float)  
X = pd.DataFrame({
    "overturnrate": pd.to_numeric(feature_dataset["overturnrate"], errors="coerce"),
    "gender": feature_dataset["gender"].fillna("Unknown").astype("category"),
    "ethnicity": feature_dataset["ethnicity"].fillna("Unknown").astype("category"),
    "aba rating": feature_dataset["aba rating"].fillna("Unknown").astype("category"),
    "party of appointing president": feature_dataset["party of appointing president"].fillna("Unknown").astype("category"),
})

X = pd.get_dummies(X, columns=["gender", "ethnicity", "aba rating", "party of appointing president"], drop_first=True, dtype=float)

mask = ~(y.isna() | X.isna().any(axis=1))
y_clean = y.loc[mask]
X_clean = X.loc[mask]

X_clean = sm.add_constant(X_clean, has_constant="add")

logit_model = sm.Logit(y_clean, X_clean).fit(maxiter=100)
print("=== Logistic Regression: is promoted ~ overturnrate + gender + ethnicity ===")
print(logit_model.summary())

ols_model = sm.OLS(y_clean, X_clean).fit(maxiter=100)
print("\n=== OLS Regression: is promoted ~ overturnrate + gender + ethnicity ===")
print(ols_model.summary())

KeyError: 'is promoted'