In [2]:
import os
import re
from pathlib import Path
import pandas as pd

# ---- paths (edit if yours differ) ----
main_dir = Path("/u/home/i/iacir21/myscratch")  # your main directory
judgement_dir = main_dir / "judgements_txt"
xlsx_path = main_dir / "train_test_set" / "metadata_merged_v15.xlsx"   # <-- put the real filename here
xlsx_sheet = 0  # or a sheet name like "Sheet1"

# ---- helper: normalize an ID (strip, lowercase, collapse spaces, drop non-alnum except _ and - if you prefer) ----
def norm_id(s: str) -> str:
    s = str(s).strip()
    # collapse whitespace to single space
    s = re.sub(r"\s+", " ", s)
    # if your IDs should keep dashes/underscores, keep them; otherwise remove non-alnum entirely:
    s = re.sub(r"[^0-9A-Za-z_-]", "", s)
    return s.lower()

# ---- 1) collect file IDs from judgement_txt ----
assert judgement_dir.is_dir(), f"Missing directory: {judgement_dir}"
files = [p for p in judgement_dir.iterdir() if p.is_file()]
# keep only likely text files; relax this if needed
txt_files = [p for p in files if p.suffix.lower() == ".txt" or "." not in p.name]

file_ids = []
for p in txt_files:
    # strip extension, normalize
    base = p.stem  # filename without extension
    file_ids.append(norm_id(base))

file_ids_set = set(file_ids)

# ---- 2) read case_id column from Excel ----
df = pd.read_excel(xlsx_path, sheet_name=xlsx_sheet)
# try to find the case_id column robustly
candidate_cols = [c for c in df.columns if str(c).strip().lower() in {"case_id","caseid","case id","id"}]
if not candidate_cols:
    raise ValueError(f"Could not find a case_id-like column in {xlsx_path}. Columns: {list(df.columns)}")
case_col = candidate_cols[0]

excel_ids = df[case_col].dropna().astype(str).map(norm_id)
excel_ids_set = set(excel_ids)

# ---- 3) compare ----
# a) IDs that are in Excel but not in filenames
missing_files_for_ids = sorted(excel_ids_set - file_ids_set)
# b) IDs that exist as files but are not in Excel
extra_files_not_in_excel = sorted(file_ids_set - excel_ids_set)
# c) intersection / matched
matched_ids = sorted(excel_ids_set & file_ids_set)

print(f"Total txt files considered: {len(txt_files)}")
print(f"Unique file IDs: {len(file_ids_set)}")
print(f"Unique Excel IDs: {len(excel_ids_set)}")
print(f"Matched IDs: {len(matched_ids)}")
print(f"Missing files for Excel IDs: {len(missing_files_for_ids)}")
print(f"Extra files not in Excel: {len(extra_files_not_in_excel)}")

# ---- 4) (optional) write reports to your scratch for inspection ----
out_dir = main_dir / "train_test_set" / "reports_compare_ids"
out_dir.mkdir(parents=True, exist_ok=True)

pd.Series(matched_ids, name="matched_id").to_csv(out_dir / "matched_ids.csv", index=False)
pd.Series(missing_files_for_ids, name="excel_id_no_file").to_csv(out_dir / "excel_ids_missing_files.csv", index=False)
pd.Series(extra_files_not_in_excel, name="file_id_not_in_excel").to_csv(out_dir / "files_not_in_excel.csv", index=False)

print(f"\nWrote reports to: {out_dir}")


Total txt files considered: 131812
Unique file IDs: 131812
Unique Excel IDs: 159645
Matched IDs: 131244
Missing files for Excel IDs: 28401
Extra files not in Excel: 568

Wrote reports to: /u/home/i/iacir21/myscratch/train_test_set/reports_compare_ids


In [3]:

df_2 = pd.read_excel(xlsx_path)



KeyError: 'Num_Judge'

In [4]:
count_missing = ((df_2["Num_Judges"] > 1) & 
                 (df_2["median_slant"].isna() | df_2["median_slant_goodvbad"].isna())
                ).sum()

print("Number of rows with Num_Judge > 1 and missing slant values:", count_missing)

Number of rows with Num_Judge > 1 and missing slant values: 14822


In [5]:
count_missing_2 = (df_2["Num_Judges"] > 1).sum()
print(count_missing_2)

14822


In [6]:
count_missing_3 = (df_2["median_slant"].isna() | df_2["median_slant_goodvbad"].isna()).sum()
print(count_missing_3)

35511


In [8]:
import re
import shutil
from pathlib import Path
import pandas as pd

# ---------- CONFIG ----------
MAIN_DIR   = Path("/u/home/i/iacir21/myscratch")
SRC_DIR    = MAIN_DIR / "judgements_txt"                 # source folder with .txt files
DEST_DIR   = MAIN_DIR / "train_test_set" / "judgements_txt_train"            # destination for training set
XLSX_PATH  = MAIN_DIR / "train_test_set" / "metadata_merged_v15.xlsx"

CASE_COL   = "case_id"      # case id column in the xlsx
NUMJ_COL   = "Num_Judges"    # judge count column
YEAR_COL   = "filing_year"  # filing year column
JUDGE_COL  = "judge(s)"       # judge(s) column

# include median year rows in train (True) or use strictly < median (False)
INCLUDE_MEDIAN = True
# ----------------------------

DEST_DIR.mkdir(parents=True, exist_ok=True)

def norm_id(s: str) -> str:
    s = str(s).strip()
    s = re.sub(r"\s+", " ", s)
    s = re.sub(r"[^0-9A-Za-z_-]", "", s)  # keep alnum, _, -
    return s.lower()

# 1) collect available .txt files (id -> path)
assert SRC_DIR.is_dir(), f"Missing source dir: {SRC_DIR}"
txt_files = [p for p in SRC_DIR.iterdir() if p.is_file() and p.suffix.lower()==".txt"]
available = {norm_id(p.stem): p for p in txt_files}

# 2) read Excel & sanity checks
df = pd.read_excel(XLSX_PATH)
for col in [CASE_COL, NUMJ_COL, YEAR_COL, JUDGE_COL]:
    if col not in df.columns:
        raise ValueError(f"Expected column '{col}' not found in {XLSX_PATH}. Got: {list(df.columns)}")

# normalize IDs
df["_cid_norm"] = df[CASE_COL].astype(str).map(norm_id)

# 3) keep rows: id exists as file  AND Num_Judge <= 1  AND filing year & judge present
df = df[df["_cid_norm"].isin(available.keys())].copy()
df = df[df[NUMJ_COL] <= 1].copy()
df[YEAR_COL] = pd.to_numeric(df[YEAR_COL], errors="coerce")
df = df.dropna(subset=[YEAR_COL, JUDGE_COL]).copy()

# 4) per-judge median filing year
med_year = df.groupby(JUDGE_COL)[YEAR_COL].median().rename("median_year")
df = df.merge(med_year, left_on=JUDGE_COL, right_index=True, how="left")

# 5) select train rows based on median rule
if INCLUDE_MEDIAN:
    train_df = df[df[YEAR_COL] <= df["median_year"]].copy()
else:
    train_df = df[df[YEAR_COL] < df["median_year"]].copy()

# 6) copy files to train directory
selected_ids = train_df["_cid_norm"].unique().tolist()
copied, missing = 0, 0
for cid in selected_ids:
    src = available.get(cid)
    if src and src.exists():
        shutil.copy2(str(src), str(DEST_DIR / src.name))
        copied += 1
    else:
        missing += 1

# 7) export lists
# a) train_case_ids.csv
train_ids_path = MAIN_DIR / "train_test_set" / "train_case_ids.csv"
pd.Series(selected_ids, name="case_id_norm").to_csv(train_ids_path, index=False)

# b) median_year_judges.csv with judge, median_year, total_docs_kept(eligible), n_train
judge_counts = df.groupby(JUDGE_COL).size().rename("eligible_docs")
train_counts = train_df.groupby(JUDGE_COL).size().rename("train_docs")
med_table = med_year.to_frame().join([judge_counts, train_counts]).reset_index()
med_table.columns = [JUDGE_COL, "median_year", "eligible_docs", "train_docs"]
median_years_path = MAIN_DIR / "train_test_set" / "median_year_judges.csv"
med_table.to_csv(median_years_path, index=False)

# 8) summary
print("Done.")
print(f"Total .txt in source:           {len(txt_files)}")
print(f"Eligible rows after filters:    {len(df)}")
print(f"Selected for training:          {len(train_df)}")
print(f"Files copied:                   {copied}")
print(f"Selected but file missing:      {missing}")
print(f"Train IDs CSV:                  {train_ids_path}")
print(f"Median-by-judge CSV:            {median_years_path}")
print(f"Train dir:                      {DEST_DIR}")
print(f"Inclusion rule: filing_year {'<= median' if INCLUDE_MEDIAN else '< median'}")


Done.
Total .txt in source:           131812
Eligible rows after filters:    116919
Selected for training:          67983
Files copied:                   67983
Selected but file missing:      0
Train IDs CSV:                  /u/home/i/iacir21/myscratch/train_test_set/train_case_ids.csv
Median-by-judge CSV:            /u/home/i/iacir21/myscratch/train_test_set/median_year_judges.csv
Train dir:                      /u/home/i/iacir21/myscratch/train_test_set/judgements_txt_train
Inclusion rule: filing_year <= median


In [3]:
import zipfile

path_to_zip_file = "/u/home/i/iacir21/myscratch/cleaned_text_files.zip"

with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref:
    zip_ref.extractall()

In [6]:
import os
folder_path = "/u/home/i/iacir21/myscratch/train_test_set/cleaned_text_files"

# count only files (ignore subdirectories)
file_count = len([f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))])

print(f"Number of files in {folder_path}: {file_count}")

Number of files in /u/home/i/iacir21/myscratch/train_test_set/cleaned_text_files: 31318


In [7]:


# paths to your folders
cleaned_folder = "/u/home/i/iacir21/myscratch/train_test_set/cleaned_text_files"
judgements_folder = "/u/home/i/iacir21/myscratch/judgements_txt"

# get filenames (without paths)
cleaned_files = set(os.listdir(cleaned_folder))
judgement_files = set(os.listdir(judgements_folder))

# find duplicates
duplicates = cleaned_files.intersection(judgement_files)

if duplicates:
    print("Duplicate file names found in both folders:")
    for file in duplicates:
        print(file)
else:
    print("No duplicate file names found.")


No duplicate file names found.


In [3]:
import os
folder_path = "/u/home/i/iacir21/myscratch/judgements_txt"

# count only files (ignore subdirectories)
file_count = len([f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))])

print(f"Number of files in {folder_path}: {file_count}")

Number of files in /u/home/i/iacir21/myscratch/judgements_txt: 163130


In [3]:
import os
import shutil
import zipfile


# paths
judgements_dir = "/u/home/i/iacir21/myscratch/judgements_txt"
judgements_zip = "/u/home/i/iacir21/myscratch/judgements_txt.zip"
cleaned_dir = "/u/home/i/iacir21/myscratch/train_test_set/cleaned_text_files"

In [None]:



# 1. Delete the old judgements_txt folder if it exists
if os.path.exists(judgements_dir):
    shutil.rmtree(judgements_dir)
    print(f"Deleted: {judgements_dir}")

# 2. Unzip the archive again
with zipfile.ZipFile(judgements_zip, 'r') as zip_ref:
    zip_ref.extractall(os.path.dirname(judgements_dir))  # extracts into the same parent dir
    print(f"Extracted {judgements_zip} to {os.path.dirname(judgements_dir)}")

    


In [4]:
file_count = len([f for f in os.listdir(judgements_dir) if os.path.isfile(os.path.join(judgements_dir, f))])

print(f"Number of files in {judgements_dir}: {file_count}")

file_count = len([f for f in os.listdir(cleaned_dir) if os.path.isfile(os.path.join(cleaned_dir, f))])

print(f"Number of files in {cleaned_dir}: {file_count}")


Number of files in /u/home/i/iacir21/myscratch/judgements_txt: 131812
Number of files in /u/home/i/iacir21/myscratch/train_test_set/cleaned_text_files: 31318


In [5]:


# 3. Copy all cleaned files into the new judgements_txt
for file_name in os.listdir(cleaned_dir):
    src_path = os.path.join(cleaned_dir, file_name)
    dst_path = os.path.join(judgements_dir, file_name)

    if os.path.isfile(src_path):
        shutil.copy2(src_path, dst_path)
        print(f"Copied: {file_name}")
        
        
        
file_count = len([f for f in os.listdir(judgements_dir) if os.path.isfile(os.path.join(judgements_dir, f))])

print(f"Number of files in {judgements_dir}: {file_count}")


Copied: .DS_Store
Copied: 1000_0.txt
Copied: 1000_1.txt
Copied: 1000_2.txt
Copied: 1000_3.txt
Copied: 1000_4.txt
Copied: 1000_5.txt
Copied: 1000_6.txt
Copied: 1000_7.txt
Copied: 1000_8.txt
Copied: 1000_9.txt
Copied: 1001_0.txt
Copied: 1001_1.txt
Copied: 1001_2.txt
Copied: 1001_3.txt
Copied: 1001_4.txt
Copied: 1001_5.txt
Copied: 1001_6.txt
Copied: 1001_7.txt
Copied: 1001_8.txt
Copied: 1001_9.txt
Copied: 1002_0.txt
Copied: 1002_1.txt
Copied: 1002_2.txt
Copied: 1002_3.txt
Copied: 1002_4.txt
Copied: 1002_5.txt
Copied: 1002_6.txt
Copied: 1002_7.txt
Copied: 1002_8.txt
Copied: 1002_9.txt
Copied: 1003_0.txt
Copied: 1003_1.txt
Copied: 1003_2.txt
Copied: 1003_3.txt
Copied: 1003_4.txt
Copied: 1003_5.txt
Copied: 1003_6.txt
Copied: 1003_7.txt
Copied: 1003_8.txt
Copied: 1003_9.txt
Copied: 1004_0.txt
Copied: 1004_1.txt
Copied: 1004_2.txt
Copied: 1004_3.txt
Copied: 1004_4.txt
Copied: 1004_5.txt
Copied: 1004_6.txt
Copied: 1004_7.txt
Copied: 1004_8.txt
Copied: 1004_9.txt
Copied: 1005_0.txt
Copied: 1005_

In [6]:
import os
import re
from pathlib import Path
import pandas as pd

# ---- paths (edit if yours differ) ----
main_dir = Path("/u/home/i/iacir21/myscratch")  # your main directory
judgement_dir = main_dir / "judgements_txt"
xlsx_path = main_dir / "train_test_set" / "metadata_merged_v15.xlsx"   # <-- put the real filename here
xlsx_sheet = 0  # or a sheet name like "Sheet1"

# ---- helper: normalize an ID (strip, lowercase, collapse spaces, drop non-alnum except _ and - if you prefer) ----
def norm_id(s: str) -> str:
    s = str(s).strip()
    # collapse whitespace to single space
    s = re.sub(r"\s+", " ", s)
    # if your IDs should keep dashes/underscores, keep them; otherwise remove non-alnum entirely:
    s = re.sub(r"[^0-9A-Za-z_-]", "", s)
    return s.lower()

# ---- 1) collect file IDs from judgement_txt ----
assert judgement_dir.is_dir(), f"Missing directory: {judgement_dir}"
files = [p for p in judgement_dir.iterdir() if p.is_file()]
# keep only likely text files; relax this if needed
txt_files = [p for p in files if p.suffix.lower() == ".txt" or "." not in p.name]

file_ids = []
for p in txt_files:
    # strip extension, normalize
    base = p.stem  # filename without extension
    file_ids.append(norm_id(base))

file_ids_set = set(file_ids)

# ---- 2) read case_id column from Excel ----
df = pd.read_excel(xlsx_path, sheet_name=xlsx_sheet)
# try to find the case_id column robustly
candidate_cols = [c for c in df.columns if str(c).strip().lower() in {"case_id","caseid","case id","id"}]
if not candidate_cols:
    raise ValueError(f"Could not find a case_id-like column in {xlsx_path}. Columns: {list(df.columns)}")
case_col = candidate_cols[0]

excel_ids = df[case_col].dropna().astype(str).map(norm_id)
excel_ids_set = set(excel_ids)

# ---- 3) compare ----
# a) IDs that are in Excel but not in filenames
missing_files_for_ids = sorted(excel_ids_set - file_ids_set)
# b) IDs that exist as files but are not in Excel
extra_files_not_in_excel = sorted(file_ids_set - excel_ids_set)
# c) intersection / matched
matched_ids = sorted(excel_ids_set & file_ids_set)

print(f"Total txt files considered: {len(txt_files)}")
print(f"Unique file IDs: {len(file_ids_set)}")
print(f"Unique Excel IDs: {len(excel_ids_set)}")
print(f"Matched IDs: {len(matched_ids)}")
print(f"Missing files for Excel IDs: {len(missing_files_for_ids)}")
print(f"Extra files not in Excel: {len(extra_files_not_in_excel)}")

# ---- 4) (optional) write reports to your scratch for inspection ----
out_dir = main_dir / "train_test_set" / "reports_compare_ids"
out_dir.mkdir(parents=True, exist_ok=True)

pd.Series(matched_ids, name="matched_id").to_csv(out_dir / "matched_ids.csv", index=False)
pd.Series(missing_files_for_ids, name="excel_id_no_file").to_csv(out_dir / "excel_ids_missing_files.csv", index=False)
pd.Series(extra_files_not_in_excel, name="file_id_not_in_excel").to_csv(out_dir / "files_not_in_excel.csv", index=False)

print(f"\nWrote reports to: {out_dir}")

Total txt files considered: 163129
Unique file IDs: 163129
Unique Excel IDs: 159645
Matched IDs: 159645
Missing files for Excel IDs: 0
Extra files not in Excel: 3484

Wrote reports to: /u/home/i/iacir21/myscratch/train_test_set/reports_compare_ids


In [7]:
df_2 = pd.read_excel(xlsx_path)

In [8]:
count_missing = ((df_2["Num_Judges"] > 1) & 
                 (df_2["median_slant"].isna() | df_2["median_slant_goodvbad"].isna())
                ).sum()

print("Number of rows with Num_Judge > 1 and missing slant values:", count_missing)

Number of rows with Num_Judge > 1 and missing slant values: 14822


In [9]:
count_missing_2 = (df_2["Num_Judges"] > 1).sum()
print(count_missing_2)

14822


In [10]:
count_missing_3 = (df_2["median_slant"].isna() | df_2["median_slant_goodvbad"].isna()).sum()
print(count_missing_3)

35511


In [5]:
judgements_dir = "/u/home/i/iacir21/myscratch/train_test_set/judgements_txt_train"

if os.path.exists(judgements_dir):
    shutil.rmtree(judgements_dir)
    print(f"Deleted: {judgements_dir}")

Deleted: /u/home/i/iacir21/myscratch/train_test_set/judgements_txt_train


In [6]:
import re
import shutil
from pathlib import Path
import pandas as pd

# ---------- CONFIG ----------
MAIN_DIR   = Path("/u/home/i/iacir21/myscratch")
SRC_DIR    = MAIN_DIR / "judgements_txt"                 # source folder with .txt files
DEST_DIR   = MAIN_DIR / "train_test_set" / "judgements_txt_train"            # destination for training set
XLSX_PATH  = MAIN_DIR / "train_test_set" / "metadata_merged_v15.xlsx"

CASE_COL   = "case_id"      # case id column in the xlsx
NUMJ_COL   = "Num_Judges"    # judge count column
YEAR_COL   = "filing_year"  # filing year column
JUDGE_COL  = "judge(s)"       # judge(s) column

# include median year rows in train (True) or use strictly < median (False)
INCLUDE_MEDIAN = True
# ----------------------------

DEST_DIR.mkdir(parents=True, exist_ok=True)

def norm_id(s: str) -> str:
    s = str(s).strip()
    s = re.sub(r"\s+", " ", s)
    s = re.sub(r"[^0-9A-Za-z_-]", "", s)  # keep alnum, _, -
    return s.lower()

# 1) collect available .txt files (id -> path)
assert SRC_DIR.is_dir(), f"Missing source dir: {SRC_DIR}"
txt_files = [p for p in SRC_DIR.iterdir() if p.is_file() and p.suffix.lower()==".txt"]
available = {norm_id(p.stem): p for p in txt_files}

# 2) read Excel & sanity checks
df = pd.read_excel(XLSX_PATH)
for col in [CASE_COL, NUMJ_COL, YEAR_COL, JUDGE_COL]:
    if col not in df.columns:
        raise ValueError(f"Expected column '{col}' not found in {XLSX_PATH}. Got: {list(df.columns)}")

# normalize IDs
df["_cid_norm"] = df[CASE_COL].astype(str).map(norm_id)

# 3) keep rows: id exists as file  AND Num_Judge <= 1  AND filing year & judge present
df = df[df["_cid_norm"].isin(available.keys())].copy()
df = df[df[NUMJ_COL] <= 1].copy()
df[YEAR_COL] = pd.to_numeric(df[YEAR_COL], errors="coerce")
df = df.dropna(subset=[YEAR_COL, JUDGE_COL]).copy()

# 4) per-judge median filing year
med_year = df.groupby(JUDGE_COL)[YEAR_COL].median().rename("median_year")
df = df.merge(med_year, left_on=JUDGE_COL, right_index=True, how="left")

# 5) select train rows based on median rule
if INCLUDE_MEDIAN:
    train_df = df[df[YEAR_COL] <= df["median_year"]].copy()
else:
    train_df = df[df[YEAR_COL] < df["median_year"]].copy()

# 6) copy files to train directory
selected_ids = train_df["_cid_norm"].unique().tolist()
copied, missing = 0, 0
for cid in selected_ids:
    src = available.get(cid)
    if src and src.exists():
        shutil.copy2(str(src), str(DEST_DIR / src.name))
        copied += 1
    else:
        missing += 1

# 7) export lists
# a) train_case_ids.csv
train_ids_path = MAIN_DIR / "train_test_set" / "train_case_ids.csv"
pd.Series(selected_ids, name="case_id_norm").to_csv(train_ids_path, index=False)

# b) median_year_judges.csv with judge, median_year, total_docs_kept(eligible), n_train
judge_counts = df.groupby(JUDGE_COL).size().rename("eligible_docs")
train_counts = train_df.groupby(JUDGE_COL).size().rename("train_docs")
med_table = med_year.to_frame().join([judge_counts, train_counts]).reset_index()
med_table.columns = [JUDGE_COL, "median_year", "eligible_docs", "train_docs"]
median_years_path = MAIN_DIR / "train_test_set" / "median_year_judges.csv"
med_table.to_csv(median_years_path, index=False)

# 8) summary
print("Done.")
print(f"Total .txt in source:           {len(txt_files)}")
print(f"Eligible rows after filters:    {len(df)}")
print(f"Selected for training:          {len(train_df)}")
print(f"Files copied:                   {copied}")
print(f"Selected but file missing:      {missing}")
print(f"Train IDs CSV:                  {train_ids_path}")
print(f"Median-by-judge CSV:            {median_years_path}")
print(f"Train dir:                      {DEST_DIR}")
print(f"Inclusion rule: filing_year {'<= median' if INCLUDE_MEDIAN else '< median'}")



# 9) judge -> case_ids report
judge_cases = (
    train_df.groupby(JUDGE_COL)["_cid_norm"]
    .apply(lambda ids: ";".join(sorted(ids.unique())))
    .reset_index()
    .rename(columns={"_cid_norm": "train_case_ids"})
)

# add train_docs count
judge_cases["train_docs"] = judge_cases["train_case_ids"].apply(lambda x: len(x.split(";")))

# save to CSV
judge_cases_path = MAIN_DIR / "train_test_set" / "judge_train_caseids.csv"
judge_cases.to_csv(judge_cases_path, index=False)

print(f"Judge-level case_id report CSV: {judge_cases_path}")


Done.
Total .txt in source:           163129
Eligible rows after filters:    143881
Selected for training:          82981
Files copied:                   82981
Selected but file missing:      0
Train IDs CSV:                  /u/home/i/iacir21/myscratch/train_test_set/train_case_ids.csv
Median-by-judge CSV:            /u/home/i/iacir21/myscratch/train_test_set/median_year_judges.csv
Train dir:                      /u/home/i/iacir21/myscratch/train_test_set/judgements_txt_train
Inclusion rule: filing_year <= median
Judge-level case_id report CSV: /u/home/i/iacir21/myscratch/train_test_set/judge_train_caseids.csv
