In [1]:
import os
import json
import re
import unicodedata

# Define the folder containing JSON files
folder_path = "opinions"
output_file = "processed_opinions_with_years.jsonl"

# List of justice last names to redact
justice_last_names = [
    "Roberts", "Scalia", "Kennedy", "Thomas", "Ginsburg", "Breyer",
    "Alito", "Sotomayor", "Kagan", "Barrett", "Jackson"
]

# Function to replace justice names and initials with [REDACTED]
def redact_justice_names(text):
    # Common patterns for justice names or initials
    patterns = [
        r"\b[A-Z]\w+,\sJ\.\b",  # E.g., "ALITO, J."
        r"\bJustice\s[A-Z]\w+\b",  # E.g., "Justice Alito"
        r"\b[A-Z]\w+\sJ\.\b",  # E.g., "Alito J."
    ]
    for pattern in patterns:
        text = re.sub(pattern, "[REDACTED]", text)
    
    # Replace just last names of justices
    for last_name in justice_last_names:
        text = re.sub(rf"\b{last_name}\b", "[REDACTED]", text, flags=re.IGNORECASE)
    return text

# Function to extract the year of publication from the JSON data
def extract_year(data):
    """
    Extracts the year of publication from the data.
    Assumes the year is stored under a 'case' key with a 'year' subkey.
    """
    return data.get("case", {}).get("year", "unknown")

# Prepare the JSONL file
with open(output_file, 'w', encoding='utf-8') as jsonl_file:
    for file_name in os.listdir(folder_path):
        if file_name.endswith(".json"):
            file_path = os.path.join(folder_path, file_name)
            with open(file_path, 'r', encoding='utf-8') as json_file:
                data = json.load(json_file)
                
                # Extract text, author, and year
                text = data.get("text", "")
                author = data.get("author", "unknown")
                year = extract_year(data)
                
                # Normalize unicode and replace poorly parsed symbols
                text = unicodedata.normalize("NFKC", text)
                
                # Redact justice names and last names
                text = redact_justice_names(text)
                
                # Write to JSONL file
                jsonl_data = {"author": author, "year": year, "text": text}
                jsonl_file.write(json.dumps(jsonl_data, ensure_ascii=False) + "\n")

print(f"Processing complete. Extracted data saved in '{output_file}'.")

FileNotFoundError: [Errno 2] No such file or directory: 'opinions'

In [3]:
import re
import unicodedata
import json

def clean_text(text):
    """
    Cleans the input text by removing sentences containing
    'on writ of certiorari', replacing newlines, and normalizing unicode.
    """
    # Normalize unicode symbols
    text = unicodedata.normalize("NFKC", text)
    
    # Replace newlines with spaces
    text = text.replace("\n", " ")
    
    # Remove the sentence containing "on writ of certiorari"
    text = re.sub(r"[^.]*on writ of certiorari[^.]*\.", "", text, flags=re.IGNORECASE)
    
    # Remove multiple spaces
    text = re.sub(r"\s+", " ", text)
    
    # Strip leading/trailing whitespace
    text = text.strip()
    
    return text

def process_jsonl(input_file, output_file):
    """
    Processes each entry in a JSONL file by cleaning the text
    and saving the output to a new JSONL file.
    """
    with open(input_file, 'r', encoding='utf-8') as infile, \
         open(output_file, 'w', encoding='utf-8') as outfile:
        
        for line in infile:
            data = json.loads(line)  # Parse JSON line
            if "text" in data:
                # Clean the text
                data["text"] = clean_text(data["text"])
            
            # Write the cleaned data back to the output file
            outfile.write(json.dumps(data, ensure_ascii=False) + "\n")

# File paths
input_file = "processed.jsonl"
output_file = "cleaned_processed.jsonl"

# Run the processing
process_jsonl(input_file, output_file)

print(f"Processing complete. Cleaned file saved as '{output_file}'.")


Processing complete. Cleaned file saved as 'cleaned_processed.jsonl'.


In [5]:
import re
import unicodedata
import json

def clean_text(text):
    """
    Cleans the input text by removing introductory blocks,
    replacing newlines, and normalizing unicode.
    """
    # Normalize unicode symbols
    text = unicodedata.normalize("NFKC", text)
    
    # Replace newlines with spaces
    text = text.replace("\n", " ")
    
    # Remove everything before "on writ of certiorari"
    match = re.search(r"on writ of certiorari", text, re.IGNORECASE)
    if match:
        text = text[match.end():]  # Keep text after the match
    
    # Remove multiple spaces
    text = re.sub(r"\s+", " ", text)
    
    # Strip leading/trailing whitespace
    text = text.strip()
    
    return text

def process_jsonl(input_file, output_file):
    """
    Processes each entry in a JSONL file by cleaning the text
    and saving the output to a new JSONL file.
    """
    with open(input_file, 'r', encoding='utf-8') as infile, \
         open(output_file, 'w', encoding='utf-8') as outfile:
        
        for line in infile:
            data = json.loads(line)  # Parse JSON line
            if "text" in data:
                # Clean the text
                data["text"] = clean_text(data["text"])
            
            # Write the cleaned data back to the output file
            outfile.write(json.dumps(data, ensure_ascii=False) + "\n")

# File paths
input_file = "processed.jsonl"
output_file = "cleaned_processed.jsonl"

# Run the processing
process_jsonl(input_file, output_file)

print(f"Processing complete. Cleaned file saved as '{output_file}'.")


Processing complete. Cleaned file saved as 'cleaned_processed.jsonl'.


In [6]:
import re
import json
import unicodedata

def clean_text(text):
    """
    Cleans the text by:
    1. Finding and removing all sentences containing 'writ of certiorari'.
    2. Removing everything before those sentences.
    3. Normalizing Unicode symbols.
    4. Replacing newlines with spaces.
    5. Removing extra spaces.
    """
    # Normalize unicode symbols
    text = unicodedata.normalize("NFKC", text)
    
    # Replace newlines with spaces
    text = text.replace("\n", " ")
    
    # Find the position of the first occurrence of "writ of certiorari"
    match = re.search(r"[^.]*writ of certiorari[^.]*\.", text, re.IGNORECASE)
    if match:
        # Remove everything before and including the matched sentence
        text = text[match.end():]
    
    # Remove multiple spaces
    text = re.sub(r"\s+", " ", text)
    
    # Strip leading/trailing whitespace
    text = text.strip()
    
    return text

def process_jsonl(input_file, output_file):
    """
    Processes each entry in a JSONL file by cleaning the text
    and saving the output to a new JSONL file.
    """
    with open(input_file, 'r', encoding='utf-8') as infile, \
         open(output_file, 'w', encoding='utf-8') as outfile:
        
        for line in infile:
            data = json.loads(line)  # Parse JSON line
            if "text" in data:
                # Clean the text
                data["text"] = clean_text(data["text"])
            
            # Write the cleaned data back to the output file
            outfile.write(json.dumps(data, ensure_ascii=False) + "\n")

# File paths
input_file = "processed.jsonl"
output_file = "cleaned_processed.jsonl"

# Run the processing
process_jsonl(input_file, output_file)

print(f"Processing complete. Cleaned file saved as '{output_file}'.")


Processing complete. Cleaned file saved as 'cleaned_processed.jsonl'.


In [None]:
import re
import json
import unicodedata

def clean_text(text):
    """
    Cleans the text by:
    1. Removing sentences containing 'writ of certiorari' and everything before them.
    2. Removing sentences containing '___'.
    3. Normalizing Unicode symbols.
    4. Replacing newlines with spaces.
    5. Removing extra spaces.
    """
    # Normalize unicode symbols
    text = unicodedata.normalize("NFKC", text)
    
    # Replace newlines with spaces
    text = text.replace("\n", " ")
    
    # Remove everything before and including the sentence with "writ of certiorari"
    match = re.search(r"[^.]*writ of certiorari[^.]*\.", text, re.IGNORECASE)
    if match:
        text = text[match.end():]  # Keep text after the matched sentence
    
    # Remove sentences containing '___'
    text = re.sub(r"[^.]*___[^.]*\.", "", text)
    
    # Remove multiple spaces
    text = re.sub(r"\s+", " ", text)
    
    # Strip leading/trailing whitespace
    text = text.strip()
    
    return text

def process_jsonl(input_file, output_file):
    """
    Processes each entry in a JSONL file by cleaning the text
    and saving the output to a new JSONL file.
    """
    with open(input_file, 'r', encoding='utf-8') as infile, \
         open(output_file, 'w', encoding='utf-8') as outfile:
        
        for line in infile:
            data = json.loads(line)  # Parse JSON line
            if "text" in data:
                # Clean the text
                data["text"] = clean_text(data["text"])
            
            # Write the cleaned data back to the output file
            outfile.write(json.dumps(data, ensure_ascii=False) + "\n")

# File paths
input_file = "processed.jsonl"
output_file = "cleaned_processed.jsonl"

# Run the processing
process_jsonl(input_file, output_file)

print(f"Processing complete. Cleaned file saved as '{output_file}'.")

Processing complete. Cleaned file saved as 'cleaned_processed.jsonl'.


In [1]:
import re
import json
import unicodedata

def clean_text(text):
    """
    Cleans the text by:
    1. Removing sentences containing 'writ of certiorari' and everything before them.
    2. Removing sentences containing '____'.
    3. Hiding names (capitalized words).
    4. Removing words with footnotes (e.g., include9).
    5. Removing years (e.g., 1990, 2021).
    6. Normalizing Unicode symbols.
    7. Replacing newlines with spaces.
    8. Removing extra spaces.
    """
    # Normalize unicode symbols
    text = unicodedata.normalize("NFKC", text)
    
    # Replace newlines with spaces
    text = text.replace("\n", " ")
    
    # Remove everything before and including the sentence with "writ of certiorari"
    match = re.search(r"[^.]*writ of certiorari[^.]*\.", text, re.IGNORECASE)
    if match:
        text = text[match.end():]  # Keep text after the matched sentence
    
    # Remove sentences containing '____'
    text = re.sub(r"[^.]*____[^.]*\.", "", text)
    
    # Hide names (capitalized words not at the start of a sentence)
    text = re.sub(r"(?<![.!?]\s)\b[A-Z][a-z]*\b", "[REDACTED]", text)
    
    # Remove words with footnotes (e.g., include9)
    text = re.sub(r"\b\w+\d+\b", "", text)
    
    # Remove years (e.g., 1900-2099)
    text = re.sub(r"\b(19|20)\d{2}\b", "", text)
    
    # Remove multiple spaces
    text = re.sub(r"\s+", " ", text)
    
    # Strip leading/trailing whitespace
    text = text.strip()
    
    return text

def process_jsonl(input_file, output_file):
    """
    Processes each entry in a JSONL file by cleaning the text
    and saving the output to a new JSONL file.
    """
    with open(input_file, 'r', encoding='utf-8') as infile, \
         open(output_file, 'w', encoding='utf-8') as outfile:
        
        for line in infile:
            data = json.loads(line)  # Parse JSON line
            if "text" in data:
                # Clean the text
                data["text"] = clean_text(data["text"])
            
            # Write the cleaned data back to the output file
            outfile.write(json.dumps(data, ensure_ascii=False) + "\n")

# File paths
input_file = "processed.jsonl"
output_file = "cleaned_processed.jsonl"

# Run the processing
process_jsonl(input_file, output_file)

print(f"Processing complete. Cleaned file saved as '{output_file}'.")


Processing complete. Cleaned file saved as 'cleaned_processed.jsonl'.


In [2]:
import re
import json
import unicodedata
from collections import Counter

def clean_text(text, redacted_words):
    """
    Cleans the text by:
    1. Removing sentences containing 'writ of certiorari' and everything before them.
    2. Removing sentences containing '____'.
    3. Hiding names (capitalized words).
    4. Removing words with footnotes (e.g., include9).
    5. Removing years (e.g., 1990, 2021).
    6. Normalizing Unicode symbols.
    7. Replacing newlines with spaces.
    8. Removing extra spaces.
    """
    # Normalize unicode symbols
    text = unicodedata.normalize("NFKC", text)
    
    # Replace newlines with spaces
    text = text.replace("\n", " ")
    
    # Remove everything before and including the sentence with "writ of certiorari"
    match = re.search(r"[^.]*writ of certiorari[^.]*\.", text, re.IGNORECASE)
    if match:
        text = text[match.end():]  # Keep text after the matched sentence
    
    # Remove sentences containing '____'
    text = re.sub(r"[^.]*____[^.]*\.", "", text)
    
    # Hide names (capitalized words not at the start of a sentence)
    matches = re.findall(r"(?<![.!?]\s)\b[A-Z][a-z]*\b", text)
    for word in matches:
        redacted_words[word] += 1  # Count occurrences
    text = re.sub(r"(?<![.!?]\s)\b[A-Z][a-z]*\b", "[REDACTED]", text)
    
    # Remove words with footnotes (e.g., include9)
    text = re.sub(r"\b\w+\d+\b", "", text)
    
    # Remove years (e.g., 1900-2099)
    text = re.sub(r"\b(19|20)\d{2}\b", "", text)
    
    # Remove multiple spaces
    text = re.sub(r"\s+", " ", text)
    
    # Strip leading/trailing whitespace
    text = text.strip()
    
    return text

def process_jsonl(input_file, output_file):
    """
    Processes each entry in a JSONL file by cleaning the text,
    collecting `[REDACTED]` words, and saving the output to a new JSONL file.
    """
    redacted_words = Counter()
    
    with open(input_file, 'r', encoding='utf-8') as infile, \
         open(output_file, 'w', encoding='utf-8') as outfile:
        
        for line in infile:
            data = json.loads(line)  # Parse JSON line
            if "text" in data:
                # Clean the text
                data["text"] = clean_text(data["text"], redacted_words)
            
            # Write the cleaned data back to the output file
            outfile.write(json.dumps(data, ensure_ascii=False) + "\n")
    
    return redacted_words

# File paths
input_file = "processed.jsonl"
output_file = "cleaned_processed.jsonl"

# Run the processing and collect redacted words
redacted_words = process_jsonl(input_file, output_file)

# Sort and print redacted words by frequency
sorted_redacted_words = redacted_words.most_common()
for word, freq in sorted_redacted_words:
    print(f"{word}: {freq}")

# Save redacted words to a file
with open("redacted_words_frequency.json", "w", encoding="utf-8") as freq_file:
    json.dump(sorted_redacted_words, freq_file, ensure_ascii=False, indent=4)

print("Processing complete. Cleaned file saved as 'cleaned_processed.jsonl'.")
print("Redacted words frequency saved as 'redacted_words_frequency.json'.")


U: 36720
Court: 28054
States: 11700
The: 11492
Congress: 10122
United: 9036
Id: 8597
S: 8509
State: 8417
Act: 8066
F: 7002
Government: 5850
I: 5593
Amendment: 5403
J: 5200
A: 4688
District: 4489
Brief: 3955
Ibid: 3731
In: 3515
Footnote: 3489
Ante: 3398
Clause: 3383
Circuit: 3310
Federal: 3253
Appeals: 3207
Constitution: 3206
Inc: 3069
First: 2380
No: 2074
Statesv: 2064
B: 2060
App: 2059
President: 2004
Rule: 1903
Co: 1896
It: 1812
American: 1778
We: 1755
New: 1734
C: 1689
N: 1595
Article: 1590
Law: 1545
Indian: 1537
Board: 1530
Fourth: 1504
This: 1458
Supreme: 1355
California: 1311
T: 1269
Secretary: 1238
D: 1224
Second: 1215
General: 1198
Texas: 1157
Corp: 1109
Fed: 1099
But: 1099
Post: 1073
Commission: 1068
Section: 1034
Department: 1001
City: 999
Pet: 994
As: 991
Code: 991
Sixth: 954
Petitioner: 954
Art: 952
Respondent: 930
Guidelines: 919
P: 908
Ninth: 894
York: 889
W: 879
Service: 872
To: 870
Mr: 867
Cert: 865
E: 848
National: 834
Dictionary: 827
Florida: 822
And: 815
Title: 808
R

In [9]:
import re
import json
import unicodedata
from collections import Counter

def clean_text(text, redacted_words):
    """
    Cleans the text by:
    1. Removing sentences containing 'writ of certiorari' and everything before them.
    2. Removing sentences containing '____'.
    3. Removing words with footnotes (e.g., include9).
    4. Removing years (e.g., 1990, 2021).
    5. Normalizing Unicode symbols.
    6. Replacing newlines with spaces.
    7. Removing extra spaces.
    """
    # Normalize unicode symbols
    text = unicodedata.normalize("NFKC", text)
    
    # Replace newlines with spaces
    text = text.replace("\n", " ")
    
    # Remove everything before and including the sentence with "writ of certiorari"
    match = re.search(r"[^.]*writ of certiorari[^.]*\.", text, re.IGNORECASE)
    if match:
        text = text[match.end():]  # Keep text after the matched sentence
    
    # Remove sentences containing '____'
    text = re.sub(r"[^.]*____[^.]*\.", "", text)
    
    # Remove words with footnotes (e.g., include9)
    text = re.sub(r"\b\w+\d+\b", "", text)
    
    # Remove years (e.g., 1900-2099)
    text = re.sub(r"\b(19|20)\d{2}\b", "", text)
    
    # Remove multiple spaces
    text = re.sub(r"\s+", " ", text)
    
    # Strip leading/trailing whitespace
    text = text.strip()
    
    return text

def process_jsonl(input_file, output_file):
    """
    Processes each entry in a JSONL file by cleaning the text
    and saving the output to a new JSONL file.
    """
    redacted_words = Counter()
    
    with open(input_file, 'r', encoding='utf-8') as infile, \
         open(output_file, 'w', encoding='utf-8') as outfile:
        
        for line in infile:
            data = json.loads(line)  # Parse JSON line
            if "text" in data:
                # Clean the text
                data["text"] = clean_text(data["text"], redacted_words)
            
            # Write the cleaned data back to the output file
            outfile.write(json.dumps(data, ensure_ascii=False) + "\n")

    return redacted_words

# File paths
input_file = "processed.jsonl"
output_file = "cleaned_processed.jsonl"

# Run the processing and collect redacted words
redacted_words = process_jsonl(input_file, output_file)

# Save redacted words to a file
with open("redacted_words_frequency.json", "w", encoding="utf-8") as freq_file:
    json.dump(redacted_words.most_common(), freq_file, ensure_ascii=False, indent=4)

print("Processing complete. Cleaned file saved as 'cleaned_processed.jsonl'.")
print("Redacted words frequency saved as 'redacted_words_frequency.json'.")

Processing complete. Cleaned file saved as 'cleaned_processed.jsonl'.
Redacted words frequency saved as 'redacted_words_frequency.json'.
