In [None]:
import os
import requests
import re
from dotenv import load_dotenv
import pandas as pd
import json

# Location of the training split, relative to the notebook's working directory.
file_path = 'train.csv'

try:
    # UTF-8 is tried first.
    df = pd.read_csv(filepath_or_buffer=file_path, encoding='utf-8')
except UnicodeDecodeError:
    # The file is not valid UTF-8: read it again as ISO-8859-1 instead.
    df = pd.read_csv(filepath_or_buffer=file_path, encoding='iso-8859-1')


In [None]:
# Dump every column of the first five rows, one "name: value" line per column,
# with a dashed rule after each row.
separator = "\n" + "-"*30 + "\n"

for index, row in df.head(5).iterrows():
    print(f"Data point {index + 1}:")

    for col_name in row.index:
        print(f"{col_name}: {row[col_name]}")

    print(separator)

In [None]:
# Preview positional column 1 (the text) and positional column 12 (the value
# later read as the correct-choice index) for the first five rows.
if df.empty:
    print("The DataFrame is empty. No data to print.")
elif df.shape[1] < 13:
    # Position 12 is read below, so at least 13 columns are required; a
    # 12-column frame would otherwise raise IndexError inside the loop.
    print("The DataFrame does not have enough columns.")
else:
    for index, row in df.head(5).iterrows():
        print(f"Data point {index + 1}:")

        col_1_data = row.iloc[1]    # second column (positions are 0-based)
        col_12_data = row.iloc[12]  # thirteenth column

        # Labels match the positions that are actually read.
        print(f"Column 1: {col_1_data}")
        print(f"Column 12: {col_12_data}")

        # Print a new line to separate the data points
        print("\n" + "-"*30 + "\n")

In [None]:
import pandas as pd
import re

num_of_words = 20  # Size of both word windows; change it to capture more or fewer words.

if df.empty:
    print("The DataFrame is empty. No data to print.")
elif df.shape[1] < 13:
    print("The DataFrame does not have enough columns.")
else:
    # Captures up to `num_of_words` whitespace-delimited tokens that sit
    # immediately before the literal "(<HOLDING>)". It does not depend on the
    # row, so it is built once.
    regex_pattern = fr'((?:\S+\s+){{0,{num_of_words}}})\(<HOLDING>\)'

    for index, row in df.head(5).iterrows():
        print(f"Data point {index + 1}:")

        text_content = row.iloc[1]  # Column containing the text
        first_words = ' '.join(text_content.split()[:num_of_words])
        match = re.search(regex_pattern, text_content)

        print(f"First {num_of_words} words:")
        print(first_words)

        if not match:
            print("\nNo '<HOLDING>' tag found in this text content.")
        else:
            # Keep at most the last `num_of_words` captured tokens.
            preceding_words_list = match.group(1).split()[-num_of_words:]

            print(f"\n{num_of_words} words before (<HOLDING>):")
            print(' '.join(preceding_words_list))

        print("\n" + "-"*30 + "\n")  # Separator for clarity between data points


# Code to get reference case documents

In [None]:
import os
import re
import json
import pandas as pd
import requests
import math
import gc
import time

In [None]:
def extract_case_citations(text, num_of_words=18):
    """Find citation-shaped strings in the words leading up to ``(<HOLDING>)``.

    The search window is the last ``num_of_words`` whitespace-delimited tokens
    immediately before the literal ``(<HOLDING>)``. When the marker is absent,
    the whole text is searched instead.

    A citation is matched as: a number, one token made of letters, digits and
    periods, and another number, each separated by a single whitespace
    character (e.g. ``123 F.3d 456``). Reporter names that contain spaces
    (e.g. ``F. Supp. 2d``) are not matched by this pattern.

    Args:
        text: The text to search. Non-string values (such as the NaN pandas
            uses for a missing cell) are treated as containing no citations.
        num_of_words: Maximum number of tokens before ``(<HOLDING>)`` to search.

    Returns:
        A tuple ``(case_citations, extracted_text)`` where ``case_citations``
        is a list of ``"<number> <token> <number>"`` strings and
        ``extracted_text`` is the text that was searched (``''`` for
        non-string input).
    """
    if not isinstance(text, str):
        # re.search would raise TypeError on NaN/None; report "nothing found"
        # instead so a single missing cell does not abort a long batch.
        print("No case citations found in the provided text.")
        return [], ''

    # First, try to isolate up to `num_of_words` tokens before "(<HOLDING>)".
    holding_match = re.search(fr'((?:\S+\s+){{0,{num_of_words}}})\(<HOLDING>\)', text)
    if holding_match:
        extracted_text = ' '.join(holding_match.group(1).split()[-num_of_words:])
    else:
        extracted_text = text  # no marker: fall back to the full text

    # This pattern might need to be more complex depending on the citation formats in the data.
    citation_pattern = r'(\d+)\s([A-Za-z0-9.]+)\s(\d+)'
    case_citations = [' '.join(parts) for parts in re.findall(citation_pattern, extracted_text)]

    if not case_citations:
        print("No case citations found in the provided text.")

    return case_citations, extracted_text

def search_cases(citations, timeout=30):
    """
    Search for cases using the first citation from a list of case citations.

    Sends one GET request to the case.law ``/v1/cases/`` endpoint with
    ``cite=<first citation>`` and ``full_case=true``, authenticated with the
    module-level ``CASELAW_TOKEN``.

    Args:
        citations: List of citation strings; only ``citations[0]`` is queried.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Without a timeout a stalled connection would block
            the batch loop indefinitely.

    Returns:
        The first entry of the response's ``results`` list, or ``None`` when
        no citations were given, the status code is not 200, or ``results``
        is empty.
    """
    if not citations:
        print("No citations provided.")
        return None

    # Use the first citation in the list
    first_citation = citations[0]

    url = 'https://api.case.law/v1/cases/'
    headers = {'Authorization': f'Token {CASELAW_TOKEN}'}
    params = {
        'cite': first_citation,
        'full_case': 'true'  # Requesting the full case details
    }

    response = requests.get(url, headers=headers, params=params, timeout=timeout)

    if response.status_code != 200:
        print(f'Failed to retrieve cases for citation {first_citation}: {response.status_code}')
        return None

    cases_results = response.json().get('results', [])
    if not cases_results:
        print(f"No cases found for citation {first_citation}.")
        return None

    # Only the first case is used, even if several match the citation.
    return cases_results[0]

def save_case(case, row_number, subdir='ref_case_jsons'):
    """Write one case record to ``<subdir>/case_<row_number + 1>.json``.

    Args:
        case: Case dictionary; must contain ``'id'`` and
            ``['jurisdiction']['name_long']``.
        row_number: Zero-based row number; the file name uses ``row_number + 1``.
        subdir: Output directory, created (including parents) if missing.

    Returns:
        A tuple ``(case_id, state)`` taken from ``case['id']`` and
        ``case['jurisdiction']['name_long']``.

    Raises:
        KeyError: If either key is missing; nothing is written in that case.
    """
    # Read the required fields first so a malformed record fails before any file is created.
    case_id = case['id']
    state = case['jurisdiction']['name_long']

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(subdir, exist_ok=True)

    out_path = os.path.join(subdir, f'case_{row_number + 1}.json')
    with open(out_path, 'w', encoding='utf-8') as json_file:
        json.dump(case, json_file, indent=4)

    return case_id, state

def process_dataframe(df, start_row=0, num_rows_to_process=5, status_file='case_references.csv', subdir='ref_case_jsons'):
    """Look up and save the cited case for a batch of rows, logging each row to a status CSV.

    For every processed row the text in positional column 1 is scanned for
    citations, the first citation is searched, and a hit whose casebody status
    is ``'ok'`` is written to ``subdir``. One line per row is appended to the
    status table, which is rewritten to ``status_file`` after every row.

    Args:
        df: Source frame. Positional column 1 holds the text, columns 2-6 the
            answer choices, and column 12 the correct choice index (0-4).
        start_row: First row to process. If ``status_file`` already has rows,
            processing resumes after the highest logged ``row_number`` when
            that is later than ``start_row``.
        num_rows_to_process: Maximum number of rows visited in this call.
        status_file: CSV path of the progress log.
        subdir: Directory passed to ``save_case`` for the JSON files.

    Returns:
        None.

    Notes:
        * Rows with a missing or out-of-range choice index are skipped and not logged.
        * ``query_status`` is 1 when the case was saved, 0 when nothing was
          found or saving failed.
        * The loop stops (without logging the row) when a case is returned
          whose casebody status is not ``'ok'``.
    """
    status_columns = ['row_number',
                      'query_status',
                      'case_id',
                      'state', 'error',
                      'correct_choice_index',
                      'correct_answer_value',
                      'extracted_text']

    queries_df = None
    if os.path.exists(status_file):
        try:
            queries_df = pd.read_csv(status_file)
        except pd.errors.EmptyDataError:
            # Zero-byte status file: treat it like a missing one.
            queries_df = None

    if queries_df is None:
        queries_df = pd.DataFrame(columns=status_columns)
    else:
        # Do not re-process rows that are already logged. A header-only file
        # has no maximum (NaN), in which case start_row is left as given.
        last_logged_row = queries_df['row_number'].max()
        if not pd.isna(last_logged_row):
            start_row = max(int(last_logged_row) + 1, start_row)

    max_attempts = 5  # Maximum number of save attempts per row
    backoff_factor = 2  # Base of the exponential wait between save attempts

    # Main processing loop, which only processes a set number of rows
    for row_number in range(start_row, start_row + num_rows_to_process):
        if row_number >= len(df):
            print("Reached the end of the dataframe.")
            break

        row = df.iloc[row_number]
        correct_choice_index = row.iloc[12]

        # The choice index must be present and one of 0-4.
        if pd.isna(correct_choice_index) or correct_choice_index not in range(5):
            print(f"Skipping row {row_number} due to invalid or missing correct choice index.")
            continue  # Skip this row and continue with the next one

        # Choice indices start at 0 and the choice columns start at position 2.
        correct_answer_value = row.iloc[int(correct_choice_index) + 2]

        text_content = row.iloc[1]
        citations, extracted_text = extract_case_citations(text_content)
        case = search_cases(citations)  # Search for the case based on the citation

        query_status = None
        case_id = None
        state = None
        error = False

        if case:
            casebody_status = case.get('casebody', {}).get('status')
            if casebody_status == 'ok':
                try:
                    case_id, state = save_case(case, row_number, subdir=subdir)
                    query_status = 1  # success
                except Exception as e:
                    print(f"Error saving case for row {row_number}: {e}")
                    error = True
                    query_status = 0  # failure
            else:
                print(f"Casebody status is not 'ok' for row {row_number}. Exiting loop.")
                break  # Exit the for loop if casebody status is not "ok"
        else:
            query_status = 0  # failure or no results

        # Log the outcome together with the correct choice for this row.
        new_row = {
            'row_number': row_number,
            'query_status': query_status,
            'case_id': case_id,
            'state': state,
            'error': error,
            'correct_choice_index': correct_choice_index,
            'correct_answer_value': correct_answer_value,
            'extracted_text': extracted_text
        }
        queries_df = pd.concat([queries_df, pd.DataFrame([new_row])], ignore_index=True)
        print(str(row_number)+":",str(new_row['case_id']))

        # Save after each row to preserve data in case of interruption,
        # retrying with exponential backoff if the write fails.
        try:
            queries_df.to_csv(status_file, index=False)
        except Exception:
            print("Initial attempt failed, entering backoff loop.")
            for attempt in range(1, max_attempts):
                try:
                    time.sleep(backoff_factor ** attempt)  # Backoff before retrying
                    queries_df.to_csv(status_file, index=False)
                    print(f"File saved successfully on attempt {attempt + 1}.")
                    break  # Exit the loop if the file is saved successfully
                except Exception:
                    print(f"Attempt {attempt + 1} failed, retrying...")
            else:
                print("Failed to save the file after several attempts.")

In [None]:
# Spot-check the citation extractor on the text of the first row.
extract_case_citations(text=df.iat[0, 1])

In [None]:
# Text (positional column 1) of the row at position 14.
df.iat[14, 1]

In [None]:
# Positional column 2 (the first answer-choice column) of the first row.
df.iat[0, 2]

## Getting the cases for train.csv

In [None]:
load_dotenv()

CASELAW_TOKEN = os.getenv('CASELAW_TOKEN')
if not CASELAW_TOKEN:
    # Without a token every request is sent as "Token None"; the failed
    # lookups would be logged with query_status 0 and then skipped on resume.
    raise RuntimeError("CASELAW_TOKEN is not set; add it to the environment or the .env file before downloading cases.")

# Smoke test: process a single row using the default status file.
process_dataframe(df, num_rows_to_process=1,subdir='ref_case_jsons')

# for i in range(25):
#     process_dataframe(df, num_rows_to_process=500,subdir='ref_case_jsons')
#     gc.collect()
#     time.sleep(5)

# Reload the training split so this cell does not depend on which file `df` last held.
file_path = 'train.csv'
try:
    df = pd.read_csv(file_path, encoding='utf-8')
except UnicodeDecodeError:
    df = pd.read_csv(file_path, encoding='iso-8859-1')

# 12 batches of up to 500 rows; each call resumes after the last row logged in the status file.
for i in range(12):
    process_dataframe(df, num_rows_to_process=500,status_file='case_references_train.csv',subdir='ref_case_jsons_train')
    gc.collect()
    time.sleep(5)

### Getting the cases from val.csv

In [None]:
# Validation split: same download procedure, with its own status file and output directory.
file_path = 'val.csv'

try:
    df = pd.read_csv(filepath_or_buffer=file_path, encoding='utf-8')
except UnicodeDecodeError:
    # Not valid UTF-8: read it again as ISO-8859-1.
    df = pd.read_csv(filepath_or_buffer=file_path, encoding='iso-8859-1')

# 12 batches of up to 500 rows each, pausing 5 seconds between batches.
for i in range(12):
    process_dataframe(
        df,
        num_rows_to_process=500,
        status_file='case_references_val.csv',
        subdir='ref_case_jsons_val',
    )
    gc.collect()
    time.sleep(5)

### Getting the cases from test.csv

In [None]:
# Test split: same download procedure, with its own status file and output directory.
file_path = 'test.csv'

try:
    df = pd.read_csv(filepath_or_buffer=file_path, encoding='utf-8')
except UnicodeDecodeError:
    # Not valid UTF-8: read it again as ISO-8859-1.
    df = pd.read_csv(filepath_or_buffer=file_path, encoding='iso-8859-1')

# 12 batches of up to 500 rows each, pausing 5 seconds between batches.
for i in range(12):
    process_dataframe(
        df,
        num_rows_to_process=500,
        status_file='case_references_test.csv',
        subdir='ref_case_jsons_test',
    )
    gc.collect()
    time.sleep(5)

# Below is a version that keeps downloading when the casebody status is "error_limit_exceeded" instead of "ok"
## - such rows are saved and marked with a 3 in the query_status column; any other non-"ok" status still stops the loop

In [None]:
def process_dataframe_without_regard(df, start_row=0, num_rows_to_process=5, status_file='case_references.csv', subdir='ref_case_jsons'):
    """Variant of the batch downloader that keeps going past ``error_limit_exceeded`` cases.

    For every processed row the text in positional column 1 is scanned for
    citations and the first citation is searched. The returned case is saved
    when its casebody status is ``'ok'`` (``query_status`` 1) or
    ``'error_limit_exceeded'`` (``query_status`` 3). Any other casebody status
    stops the loop without logging the row.

    Args:
        df: Source frame. Positional column 1 holds the text, columns 2-6 the
            answer choices, and column 12 the correct choice index (0-4).
        start_row: First row to process. If ``status_file`` already has rows,
            processing resumes after the highest logged ``row_number`` when
            that is later than ``start_row``.
        num_rows_to_process: Maximum number of rows visited in this call.
        status_file: CSV path of the progress log, rewritten after every row.
        subdir: Directory passed to ``save_case`` for the JSON files.

    Returns:
        None.

    Notes:
        * Rows with a missing or out-of-range choice index are skipped and not logged.
        * ``query_status`` 0 means nothing was found or saving failed.
    """
    status_columns = ['row_number',
                      'query_status',
                      'case_id',
                      'state', 'error',
                      'correct_choice_index',
                      'correct_answer_value',
                      'extracted_text']

    queries_df = None
    if os.path.exists(status_file):
        try:
            queries_df = pd.read_csv(status_file)
        except pd.errors.EmptyDataError:
            # Zero-byte status file: treat it like a missing one.
            queries_df = None

    if queries_df is None:
        queries_df = pd.DataFrame(columns=status_columns)
    else:
        # Do not re-process rows that are already logged. A header-only file
        # has no maximum (NaN), in which case start_row is left as given.
        last_logged_row = queries_df['row_number'].max()
        if not pd.isna(last_logged_row):
            start_row = max(int(last_logged_row) + 1, start_row)

    # Main processing loop, which only processes a set number of rows
    for row_number in range(start_row, start_row + num_rows_to_process):
        if row_number >= len(df):
            print("Reached the end of the dataframe.")
            break

        row = df.iloc[row_number]
        correct_choice_index = row.iloc[12]

        # The choice index must be present and one of 0-4.
        if pd.isna(correct_choice_index) or correct_choice_index not in range(5):
            print(f"Skipping row {row_number} due to invalid or missing correct choice index.")
            continue  # Skip this row and continue with the next one

        # Choice indices start at 0 and the choice columns start at position 2.
        correct_answer_value = row.iloc[int(correct_choice_index) + 2]

        text_content = row.iloc[1]
        citations, extracted_text = extract_case_citations(text_content)
        case = search_cases(citations)  # Search for the case based on the citation

        query_status = None
        case_id = None
        state = None
        error = False

        if case:
            casebody_status = case.get('casebody', {}).get('status')
            if casebody_status == 'ok':
                try:
                    case_id, state = save_case(case, row_number, subdir=subdir)
                    query_status = 1  # success
                except Exception as e:
                    print(f"Error saving case for row {row_number}: {e}")
                    error = True
                    query_status = 0  # failure
            elif casebody_status == 'error_limit_exceeded':
                print(f"Data limit exceeded: null casebody for row: {row_number}")
                # Guarded like the 'ok' branch so one failed write is logged
                # instead of aborting the whole batch.
                try:
                    case_id, state = save_case(case, row_number, subdir=subdir)
                    query_status = 3  # saved, but the casebody is missing
                except Exception as e:
                    print(f"Error saving case for row {row_number}: {e}")
                    error = True
                    query_status = 0  # failure
            else:
                print(f"Casebody status is not 'ok' for row {row_number}. Exiting loop.")
                break  # Exit the for loop if casebody status is not "ok"
        else:
            query_status = 0  # failure or no results

        # Log the outcome together with the correct choice for this row.
        new_row = {
            'row_number': row_number,
            'query_status': query_status,
            'case_id': case_id,
            'state': state,
            'error': error,
            'correct_choice_index': correct_choice_index,
            'correct_answer_value': correct_answer_value,
            'extracted_text': extracted_text
        }
        queries_df = pd.concat([queries_df, pd.DataFrame([new_row])], ignore_index=True)
        print(str(row_number)+":",str(new_row['case_id']))
        # Save after each row to preserve data in case of interruption
        queries_df.to_csv(status_file, index=False)

In [None]:
load_dotenv()
CASELAW_TOKEN = os.getenv('CASELAW_TOKEN2')
if not CASELAW_TOKEN:
    # Without a token every request is sent as "Token None"; the failed
    # lookups would be logged with query_status 0 and then skipped on resume.
    raise RuntimeError("CASELAW_TOKEN2 is not set; add it to the environment or the .env file before downloading cases.")

# NOTE(review): this uses whichever file `df` was last loaded from and the
# default status file 'case_references.csv' - confirm that is the intended split.
for i in range(25):
    process_dataframe_without_regard(df, num_rows_to_process=500)
    gc.collect()
    time.sleep(.1)

# Getting the unique jurisdictions from the saved list
## this was for gathering a dataset for human annotations
## and ensuring we had a sample from every state

In [None]:
# Build case_key.csv: the first logged case for each distinct jurisdiction.

# Load the status log. Errors are re-raised rather than calling exit(),
# which is not an appropriate way to abort a notebook cell.
try:
    data = pd.read_csv('case_references.csv')
except Exception as e:
    print(f"An error occurred while reading the CSV file: {e}")
    raise

if 'state' not in data.columns:
    print("'state' column not found in the CSV file")
    raise KeyError("'state' column not found in the CSV file")

# Rows without a jurisdiction (failed lookups are logged with an empty state)
# are ignored; of the rest, keep the first row seen for each state.
first_per_state = data.dropna(subset=['state']).drop_duplicates(subset='state', keep='first')

# Set of the distinct states that made it into the key.
unique_states = set(first_per_state['state'])

# case_number is row_number + 1, matching the saved file name case_<row_number + 1>.json.
# The positional index is not used when row_number is available, because
# skipped rows are never logged and the two can therefore differ.
if 'row_number' in data.columns:
    case_numbers = first_per_state['row_number'].to_numpy() + 1
else:
    case_numbers = first_per_state.index.to_numpy() + 1

case_key = pd.DataFrame({
    'case_number': case_numbers,
    'case_id': first_per_state['case_id'].to_numpy(),
    'jurisdiction': first_per_state['state'].to_numpy(),
})

# Save the case_key dataframe
try:
    case_key.to_csv('case_key.csv', index=False)
    print("The case_key file was saved successfully.")
except Exception as e:
    print(f"An error occurred while saving the case_key CSV: {e}")

