In [None]:
import os
import requests
import re
from dotenv import load_dotenv
import pandas as pd
import json

# CSV file to load into `df`.
file_path = 'train.csv'

# Try UTF-8 first; if the file contains bytes that are not valid UTF-8,
# re-read it as ISO-8859-1.
try:
    df = pd.read_csv(file_path, encoding='utf-8')
except UnicodeDecodeError:
    df = pd.read_csv(file_path, encoding='iso-8859-1')

In [None]:
# Print every column of the first five rows, one "name: value" line per column.
for index, row in df.head(5).iterrows():
    print(f"Data point {index + 1}:")

    for column in df.columns:
        print(f"{column}: {row[column]}")

In [None]:
# Walk through train.csv from CaseHOLD (Hugging Face) and show, for the first
# five rows, the case excerpt and the label of the correct multiple-choice holding.
for index, row in df.head(5).iterrows():
    print(f"Data point {index + 1}:")

    # The values live at positions 1 and 12 of the row, one to the right of the
    # "Column 0" / "Column 11" labels printed below — presumably because the CSV
    # carries a leading index column (TODO confirm).
    case_excerpt = row.iloc[1]
    correct_label = row.iloc[12]

    print(f"Column 0: {case_excerpt}")
    print(f"Column 11: {correct_label}")

# Code to get reference case documents

In [None]:
import os
import re
import json
import pandas as pd
import requests
import math
import gc
import time

In [None]:
def extract_case_citations(text, num_of_words=18):
    """Find citation-shaped substrings near the ``(<HOLDING>)`` marker.

    The search window is the run of at most ``num_of_words`` whitespace-separated
    words that immediately precedes ``(<HOLDING>)``. When the marker is not
    found, the whole ``text`` is searched instead.

    Args:
        text: Case excerpt to scan.
        num_of_words: Maximum number of words before the marker to keep.

    Returns:
        A ``(case_citations, extracted_text)`` tuple: the list of matches, each
        formatted as ``"<digits> <token> <digits>"`` (e.g. ``"123 F.3d 456"``),
        and the window of text that was searched.

    Side effects:
        Prints a notice when no citation is found.
    """
    holding_context = re.search(fr'((?:\S+\s+){{0,{num_of_words}}})\(<HOLDING>\)', text)

    if holding_context:
        # Collapse the whitespace and keep only the trailing words of the window.
        trailing_words = holding_context.group(1).split()[-num_of_words:]
        extracted_text = ' '.join(trailing_words)
    else:
        # No marker: search the full excerpt.
        extracted_text = text

    # START: Copied from GPT-4
    citation_pattern = r'(\d+)\s([A-Za-z0-9.]+)\s(\d+)'
    # END: Copied from GPT-4

    # Each hit is a 3-tuple of captured groups; glue it back into one string.
    case_citations = []
    for first_number, middle_token, last_number in re.findall(citation_pattern, extracted_text):
        case_citations.append(f'{first_number} {middle_token} {last_number}')

    if len(case_citations) == 0:
        print("No case citations found in the provided text.")

    return case_citations, extracted_text

def search_cases(citations, timeout=30):
    """Look up the first citation via the api.case.law v1 cases endpoint.

    Only ``citations[0]`` is queried; any further citations are ignored. The
    request is authenticated with the module-level ``CASELAW_TOKEN`` and asks
    for the full case (``full_case=true``).

    Args:
        citations: List of citation strings; may be empty or ``None``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` waits indefinitely, so a single stalled
            connection would hang a long batch run.

    Returns:
        The first entry of the response's ``results`` list, or ``None`` when no
        citation was given, the response status is not 200, or the result list
        is empty.

    Raises:
        requests.exceptions.RequestException: On network failure or timeout.
            This is deliberately not swallowed, so an outage stops the caller
            instead of being recorded as "no result" for every row.
    """
    if not citations:
        print("No citations provided.")
        return None

    # Use the first citation in the list
    first_citation = citations[0]

    url = 'https://api.case.law/v1/cases/'
    headers = {'Authorization': f'Token {CASELAW_TOKEN}'}
    params = {
        'cite': first_citation,
        'full_case': 'true'
    }

    response = requests.get(url, headers=headers, params=params, timeout=timeout)

    if response.status_code != 200:
        print(f'Failed to retrieve cases for citation {first_citation}: {response.status_code}')
        return None

    cases_results = response.json().get('results', [])
    if not cases_results:
        print(f"No cases found for citation {first_citation}.")
        return None

    # Taking the first case even if multiple cases match the citation
    return cases_results[0]

def save_case(case, row_number, subdir='ref_case_jsons'):
    """Write one case record to ``<subdir>/case_<row_number + 1>.json``.

    Args:
        case: Case dictionary; must contain ``'id'`` and
            ``'jurisdiction' -> 'name_long'``.
        row_number: Zero-based row index; the file name uses ``row_number + 1``.
        subdir: Output directory, created (including parents) if missing.

    Returns:
        ``(case_id, state)`` taken from ``case['id']`` and
        ``case['jurisdiction']['name_long']``.

    Raises:
        KeyError: If either required field is absent. This is raised before
            anything is written to disk.
        OSError: If the directory or file cannot be created.
    """
    # Read the required fields first so a malformed record fails before any I/O.
    case_id = case['id']
    state = case['jurisdiction']['name_long']

    # exist_ok=True avoids the check-then-create race of os.path.exists + os.makedirs.
    os.makedirs(subdir, exist_ok=True)

    target_path = os.path.join(subdir, f'case_{row_number + 1}.json')
    with open(target_path, 'w', encoding='utf-8') as json_file:
        json.dump(case, json_file, indent=4)

    return case_id, state

# Column order of the status CSV written by process_dataframe.
_STATUS_COLUMNS = [
    'row_number',
    'query_status',
    'case_id',
    'state',
    'error',
    'correct_choice_index',
    'correct_answer_value',
    'extracted_text',
]


def _first_unprocessed_row(status_file, start_row):
    """Return the row to resume from, given the rows already recorded in ``status_file``."""
    if not os.path.exists(status_file) or os.path.getsize(status_file) == 0:
        return start_row

    recorded_rows = pd.read_csv(status_file)['row_number'].dropna()
    if recorded_rows.empty:
        # Header-only file: max() would be NaN, which cannot be used as a range() bound.
        return start_row

    # Ensuring we are not re-processing already processed rows
    return max(int(recorded_rows.max()) + 1, start_row)


def _append_status_row(record, status_file, max_attempts=5, backoff_factor=2):
    """Append one record to the status CSV, retrying with exponential backoff on failure."""
    row_frame = pd.DataFrame([record], columns=_STATUS_COLUMNS)

    def write_once():
        # Only a new or empty file needs the header line.
        needs_header = not os.path.exists(status_file) or os.path.getsize(status_file) == 0
        row_frame.to_csv(status_file, mode='a', header=needs_header, index=False)

    try:
        write_once()
    except Exception:
        print("Initial attempt failed, entering backoff loop.")
    else:
        return True

    for attempt in range(1, max_attempts):
        # Backoff before retrying
        time.sleep(backoff_factor ** attempt)
        try:
            write_once()
        except Exception:
            print(f"Attempt {attempt + 1} failed, retrying...")
        else:
            print(f"File saved successfully on attempt {attempt + 1}.")
            return True

    print("Failed to save the file after several attempts.")
    return False


def process_dataframe(df, start_row=0, num_rows_to_process=5, status_file='case_references.csv', subdir='ref_case_jsons'):
    """Fetch and store the referenced case for a batch of rows, logging each outcome.

    For every row in the batch, the text at column position 1 is scanned for
    citations, the first citation is looked up with ``search_cases``, and a
    successful hit is written to ``subdir`` with ``save_case``. One line per
    processed row is appended to ``status_file`` immediately, so an interrupted
    run can resume where it stopped.

    Args:
        df: DataFrame whose rows hold the text at position 1, the answer
            choices from position 2 onward, and the correct-choice index at
            position 12.
        start_row: Earliest row to process. If ``status_file`` already records
            rows, processing resumes after the highest recorded ``row_number``
            when that is later.
        num_rows_to_process: Size of the batch, counted from the resume row.
        status_file: CSV log with the columns listed in ``_STATUS_COLUMNS``.
        subdir: Directory passed to ``save_case`` for the case JSON files.

    Returns:
        None. Results are the JSON files in ``subdir`` and the rows appended to
        ``status_file``.

    Notes:
        - Rows with a missing or out-of-range (not 0-4) choice index, or with a
          non-string text cell, are skipped and not logged.
        - The batch stops early, without logging the current row, when a
          returned case has a ``casebody`` status other than ``'ok'``.
        - ``query_status`` is 1 when the case was saved and 0 otherwise;
          ``error`` is True only when saving raised.
    """
    start_row = _first_unprocessed_row(status_file, start_row)

    for row_number in range(start_row, start_row + num_rows_to_process):
        if row_number >= len(df):
            print("Reached the end of the dataframe.")
            break

        # Materialise the row once instead of re-indexing the frame per field.
        row = df.iloc[row_number]
        correct_choice_index = row.iloc[12]

        # Check if the multiple choice values in CaseHold are valid (0-4) since a few were not valid
        if pd.isna(correct_choice_index) or correct_choice_index not in range(5):
            print(f"Skipping row {row_number} due to invalid or missing correct choice index.")
            continue

        # The answer choices start two positions in, so choice k sits at position k + 2.
        correct_answer_value = row.iloc[int(correct_choice_index) + 2]

        text_content = row.iloc[1]
        if not isinstance(text_content, str):
            # An empty CSV cell loads as NaN (a float) and would crash the regex search.
            print(f"Skipping row {row_number} due to missing or non-text case excerpt.")
            continue

        citations, extracted_text = extract_case_citations(text_content)
        case = search_cases(citations)

        query_status = 0  # failure or no results, unless the case is saved below
        case_id = None
        state = None
        error = False

        if case:
            casebody_status = case.get('casebody', {}).get('status')
            if casebody_status != 'ok':
                print(f"Casebody status is not 'ok' for row {row_number}. Exiting loop.")
                break
            try:
                case_id, state = save_case(case, row_number, subdir=subdir)
                query_status = 1
            except Exception as e:
                print(f"Error saving case for row {row_number}: {e}")
                error = True

        new_row = {
            'row_number': row_number,
            'query_status': query_status,
            'case_id': case_id,
            'state': state,
            'error': error,
            'correct_choice_index': correct_choice_index,
            'correct_answer_value': correct_answer_value,
            'extracted_text': extracted_text
        }
        print(str(row_number)+":",str(new_row['case_id']))

        # Append after each row to preserve data in case of interruption. Appending
        # (rather than rewriting the whole file) keeps the cost per row constant and
        # cannot truncate earlier progress if a write is interrupted.
        _append_status_row(new_row, status_file)

In [None]:
# Quick check: run the citation extractor on row 0, column position 1
# (the same text column process_dataframe reads).
extract_case_citations(df.iloc[0,1])

In [None]:
# Inspect the value at column position 1 (the case text) for row 14.
df.iloc[14,1]

In [None]:
# Inspect column position 2 for row 0 — the choice-0 holding, given the
# "+ 2" offset process_dataframe uses to locate answer choices.
df.iloc[0,2]

## Getting the cases for train.csv

In [None]:
# Load variables from a local .env file (if any) into the environment.
load_dotenv()

CASELAW_TOKEN = os.getenv('CASELAW_TOKEN')
if not CASELAW_TOKEN:
    # search_cases would otherwise send "Authorization: Token None"; every lookup
    # that comes back non-200 is logged as a failed row, and logged rows are never
    # retried on resume. Stop before the status file is polluted.
    raise RuntimeError(
        "CASELAW_TOKEN is not set; add it to the environment or a .env file before fetching cases."
    )

# Smoke test: process a single row (default status file) before the long run.
process_dataframe(df, num_rows_to_process=1,subdir='ref_case_jsons')

file_path = 'train.csv'
try:
    df = pd.read_csv(file_path, encoding='utf-8')
except UnicodeDecodeError:
    df = pd.read_csv(file_path, encoding='iso-8859-1')

# Up to 12 batches of 500 rows; each call resumes after the last row recorded
# in the status file. Free memory and pause briefly between batches.
for i in range(12):
    process_dataframe(df, num_rows_to_process=500,status_file='case_references_train.csv',subdir='ref_case_jsons_train')
    gc.collect()
    time.sleep(5)

### Getting the cases from val.csv

In [None]:
# Load the validation split, falling back to ISO-8859-1 if it is not valid UTF-8.
file_path = 'val.csv'
try:
    df = pd.read_csv(file_path, encoding='utf-8')
except UnicodeDecodeError:
    df = pd.read_csv(file_path, encoding='iso-8859-1')

# Up to 12 batches of 500 rows; each call resumes after the last row recorded
# in the status file. Free memory and pause briefly between batches.
for batch_idx in range(12):
    process_dataframe(
        df,
        num_rows_to_process=500,
        status_file='case_references_val.csv',
        subdir='ref_case_jsons_val',
    )
    gc.collect()
    time.sleep(5)

### Getting the cases from test.csv

In [None]:
# Load the test split, falling back to ISO-8859-1 if it is not valid UTF-8.
file_path = 'test.csv'
try:
    df = pd.read_csv(file_path, encoding='utf-8')
except UnicodeDecodeError:
    df = pd.read_csv(file_path, encoding='iso-8859-1')

# Up to 12 batches of 500 rows; each call resumes after the last row recorded
# in the status file. Free memory and pause briefly between batches.
for batch_idx in range(12):
    process_dataframe(
        df,
        num_rows_to_process=500,
        status_file='case_references_test.csv',
        subdir='ref_case_jsons_test',
    )
    gc.collect()
    time.sleep(5)