# This code was used to format the LegalBert-Large summaries together with the CaseHold parentheticals
## - The first cell was used to create a small subset of the dataset for testing
## - The second section was used to format the full dataset (train, val, and test splits)

In [None]:
import json
import pandas as pd
import os

# Build a small test subset: every downloaded Caselaw JSON file is matched,
# through its 'id' field, with a row of the case_references CSV, and the
# corresponding summary text file is attached. The result is written to
# 'test_set_for_jesse.csv'.
csv_file = 'case_references.csv'
case_references = pd.read_csv(csv_file)

json_dir = 'test set for Jesse json'
txt_dir = 'test set for Jesse txt'

# Map each case_id to the correct_answer_value of its FIRST row in the CSV.
# Building the lookup once avoids rescanning the whole frame per JSON file.
first_rows = case_references.drop_duplicates(subset='case_id', keep='first')
answer_by_case_id = dict(
    zip(first_rows['case_id'], first_rows['correct_answer_value'])
)

data_rows = []

# os.listdir() returns entries in arbitrary order; sorting keeps the row
# order of the output CSV reproducible between runs and machines.
for json_file in sorted(os.listdir(json_dir)):
    if not json_file.endswith('.json'):
        continue

    # Extract base case number (e.g., 'case_38' from 'case_38.json')
    base_case_number = os.path.splitext(json_file)[0]
    print(f"Processing {base_case_number}")

    json_path = os.path.join(json_dir, json_file)

    # JSON is decoded as UTF-8 explicitly instead of relying on the platform
    # default encoding, which is not UTF-8 on every system.
    with open(json_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    json_id = data['id']

    # Cases that are absent from the reference CSV are skipped.
    if json_id not in answer_by_case_id:
        continue
    correct_answer_value = answer_by_case_id[json_id]

    txt_file_name = f"{base_case_number}_summary.txt"
    txt_path = os.path.join(txt_dir, txt_file_name)

    # NOTE(review): summaries are still read with the platform default
    # encoding; confirm how they were written before pinning one here.
    try:
        with open(txt_path, 'r') as file:
            txt_content = file.read()
    except FileNotFoundError:
        print(f"File not found: {txt_path}")
        continue

    data_rows.append({
        'input': txt_content,
        'output': correct_answer_value,
        'case number': base_case_number,
        'case_id': json_id,
    })

new_df = pd.DataFrame(data_rows)
new_df.to_csv('test_set_for_jesse.csv', index=False)

# Used in formatting the full dataset

In [None]:
import json
import pandas as pd
import os
from tqdm import tqdm

def process_case_files(csv_file, json_dir, txt_dir):
    """Pair case summaries with their reference answers and save them as CSV.

    For every ``*.json`` file in ``json_dir`` the ``'id'`` field is looked up
    in the ``case_id`` column of ``csv_file``. When a row matches, the file
    ``<json base name>_summary.txt`` is read from ``txt_dir`` and one output
    row is recorded with the columns ``input`` (summary text), ``output``
    (the ``correct_answer_value`` of the first matching CSV row),
    ``case number`` (JSON file name without extension) and ``case_id``.
    Rows containing missing values are dropped before saving.

    The result is written to ``<suffix>_dataset.csv`` in the current working
    directory, where ``<suffix>`` is the text after the last underscore of
    the final path component of ``json_dir`` (``'ref_case_jsons_train'``
    gives ``'train_dataset.csv'``).

    Args:
        csv_file: Path to the reference CSV; must contain the columns
            ``case_id`` and ``correct_answer_value``.
        json_dir: Directory holding the case JSON files.
        txt_dir: Directory holding the ``*_summary.txt`` files.

    Returns:
        None. The output is the CSV file and two progress messages.

    Raises:
        KeyError: If a JSON file has no ``'id'`` key or the CSV lacks one of
            the required columns.
    """
    case_references = pd.read_csv(csv_file)

    # Map each case_id to the correct_answer_value of its FIRST row in the
    # CSV. Building the lookup once avoids rescanning the whole frame for
    # every JSON file.
    first_rows = case_references.drop_duplicates(subset='case_id', keep='first')
    answer_by_case_id = dict(
        zip(first_rows['case_id'], first_rows['correct_answer_value'])
    )

    data_rows = []
    missing_summaries = 0

    # os.listdir() returns entries in arbitrary order; sorting keeps the row
    # order of the output CSV reproducible.
    json_files = sorted(f for f in os.listdir(json_dir) if f.endswith('.json'))
    for json_file in tqdm(json_files, desc="Processing JSON files"):
        base_case_number = os.path.splitext(json_file)[0]
        json_path = os.path.join(json_dir, json_file)

        # JSON is decoded as UTF-8 explicitly instead of relying on the
        # platform default encoding, which is not UTF-8 on every system.
        with open(json_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
        json_id = data['id']

        # Cases that are absent from the reference CSV are skipped.
        if json_id not in answer_by_case_id:
            continue
        correct_answer_value = answer_by_case_id[json_id]

        txt_file_name = f"{base_case_number}_summary.txt"
        txt_path = os.path.join(txt_dir, txt_file_name)

        # NOTE(review): summaries are still read with the platform default
        # encoding; confirm how they were written before pinning one here.
        try:
            with open(txt_path, 'r') as file:
                txt_content = file.read()
        except FileNotFoundError:
            # Best effort: a matched case without a summary is skipped, but
            # counted so the gap is reported instead of passing unnoticed.
            missing_summaries += 1
            continue

        data_rows.append({
            'input': txt_content,
            'output': correct_answer_value,
            'case number': base_case_number,
            'case_id': json_id,
        })

    new_df = pd.DataFrame(data_rows)
    new_df.dropna(inplace=True)

    if missing_summaries:
        print(f"Skipped {missing_summaries} matched case(s) without a summary file in {txt_dir}")

    # Generate output CSV file name based on json_dir. Normalising and taking
    # the basename first keeps a trailing separator or a parent directory
    # from leaking into the file name.
    dataset_type = os.path.basename(os.path.normpath(json_dir)).split('_')[-1]
    output_csv_name = f"{dataset_type}_dataset.csv"
    new_df.to_csv(output_csv_name, index=False)
    print(f"Processed data saved to {output_csv_name}")

In [None]:
# Format the training split; the result is saved as 'train_dataset.csv'.
process_case_files(
    csv_file='case_references_train.csv',
    json_dir='ref_case_jsons_train',
    txt_dir='ref_case_txt_train',
)

In [None]:
# Format the test split; the result is saved as 'test_dataset.csv'.
process_case_files(
    csv_file='case_references_test.csv',
    json_dir='ref_case_jsons_test',
    txt_dir='ref_case_txt_test',
)

In [None]:
# Format the validation split; the result is saved as 'val_dataset.csv'.
process_case_files(
    csv_file='case_references_val.csv',
    json_dir='ref_case_jsons_val',
    txt_dir='ref_case_txt_val',
)