In [3]:
import json
import pandas as pd
import os

# Build 'test_set_for_jesse.csv': one row per JSON case that has both a
# matching reference answer in the CSV and a '<case>_summary.txt' file.

# Load the CSV that maps case IDs to their correct answers
csv_file = 'case_references.csv'
case_references = pd.read_csv(csv_file)

# Collect one dict per output row; the dataframe is built once at the end
data_rows = []

# Fixed column order for the output; passing it to the DataFrame constructor
# guarantees a header row is written even when no case matched
output_columns = ['input', 'output', 'case number', 'case_id']

# Path to the JSON and TXT directories
json_dir = 'test set for Jesse json'
txt_dir = 'test set for Jesse txt'

# os.listdir returns entries in arbitrary order, so sort to make the order of
# the output rows reproducible across runs and platforms
for json_file in sorted(os.listdir(json_dir)):
    if not json_file.endswith('.json'):
        continue

    # Extract base case number (e.g., 'case_38' from 'case_38.json')
    base_case_number = os.path.splitext(json_file)[0]
    print(f"Processing {base_case_number}")

    # Read the JSON file with an explicit encoding instead of the platform default
    json_path = os.path.join(json_dir, json_file)
    with open(json_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # The JSON 'id' is the key used to look the case up in the CSV
    json_id = data['id']
    matching_case = case_references[case_references['case_id'] == json_id]
    if matching_case.empty:
        # No reference answer for this case: nothing to emit
        continue

    # If several CSV rows share the ID, the first one wins
    correct_answer_value = matching_case['correct_answer_value'].iloc[0]

    # The summary lives next to the case under '<case>_summary.txt'
    txt_path = os.path.join(txt_dir, f"{base_case_number}_summary.txt")
    try:
        with open(txt_path, 'r', encoding='utf-8') as file:
            txt_content = file.read()
    except FileNotFoundError:
        print(f"File not found: {txt_path}")
        continue

    data_rows.append({
        'input': txt_content,
        'output': correct_answer_value,
        'case number': base_case_number,
        'case_id': json_id
    })

# Create a new dataframe from the list of rows
new_df = pd.DataFrame(data_rows, columns=output_columns)

# Save the new dataframe to CSV
new_df.to_csv('test_set_for_jesse.csv', index=False)


Processing case_1
Processing case_10
Processing case_1000
Processing case_1001
Processing case_1002
Processing case_1005
Processing case_1006
Processing case_1007
Processing case_1008
Processing case_1009
Processing case_101
Processing case_1011
Processing case_1012
Processing case_1013
Processing case_1016
Processing case_1017
Processing case_1018
Processing case_102
Processing case_1020
Processing case_1021
Processing case_1023
Processing case_1025
Processing case_1026
Processing case_1027
Processing case_1028
Processing case_1029
Processing case_103
Processing case_1030
Processing case_1032
Processing case_1037
Processing case_1039
Processing case_104
Processing case_1041
Processing case_1043
Processing case_1044
Processing case_1045
Processing case_1046
Processing case_1047
Processing case_1048
Processing case_1049
Processing case_1050
Processing case_1052
Processing case_1053
Processing case_1054
Processing case_1055
Processing case_1057
Processing case_1059
Processing case_106
Pr

In [10]:
import json
import pandas as pd
import os
from tqdm import tqdm

def process_case_files(csv_file, json_dir, txt_dir, encoding='utf-8'):
    """Join case JSON files, reference answers and TXT summaries into one CSV.

    For every ``*.json`` file in ``json_dir`` the ``'id'`` field is read and
    looked up in the ``case_id`` column of ``csv_file``. When a match exists,
    ``<json stem>_summary.txt`` is read from ``txt_dir`` and one output row is
    recorded. Rows containing missing values are dropped, and the result is
    written to ``<suffix>_dataset.csv`` in the current working directory,
    where ``<suffix>`` is the text after the last underscore in the final
    path component of ``json_dir`` (e.g. ``'ref_case_jsons_train'`` ->
    ``'train_dataset.csv'``).

    Args:
        csv_file: Path to a CSV with ``case_id`` and ``correct_answer_value``
            columns.
        json_dir: Directory containing the per-case ``*.json`` files.
        txt_dir: Directory containing the ``*_summary.txt`` files.
        encoding: Text encoding used to read the JSON and TXT files.

    Returns:
        None. The dataset is written to disk and a status line is printed.

    Raises:
        KeyError: If a JSON object has no ``'id'`` key, or the CSV lacks the
            ``case_id`` / ``correct_answer_value`` columns.
    """
    case_references = pd.read_csv(csv_file)

    # Fixed column order; also guarantees a header row when nothing matches
    output_columns = ['input', 'output', 'case number', 'case_id']
    data_rows = []

    # Counters for cases that are skipped, so the skips are not silent
    unmatched_ids = 0
    missing_summaries = 0

    # os.listdir order is arbitrary; sort so the output row order is reproducible
    json_files = sorted(f for f in os.listdir(json_dir) if f.endswith('.json'))
    for json_file in tqdm(json_files, desc="Processing JSON files"):
        # 'case_38.json' -> 'case_38'
        base_case_number = os.path.splitext(json_file)[0]

        json_path = os.path.join(json_dir, json_file)
        with open(json_path, 'r', encoding=encoding) as file:
            data = json.load(file)

        # The JSON 'id' is the key used to look the case up in the CSV
        json_id = data['id']
        matching_case = case_references[case_references['case_id'] == json_id]
        if matching_case.empty:
            unmatched_ids += 1
            continue

        # If several CSV rows share the ID, the first one wins
        correct_answer_value = matching_case['correct_answer_value'].iloc[0]

        txt_path = os.path.join(txt_dir, f"{base_case_number}_summary.txt")
        try:
            with open(txt_path, 'r', encoding=encoding) as file:
                txt_content = file.read()
        except FileNotFoundError:
            # A case without a summary cannot form a row; count it and move on
            missing_summaries += 1
            continue

        data_rows.append({
            'input': txt_content,
            'output': correct_answer_value,
            'case number': base_case_number,
            'case_id': json_id
        })

    # Build the frame once and drop rows with NULL values (no in-place mutation)
    new_df = pd.DataFrame(data_rows, columns=output_columns).dropna()

    # Derive 'train' / 'test' / 'val' from the directory name itself. normpath
    # strips a trailing separator and basename drops parent directories, so
    # 'data/ref_case_jsons_train/' still yields 'train'.
    dataset_type = os.path.basename(os.path.normpath(json_dir)).split('_')[-1]
    output_csv_name = f"{dataset_type}_dataset.csv"
    new_df.to_csv(output_csv_name, index=False)
    print(f"Processed data saved to {output_csv_name}")

    if unmatched_ids or missing_summaries:
        print(
            f"Skipped {unmatched_ids} case(s) with no matching case_id and "
            f"{missing_summaries} case(s) with no summary file"
        )

In [None]:
# Build the dataset CSV for the training split
process_case_files(
    csv_file='case_references_train.csv',
    json_dir='ref_case_jsons_train',
    txt_dir='ref_case_txt_train',
)

In [11]:
# Build the dataset CSV for the test split
process_case_files(
    csv_file='case_references_test.csv',
    json_dir='ref_case_jsons_test',
    txt_dir='ref_case_txt_test',
)

Processing JSON files: 100%|████████████████████████████████████████████████████████████████████████| 3609/3609 [00:17<00:00, 200.85it/s]


Processed data saved to test_dataset.csv


In [None]:
# Build the dataset CSV for the validation split
process_case_files(
    csv_file='case_references_val.csv',
    json_dir='ref_case_jsons_val',
    txt_dir='ref_case_txt_val',
)

Processing JSON files:  45%|████████████████████████████████▍                                       | 1609/3574 [00:08<00:10, 192.97it/s]