In [1]:
import json
from collections import defaultdict

def reorder_evaluation_results(file_path):
    """Regroup a flat evaluation-results JSON file by response type and k.

    The input file is read as a JSON object mapping a result filename to its
    metrics. An entry is kept only when its filename

    * contains ``_k`` and the text between the last ``_k`` and the next ``.``
      parses as an integer (the k value), and
    * contains one of ``baseline``, ``CoT`` or ``manually_written`` (checked
      in that order; the first match wins).

    All other entries are left out of the result.

    The regrouped mapping is written as JSON (indent 4) next to the input:
    a trailing ``.json`` is replaced by ``_reorganized_by_type.json``; when
    the path does not end in ``.json`` the suffix is appended instead, so the
    input file is never overwritten.

    Args:
        file_path: Path (``str``) of the JSON file to read.

    Returns:
        dict: ``{'baseline': {...}, 'CoT': {...}, 'manually_written': {...}}``
        where each inner dict maps ``'k<N>'`` to the metrics value, ordered by
        ascending ``N``.

    Raises:
        OSError: If the input cannot be opened or the output cannot be written.
        json.JSONDecodeError: If the input is not valid JSON.
    """
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Insertion order here is also the priority order used for type matching.
    organized_results = {
        'baseline': {},
        'CoT': {},
        'manually_written': {}
    }

    for filename, metrics in data.items():
        if '_k' not in filename:
            continue

        # Text after the last '_k', up to the first '.', e.g. 'x_k10.json' -> '10'.
        k_token = filename.split('_k')[-1].split('.')[0]
        try:
            k_value = int(k_token)
        except ValueError:
            # '_k' belonged to something other than a k value (e.g. '_kshot');
            # skip it like any other unrecognised entry instead of aborting.
            continue
        k_key = f'k{k_value}'

        for response_type in organized_results:
            if response_type in filename:
                organized_results[response_type][k_key] = metrics
                break

    # Order each group numerically so that 'k10' follows 'k5', not 'k1'.
    for response_type in organized_results:
        organized_results[response_type] = dict(sorted(
            organized_results[response_type].items(),
            key=lambda x: int(x[0][1:])
        ))

    # Only touch the file extension: str.replace would also rewrite '.json'
    # inside directory names, and would return the input path unchanged (and
    # thus overwrite the source file) when there is no '.json' at all.
    input_suffix = '.json'
    output_suffix = '_reorganized_by_type.json'
    if file_path.endswith(input_suffix):
        output_path = file_path[:-len(input_suffix)] + output_suffix
    else:
        output_path = file_path + output_suffix

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(organized_results, f, indent=4)

    return organized_results

In [7]:
# Evaluation-results file regrouped by this cell.
# Alternative inputs, kept for reference (swap the path below to use one):
#   data/json_output/contractnli/gpt/evaluation_results.json
#   data/json_output/contractnli/llama3/evaluation_results.json
#   data/json_output/cuad/llama3/evaluation_results.json
#   data/json_output/maud/llama3/evaluation_results.json
file_path_privacyqa = (
    "data/json_output/privacyqa/llama3/evaluation_results.json"
)

# Kept as the cell's final expression so the regrouped dict is displayed.
reorder_evaluation_results(file_path_privacyqa)



{'baseline': {'k1': {'ragas_metrics': "{'faithfulness': 0.6983, 'answer_relevancy': 0.4213}",
   'bert_f1': 0.7957361574025498,
   'rouge_recall': 0.3459876085166887},
  'k3': {'ragas_metrics': "{'faithfulness': 0.8110, 'answer_relevancy': 0.4770}",
   'bert_f1': 0.7811541806176766,
   'rouge_recall': 0.194828315928701},
  'k5': {'ragas_metrics': "{'faithfulness': 0.8080, 'answer_relevancy': 0.4057}",
   'bert_f1': 0.7634375752861967,
   'rouge_recall': 0.13938168725750602},
  'k10': {'ragas_metrics': "{'faithfulness': 0.8161, 'answer_relevancy': 0.3318}",
   'bert_f1': 0.7436846549977961,
   'rouge_recall': 0.10017212409925386}},
 'CoT': {'k1': {'ragas_metrics': "{'faithfulness': 0.7283, 'answer_relevancy': 0.3144}",
   'bert_f1': 0.7860526384152088,
   'rouge_recall': 0.3671654825351862},
  'k3': {'ragas_metrics': "{'faithfulness': 0.8052, 'answer_relevancy': 0.4074}",
   'bert_f1': 0.7779750313955484,
   'rouge_recall': 0.21198786814006546},
  'k5': {'ragas_metrics': "{'faithfulness