In [None]:
import json


def calculate_llm_execution_match_score(file_path):
    """Print match counts and match rates for one evaluation result file.

    The file must be JSON with a top-level ``"detailed_results"`` list whose
    entries each carry a ``"metrics"`` dict holding the four flags
    ``llm_execution_match``, ``llm_exact_match``, ``exact_match`` and
    ``execution_match``. A flag counts as a success when it is truthy.

    Args:
        file_path: Path of the JSON result file to summarise.

    Returns:
        None. The summary is written to stdout.

    Raises:
        KeyError: If ``"detailed_results"``, an entry's ``"metrics"``, or one
            of the four metric flags is missing.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    detailed_results = data["detailed_results"]
    total_queries = len(detailed_results)

    # (metric key in the JSON, label on the "Successful ..." line, label on the score line)
    metric_table = (
        ("llm_execution_match", "llm_execution_matches", "llm_match_score"),
        ("llm_exact_match", "llm_result_matches", "llm_result_match_score"),
        ("exact_match", "exact_matches", "exact_match_score"),
        ("execution_match", "execution_matches", "execution_match_score"),
    )

    successes = {
        metric_key: sum(
            1 for result in detailed_results if result["metrics"][metric_key]
        )
        for metric_key, _, _ in metric_table
    }

    print(f"Total queries: {total_queries}")
    for metric_key, count_label, _ in metric_table:
        print(f"Successful {count_label}: {successes[metric_key]}")
    for metric_key, _, score_label in metric_table:
        # An empty result list has no meaningful rate; report 0 instead of
        # failing with ZeroDivisionError.
        score = successes[metric_key] / total_queries if total_queries else 0.0
        print(f"{score_label}: {score:.4f}")

In [28]:
import json

def find_missing_keys(file_path='../LLM-as-judge-result.full.json'):
    """Report result entries that lack expected top-level or metric keys.

    Loads a JSON result file, walks its ``"detailed_results"`` list and prints
    a line for every entry that is missing one of the expected main keys, one
    of the expected metric keys, or the ``"metrics"`` object altogether. For
    metric problems the whole entry is printed first so it can be inspected.

    Args:
        file_path: Path of the JSON result file to check. Defaults to the
            full LLM-as-judge result file.

    Returns:
        None. Findings are written to stdout; nothing is printed for entries
        that are complete.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    detailed_results = data['detailed_results']

    # Keys every result entry is expected to have.
    expected_keys = {
        'question',
        'generated',
        'ground_truth',
        'ground_truth_result',
        'generated_result',
        'metrics',
    }

    # Keys expected inside each entry's "metrics" dict.
    expected_metric_keys = {
        'exact_match',
        'execution_match',
        'llm_exact_match',
        'llm_execution_match',
    }

    for i, result in enumerate(detailed_results):
        missing_keys = expected_keys - result.keys()
        if missing_keys:
            print(f"\nEntry {i} is missing main keys: {missing_keys}")

        if 'metrics' not in result:
            print(result)
            print(f"Entry {i} is missing the entire metrics object")
            continue

        missing_metric_keys = expected_metric_keys - result['metrics'].keys()
        if missing_metric_keys:
            print(result)
            print(f"Entry {i} is missing metric keys: {missing_metric_keys}")

# Run the key check when this cell is executed directly; the guard keeps it
# from running if the code is imported as a module instead.
if __name__ == "__main__":
    find_missing_keys()

In [2]:
# part01
# NOTE(review): calculate_llm_execution_match_score is defined with a required
# file_path parameter, so this argument-less call raises TypeError on a fresh
# "Restart & Run All". Presumably the part01 results file was meant to be
# passed here — confirm the path and add it.
calculate_llm_execution_match_score()

Total queries: 129
Successful llm_execution_matches: 74
Score: 0.5736


In [6]:
# part02: summarise the match metrics of the part02 LLM-judge results file.
calculate_llm_execution_match_score('../results-LLM-judge.part02.json')

Total queries: 94
Successful llm_execution_matches: 61
Score: 0.6489


In [29]:
# Summarise the full LLM-as-judge result file.
# NOTE(review): the recorded total of 223 queries equals 129 + 94 from the two
# part runs, so this file is presumably the two parts combined — confirm.
calculate_llm_execution_match_score('../LLM-as-judge-result.full.json')

Total queries: 223
Successful llm_execution_matches: 135
Successful llm_result_matches: 127
Successful exact_matches: 1
Successful execution_matches: 116
llm_match_score: 0.6054
llm_result_match_score: 0.5695
exact_match_score: 0.0045
execution_match_score: 0.5202


In [None]:
import pandas as pd
import json

def merge_csv_with_json(
    csv_path='../data/GSV/generated-data/gen_success_data_results_LLM_judge.csv',
    json_path='../LLM-as-judge-result.full.json',
    output_path='../data/GSV/generated-data/gen_success_data_results_LLM_judge_updated.csv',
):
    """Copy the LLM-judge verdicts from a JSON result file into a CSV dataset.

    Rows of the CSV are matched to entries of the JSON ``"detailed_results"``
    list on the pair (question, ground-truth SQL), compared after stripping
    surrounding whitespace. For every matched row the entry's
    ``llm_exact_match`` and ``llm_execution_match`` flags are written into two
    new columns of the same names; unmatched rows keep ``None`` there and a
    warning is printed. The augmented table is saved to ``output_path``.

    If several JSON entries share the same (question, ground truth) pair, the
    last one wins.

    Args:
        csv_path: Input CSV; must have ``question`` and ``ground_truth_sql``
            columns.
        json_path: JSON result file with a ``"detailed_results"`` list.
        output_path: Where the CSV with the two extra columns is written.

    Returns:
        None. Progress, warnings and final statistics are printed to stdout.
    """
    quote_all = 1  # numeric value of csv.QUOTE_ALL

    def normalize(value):
        # Empty CSV cells are read back by pandas as float NaN, which has no
        # .strip(); pass non-strings through so such rows simply fail to match.
        return value.strip() if isinstance(value, str) else value

    df = pd.read_csv(
        csv_path,
        encoding='utf-8',
        quoting=quote_all,
        escapechar='\\',
        on_bad_lines='warn',
    )

    with open(json_path, 'r', encoding='utf-8') as f:
        json_data = json.load(f)
    detailed_results = json_data['detailed_results']

    # Show the first rows of both sources side by side so formatting
    # differences (trailing newlines, quoting) are easy to spot.
    print("First few entries from CSV:")
    for _, row in df.head(2).iterrows():
        print(f"\nCSV Question: {repr(row['question'])}")
        print(f"CSV Ground Truth: {repr(row['ground_truth_sql'])}")

    print("\nFirst few entries from JSON:")
    for result in detailed_results[:2]:
        print(f"\nJSON Question: {repr(result['question'])}")
        print(f"JSON Ground Truth: {repr(result['ground_truth'])}")

    # (question, ground truth) -> judge verdicts, for constant-time lookup.
    json_lookup = {}
    for result in detailed_results:
        key = (normalize(result['question']), normalize(result['ground_truth']))
        json_lookup[key] = {
            'llm_exact_match': result['metrics']['llm_exact_match'],
            'llm_execution_match': result['metrics']['llm_execution_match'],
        }

    exact_values = []
    execution_values = []
    matched_count = 0
    # Use the row position, not the index label, to limit the debug preview.
    for position, (_, row) in enumerate(df.iterrows()):
        question = normalize(row['question'])
        ground_truth = normalize(row['ground_truth_sql'])
        key = (question, ground_truth)

        if position < 2:
            print("\nTrying to match:")
            print(f"Question: {repr(question)}")
            print(f"Ground Truth: {repr(ground_truth)}")
            print(f"Available keys in json_lookup: {list(json_lookup.keys())[:2]}")

        if key in json_lookup:
            verdict = json_lookup[key]
            exact_values.append(verdict['llm_exact_match'])
            execution_values.append(verdict['llm_execution_match'])
            matched_count += 1
        else:
            exact_values.append(None)
            execution_values.append(None)
            print(f"Warning: No match found for question: {repr(question)}")
            print(f"Ground truth: {repr(ground_truth)}")

    # Build each column once; object dtype keeps None for unmatched rows.
    df['llm_exact_match'] = pd.Series(exact_values, index=df.index, dtype=object)
    df['llm_execution_match'] = pd.Series(execution_values, index=df.index, dtype=object)

    df.to_csv(
        output_path,
        index=False,
        encoding='utf-8',
        quoting=quote_all,
        escapechar='\\',
    )

    print("\nProcessing complete:")
    print(f"Total records in CSV: {len(df)}")
    print(f"Successfully matched records: {matched_count}")
    print(f"Total llm_exact_matches: {sum(df['llm_exact_match'].dropna())}")
    print(f"Total llm_execution_matches: {sum(df['llm_execution_match'].dropna())}")

# Run the merge when this cell is executed directly; note that it writes the
# updated CSV to disk as a side effect.
if __name__ == "__main__":
    merge_csv_with_json()

First few entries from CSV:

CSV Question: 'Có bao nhiêu lịch hẹn được tạo trong tháng 3 năm 2025?\n'
CSV Ground Truth: 'SELECT COUNT(*) FROM appointment WHERE YEAR(created_on) = 2025 AND MONTH(created_on) = 3;\n'

CSV Question: 'Liệt kê tất cả các chi nhánh hiện đang hoạt động.\n'
CSV Ground Truth: 'SELECT * FROM branch WHERE is_active = 1;\n'

First few entries from JSON:

JSON Question: 'Có bao nhiêu lịch hẹn được tạo trong tháng 3 năm 2025?\n'
JSON Ground Truth: 'SELECT COUNT(*) FROM appointment WHERE YEAR(created_on) = 2025 AND MONTH(created_on) = 3;\n'

JSON Question: 'Liệt kê tất cả các chi nhánh hiện đang hoạt động.\n'
JSON Ground Truth: 'SELECT * FROM branch WHERE is_active = 1;\n'

Trying to match:
Question: 'Có bao nhiêu lịch hẹn được tạo trong tháng 3 năm 2025?'
Ground Truth: 'SELECT COUNT(*) FROM appointment WHERE YEAR(created_on) = 2025 AND MONTH(created_on) = 3;'
Available keys in json_lookup: [('Có bao nhiêu lịch hẹn được tạo trong tháng 3 năm 2025?', 'SELECT COUNT(*) F

In [54]:
import pandas as pd

def calculate_metrics_by_level(
    csv_path='../data/GSV/generated-data/gen_success_data_results_LLM_judge_updated.csv',
):
    """Print LLM-judge match rates per difficulty level and overall.

    Reads a CSV that has ``level``, ``llm_exact_match`` and
    ``llm_execution_match`` columns. Only rows where both match columns are
    present (True or False, but not missing) are counted. Rows whose ``level``
    is missing are reported as their own group instead of being dropped.

    Args:
        csv_path: CSV file to analyse. Defaults to the updated LLM-judge
            dataset.

    Returns:
        None. The report is printed to stdout.
    """
    def ratio(hits, total):
        # With nothing to score, report 0 rather than dividing by zero.
        return hits / total if total else 0.0

    df = pd.read_csv(
        csv_path,
        encoding='utf-8',
        quoting=1,  # QUOTE_ALL mode
        escapechar='\\',
        on_bad_lines='warn',
    )

    # Keep rows that carry both verdicts, whatever their value.
    has_both = df['llm_exact_match'].notna() & df['llm_execution_match'].notna()
    validated_df = df[has_both]

    levels = validated_df['level']
    level_metrics = {}
    for level in levels.unique():
        # NaN never compares equal to itself, so a missing level needs isna()
        # to select its rows; `== level` would yield an empty group.
        in_level = levels.isna() if pd.isna(level) else levels == level
        level_data = validated_df[in_level]
        total = len(level_data)
        exact_matches = sum(level_data['llm_exact_match'])
        execution_matches = sum(level_data['llm_execution_match'])

        level_metrics[level] = {
            'total_questions': total,
            'exact_matches': exact_matches,
            'execution_matches': execution_matches,
            'exact_match_score': ratio(exact_matches, total),
            'execution_match_score': ratio(execution_matches, total),
        }

    print("Metrics by difficulty level (for questions with LLM metrics):")
    print("-" * 80)
    for level, metrics in level_metrics.items():
        print(f"\nLevel: {level}")
        print(f"Total questions: {metrics['total_questions']}")
        print(f"Exact matches: {metrics['exact_matches']} ({metrics['exact_match_score']*100:.2f}%)")
        print(f"Execution matches: {metrics['execution_matches']} ({metrics['execution_match_score']*100:.2f}%)")

    total_validated = len(validated_df)
    total_exact = sum(validated_df['llm_exact_match'])
    total_execution = sum(validated_df['llm_execution_match'])

    print("\nOverall statistics:")
    print(f"Total questions with LLM metrics: {total_validated}")
    print(f"Overall exact match score: {ratio(total_exact, total_validated)*100:.2f}%")
    print(f"Overall execution match score: {ratio(total_execution, total_validated)*100:.2f}%")

# Print the per-level report when this cell is executed directly.
if __name__ == "__main__":
    calculate_metrics_by_level()

Metrics by difficulty level (for questions with LLM metrics):
--------------------------------------------------------------------------------

Level: easy
Total questions: 77
Exact matches: 59 (76.62%)
Execution matches: 62 (80.52%)

Level: hard
Total questions: 147
Exact matches: 68 (46.26%)
Execution matches: 73 (49.66%)

Overall statistics:
Total questions with LLM metrics: 224
Overall exact match score: 56.70%
Overall execution match score: 60.27%
