### Variance in assigning replaceability scores across models

In [3]:
import os
import json
import numpy as np

def calculate_pooled_std(results):
    """Return the pooled standard deviation across the score groups.

    Each group's sample variance (``ddof=1``) is weighted by its degrees of
    freedom ``n - 1``::

        sqrt(sum((n_i - 1) * s_i**2) / sum(n_i - 1))

    When every group has the same size this equals the square root of the
    plain mean of the group variances.

    Args:
        results: Mapping from a label to a sequence of numeric scores.

    Returns:
        The pooled standard deviation as a float, or ``0.0`` when no group
        has at least two scores (this includes an empty mapping).
    """
    weighted_variance_sum = 0.0
    total_dof = 0

    for scores in results.values():
        dof = len(scores) - 1
        if dof < 1:
            # A group with fewer than two scores has no sample variance; it
            # must not contribute to the numerator or the denominator.
            continue
        weighted_variance_sum += dof * np.var(scores, ddof=1)
        total_dof += dof

    if total_dof == 0:
        # Nothing to pool; avoid dividing by zero.
        return 0.0

    return float((weighted_variance_sum / total_dof) ** 0.5)

def compute_for_model(model_folder):
    """Load a model's ``output.json`` and return its pooled standard deviation.

    Args:
        model_folder: Directory that contains the file ``output.json``.

    Returns:
        Whatever ``calculate_pooled_std`` returns for the parsed JSON content.
        That function iterates the content with ``.items()`` and takes
        ``len()`` of each value, so the file is expected to hold a JSON
        object mapping labels to lists of scores.
    """
    output_path = os.path.join(model_folder, 'output.json')

    with open(output_path, 'r') as handle:
        scores_by_label = json.load(handle)

    return calculate_pooled_std(scores_by_label)

# Sub-folders of ``base_dir`` to evaluate; each one is passed to
# ``compute_for_model``, which reads ``output.json`` from it.
models = ['gpt', 'claude', 'gemini', 'llama']
base_dir = 'test_error'

# Print one pooled standard deviation per model, to four decimal places.
for model in models:
    model_folder = os.path.join(base_dir, model)
    pooled_std = compute_for_model(model_folder)
    report_line = f"Pooled Standard Deviation for {model}: {pooled_std:.4f}"
    print(report_line)

Pooled Standard Deviation for gpt: 0.3720
Pooled Standard Deviation for claude: 0.2575
Pooled Standard Deviation for gemini: 0.0000
Pooled Standard Deviation for llama: 0.1638
