In [19]:
import numpy as np
from scipy.stats import wilcoxon

# Data for GPT-3.5-Turbo
# Layout: dataset name -> {'baseline': [...], <model key>: {<framework>: [...]}}.
# Every active list holds three values; process_and_test() pairs each framework
# list position-by-position with the dataset's 'baseline' list.
# NOTE(review): values look like per-run accuracy fractions in [0, 1] — confirm.
# NOTE(review): the role of the inner 'gpt-3.5-turbo' / 'claude-sonnet' keys
# (as opposed to the model named in this heading) is not stated here — confirm.
# The 'CoT' / 'Emotive' rows stay commented out: process_and_test() treats every
# key other than 'baseline' as a dict of frameworks, so a bare list would fail.
gpt_data = {
    'GSM8K': {
        'baseline': [0.867, 0.800, 0.900],
        # 'CoT': [0.633, 0.667, 0.600],
        # 'Emotive': [0.767, 0.767, 0.700],
        'gpt-3.5-turbo': {
            'Authoritarian': [0.833, 0.867, 0.367],
            'Market': [0.800, 0.800, 0.600],
            'Hierarchical': [0.733, 0.767, 0.900]
        },
        'claude-sonnet': {
            'Authoritarian': [0.767, 0.667, 0.767],
            'Market': [0.733, 0.800, 0.000],
            'Hierarchical': [0.100, 0.500, 0.767]
        }
    },
    'SST-2': {
        'baseline': [0.967, 0.967, 0.967],
        # 'CoT': [0.633, 0.600, 0.633],
        # 'Emotive': [0.967, 0.967, 0.967],
        'gpt-3.5-turbo': {
            'Authoritarian': [0.933, 0.900, 0.967],
            'Market': [0.900, 0.967, 0.933],
            'Hierarchical': [0.933, 0.967, 0.933]
        },
        'claude-sonnet': {
            # Identical to the baseline, so every paired difference is zero.
            'Authoritarian': [0.967, 0.967, 0.967],
            'Market': [0.767, 0.867, 0.800],
            'Hierarchical': [0.967, 0.900, 0.933]
        }
    },
    'HumanEval': {
        'baseline': [0.800, 0.800, 0.800],
        # 'CoT': [0.867],
        # 'Emotive': [0.767],
        'gpt-3.5-turbo': {
            'Authoritarian': [0.800, 0.833, 0.800],
            'Market': [0.767, 0.700, 0.767],
            'Hierarchical': [0.767, 0.633, 0.833]
        },
        'claude-sonnet': {
            'Authoritarian': [0.833, 0.467, 0.867],
            'Market': [0.767, 0.767, 0.800],
            'Hierarchical': [0.833, 0.833, 0.800]
        }
    }
}

# Data for Llama-3.1
# Layout: dataset name -> {'baseline': [...], <model key>: {<framework>: [...]}}.
# Every active list holds three values; process_and_test() pairs each framework
# list position-by-position with the dataset's 'baseline' list.
# NOTE(review): values look like per-run accuracy fractions in [0, 1] — confirm.
# The 'CoT' / 'Emotive' rows stay commented out: process_and_test() treats every
# key other than 'baseline' as a dict of frameworks, so a bare list would fail.
llama_data = {
    'GSM8K': {
        'baseline': [0.033, 0.033, 0.033],
        # 'CoT': [0.100, 0.100, 0.100],
        # 'Emotive': [0.067, 0.067, 0.067],
        'gpt-3.5-turbo': {
            'Authoritarian': [0.667, 0.300, 0.033],
            'Market': [0.133, 0.433, 0.033],
            'Hierarchical': [0.033, 0.033, 0.067]
        },
        'claude-sonnet': {
            'Authoritarian': [0.000, 0.267, 0.567],
            'Market': [0.133, 0.167, 0.000],
            'Hierarchical': [0.000, 0.033, 0.267]
        }
    },
    'SST-2': {
        'baseline': [0.800, 0.800, 0.800],
        # 'CoT': [0.800, 0.800, 0.833],
        # 'Emotive': [0.800, 0.800, 0.800],
        'gpt-3.5-turbo': {
            'Authoritarian': [0.800, 0.900, 0.967],
            'Market': [0.800, 0.967, 0.800],
            'Hierarchical': [0.967, 0.967, 0.867]
        },
        'claude-sonnet': {
            'Authoritarian': [0.567, 0.967, 0.867],
            'Market': [0.667, 0.067, 0.233],
            'Hierarchical': [0.800, 0.900, 0.900]
        }
    },
    'HumanEval': {
        'baseline': [0.600, 0.600, 0.600],
        # 'CoT': [0.667],
        # 'Emotive': [0.667],
        'gpt-3.5-turbo': {
            'Authoritarian': [0.167, 0.233, 0.600],
            # NOTE(review): 0.666 breaks the rounding used elsewhere in this
            # file (0.667) — possibly a typo; confirm against the raw run.
            'Market': [0.666, 0.700, 0.500],
            'Hierarchical': [0.733, 0.433, 0.567]
        },
        'claude-sonnet': {
            'Authoritarian': [0.400, 0.167, 0.467],
            'Market': [0.600, 0.667, 0.567],
            'Hierarchical': [0.600, 0.467, 0.600]
        }
    }
}

# Data for Mistral
# Layout: dataset name -> {'baseline': [...], <model key>: {<framework>: [...]}}.
# Every active list holds three values; process_and_test() pairs each framework
# list position-by-position with the dataset's 'baseline' list.
# NOTE(review): values look like per-run accuracy fractions in [0, 1] — confirm.
# The 'CoT' / 'Emotive' rows stay commented out: process_and_test() treats every
# key other than 'baseline' as a dict of frameworks, so a bare list would fail.
mistral_data = {
    'GSM8K': {
        'baseline': [0.200, 0.200, 0.200],
        # 'CoT': [0.267, 0.267, 0.267],
        # 'Emotive': [0.067, 0.067, 0.067],
        'gpt-3.5-turbo': {
            'Authoritarian': [0.400, 0.000, 0.033],
            'Market': [0.533, 0.333, 0.433],
            'Hierarchical': [0.233, 0.467, 0.400]
        },
        'claude-sonnet': {
            'Authoritarian': [0.000, 0.533, 0.300],
            'Market': [0.233, 0.467, 0.000],
            'Hierarchical': [0.000, 0.000, 0.200]
        }
    },
    'SST-2': {
        'baseline': [0.667, 0.667, 0.667],
        # 'CoT': [0.700, 0.700, 0.700],
        # 'Emotive': [0.700, 0.733, 0.700],
        'gpt-3.5-turbo': {
            'Authoritarian': [0.833, 0.867, 0.900],
            'Market': [0.567, 0.833, 0.733],
            'Hierarchical': [0.833, 0.933, 0.667]
        },
        'claude-sonnet': {
            'Authoritarian': [0.133, 0.800, 0.533],
            'Market': [0.667, 0.600, 0.033],
            'Hierarchical': [0.633, 0.667, 0.800]
        }
    },
    'HumanEval': {
        'baseline': [0.467, 0.467, 0.467],
        # 'CoT': [0.433],
        # 'Emotive': [0.467],
        'gpt-3.5-turbo': {
            'Authoritarian': [0.333, 0.533, 0.500],
            'Market': [0.200, 0.467, 0.400],
            'Hierarchical': [0.367, 0.333, 0.467]
        },
        'claude-sonnet': {
            'Authoritarian': [0.400, 0.133, 0.233],
            'Market': [0.400, 0.467, 0.300],
            'Hierarchical': [0.467, 0.333, 0.467]
        }
    }
}

# Perform Wilcoxon Signed-Rank Test
def perform_wilcoxon_test(baseline, framework_scores, *, alternative='two-sided'):
    """Run a paired Wilcoxon signed-rank test of baseline vs. framework scores.

    Args:
        baseline: Sequence of baseline scores, one per paired observation.
        framework_scores: Sequence of framework scores with the same shape as
            ``baseline``; values are paired position by position.
        alternative: Forwarded to ``scipy.stats.wilcoxon``. ``'two-sided'``
            (the default) keeps the original behaviour. The tested
            differences are ``baseline - framework_scores``, so ``'less'``
            asks whether the framework scores tend to be higher than the
            baseline and ``'greater'`` whether they tend to be lower.

    Returns:
        ``(stat, p_value)`` as returned by ``scipy.stats.wilcoxon``, or
        ``(None, None)`` when every paired difference is zero, in which case
        the test is skipped.

    Raises:
        ValueError: If the two samples do not have the same shape.
    """
    baseline_arr = np.asarray(baseline)
    framework_arr = np.asarray(framework_scores)
    # Reject mismatched samples up front: NumPy would otherwise broadcast a
    # length-1 sample against a longer one and the zero check below could
    # silently report "all differences are zero" for unpaired data.
    if baseline_arr.shape != framework_arr.shape:
        raise ValueError(
            "baseline and framework_scores must have the same shape, got "
            f"{baseline_arr.shape} and {framework_arr.shape}"
        )

    differences = baseline_arr - framework_arr
    if np.all(differences == 0):
        return None, None

    # Passing the differences directly is the one-sample form of the paired
    # test; it avoids recomputing the subtraction inside scipy.
    stat, p_value = wilcoxon(differences, alternative=alternative)
    return stat, p_value

# General function to process data and perform tests
def process_and_test(data):
    """Run the signed-rank test for every framework of every model in *data*.

    *data* maps a dataset name to a dict that holds a ``'baseline'`` score
    list plus one dict per model key, each mapping a framework name to its
    score list. The ``'baseline'`` entry is the reference sample and is never
    tested against itself.

    Returns a dict with the same dataset -> model -> framework nesting whose
    leaves are ``{'stat': ..., 'p_value': ...}`` as produced by
    ``perform_wilcoxon_test``.
    """

    def _as_record(reference, scores):
        # Package one test outcome under the keys the reporting code reads.
        statistic, p = perform_wilcoxon_test(reference, scores)
        return {'stat': statistic, 'p_value': p}

    return {
        dataset: {
            model: {
                # 'baseline' is looked up lazily, only when a framework exists.
                framework: _as_record(entries['baseline'], scores)
                for framework, scores in frameworks.items()
            }
            for model, frameworks in entries.items()
            if model != 'baseline'
        }
        for dataset, entries in data.items()
    }

# Process and test all data
# Each call returns a nested dict:
# dataset -> model key -> framework -> {'stat': ..., 'p_value': ...}.
# The GPT-3.5-Turbo outcome is bound to the plain name `results`; the other
# two carry a model prefix.
results = process_and_test(gpt_data)
llama_results = process_and_test(llama_data)
mistral_results = process_and_test(mistral_data)

# Print results function
def print_results(title, results):
    """Print the raw test statistic and p-value for every framework.

    *results* is the nested dict built by ``process_and_test``:
    dataset -> model -> framework -> ``{'stat': ..., 'p_value': ...}``.
    A ``None`` statistic marks a test that was skipped because all paired
    differences were zero.
    """
    print(title)
    for dataset_name, per_model in results.items():
        print(f"\nDataset: {dataset_name}")
        for model_name, per_framework in per_model.items():
            print(f"Model: {model_name}")
            for framework_name, outcome in per_framework.items():
                statistic = outcome['stat']
                if statistic is not None:
                    print(f"  Framework: {framework_name}, Test Statistic: {statistic}, p-value: {outcome['p_value']}")
                    continue
                # Skipped test: there is no statistic or p-value to show.
                print(f"  Framework: {framework_name}, Test cannot be performed (all differences are zero)")

# Print all results
# The leading "\n" in the second and third titles puts a blank line between
# one model's section and the next.
print_results("GPT-3.5-Turbo Results:", results)
print_results("\nLlama-3.1 Results:", llama_results)
print_results("\nMistral Results:", mistral_results)

# Interpretation of results function
def interpret_results(title, results, alpha=0.05):
    """Print a significance verdict for every framework test in *results*.

    Args:
        title: Heading printed before the verdicts.
        results: Nested dict, dataset -> model -> framework ->
            ``{'stat': ..., 'p_value': ...}``. A ``None`` statistic marks a
            test that was skipped because all paired differences were zero.
        alpha: Significance threshold; a p-value strictly below it is
            reported as significant.

    The verdict is worded as a *difference from baseline*, not an
    improvement: each result carries only a statistic and a p-value, with no
    record of whether the framework scored above or below the baseline, so a
    small p-value alone cannot tell an improvement from a degradation.
    """
    print(title)
    for dataset, models in results.items():
        print(f"\nDataset: {dataset}")
        for model, frameworks in models.items():
            print(f"Model: {model}")
            for framework, result in frameworks.items():
                if result['stat'] is None:
                    print(f"  {framework} framework: Test cannot be performed (all differences are zero)")
                    continue
                p_value = result['p_value']
                if p_value < alpha:
                    print(f"  {framework} framework shows a significant difference from baseline (p-value: {p_value})")
                else:
                    print(f"  {framework} framework does not show a significant difference from baseline (p-value: {p_value})")

# Interpret all results
# Uses interpret_results' default alpha of 0.05 for every model.
# NOTE(review): every test here has only three pairs. The exact two-sided
# signed-rank p-value for n=3 cannot go below 0.25, so no framework can be
# flagged at alpha=0.05 — more runs per condition are likely needed; confirm.
interpret_results("\nGPT-3.5-Turbo Results Interpretation:", results)
interpret_results("\nLlama-3.1 Results Interpretation:", llama_results)
interpret_results("\nMistral Results Interpretation:", mistral_results)


GPT-3.5-Turbo Results:

Dataset: GSM8K
Model: gpt-3.5-turbo
  Framework: Authoritarian, Test Statistic: 2.0, p-value: 0.75
  Framework: Market, Test Statistic: 0.0, p-value: 0.17971249487899976
  Framework: Hierarchical, Test Statistic: 0.0, p-value: 0.17971249487899976
Model: claude-sonnet
  Framework: Authoritarian, Test Statistic: 0.0, p-value: 0.25
  Framework: Market, Test Statistic: 0.0, p-value: 0.17971249487899976
  Framework: Hierarchical, Test Statistic: 0.0, p-value: 0.25

Dataset: SST-2
Model: gpt-3.5-turbo
  Framework: Authoritarian, Test Statistic: 0.0, p-value: 0.17971249487899976
  Framework: Market, Test Statistic: 0.0, p-value: 0.17971249487899976
  Framework: Hierarchical, Test Statistic: 0.0, p-value: 0.15729920705028505
Model: claude-sonnet
  Framework: Authoritarian, Test cannot be performed (all differences are zero)
  Framework: Market, Test Statistic: 0.0, p-value: 0.25
  Framework: Hierarchical, Test Statistic: 0.0, p-value: 0.17971249487899976

Dataset: Human