In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

from itertools import combinations
from scipy.stats import shapiro, probplot, ttest_rel, wilcoxon, kruskal, mannwhitneyu

In [2]:
# Load the merged summary-statistics table that every later cell works on.
df = pd.read_csv('../../output/summary-statistics/merged_summary_statistics.csv')

# Score columns compared in the tests below.
metrics = [
    'Class_Compliance',
    'Property_Compliance',
    'Class_Coverage',
    'Property_Coverage',
]

# Distinct values of the `model` column, in order of first appearance.
models = pd.unique(df['model'])

In [3]:
# Display the distinct values of the `model` column collected above.
models

array(['llava-llama3', 'llama4-scout', 'llama4-maverick', 'gpt-o1',
       'gpt-4.1-nano'], dtype=object)

In [4]:
# Shapiro-Wilk normality check on the paired differences (Class - Property)
# of the coverage and compliance scores, one test per (model, method) group.


def _shapiro_p_value(values):
    """Return the Shapiro-Wilk p-value of `values`, or NaN if the test is undefined.

    When every value is identical the sample has zero range; SciPy only emits
    a warning in that case and the p-value it returns says nothing about
    normality, so NaN is reported instead of a spurious "normal" verdict.
    """
    if values.nunique() < 2:
        return float('nan')
    _, p_value = shapiro(values)
    return p_value


normality_results = []
grouped = df.groupby(['model', 'method'])

for (model, method), group in grouped:
    # Drop incomplete pairs before testing: a NaN difference carries no
    # information and, left in, turns the whole test result into NaN.
    cov_diff = (group['Class_Coverage'] - group['Property_Coverage']).dropna()
    com_diff = (group['Class_Compliance'] - group['Property_Compliance']).dropna()

    # shapiro() needs at least three observations; count only the valid ones.
    if len(cov_diff) < 3 or len(com_diff) < 3:
        continue

    cov_p = _shapiro_p_value(cov_diff)
    com_p = _shapiro_p_value(com_diff)
    normality_results.append({
        'model': model,
        'method': method,
        'n': len(group),
        'Coverage_p': cov_p,
        # A NaN p-value compares False, i.e. it is never reported as normal.
        'Coverage_normal': cov_p > 0.05,
        'Compliance_p': com_p,
        'Compliance_normal': com_p > 0.05,
        # Number of complete pairs that actually entered each test.
        'Coverage_n': len(cov_diff),
        'Compliance_n': len(com_diff),
    })

# Explicit columns keep the schema stable even when no group qualified.
normality_df = pd.DataFrame(
    normality_results,
    columns=[
        'model', 'method', 'n',
        'Coverage_p', 'Coverage_normal',
        'Compliance_p', 'Compliance_normal',
        'Coverage_n', 'Compliance_n',
    ],
)

  res = hypotest_fun_out(*samples, **kwds)


In [5]:
# For every model, compare each metric between the 'observation' and
# 'action' graph types with a two-sided Mann-Whitney U test.
results = []

for model in models:
    model_rows = df[df['model'] == model]
    # Split once per model; the split does not depend on the metric.
    observation_rows = model_rows[model_rows['graph_type'] == 'observation']
    action_rows = model_rows[model_rows['graph_type'] == 'action']

    for metric in metrics:
        obs_values = observation_rows[metric].dropna()
        act_values = action_rows[metric].dropna()

        # Each side needs at least two observations to be compared.
        if len(obs_values) <= 1 or len(act_values) <= 1:
            continue

        _, p_value = mannwhitneyu(obs_values, act_values, alternative='two-sided')
        results.append(dict(
            model=model,
            metric=metric,
            observation_median=obs_values.median(),
            action_median=act_values.median(),
            p_value=p_value,
            significant=p_value < 0.05,
        ))

modelwise_test_df = pd.DataFrame(results)

# Print only the comparisons flagged as significant at the 0.05 level.
is_significant = modelwise_test_df['significant']
print(modelwise_test_df[is_significant])

              model               metric  observation_median  action_median  \
10  llama4-maverick       Class_Coverage                0.50       0.833333   
11  llama4-maverick    Property_Coverage                1.00       0.875000   
13           gpt-o1  Property_Compliance                0.80       0.875000   
14           gpt-o1       Class_Coverage                0.75       0.833333   

     p_value  significant  
10  0.000014         True  
11  0.046159         True  
13  0.003401         True  
14  0.013291         True  


In [6]:
# Add average metrics (mean of the class-level and property-level score).
# NOTE: the columns are written onto `df` itself, so every later cell that
# reads `df` sees them as well.
df['Avg_Compliance'] = (df['Class_Compliance'] + df['Property_Compliance']) / 2
df['Avg_Coverage'] = (df['Class_Coverage'] + df['Property_Coverage']) / 2

avg_metrics = ['Avg_Compliance', 'Avg_Coverage']

# Get unique (model, method) pairs
model_method_pairs = df[['model', 'method']].drop_duplicates().values.tolist()

# Split the frame once and keep the NaN-free values of each averaged metric
# per (model, method) group, instead of re-filtering `df` for every pair.
values_by_pair = {
    key: {metric: group[metric].dropna() for metric in avg_metrics}
    for key, group in df.groupby(['model', 'method'])
}

# Compare each pair of groups with a two-sided Mann-Whitney U test.
# NOTE(review): the p-values are not corrected for multiple comparisons;
# with every pair tested on two metrics, some p < 0.05 results are expected
# by chance alone. Consider a correction before interpreting them.
results = []

for (model1, method1), (model2, method2) in combinations(model_method_pairs, 2):
    series1 = values_by_pair.get((model1, method1))
    series2 = values_by_pair.get((model2, method2))
    if series1 is None or series2 is None:
        # A pair with a missing model or method matches no rows at all.
        continue

    for metric in avg_metrics:
        values1 = series1[metric]
        values2 = series2[metric]

        # Ensure there's enough data to compare
        if len(values1) > 1 and len(values2) > 1:
            stat, p = mannwhitneyu(values1, values2, alternative='two-sided')
            results.append({
                'model_method_1': f'{model1} | {method1}',
                'model_method_2': f'{model2} | {method2}',
                'metric': metric,
                'p_value': p,
                'significant': p < 0.05,
                'median_1': values1.median(),
                'median_2': values2.median()
            })

# Create a results DataFrame. The explicit column list keeps the schema
# intact when no comparison qualified, so the filter below cannot fail.
diff_results_df = pd.DataFrame(
    results,
    columns=[
        'model_method_1', 'model_method_2', 'metric',
        'p_value', 'significant', 'median_1', 'median_2',
    ],
)

# Show significant differences, smallest p-value first.
significant_results = diff_results_df[diff_results_df['significant'].astype(bool)]
print(significant_results.sort_values(by='p_value'))

                 model_method_1           model_method_2          metric  \
54          llava-llama3 | i2kg            gpt-o1 | i2kg  Avg_Compliance   
222       llama4-maverick | dpe            gpt-o1 | i2kg  Avg_Compliance   
138          llama4-scout | dpe            gpt-o1 | i2kg  Avg_Compliance   
57          llava-llama3 | i2kg            gpt-o1 | d2kg    Avg_Coverage   
293               gpt-o1 | d2kg  gpt-4.1-nano | d2kg-rag    Avg_Coverage   
..                          ...                      ...             ...   
87          llava-llama3 | d2kg            gpt-o1 | d2kg    Avg_Coverage   
266  llama4-maverick | d2kg-rag            gpt-o1 | d2kg  Avg_Compliance   
170         llama4-scout | i2kg      gpt-4.1-nano | d2kg  Avg_Compliance   
267  llama4-maverick | d2kg-rag            gpt-o1 | d2kg    Avg_Coverage   
240      llama4-maverick | i2kg            gpt-o1 | d2kg  Avg_Compliance   

          p_value  significant  median_1  median_2  
54   5.276898e-09         True  0.

In [7]:
# Aggregate the table per (model, graph_type, method) and write it to CSV.

# Cast the parse flag to bool so it is summed (counted) rather than picked
# up by the numeric-column selection below and averaged.
# NOTE(review): astype(bool) turns NaN into True - confirm the column has
# no missing values, otherwise they are counted as successful parses.
df['Full_Parse_OK'] = df['Full_Parse_OK'].astype(bool)

# Every float64/int64 column except `run` is averaged; this includes any
# numeric column added to `df` by earlier cells.
float_columns = df.select_dtypes(include=['float64', 'int64']).columns.difference(['run'])
aggregation_functions = {
    'Full_Parse_OK': 'sum',
    **dict.fromkeys(float_columns, 'mean'),
}

aggregated_df = (
    df.groupby(['model', 'graph_type', 'method'])
    .agg(aggregation_functions)
    .reset_index()
    # After summing, the flag column holds a count of successful parses.
    .rename(columns={'Full_Parse_OK': 'Full_Parse_OK_Count'})
)
aggregated_df.to_csv('../../output/summary-statistics/aggregated_summary_statistics.csv', index=False)