# Correctness Checker

This is just a tool to see if the correctness values that a LMM came up with are no pure random values but stable from run to run,

## Helper Functions

Calculated a standard deviations for all results to check if they are consistent

In [1]:
import glob
import pandas as pd

def check_correctness_values(glob_pattern):
    df_list = []

    for i, filepath in enumerate(glob.glob(glob_pattern)):
        
        df = pd.read_csv(filepath)
        df['result'] = i + 1
        df['correctness'] = pd.to_numeric(df['correctness']) * 100 # x100 because we don't like floating point numbers
        df_list.append(df)
        
    dfs = pd.concat(df_list).reset_index(names='text_line')
    dfs_pivoted = dfs.pivot_table(index='text_line', columns='result', values='correctness')
    dfs_pivoted['std'] = dfs_pivoted.std(axis=1)
    return dfs_pivoted.sort_values(by=['std'], ascending=False) / 100 # get back to floating point numbers to have results like a pro

## Check Learning Goals

In [2]:
results = check_correctness_values("temp/*lg_results.csv")
print(f"mean std: {results['std'].mean()}")
results.head(20)

mean std: 0.0024345905588127894


result,1,2,3,4,5,6,7,8,9,10,11,std
text_line,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
12,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.9,1.0,0.9,1.0,0.040452
23,0.9,0.9,0.9,0.9,0.9,0.9,0.9,0.9,0.9,0.8,0.9,0.030151
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
3,0.9,0.9,0.9,0.9,0.9,0.9,0.9,0.9,0.9,0.9,0.9,0.0
2,0.9,0.9,0.9,0.9,0.9,0.9,0.9,0.9,0.9,0.9,0.9,0.0
4,0.9,0.9,0.9,0.9,0.9,0.9,0.9,0.9,0.9,0.9,0.9,0.0
5,0.9,0.9,0.9,0.9,0.9,0.9,0.9,0.9,0.9,0.9,0.9,0.0
7,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
6,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


## Check translations

In [3]:
results = check_correctness_values("temp/*text_results.csv")
print(f"mean std: {results['std'].mean()}")
results.head(20)

mean std: 0.0014670778907201207


result,1,2,3,4,5,6,7,8,9,10,std
text_line,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
122,1.0,0.9,1.0,0.9,1.0,0.9,1.0,1.0,0.9,1.0,0.05164
268,0.8,0.8,0.9,0.9,0.8,0.9,0.9,0.9,0.9,0.8,0.05164
176,0.9,0.8,0.9,0.9,0.8,0.8,0.9,0.8,0.9,0.9,0.05164
191,0.9,1.0,1.0,0.9,1.0,1.0,1.0,1.0,1.0,0.9,0.048305
89,0.9,1.0,1.0,0.9,0.9,0.9,0.9,0.9,0.9,0.9,0.042164
235,0.9,0.9,0.9,0.9,0.9,0.9,1.0,1.0,0.9,0.9,0.042164
243,0.8,0.8,0.8,0.8,0.9,0.8,0.8,0.8,0.8,0.8,0.031623
76,0.9,0.9,0.9,0.8,0.9,0.9,0.9,0.9,0.9,0.9,0.031623
102,0.8,0.8,0.8,0.8,0.8,0.8,0.8,0.9,0.8,0.8,0.031623
207,0.6,0.6,0.6,0.6,0.6,0.6,0.6,0.6,0.7,0.6,0.031623
