## Evaluation results

### Overall association statistics

The tables in this section show the standard association metrics between human scores and different types of machine scores. These results are computed on the evaluation set. The scores for each model have been truncated to [min-0.4998, max+0.4998]. When indicated, scaled scores are computed by re-scaling the predicted scores using the mean and standard deviation of human scores as observed on the training data and the mean and standard deviation of machine scores as predicted for the training set.


In [None]:
def read_evals(model_list):
    """
    Gather the short evaluation results of several models into one frame.

    Parameters
    ----------
    model_list : iterable of (model_id, config, csvdir) tuples
        ``model_id`` is used to build the file names
        ``<model_id>_eval_short.csv`` and ``<model_id>_consistency.csv``
        that are looked up inside ``csvdir``. ``config`` must support
        item access for the keys ``'use_scaled_predictions'`` and
        ``'scale_with'``.

    Returns
    -------
    pandas.DataFrame
        One row per model whose ``_eval_short.csv`` file exists, indexed
        by model id. The frame is empty if no such file was found.
    """
    # (output column, column in the consistency file), in insertion order
    human_human_columns = (('h-h-wtkappa', 'wtkappa'),
                           ('h-h-corr', 'corr'),
                           ('h-h-exact_agr', 'exact_agr'),
                           ('h-h-adj_agr', 'adj_agr'),
                           ('h-h-kappa', 'kappa'))
    per_model_frames = []

    for model_id, config, csvdir in model_list:
        eval_path = os.path.join(csvdir, '{}_eval_short.csv'.format(model_id))
        if not os.path.exists(eval_path):
            # models without a results file are left out of the summary
            continue

        model_frame = pd.read_csv(eval_path, index_col=0)
        model_frame.index = [model_id]

        # record whether the evaluated scores were scaled or raw
        is_scaled = (config['use_scaled_predictions'] == True
                     or config['scale_with'] is not None)
        model_frame['system score type'] = 'scale' if is_scaled else 'raw'

        # drop the scale/raw suffix that follows the first '.' in each
        # column name, but keep a marker on metrics based on rounded scores
        cleaned_names = []
        for column in model_frame.columns:
            if 'round' in column:
                cleaned_names.append('{} (rounded)'.format(column.split('.')[0]))
            else:
                cleaned_names.append(column.split('.')[0])
        model_frame.columns = cleaned_names

        # human-human agreement lives in a separate, optional file;
        # only its first row is used
        consistency_path = os.path.join(csvdir,
                                        '{}_consistency.csv'.format(model_id))
        if os.path.exists(consistency_path):
            first_row = pd.read_csv(consistency_path, index_col=0).iloc[0]
            for target_column, source_column in human_human_columns:
                model_frame[target_column] = first_row[source_column]

        per_model_frames.append(model_frame)

    if not per_model_frames:
        return pd.DataFrame()
    return pd.concat(per_model_frames)

# Build the combined evaluation table for all models and, when at least
# one model contributed results, save it next to the other summary outputs.
df_eval = read_evals(model_list)
if not df_eval.empty:
    summary_eval_path = join(output_dir, '{}_eval_short.csv'.format(summary_id))
    df_eval.to_csv(summary_eval_path)

In [None]:
# Rename columns and add additional comparison columns

# Nothing is done when no evaluation results were found: an empty frame
# has none of the columns referenced below.
if not df_eval.empty:

    def _to_numeric_if_possible(column):
        """Return `column` converted to numbers, or unchanged if that fails."""
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            # e.g. the 'system score type' column holds plain strings
            return column

    # Ensure that all numeric columns are actually numbers
    # (They probably are, but just in case).
    # `apply` returns a new frame, so the result has to be assigned back.
    df_eval = df_eval.apply(_to_numeric_if_possible)

    # Rename columns to clarify human-machine vs human-human
    df_eval = df_eval.rename(columns={'corr': 'h-m-corr',
                                      'wtkappa (rounded)': 'h-m-wtkappa (rounded)',
                                      'exact_agr (rounded)': 'h-m-exact_agr (rounded)',
                                      'adj_agr (rounded)': 'h-m-adj_agr (rounded)',
                                      'kappa (rounded)': 'h-m-kappa (rounded)'})

    # (human-human column, human-machine column, difference column)
    comparison_columns = [
        ('h-h-corr', 'h-m-corr', 'h-h-corr h-m-corr diff'),
        ('h-h-wtkappa', 'h-m-wtkappa (rounded)', 'h-h-wtkappa h-m-wtkappa diff'),
        ('h-h-exact_agr', 'h-m-exact_agr (rounded)', 'h-h-exact_agr h-m-exact_agr diff'),
        ('h-h-adj_agr', 'h-m-adj_agr (rounded)', 'h-h-adj_agr h-m-adj_agr diff'),
        ('h-h-kappa', 'h-m-kappa (rounded)', 'h-h-kappa h-m-kappa diff'),
    ]

    # The human-human columns only exist if at least one model had a
    # consistency file; fill them with NaN otherwise so that the
    # differences below (and the tables that select these columns)
    # do not fail with a KeyError.
    for human_column, _, _ in comparison_columns:
        if human_column not in df_eval.columns:
            df_eval[human_column] = float('nan')

    # Add additional columns showing difference between scores
    for human_column, machine_column, diff_column in comparison_columns:
        df_eval[diff_column] = df_eval[human_column] - df_eval[machine_column]


#### Descriptive holistic score statistics

The table shows distributional properties of human and system scores. SMD values lower than -0.15 or higher than 0.15 are <span class="highlight_color">highlighted</span>.

In [None]:
# Set the pandas display width option used for the output below.
pd.options.display.width = 10

# SMD values are rendered through `color_highlighter` with bounds of
# -0.15 and 0.15, matching the thresholds quoted in the text above.
formatter = partial(color_highlighter, low=-0.15, high=0.15)

if df_eval.empty:
    display(Markdown("No information available for any of the models"))
else:
    descriptive_columns = ['N', 'system score type', 'h_mean', 'h_sd',
                           'sys_mean', 'sys_sd', 'SMD']
    descriptive_html = df_eval[descriptive_columns].to_html(
        index=True,
        classes=['sortable'],
        escape=False,
        formatters={'SMD': formatter},
        float_format=int_or_float_format_func)
    display(HTML(descriptive_html))


#### Association statistics

The table shows the standard association metrics between human scores and machine scores. Note that some evaluations are based on rounded (`Trim-round`) scores computed by first truncating and then rounding the predicted score.

In [None]:
# Show human-human and human-machine agreement side by side, each pair
# followed by its difference column; fall back to a short notice when
# there are no results to show.
if df_eval.empty:
    display(Markdown("No information available for any of the models"))
else:
    association_columns = [
        'N',
        'system score type',
        'h-h-corr', 'h-m-corr', 'h-h-corr h-m-corr diff',
        'R2',
        'RMSE',
        'h-h-wtkappa', 'h-m-wtkappa (rounded)', 'h-h-wtkappa h-m-wtkappa diff',
        'h-h-kappa', 'h-m-kappa (rounded)', 'h-h-kappa h-m-kappa diff',
        'h-h-exact_agr', 'h-m-exact_agr (rounded)', 'h-h-exact_agr h-m-exact_agr diff',
        'h-h-adj_agr', 'h-m-adj_agr (rounded)', 'h-h-adj_agr h-m-adj_agr diff',
    ]
    association_html = df_eval[association_columns].to_html(
        index=True,
        classes=['sortable'],
        escape=False,
        float_format=int_or_float_format_func)
    display(HTML(association_html))
