In [None]:
#imports 
import sys
import re
import math
import random
import torch
import ntpath
import itertools
import csv 
import numpy as np
import pandas as pd

import gdown

# Tokenizer
import nltk
nltk.download('punkt')

csv.field_size_limit(sys.maxsize)

from scipy.stats import spearmanr

In [None]:
# Fetch the Google Drive folder behind `url` with gdown; the result is written
# to the local 'human_evaluation' directory that the CSV paths below point into.
# Requires network access.
url = "https://drive.google.com/drive/folders/1MWvEuXcX9GTBPGgvjHcBRZu3ZnbMUC8Q"
gdown.download_folder(url, output='human_evaluation')

In [41]:
def get_headers(filename):
  """Map each column name in the first row of a ';'-separated CSV file to its position.

  Args:
    filename: Path of the CSV file to inspect.

  Returns:
    dict: header text -> zero-based column index. When the same header text
    occurs more than once, the index of its first occurrence is kept.
    An empty file yields an empty dict.
  """
  # newline='' leaves newline handling to the csv module, as its docs require.
  with open(filename, 'r', newline='') as file:
    reader = csv.reader(file, delimiter=';')
    headers = next(reader, [])  # no header row at all -> empty mapping
  res = {}
  for index, header in enumerate(headers):
    res.setdefault(header, index)  # first occurrence wins, like list.index()
  return res

def Extract(lst, index):
    """Collect the element at position ``index`` from every item of ``lst``.

    Returns a new list with one entry per item, in the original order.
    """
    column = []
    for entry in lst:
        column.append(entry[index])
    return column

def Extract_and_Round(lst, it):
    """Pick element ``it`` from every item of ``lst`` and round it up.

    Each picked value is converted with ``float`` and then passed through
    ``math.ceil``, so the result is a list of integers.
    """
    rounded = []
    for entry in lst:
        as_float = float(entry[it])
        rounded.append(math.ceil(as_float))
    return rounded

def get_average_list(lst):
    """Return the arithmetic mean of ``lst`` rounded to three decimals.

    Args:
        lst: Sequence of numbers.

    Returns:
        The rounded mean, or ``nan`` when ``lst`` is empty (there is nothing
        to average; previously this raised ZeroDivisionError).
    """
    if len(lst) == 0:
        return float("nan")
    return round(sum(lst) / len(lst), 3)

def get_average_lists(lists):
  """Average several equally long score lists position by position.

  Every element is converted with ``float``. At each position only values
  >= 1 count as valid and enter the average; a position without any valid
  value gets 0.

  Args:
    lists: Sequence of score lists; the first list defines how many
      positions are averaged.

  Returns:
    list: One average per position. Empty when ``lists`` (or its first
    list) is empty.
  """
  if not lists or len(lists[0]) == 0:
    return []
  # Convert once up front instead of once per position.
  float_lists = [[float(value) for value in scores] for scores in lists]
  average_list = []
  for i in range(len(float_lists[0])):
    valid = [scores[i] for scores in float_lists if scores[i] >= 1]
    average_list.append(sum(valid) / len(valid) if valid else 0)
  return average_list

def readCSVFile(Path, encoding, withHeader = False):
  """Read a ';'-separated CSV file into a list of rows.

  Args:
    Path: Path of the file to read.
    encoding: Text encoding passed to ``open``.
    withHeader: When False (default) the first row is skipped; when True it
      is returned as the first row.

  Returns:
    list: Each row as a list of strings. An empty file yields an empty list.
  """
  # newline='' leaves newline handling to the csv module, as its docs require.
  with open(Path, 'r', encoding=encoding, newline='') as file:
    reader = csv.reader(file, delimiter=';')
    if not withHeader:
      next(reader, None)  # tolerate a file with no rows at all
    return list(reader)

# Save a CSV file
def saveCSVFile(filename, array):
  """Write ``array`` (an iterable of rows) to ``filename`` as ';'-separated CSV.

  The file is encoded as 'utf-8-sig', i.e. UTF-8 with a leading byte order
  mark. An existing file is overwritten.
  """
  # newline='' stops the text layer from translating the writer's '\r\n'
  # row terminator again, which would produce '\r\r\n' on Windows.
  with open(filename, 'w', encoding='utf-8-sig', newline='') as file:
    write = csv.writer(file, delimiter=';')
    write.writerows(array)

In [42]:
def get_annotations(annotation_files, dimensions, phase = 0):
  """Load the rating columns of every annotation CSV.

  Args:
    annotation_files: Paths of ';'-separated annotation files.
    dimensions: Lower-case column names to keep.
    phase: 0 keeps all rows; 1 or 2 keeps only rows whose 'Phase' column
      equals "Phase-1" / "Phase-2".

  Returns:
    dict: file base name (with ".csv" removed) -> lower-cased column name
    -> list of that column's values.

  Raises:
    ValueError: if ``phase`` is not 0, 1 or 2.
  """
  if phase not in {0, 1, 2}:
    raise ValueError("invalid phase id", phase)

  result = {}
  for path in annotation_files:
    annotator = ntpath.basename(path).replace(".csv","")

    frame = pd.read_csv(path, sep=';')
    if phase > 0:
      frame = frame[frame['Phase'] == "Phase-" + str(phase)]
    records = frame.values.tolist()

    # Column positions come from the raw header row of the same file.
    column_index = get_headers(path)
    result[annotator] = {
      name.lower(): Extract(records, position)
      for name, position in column_index.items()
      if name.lower() in dimensions
    }
  return result


def get_correlations_annotators(annotations, dimensions):
  """Compute the absolute Spearman correlation for every pair of annotators.

  Args:
    annotations: dict annotator -> dimension -> list of scores (anything
      ``float`` accepts).
    dimensions: Dimension names to correlate.

  Returns:
    dict: (annotator_a, annotator_b) -> dimension -> |rho| rounded to three
    decimals. A dimension is left out for a pair when fewer than two texts
    have a valid score from both annotators, or when the correlation is NaN
    (one side is constant).
  """
  correlations = {}
  for pair in itertools.combinations(annotations.keys(), 2):
    correlations[pair] = {}
    for dim in dimensions:
      scores_a = [float(value) for value in annotations[pair[0]][dim]]
      scores_b = [float(value) for value in annotations[pair[1]][dim]]
      # Keep a text only if both annotators gave a valid score. Scores below 1
      # (0 = unannotated) are invalid, the same threshold the averaging
      # helpers in this notebook use; NaN also fails the comparison.
      valid = [(a, b) for a, b in zip(scores_a, scores_b) if a >= 1 and b >= 1]
      if len(valid) < 2:  # a correlation needs at least two points
        continue
      valid_a, valid_b = zip(*valid)
      correlation, _ = spearmanr(list(valid_a), list(valid_b))
      if not math.isnan(correlation):  # NaN when one side has zero variance
        correlations[pair][dim] = round(abs(correlation), 3)
  return correlations

def get_metric_eval(path, metrics, phase = 0): 
  """Load the metric score columns of a ';'-separated results file.

  Args:
    path: Path of the metrics CSV.
    metrics: Lower-case column names to keep.
    phase: 0 keeps all rows; 1 or 2 keeps only rows whose 'Phase' column
      equals "Phase-1" / "Phase-2".

  Returns:
    dict: lower-cased column name -> list of scores converted to float.

  Raises:
    ValueError: if ``phase`` is not 0, 1 or 2.
  """
  if phase not in {0, 1, 2}:
    raise ValueError("invalid phase id", phase)

  frame = pd.read_csv(path, sep=';')
  column_index = get_headers(path)
  if phase > 0:
    frame = frame[frame['Phase'] == "Phase-" + str(phase)]
  records = frame.values.tolist()

  return {
    name.lower(): [float(value) for value in Extract(records, position)]
    for name, position in column_index.items()
    if name.lower() in metrics
  }

def get_correlations_metrics(metrics, average_ranking, dimensions): 
  """Correlate every metric's scores with the averaged human score per dimension.

  Args:
    metrics: dict metric name -> list of scores.
    average_ranking: dict dimension -> list of averaged human scores.
    dimensions: Dimension names to evaluate.

  Returns:
    dict: metric -> dimension -> absolute Spearman correlation, rounded to
    three decimals.
  """
  def _abs_rho(scores, ranking):
    rho, _ = spearmanr(scores, ranking)
    return round(abs(rho), 3)

  return {
    metric: {dim: _abs_rho(metrics[metric], average_ranking[dim]) for dim in dimensions}
    for metric in metrics
  }


def get_average_ranking(annotations, dimensions):
  """Average the annotators' scores text by text for every dimension.

  Args:
    annotations: dict annotator -> dimension -> list of scores.
    dimensions: Dimension names to average.

  Returns:
    dict: dimension -> list produced by ``get_average_lists`` from all
    annotators' score lists for that dimension.
  """
  return {
    dim: get_average_lists([annotations[annotator][dim] for annotator in annotations.keys()])
    for dim in dimensions
  }

def get_average_ranking_per_model(annotation_files, models):  
  """Compute one average human score per model and dimension.

  For each model, the rows with that 'Model-Id' are taken from every
  annotation file, the annotators' scores are averaged text by text with
  ``get_average_lists``, and those averages are reduced to a single number
  with ``get_average_list``.

  NOTE: the dimension names come from the notebook-level ``dimensions``
  list, not from a parameter.

  Args:
    annotation_files: Paths of ';'-separated annotation files.
    models: Values of the 'Model-Id' column to evaluate.

  Returns:
    dict: model -> dimension -> averaged score.
  """
  # Parse every file exactly once; the content does not depend on the model.
  parsed_files = []
  for annotation_file in annotation_files:
    df = pd.read_csv(annotation_file, sep=';')
    headers = get_headers(annotation_file)
    dim_columns = [(header.lower(), index) for header, index in headers.items()
                   if header.lower() in dimensions]
    parsed_files.append((df, dim_columns))

  average_ranking_per_model = {}
  for model in models:
    ratings = {dim: [] for dim in dimensions}  # one score list per annotator
    for df, dim_columns in parsed_files:
      rows = df[df['Model-Id'] == model].values.tolist()
      for dim, index in dim_columns:
        ratings[dim].append(Extract(rows, index))
    average_ranking_per_model[model] = {
      dim: get_average_list(get_average_lists(ratings[dim])) for dim in dimensions
    }
  return average_ranking_per_model

def get_average_metrics_ranking_per_model(path):
  """Average every numeric column of the metrics CSV per 'Model-Id'.

  Args:
    path: Path of the ';'-separated metrics file.

  Returns:
    dict: model id -> column name -> mean value.
  """
  df =  pd.read_csv(path, sep=';')
  # numeric_only=True: since pandas 2.0 mean() raises TypeError on text
  # columns (e.g. 'Phase') instead of silently dropping them.
  grouped_models_avg = df.groupby('Model-Id').mean(numeric_only=True)
  return grouped_models_avg.to_dict('index')
   

def get_avg_per_model_annotator(annotations, dimensions): 
  """Average each annotator's valid scores per model and dimension.

  Only scores >= 1 enter the mean.

  NOTE(review): the ``annotations`` argument is never read; the files are
  taken from the notebook-level ``annotation_files`` list instead. Confirm
  which one is intended.

  Args:
    annotations: Unused (see note).
    dimensions: Column names to average; used as-is to index the frame.

  Returns:
    dict: file base name (with ".csv" removed) -> 'Model-Id' value ->
    dimension -> mean of the valid scores.
  """
  result = {}
  for path in annotation_files:
    annotator = ntpath.basename(path).replace(".csv","")
    by_model = pd.read_csv(path, sep=';').groupby('Model-Id')
    per_model = {}
    for model_id in list(by_model.groups.keys()):
      model_rows = by_model.get_group(model_id)
      per_model[model_id] = {
        dim: model_rows[dim][model_rows[dim]>=1].mean() for dim in dimensions
      }
    result[annotator] = per_model
  return result


def get_avg_per_model(avg_per_model_annotator, dimensions):
  """Average the per-annotator model scores across annotators.

  NaN entries (an annotator without a valid score) are skipped.

  NOTE: the models are taken from the notebook-level ``models`` list, not
  from a parameter.

  Args:
    avg_per_model_annotator: dict annotator -> model -> dimension -> score.
    dimensions: Dimension names to average.

  Returns:
    dict: model -> dimension -> ``get_average_list`` of the non-NaN scores.
  """
  result = {}
  for model in models:
    per_dim = {}
    for dim in dimensions:
      scores = [avg_per_model_annotator[annotator][model][dim]
                for annotator in avg_per_model_annotator]
      valid_scores = [score for score in scores if not math.isnan(score)]
      per_dim[dim] = get_average_list(valid_scores)
    result[model] = per_dim
  return result


def get_average_system_correlation(annotation_files,csv_metrics, dimensions, metrics):
  """Correlate per-model metric averages with per-model human averages.

  NOTE: the models are taken from the notebook-level ``models`` list.

  Args:
    annotation_files: Paths of the annotation CSV files.
    csv_metrics: Path of the metrics CSV file.
    dimensions: Human evaluation dimensions to correlate against.
    metrics: Metric column names to correlate.

  Returns:
    dict: metric -> dimension -> absolute Spearman correlation, rounded to
    three decimals.
  """
  # Sort both per-model tables by model id so their entries line up.
  heval_by_model = dict(sorted(get_average_ranking_per_model(annotation_files, models).items()))
  metrics_by_model = dict(sorted(get_average_metrics_ranking_per_model(csv_metrics).items()))
  # Both tables must describe exactly the same models.
  assert(list(heval_by_model.keys()) == list(metrics_by_model.keys()))

  heval_scores = {
    dim: [scores[dim] for scores in heval_by_model.values() if dim in scores]
    for dim in dimensions
  }
  metric_scores = {
    metric: [scores[metric] for scores in metrics_by_model.values() if metric in scores]
    for metric in metrics
  }

  system_correlation = {}
  for metric in metrics:
    per_dim = {}
    for dim in dimensions:
      rho, _ = spearmanr(metric_scores[metric], heval_scores[dim])
      per_dim[dim] = round(abs(rho), 3)
    system_correlation[metric] = per_dim
  return system_correlation


In [43]:
# Human evaluation: five annotation CSV files, all located inside the
# 'human_evaluation' directory.
csv_1_en = "human_evaluation/hDE-EN/en_heval_1.csv"
csv_2_en = "human_evaluation/hDE-EN/en_heval_2.csv"
csv_3_en = "human_evaluation/hDE-EN/en_heval_3.csv"
csv_4_en = "human_evaluation/hDE-EN/en_heval_4.csv"
csv_5_en = "human_evaluation/hDE-EN/en_heval_5.csv"
# Metric evaluation: a single CSV holding the automatic metric scores.
csv_metrics = "human_evaluation/hDE-EN/en_metrics_res.csv"


In [44]:
# Notebook-wide configuration. Several functions above read `annotation_files`,
# `dimensions` and `models` as globals, so this cell must run before they are called.
annotation_files = [csv_1_en, csv_2_en, csv_3_en, csv_4_en, csv_5_en]
# Lower-case names of the human rating columns; CSV headers are lower-cased before matching.
dimensions =  ["coherence", "consistency", "fluency", "relevance"]
# Lower-case names of the automatic metric columns in the metrics CSV.
metrics = ["rouge1","rougel","bertscore","bartscore","moverscore","menli","supert"]

# Values of the 'Model-Id' column used to select rows per model.
models = ["1", "2", "3", "6", "7", "101", "102", "B1"]

In [None]:
# Full report for one phase (0: all rows, 1: phase 1, 2: phase 2).
phase  = 0
print("Phase:", phase)
annotations = get_annotations(annotation_files,dimensions,phase)

# Sanity check before any statistics: every annotator must have scored the
# same number of texts, otherwise pairs would be silently misaligned.
annotator_pairs = list(itertools.combinations(annotations.keys(), 2))
for pair in annotator_pairs: 
  for dim in dimensions: 
    assert len(annotations[pair[0]][dim]) ==  len(annotations[pair[1]][dim])

#annotation correlations
correlations = get_correlations_annotators(annotations, dimensions)
#metric correlations
average_heval_ranking = get_average_ranking(annotations, dimensions)
metrics_eval = get_metric_eval(csv_metrics,metrics,phase)  

# Sanity check: human averages and metric scores must cover the same texts
# before they are correlated.
for dim in dimensions: 
  for metric in metrics_eval:
    assert len(average_heval_ranking[dim]) == len(metrics_eval[metric])

metrics_correlations = get_correlations_metrics(metrics_eval, average_heval_ranking, dimensions)
# The system-level correlation takes no phase argument, so it always uses all rows.
average_system_correlation = get_average_system_correlation(annotation_files,csv_metrics, dimensions, metrics)
print("-------------------------------------")
print("Correlation between annotators")
print("-------------------------------------")
for dim in dimensions: 
  values = []
  for outer_key, outer_value in correlations.items():
    if dim in outer_value:
      values.append(outer_value[dim])
  # Mean over all annotator pairs; nan when no pair produced a correlation.
  mean_correlation = round(sum(values)/len(values),3) if values else float("nan")
  print(dim, mean_correlation)
print("-------------------------------------")
print("Average ranking (gold standard)")
print("-------------------------------------")
for  key in average_heval_ranking: 
  print(key, round(get_average_list(average_heval_ranking[key]),2))

print("-------------------------------------")
print("Average metrics ranking")
print("-------------------------------------")
for  key in metrics_eval: 
  print(key, get_average_list(metrics_eval[key]))
print("-------------------------------------")
print("Segment-level Correlation metrics ranking vs. human eval ranking")
print("-------------------------------------")
for outer_key, outer_value in metrics_correlations.items():
  for dim in dimensions: 
    if dim in outer_value:
        print(outer_key, dim,":", outer_value[dim])
  print("------")
print("-------------------------------------")
print("System-level Correlation metrics ranking vs. human eval ranking")
print("-------------------------------------")
for outer_key, outer_value in average_system_correlation.items():
  for dim in dimensions: 
    if dim in outer_value:
        print(outer_key, dim,":", outer_value[dim])
  print("------")


In [None]:
phase = 0 # 0 : all, 1: phase 1, 2: phase 2

# Use the same phase for the human scores and the metric scores; otherwise
# the two sides cover different rows as soon as phase != 0.
annotations = get_annotations(annotation_files, dimensions, phase)
#annotation correlations
correlations = get_correlations_annotators(annotations, dimensions)
#metric correlations
average_ranking = get_average_ranking(annotations, dimensions)
metrics_eval =    get_metric_eval(csv_metrics,metrics,phase)

print(average_ranking)
# Segment-level |Spearman| per metric and dimension, via the shared helper
# instead of a hand-copied loop.
metrics_correlations = get_correlations_metrics(metrics_eval, average_ranking, dimensions)

In [None]:
# Step-by-step version of get_average_system_correlation that also prints the
# intermediate per-model score lists. Note: this rebinds `average_heval_ranking`.
# Sort both per-model tables by model id so their entries line up.
average_heval_ranking_per_model = dict(sorted(get_average_ranking_per_model(annotation_files, models).items()))
avg_metrics_ranking_per_model = dict(sorted(get_average_metrics_ranking_per_model(csv_metrics).items()))
# Both tables must describe exactly the same models.
assert(list(average_heval_ranking_per_model.keys()) == list(avg_metrics_ranking_per_model.keys()))

# One human score per model, in model order, for every dimension.
average_heval_ranking = {
  dim: [scores[dim] for scores in average_heval_ranking_per_model.values() if dim in scores]
  for dim in dimensions
}
print(average_heval_ranking)

# One metric score per model, in model order, for every metric.
avg_metrics_ranking = {
  metric: [scores[metric] for scores in avg_metrics_ranking_per_model.values() if metric in scores]
  for metric in metrics
}
print(avg_metrics_ranking)

# Absolute Spearman correlation between the two per-model score lists.
average_system_correlation = {}
for metric in metrics: 
  per_dim = {}
  for dim in dimensions:
    rho, _ = spearmanr(avg_metrics_ranking[metric], average_heval_ranking[dim])
    per_dim[dim] = round(abs(rho),3)
  average_system_correlation[metric] = per_dim
