# Hanna evaluation notebook

## 1 - Installation and dependencies

In [None]:
!pip install -r ../requirements.txt

In [None]:
import pandas as pd
import numpy as np 
import sklearn
import torch 
import transformers
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
from nlg_eval_via_simi_measures.bary_score import BaryScoreMetric
from nlg_eval_via_simi_measures.depth_score import DepthScoreMetric
from nlg_eval_via_simi_measures.infolm import InfoLM
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate import meteor
from nltk import word_tokenize
from nltk import download
from bert_score import score
from rouge import Rouge 

In [None]:
# Fetch the NLTK resources used by this notebook (presumably needed by
# `word_tokenize` and `meteor` below).
# NOTE(review): recent NLTK releases may additionally require 'punkt_tab' for
# word_tokenize - confirm against the version pinned in requirements.txt.
download('punkt')
download('wordnet')

## 2 - Score and correlation function definitions

In [None]:
def compute_bary(ref:str,hypothesis:str):
    """Return the "baryscore_W" value for a single reference/hypothesis pair.

    Both strings are wrapped in one-element lists because the metric is called
    with batches; the first (and only) entry of the result is returned.
    A new BaryScoreMetric is created on every call.
    """
    references = [ref]
    hypotheses = [hypothesis]
    bary_metric = BaryScoreMetric()
    bary_metric.prepare_idfs(references, hypotheses)
    batch_scores = bary_metric.evaluate_batch(references, hypotheses)
    return batch_scores["baryscore_W"][0]

In [None]:
def compute_depthscore(ref:str,hypothesis:str):
    """Return the "depth_score" value for a single reference/hypothesis pair.

    The strings are wrapped in one-element lists, as the other metric wrappers
    in this notebook do, so the metric receives a batch rather than a bare
    string. A new DepthScoreMetric is created on every call.

    NOTE(review): `prepare_idfs` is called with (ref, hypothesis) while
    `evaluate_batch` is called with (hypothesis, ref); the original argument
    order is kept - confirm it against the library documentation.
    """
    ref,hypothesis=[ref],[hypothesis]
    metric_call = DepthScoreMetric()
    metric_call.prepare_idfs(ref, hypothesis)
    return metric_call.evaluate_batch(hypothesis,ref)["depth_score"][0]

In [None]:
def compute_infolmscore(ref:str,hypothesis:str):
    """Return the InfoLM "fisher_rao" value for a single reference/hypothesis pair.

    Inputs are wrapped in one-element lists and the first entry of the batch
    result is returned. A new InfoLM instance is created on every call.
    """
    references = [ref]
    hypotheses = [hypothesis]
    infolm_metric = InfoLM()
    # CPU execution used to be forced here by setting `device` to "cpu" and
    # moving the model to "cpu"; that override is currently disabled.
    infolm_metric.prepare_idfs(references, hypotheses)
    batch_scores = infolm_metric.evaluate_batch(hypotheses, references)
    return batch_scores["fisher_rao"][0]

In [None]:
def compute_bleuscore(ref:str,hypothesis:str):
    """Return the sentence-level BLEU of `hypothesis` against the single reference `ref`.

    Both texts are tokenised with `word_tokenize`; the reference tokens are
    wrapped in a list because `sentence_bleu` takes a list of references.
    """
    reference_tokens = word_tokenize(ref)
    hypothesis_tokens = word_tokenize(hypothesis)
    return sentence_bleu([reference_tokens], hypothesis_tokens)

In [None]:
def compute_bertscore(ref:str,hypothesis:str):
    """Return BERTScore (precision, recall, F1) as Python floats for one pair.

    `score` is called with one-element candidate and reference lists, so each
    returned tensor holds a single value that is extracted with `.item()`.
    """
    candidates = [hypothesis]
    references = [ref]
    precision, recall, f1 = score(candidates, references, lang="en", verbose=True)
    return precision.item(), recall.item(), f1.item()

In [None]:
import torch
import torch.nn as nn
import traceback
from transformers import BartTokenizer, BartForConditionalGeneration
from typing import List
import numpy as np


class BARTScorer:
    """Score (source, target) text pairs with a BART sequence-to-sequence model.

    The score of a pair is the negated, length-normalised negative
    log-likelihood of the target tokens given the source text, so a higher
    value (closer to zero) means the target is more likely under the model.
    """

    def __init__(self, device='cuda:0', max_length=1024, checkpoint='facebook/bart-large-cnn'):
        """Load tokenizer and model from `checkpoint` and move the model to `device`.

        Args:
            device: torch device string the model and input tensors are moved to.
            max_length: maximum token length; longer inputs are truncated.
            checkpoint: name or path passed to `from_pretrained`.
        """
        # Set up model
        self.device = device
        self.max_length = max_length
        self.tokenizer = BartTokenizer.from_pretrained(checkpoint)
        self.model = BartForConditionalGeneration.from_pretrained(checkpoint)
        self.model.eval()
        self.model.to(device)

        # Set up loss: per-token NLL with padding positions contributing zero.
        self.loss_fct = nn.NLLLoss(reduction='none', ignore_index=self.model.config.pad_token_id)
        self.lsm = nn.LogSoftmax(dim=1)

    def load(self, path=None):
        """Load model weights from `path` (defaults to 'models/bart.pth')."""
        if path is None:
            path = 'models/bart.pth'
        self.model.load_state_dict(torch.load(path, map_location=self.device))

    def score(self, srcs, tgts, batch_size=4):
        """Score each (source, target) pair, processing `batch_size` pairs at a time.

        Args:
            srcs: list of source strings fed to the encoder.
            tgts: list of target strings, aligned with `srcs`.
            batch_size: number of pairs per forward pass.

        Returns:
            A list with one float per pair: minus the summed token NLL of the
            target divided by the number of non-padding target tokens.

        Raises:
            RuntimeError: re-raised after printing the traceback and the
                offending batch. (Previously the handler called `exit(0)`,
                which terminated the process with a success status.)
        """
        score_list = []
        for i in range(0, len(srcs), batch_size):
            src_list = srcs[i: i + batch_size]
            tgt_list = tgts[i: i + batch_size]
            try:
                with torch.no_grad():
                    encoded_src = self.tokenizer(
                        src_list,
                        max_length=self.max_length,
                        truncation=True,
                        padding=True,
                        return_tensors='pt'
                    )
                    encoded_tgt = self.tokenizer(
                        tgt_list,
                        max_length=self.max_length,
                        truncation=True,
                        padding=True,
                        return_tensors='pt'
                    )
                    src_tokens = encoded_src['input_ids'].to(self.device)
                    src_mask = encoded_src['attention_mask'].to(self.device)

                    tgt_tokens = encoded_tgt['input_ids'].to(self.device)
                    tgt_mask = encoded_tgt['attention_mask']
                    # Number of real (non-padding) target tokens per example.
                    tgt_len = tgt_mask.sum(dim=1).to(self.device)

                    output = self.model(
                        input_ids=src_tokens,
                        attention_mask=src_mask,
                        labels=tgt_tokens
                    )
                    # Flatten to (batch * seq_len, vocab) so the loss is per token.
                    logits = output.logits.view(-1, self.model.config.vocab_size)
                    loss = self.loss_fct(self.lsm(logits), tgt_tokens.view(-1))
                    loss = loss.view(tgt_tokens.shape[0], -1)
                    loss = loss.sum(dim=1) / tgt_len
                    curr_score_list = [-x.item() for x in loss]
                    score_list += curr_score_list

            except RuntimeError:
                # Show which batch failed, then propagate the error to the caller
                # instead of exiting the interpreter / notebook kernel.
                traceback.print_exc()
                print(f'source: {src_list}')
                print(f'target: {tgt_list}')
                raise
        return score_list

    def multi_ref_score(self, srcs, tgts: List[List[str]], agg="mean", batch_size=4):
        """Score each source against several references and aggregate per source.

        Args:
            srcs: list of source strings.
            tgts: one list of reference strings per source; every inner list
                must have the same length.
            agg: "mean" or "max" over the references of each source.
            batch_size: forwarded to `score`.

        Returns:
            A list with one aggregated score per source (empty if `tgts` is empty).

        Raises:
            ValueError: if the inner lists of `tgts` differ in length.
            NotImplementedError: if `agg` is neither "mean" nor "max".
        """
        # Nothing to score; avoids an IndexError on tgts[0] below.
        if len(tgts) == 0:
            return []

        # Assert we have the same number of references
        ref_nums = [len(x) for x in tgts]
        if len(set(ref_nums)) > 1:
            # ValueError is a subclass of Exception, so existing handlers still match.
            raise ValueError("You have different number of references per test sample.")

        ref_num = len(tgts[0])
        score_matrix = []
        for i in range(ref_num):
            # i-th reference of every source, scored in one pass.
            curr_tgts = [x[i] for x in tgts]
            scores = self.score(srcs, curr_tgts, batch_size)
            score_matrix.append(scores)
        if agg == "mean":
            score_list = np.mean(score_matrix, axis=0)
        elif agg == "max":
            score_list = np.max(score_matrix, axis=0)
        else:
            raise NotImplementedError
        return list(score_list)

In [None]:
# Scorers are cached per (device, checkpoint) so the BART weights are loaded
# once instead of once per scored row.
_BART_SCORER_CACHE = {}


def compute_bartscore(ref:str,hypothesis:str):
    """Return the BARTScore of `ref` given `hypothesis` for a single pair.

    The hypothesis is used as the source and the reference as the target of
    `BARTScorer.multi_ref_score`. That method expects one *list of references*
    per source, so the single reference is passed as `[[ref]]`. Passing
    `[ref]` would make it index the reference string character by character
    and score the hypothesis against individual characters.

    The model runs on 'cuda:0' when CUDA is available, otherwise on 'cpu'.
    """
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    checkpoint = 'facebook/bart-large-cnn'
    cache_key = (device, checkpoint)
    if cache_key not in _BART_SCORER_CACHE:
        _BART_SCORER_CACHE[cache_key] = BARTScorer(device=device, checkpoint=checkpoint)
    bart_scorer = _BART_SCORER_CACHE[cache_key]
    # agg means aggregation over references, can be mean or max
    return bart_scorer.multi_ref_score([hypothesis], [[ref]], agg="max", batch_size=4)[0]

In [None]:
def compute_meteorscore(ref:str,hypothesis:str):
    """Return the METEOR score of `hypothesis` against `ref`, rounded to 4 decimals.

    Both texts are tokenised with `word_tokenize`; the reference tokens are
    wrapped in a list because `meteor` is given a list of references.
    """
    references = [word_tokenize(ref)]
    candidate = word_tokenize(hypothesis)
    raw_score = meteor(references, candidate)
    return round(raw_score, 4)

In [None]:
def compute_rougescore(ref:str,hypothesis:str):
    """Return the ROUGE-1 (recall, precision, f-score) tuple for one pair.

    The values are read from the 'r', 'p' and 'f' entries of the 'rouge-1'
    result, in that order. They are read by key instead of going through a
    module-level helper named `id`, which shadowed the builtin `id()` for the
    rest of the notebook.
    """
    rouge = Rouge()
    rouge_1 = rouge.get_scores([hypothesis], [ref])[0]['rouge-1']
    return rouge_1['r'], rouge_1['p'], rouge_1['f']


In [None]:
def compute_two_corr(dataframe:pd.DataFrame,column_name_1:str, column_name_2:str,correlation_type:str,system:str):
    """Return the 2x2 correlation matrix of two columns for one system.

    Only rows whose `Model` equals `system` are used; `correlation_type` is
    passed to `DataFrame.corr` as `method`, with `numeric_only=True`.
    """
    is_system = dataframe["Model"] == system
    pair = dataframe.loc[is_system, [column_name_1, column_name_2]]
    return pair.corr(method=correlation_type, numeric_only=True)

def compute_list_corr(dataframe:pd.DataFrame,list_column_name:list[str],correlation_type:str,system:str=None):
    """Return the correlation matrix of the columns in `list_column_name`.

    When `system` is truthy only rows whose `Model` equals it are used;
    otherwise every row is used. `correlation_type` is passed to
    `DataFrame.corr` as `method`.
    """
    rows = dataframe[dataframe.Model==system] if system else dataframe
    return rows[list_column_name].corr(method=correlation_type)

## 3 - Download and data wrangling of the HANNA dataset

In [None]:
# Download the HANNA annotation table.
df = pd.read_csv("https://github.com/dig-team/hanna-benchmark-asg/raw/main/hanna_stories_annotations.csv")

# One row per story that was not written by the "Human" model.
# Explicit copy: score columns are assigned onto this frame further down, which
# would otherwise trigger SettingWithCopyWarning on a filtered selection.
df_unique_human_story = (
    df.loc[df.Model != "Human", ["Story ID", "Human", "Story", "Model"]]
    .drop_duplicates(keep="first")
    .copy()
)

# Number the distinct values of the "Human" column 0..n-1 (column "human_story_index").
df_human_index=df.Human.drop_duplicates(keep="first").reset_index(drop=True).reset_index().rename(columns={"index":"human_story_index"})

# Mean of the numeric columns per story. numeric_only=True is required because
# the frame also holds text columns, on which mean() raises in pandas >= 2.0.
df_mean_human_metrics=df.groupby("Story ID").mean(numeric_only=True)

## 4 - Compute scores

In [None]:
# Heavy computation: every automatic metric is evaluated row by row on the
# (Human, Story) pair and stored in its own column(s), in the order listed.
story_pairs = df_unique_human_story[["Human", "Story"]]
score_steps = [
    ('baryscore', compute_bary),
    ('depthscore', compute_depthscore),
    ('infolmscore', compute_infolmscore),
    ('BLEU', compute_bleuscore),
    (['ROUGE_r','ROUGE_p','ROUGE_f'], compute_rougescore),
    ('meteorscore', compute_meteorscore),
    ('bartscore', compute_bartscore),
    (['bertscore_p','bertscore_r','bertscore_f1'], compute_bertscore),
]
for target_columns, scorer in score_steps:
    if isinstance(target_columns, list):
        # Tuple-returning scorers are expanded into one column per element.
        df_unique_human_story[target_columns] = story_pairs.apply(lambda row: scorer(*row), axis=1, result_type="expand")
    else:
        df_unique_human_story[target_columns] = story_pairs.apply(lambda row: scorer(*row), axis=1)

In [None]:
# Attach the automatic scores to the per-story mean human ratings (left join on
# "Story ID"), then add "human_story_index" through a join on "Human".
df_unique_human_story_all = (
    df_mean_human_metrics
    .merge(df_unique_human_story, how="left", on="Story ID")
    .merge(df_human_index, on="Human")
)

In [None]:
# Write the merged ratings + scores table to a Parquet file in the working directory.
# NOTE(review): nothing in this notebook reads the file back; presumably it is
# meant as a cache for the heavy scoring step - confirm.
df_unique_human_story_all.to_parquet("hanna_scores_computed.parquet")

## 5 - Compute correlations

In [None]:
# Human annotation criteria.
LIST_HUMAN_METRICS = [
    'Relevance',
    'Coherence',
    'Empathy',
    'Surprise',
    'Engagement',
    'Complexity',
]
# Automatic evaluation metric columns produced by the scoring step.
LIST_AEM = [
    'baryscore',
    'depthscore',
    'infolmscore',
    'BLEU',
    'ROUGE_r',
    'ROUGE_p',
    'ROUGE_f',
    'meteorscore',
    'bartscore',
    'bertscore_p',
    'bertscore_r',
    'bertscore_f1',
]
# Human criteria first, then automatic metrics.
LIST_ALL_METRICS = [*LIST_HUMAN_METRICS, *LIST_AEM]
# Correlation methods evaluated in the cells below.
CORR_METHODS = ["pearson", "kendall", "spearman"]

In [None]:
# Compute text-level correlation:
# for every correlation method, correlate all metrics within the group of rows
# sharing one human_story_index, then average the per-group matrices.
for corr_method in CORR_METHODS:
    print(corr_method)

    # Human ratings and automatic scores are already in df_unique_human_story_all.
    df_work = df_unique_human_story_all.copy().drop(
        columns=["Human", "Story", "Work time in seconds", "Story ID"]
    )

    # One correlation matrix per human_story_index value.
    per_story_corr = [
        compute_list_corr(df_work[df_work["human_story_index"] == story_index], LIST_ALL_METRICS, corr_method)
        for story_index in df_work.human_story_index.unique()
    ]

    # Element-wise mean of the matrices, rows restored to the metric order.
    mean_corr = pd.concat(per_story_corr).groupby(level=0).mean().reindex(LIST_ALL_METRICS)
    sns.heatmap(mean_corr, annot=True)
    plt.show()

In [None]:
# Compute system level correlation:
# average every metric per system (Model) first, then correlate those means.
df_work = df_unique_human_story_all.drop(columns=["Human","Story","Work time in seconds","Story ID"])

# One row per system. The aggregated frame is kept and used below; previously
# the groupby result was discarded and the correlation ran on the
# un-aggregated story-level rows.
df_system_mean = df_work.groupby("Model")[LIST_ALL_METRICS].mean()

for corr_method in CORR_METHODS:
    print(corr_method)
    sns.heatmap(compute_list_corr(df_system_mean,LIST_ALL_METRICS,corr_method),annot=True)
    plt.show()


# Old

In [None]:
# 5-row sample for quick experiments. Explicit copy because columns are assigned
# onto it in the next cell (avoids SettingWithCopyWarning on a slice).
df_mini=df_unique_human_story.head(5).copy()

In [None]:
# Score the 5-row sample with every automatic metric, in the order listed.
mini_pairs = df_mini[["Human", "Story"]]
mini_steps = [
    ('baryscore', compute_bary),
    ('depthscore', compute_depthscore),
    ('infolmscore', compute_infolmscore),
    ('BLEU', compute_bleuscore),
    (['ROUGE_r','ROUGE_p','ROUGE_f'], compute_rougescore),
    ('meteorscore', compute_meteorscore),
    ('bartscore', compute_bartscore),
    (['bertscore_p','bertscore_r','bertscore_f1'], compute_bertscore),
]
for mini_target, mini_scorer in mini_steps:
    if isinstance(mini_target, list):
        # Tuple-returning scorers are expanded into one column per element.
        df_mini[mini_target] = mini_pairs.apply(lambda row: mini_scorer(*row), axis=1, result_type="expand")
    else:
        df_mini[mini_target] = mini_pairs.apply(lambda row: mini_scorer(*row), axis=1)

In [None]:
# Display the 5-row sample together with its score columns.
df_mini

In [None]:
# Distinct (Human, Story) pairs of the rows whose Model is "Human".
df_unique_human_only=df[df.Model=="Human"][["Human","Story"]].drop_duplicates(keep="first")
# 5-row sample. Explicit copy because columns are assigned onto it in the next
# cell (avoids SettingWithCopyWarning on a slice).
df_mini_human=df_unique_human_only.head(5).copy()

In [None]:
# Score the 5-row "Human" sample with every automatic metric, in the order listed.
human_pairs = df_mini_human[["Human", "Story"]]
human_steps = [
    ('baryscore', compute_bary),
    ('depthscore', compute_depthscore),
    ('infolmscore', compute_infolmscore),
    ('BLEU', compute_bleuscore),
    (['ROUGE_r','ROUGE_p','ROUGE_f'], compute_rougescore),
    ('meteorscore', compute_meteorscore),
    ('bartscore', compute_bartscore),
    (['bertscore_p','bertscore_r','bertscore_f1'], compute_bertscore),
]
for human_target, human_scorer in human_steps:
    if isinstance(human_target, list):
        # Tuple-returning scorers are expanded into one column per element.
        df_mini_human[human_target] = human_pairs.apply(lambda row: human_scorer(*row), axis=1, result_type="expand")
    else:
        df_mini_human[human_target] = human_pairs.apply(lambda row: human_scorer(*row), axis=1)

In [None]:
# Display the 5-row "Human" sample together with its score columns.
df_mini_human

In [None]:
# Spearman correlation heatmap of all metrics, restricted to rows whose Model is 'Human'.
# Supported correlation methods: 'pearson', 'kendall', 'spearman'.
# NOTE(review): df_unique_human_story_all is built from stories filtered with
# Model != "Human", so this filter may select no rows - confirm.
sns.heatmap(compute_list_corr(df_unique_human_story_all,LIST_ALL_METRICS,'spearman','Human'),annot=True)

In [None]:
# Spearman correlation between 'baryscore' and 'Relevance' for rows whose Model is 'Human'.
# NOTE(review): df_unique_human_story_all is built from stories filtered with
# Model != "Human", so this filter may select no rows - confirm.
compute_two_corr(df_unique_human_story_all,'baryscore','Relevance','spearman','Human')