In [None]:
!pip install -r ../requirements.txt

In [None]:
import pandas as pd
import numpy as np 
import sklearn
import torch 
import transformers
from tqdm import tqdm
from nlg_eval_via_simi_measures.bary_score import BaryScoreMetric
from nlg_eval_via_simi_measures.depth_score import DepthScoreMetric
from nlg_eval_via_simi_measures.infolm import InfoLM
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate import meteor
from nltk import word_tokenize
from nltk import download

In [None]:
# Fetch the 'punkt' and 'wordnet' NLTK resources.
# Presumably required by word_tokenize and meteor used in later cells - confirm
# against the installed NLTK version.
download('punkt')
download('wordnet')

In [None]:
# Load the annotated stories CSV straight from the GitHub repository.
df = pd.read_csv(
    filepath_or_buffer="https://github.com/dig-team/hanna-benchmark-asg/raw/main/hanna_stories_annotations.csv",
)

In [None]:
# Keep one row per distinct (Human, Story) pair, skipping rows whose Model is "Human".
# A single .loc selection plus an explicit .copy() yields an independent frame, so
# adding metric columns to it later does not raise SettingWithCopyWarning.
df_unique_human_story = (
    df.loc[df["Model"] != "Human", ["Human", "Story"]]
    .drop_duplicates(keep="first")
    .copy()
)
df_unique_human_story

In [None]:
def compute_bary(ref, hypothesis):
    """Return the ``baryscore_W`` value for a single reference/hypothesis pair.

    Both texts are wrapped in one-element lists, a fresh ``BaryScoreMetric`` is
    built, its IDF statistics are prepared on that pair, and the first entry of
    the ``baryscore_W`` result is returned.

    NOTE(review): ``evaluate_batch`` receives ``(ref, hypothesis)`` here, whereas
    compute_depthscore and compute_infolmscore pass ``(hypothesis, ref)`` -
    confirm which order is intended.
    """
    references = [ref]
    candidates = [hypothesis]
    scorer = BaryScoreMetric()
    scorer.prepare_idfs(references, candidates)
    batch_result = scorer.evaluate_batch(references, candidates)
    return batch_result["baryscore_W"][0]

In [None]:
def compute_depthscore(ref, hypothesis):
    """Return the ``depth_score`` value for a single reference/hypothesis pair.

    Args:
        ref: Reference text (a string), or an already-built list of strings.
        hypothesis: Candidate text (a string), or an already-built list of strings.

    Returns:
        The first element of the ``depth_score`` entry produced by
        ``DepthScoreMetric.evaluate_batch``.
    """
    # The sibling helpers (compute_bary, compute_infolmscore) hand the metric
    # one-element lists; do the same here so a bare string is not treated as a
    # batch. Lists are passed through untouched for backward compatibility.
    if isinstance(ref, str):
        ref = [ref]
    if isinstance(hypothesis, str):
        hypothesis = [hypothesis]
    metric_call = DepthScoreMetric()
    metric_call.prepare_idfs(ref, hypothesis)
    return metric_call.evaluate_batch(hypothesis, ref)["depth_score"][0]

In [None]:
def compute_infolmscore(ref, hypothesis):
    """Return the InfoLM ``fisher_rao`` value for one reference/hypothesis pair.

    Each text is wrapped in a one-element list, a new ``InfoLM`` instance prepares
    its IDF statistics on that pair, and the first ``fisher_rao`` entry of the
    batch evaluation is returned.
    """
    references, candidates = [ref], [hypothesis]
    infolm = InfoLM()
    # A CPU-only override (setting the metric's device to "cpu" and moving its
    # model there) used to live here as commented-out code; it remains disabled.
    infolm.prepare_idfs(references, candidates)
    scores = infolm.evaluate_batch(candidates, references)
    return scores["fisher_rao"][0]

In [None]:
def compute_bleuscore(ref, hypothesis):
    """Return the word-level BLEU score of ``hypothesis`` against a single ``ref``.

    Args:
        ref: Reference text as a string, or an already tokenised list of words.
        hypothesis: Candidate text as a string, or an already tokenised list of words.

    Returns:
        The float returned by ``corpus_bleu`` for this one-pair corpus.
    """
    # corpus_bleu expects token sequences. Raw strings would be iterated character
    # by character, and every character of the reference would count as a
    # separate reference, so tokenise first.
    reference_tokens = word_tokenize(ref) if isinstance(ref, str) else list(ref)
    hypothesis_tokens = word_tokenize(hypothesis) if isinstance(hypothesis, str) else list(hypothesis)
    # Nesting: one hypothesis in the corpus, with exactly one reference for it.
    return corpus_bleu([[reference_tokens]], [hypothesis_tokens])

In [None]:
def compute_bertscore(ref, hypothesis):
    """Forward ``hypothesis`` and ``ref`` to ``score`` with ``lang="en"`` and verbose output.

    NOTE(review): ``score`` is not defined or imported in any visible cell, so
    calling this helper raises NameError as the notebook stands. Presumably
    ``bert_score.score`` was intended - confirm and add the import.
    """
    result = score(hypothesis, ref, lang="en", verbose=True)
    return result

In [None]:
import torch
import torch.nn as nn
import traceback
from transformers import BartTokenizer, BartForConditionalGeneration
from typing import List
import numpy as np


class BARTScorer:
    """Score target texts with a BART sequence-to-sequence model.

    A score is the negated, length-normalised negative log-likelihood of the
    target tokens given the source text, so values closer to zero mean the
    target is more likely under the model.
    """

    def __init__(self, device='cuda:0', max_length=1024, checkpoint='facebook/bart-large-cnn'):
        """Load tokenizer and model from ``checkpoint`` and move the model to ``device``.

        Args:
            device: Torch device string the model and the input tensors are moved to.
            max_length: Maximum tokenised length; longer inputs are truncated.
            checkpoint: Name or path handed to ``from_pretrained``.
        """
        # Set up model
        self.device = device
        self.max_length = max_length
        self.tokenizer = BartTokenizer.from_pretrained(checkpoint)
        self.model = BartForConditionalGeneration.from_pretrained(checkpoint)
        self.model.eval()
        self.model.to(device)

        # Set up loss: unreduced per-token NLL so it can be summed per sequence;
        # positions holding the pad token are ignored.
        self.loss_fct = nn.NLLLoss(reduction='none', ignore_index=self.model.config.pad_token_id)
        self.lsm = nn.LogSoftmax(dim=1)

    def load(self, path=None):
        """Load model weights from ``path`` (defaults to ``'models/bart.pth'``).

        NOTE(review): ``torch.load`` unpickles the file; only load checkpoints
        from a trusted source.
        """
        if path is None:
            path = 'models/bart.pth'
        self.model.load_state_dict(torch.load(path, map_location=self.device))

    def score(self, srcs, tgts, batch_size=4):
        """Score each target against the source at the same position.

        Args:
            srcs: List of source texts.
            tgts: List of target texts, aligned with ``srcs``.
            batch_size: Number of pairs sent through the model at once.

        Returns:
            A list with one float per source, in input order.

        Raises:
            RuntimeError: Re-raised after the traceback and the offending batch
                have been printed.
        """
        score_list = []
        for i in range(0, len(srcs), batch_size):
            src_list = srcs[i: i + batch_size]
            tgt_list = tgts[i: i + batch_size]
            try:
                with torch.no_grad():
                    encoded_src = self.tokenizer(
                        src_list,
                        max_length=self.max_length,
                        truncation=True,
                        padding=True,
                        return_tensors='pt'
                    )
                    encoded_tgt = self.tokenizer(
                        tgt_list,
                        max_length=self.max_length,
                        truncation=True,
                        padding=True,
                        return_tensors='pt'
                    )
                    src_tokens = encoded_src['input_ids'].to(self.device)
                    src_mask = encoded_src['attention_mask'].to(self.device)

                    tgt_tokens = encoded_tgt['input_ids'].to(self.device)
                    tgt_mask = encoded_tgt['attention_mask']
                    # Number of attended (non-padding) target tokens per sequence.
                    tgt_len = tgt_mask.sum(dim=1).to(self.device)

                    output = self.model(
                        input_ids=src_tokens,
                        attention_mask=src_mask,
                        labels=tgt_tokens
                    )
                    # Flatten to (tokens, vocab) for the per-token loss, then
                    # restore one row per sequence.
                    logits = output.logits.view(-1, self.model.config.vocab_size)
                    loss = self.loss_fct(self.lsm(logits), tgt_tokens.view(-1))
                    loss = loss.view(tgt_tokens.shape[0], -1)
                    loss = loss.sum(dim=1) / tgt_len
                    curr_score_list = [-x.item() for x in loss]
                    score_list += curr_score_list

            except RuntimeError:
                traceback.print_exc()
                print(f'source: {src_list}')
                print(f'target: {tgt_list}')
                # Propagate the failure instead of calling exit(0), which would
                # terminate the interpreter/kernel with a success status.
                raise
        return score_list

    def multi_ref_score(self, srcs, tgts: List[List[str]], agg="mean", batch_size=4):
        """Score each source against several references and aggregate per source.

        Args:
            srcs: List of source texts.
            tgts: One list of references per source; all lists must have the
                same length.
            agg: ``"mean"`` or ``"max"`` over the references of each source.
            batch_size: Batch size forwarded to ``score``.

        Returns:
            A list with one aggregated score per source; empty when ``tgts`` is empty.

        Raises:
            Exception: If the samples do not all have the same number of references.
            NotImplementedError: If ``agg`` is neither ``"mean"`` nor ``"max"``.
        """
        # Assert we have the same number of references
        ref_nums = [len(x) for x in tgts]
        if len(set(ref_nums)) > 1:
            raise Exception("You have different number of references per test sample.")

        # Reject an unsupported aggregation before running the expensive scoring.
        if agg == "mean":
            aggregate = np.mean
        elif agg == "max":
            aggregate = np.max
        else:
            raise NotImplementedError

        # Nothing to score; also avoids indexing tgts[0] on an empty list.
        if not tgts:
            return []

        ref_num = len(tgts[0])
        score_matrix = []
        for i in range(ref_num):
            # The i-th reference of every sample, scored against all sources.
            curr_tgts = [x[i] for x in tgts]
            scores = self.score(srcs, curr_tgts, batch_size)
            score_matrix.append(scores)
        score_list = aggregate(score_matrix, axis=0)
        return list(score_list)

In [None]:
def compute_bartscore(ref, hypothesis):
    """Score ``hypothesis`` against ``ref`` with a freshly built ``BARTScorer``.

    The scorer is created on ``cuda:0`` from ``facebook/bart-large-cnn`` and the
    per-sample maximum over references is returned (``agg`` may be "mean" or "max").

    NOTE(review): ``multi_ref_score`` is annotated to take a list of reference
    lists, so unlike the other compute_* helpers this one presumably expects
    list inputs rather than single strings - confirm against callers.
    """
    scorer = BARTScorer(device='cuda:0', checkpoint='facebook/bart-large-cnn')
    per_sample_scores = scorer.multi_ref_score(
        hypothesis,
        ref,
        agg="max",
        batch_size=4,
    )
    return per_sample_scores

In [None]:
def compute_meteorscore(ref, hypothesis):
    """Return the METEOR score of ``hypothesis`` against a single ``ref``, rounded to 4 decimals.

    Args:
        ref: Reference text as a string.
        hypothesis: Candidate text as a string.
    """
    reference_tokens = word_tokenize(ref)
    hypothesis_tokens = word_tokenize(hypothesis)
    # The first argument is a collection of tokenised references. Passing the bare
    # token list would make every single word count as its own reference.
    return round(meteor([reference_tokens], hypothesis_tokens), 4)

In [None]:
# One-row sample for quick experiments. The explicit .copy() makes it an independent
# frame so that adding columns to it does not raise SettingWithCopyWarning.
df_mini = df_unique_human_story.head(1).copy()

In [None]:
# Compute BLEU for the one-row sample, using Human as reference and Story as hypothesis.
df_mini['BLEU'] = df_mini[["Human", "Story"]].apply(
    lambda row: compute_bleuscore(row["Human"], row["Story"]),
    axis=1,
)

In [None]:
# Heavy computation: each metric is evaluated row by row on the (Human, Story) columns,
# with Human passed as the reference and Story as the hypothesis.
for column_name, metric_fn in (
    ('baryscore', compute_bary),
    ('depthscore', compute_depthscore),
    ('infolmscore', compute_infolmscore),
    ('BLEU', compute_bleuscore),
):
    df_unique_human_story[column_name] = df_unique_human_story[["Human", "Story"]].apply(
        lambda row: metric_fn(*row), axis=1
    )

In [None]:
# Display the frame, which now also holds the metric columns added in the previous cell.
df_unique_human_story