In [None]:
# Google Colab only: mount Google Drive into the runtime at /content/drive
# (the cell output confirms "Mounted at /content/drive").
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install the third-party packages needed by the metric implementations.
# %pip (instead of !pip) installs into the environment of the running kernel.
# TODO: pin the versions used for the reported results to keep the run reproducible.
%pip install transformers
%pip install pytorch_pretrained_bert
%pip install rouge_score
%pip install sentencepiece

In [None]:
#imports 
import os
import sys
import re
import math
import random
import torch
import ntpath
import itertools
import csv 
import numpy as np
import pandas as pd

# Tokenizer: download NLTK's 'punkt' package (the cell output shows it being
# unpacked into /root/nltk_data). Nothing in the visible cells calls nltk directly;
# presumably the metric packages need it -- TODO confirm.
import nltk
nltk.download('punkt')

# Raise the csv module's maximum field size to sys.maxsize; the call returns the
# previous limit (131072 in the cell output).
# NOTE(review): the data below is read with pandas.read_csv, so verify this is still needed.
csv.field_size_limit(sys.maxsize)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


131072

In [None]:
# Torch device string: the first CUDA GPU when one is available, otherwise the CPU.
# Read as a notebook-level variable by compute_bartscore.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [None]:
def compute_bertscore(summaries, references, language, avg):
  """Score generated summaries against references with IDF-weighted BERTScore.

  :param summaries: generated summaries (the candidates handed to the scorer).
  :param references: reference summaries; also used to compute the IDF weights.
  :param language: language code forwarded to ``BERTScorer`` (the notebook uses 'en' and 'de').
  :param avg: if truthy, return the mean F1 rounded to 5 decimals;
              otherwise return the F1 object produced by the scorer unchanged.
  :return: rounded mean F1 (``avg`` truthy) or the scorer's F1 result.
  """
  # Imported inside the function, so the package is only needed when this metric is requested.
  from metrics.bert_score.scorer import BERTScorer

  scorer = BERTScorer(lang=language, idf=True, nthreads=4)
  scorer.compute_idf(references)
  # The scorer returns (precision, recall, F1); only F1 is reported.
  _, _, f1 = scorer.score(summaries, references, batch_size=4)
  return round(f1.mean().item(), 5) if avg else f1

def compute_moverscore(summaries, references, language, avg):
  """Score generated summaries against references with MoverScore (unigrams).

  :param summaries: generated summaries (hypotheses).
  :param references: reference summaries.
  :param language: "en" selects the "roberta-large" checkpoint, anything else "xlm-roberta-large".
  :param avg: if truthy, return the mean score rounded to 5 decimals; otherwise all scores.
  :return: rounded mean (``avg`` truthy) or the scores returned by ``word_mover_score``.
  """
  # The checkpoint is selected through an environment variable that is set before the import.
  # NOTE(review): presumably moverscore2 reads MOVERSCORE_MODEL when it is first imported, in
  # which case a later call with a different language keeps the first model -- confirm.
  os.environ['MOVERSCORE_MODEL'] = "roberta-large" if language == "en" else "xlm-roberta-large"
  from metrics.moverscore2 import get_idf_dict, word_mover_score

  hyp_idf = get_idf_dict(summaries)
  ref_idf = get_idf_dict(references)
  mover_scores = word_mover_score(
      references, summaries, ref_idf, hyp_idf,
      stop_words=[], n_gram=1, remove_subwords=True)
  if not avg:
    return mover_scores
  return round(np.mean(mover_scores), 5)

def compute_bartscore(summaries, references, language, avg):
  """Score generated summaries against references with BARTScore.

  English uses ``BARTScorer`` with the 'facebook/bart-large-cnn' checkpoint, German uses
  ``MBARTScorer`` with 'facebook/mbart-large-50-many-to-many-mmt'. The model is placed on the
  notebook-level ``device``.

  :param summaries: generated summaries, passed as the first argument of the scorer.
  :param references: reference summaries, passed as the second argument of the scorer.
  :param language: 'en' or 'de'.
  :param avg: if truthy, return the mean score rounded to 4 decimals; otherwise all scores.
  :return: rounded mean (``avg`` truthy) or the scores returned by the scorer.
  :raises ValueError: if ``language`` is neither 'en' nor 'de'. Previously this case fell
                      through and failed later with an UnboundLocalError.
  """
  if language == 'en':
    from metrics.bartscore import BARTScorer
    bart_scorer = BARTScorer(device, checkpoint='facebook/bart-large-cnn', bidirection=False)
  elif language == 'de':
    from metrics.mbartscore import MBARTScorer
    bart_scorer = MBARTScorer(device, checkpoint='facebook/mbart-large-50-many-to-many-mmt',
                              bidirection=False)
  else:
    raise ValueError(
        "Unsupported language for BARTScore: {!r} (expected 'en' or 'de')".format(language))

  scores = bart_scorer.score(summaries, references, batch_size=4)
  return round(np.mean(scores), 4) if avg else scores

def compute_rouge1(summaries, references, avg):
  """Compute the ROUGE-1 F-measure of each generated summary against its reference.

  :param summaries: generated summaries (predictions).
  :param references: reference summaries (targets), paired with ``summaries`` by position.
  :param avg: if truthy, return the mean F-measure rounded to 4 decimals;
              otherwise return the list of per-pair F-measures.
  :return: rounded mean (``avg`` truthy) or a list of floats.
  """
  from rouge_score import rouge_scorer

  # NOTE(review): use_stemmer enables rouge_score's stemmer, which is presumably
  # English-oriented -- confirm it is wanted for non-English summaries.
  scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
  # RougeScorer.score expects (target, prediction). The arguments used to be swapped, which
  # exchanges precision and recall; the F-measure is symmetric, so the scores are unchanged.
  scores = [
      scorer.score(reference, summary)["rouge1"].fmeasure
      for summary, reference in zip(summaries, references)
  ]
  return round(np.mean(scores), 4) if avg else scores

def compute_rougeL(summaries, references, avg):
  """Compute the ROUGE-L F-measure of each generated summary against its reference.

  :param summaries: generated summaries (predictions).
  :param references: reference summaries (targets), paired with ``summaries`` by position.
  :param avg: if truthy, return the mean F-measure rounded to 4 decimals;
              otherwise return the list of per-pair F-measures.
  :return: rounded mean (``avg`` truthy) or a list of floats.
  """
  from rouge_score import rouge_scorer

  # NOTE(review): use_stemmer enables rouge_score's stemmer, which is presumably
  # English-oriented -- confirm it is wanted for non-English summaries.
  scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
  # RougeScorer.score expects (target, prediction). The arguments used to be swapped, which
  # exchanges precision and recall; the F-measure is symmetric, so the scores are unchanged.
  scores = [
      scorer.score(reference, summary)["rougeL"].fmeasure
      for summary, reference in zip(summaries, references)
  ]
  return round(np.mean(scores), 4) if avg else scores

def compute_supert(summaries, sources, avg):
  """Score generated summaries against their source texts with SUPERT (reference-free).

  :param summaries: generated summaries.
  :param sources: source texts, passed alongside ``summaries`` to ``evaluate_batch``.
  :param avg: if truthy, return the mean score rounded to 5 decimals;
              otherwise return the list of per-summary scores, each rounded to 5 decimals.
  :return: rounded mean (``avg`` truthy) or a list of rounded scores.
  """
  from metrics.supert import SupertMetric

  scorer = SupertMetric()
  # Fix: the summaries were referenced through an undefined name (`hyps`).
  # aggregate=False is expected to yield one {'supert': score} entry per summary.
  scores = scorer.evaluate_batch(summaries, sources, aggregate=False)
  supert_scores = [el['supert'] for el in scores]
  if avg:
    return round(np.mean(supert_scores), 5)
  # Fix: np.mean(value, 5) treated 5 as an axis; rounding to 5 decimals was intended.
  return [round(score, 5) for score in supert_scores]

def compute_MENLI(summaries, references, language, avg):
  """Score generated summaries against references with MENLI (NLI combined with BERTScore-F).

  :param summaries: generated summaries (hypotheses).
  :param references: reference summaries.
  :param language: 'en' selects the monolingual setup; any other value the cross-lingual one.
  :param avg: if truthy, return the mean score rounded to 5 decimals; otherwise all scores.
  :return: rounded mean (``avg`` truthy) or the scores returned by ``score_all``.
  """
  from metrics.MENLI.MENLI import MENLI

  # Per the original notes: the English setup uses
  # ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli, the other one xlm-roberta-base.
  is_english = (language == 'en')
  scorer = MENLI(direction='avg', formula='e', src=False, nli_weight=0.3,
                 combine_with='BERTScore-F', cross_lingual=not is_english)
  # English passes no sources; the cross-lingual setup hands the references in as sources too.
  source_side = [] if is_english else references
  menli_scores = scorer.score_all(srcs=source_side, refs=references, hyps=summaries)
  if not avg:
    return menli_scores
  return round(np.mean(menli_scores), 5)

In [None]:
def evaluate(texts, metrics, avg):
  """
  Compute the requested metrics for the summaries in ``texts``.

  :param texts: DataFrame with the texts/summaries used for the calculation and the meta data.
                Required columns: "Phase", "Model-Id", "text-id", "description",
                "reference_summary", "generated_summary" and "text".
  :param metrics: list of metric names to be calculated. Supported: "rouge1", "rougel",
                  "bertscore", "moverscore", "bartscore", "menli", "supert".
  :param avg: True to store the average value of each metric (the same value in every row),
              False to store one score per row.
  :return: DataFrame with the four meta-data columns plus one column per requested metric.
  :raises ValueError: if ``metrics`` contains an unsupported name; this is checked before any
                      metric is computed, so no scoring work is wasted.

  Note: bertscore, moverscore, bartscore and menli read the notebook-level ``language``
  variable, which therefore has to be defined before this function is called.
  """
  reference_summaries = texts["reference_summary"].tolist()
  generated_summaries = texts["generated_summary"].tolist()
  source_texts = texts["text"].tolist()

  # Metric name -> zero-argument callable. The lambdas defer every lookup (scoring functions,
  # ``language``) until the metric is actually requested.
  scorers = {
    "rouge1": lambda: compute_rouge1(generated_summaries, reference_summaries, avg=avg),
    "rougel": lambda: compute_rougeL(generated_summaries, reference_summaries, avg=avg),
    "bertscore": lambda: compute_bertscore(generated_summaries, reference_summaries, language, avg=avg),
    "moverscore": lambda: compute_moverscore(generated_summaries, reference_summaries, language, avg=avg),
    "bartscore": lambda: compute_bartscore(generated_summaries, reference_summaries, language, avg=avg),
    "menli": lambda: compute_MENLI(generated_summaries, reference_summaries, language, avg=avg),
    "supert": lambda: compute_supert(generated_summaries, source_texts, avg=avg),
  }

  unsupported = [metric for metric in metrics if metric not in scorers]
  if unsupported:
    raise ValueError("Unsupported metric: {}".format(", ".join(map(str, unsupported))))

  # Explicit copy: the metric columns must not be written back into (a view of) ``texts``.
  res = texts.loc[:, ["Phase","Model-Id", "text-id", "description"]].copy()
  for metric in metrics:
    res[metric] = scorers[metric]()
  return res

In [None]:
# Input CSVs with the texts and summaries of the two phases (example: the hEN-DE texts).
phase_1_path = "hEN-DE_Phase1_with_id.csv"
phase_2_path = "hEN-DE_Phase2_with_id.csv"
# Output CSV that receives the metric results.
csv_metrics = "de_metrics_res.csv"

In [None]:
# Read the texts and summaries of each phase (';'-separated) and tag every row with its phase.
phase1 = pd.read_csv(phase_1_path, sep=';').assign(Phase="Phase-1")
phase2 = pd.read_csv(phase_2_path, sep=';').assign(Phase="Phase-2")
# Concatenate the texts from the two phases. ignore_index=True gives the combined frame a
# fresh 0..n-1 index instead of repeating each file's own row labels, so label-based
# selection and alignment stay unambiguous.
texts = pd.concat([phase1, phase2], ignore_index=True)

In [None]:
# Show the columns of the combined frame as a quick schema check before scoring.
print(texts.columns)

Index(['text-id', 'model', 'description', 'id', 'text', 'reference_summary',
       'generated_summary', 'Model-Id', 'Phase'],
      dtype='object')


In [None]:
# All metric names supported by evaluate():
#metrics = ["rouge1","rougel","bertscore","bartscore","moverscore","menli","supert"]
metrics = ["rouge1","rougel","bertscore","menli"] # metrics used for this evaluation
language = "de" # text & summary language for the reference-based metrics; read as a notebook-level variable inside evaluate()
avg = False # True: store the average of each metric; False: keep one score per summary

In [None]:
# Compute the selected metrics for all texts; the result holds the meta-data columns
# plus one column per metric.
res = evaluate(texts, metrics, avg)
print(res)
# Save the scoring results to a ';'-separated CSV file (without the index column).
# This file will be used to calculate the correlation with the human ranking.
res.to_csv(csv_metrics, index=False, sep=';')