In [None]:
%%capture
#install dependencies (April 14, 2023)
!pip install sacrebleu==2.3.1 
!pip install evaluate==0.3.0 
!pip install rouge_score==0.1.2 
!pip install sacremoses==0.0.53 
!pip install textstat==0.7.3

In [None]:
# Candidate system output files. Only one `systems` assignment is active at a
# time; the alternative experiment sets are kept below as comments.

#20min da
#systems = ['20min-dropout-0_0-0_1-most-similar.txt', '20min-dropout-0_1-0_1-most-similar.txt', '20min-dropout-0_3-0_1-most-similar.txt', '20min-dropout-0_8-0_1-most-similar.txt', '20min-backtranslation-0_0-0_1-most-similar.txt', '20min-simple_noise-0_0-0_1-most-similar.txt', '20min-bart_noise-0_0-0_1-most-similar.txt', '20min-bt_noise-0_0-0_1-most-similar.txt','20min-english-0_0-0_1-most-similar.txt']

#Kurier ft decoder
#systems = ['Kurier-mbart-0_1-0_1-most-similar.txt','Kurier-ft-decoder-mbart-0_1-0_1-most-similar.txt', 'Kurier-ft-decoder-gaussian-noise-mbart-0_1-0_1-most-similar.txt', 'Kurier-ft-decoder-bart-noise-mbart-0_1-0_1-most-similar.txt']

# Active set: 20min retrained cross attentions.
# The first entry is the baseline that every other system is compared against.
systems = [
    '20min-regular-mbart-0_1-0_1-most-similar.txt',
    '20min-random-mbart-0_1-0_1-most-similar.txt',
    '20min-retrained-mbart-0_1-0_1-most-similar.txt',
    '20min-distilled[-1]-mbart-0_1-0_1-most-similar.txt',
    '20min-distilled[r,r,-1]-mbart-0_1-0_1-most-similar.txt',
]

# Hints

This notebook is primarily intended to be run on Google Colab.  
You need to place your *refs.txt* and *sources.txt* files for the corresponding translations in this notebook's directory.  
Place the translation candidates you want to test in the same directory and reference them in the *systems* array in the cell above.  
The *systems* array's first item is the baseline system. All significance values are calculated pairwise against this system.  

When the above steps have been performed, all cells can be executed consecutively. The results are then presented in the penultimate cell in the form of a python dictionary.

# Set up custom metrics

In [None]:
import math
import logging
from importlib import import_module
from typing import List, Sequence, Optional, Dict, Any

from sacrebleu.utils import my_log, sum_of_lists

from sacrebleu.metrics.base import Score, Signature, Metric
from sacrebleu.metrics.helpers import extract_all_word_ngrams

import textstat

class FRESignature(Signature):
    """Reproducibility signature used by the FRE metric.

    Adds nothing to the base `Signature`; it only gives the FRE metric a
    signature type of its own.

    :param args: key-value dictionary handed over by the metric instance.
    """
    def __init__(self, args: dict):
        """Pass `args` through to the base `Signature` unchanged."""
        super().__init__(args)


class FREScore(Score):
    """Score container for the FRE metric, always named ``'FRE'``.

    :param score: The numeric FRE value to store.
    """
    def __init__(self, score: float):
        """Create the score under the fixed metric name ``'FRE'``."""
        super().__init__('FRE', score)


class FRE(Metric):
    """Flesch Reading Ease (FRE) of the hypotheses, averaged over segments.

    The value of each segment is `textstat.flesch_reading_ease(hypothesis)`
    clamped at a lower bound of 0. References are accepted so the class fits
    the sacrebleu `Metric` interface, but they never enter the score.
    """

    _SIGNATURE_TYPE = FRESignature

    def __init__(self, references: Optional[Sequence[Sequence[str]]] = None):
        """`FRE` initializer.

        :param references: Optional reference streams to cache up front.
        """
        super().__init__()

        # NOTE(review): set_lang is called on the textstat module itself, so it
        # presumably affects every later textstat call in this kernel — confirm.
        textstat.set_lang("de")

        if references is not None:
            # Cache the references once so they need not be passed on each call.
            self._ref_cache = self._cache_references(references)

    @staticmethod
    def _clamped_fre(text: str) -> float:
        """Return the Flesch Reading Ease of `text`, with negatives raised to 0."""
        return max(0, textstat.flesch_reading_ease(text))

    def _preprocess_segment(self, sent: str) -> str:
        """Return the segment unchanged; FRE works on the raw text.

        :param sent: The input sentence string.
        :return: The same string.
        """
        return sent

    def _compute_score_from_stats(self, stats: List[float]) -> FREScore:
        """Turn aggregated statistics into the corpus-level score.

        :param stats: `[sum of segment FRE values, number of segments]`.
        :return: A `FREScore` holding the mean segment FRE.
        """
        return FREScore(stats[0] / stats[1])

    def _aggregate_and_compute(self, stats: List[List[float]]) -> FREScore:
        """Sum the segment-level statistics and compute the corpus-level score.

        :param stats: A list of segment-level `[fre, 1]` pairs.
        :return: A `FREScore` instance.
        """
        return self._compute_score_from_stats(sum_of_lists(stats))

    def _extract_reference_info(self, refs: Sequence[str]) -> Dict[str, Any]:
        """Wrap the references of one segment for `_compute_segment_statistics()`.

        :param refs: A sequence of reference strings for one segment.
        :return: A dictionary with the single key `refs`.
        """
        return {'refs': refs}

    def _compute_segment_statistics(self, hypothesis: str,
                                    ref_kwargs: Dict) -> List[float]:
        """Compute the statistics of one segment.

        :param hypothesis: Hypothesis sentence.
        :param ref_kwargs: Dictionary produced by `_extract_reference_info()`;
        not used, because FRE depends on the hypothesis only.
        :return: `[fre, 1]`, where the 1 counts the segment for averaging.
        """
        return [self._clamped_fre(hypothesis), 1]

    def sentence_score(self, hypothesis: str, references: Sequence[str]) -> FREScore:
        """Compute the metric for a single sentence.

        :param hypothesis: A single hypothesis string.
        :param references: A sequence of reference strings (ignored).
        :return: a `FREScore` object.
        """
        return FREScore(self._clamped_fre(hypothesis))


In [None]:
import math
import logging
from importlib import import_module
from typing import List, Sequence, Optional, Dict, Any

from sacrebleu.utils import my_log, sum_of_lists

from sacrebleu.metrics.base import Score, Signature, Metric
from sacrebleu.metrics.helpers import extract_all_word_ngrams

import evaluate

class RougeSignature(Signature):
    """Reproducibility signature used by the Rouge metric.

    Adds nothing to the base `Signature`; it only gives the Rouge metric a
    signature type of its own.

    :param args: key-value dictionary handed over by the metric instance.
    """
    def __init__(self, args: dict):
        """Pass `args` through to the base `Signature` unchanged."""
        super().__init__(args)


class RougeScore(Score):
    """Score container for the Rouge metric, always named ``'Rouge'``.

    :param score: The numeric Rouge value to store.
    """
    def __init__(self, score: float):
        """Create the score under the fixed metric name ``'Rouge'``."""
        super().__init__('Rouge', score)


class ROUGE(Metric):
    """ROUGE-L of the hypotheses against the references, averaged over segments.

    Segment scores come from the `rouge` metric loaded through `evaluate`;
    only its `rougeL` value is used.
    """

    _SIGNATURE_TYPE = RougeSignature

    def __init__(self, references: Optional[Sequence[Sequence[str]]] = None):
        """`Rouge` initializer.

        :param references: Optional reference streams to cache up front.
        """
        super().__init__()

        self.rouge = evaluate.load('rouge')

        if references is not None:
            # Cache the references once so they need not be passed on each call.
            self._ref_cache = self._cache_references(references)

    def _rouge_l(self, hypothesis: str, refs: Sequence[str]) -> float:
        """Return the ROUGE-L value of one hypothesis against its reference(s).

        :param hypothesis: A single hypothesis string.
        :param refs: The reference strings belonging to this hypothesis.
        :return: The `rougeL` value reported for the single prediction.
        """
        refs = list(refs)
        # One prediction needs exactly one `references` entry. A single
        # reference is passed as a flat string (unchanged behaviour); several
        # references are nested so they form one entry instead of causing a
        # prediction/reference length mismatch.
        # NOTE(review): nested lists are the documented multi-reference input
        # of evaluate's rouge — confirm against the pinned evaluate version.
        references = refs if len(refs) == 1 else [refs]
        # use use_aggregator=False to avoid duplicated bootstrap resampling
        result = self.rouge.compute(predictions=[hypothesis], references=references,
                                    use_aggregator=False)
        return result['rougeL'][0]

    def _preprocess_segment(self, sent: str) -> str:
        """Return the segment unchanged; no extra preprocessing is applied here.

        :param sent: The input sentence string.
        :return: The same string.
        """
        return sent

    def _compute_score_from_stats(self, stats: List[float]) -> RougeScore:
        """Turn aggregated statistics into the corpus-level score.

        :param stats: `[sum of segment ROUGE-L values, number of segments]`.
        :return: A `RougeScore` holding the mean segment ROUGE-L.
        """
        return RougeScore(stats[0] / stats[1])

    def _aggregate_and_compute(self, stats: List[List[float]]) -> RougeScore:
        """Sum the segment-level statistics and compute the corpus-level score.

        :param stats: A list of segment-level `[rougeL, 1]` pairs.
        :return: A `RougeScore` instance.
        """
        return self._compute_score_from_stats(sum_of_lists(stats))

    def _extract_reference_info(self, refs: Sequence[str]) -> Dict[str, Any]:
        """Wrap the references of one segment for `_compute_segment_statistics()`.

        :param refs: A sequence of reference strings for one segment.
        :return: A dictionary with the single key `refs`.
        """
        return {'refs': refs}

    def _compute_segment_statistics(self, hypothesis: str,
                                    ref_kwargs: Dict) -> List[float]:
        """Compute the statistics of one segment.

        :param hypothesis: Hypothesis sentence.
        :param ref_kwargs: Dictionary produced by `_extract_reference_info()`;
        its `refs` entry holds the references of this segment.
        :return: `[rougeL, 1]`, where the 1 counts the segment for averaging.
        """
        return [self._rouge_l(hypothesis, ref_kwargs['refs']), 1]

    def sentence_score(self, hypothesis: str, references: Sequence[str]) -> RougeScore:
        """Compute the metric for a single sentence against a single (or multiple) reference(s).

        :param hypothesis: A single hypothesis string.
        :param references: A sequence of reference strings.
        :return: a `RougeScore` object.
        """
        return RougeScore(self._rouge_l(hypothesis, references))


In [None]:
import math
import logging
from importlib import import_module
from typing import List, Sequence, Optional, Dict, Any

from sacrebleu.utils import my_log, sum_of_lists

from sacrebleu.metrics.base import Score, Signature, Metric
from sacrebleu.metrics.helpers import extract_all_word_ngrams

import evaluate

class SARISignature(Signature):
    """Reproducibility signature used by the SARI metric.

    Adds nothing to the base `Signature`; it only gives the SARI metric a
    signature type of its own.

    :param args: key-value dictionary handed over by the metric instance.
    """
    def __init__(self, args: dict):
        """Pass `args` through to the base `Signature` unchanged."""
        super().__init__(args)


class SARIScore(Score):
    """Score container for the SARI metric, always named ``'SARI'``.

    :param score: The numeric SARI value to store.
    """
    def __init__(self, score: float):
        """Create the score under the fixed metric name ``'SARI'``."""
        super().__init__('SARI', score)


class SARI(Metric):
    """SARI of the hypotheses, averaged over segments.

    Segment scores come from the `sari` metric loaded through `evaluate`.
    SARI also needs the source sentence of each segment. The metric interface
    only hands over hypothesis and references, so the source is looked up in
    `sources` through the hash of the segment's first reference.
    """

    _SIGNATURE_TYPE = SARISignature

    def __init__(self, sources , references: Optional[Sequence[Sequence[str]]] = None):
        """`SARI` initializer.

        :param sources: Mapping from `hash(reference string)` to the source
        sentence that belongs to that reference.
        :param references: Optional reference streams to cache up front.
        """
        super().__init__()

        self.sari = evaluate.load('sari')
        self.sources = sources

        if references is not None:
            # Cache the references once so they need not be passed on each call.
            self._ref_cache = self._cache_references(references)

    @staticmethod
    def _find_source(sources, refs: Sequence[str]) -> str:
        """Return the source sentence registered for a segment's first reference.

        The exact reference string is tried first. The whitespace-stripped
        form is tried second, so references with surrounding whitespace still
        match keys that were built from stripped lines.

        :param sources: Mapping from `hash(reference string)` to source sentence.
        :param refs: The reference strings of one segment.
        :return: The matching source sentence.
        :raises KeyError: If no source is registered for the first reference.
        """
        first_ref = refs[0]
        for key in (hash(first_ref), hash(first_ref.strip())):
            if key in sources:
                return sources[key]
        raise KeyError(f"No source sentence registered for reference: {first_ref!r}")

    def _preprocess_segment(self, sent: str) -> str:
        """Return the segment unchanged; no extra preprocessing is applied here.

        :param sent: The input sentence string.
        :return: The same string.
        """
        return sent

    def _compute_score_from_stats(self, stats: List[float]) -> SARIScore:
        """Turn aggregated statistics into the corpus-level score.

        :param stats: `[sum of segment SARI values, number of segments]`.
        :return: A `SARIScore` holding the mean segment SARI.
        """
        return SARIScore(stats[0] / stats[1])

    def _aggregate_and_compute(self, stats: List[List[float]]) -> SARIScore:
        """Sum the segment-level statistics and compute the corpus-level score.

        :param stats: A list of segment-level `[sari, 1]` pairs.
        :return: A `SARIScore` instance.
        """
        return self._compute_score_from_stats(sum_of_lists(stats))

    def _extract_reference_info(self, refs: Sequence[str]) -> Dict[str, Any]:
        """Wrap the references of one segment for `_compute_segment_statistics()`.

        :param refs: A sequence of reference strings for one segment.
        :return: A dictionary with the keys `refs` (this segment's references)
        and `sources` (the complete source lookup table).
        """
        return {'refs': refs, 'sources': self.sources}

    def _compute_segment_statistics(self, hypothesis: str,
                                    ref_kwargs: Dict) -> List[float]:
        """Compute the statistics of one segment.

        :param hypothesis: Hypothesis sentence.
        :param ref_kwargs: Dictionary produced by `_extract_reference_info()`
        with the keys `refs` and `sources`.
        :return: `[sari, 1]`, where the 1 counts the segment for averaging.
        :raises KeyError: If no source is registered for the first reference.
        """
        refs = ref_kwargs['refs']
        source = self._find_source(ref_kwargs['sources'], refs)

        sari_score = self.sari.compute(sources=[source], predictions=[hypothesis],
                                       references=[refs])
        return [sari_score['sari'], 1]

    def sentence_score(self, hypothesis: str, references: Sequence[str]) -> SARIScore:
        """Compute the metric for a single sentence against a single (or multiple) reference(s).

        :param hypothesis: A single hypothesis string.
        :param references: A sequence of reference strings.
        :return: a `SARIScore` object.
        :raises KeyError: If no source is registered for the first reference.
        """
        refs = list(references)
        source = self._find_source(self.sources, refs)
        # The source must be supplied, and the references nested so that they
        # form the single entry belonging to the single prediction.
        sari_score = self.sari.compute(sources=[source], predictions=[hypothesis],
                                       references=[refs])
        return SARIScore(sari_score['sari'])


In [None]:
import math
import logging
from importlib import import_module
from typing import List, Sequence, Optional, Dict, Any

from sacrebleu.utils import my_log, sum_of_lists

from sacrebleu.metrics.base import Score, Signature, Metric
from sacrebleu.metrics.helpers import extract_all_word_ngrams

class REPETITIONSignature(Signature):
    """Reproducibility signature used by the REPETITION metric.

    Adds nothing to the base `Signature`; it only gives the REPETITION metric
    a signature type of its own.

    :param args: key-value dictionary handed over by the metric instance.
    """
    def __init__(self, args: dict):
        """Pass `args` through to the base `Signature` unchanged."""
        super().__init__(args)


class REPETITIONScore(Score):
    """Score container for the REPETITION metric, always named ``'Repetition'``.

    :param score: The numeric repetition value to store.
    """
    def __init__(self, score: float):
        """Create the score under the fixed metric name ``'Repetition'``."""
        super().__init__('Repetition', score)


class REPETITION(Metric):
    """Character-level repetition of the hypotheses, averaged over segments.

    The score of a segment is the length of its longest self-repeating
    character run divided by the segment length (see `_process_line`).
    References are accepted so the class fits the sacrebleu `Metric`
    interface, but they never enter the score.
    """

    _SIGNATURE_TYPE = REPETITIONSignature

    def __init__(self, references: Optional[Sequence[Sequence[str]]] = None):
        """`Repetition` initializer.

        :param references: Accepted for interface compatibility; not used.
        """
        super().__init__()

    def _process_line(self, line, min_repetitions=2):
        """Return the share of `line` covered by its longest immediate repetition.

        For every shift `s` from `min_repetitions` up to `len(line) - 1`, the
        line is compared with itself moved `s` characters ahead. A run of `r`
        consecutive positions with `line[k] == line[k + s]` only counts when
        `r >= s`, i.e. when a chunk of `s` characters is followed directly by
        a copy of itself.

        Two defects of the earlier implementation are fixed here:
        a run that was still open at the end of the line was never recorded
        (so a fully periodic line such as "lalala" scored 0), and the
        comparison wrapped around, matching the end of the line against its
        beginning.

        :param line: The text to analyse; surrounding whitespace is ignored.
        :param min_repetitions: Smallest shift (repeated chunk length, in
        characters) that is considered.
        :return: 0 if nothing repeats, otherwise `longest run / len(line)`.
        """
        line = line.strip()
        n_chars = len(line)
        best_run = 0
        # A shift of 0 would compare the line with itself and always "repeat".
        for shift in range(max(1, min_repetitions), n_chars):
            longest_run = 0
            current_run = 0
            # zip stops at the shorter sequence, so there is no wrap-around.
            for left, right in zip(line, line[shift:]):
                if left == right:
                    current_run += 1
                    # Track the maximum while counting, so a run that reaches
                    # the end of the line is not lost.
                    if current_run > longest_run:
                        longest_run = current_run
                else:
                    current_run = 0
            if longest_run >= shift and longest_run > best_run:
                best_run = longest_run

        if best_run == 0:
            return 0
        return best_run / n_chars

    def _preprocess_segment(self, sent: str) -> str:
        """Return the segment unchanged; no extra preprocessing is applied here.

        :param sent: The input sentence string.
        :return: The same string.
        """
        return sent

    def _compute_score_from_stats(self, stats: List[float]) -> REPETITIONScore:
        """Turn aggregated statistics into the corpus-level score.

        :param stats: `[sum of segment repetition values, number of segments]`.
        :return: A `REPETITIONScore` holding the mean segment value.
        """
        return REPETITIONScore(stats[0] / stats[1])

    def _aggregate_and_compute(self, stats: List[List[float]]) -> REPETITIONScore:
        """Sum the segment-level statistics and compute the corpus-level score.

        :param stats: A list of segment-level `[repetition, 1]` pairs.
        :return: A `REPETITIONScore` instance.
        """
        return self._compute_score_from_stats(sum_of_lists(stats))

    def _extract_reference_info(self, refs: Sequence[str]) -> Dict[str, Any]:
        """Wrap the references of one segment for `_compute_segment_statistics()`.

        :param refs: A sequence of reference strings for one segment.
        :return: A dictionary with the single key `refs`.
        """
        return {'refs': refs}

    def _compute_segment_statistics(self, hypothesis: str,
                                    ref_kwargs: Dict) -> List[float]:
        """Compute the statistics of one segment.

        :param hypothesis: Hypothesis sentence.
        :param ref_kwargs: Dictionary produced by `_extract_reference_info()`;
        not used, because the score depends on the hypothesis only.
        :return: `[repetition, 1]`, where the 1 counts the segment for averaging.
        """
        return [self._process_line(hypothesis), 1]

    def sentence_score(self, hypothesis: str, references: Sequence[str]) -> REPETITIONScore:
        """Compute the metric for a single sentence.

        :param hypothesis: A single hypothesis string.
        :param references: A sequence of reference strings (ignored).
        :return: a `REPETITIONScore` object.
        """
        return REPETITIONScore(self._process_line(hypothesis))


# Evaluate Systems

Drag and drop refs.txt, sources.txt and the corresponding translation .txt files (one for each system) into the left sidebar of Google Colab, then run all cells.

In [None]:
#adapted from sacrebleu

from sacrebleu.significance import PairedTest
from sacrebleu.utils import filter_subset, smart_open
from sacrebleu.metrics.bleu import BLEU

# --- Configuration -----------------------------------------------------------
refs_file = "refs.txt"
sources_file = 'sources.txt'

paired_bs = True       # True: paired bootstrap resampling ('bs'); False: 'ar'
paired_bs_n = 1000     # number of samples for the 'bs' test
paired_ar_n = 10000    # number of samples for the 'ar' test
short = False          # passed to the signature formatter below
paired_jobs = 1        # number of worker processes for PairedTest

# Set params
test_type = 'bs' if paired_bs else 'ar'
n_samples = paired_bs_n if paired_bs else paired_ar_n

full_systems, sys_names = [], []

# No subset filtering is requested; these are only forwarded to filter_subset.
test_set = None
langpair = None
origlang = None
subset = None

concat_ref_files = [[refs_file]]
num_refs = len(concat_ref_files)

# --- Source lookup table for SARI --------------------------------------------
# Read both files as UTF-8, the same encoding smart_open uses further down.
with open(refs_file, encoding='utf-8') as ref_handle:
    references = ref_handle.readlines()
with open(sources_file, encoding='utf-8') as source_handle:
    source_lines = source_handle.readlines()

# Sources and references are paired line by line, so the counts must agree.
if len(source_lines) != len(references):
    raise ValueError(
        f"{sources_file} has {len(source_lines)} lines but {refs_file} has "
        f"{len(references)}; the files must be aligned line by line.")

# The SARI metric finds the source of a segment through the hash of its
# reference, so the table is keyed by hash(stripped reference line).
# NOTE: identical reference lines share one key; the last source read wins.
sources = {}
for reference, source in zip(references, source_lines):
    sources[hash(reference.strip())] = source.rstrip()

metrics = {
    'BLEU': BLEU(),
    'RougeL': ROUGE(),
    'SARI': SARI(sources=sources),
    'FRE': FRE(),
    'REPETITIONS': REPETITION(),
    }

# --- Read system outputs ------------------------------------------------------
for fname in systems:
    with smart_open(fname, encoding='utf-8') as sys_handle:
        full_systems.append([line.rstrip() for line in sys_handle])
    sys_names.append(fname)

num_sys = len(sys_names)

# --- Read references ----------------------------------------------------------
full_refs = [[] for _ in range(max(len(concat_ref_files[0]), num_refs))]
for ref_files in concat_ref_files:
    for refno, ref_file in enumerate(ref_files):
        with smart_open(ref_file, encoding='utf-8') as ref_handle:
            for line in ref_handle:
                line = line.rstrip()
                if num_refs == 1:
                    full_refs[refno].append(line)

# Every system must provide exactly one output line per reference line;
# otherwise hypotheses and references would be paired with the wrong lines.
for sys_name, sys_lines in zip(sys_names, full_systems):
    if len(sys_lines) != len(full_refs[0]):
        raise ValueError(
            f"{sys_name} has {len(sys_lines)} lines but {refs_file} has "
            f"{len(full_refs[0])}; system output and references must be aligned.")

# Filter subsets if requested
outputs = filter_subset(
    [*full_systems, *full_refs], test_set, langpair,
    origlang, subset)

# Unpack systems & references back. The result is deliberately NOT assigned to
# `systems`: that name holds the list of file names from the configuration
# cell, and overwriting it would break re-running this cell.
system_outputs, refs = outputs[:num_sys], outputs[num_sys:]

named_systems = list(zip(sys_names, system_outputs))

# --- Paired significance test -------------------------------------------------
ps = PairedTest(named_systems, metrics, references=refs,
                test_type=test_type, n_samples=n_samples,
                n_jobs=paired_jobs)

# Set back the number of trials
paired_n = ps.n_samples

# Run the test
sigs, scores = ps()

# Get signature strings
sigs = {k: v.format(short) for k, v in sigs.items()}
scores

In [None]:
# Position of the system to report within the `scores` columns.
index = 0
print(scores['System'][index])
# (label shown, key in `scores`) for each metric that is printed.
for label, key in (("BLEU", 'BLEU'), ("ROUGE", 'Rouge'), ("SARI", 'SARI'), ("FRE", 'FRE')):
    print(label + ": ", scores[key][index])