In [1]:
pip install evaluate


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill (from evaluate)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from evaluate)
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiproce

In [2]:
pip install torchmetrics

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchmetrics
  Downloading torchmetrics-0.11.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.2/519.2 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torchmetrics
Successfully installed torchmetrics-0.11.4


In [1]:
import os
import copy

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from scipy.stats import pearsonr
import evaluate
from torchmetrics import BLEUScore

  from .autonotebook import tqdm as notebook_tqdm


Mounted at /content/gdrive


In [2]:
def get_concordant_discordant_filtered(a, b, min_diff=5):
    con = 0
    dis = 0

    # The LENS paper uses only pairs where all three annotators agree with the ranking order
    # and the unnormalised score difference is larger than 5

    # If by that, they mean the score difference is larger than 5 for each annotaor:
    if False:
        for i in range(len(a)):
            for j in range(0, i):

                # Filter out invalid pairs
                filtered = False
                for annotator_idx in range(1, 4):
                    diff = b.iloc[j][f'rating_{annotator_idx}'] - b.iloc[i][f'rating_{annotator_idx}']

                    # Filter out cases where score diff <= 5
                    if abs(diff) <= min_diff:
                        filtered = True

                    # Make sure that all annotators agree with the order
                    if annotator_idx == 1:
                        larger = diff > 0
                    else:
                        if (diff > 0) != larger:
                            filtered = True
                            break
                if filtered:
                    continue

                if larger:
                    larger = 1
                else:
                    larger = -1

                # Count concordanct and discordant pairs
                if (a[j] - a[i]) * larger > 0:
                    con += 1
                else:
                    dis += 1

    # ... If by that, they mean the average score difference is larger than 5:
    else:
        for i in range(len(a)):
            for j in range(0, i):

                # Filter
                filtered = False
                diffs = []
                for annotator_idx in range(1, 4):
                    diff = b.iloc[j][f'rating_{annotator_idx}'] - b.iloc[i][f'rating_{annotator_idx}']
                    diffs.append(diff)

                    # Make sure that all annotators agree with the order
                    if annotator_idx == 1:
                        larger = diff > 0
                    else:
                        if (diff > 0) != larger:
                            filtered = True
                            break

                # Make sure the average score diff is larger than 5
                avg = sum(diffs) / len(diffs)
                if abs(avg) <= min_diff:
                    filtered = True

                if filtered:
                    continue

                if larger:
                    larger = 1
                else:
                    larger = -1

                # Count concordanct and discordant pairs
                if (a[j] - a[i]) * larger > 0:
                    con += 1
                else:
                    dis += 1

    print(f'Concordant: {con}, discordant: {dis}')
    return (con - dis) / (con + dis)

In [33]:
def test_bleu_simpeval_2022():
    # Evaluate BLEU's correlation with human scores
    # This should correspond to the tao_all value on Table 2 of the LENS paper

    # The BLEU implemntation from Huggingface's evaluate package
    #bleu = evaluate.load('bleu')
    bleu = BLEUScore()

    # SimpEval 2022 as provided in the LENS repo
    df = pd.read_excel('../data/stage3/simpeval_2022.xlsx')
    # df = pd.read_csv("../data/stage3/simpeval_2022.csv")
    df_original = copy.deepcopy(df)

    human = "Human 1 Writing"

    # The dataset contains two human simplifications for each source sentence.
    # They use one as the reference and the other as the oracle output.
    # I use Human 2 Writing as the reference and Human 1 Writing as the oracle output here. I've also tried the other way around.
    df = df[df['system'] != human]

    scores = []
    for i in range(len(df)):
        pred = df.iloc[i]['generation']

        # Resolve the reference
        original_id = df.iloc[i]['original_id']
        human_generated = df_original['system'].isin([human])
        same_id = df_original['original_id'] == original_id
        refs = df_original[human_generated & same_id]['generation'].tolist()

        # Compute BLEU under the default settings
        #score = bleu.compute(predictions = [pred], references = [refs])['bleu']
        #scores.append(score)

        bleu.update([pred], [refs])
        score = bleu.compute()
        score = bleu([pred], [refs])
        
        if i > 3:
            pred = 'Hi I am a sentence'
            refs = ['Hi I am a sentence']

            bleu.update([pred], [refs])
            score = bleu.compute() # This returns a score of 0.1131 probably because the BLEUScore object's states are not properly updated.
            print(score)

            score = bleu([pred], [refs]) # This returns a score of 1. It is also how the BLEUScore object is used in the documentation: https://torchmetrics.readthedocs.io/en/stable/text/bleu_score.html.
            print(score)
            
            print(pred)
            print(refs)
            break
        scores.append(score.item())
        #print(score)

    #print(scores)
    # Kendall tau-like with pairs where all annotators agree with the ranking order and unormalized score differences > 5
    kendall = get_concordant_discordant_filtered(scores, df)
    print(f'Kendall Tau-like (filtered pairs): {kendall}')

In [34]:
test_bleu_simpeval_2022()

tensor(0.1131)
tensor(1.)
Hi I am a sentence
['Hi I am a sentence']
Concordant: 0, discordant: 1
Kendall Tau-like (filtered pairs): -1.0


  warn(


In [28]:
pred = ['They imposed a fine of $400, or $100 per count of fraud, and three years of supervised release following the prison sentence.']
ref = [['The sentence included a three-year supervised release and a fine of $400.']]


from torchmetrics import BLEUScore
bleu = BLEUScore()

my_results = bleu(pred, ref)
print(my_results)

bleu.update(pred, ref)
your_results = bleu.compute()
print(your_results)


tensor(0.)
tensor(0.)
