In [None]:
from google.colab import drive
drive.mount('/content/drive')
%cd drive/MyDrive/Colab Notebooks/master_thesis
!pip install git+https://github.com/feralvam/easse.git
#!pip install tupa
#nltk.download('stopwords')
#!python -m spacy download en_core_web_md

In [None]:
#!pip install pingouin
import numpy as np
import pandas as pd
#import pingouin as pg
from scipy.stats import zscore, spearmanr, pearsonr
from tqdm import tqdm
import nltk

In [None]:
import en_core_web_md
en_core_web_md.load()
import spacy
#spacy.cli.download('en_core_web_md')
spacy.load('en_core_web_md')

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


<spacy.lang.en.English at 0x7ff032bd1e10>

# import

In [None]:
# From https://github.com/feralvam/metaeval-simplification/blob/main/notebooks/correlation.py
from itertools import combinations
import numpy as np
import pandas as pd
from scipy.stats import zscore, pearsonr, t
from scipy.stats.mstats import mquantiles
from sklearn.metrics import cohen_kappa_score


def _standardise_ratings(df, rater_id_cols, aspect_col):
    return df.groupby(rater_id_cols)[aspect_col].transform(lambda x: zscore(x))


def _simulate_two_annotators(ratings, num_ratings_annotatorA=1):
    ratings_shuffled = np.random.permutation(ratings)
    ratingA = np.mean(ratings_shuffled[:num_ratings_annotatorA])
    ratingB = np.mean(ratings_shuffled[num_ratings_annotatorA:])
    return [ratingA, ratingB]


def compute_inter_annotator_agreement(df_ratings, segment_id_cols, rater_id_cols, aspects,
                                      n_bins=5, use_quantiles=True, n_simulations=1000):
    """Estimate inter-annotator agreement per aspect by simulating two annotators.

    For every aspect, the per-rater z-scores (added to `df_ratings` as
    `<aspect>_zscore` when that column is missing) are binned into `n_bins` bins,
    either by quantiles (`use_quantiles=True`) or by equal-width intervals.
    In each of the `n_simulations` rounds the ratings of every segment are randomly
    split into two simulated annotators, both are digitised with the bin edges and
    compared with a quadratically weighted Cohen's kappa.

    Returns a dict mapping each aspect to `(mean_kappa, std_kappa)`.
    Note: `df_ratings` is modified in place when a z-score column has to be added.
    """
    agreement = {}
    for aspect in aspects:
        zscore_col = f"{aspect}_zscore"
        if zscore_col not in df_ratings.columns:
            df_ratings[zscore_col] = _standardise_ratings(df_ratings, rater_id_cols, aspect)
        df_scores = df_ratings[segment_id_cols + [zscore_col]]

        # Only the bin edges are needed; the binned series itself is discarded.
        if use_quantiles:  # equally-distributed
            bin_edges = pd.qcut(df_scores[zscore_col], q=n_bins, retbins=True)[1]
        else:  # equally-spaced
            bin_edges = pd.cut(df_scores[zscore_col], bins=n_bins, retbins=True)[1]

        def _simulate_kappa():
            simulated_pairs = (
                df_scores.groupby(segment_id_cols)[zscore_col]
                .apply(_simulate_two_annotators)
                .to_list()
            )
            rater_a, rater_b = zip(*simulated_pairs)
            return cohen_kappa_score(
                np.digitize(rater_a, bin_edges),
                np.digitize(rater_b, bin_edges),
                weights='quadratic',
            )

        kappas = [_simulate_kappa() for _ in tqdm(range(n_simulations))]
        agreement[aspect] = (np.mean(kappas), np.std(kappas))
    return agreement


def compute_segment_scores(df_ratings, segment_id_cols, rater_id_cols, aspects):
    """Average raw and z-scored ratings per segment.

    For each aspect a `<aspect>_zscore` column (ratings standardised per rater) is
    written into `df_ratings` (in place), then both the raw and the z-scored
    columns are averaged over all rows sharing the same `segment_id_cols`.

    Returns a DataFrame indexed by `segment_id_cols` with, for every aspect, the
    columns `<aspect>` and `<aspect>_zscore` (in that order).
    """
    scores_cols = []
    for aspect in aspects:
        df_ratings[f"{aspect}_zscore"] = _standardise_ratings(df_ratings, rater_id_cols, aspect)
        scores_cols += [aspect, f"{aspect}_zscore"]

    # GroupBy.mean() keeps the plain column names, so no MultiIndex flattening is
    # needed, and it avoids passing a NumPy callable to .agg(), which recent pandas
    # versions deprecate.
    df_segment_scores = df_ratings.groupby(segment_id_cols)[scores_cols].mean()

    return df_segment_scores


def _select_pairs_in_group(group, min_score_difference=25):
    data = []
    for (system_a, score_a, zscore_a), (system_b, score_b, zscore_b) in combinations(group.values, 2):
        # select the pair if its absolute difference in DA scores is greater than 25
        if abs(score_a - score_b) > min_score_difference:
            data.append([system_a, score_a, zscore_a, system_b, score_b, zscore_b])
    df_selected_pairs = pd.DataFrame(data,
                                     columns=['system_a', "score_a", "zscore_a", "system_b", "score_b", "zscore_b"])
    return df_selected_pairs


def select_segment_pairs(df_human_scores, aspect, sentence_id_cols, system_id_cols):
    """Collect, per source sentence, the system-output pairs with clearly different scores.

    Args:
        df_human_scores: segment-level human scores; its index is reset so that the
            id columns are available as regular columns.
        aspect: name of the rated aspect; the columns `<aspect>` and
            `<aspect>_zscore` are read.
        sentence_id_cols: list of columns identifying the source sentence.
        system_id_cols: list of columns identifying the system. The pair builder
            unpacks exactly three values per row, so this is expected to be a
            single column.

    Returns:
        DataFrame with the sentence id column(s) followed by
        system_a, score_a, zscore_a, system_b, score_b, zscore_b.
    """
    df_scores = df_human_scores.reset_index()
    cols_of_interest = system_id_cols + [aspect, f"{aspect}_zscore"]
    pairs_per_sentence = df_scores.groupby(sentence_id_cols)[cols_of_interest].apply(_select_pairs_in_group)
    # The innermost index level is the row counter of each per-sentence pairs frame.
    # Drop it by position from the end (level=-1) so this also works when several
    # sentence id columns are used; level=1 was only correct for a single one.
    selected_pairs = pairs_per_sentence.reset_index(level=-1, drop=True).reset_index()
    return selected_pairs


def compute_relative_ranking_correlations(df_human_scores, df_metrics_scores, aspect,
                                          segment_id_cols, sentence_id_cols, system_id_cols,
                                          use_absolute_values=True, bootstrap_samples=1000):
    """Correlate metrics with human relative rankings (Kendall's tau-like, WMT variant).

    Steps:
      1. Build pairs of system outputs for the same sentence via `select_segment_pairs`.
      2. Attach the metric scores of both systems of every pair (two merges; the
         metric columns of system A / system B end up with the suffixes `_x` / `_y`).
      3. Compute `kendall_tau_wmt` per metric between human z-scores and metric scores.
      4. Re-compute the correlations on `bootstrap_samples` resamples (with
         replacement) of the pairs and derive a quantile-based interval per metric.
      5. Mark metric A as significantly better than metric B when the lower bound of
         A's interval is above the upper bound of B's interval.

    Args:
        df_human_scores: segment-level human scores (see `select_segment_pairs`).
        df_metrics_scores: one row per segment with `segment_id_cols` plus one
            column per metric.
        aspect: rated aspect whose `<aspect>_zscore` drives the ranking.
        segment_id_cols: id columns of `df_metrics_scores` used as merge keys.
        sentence_id_cols: id columns of the source sentence.
        system_id_cols: id columns of the system.
        use_absolute_values: if True, the absolute value of each correlation is used.
        bootstrap_samples: number of bootstrap resamples.

    Returns:
        (df_correlations, df_significance): the first has the columns metric, corr,
        conf_interval and is_winner, sorted by corr in descending order; the second
        is a boolean metric-by-metric matrix (row metric significantly better than
        column metric).

    NOTE(review): a later cell of this notebook defines another function with the
    same name but a different signature; once that cell is executed it shadows this
    definition.
    """
    df_segment_pairs = select_segment_pairs(df_human_scores, aspect, sentence_id_cols, system_id_cols)

    # First merge: metric scores of system A of each pair.
    df_all_scores = pd.merge(left=df_segment_pairs,
                             left_on=sentence_id_cols+['system_a'],
                             right=df_metrics_scores,
                             right_on=segment_id_cols)
    # Second merge: metric scores of system B; overlapping column names get the
    # default pandas suffixes, i.e. `<metric>_x` (system A) and `<metric>_y` (system B).
    df_all_scores = pd.merge(left=df_all_scores,
                             left_on=sentence_id_cols+['system_b'],
                             right=df_metrics_scores,
                             right_on=segment_id_cols)

    metrics_names = [col for col in df_metrics_scores.columns if col not in segment_id_cols]

    # Compute the correlations
    print("Computing correlations...")
    correlations_data = []
    for metric in metrics_names:
        corr = kendall_tau_wmt(df_all_scores[['zscore_a', 'zscore_b', f"{metric}_x", f"{metric}_y"]])
        if use_absolute_values:
            corr = abs(corr)
        correlations_data.append([metric, corr])
    df_correlations = pd.DataFrame(correlations_data, columns=['metric', 'corr'])

    # Bootstrap sampling
    print("Bootstrap sampling...")
    correlations_bootstrap_data = []
    for _ in range(bootstrap_samples):
        # Same size as the original set of pairs, drawn with replacement.
        df_scores_sample = df_all_scores.sample(n=len(df_all_scores), replace=True)
        for metric in metrics_names:
            corr_sample = kendall_tau_wmt(df_scores_sample[['zscore_a', 'zscore_b', f"{metric}_x", f"{metric}_y"]])
            if use_absolute_values:
                corr_sample = abs(corr_sample)
            correlations_bootstrap_data.append([metric, corr_sample])
    df_bootstrap_correlations = pd.DataFrame(correlations_bootstrap_data, columns=['metric', 'corr'])

    # Compute 95% confidence intervals for each metric
    # NOTE(review): the quantiles used below are 0.05 and 0.95, which span the central
    # 90% of the bootstrap distribution, while the message says 95% - confirm which
    # of the two is intended.
    print("Computing 95% confidence intervals for each metric...")
    confidence_intervals = []
    for metric in metrics_names:
        metric_corr = df_bootstrap_correlations[df_bootstrap_correlations['metric'] == metric]['corr']
        # Equivalent to using the R function quantile with default type 7
        lower, upper = mquantiles(metric_corr, prob=[0.05, 0.95], alphap=1, betap=1)
        confidence_intervals.append(pd.Interval(left=lower, right=upper, closed='both'))
    df_correlations['conf_interval'] = confidence_intervals
    df_correlations.sort_values(by=['corr'], ascending=False, inplace=True, ignore_index=True)

    # Determine if the difference in performance is significant
    print("Determining if the difference in performance is significant...")
    # Re-read the metric names so that rows/columns of the matrix follow the sorted order.
    metrics_names = df_correlations['metric'].to_list()
    significance_matrix = []
    winner_status = []
    for _, row_metric_a in df_correlations.iterrows():
        metric_a = row_metric_a['metric']
        ci_metric_a = row_metric_a['conf_interval']
        is_winner = True
        significance_row = []
        for _, row_metric_b in df_correlations.iterrows():
            metric_b = row_metric_b['metric']
            ci_metric_b = row_metric_b['conf_interval']
            # It's significant if confidence intervals do not overlap
            is_diff_stats_significant = ci_metric_a.left > ci_metric_b.right
            significance_row.append(is_diff_stats_significant)
            # Update winner status (not significantly outperformed by any other metric)
            # NOTE(review): as written, a metric is a winner only when it is
            # significantly better than EVERY other metric, which is stricter than
            # "not significantly outperformed" - confirm the intended rule.
            if metric_b != metric_a:
                is_winner = is_winner and is_diff_stats_significant
        significance_matrix.append(significance_row)
        winner_status.append(is_winner)
    df_correlations['is_winner'] = winner_status
    df_significance = pd.DataFrame(np.array(significance_matrix), columns=metrics_names, index=metrics_names)

    return df_correlations, df_significance


def kendall_tau_wmt(df_scores):
    """Kendall's tau-like correlation over pairwise comparisons (WMT-style counting).

    Args:
        df_scores: DataFrame with exactly four columns, in this order: human score
            of item A, human score of item B, metric score of A, metric score of B.

    Each row is one pairwise comparison. Rows where the human scores are tied (or
    not comparable) are ignored. A row is concordant when the metric orders the two
    items strictly the same way as the human scores; it is discordant otherwise,
    which includes metric ties.

    Returns:
        (concordant - discordant) / (concordant + discordant), or NaN when no row
        expresses a strict human preference (e.g. an empty frame), instead of
        raising ZeroDivisionError.
    """
    concordant = 0
    discordant = 0
    # Plain tuples are enough here and much cheaper than building a Series per row.
    for score_a, score_b, metric_a, metric_b in df_scores.itertuples(index=False, name=None):
        if score_a < score_b:
            if metric_a < metric_b:
                concordant += 1
            else:
                discordant += 1
        elif score_a > score_b:
            if metric_a <= metric_b:
                discordant += 1
            else:
                concordant += 1

    total = concordant + discordant
    if total == 0:
        # No usable comparison: the statistic is undefined.
        return float("nan")
    return (concordant - discordant) / total


def compute_direct_assessment_correlations(df_human_scores, df_metrics_scores, aspect, segment_id_cols,
                                           use_absolute_values=True):
    """Pearson correlation of each metric with human z-scores, plus Williams significance tests.

    Args:
        df_human_scores: segment-level human scores; after `reset_index()` it must
            provide `segment_id_cols`, `<aspect>` and `<aspect>_zscore`.
        df_metrics_scores: one row per segment with `segment_id_cols` plus one
            column per metric.
        aspect: name of the rated aspect.
        segment_id_cols: list of columns used to join human and metric scores.
        use_absolute_values: if True, absolute correlation values are used.

    Returns:
        (df_correlations_metrics_human, df_significance): the first has the columns
        metric, corr, p_value and is_winner, sorted by corr in descending order; the
        second is a metric-by-metric matrix holding the Williams-test p-value where
        the row metric correlates significantly better (p < 0.05) than the column
        metric, and NaN elsewhere.
    """
    df_da_scores = df_human_scores.reset_index()
    cols_of_interest = segment_id_cols + [aspect, f"{aspect}_zscore"]
    df_da_scores = df_da_scores[cols_of_interest]
    df_all_scores = pd.merge(left=df_metrics_scores, right=df_da_scores, on=segment_id_cols)
    # All correlations below are computed on the merged rows, so this (and not the
    # size of the un-merged human scores) is the sample size of the Williams test.
    n_paired_segments = len(df_all_scores)

    # Compute correlations metrics vs human scores
    print("Computing correlations...")
    metrics_names = [col for col in df_metrics_scores.columns if col not in segment_id_cols]
    correlations_data = []
    for metric in metrics_names:
        corr, p_value = pearsonr(df_all_scores[metric], df_all_scores[f'{aspect}_zscore'])
        if use_absolute_values:
            corr = abs(corr)
        correlations_data.append([metric, corr, p_value])
    df_correlations_metrics_human = pd.DataFrame(correlations_data, columns=['metric', 'corr', 'p_value'])
    df_correlations_metrics_human = df_correlations_metrics_human.sort_values(
        by=['corr'], ascending=False, ignore_index=True
    )

    # Compute correlations metrics vs metrics
    # (names are re-read so that they follow the sorted order)
    metrics_names = df_correlations_metrics_human['metric'].to_list()
    correlations_data = []
    for _, (metric_a, corr_metric_a, _) in df_correlations_metrics_human.iterrows():
        for _, (metric_b, corr_metric_b, _) in df_correlations_metrics_human.iterrows():
            corr_a_b, pvalue_a_b = pearsonr(df_all_scores[metric_a], df_all_scores[metric_b])
            if use_absolute_values:
                corr_a_b = abs(corr_a_b)
            correlations_data.append([metric_a, corr_metric_a,
                                      metric_b, corr_metric_b,
                                      corr_a_b, pvalue_a_b])
    df_correlations_metric_metric = pd.DataFrame(correlations_data,
                                                 columns=['metric_a', 'corr_metric_a',
                                                          'metric_b', 'corr_metric_b',
                                                          'corr_a_b', 'pvalue_a_b'])

    # Determine if the difference in performance is significant
    print("Determining if the difference in performance is significant...")
    significance_matrix = []
    winner_status = []
    for metric_a in metrics_names:
        df_correlations = df_correlations_metric_metric[df_correlations_metric_metric['metric_a'] == metric_a]
        is_winner = True
        significance_row = []
        for _, (_, corr_metric_a, metric_b, corr_metric_b, corr_a_b, _) in df_correlations.iterrows():
            p = np.nan
            # The test is one-sided: only run it when A correlates better than B.
            if (metric_a != metric_b) and (corr_metric_a > corr_metric_b):
                _, p = williams_test(corr_metric_a, corr_metric_b, corr_a_b, n_paired_segments)
            # NaN < 0.05 is False, so untested pairs count as not significant.
            is_diff_stats_significant = p < 0.05
            if not is_diff_stats_significant:
                # we do not care about the exact values in cases where it's not significant
                p = np.nan
            significance_row.append(p)
            # Update winner status (not significantly outperformed by any other metric)
            # NOTE(review): as written, a metric is a winner only when it is
            # significantly better than EVERY other metric - confirm the intended rule.
            if metric_a != metric_b:
                is_winner = is_winner and is_diff_stats_significant
        significance_matrix.append(significance_row)
        winner_status.append(is_winner)
    df_correlations_metrics_human['is_winner'] = winner_status
    df_significance = pd.DataFrame(np.array(significance_matrix), columns=metrics_names, index=metrics_names)

    return df_correlations_metrics_human, df_significance


# From https://github.com/inmoonlight/nlp-williams/blob/master/williams.py
def williams_test(r12, r13, r23, n):
    """The Williams test (Evan J. Williams. 1959. Regression Analysis, volume 14. Wiley, New York, USA)
    A test of whether the population correlation r12 equals the population correlation r13.
    Significant: p < 0.05
    Arguments:
        r12 (float): correlation between x1, x2
        r13 (float): correlation between x1, x3
        r23 (float): correlation between x2, x3
        n (int): size of the population
    Returns:
        t (float): Williams test result (the value of the test statistic)
        p (float): p-value of t-dist (one-sided, n - 3 degrees of freedom)
    Raises:
        AssertionError: if r12 < r13 or n <= 3
    """
    assert (r12 >= r13), "r12 should be larger than r13"
    assert (n > 3), "n should be larger than 3"

    K = 1 - r12 ** 2 - r13 ** 2 - r23 ** 2 + 2 * r12 * r13 * r23
    denominator = np.sqrt(
        2 * K * (n - 1) / (n - 3) + (((r12 + r13) ** 2) / 4) * ((1 - r23) ** 3)
    )
    numerator = (r12 - r13) * np.sqrt((n - 1) * (1 + r23))
    # Keep the statistic in its own name: `t` is the scipy.stats distribution object,
    # and returning it (as before) handed callers the distribution, not the statistic.
    t_statistic = numerator / denominator
    p = 1 - t.cdf(t_statistic, df=n - 3)  # changed to n-3 on 30/11/14
    return t_statistic, p

In [None]:
from easse.utils.constants import TEST_SETS_PATHS
from easse.utils.helpers import read_lines
import math
import scipy

def read_test_set(test_set, as_lists=False):
    """Load the original sentences and the references of an EASSE test set.

    Paths are looked up in `TEST_SETS_PATHS` under `(test_set, "orig")` and
    `(test_set, "refs")` and read with `read_lines`.

    Returns:
        If `as_lists` is True: `(orig_sents, refs_sents)`, where `refs_sents`
        holds one list of sentences per reference file.
        Otherwise: a DataFrame with one row per (sentence, reference) combination
        and the columns sent_id (1-based), orig_sent and ref_sent.
    """
    orig_sents = read_lines(TEST_SETS_PATHS[(test_set, "orig")])
    refs_sents = [read_lines(path) for path in TEST_SETS_PATHS[(test_set, "refs")]]

    if as_lists:
        return orig_sents, refs_sents

    # Walk the original file and all reference files in parallel, line by line,
    # emitting one record per reference of each sentence.
    records = [
        (sent_id, orig_sent, ref_sent)
        for sent_id, (orig_sent, *sent_refs) in enumerate(zip(orig_sents, *refs_sents), start=1)
        for ref_sent in sent_refs
    ]
    return pd.DataFrame(records, columns=["sent_id", "orig_sent", "ref_sent"])


def collect_references(sent_ids, test_set_orig_sents, test_set_refs_sents, num_refs):
    """Pick the original sentences and references for the given 1-based sentence ids.

    Returns `(orig_sents, refs_sents)`: `orig_sents` has one sentence per id and
    `refs_sents` has `num_refs` lists, the i-th one holding, for every id, the
    sentence taken from the i-th entry of `test_set_refs_sents`.
    """
    # Sentence ids start at 1, list positions at 0.
    positions = [sent_id - 1 for sent_id in sent_ids]
    orig_sents = [test_set_orig_sents[pos] for pos in positions]

    refs_sents = [[] for _ in range(num_refs)]
    for pos in positions:
        for ref_no, ref_lines in enumerate(test_set_refs_sents):
            refs_sents[ref_no].append(ref_lines[pos])

    return orig_sents, refs_sents


def sigmoid(x):
    """Logistic function 1 / (1 + e^(-x)) for a scalar `x`.

    For very negative `x`, `math.exp(-x)` overflows; in that case the limit value
    0.0 is returned instead of raising OverflowError. All other inputs are computed
    exactly as before.
    """
    try:
        return 1 / (1 + math.exp(-x))
    except OverflowError:
        return 0.0

In [None]:
from easse.bleu import corpus_bleu
from easse.fkgl import corpus_fkgl
#from easse.samsa import get_samsa_sentence_scores
from easse.sari import corpus_sari
from easse.bertscore import corpus_bertscore
from bert_score import BERTScorer

In [None]:
asset_orig, asset_refs = read_test_set("asset_test", as_lists=True)
turk_orig, turk_refs = read_test_set("turkcorpus_test", as_lists=True)
hsplit_orig, hsplit_refs = read_test_set("hsplit_test", as_lists=True)

# We create a dataset composed of all references together
all_orig = asset_orig
all_refs = asset_refs + turk_refs + hsplit_refs

EVAL_DATASETS = {
    "asset": (asset_orig, asset_refs, 10),  # (original, references, number of references)
    #"turk": (turk_orig, turk_refs, 8),
    #"hsplit": (hsplit_orig, hsplit_refs, 4),
    #"all": (all_orig, all_refs, 22)
}

In [None]:
df_simplicityDA = pd.read_csv("./src/metaeval-simplification/simplicity_DA.csv")

# Compute segment-level metric scores (`df_metrics_segment`)

In [None]:
df_simplicityDA

In [None]:
df_simplicityDA.groupby("sys_name").count()  # 100 sentences per system

Unnamed: 0_level_0,sent_id,orig_sent,simp_sent,sys_type,fluency,fluency_zscore,meaning,meaning_zscore,simplicity,simplicity_zscore
sys_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ACCESS,100,100,100,100,100,100,100,100,100,100
DMASS-DCSS,100,100,100,100,100,100,100,100,100,100
Dress-Ls,100,100,100,100,100,100,100,100,100,100
Hybrid,100,100,100,100,100,100,100,100,100,100
PBMT-R,100,100,100,100,100,100,100,100,100,100
SBMT-SARI,100,100,100,100,100,100,100,100,100,100


In [None]:
lowercase = False  # case-insensitive
tokenizer = "moses"
bertscore_rescale = BERTScorer(lang="en", rescale_with_baseline=True)

Downloading:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
df_simplicityDA

In [None]:
for _, row in tqdm(df_simplicityDA.iterrows()):
    for test_set, (test_set_orig, test_set_refs, num_refs) in EVAL_DATASETS.items():
        orig_sents, ref_sents = collect_references(
            [row["sent_id"]], test_set_orig, test_set_refs, num_refs
        )
        print(orig_sents)
        print(ref_sents)
        print(row["simp_sent"])
        print()

In [None]:
# Compute every automatic metric for each rated system output (one row of
# df_simplicityDA) against each reference set in EVAL_DATASETS, and collect the
# results as one record per (row, test_set) in df_metrics_segment.
metrics = []
# No `total` is passed to tqdm, so the bar only shows a running count and rate.
for _, row in tqdm(df_simplicityDA.iterrows()):
    for test_set, (test_set_orig, test_set_refs, num_refs) in EVAL_DATASETS.items():
        # A single sentence id is passed, so `orig_sents` has one sentence and
        # `ref_sents` is a list of one-element lists (one per reference).
        orig_sents, ref_sents = collect_references(
            [row["sent_id"]], test_set_orig, test_set_refs, num_refs
        )
        #print(orig_sents)
        #print(ref_sents)
        # BLEU
        bleu_sys_refs = corpus_bleu(
            [row["simp_sent"]],
            ref_sents,
            smooth_method="floor",
            tokenizer=tokenizer,
            lowercase=lowercase,
            effective_order=True,
        )

        # SARI
        sari_score = corpus_sari(
            orig_sents,
            [row["simp_sent"]],
            ref_sents,
            tokenizer=tokenizer,
            lowercase=lowercase,
            use_f1_for_deletion=False,
        )

        # iBLEU (alpha = 0.9)
        # BLEU of the system output against the original sentence, used as the
        # penalty term of iBLEU.
        bleu_sys_orig = corpus_bleu(
            [row["simp_sent"]],
            [orig_sents],
            force=True,
            tokenizer=tokenizer,
            lowercase=lowercase,
        )
        ibleu_score = 0.9 * bleu_sys_refs - (1 - 0.9) * bleu_sys_orig

        # Avg. of BLEU and SARI
        amean_bleu_sari = np.mean([bleu_sys_refs, sari_score])
        # NOTE(review): the captured output shows a warning from `np.log` inside
        # gmean - presumably triggered by zero scores; confirm.
        gmean_bleu_sari = scipy.stats.gmean([bleu_sys_refs, sari_score])

        # Flesch
        fkgl_sys = corpus_fkgl([row["simp_sent"]], tokenizer=tokenizer)

        # FKBLEU
        # iBLEU scaled by the sigmoid of the FKGL difference (system minus original).
        fkgl_orig = corpus_fkgl(orig_sents, tokenizer=tokenizer)
        fk_diff = sigmoid(fkgl_sys - fkgl_orig)
        fkbleu_score = ibleu_score * fk_diff

        # BERTScore
        # `ref_sents` is rebound here: each one-element reference list is unwrapped
        # into a flat list of reference strings. This must stay after the BLEU/SARI
        # calls above, which use the nested form.
        ref_sents = [ref for [ref] in ref_sents]
        bertscore_rescale_scores = bertscore_rescale.score([row["simp_sent"]], [ref_sents])
        #bertscores = corpus_bertscore([
        #    row["simp_sent"]],
        #    ref_sents,
        #    tokenizer=tokenizer,
        #    lowercase=lowercase
        #)

        metrics.append(
            {
                "sent_id": row["sent_id"],
                "sys_name": row["sys_name"],
                "test_set": test_set,
                "bleu": bleu_sys_refs,
                "sari": sari_score,
                "ibleu": ibleu_score,
                "amean_bleu_sari": amean_bleu_sari,
                "gmean_bleu_sari": gmean_bleu_sari,
                "fkgl": fkgl_sys,
                "fkbleu": fkbleu_score,
                # Elements 0, 1 and 2 of the BERTScore result are stored as P, R and F1.
                "bertscore_P": bertscore_rescale_scores[0].cpu().item(),
                "bertscore_R": bertscore_rescale_scores[1].cpu().item(),
                "bertscore_F1": bertscore_rescale_scores[2].cpu().item(),
                #"bertscore_P": bertscores[0],
                #"bertscore_R": bertscores[1],
                #"bertscore_F1": bertscores[2],
            }
        )

df_metrics_segment = pd.DataFrame(metrics)

0it [00:00, ?it/s][nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
  log_a = np.log(np.array(a, dtype=dtype))
600it [29:10,  2.92s/it]


In [None]:
# The SAMSA import is commented out in the metrics import cell, so import it here;
# otherwise this cell raises NameError on a fresh kernel.
from easse.samsa import get_samsa_sentence_scores

samsa_scores = get_samsa_sentence_scores(
    df_simplicityDA["orig_sent"],
    df_simplicityDA["simp_sent"],
    tokenizer=tokenizer,
    lowercase=lowercase,
)

# Since SAMSA is reference-less, this reformatting is only done so that it can appear
# in the same dataframe as the other metrics: each sentence score is repeated once per
# evaluation dataset, matching the (row, test_set) order of df_metrics_segment.
df_metrics_segment["samsa"] = [
    s for s in samsa_scores for _ in range(len(EVAL_DATASETS))
]

600 passages [05:16,  1.89 passages/s, en ucca=1_0]


INFO:stanza:Writing properties to tmp file: corenlp_server-8895520c7a63492d.props
INFO:stanza:Starting server with command: java -Xmx5G -cp /usr/local/lib/python3.7/dist-packages/easse/resources/tools/stanford-corenlp-4.4.0/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9001 -timeout 60000 -threads 40 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-8895520c7a63492d.props -annotators tokenize,ssplit,pos,lemma,ner,depparse -preload -outputFormat json
INFO:stanza:Writing properties to tmp file: corenlp_server-a04b27a7960b41fc.props
INFO:stanza:Starting server with command: java -Xmx5G -cp /usr/local/lib/python3.7/dist-packages/easse/resources/tools/stanford-corenlp-4.4.0/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9001 -timeout 60000 -threads 40 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-a04b27a7960b41fc.props -annotators tokenize,ssplit,pos,lemma,ner,depparse -preload -outputFormat json


In [None]:
df_metrics_segment['amean_bleu_samsa'] = np.mean(df_metrics_segment[['bleu', 'samsa']], axis=1)
df_metrics_segment['amean_sari_samsa'] = np.mean(df_metrics_segment[['sari', 'samsa']], axis=1)
df_metrics_segment['gmean_bleu_samsa'] = scipy.stats.gmean(df_metrics_segment[['bleu', 'samsa']], axis=1)
df_metrics_segment['gmean_sari_samsa'] = scipy.stats.gmean(df_metrics_segment[['sari', 'samsa']], axis=1)

  log_a = np.log(np.array(a, dtype=dtype))


In [None]:
import pickle
with open('./src/metaeval-simplification/df_metrics_segment_simplicityDA.pickle', 'wb') as f:
    pickle.dump(df_metrics_segment, f)

# compute correlations

In [None]:
from google.colab import drive
drive.mount('/content/drive')
%cd drive/MyDrive/Colab Notebooks/master_thesis

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[Errno 2] No such file or directory: 'drive/MyDrive/Colab Notebooks/master_thesis'
/content/drive/MyDrive/Colab Notebooks/master_thesis


In [None]:
df_simplicityDA = pd.read_csv("./src/metaeval-simplification/simplicity_DA.csv")

In [None]:
df_simplicityDA[['simplicity', 'simplicity_zscore']]

Unnamed: 0,simplicity,simplicity_zscore
0,71.333333,0.611060
1,90.933333,1.257177
2,53.800000,0.158894
3,71.600000,0.664848
4,27.733333,-0.577387
...,...,...
595,63.400000,0.336182
596,81.800000,0.890318
597,73.800000,0.606212
598,74.133333,0.614088


In [None]:
df_simplicityDA

Unnamed: 0,sent_id,sys_name,orig_sent,simp_sent,sys_type,fluency,fluency_zscore,meaning,meaning_zscore,simplicity,simplicity_zscore
0,268,ACCESS,Prunk is a member of Institute of European His...,Prunk is a member of Institute of European His...,NeuralSeq2Seq,77.400000,0.152400,77.333333,0.530362,71.333333,0.611060
1,67,SBMT-SARI,"In return, Rollo swore fealty to Charles, conv...","In return, Rollo swore fealty to Charles, conv...",SBMT,87.866667,0.512595,96.466667,1.142803,90.933333,1.257177
2,341,SBMT-SARI,From 1900 to 1920 many new facilities were con...,From 1900 to 1920 many new plants were built o...,SBMT,79.133333,0.119438,68.533333,0.284701,53.800000,0.158894
3,278,PBMT-R,Mercury is similar in appearance to the Moon: ...,Mercury is similar in appearance to the Moon: ...,PBMT,90.333333,0.598145,90.666667,0.965649,71.600000,0.664848
4,107,Hybrid,"He settled in London, devoting himself chiefly...","He settled in London, devoting himself.",Semantics+PBMT,65.466667,-0.425792,32.066667,-0.820035,27.733333,-0.577387
...,...,...,...,...,...,...,...,...,...,...,...
595,210,Hybrid,Orchestration Stravinsky first conceived of wr...,Orchestration Stravinsky conceived of writing ...,Semantics+PBMT,68.333333,-0.092967,89.933333,0.846412,63.400000,0.336182
596,202,Dress-Ls,"The album, however, was banned from many recor...",The album was banned from many record stores n...,NeuralSeq2Seq,99.600000,0.970276,95.333333,1.027604,81.800000,0.890318
597,194,PBMT-R,ISBN 1-876429-14-3 is an historic township loc...,Gunnedah is a historic town near Cowra in the ...,PBMT,73.733333,0.040598,58.466667,-0.104838,73.800000,0.606212
598,148,Dress-Ls,"Later, Esperanto speakers began to see the lan...",Esperanto speakers began to see the language a...,NeuralSeq2Seq,71.533333,-0.001330,71.000000,0.274918,74.133333,0.614088


In [None]:
import pickle
with open('./src/metaeval-simplification/df_metrics_segment_simplicityDA.pickle', 'rb') as f:
    df_metrics_segment = pickle.load(f)

In [None]:
df_metrics_segment

Unnamed: 0,sent_id,sys_name,test_set,bleu,sari,ibleu,amean_bleu_sari,gmean_bleu_sari,fkgl,fkbleu,bertscore_P,bertscore_R,bertscore_F1,samsa,amean_bleu_samsa,amean_sari_samsa,gmean_bleu_samsa,gmean_sari_samsa
0,268,ACCESS,asset,82.802644,47.579917,67.429755,65.191280,62.767371,7.633846,0.268007,0.906928,0.867856,0.887509,0.0000,41.401322,23.789958,0.000000,0.000000
1,67,SBMT-SARI,asset,66.066846,43.911624,53.154247,54.989235,53.861883,11.056667,3.183869,0.919909,0.917334,0.918485,25.0000,45.533423,34.455812,40.640757,33.132923
2,341,SBMT-SARI,asset,60.599133,44.499949,49.010280,52.549541,51.929359,17.142500,1.209228,0.725551,0.795212,0.749482,50.0000,55.299566,47.249975,55.045042,47.169879
3,278,PBMT-R,asset,56.543974,35.619970,43.240624,46.081972,44.878666,11.547407,2.605818,0.669763,0.764498,0.708511,23.4375,39.990737,29.528735,36.403975,28.893651
4,107,Hybrid,asset,86.944174,44.101676,72.976326,65.522925,61.922402,2.280000,0.793896,0.885912,0.726350,0.802543,37.5000,62.222087,40.800838,57.099970,40.667098
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,210,Hybrid,asset,70.710678,48.431040,56.123109,59.570859,58.520011,8.370000,33.071130,0.909308,0.914433,0.901515,50.0000,60.355339,49.215520,59.460356,49.209267
596,202,Dress-Ls,asset,100.000000,51.796302,83.846009,75.898151,71.969648,7.190000,51.475261,1.000000,1.000000,1.000000,100.0000,100.000000,75.898151,100.000000,71.969648
597,194,PBMT-R,asset,47.947318,35.863357,39.357168,41.905338,41.467479,10.310435,24.618682,0.223006,0.082152,0.152979,6.2500,27.098659,21.056679,17.311001,14.971506
598,148,Dress-Ls,asset,100.000000,30.564593,85.726507,65.282297,55.285254,8.895238,0.003160,0.903357,0.582856,0.739002,20.3125,60.156250,25.438547,45.069391,24.916727


## direct assessment

In [None]:
def compute_direct_correlations(df_benchmark, df_segment_metrics, aspects, test_sets, segment_id_cols):
    """Run the direct-assessment correlation analysis on quality-based subsets.

    `df_benchmark` is split at the median of `simplicity_zscore`, `fluency_zscore`
    and `meaning_zscore` into "high" (strictly above the median) and "low" (at or
    below it) subsets, plus the intersections fluency+meaning (`fm_*`) and
    simplicity+fluency+meaning (`sfm_*`), plus the full data (`all`). For every
    aspect, subset and test set, `compute_direct_assessment_correlations` is called
    with signed correlations (`use_absolute_values=False`).

    Returns a dict keyed by `(quality, aspect, test_set)` whose values are the
    tuples returned by `compute_direct_assessment_correlations`.
    """
    def _split_at_median(column):
        values = df_benchmark[column]
        median = values.median()
        return values > median, values <= median

    results = {}
    for aspect in aspects:
        simp_high, simp_low = _split_at_median('simplicity_zscore')
        flu_high, flu_low = _split_at_median('fluency_zscore')
        mean_high, mean_low = _split_at_median('meaning_zscore')

        # Insertion order matters: it is the order in which the subsets are processed.
        subsets = {
            'simp_low': df_benchmark[simp_low],
            'simp_high': df_benchmark[simp_high],
            'flu_low': df_benchmark[flu_low],
            'flu_high': df_benchmark[flu_high],
            'mean_low': df_benchmark[mean_low],
            'mean_high': df_benchmark[mean_high],
            'fm_low': df_benchmark[flu_low & mean_low],
            'fm_high': df_benchmark[flu_high & mean_high],
            'sfm_low': df_benchmark[simp_low & flu_low & mean_low],
            'sfm_high': df_benchmark[simp_high & flu_high & mean_high],
            'all': df_benchmark,
        }
        sizes = {quality: len(subset) for quality, subset in subsets.items()}

        print(
            f"{aspect}: simp_high ({sizes['simp_high']}) - simp_low ({sizes['simp_low']})"
            f" - flu_low ({sizes['flu_low']}) - flu_high ({sizes['flu_high']})"
            f" - mean_low ({sizes['mean_low']}) - mean_high ({sizes['mean_high']})"
            f" - fm_low ({sizes['fm_low']}) - fm_high ({sizes['fm_high']})"
            f" - sfm_low ({sizes['sfm_low']}) - sfm_high ({sizes['sfm_high']})"
            f" - All ({sizes['all']})"
        )
        for quality, df_scores in subsets.items():
            for test_set in test_sets:
                print(f"Computing for {quality} scores - {test_set} references")
                is_test_set = df_segment_metrics.test_set == test_set
                df_metrics = df_segment_metrics[is_test_set].drop(columns=['test_set'])
                results[(quality, aspect, test_set)] = compute_direct_assessment_correlations(
                    df_scores,
                    df_metrics,
                    aspect,
                    segment_id_cols=segment_id_cols,
                    use_absolute_values=False,
                )
                print()
    return results

In [None]:
METRICS_TO_ANALYSE = ['bleu', 'sari', 'fkgl', 'fkbleu', 'ibleu',
                      'amean_bleu_sari',
                      'gmean_bleu_sari',
                      'bertscore_P', 'bertscore_R', 'bertscore_F1']

In [None]:
EVAL_DATASETS = ['asset']

In [None]:
sn = 'SBMT-SARI'

In [None]:
results_simplicity = compute_direct_correlations(
                        df_simplicityDA[df_simplicityDA['sys_name'] == sn],
                        df_metrics_segment[df_metrics_segment['sys_name'] == sn],
                        aspects=['simplicity'],
                        test_sets=EVAL_DATASETS,
                        segment_id_cols=['sent_id','sys_name']
                    )

In [None]:
results_simplicity[('all', 'simplicity', 'asset')][0]

Unnamed: 0,metric,corr,p_value,is_winner
0,bertscore_R,0.565444,8.820981e-10,False
1,bertscore_F1,0.557813,1.650117e-09,False
2,bertscore_P,0.556447,1.842891e-09,False
3,amean_bleu_sari,0.316309,0.001345668,False
4,gmean_bleu_sari,0.308449,0.001795011,False
5,ibleu,0.308259,0.001807363,False
6,amean_bleu_samsa,0.299986,0.002426955,False
7,bleu,0.296936,0.002699805,False
8,gmean_bleu_samsa,0.289125,0.003528686,False
9,gmean_sari_samsa,0.250123,0.01207968,False


In [None]:
results_simplicity = compute_direct_correlations(
                        df_simplicityDA,
                        df_metrics_segment,
                        aspects=['simplicity'],
                        test_sets=EVAL_DATASETS,
                        segment_id_cols=['sent_id','sys_name']
                    )

simplicity: simp_high (300) - simp_low (300) - flu_low (300) - flu_high (300) - mean_low (300) - mean_high (300) - fm_low (222) - fm_high (222) - sfm_low (208) - sfm_high (196) - All (600)
Computing for simp_low scores - asset references
Computing correlations...
Determining if the difference in performance is significant...

Computing for simp_high scores - asset references
Computing correlations...
Determining if the difference in performance is significant...

Computing for flu_low scores - asset references
Computing correlations...
Determining if the difference in performance is significant...

Computing for flu_high scores - asset references
Computing correlations...
Determining if the difference in performance is significant...

Computing for mean_low scores - asset references
Computing correlations...
Determining if the difference in performance is significant...

Computing for mean_high scores - asset references
Computing correlations...
Determining if the difference in perform

In [None]:
results_simplicity[('flu_high', 'simplicity', 'asset')][0]

Unnamed: 0,metric,corr,p_value,is_winner
0,bertscore_P,0.38889,2.85878e-12,False
1,gmean_bleu_sari,0.34248,1.112407e-09,False
2,ibleu,0.342083,1.165882e-09,False
3,amean_bleu_sari,0.341902,1.1911e-09,False
4,bleu,0.312661,3.163412e-08,False
5,bertscore_F1,0.312312,3.28262e-08,False
6,sari,0.273586,1.501996e-06,False
7,bertscore_R,0.236514,3.498846e-05,False
8,amean_bleu_samsa,0.171116,0.002945164,False
9,gmean_bleu_samsa,0.10594,0.06688683,False


In [None]:
results_simplicity[('flu_low', 'simplicity', 'asset')][0]

Unnamed: 0,metric,corr,p_value,is_winner
0,bertscore_P,0.539148,5.066457e-24,False
1,bertscore_F1,0.529617,4.3148940000000004e-23,False
2,gmean_bleu_sari,0.4897,1.689533e-19,False
3,amean_bleu_sari,0.486063,3.406962e-19,False
4,bleu,0.475907,2.310685e-18,False
5,ibleu,0.474877,2.79595e-18,False
6,bertscore_R,0.460495,3.748397e-17,False
7,sari,0.38566,4.468054e-12,False
8,amean_bleu_samsa,0.326259,7.187346e-09,False
9,fkgl,0.277417,1.055095e-06,False


## relative ranking

In [None]:
def compute_relative_ranking_correlations(df_human_scores, df_metrics_scores, aspect, segment_id_cols,
                                           use_absolute_values=True):
    """Rank metrics by Spearman correlation with human z-scores and test pairwise differences.

    Args:
        df_human_scores: DataFrame of human ratings. After ``reset_index()`` it must expose the
            ``segment_id_cols`` columns plus ``aspect`` and ``f"{aspect}_zscore"``.
        df_metrics_scores: DataFrame holding the ``segment_id_cols`` columns; every other column
            is treated as a metric.
        aspect: name of the human-rated aspect (e.g. ``'simplicity'``).
        segment_id_cols: list of column names used as the merge key between both frames.
        use_absolute_values: when True, every correlation is replaced by its absolute value
            before ranking and significance testing.

    Returns:
        ``(df_correlations_metrics_human, df_significance)`` where

        * ``df_correlations_metrics_human`` has columns ``metric``, ``corr``, ``p_value`` and
          ``is_winner``, sorted by ``corr`` in descending order;
        * ``df_significance`` is a square DataFrame indexed (rows and columns) by the metric
          names in ranked order. Cell ``(a, b)`` holds the p-value returned by
          ``williams_test`` when ``a`` correlates better than ``b`` and ``p < 0.05``;
          every other cell is NaN.
    """
    df_da_scores = df_human_scores.reset_index()
    cols_of_interest = segment_id_cols + [aspect, f"{aspect}_zscore"]
    df_da_scores = df_da_scores[cols_of_interest]
    df_all_scores = pd.merge(left=df_metrics_scores, right=df_da_scores, on=segment_id_cols)
    # All correlations below are computed on the merged frame, so its length (and not
    # len(df_human_scores), which differs whenever the merge drops or duplicates rows)
    # is the sample size the significance test must use.
    n_segments = len(df_all_scores)

    # Compute correlations metrics vs human scores
    print("Computing correlations...")
    metrics_names = [col for col in df_metrics_scores.columns if col not in segment_id_cols]
    correlations_data = []
    for metric in metrics_names:
        corr, p_value = spearmanr(df_all_scores[metric], df_all_scores[f'{aspect}_zscore'])
        if use_absolute_values:
            corr = abs(corr)
        correlations_data.append([metric, corr, p_value])
    df_correlations_metrics_human = pd.DataFrame(correlations_data, columns=['metric', 'corr', 'p_value'])
    df_correlations_metrics_human.sort_values(by=['corr'], ascending=False, inplace=True, ignore_index=True)

    # Compute correlations metrics vs metrics (iterated in ranked order)
    # NOTE(review): metric-vs-human uses Spearman while metric-vs-metric uses Pearson; both are
    # fed to the same Williams test below -- confirm that mixing the two is intended.
    metrics_names = df_correlations_metrics_human['metric'].to_list()
    correlations_data = []
    for _, (metric_a, corr_metric_a, _) in df_correlations_metrics_human.iterrows():
        for _, (metric_b, corr_metric_b, _) in df_correlations_metrics_human.iterrows():
            corr_a_b, pvalue_a_b = pearsonr(df_all_scores[metric_a], df_all_scores[metric_b])
            if use_absolute_values:
                corr_a_b = abs(corr_a_b)
            correlations_data.append([metric_a, corr_metric_a,
                                      metric_b, corr_metric_b,
                                      corr_a_b, pvalue_a_b])
    df_correlations_metric_metric = pd.DataFrame(correlations_data,
                                                 columns=['metric_a', 'corr_metric_a',
                                                          'metric_b', 'corr_metric_b',
                                                          'corr_a_b', 'pvalue_a_b'])

    # Determine if the difference in performance is significant
    print("Determining if the difference in performance is significant...")
    significance_matrix = []
    winner_status = []
    for metric_a in metrics_names:
        df_correlations = df_correlations_metric_metric[df_correlations_metric_metric['metric_a'] == metric_a]
        is_winner = True
        significance_row = []
        for _, (_, corr_metric_a, metric_b, corr_metric_b, corr_a_b, _) in df_correlations.iterrows():
            # p stays NaN (hence "not significant") unless metric_a strictly beats metric_b
            p = np.nan
            if (metric_a != metric_b) and (corr_metric_a > corr_metric_b):
                _, p = williams_test(corr_metric_a, corr_metric_b, corr_a_b, n_segments)
            is_diff_stats_significant = p < 0.05
            if not is_diff_stats_significant:
                # we do not care about the exact values in cases where it's not significant
                p = np.nan
            significance_row.append(p)
            # Update winner status: a metric is a winner only if it significantly
            # outperforms every other metric
            if metric_a != metric_b:
                is_winner = is_winner and is_diff_stats_significant
        significance_matrix.append(significance_row)
        winner_status.append(is_winner)
    df_correlations_metrics_human['is_winner'] = winner_status
    df_significance = pd.DataFrame(np.array(significance_matrix), columns=metrics_names, index=metrics_names)

    return df_correlations_metrics_human, df_significance

In [None]:
def compute_ranking_correlations(df_benchmark, df_segment_metrics, aspects, test_sets, segment_id_cols):
    """Run ``compute_relative_ranking_correlations`` on median-split subsets of the benchmark.

    For every aspect, ``df_benchmark`` is split at the median of its ``simplicity_zscore``,
    ``fluency_zscore`` and ``meaning_zscore`` columns (``> median`` is "high", ``<= median`` is
    "low"). Combined subsets require all involved dimensions to be on the same side:
    ``fm_*`` combines fluency and meaning, ``sfm_*`` combines simplicity, fluency and meaning.
    The split always uses these three columns, whatever ``aspect`` is being correlated.

    Args:
        df_benchmark: human ratings containing the three ``*_zscore`` columns above.
        df_segment_metrics: metric scores with a ``test_set`` column.
        aspects: aspect names forwarded one at a time to the correlation routine.
        test_sets: values of ``df_segment_metrics.test_set`` to evaluate.
        segment_id_cols: merge-key columns forwarded to the correlation routine.

    Returns:
        dict mapping ``(quality, aspect, test_set)`` to whatever
        ``compute_relative_ranking_correlations`` returns for that combination.
    """
    zscore_column = {
        'simp': 'simplicity_zscore',
        'flu': 'fluency_zscore',
        'mean': 'meaning_zscore',
    }
    results = {}
    for aspect in aspects:
        # Boolean masks for each side of the median, one pair per rated dimension
        is_high, is_low = {}, {}
        for tag, column in zscore_column.items():
            median = df_benchmark[column].median()
            is_high[tag] = df_benchmark[column] > median
            is_low[tag] = df_benchmark[column] <= median

        # Insertion order below is the order in which subsets are evaluated and stored
        subsets = {}
        for tag in zscore_column:
            subsets[f'{tag}_low'] = df_benchmark[is_low[tag]]
            subsets[f'{tag}_high'] = df_benchmark[is_high[tag]]
        subsets['fm_low'] = df_benchmark[is_low['flu'] & is_low['mean']]
        subsets['fm_high'] = df_benchmark[is_high['flu'] & is_high['mean']]
        subsets['sfm_low'] = df_benchmark[is_low['simp'] & is_low['flu'] & is_low['mean']]
        subsets['sfm_high'] = df_benchmark[is_high['simp'] & is_high['flu'] & is_high['mean']]
        subsets['all'] = df_benchmark

        size = {name: len(frame) for name, frame in subsets.items()}
        print(
            f"{aspect}: simp_high ({size['simp_high']}) - simp_low ({size['simp_low']})"
            f" - flu_low ({size['flu_low']}) - flu_high ({size['flu_high']})"
            f" - mean_low ({size['mean_low']}) - mean_high ({size['mean_high']})"
            f" - fm_low ({size['fm_low']}) - fm_high ({size['fm_high']})"
            f" - sfm_low ({size['sfm_low']}) - sfm_high ({size['sfm_high']})"
            f" - All ({size['all']})"
        )

        for quality, df_scores in subsets.items():
            for test_set in test_sets:
                print(f"Computing for {quality} scores - {test_set} references")
                in_test_set = df_segment_metrics.test_set == test_set
                df_metrics = df_segment_metrics[in_test_set].drop(columns=['test_set'])
                key = (quality, aspect, test_set)
                results[key] = compute_relative_ranking_correlations(
                    df_human_scores=df_scores,
                    df_metrics_scores=df_metrics,
                    aspect=aspect,
                    segment_id_cols=segment_id_cols,
                    use_absolute_values=False,
                )
                print()
    return results

In [None]:
# NOTE(review): presumably the metric score columns to keep for the analysis and the
# reference test sets to evaluate against -- neither constant is referenced by the cells
# in this section, so confirm where (and whether) they are used.
METRICS_TO_ANALYSE = ['bleu', 'sari', 'fkgl', 'fkbleu', 'ibleu',
                      'amean_bleu_sari',
                      'gmean_bleu_sari',
                      'bertscore_P', 'bertscore_R', 'bertscore_F1']
EVAL_DATASETS = ['asset', 'turk', 'hsplit', 'all']

In [None]:
# Per-system run: restrict both the human ratings and the metric scores to the rows whose
# 'sys_name' equals `sn`, then correlate against the 'asset' references only.
# NOTE(review): `sn` is not defined in this cell; it must already exist in the kernel
# (set by an earlier cell) for this to run on a fresh Restart & Run All -- confirm.
results_simplicity = compute_ranking_correlations(
                        df_simplicityDA[df_simplicityDA['sys_name'] == sn],
                        df_metrics_segment[df_metrics_segment['sys_name'] == sn],
                        aspects=['simplicity'],
                        test_sets=['asset'],
                        segment_id_cols=['sent_id','sys_name']
                    )

In [None]:
# Keys are (quality, aspect, test_set); 'all' is the unsplit frame that was passed in as
# df_benchmark. Element 0 of the stored pair is the metric-vs-human correlation table.
results_simplicity[('all', 'simplicity', 'asset')][0]

Unnamed: 0,metric,corr,p_value,is_winner
0,bertscore_P,0.56126,1.245959e-09,False
1,bertscore_F1,0.545095,4.528401e-09,False
2,bertscore_R,0.536754,8.584025e-09,False
3,amean_bleu_samsa,0.287897,0.003678017,False
4,gmean_bleu_samsa,0.276103,0.005426123,False
5,amean_sari_samsa,0.25931,0.00918149,False
6,gmean_bleu_sari,0.252157,0.01137678,False
7,amean_bleu_sari,0.250861,0.01182028,False
8,gmean_sari_samsa,0.241047,0.0156963,False
9,ibleu,0.231035,0.02073988,False


In [None]:
# Full-benchmark run: every row of the human ratings and metric scores is used (no
# per-system filter), correlating against the 'asset' references only.
# This rebinds `results_simplicity`, replacing whatever an earlier cell stored in it.
results_simplicity = compute_ranking_correlations(
                        df_simplicityDA,
                        df_metrics_segment,
                        aspects=['simplicity'],
                        test_sets=['asset'],
                        segment_id_cols=['sent_id','sys_name']
                    )

simplicity: simp_high (300) - simp_low (300) - flu_low (300) - flu_high (300) - mean_low (300) - mean_high (300) - fm_low (222) - fm_high (222) - sfm_low (208) - sfm_high (196) - All (600)
Computing for simp_low scores - asset references
Computing correlations...
Determining if the difference in performance is significant...

Computing for simp_high scores - asset references
Computing correlations...
Determining if the difference in performance is significant...

Computing for flu_low scores - asset references
Computing correlations...
Determining if the difference in performance is significant...

Computing for flu_high scores - asset references
Computing correlations...
Determining if the difference in performance is significant...

Computing for mean_low scores - asset references
Computing correlations...
Determining if the difference in performance is significant...

Computing for mean_high scores - asset references
Computing correlations...
Determining if the difference in perform

In [None]:
# 'fm_high' = rows whose fluency AND meaning z-scores are both above their medians.
# Element 0 of the stored pair is the metric-vs-human correlation table.
results_simplicity[('fm_high', 'simplicity', 'asset')][0]

Unnamed: 0,metric,corr,p_value,is_winner
0,ibleu,0.2702,4.5e-05,False
1,amean_bleu_sari,0.266852,5.7e-05,False
2,bertscore_P,0.264547,6.6e-05,False
3,gmean_bleu_sari,0.262605,7.5e-05,False
4,bleu,0.231512,0.000506,False
5,sari,0.212859,0.001421,False
6,bertscore_F1,0.17485,0.009037,False
7,bertscore_R,0.15865,0.018006,False
8,amean_bleu_samsa,0.040296,0.550343,False
9,amean_sari_samsa,-0.008852,0.895653,False


In [None]:
# 'fm_low' = rows whose fluency AND meaning z-scores are both at or below their medians.
# Element 0 of the stored pair is the metric-vs-human correlation table.
results_simplicity[('fm_low', 'simplicity', 'asset')][0]

Unnamed: 0,metric,corr,p_value,is_winner
0,bertscore_P,0.391705,1.478522e-09,False
1,bertscore_F1,0.351774,7.251806e-08,False
2,ibleu,0.347073,1.108309e-07,False
3,bleu,0.341897,1.754126e-07,False
4,gmean_bleu_sari,0.337102,2.664318e-07,False
5,amean_bleu_sari,0.335962,2.939694e-07,False
6,amean_bleu_samsa,0.294174,8.296821e-06,False
7,bertscore_R,0.291071,1.042355e-05,False
8,gmean_bleu_samsa,0.27509,3.237507e-05,False
9,sari,0.252358,0.0001443959,False
