# Run Imports

In [2]:
import json
import datasets
import numpy as np
import pandas as pd
import sqlite3
import pickle

from scipy.stats import kendalltau, pearsonr, spearmanr
from datasets import load_dataset, Dataset
from evaluate import load
from tqdm.notebook import tqdm
from transformers import logging
from tabulate import tabulate

logging.set_verbosity_error()

# Local
from nbtools.utils import files

# Load Datasets

In [3]:
cnndm_test = load_dataset("Salesforce/rose", "cnndm_test")["data"]
cnndm_val = load_dataset("Salesforce/rose", "cnndm_validation")["data"]
xsum = load_dataset("Salesforce/rose", "xsum")["data"]
samsum = load_dataset("Salesforce/rose", "samsum")["data"]
cnndm_protocol = load_dataset("Salesforce/rose", "cnndm_protocol")["data"]
cnndm_protocol_gpt3 = load_dataset("Salesforce/rose", "cnndm_protocol_gpt3")["data"]

rose = {
    'cnndm_test': cnndm_test,
    'cnndm_validation': cnndm_val,
    'xsum': xsum,
    'samsum': samsum,
    'cnndm_protocol': cnndm_protocol,
    'cnndm_protocol_gpt3': cnndm_protocol_gpt3,
}

fname = '/data/john/datasets/model_annotations.aligned.paired.jsonl'
summeval = Dataset.from_pandas(pd.read_json(fname, lines=True))

print(rose)
print(summeval)


{'cnndm_test': Dataset({
    features: ['source', 'reference', 'reference_acus', 'count_id', 'example_id', 'annotations', 'system_outputs'],
    num_rows: 500
}), 'cnndm_validation': Dataset({
    features: ['source', 'reference', 'reference_acus', 'count_id', 'example_id', 'annotations', 'system_outputs'],
    num_rows: 1000
}), 'xsum': Dataset({
    features: ['source', 'reference', 'reference_acus', 'count_id', 'example_id', 'annotations', 'system_outputs'],
    num_rows: 500
}), 'samsum': Dataset({
    features: ['source', 'reference', 'reference_acus', 'count_id', 'example_id', 'annotations', 'system_outputs'],
    num_rows: 500
}), 'cnndm_protocol': Dataset({
    features: ['source', 'reference', 'count_id', 'example_id', 'annotations', 'system_outputs'],
    num_rows: 100
}), 'cnndm_protocol_gpt3': Dataset({
    features: ['source', 'reference', 'reference_acus', 'count_id', 'example_id', 'annotations', 'system_outputs'],
    num_rows: 100
})}
Dataset({
    features: ['id', 'dec

# Inspect Datasets Here

In [3]:
sample = rose['cnndm_test'][0]
print(sample['reference'])
print(json.dumps(sample['system_outputs'], indent=4))

Juan Arango escaped punishment from the referee for biting Jesus Zavela .
He could face a retrospective punishment for the incident .
Arango had earlier scored a free kick in his team's 4-3 defeat .
{
    "bart": "Juan Arango bites Jesus Zavela in a moment of madness in Club Tijuana's 4-3 defeat by Monterrey in the Mexican league. The Venezuelan icon sank his teeth into the shoulder of the opponent as his temper flared in the defeat. He was not booked by the referee but could face a heavy retrospective ban. Arango had earlier curled in a magnificent free kick for his team to bring them level after falling 2-0 down.",
    "gold": "Juan Arango bit Jesus Zavala's shoulder in Club Tijuana's 4-3 defeat by Monterrey. The Venezuelan icon sank his teeth into the shoulder of Jesus Zavela. Arango was not booked by the referee but could face a heavy retrospective ban.",
    "pegasus": "Club Tijuana lost 4-3 to Monterrey in the Mexican league. Juan Arango was not booked but could face a heavy retr

# RoSE Scores

### Create Array of Reference Scores

In [3]:
rose_scores = {}
semncg = {}
semf1 = {}

sf1_met = load("nbansal/semf1")
sncg_met = load("nbansal/semncg")
for ds_name, ds in tqdm(list(rose.items()), position=0):
    sncg_scores = []
    sf1_scores = []
    acu_scores = []
    for sample in tqdm(ds, position=1, leave=False):
        src, ref = sample['source'], sample['reference']
        preds = []
        acu_sc = []
        for sys in sample['annotations'].keys():
            if sys == 'reference':
                preds.append(sample['reference'])
            else:
                preds.append(sample['system_outputs'][sys])
            acu_sc.append(
                sample['annotations'][sys]['normalized_acu']
            )
        acu_scores.append(acu_sc)
            
        N = len(preds)
        sncg_scores.append(sncg_met.compute(
            predictions=preds, references=[ref]*N, documents=[src]*N,
            verbose=False)[-1]
        )
        sf1_scores.append([
            sc.f1 for sc in sf1_met.compute(
                predictions=preds,
                references=[ref]*N,
            )
        ])
    rose_scores[ds_name] = acu_scores
    semf1[ds_name] = sf1_scores
    semncg[ds_name] = sncg_scores




  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

KeyboardInterrupt: 

# Compute Correlation Scores

In [None]:
base_pth = f'{files.project_root()}/data/metrics/rose'

with open(f'{base_pth}/rose.pkl', 'rb') as f:
    rose_scores = pickle.load(f)
with open(f'{base_pth}/semncg.pkl', 'rb') as f:
    semncg = pickle.load(f)
with open(f'{base_pth}/semf1.pkl', 'rb') as f:
    semf1 = pickle.load(f)


keys = list(rose_scores.keys())
a = np.array(rose_scores[keys[0]])
b = np.array(semf1[keys[0]])
C = {'semncg': dict(), 'semf1': dict()}
results = []
for key in keys:
    row = {'dataset': key}
    rs = np.array(rose_scores[key])
    sncg = np.array(semncg[key])
    sf1 = np.array(semf1[key])

    # pearsonr
    row['semf1-sum-r'] = np.mean(np.nan_to_num(
        pearsonr(rs, sf1, axis=1).statistic
    ))
    row['semncg-sum-r'] = np.mean(np.nan_to_num(
        pearsonr(rs, sncg, axis=1).statistic
    ))
    row['semf1-sys-r'] = pearsonr(
        np.mean(rs, axis=0), 
        np.mean(sf1, axis=0)).statistic
    row['semncg-sys-r'] = pearsonr(
        np.mean(rs, axis=0), 
        np.mean(sncg, axis=0)).statistic

    # spearmanr
    row['semf1-sum-p'] = np.mean(np.nan_to_num(
        spearmanr(rs, sf1, axis=1).statistic
    ))
    row['semncg-sum-p'] = np.mean(np.nan_to_num(
        spearmanr(rs, sncg, axis=1).statistic
    ))
    row['semf1-sys-p'] = spearmanr(
        np.mean(rs, axis=0), 
        np.mean(sf1, axis=0)).statistic
    row['semncg-sys-p'] = spearmanr(
        np.mean(rs, axis=0), 
        np.mean(sncg, axis=0)).statistic

    # kendalltau
    row['semf1-sum-t'] = np.mean(np.nan_to_num(np.array([
        kendalltau(a, b).statistic for a, b in zip(rs, sf1)
    ])))
    row['semncg-sum-t'] = np.mean(np.nan_to_num(np.array([
        kendalltau(a, b).statistic for a, b in zip(rs, sf1)
    ])))
    row['semf1-sys-t'] = kendalltau(
        np.mean(rs, axis=0), 
        np.mean(sf1, axis=0)).statistic
    row['semncg-sys-t'] = kendalltau(
        np.mean(rs, axis=0), 
        np.mean(sncg, axis=0)).statistic

    results.append(row)

print(tabulate(results, headers='keys'))

    

  pearsonr(rs, sf1, axis=1).statistic
  pearsonr(rs, sncg, axis=1).statistic
  c /= stddev[:, None]
  c /= stddev[None, :]


  pearsonr(rs, sncg, axis=1).statistic


dataset                semf1-sum-r    semncg-sum-r    semf1-sys-r    semncg-sys-r    semf1-sum-p    semncg-sum-p    semf1-sys-p    semncg-sys-p    semf1-sum-t    semncg-sum-t    semf1-sys-t    semncg-sys-t
-------------------  -------------  --------------  -------------  --------------  -------------  --------------  -------------  --------------  -------------  --------------  -------------  --------------
cnndm_test                0.385972        0.238795       0.432037        0.888579      0.0456889      0.019255         0.741259        0.832168       0.282966        0.282966       0.484848        0.69697
cnndm_validation          0.319511        0.199148       0.524792        0.843004      0.0535337      0.023556         0.738095        0.761905       0.237901        0.237901       0.571429        0.571429
xsum                      0.300467        0.126828       0.691984        0.676914      0.0165924      0.00715199       0.52381         0.714286       0.267867        0.267867   

### Show why nan_to_num() needs to be used in above cell

In [12]:
a = np.array(rose_scores['cnndm_test'])
b = np.array(semf1['cnndm_test'])
res = pearsonr(a, b, axis=1).statistic
indices, = np.where(np.isnan(res))
print(f'problem samples: {indices}')
index = 189
print(f'a: {a[index]}\nb: {b[index]}')

problem samples: [144 189 481]
a: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
b: [0.51438807 0.38887445 0.53921602 0.44882605 0.45183832 0.4134736
 0.42873829 0.5178094  0.44135811 0.44464295 0.36657725 0.3910114 ]


  res = pearsonr(a, b, axis=1).statistic
