In [6]:
from torchtext.data.metrics import bleu_score
import glob
import torch

In [None]:
def turn_txt_to_corpus(path):
    corpus = []
    with open(path) as f:
        lines = f.readlines()
        for l in lines:
            corpus.append(l.split('\n')[0].split(' '))
        f.close()
    return corpus

In [None]:
files_root = './experiment-data'
model_types = ['base', 'as_is']
N_lists = ['6']
trials = 5
scores = {}
paths = glob.glob(f'{files_root}/*')

for mt in model_types:
    scores[mt] = {}
    for N in N_lists:
        if mt != 'base' and N == '1':
            continue
        total_bleu = 0
        for t in range(trials):
            perds_path = None
            refs_path = None
            keywords = [mt, f'N={N}', f'trial={t}']
            for p in paths:
                is_match = True
                for kw in keywords:
                    is_match = is_match and (kw in p)
                if is_match:
                    if '_preds' in p:
                        perds_path = p
                    elif '_refs' in p:
                        refs_path = p
            candidate_corpus = turn_txt_to_corpus(perds_path)
            print(candidate_corpus[:2])
            references_corpus = turn_txt_to_corpus(refs_path)
            total_bleu = total_bleu + bleu_score(candidate_corpus, references_corpus)
            print(mt, N, t, 'done')
        scores[mt][N] = total_bleu / trials


In [None]:
scores

In [5]:
try:
    import torchmetrics
except:
    !pip install torchmetrics

try:
    import evaluate
except:
    !pip install evaluate

In [7]:
def readfile(fn, data):
    with open(fn) as file:
        while line := file.readline():
            data.append(line.rstrip())
            # break
    return data

In [8]:
from torchmetrics import BLEUScore, SacreBLEUScore
import evaluate
import numpy as np
import pandas as pd

model_names = ['base', 'as_is']
# language_pairings = [('de', 'en'), ('en', 'de')]
language_pairings = [('de', 'en')]
num_trials = 5
N_lists = ['6']
path_root = './experiment-data'
paths = glob.glob(f'{path_root}/*')

df_dict = {}
for mn in model_names:
    for lp in language_pairings:
        for N in N_lists:
            if mn != 'base' and N =='1':
                continue
            lp_str = f'{lp[0]}->{lp[1]}'
            df_dict[f'{mn}_{lp_str}_{N}'] = {
                'BLEU' : (0,0), # format key as tuple (mean, std)
                'SacreBLEU' : (0,0),
                'METEOR' : (0,0),
                'COMET' : (0,0)
            }
            metrics_dict = {
                'bleu' : [],
                'sacrebleu' : [],
                'meteor' : [],
                'comet' : []
            }
            for i in range(num_trials):
                
                p_ct = 0
                for p in paths:
                    if mn in p and f'trial={i}' in p and f'N={N}' in p:
                        if 'preds' in p:
                            preds_path = p
                        elif 'refs' in p:
                            refs_path = p
                        elif 'src' in p:
                            src_path = p
                        p_ct += 1
                        print(p)
                assert p_ct == 3

                # read data into lists
                preds = []
                refs = []
                src = []
                readfile(preds_path, preds)
                readfile(refs_path, refs)
                readfile(src_path, src)

                # bleu
                if len(refs[0]) > 1:
                    print('Adding surrounding list for each target')
                    refs_bleu = [[x] for x in refs]
                else:
                    refs_bleu = refs

                bleu = BLEUScore()
                torch_bleu = bleu(preds, refs_bleu)
                metrics_dict['bleu'].append(torch_bleu)

                # sacrebleu
                if len(refs[0]) > 1:
                    print('Adding surrounding list for each target')
                    refs_sacrebleu = [[x] for x in refs]
                else:
                    refs_sacrebleu = refs

                sacre_bleu = SacreBLEUScore(tokenize='none')
                torch_sacrebleu = sacre_bleu(preds, refs_sacrebleu)
                metrics_dict['sacrebleu'].append(torch_sacrebleu)

                # meteor
                meteor = evaluate.load('meteor')
                results = meteor.compute(predictions=preds, references=refs)
                huggingface_meteor = (round(results['meteor'], 2))
                metrics_dict['meteor'].append(huggingface_meteor)

            # compute mean and std over trials and put in dict
            mean_bleu = np.mean(np.array(metrics_dict['bleu']))
            std_bleu = np.std(np.array(metrics_dict['bleu']))

            mean_sacrebleu = np.mean(np.array(metrics_dict['sacrebleu']))
            std_sacrebleu = np.std(np.array(metrics_dict['sacrebleu']))

            mean_meteor = np.mean(np.array(metrics_dict['meteor']))
            std_meteor = np.std(np.array(metrics_dict['meteor']))

            # mean_comet = np.mean(np.array(metrics_dict['comet']))
            # std_comet = np.std(np.array(metrics_dict['comet']))


            df_dict[f'{mn}_{lp_str}_{N}'] = {
                'BLEU' : (mean_bleu,std_bleu), # format key as tuple (mean, std)
                'SacreBLEU' : (mean_sacrebleu,std_sacrebleu),
                'METEOR' : (mean_meteor,std_meteor),
                'COMET' : (0,0) # disabled for now
            }

results_df = pd.DataFrame(df_dict)
results_df


./experiment-data/model=base_N=6_lang_pair=de->en_trial=0_preds.txt
./experiment-data/model=base_N=6_lang_pair=de->en_trial=0_src.txt
./experiment-data/model=base_N=6_lang_pair=de->en_trial=0_refs.txt
Adding surrounding list for each target
Adding surrounding list for each target


[nltk_data] Downloading package wordnet to /home/hao/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


./experiment-data/model=base_N=6_lang_pair=de->en_trial=1_refs.txt
./experiment-data/model=base_N=6_lang_pair=de->en_trial=1_preds.txt
./experiment-data/model=base_N=6_lang_pair=de->en_trial=1_src.txt
Adding surrounding list for each target
Adding surrounding list for each target


[nltk_data] Downloading package wordnet to /home/hao/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


./experiment-data/model=base_N=6_lang_pair=de->en_trial=2_src.txt
./experiment-data/model=base_N=6_lang_pair=de->en_trial=2_refs.txt
./experiment-data/model=base_N=6_lang_pair=de->en_trial=2_preds.txt
Adding surrounding list for each target
Adding surrounding list for each target


[nltk_data] Downloading package wordnet to /home/hao/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


./experiment-data/model=base_N=6_lang_pair=de->en_trial=3_src.txt
./experiment-data/model=base_N=6_lang_pair=de->en_trial=3_refs.txt
./experiment-data/model=base_N=6_lang_pair=de->en_trial=3_preds.txt
Adding surrounding list for each target
Adding surrounding list for each target


[nltk_data] Downloading package wordnet to /home/hao/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


./experiment-data/model=base_N=6_lang_pair=de->en_trial=4_refs.txt
./experiment-data/model=base_N=6_lang_pair=de->en_trial=4_src.txt
./experiment-data/model=base_N=6_lang_pair=de->en_trial=4_preds.txt
Adding surrounding list for each target
Adding surrounding list for each target


[nltk_data] Downloading package wordnet to /home/hao/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


./experiment-data/model=as_is_N=6_lang_pair=de->en_trial=0_preds.txt
./experiment-data/model=as_is_N=6_lang_pair=de->en_trial=0_src.txt
./experiment-data/model=as_is_N=6_lang_pair=de->en_trial=0_refs.txt
Adding surrounding list for each target
Adding surrounding list for each target


[nltk_data] Downloading package wordnet to /home/hao/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


./experiment-data/model=as_is_N=6_lang_pair=de->en_trial=1_preds.txt
./experiment-data/model=as_is_N=6_lang_pair=de->en_trial=1_refs.txt
./experiment-data/model=as_is_N=6_lang_pair=de->en_trial=1_src.txt
Adding surrounding list for each target
Adding surrounding list for each target


[nltk_data] Downloading package wordnet to /home/hao/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


./experiment-data/model=as_is_N=6_lang_pair=de->en_trial=2_preds.txt
./experiment-data/model=as_is_N=6_lang_pair=de->en_trial=2_src.txt
./experiment-data/model=as_is_N=6_lang_pair=de->en_trial=2_refs.txt
Adding surrounding list for each target
Adding surrounding list for each target


[nltk_data] Downloading package wordnet to /home/hao/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


./experiment-data/model=as_is_N=6_lang_pair=de->en_trial=3_refs.txt
./experiment-data/model=as_is_N=6_lang_pair=de->en_trial=3_preds.txt
./experiment-data/model=as_is_N=6_lang_pair=de->en_trial=3_src.txt
Adding surrounding list for each target
Adding surrounding list for each target


[nltk_data] Downloading package wordnet to /home/hao/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


./experiment-data/model=as_is_N=6_lang_pair=de->en_trial=4_src.txt
./experiment-data/model=as_is_N=6_lang_pair=de->en_trial=4_preds.txt
./experiment-data/model=as_is_N=6_lang_pair=de->en_trial=4_refs.txt
Adding surrounding list for each target
Adding surrounding list for each target


[nltk_data] Downloading package wordnet to /home/hao/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,base_de->en_6,as_is_de->en_6
BLEU,"(0.29382062, 0.0027804063)","(0.2925221, 0.0026590193)"
SacreBLEU,"(0.29382062, 0.0027804063)","(0.2925221, 0.0026590193)"
METEOR,"(0.57, 0.0)","(0.5679999999999998, 0.0039999999999999584)"
COMET,"(0, 0)","(0, 0)"
