In [1]:
import os
import codecs
import sacrebleu

import numpy as np
import pandas as pd



In [2]:
# Directory that holds the processed translation files; each file is named
# "<translator>.<lang>.txt".
proc_dir = './data/processed/'

# ISO codes of the languages to evaluate.
languages = [
    'af', 'nr', 'zu', 'xh', 'nso',
    'st', 'tn', 'ss', 've', 'ts',
]

# Identifiers of the human translators whose output is compared.
translators = [
    'translator1',
    'translator2',
    'translator3',
    'translator4',
]

# Show how many languages are configured.
len(languages)

10

In [3]:
def load_all_translations(lang, translator_names=None, data_dir=None):
    """Load all Autshumato evaluation translations for one language into a dictionary.

    Each translator's file is read from ``<data_dir>/<translator>.<lang>.txt``,
    decoded as UTF-8, with one translated segment per line.

    Params
    ------

    lang (str):
        The ISO code language to load.

    translator_names (iterable of str, optional):
        The translators whose files are loaded. Defaults to the notebook-level
        ``translators`` list.

    data_dir (str, optional):
        Directory containing the translation files. Defaults to the
        notebook-level ``proc_dir``.

    Returns
    -------

    out (dict):
        A dictionary containing all translated lines from the Autshumato evaluation set
        for the given language. The key corresponds to a translator, and the value is a list
        containing that translator's lines in file order, each stripped of leading and
        trailing whitespace.

    Raises
    ------

    OSError:
        If a translation file is missing or cannot be opened
        (e.g. ``FileNotFoundError``).

    UnicodeDecodeError:
        If a translation file is not valid UTF-8.
    """
    # Resolve the defaults lazily so the notebook-level settings are only
    # needed when the caller does not pass explicit values.
    if translator_names is None:
        translator_names = translators
    if data_dir is None:
        data_dir = proc_dir

    out = {}
    for translator in translator_names:
        fp = os.path.join(data_dir, f"{translator}.{lang}.txt")
        # Use the built-in open(): it only treats \n, \r and \r\n as line
        # endings. codecs.open(...).readlines() additionally splits on
        # characters such as U+2028, U+0085 or \x1c inside a segment, which
        # would shift this translator's lines relative to the others.
        with open(fp, 'r', encoding='utf-8') as f:
            # strip the translation of any escape chars or whitespace
            out[translator] = [line.strip() for line in f]
    return out
    

In [4]:
def calculate_bleu(translator, trans_dict):
    """Calculate bleu score using a human translator as the hypothesis, compared to
    other human translators for a reference.

    Every other entry of the notebook-level ``translators`` list is used as a
    reference stream for ``sacrebleu.corpus_bleu``.

    Params
    ------
    translator (str):
        The translator to use as a hypothesis

    trans_dict (dict):
        Dictionary containing all Autshumato translations. The key is a translator,
        the value is that translator's list of translated lines.

    Returns
    -------
    score (float):
        The corpus-level bleu score for the given translator.

    Raises
    ------
    KeyError:
        If ``translator`` or any reference translator is missing from ``trans_dict``.
    """
    # Drop the translator under test from the references. dict.fromkeys()
    # de-duplicates while keeping the order of ``translators``, so the
    # reference streams are passed in the same order on every run (iterating
    # a set of strings does not guarantee that).
    ref_translators = [name for name in dict.fromkeys(translators) if name != translator]
    refs = [trans_dict[name] for name in ref_translators]  # one list of lines per reference translator
    hypothesis = trans_dict[translator]  # lines of the translator we are checking

    return sacrebleu.corpus_bleu(hypothesis, refs).score

In [5]:
# Now calculate the bleu score for each language and translator, print a
# per-language report, and place the scores into a dataframe
# (rows: languages, columns: translators).

score_rows = []  # one list of per-translator scores for each language

for lang in languages:
    print(lang + ':')
    lang_dict = load_all_translations(lang)
    lang_scores = []
    for translator in translators:
        score = calculate_bleu(translator, lang_dict)
        lang_scores.append(score)

        print(f'\t{translator}')
        print('\t' + '-'*len(translator))
        print(f'\tScore: {score}\n')
    score_rows.append(lang_scores)
    print('----------------------')

# Build the frame once from the collected rows instead of enlarging an empty
# DataFrame one cell at a time with .loc.
bleu = pd.DataFrame(score_rows, index=languages, columns=translators, dtype=float)



af:




	translator1
	-----------
	Score: 76.77506934319096





	translator2
	-----------
	Score: 90.14414121766356





	translator3
	-----------
	Score: 81.49895264739418





	translator4
	-----------
	Score: 65.15124334660122

----------------------
nr:




	translator1
	-----------
	Score: 22.132337352981722





	translator2
	-----------
	Score: 19.165345163806087





	translator3
	-----------
	Score: 21.89220833444859





	translator4
	-----------
	Score: 22.34960471180981

----------------------
zu:




	translator1
	-----------
	Score: 22.975736442373996





	translator2
	-----------
	Score: 24.0141718951024





	translator3
	-----------
	Score: 22.24239371760884





	translator4
	-----------
	Score: 18.55738252578397

----------------------
xh:




	translator1
	-----------
	Score: 16.73549395845432





	translator2
	-----------
	Score: 18.12999090971908





	translator3
	-----------
	Score: 23.094076289463167





	translator4
	-----------
	Score: 22.183587884348402

----------------------
nso:




	translator1
	-----------
	Score: 50.58829985372169





	translator2
	-----------
	Score: 51.76857003632066





	translator3
	-----------
	Score: 45.543203926949715





	translator4
	-----------
	Score: 48.26835907582604

----------------------
st:




	translator1
	-----------
	Score: 46.40756690033645





	translator2
	-----------
	Score: 48.26523700989228





	translator3
	-----------
	Score: 46.35809245971638





	translator4
	-----------
	Score: 43.97840762030583

----------------------
tn:




	translator1
	-----------
	Score: 40.46101105933139





	translator2
	-----------
	Score: 38.38014166281088





	translator3
	-----------
	Score: 32.825560945749274





	translator4
	-----------
	Score: 34.01799572218709

----------------------
ss:




	translator1
	-----------
	Score: 18.676983906626074





	translator2
	-----------
	Score: 19.372828334855424





	translator3
	-----------
	Score: 21.472465220869374





	translator4
	-----------
	Score: 20.404115201817127

----------------------
ve:




	translator1
	-----------
	Score: 55.0115184637609





	translator2
	-----------
	Score: 52.35363909156426





	translator3
	-----------
	Score: 54.558916108736376





	translator4
	-----------
	Score: 52.64282490838124

----------------------
ts:




	translator1
	-----------
	Score: 44.13491061203231





	translator2
	-----------
	Score: 41.48279262871372





	translator3
	-----------
	Score: 30.72812120409177

	translator4
	-----------
	Score: 44.993747778554834

----------------------


In [6]:
# Display the table of BLEU scores (rows: languages, columns: translators).
bleu

Unnamed: 0,translator1,translator2,translator3,translator4
af,76.775069,90.144141,81.498953,65.151243
nr,22.132337,19.165345,21.892208,22.349605
zu,22.975736,24.014172,22.242394,18.557383
xh,16.735494,18.129991,23.094076,22.183588
nso,50.5883,51.76857,45.543204,48.268359
st,46.407567,48.265237,46.358092,43.978408
tn,40.461011,38.380142,32.825561,34.017996
ss,18.676984,19.372828,21.472465,20.404115
ve,55.011518,52.353639,54.558916,52.642825
ts,44.134911,41.482793,30.728121,44.993748


In [7]:
# Mean BLEU score per language, averaged over the translator columns (axis=1).
bleu.mean(axis=1)

af     78.392352
nr     21.384874
zu     21.947421
xh     20.035787
nso    49.042108
st     46.252326
tn     36.421177
ss     19.981598
ve     53.641725
ts     40.334893
dtype: float64

In [8]:
# Standard deviation of the BLEU scores per language across the translator
# columns (axis=1); pandas computes the sample standard deviation (ddof=1)
# by default.
bleu.std(axis=1)

af     10.419502
nr      1.491431
zu      2.374044
xh      3.081676
nso     2.748656
st      1.756683
tn      3.599145
ss      1.221178
ve      1.338474
ts      6.576544
dtype: float64

In [9]:
# Save the BLEU table as CSV; the relative path is resolved against the
# current working directory.
bleu.to_csv('result.csv')