In [3]:
import sacrebleu
import pandas as pd

### Calculate BLEU Score Across Test Corpus

In [4]:
def agg_bleu(file):
    with open(file) as f:
            lines = f.readlines()

    stripped_lines = list(set([i for i in lines if len(i.strip().split('\t')) ==3]))
    translations = [i.split('\t')[1] for i in lines]
    targets = [i.split('\t')[2] for i in lines]

    sets = [i for i in list(zip(translations, targets)) if i[0] != ""]
    candidates = [i[0] for i in sets]
    references = [i[1] for i in sets]

    cand_list = [candidates[:]]
    ref_list = references[:]

    sac = sacrebleu.corpus_bleu(ref_list, list(cand_list))
    
    print(f'CORPUS BLEU SCORE: {sac.score}')
    return

In [5]:
agg_bleu('translations/es_to_en_finetune_translations.txt')

CORPUS BLEU SCORE: 56.42632952716528


In [6]:
agg_bleu('translations/es_to_en_baseline_translations.txt')

CORPUS BLEU SCORE: 52.39416604877844


### Calculate BLEU Score for Each Sentence

all_translations.txt lines organized into:

`target\tbaseline\tfinetuned\google\interpreter`

In [7]:
with open('translations/all_translations.txt','r') as f:
    translations = f.readlines()

In [28]:
sep = [i.strip().split('\t') for i in translations]

In [29]:
ex = sep[5]

In [30]:
ex

['Talk to your doctor about the possible risks of using this medication for your condition.',
 'Ask your doctor about the possible risks of using this medicine to treat your condition.',
 'Ask your doctor about the possible risks of using this medication for your condition.',
 'Ask your doctor about the possible risks of using this medication to treat your condition.',
 'Ask your doctor about possible risks associated with using this medication to treat your condition.']

In [31]:
sentence_numbers = list(range(1,21))
baseline_scores = []
finetuned_scores = []
google_translate_scores = []
interpreter_scores = []

for i in range(20):
    ex = sep[i]
    baseline_scores.append(round(sacrebleu.sentence_bleu(ex[0], [ex[1]]).score, 2))
    finetuned_scores.append(round(sacrebleu.sentence_bleu(ex[0], [ex[2]]).score, 2))
    google_translate_scores.append(round(sacrebleu.sentence_bleu(ex[0], [ex[3]]).score, 2))
    interpreter_scores.append(round(sacrebleu.sentence_bleu(ex[0], [ex[4]]).score, 2))
#     for j in range(1,5):
        
#         print(sacrebleu.sentence_bleu(ex[j], [ex[j]]).score)

In [32]:
bleu = pd.DataFrame({
                'sentence id': sentence_numbers,
                'baseline': baseline_scores,
                'finetuned':finetuned_scores,
                'google': google_translate_scores,
                'interpreter':interpreter_scores
             
             })

bleu

Unnamed: 0,sentence id,baseline,finetuned,google,interpreter
0,1,39.33,53.52,41.88,29.94
1,2,53.73,53.73,53.73,53.73
2,3,100.0,100.0,100.0,64.35
3,4,38.3,51.56,61.43,52.26
4,5,40.13,38.16,48.3,33.97
5,6,61.48,86.12,68.65,23.18
6,7,48.89,48.89,42.38,37.68
7,8,79.11,79.11,79.11,56.59
8,9,68.9,68.9,75.42,72.98
9,10,32.9,32.9,32.9,32.9


In [33]:
bleu.mean()

sentence id    10.5000
baseline       53.4720
finetuned      59.4815
google         55.3760
interpreter    37.9850
dtype: float64