In [None]:
import torch
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
!pip install transformers
!pip install sentencepiece
!pip install sacremoses
!pip install sacrebleu

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (only works inside Google Colab).
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Make the project folder on the mounted Drive the working directory;
# every relative path used afterwards resolves against it.
%cd /content/drive/MyDrive/Multilingual_NLP

/content/drive/MyDrive/Multilingual_NLP


## Downloading the WMT’15 test sets

In [None]:
!curl -O https://statmt.org/wmt15/test.tgz
!tar -xf test.tgz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 2025k  100 2025k    0     0  2096k      0 --:--:-- --:--:-- --:--:-- 2094k


In [None]:
import re

def sgm_to_txt(sgm_file, txt_file):
    """
    Turns a .sgm into a plain .txt file with only the example sentences.

    Only lines starting with ``<seg id`` are kept. The opening ``<seg id="N">``
    tag and the closing ``</seg>`` tag are removed; the rest of the line,
    including its newline, is written unchanged.

    Args:
        sgm_file: path of the SGML file to read.
        txt_file: path of the text file to write (overwritten if it exists).
    """
    # Raw string: in a normal literal "\d" is an invalid escape sequence.
    seg_tags = re.compile(r'(<seg id="\d*">)|(</seg>)')

    # Explicit UTF-8 so accented characters do not depend on the platform locale.
    with open(sgm_file, 'r', encoding='utf-8') as f_in, \
            open(txt_file, 'w', encoding='utf-8') as f_out:
        for line in f_in:
            if line.startswith("<seg id"):
                f_out.write(seg_tags.sub("", line))

# The archive was unpacked in the current working directory (the Drive project
# folder), so the .sgm files live in ./test, not in /content/test.
sgm_to_txt("test/newsdiscusstest2015-enfr-src.en.sgm", "newsdiscusstest2015-enfr-src.en")
sgm_to_txt("test/newsdiscusstest2015-enfr-ref.fr.sgm", "newsdiscusstest2015-enfr-ref.fr")

## Translate

In [None]:
def generate_batches(src_sentences, tokenizer, batch_size, device):
    """
    Yield the source sentences as tokenized batches moved to `device`.

    Args:
        src_sentences: list of source sentences (strings).
        tokenizer: callable applied to each slice of sentences; it is called with
            return_tensors="pt", padding=True, truncation=True, max_length=512
            and its result must provide a `.to(device)` method.
        batch_size: maximum number of sentences per batch (must be >= 1).
        device: forwarded to `.to()` of every encoded batch.

    Yields:
        One encoded batch per slice of at most `batch_size` sentences, in order.

    Raises:
        ValueError: if `batch_size` is smaller than 1 (raised when iteration
            starts, since this is a generator). The previous loop never
            terminated in that case.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    for bstart in range(0, len(src_sentences), batch_size):
        b_src_sentences = src_sentences[bstart:bstart + batch_size]
        b_encoded_src = tokenizer(b_src_sentences, return_tensors="pt", padding=True, truncation=True, max_length=512)
        yield b_encoded_src.to(device)

In [None]:
from tqdm.notebook import tqdm # for progress bars in notebooks

def translate(src_file, out_file, tokenizer, model, batch_size, device, tgt_lang="fr_XX"):
    """
    Translate `src_file` line by line and write one translation per line to `out_file`.

    Args:
        src_file: UTF-8 text file with one source sentence per line.
        out_file: file that receives the translations. It is overwritten, so
            running the function again cannot leave duplicated lines that would
            misalign the predictions with the references.
        tokenizer: tokenizer matching `model`. When it exposes a
            `lang_code_to_id` mapping (the MBart case in this notebook), the
            target language is forced through `forced_bos_token_id`; otherwise
            (the MarianMT case) generation runs without it.
        model: model providing `generate`.
        batch_size: number of sentences translated at once.
        device: device the encoded batches are moved to.
        tgt_lang: language code looked up in `tokenizer.lang_code_to_id`;
            ignored for tokenizers without that mapping.
    """
    with open(src_file, 'r', encoding='utf-8') as f:
        # The line terminator is not part of the sentence to translate.
        src_sentences = [line.rstrip('\n') for line in f]

    # Ceiling division: the progress bar needs a whole number of batches.
    nb_batches = (len(src_sentences) + batch_size - 1) // batch_size

    # Decide once how to call generate(), instead of catching AttributeError
    # around the call (which would also hide errors raised inside generate()).
    generate_kwargs = {"max_new_tokens": 512}
    lang_code_to_id = getattr(tokenizer, "lang_code_to_id", None)
    if lang_code_to_id is not None:  # MBart
        generate_kwargs["forced_bos_token_id"] = lang_code_to_id[tgt_lang]

    with open(out_file, 'w', encoding='utf-8') as f_out:
        for b_encoded_src in tqdm(generate_batches(src_sentences, tokenizer, batch_size, device), total=nb_batches):

            if torch.cuda.is_available():
                print("torch.cuda.memory_reserved: %fGB"%(torch.cuda.memory_reserved(0)/1024/1024/1024))

            generated_tokens = model.generate(**b_encoded_src, **generate_kwargs)
            b_decoded_trg = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

            for line in b_decoded_trg:
                f_out.write(f"{line}\n")
            # Keep finished batches on disk even if a later batch fails.
            f_out.flush()

MarianMT

In [None]:
from transformers import MarianTokenizer, MarianMTModel

# The same checkpoint name is used for the tokenizer and for the model.
model_name = "Helsinki-NLP/opus-mt-en-fr"
marian_tokenizer = MarianTokenizer.from_pretrained(model_name)
marian_model = MarianMTModel.from_pretrained(model_name)
marian_model = marian_model.to(DEVICE)

Downloading:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/301M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/778k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/802k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

In [None]:
# Translate the English source file with MarianMT.
translate(
    src_file='newsdiscusstest2015-enfr-src.en',
    out_file='marian-newsdiscusstest2015-enfr-pred.fr',
    tokenizer=marian_tokenizer,
    model=marian_model,
    batch_size=128,
    device=DEVICE,
)

MBart

In [None]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

# Tokenizer first: the source language is set to English before any text is encoded.
mbart_tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
mbart_tokenizer.src_lang = "en_XX"

mbart_model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
mbart_model = mbart_model.to(DEVICE)

Downloading:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/529 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/649 [00:00<?, ?B/s]

In [None]:
# Translate the same source file with MBart, using a smaller batch size than for MarianMT.
translate(
    src_file='newsdiscusstest2015-enfr-src.en',
    out_file='mbart-newsdiscusstest2015-enfr-pred.fr',
    tokenizer=mbart_tokenizer,
    model=mbart_model,
    batch_size=16,
    device=DEVICE,
)

## BLEU scores

In [None]:
from math import exp, log
from collections import Counter
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

def brevity_penalty(r, h):
    """
    Brevity penalty of BLEU.

    Args:
        r: number of tokens in the reference corpus.
        h: number of tokens in the hypothesis corpus.

    Returns:
        1.0 when the hypotheses are longer than the references,
        exp(1 - r/h) otherwise, and 0.0 when there are no hypothesis tokens
        (the ratio r/h is undefined in that case).
    """
    if h <= 0:
        return 0.
    if h > r:
        return 1.
    return exp( 1 - (r/h) )


def ngrams(sequence, n):
    """
    Generate every run of n consecutive items of `sequence` as a tuple.

    Nothing is produced when the sequence holds fewer than n items.
    """
    if len(sequence) < n:  # too short to contain a single n-gram
        return

    items = list(sequence)
    # Orders below 1 produce the same windows as order 1.
    width = max(n, 1)
    for start in range(len(items) - width + 1):
        yield tuple(items[start:start + width])


def ngram_matches(ref, hyp, n):
    """
    Count the n-grams of `hyp` that are also found in `ref`.

    Returns a pair (common, total):
        common: matched n-grams, where a repeated n-gram counts at most as
                often as it occurs in the reference
        total:  number of n-grams in the hypothesis, repetitions included
    """
    hyp_counts = Counter(ngrams(hyp, n))
    ref_counts = Counter(ngrams(ref, n))

    # Intersecting two Counters keeps, per n-gram, the smaller of the two counts.
    clipped = hyp_counts & ref_counts
    return sum(clipped.values()), sum(hyp_counts.values())


def my_BLEU(references, hypotheses, max_n):
    """
    My implementation of BLEU score
    references and hypotheses are aligned lists (of the same length) of strings of space-separated tokens
    max_n is the maximum size of n-grams to consider

    Every string is tokenized again with `word_tokenize` before counting.

    Returns a dict with:
        'bleu':       brevity penalty times the geometric mean of the n-gram
                      precisions; 0.0 as soon as one precision is 0
        'precisions': matched / total hypothesis n-grams for n = 1..max_n
        'bp':         brevity penalty
        'ratio':      reference length divided by hypothesis length
                      (inf when there are no hypothesis tokens)
        'hyp_len', 'ref_len': token counts of both corpora

    Raises:
        ValueError: if max_n < 1, or if references and hypotheses do not have
            the same number of entries (zip would silently drop the extra
            ones and score misaligned pairs).
    """
    if max_n < 1:
        raise ValueError(f"max_n must be at least 1, got {max_n!r}")

    references = list(references)
    hypotheses = list(hypotheses)
    if len(references) != len(hypotheses):
        raise ValueError(
            f"references and hypotheses must be aligned: "
            f"got {len(references)} references and {len(hypotheses)} hypotheses")

    r = 0 # number of tokens in the reference corpus
    h = 0 # number of tokens in the hypotheses corpus
    matched_ngrams = [0] * max_n  # number of matched ngrams for each n
    total_ngrams = [0] * max_n  # number of ngrams in hypotheses corpus for each n

    for ref, hyp in zip(references, hypotheses):
        hyp = word_tokenize(hyp)
        ref = word_tokenize(ref)

        r += len(ref)
        h += len(hyp)

        for n in range(max_n):
            common, total = ngram_matches(ref, hyp, n+1)
            matched_ngrams[n] += common
            total_ngrams[n] += total

    # An order without any hypothesis n-gram gets precision 0 instead of dividing by zero.
    precisions = [matched/total if total else 0.
                  for matched, total in zip(matched_ngrams, total_ngrams)]

    # Guarded here so that an empty hypothesis corpus never divides by zero.
    bp = brevity_penalty(r, h) if h > 0 else 0.

    if min(precisions) > 0:
        log_precisions = [log(pn) for pn in precisions]
        avg_precision = sum(log_precisions)/len(log_precisions)
        bleu = bp * exp(avg_precision)
    else:
        # log(0) is undefined; the geometric mean is 0 when any precision is 0.
        bleu = 0.

    return {'bleu': round(bleu,4),
            'precisions': [round(pn, 3) for pn in precisions],
            'bp': round(bp,3),
            'ratio': round(r/h,3) if h > 0 else float('inf'),
            'hyp_len': h,
            'ref_len': r}


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Read everything as UTF-8 explicitly: the files contain accented French text
# and must not depend on the platform's default encoding.
with open('newsdiscusstest2015-enfr-ref.fr', 'r', encoding='utf-8') as f:
    references = f.readlines()

with open('marian-newsdiscusstest2015-enfr-pred.fr', 'r', encoding='utf-8') as f:
    marian_hypotheses = f.readlines()

with open('mbart-newsdiscusstest2015-enfr-pred.fr', 'r', encoding='utf-8') as f:
    mbart_hypotheses = f.readlines()

print('MarianMT Bleu:')
print(my_BLEU(references, marian_hypotheses, 4))

print('\nMBart Bleu:')
print(my_BLEU(references, mbart_hypotheses, 4))

MarianMT Bleu:
{'bleu': 0.3747, 'precisions': [0.655, 0.44, 0.316, 0.234], 'bp': 0.981, 'ratio': 1.019, 'hyp_len': 28047, 'ref_len': 28589}

MBart Bleu:
{'bleu': 0.3231, 'precisions': [0.606, 0.381, 0.261, 0.182], 'bp': 0.999, 'ratio': 1.001, 'hyp_len': 28550, 'ref_len': 28589}


In [None]:
from sacrebleu.metrics import BLEU
# BLEU metric with its default settings; the `bleu` object is used again by later cells.
bleu = BLEU()

print('MarianMT sacrebleu:', bleu.corpus_score(marian_hypotheses, [references]), sep='\n')
print('\nMBart sacrebleu:', bleu.corpus_score(mbart_hypotheses, [references]), sep='\n')

MarianMT sacrebleu:
BLEU = 0.24 2.8/0.3/0.1/0.0 (BP = 1.000 ratio = 4.477 hyp_len = 125250 ref_len = 27975)

MBart sacrebleu:
BLEU = 0.24 2.8/0.3/0.1/0.0 (BP = 1.000 ratio = 4.576 hyp_len = 128008 ref_len = 27975)


## Number of permutations

In [None]:
from math import factorial 

def possible_permutations(references, hypotheses):
    """
    Sum, over all sentence pairs, factorial(n - b), where n is the number of
    tokens of the hypothesis and b the number of its bigrams that match the
    reference.

    references and hypotheses are aligned lists of strings; each string is
    tokenized with `word_tokenize`.

    Returns the sum formatted in scientific notation with two decimals
    (for example '3.01e+213').
    """
    permutations = 0
    for ref, hyp in zip(references, hypotheses):
        hyp = word_tokenize(hyp)
        ref = word_tokenize(ref)

        n = len(hyp)
        b, _ = ngram_matches(ref, hyp, 2)
        permutations += factorial(n-b)

    try:
        return f"{permutations:.2e}"
    except OverflowError:
        # The 'e' format converts the int to a float first, which fails beyond
        # roughly 1.8e308; Decimal formats integers of any size.
        from decimal import Decimal
        return f"{Decimal(permutations):.2e}"

# Last expression of the cell, so the formatted count for the MBart hypotheses is displayed.
possible_permutations(references, mbart_hypotheses)

'3.01e+213'

## Tokenize predictions

Raw text

In [None]:
# Scores of both systems with both BLEU implementations.
print('MarianMT Bleu:', my_BLEU(references, marian_hypotheses, 4), sep='\n')
print('\nMBart Bleu:', my_BLEU(references, mbart_hypotheses, 4), sep='\n')

print('\nMarianMT sacrebleu:', bleu.corpus_score(marian_hypotheses, [references]), sep='\n')
print('\nMBart sacrebleu:', bleu.corpus_score(mbart_hypotheses, [references]), sep='\n')

MarianMT Bleu:
{'bleu': 0.3747, 'precisions': [0.655, 0.44, 0.316, 0.234], 'bp': 0.981, 'ratio': 1.019, 'hyp_len': 28047, 'ref_len': 28589}

MBart Bleu:
{'bleu': 0.3231, 'precisions': [0.606, 0.381, 0.261, 0.182], 'bp': 0.999, 'ratio': 1.001, 'hyp_len': 28550, 'ref_len': 28589}

MarianMT sacrebleu:
BLEU = 38.36 65.6/44.2/31.8/23.5 (BP = 1.000 ratio = 1.005 hyp_len = 28115 ref_len = 27975)

MBart sacrebleu:
BLEU = 32.58 60.7/38.4/26.3/18.4 (BP = 1.000 ratio = 1.021 hyp_len = 28575 ref_len = 27975)


Tokenize into subwords

In [None]:
def subword_tokenization(tokenizer, in_file, out_file):
    """
    Rewrite a text file with every line split into subword tokens.

    Each input line is stripped, passed to `tokenizer.tokenize`, and written
    as the resulting tokens joined by single spaces.

    Args:
        tokenizer: object whose `tokenize(text)` returns a list of string tokens.
        in_file: path of the text file to read (one sentence per line).
        out_file: path of the file to write (overwritten if it exists).
    """
    # Explicit UTF-8: the text is French and subword tokens may contain
    # characters that a non-UTF-8 default encoding cannot represent.
    with open(in_file, 'r', encoding='utf-8') as f_in, \
            open(out_file, 'w', encoding='utf-8') as f_out:
        for line in f_in:
            pieces = tokenizer.tokenize(line.strip())
            f_out.write(' '.join(pieces) + '\n')

# NOTE(review): the MarianMT predictions are also split with the MBart tokenizer,
# presumably so that both systems are compared on the same subword units — confirm.
subword_tokenization(mbart_tokenizer, 'marian-newsdiscusstest2015-enfr-pred.fr', 'subtok-marian-newsdiscusstest2015-enfr-pred.fr')
subword_tokenization(mbart_tokenizer, 'mbart-newsdiscusstest2015-enfr-pred.fr', 'subtok-mbart-newsdiscusstest2015-enfr-pred.fr')

In [None]:
# Dedicated names: reusing `marian_hypotheses` / `mbart_hypotheses` would overwrite
# the raw-text predictions, so cells run again afterwards would silently score the
# subword files instead.
with open('subtok-marian-newsdiscusstest2015-enfr-pred.fr', 'r', encoding='utf-8') as f:
    marian_subword_hypotheses = f.readlines()

with open('subtok-mbart-newsdiscusstest2015-enfr-pred.fr', 'r', encoding='utf-8') as f:
    mbart_subword_hypotheses = f.readlines()

# NOTE(review): `references` is still the raw reference text; only the predictions
# are split into subwords — confirm this asymmetry is the point of the experiment.
print('MarianMT subword Bleu:')
print(my_BLEU(references, marian_subword_hypotheses, 4))

print('\nMBart subword Bleu:')
print(my_BLEU(references, mbart_subword_hypotheses, 4))

print('\nMarianMT subword sacrebleu:')
print(bleu.corpus_score(marian_subword_hypotheses, [references]))

print('\nMBart subword sacrebleu:')
print(bleu.corpus_score(mbart_subword_hypotheses, [references]))

MarianMT subword Bleu:
{'bleu': 0.0029, 'precisions': [0.089, 0.006, 0.001, 0.0], 'bp': 1.0, 'ratio': 0.727, 'hyp_len': 39315, 'ref_len': 28589}

MBart subword Bleu:
{'bleu': 0.003, 'precisions': [0.087, 0.006, 0.001, 0.0], 'bp': 1.0, 'ratio': 0.712, 'hyp_len': 40143, 'ref_len': 28589}

MarianMT subword sacrebleu:
BLEU = 0.73 8.6/0.8/0.3/0.1 (BP = 1.000 ratio = 1.410 hyp_len = 39447 ref_len = 27975)

MBart subword sacrebleu:
BLEU = 0.74 8.3/0.8/0.3/0.1 (BP = 1.000 ratio = 1.437 hyp_len = 40212 ref_len = 27975)


Tokenize into characters

In [None]:
def char_tokenization(in_file, out_file):
    """
    Rewrite a text file with the characters of every line separated by spaces.

    Spaces of the input are characters too, so an input space ends up
    surrounded by separators ("a b" becomes "a   b"). The line terminator is
    not treated as a character: each output line ends with exactly one newline,
    without a trailing space, even when the last input line has no newline.

    Args:
        in_file: path of the text file to read.
        out_file: path of the file to write (overwritten if it exists).
    """
    # Explicit UTF-8 so accented characters do not depend on the platform locale.
    with open(in_file, 'r', encoding='utf-8') as f_in, \
            open(out_file, 'w', encoding='utf-8') as f_out:
        for line in f_in:
            f_out.write(' '.join(line.rstrip('\n')) + '\n')

# Write character-level versions of both prediction files under a 'chartok-' prefix.
char_tokenization('marian-newsdiscusstest2015-enfr-pred.fr', 'chartok-marian-newsdiscusstest2015-enfr-pred.fr')
char_tokenization('mbart-newsdiscusstest2015-enfr-pred.fr', 'chartok-mbart-newsdiscusstest2015-enfr-pred.fr')

In [None]:
# Dedicated names: reusing `marian_hypotheses` / `mbart_hypotheses` would overwrite
# the raw-text predictions, so cells run again afterwards would silently score the
# character-level files instead.
with open('chartok-marian-newsdiscusstest2015-enfr-pred.fr', 'r', encoding='utf-8') as f:
    marian_char_hypotheses = f.readlines()

with open('chartok-mbart-newsdiscusstest2015-enfr-pred.fr', 'r', encoding='utf-8') as f:
    mbart_char_hypotheses = f.readlines()

# NOTE(review): `references` is still the raw reference text; only the predictions
# are split into characters — confirm this asymmetry is the point of the experiment.
print('MarianMT character Bleu:')
print(my_BLEU(references, marian_char_hypotheses, 4))

print('\nMBart character Bleu:')
print(my_BLEU(references, mbart_char_hypotheses, 4))

print('\nMarianMT character sacrebleu:')
print(bleu.corpus_score(marian_char_hypotheses, [references]))

print('\nMBart character sacrebleu:')
print(bleu.corpus_score(mbart_char_hypotheses, [references]))

MarianMT character Bleu:
{'bleu': 0.0016, 'precisions': [0.03, 0.002, 0.001, 0.0], 'bp': 1.0, 'ratio': 0.228, 'hyp_len': 125250, 'ref_len': 28589}

MBart character Bleu:
{'bleu': 0.0016, 'precisions': [0.03, 0.002, 0.001, 0.0], 'bp': 1.0, 'ratio': 0.223, 'hyp_len': 128008, 'ref_len': 28589}

MarianMT character sacrebleu:
BLEU = 0.24 2.8/0.3/0.1/0.0 (BP = 1.000 ratio = 4.477 hyp_len = 125250 ref_len = 27975)

MBart character sacrebleu:
BLEU = 0.24 2.8/0.3/0.1/0.0 (BP = 1.000 ratio = 4.576 hyp_len = 128008 ref_len = 27975)
