# In this notebook: 
  - Choose 2 models for translation
  - Calculating their metrics
  - Choose the best model to use it in my project

# Importing libraries

In [92]:
from transformers import MarianMTModel, MarianTokenizer, AutoTokenizer, AutoModelForSeq2SeqLM
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.meteor_score import meteor_score
from rouge import Rouge
from datasets import load_dataset
from nltk.tokenize import word_tokenize

In [122]:
import torch
import nltk
import numpy as np
from nltk.translate import meteor_score

# Save model names in variables

In [104]:
model_1_name = "Helsinki-NLP/opus-mt-en-ru" 
model_2_name = "Gopal1853/marian-finetuned-kde4-en-to-ru"

# Load my GPU

In [94]:
device = torch.device("cuda")

In [95]:
device

device(type='cuda')

# Load all the models

In [105]:
tokenizer_1 = MarianTokenizer.from_pretrained(model_1_name)
model_1 = MarianMTModel.from_pretrained(model_1_name).to(device)

tokenizer_2 = AutoTokenizer.from_pretrained(model_2_name)
model_2 = AutoModelForSeq2SeqLM.from_pretrained(model_2_name).to(device)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [24]:
test_data = load_dataset("Helsinki-NLP/opus-100", "en-ru", split="test")

In [29]:
test_data

Dataset({
    features: ['translation'],
    num_rows: 2000
})

In [118]:
test_sample = test_data.select(range(100)) # Let's take first 10 lines

In [119]:
references = [ex["translation"]["ru"] for ex in test_sample]

# Base function to check model prediction

In [130]:
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...


True

In [78]:
def translate_text(model, tokenizer, text):
    inputs = tokenizer.prepare_seq2seq_batch([text], truncation=True, padding="longest", return_tensors="pt").to(device)
    translated_ids = model.generate(inputs.input_ids)
    translated_text = tokenizer.decode(translated_ids[0], skip_special_tokens=True)
    return translated_text

# BLEU Formula

 $$BLEU = BP \cdot e^{\sum{w_n \cdot \log(p_n)}}$$

# ROUGE Formula

$$ROUGE-N = \frac{{\text{{Count of matching n-grams}}}}{{\text{{Count of n-grams in the reference summary}}}}$$

# METEOR-metric's formula

$$\text{METEOR} = \frac{{\text{Precision} \times \text{Recall}}}{{(1 - \alpha) \times \text{Precision} + \alpha \times \text{Recall}}} \times (1 - \beta \times \text{Fragmentation})$$

In [131]:
def get_metrics(model, tokenizer):
    BLEUFirst = []
    RougeF1First = []
    MeteorScore = []
    rouge = Rouge()
    
    for i, ex in enumerate(test_sample):
        source = ex["translation"]["en"]
        translated = translate_text(model, tokenizer, source)
        prediction_tokens = word_tokenize(translated)
        reference_tokens = word_tokenize(references[i])
        bleu_cur = nltk.translate.bleu_score.sentence_bleu([reference_tokens], prediction_tokens)
        BLEUFirst.append(bleu_cur)
        rouge_cur = rouge.get_scores(references[i], translated)
        rouge_l_score = rouge_cur[0]['rouge-l']['f']
        RougeF1First.append(rouge_l_score)
        meteor = meteor_score.meteor_score([reference_tokens], prediction_tokens)
        MeteorScore.append(meteor)
        
    return BLEUFirst, RougeF1First, MeteorScore

# Let's look at metrics, and make a decision

In [132]:
bleu_1, rouge_1, meteor_1 = get_metrics(model_1, tokenizer_1)
bleu_2, rouge_2, meteor_2 = get_metrics(model_2, tokenizer_2)

In [108]:
print(bleu_1)

[1.1640469867513693e-231, 8.129855005981316e-155, 4.101791032784236e-78, 0.3934995962231127, 0.1531682455208201, 4.188639545551841e-78, 0, 1.2882297539194154e-231, 0.43989172475842214, 1.2882297539194154e-231]


In [133]:
print(f"{model_1_name}'s results: ")
print(f"Average BLEU: {np.mean(bleu_1):.4f}")
print(f"Average Rouge F1: {np.mean(rouge_1):.4f}")
print(f"Average Meteor: {np.mean(meteor_1):.4f}")
print("-----------------------------------------")
print(f"{model_2_name}'s results: ")
print(f"Agerage BLEU: {np.mean(bleu_2)}")
print(f"Average Rouge F1: {np.mean(rouge_2)}")
print(f"Average Meteor: {np.mean(meteor_2)}")


Helsinki-NLP/opus-mt-en-ru's results: 
Average BLEU: 0.0987
Average Rouge F1: 0.2938
Average Meteor: 0.3821
-----------------------------------------
Gopal1853/marian-finetuned-kde4-en-to-ru's results: 
Agerage BLEU: 0.09786781163050493
Average Rouge F1: 0.2937923569154086
Average Meteor: 0.3721905498150912


# In conclusion: as we can see, in terms of metrics these models are approximately equal, we can take both. But I preferred to take "Helsinki-nlp/opus-mt-en-ru"