<a href="https://colab.research.google.com/github/leonrafael29/W266_Final_Project/blob/main/mBART/MBart_Evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Install requirements

In [None]:
!pip install datasets -q
!pip install sentencepiece -q
!pip install transformers -q
!pip install git+https://github.com/google-research/bleurt.git -q

# !wget -N https://storage.googleapis.com/bleurt-oss-21/BLEURT-20.zip . -q
# !unzip -q -n BLEURT-20.zip


[K     |████████████████████████████████| 451 kB 4.2 MB/s 
[K     |████████████████████████████████| 212 kB 69.0 MB/s 
[K     |████████████████████████████████| 115 kB 94.4 MB/s 
[K     |████████████████████████████████| 182 kB 84.5 MB/s 
[K     |████████████████████████████████| 127 kB 89.2 MB/s 
[K     |████████████████████████████████| 1.3 MB 4.1 MB/s 
[K     |████████████████████████████████| 5.5 MB 4.2 MB/s 
[K     |████████████████████████████████| 7.6 MB 76.7 MB/s 
[K     |████████████████████████████████| 352 kB 4.2 MB/s 
[?25h  Building wheel for BLEURT (setup.py) ... [?25l[?25hdone


Imports

In [None]:
import csv
import numpy as np
import pandas as pd
import torch
from bleurt import score
from datasets import load_dataset
from transformers import MBartForConditionalGeneration, \
    MBart50TokenizerFast, MBartConfig,\
    TrainingArguments, Trainer
    

Mount Google Drive to use for saving and loading files

In [None]:
from google.colab import files, drive
drive.mount('/content/gdrive/', force_remount=True)
%cd gdrive/MyDrive

Mounted at /content/gdrive/
/content/gdrive/MyDrive


Global variables

In [None]:
# Hugging Face Hub id of the pre-trained mBART-50 many-to-many model; the
# tokenizer is loaded from this checkpoint.
ORIGINAL_MODEL_CHECKPOINT = 'facebook/mbart-large-50-many-to-many-mmt'
# Relative path of the fine-tuned model checkpoint that this notebook evaluates.
MODEL_CHECKPOINT = 'Mbart/Model/Tiny/epoch-1'
# Translation directions to evaluate, written "<source>-<target>".
# Each entry is also a key of MBART_DATA.
PAIRS = [
    'en-zh',
    'zh-en',
    'en-es',
    'es-zh',
    'es-en',
    'zh-es',
    ]
# Per-direction settings, keyed by "<source>-<target>".
#   size / train / val : dataset sizes (presumably sentence-pair counts - confirm);
#                        'size' is what score_dataset uses to pick its evaluation budget.
#   src / tgt          : language keys inside each example's "translation" dict.
#   src_tkn / tgt_tkn  : mBART-50 language codes assigned to the tokenizer.
#   tkn                : mBART-50 code forced as the first generated token
#                        (always equal to tgt_tkn).
#   reverse            : True when the dataset config is named "<tgt>-<src>", so
#                        the pair has to be loaded under the opposite name.
#   *_path             : CSV files with the train / validation / test pairs.
MBART_DATA = {
    'en-zh': {
        'size': 69020,
        'train': 48444,
        'val': 10381,
        'src': 'en',
        'tgt': 'zh',
        'src_tkn': 'en_XX',
        'tgt_tkn': 'zh_CN',
        'tkn': 'zh_CN',
        'reverse': False,
        'train_path': 'Mbart/Data/en-zh-train_pairs.csv',
        'val_path': 'Mbart/Data/en-zh-val_pairs.csv',
        'test_path': 'Mbart/Data/en-zh-test_pairs.csv',
        },
    'zh-en': {
        'size': 69020,
        'train': 48444,
        'val': 10381,
        'src': 'zh',
        'tgt': 'en',
        'src_tkn': 'zh_CN',
        # Was 'en_ZZ', which is not an mBART-50 language code; English is 'en_XX'.
        'tgt_tkn': 'en_XX',
        'tkn': 'en_XX',
        'reverse': True,
        'train_path': 'Mbart/Data/en-zh-train_pairs.csv',
        'val_path': 'Mbart/Data/en-zh-val_pairs.csv',
        'test_path': 'Mbart/Data/en-zh-test_pairs.csv',
        },
    'en-es': {
        'size': 238511,
        'train': 167210,
        'val': 35831,
        'src': 'en',
        'tgt': 'es',
        'src_tkn': 'en_XX',
        'tgt_tkn': 'es_XX',
        'tkn': 'es_XX',
        'reverse': False,
        'train_path': 'Mbart/Data/en-es-train_pairs.csv',
        'val_path': 'Mbart/Data/en-es-val_pairs.csv',
        'test_path': 'Mbart/Data/en-es-test_pairs.csv',
        },
    'es-zh': {
        'size': 65408,
        'train': 45796,
        'val': 9814,
        'src': 'es',
        'tgt': 'zh',
        'src_tkn': 'es_XX',
        'tgt_tkn': 'zh_CN',
        'tkn': 'zh_CN',
        'reverse': False,
        'train_path': 'Mbart/Data/es-zh-train_pairs.csv',
        'val_path': 'Mbart/Data/es-zh-val_pairs.csv',
        'test_path': 'Mbart/Data/es-zh-test_pairs.csv',
        },
    'es-en': {
        'size': 238511,
        'train': 167210,
        'val': 35831,
        'src': 'es',
        'tgt': 'en',
        'src_tkn': 'es_XX',
        'tgt_tkn': 'en_XX',
        'tkn': 'en_XX',
        'reverse': True,
        'train_path': 'Mbart/Data/en-es-train_pairs.csv',
        'val_path': 'Mbart/Data/en-es-val_pairs.csv',
        'test_path': 'Mbart/Data/en-es-test_pairs.csv',
        },
    'zh-es': {
        'size': 65408,
        'train': 45796,
        'val': 9814,
        'src': 'zh',
        'tgt': 'es',
        'src_tkn': 'zh_CN',
        'tgt_tkn': 'es_XX',
        'tkn': 'es_XX',
        'reverse': True,
        'train_path': 'Mbart/Data/es-zh-train_pairs.csv',
        'val_path': 'Mbart/Data/es-zh-val_pairs.csv',
        'test_path': 'Mbart/Data/es-zh-test_pairs.csv',
        },
    }

# Dataset name passed to datasets.load_dataset by the scoring function.
DATASET = 'news_commentary'
# Tokenizer max_length used when encoding the source sentences.
MAX_LENGTH = 50
# max_new_tokens passed to model.generate for every translation.
MAX_NEW_TOKENS = 50
# Forwarded to the tokenizer's `truncation` argument.
TRUNCATION = True
# NOTE(review): not referenced by the visible cells - eval_token passes
# padding='max_length' directly instead of this flag.
PADDING = True
# Tensor format requested from the tokenizer.
RETURN_TENSORS = 'pt'
# Path of the BLEURT checkpoint handed to the BLEURT scorer (relative path).
BLEURT_CHECKPOINT = './BLEURT-20-D3'
# Default number of sentence pairs scored per batch (score_dataset's n_examples).
N_EXAMPLES = 100

# Sets the PYTORCH_CUDA_ALLOC_CONF environment variable (max_split_size_mb = 256).
%env PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:256

env: PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:256


Load Model, Metrics and Tokenizer

In [None]:
# Load the BLEURT scorer, the tokenizer and the fine-tuned model used by the
# cells below.

bleurt_metric = score.LengthBatchingBleurtScorer(BLEURT_CHECKPOINT)
# The tokenizer always comes from the original pre-trained checkpoint. To
# evaluate the un-tuned baseline model instead, load it with
# MBartForConditionalGeneration.from_pretrained(ORIGINAL_MODEL_CHECKPOINT).
tokenizer = MBart50TokenizerFast.from_pretrained(ORIGINAL_MODEL_CHECKPOINT)
# Use the configured constant instead of repeating the path literal.
# NOTE(review): torch.load unpickles arbitrary Python objects - only load
# checkpoints that come from a trusted source.
model = torch.load(MODEL_CHECKPOINT)
# A pickled module keeps whatever train/eval flag it was saved with; switch to
# inference mode explicitly so dropout cannot perturb the generated text.
model = model.eval()

Downloading:   0%|          | 0.00/529 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/649 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

In [None]:
def eval_token(src_data, tokenizer, src_tkn):
  """Tokenize source sentences and return the model inputs on the GPU.

  Args:
    src_data: source sentences, passed to the tokenizer as `text`.
    tokenizer: tokenizer to call; its `src_lang` attribute is overwritten.
    src_tkn: language code assigned to `tokenizer.src_lang` before encoding.

  Returns:
    Dict with 'input_ids' and 'attention_mask', both moved to CUDA.
  """
  # The source language must be set before the tokenizer is called.
  tokenizer.src_lang = src_tkn

  encoded = tokenizer(
      text=src_data,
      max_length=MAX_LENGTH,
      padding='max_length',
      truncation=TRUNCATION,
      return_tensors=RETURN_TENSORS,
  )

  # Only these two entries are forwarded to generation.
  return {key: encoded[key].cuda() for key in ('input_ids', 'attention_mask')}

In [None]:
# Move the model to the GPU; eval_token places its input tensors on CUDA too.
model.to('cuda')

MBartForConditionalGeneration(
  (model): MBartModel(
    (shared): Embedding(250054, 1024, padding_idx=1)
    (encoder): MBartEncoder(
      (embed_tokens): Embedding(250054, 1024, padding_idx=1)
      (embed_positions): MBartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0): MBartEncoderLayer(
          (self_attn): MBartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerNorm((1024,

Scoring function definition

In [None]:
# Stream one language pair in batches of n_examples, translate each batch and
# append the batch's mean BLEURT score to a CSV file.


def score_dataset(
    pair_index=0,
    starting_iter=0,
    end_iter=9999,
    n_examples=N_EXAMPLES,
    dataset=DATASET,
    model=model,
    tokenizer=tokenizer,
    bleurt_metric=bleurt_metric,
):
    """
    Score the model's translations for one language pair with BLEURT.

    The "train" split of `dataset` is streamed in batches of `n_examples`
    sentence pairs. Each batch is translated with `model.generate`, scored
    against the reference translations, and the batch's mean BLEURT score is
    appended to "Mbart/Tiny_e1_bleurt.csv" so the history survives outside the
    notebook. Only BLEURT is computed here (no BLEU).

    Inputs:
      pair_index: index into PAIRS selecting the translation direction.
      starting_iter: batch index to start from (use it to resume a run);
        values <= 0 start at the first batch.
      end_iter: last batch index to score (inclusive). The sentinel 9999
        means "derive the end from the pair's dataset size".
      n_examples: number of sentence pairs per batch.
      dataset: dataset name passed to `load_dataset`.
      model: model whose `generate` method produces the translations.
      tokenizer: tokenizer used for encoding and decoding; its `src_lang` and
        `tgt_lang` attributes are overwritten.
      bleurt_metric: scorer exposing `score(references=..., candidates=...)`.

    Returns:
      Nothing

    """

    # Pair and properties for gatekeeping
    pair = PAIRS[pair_index]
    pair_cfg = MBART_DATA[pair]
    src = pair_cfg["src"]
    tgt = pair_cfg["tgt"]
    src_tkn = pair_cfg["src_tkn"]
    tgt_tkn = pair_cfg["tgt_tkn"]
    tkn = pair_cfg["tkn"]
    size = pair_cfg["size"]
    reverse = pair_cfg["reverse"]

    # Set Tokenizer up
    tokenizer.src_lang = src_tkn
    tokenizer.tgt_lang = tgt_tkn

    # Evaluation budget: 10000 examples when 15% of the pair's data exceeds
    # that, otherwise 5000. (Named eval_size so the builtin eval is not shadowed.)
    eval_size = 10000 if size * 0.15 > 10000 else 5000

    # Absolute index of the last batch to score (inclusive).
    # NOTE(review): with the default end_iter this scores
    # eval_size // n_examples + 2 batches, i.e. slightly more than eval_size
    # examples. Kept as-is so new runs stay comparable with logged ones.
    if end_iter == 9999:
        last_iter = int(eval_size // n_examples) + 1
    else:
        last_iter = end_iter

    # Where to start; non-positive values mean "from the first batch".
    prev_iter = starting_iter if starting_iter > 0 else 0
    loops = last_iter - starting_iter

    # Some pairs only exist under the opposite config name in the dataset.
    # The `dataset` argument is honoured here (it used to be ignored in favour
    # of the global DATASET).
    config_name = f"{tgt}-{src}" if reverse else f"{src}-{tgt}"
    data_stream = load_dataset(dataset, config_name, streaming=True)
    # NOTE(review): batches are taken from the start of the "train" split;
    # confirm these examples were not also used for fine-tuning.

    results_path = "Mbart/Tiny_e1_bleurt.csv"

    # Header row marking the start of this run. It reports the absolute last
    # batch index, so it is also correct when resuming with starting_iter > 0.
    with open(results_path, "a", newline="") as bleurt_file:
        wr = csv.writer(bleurt_file, quoting=csv.QUOTE_ALL)
        wr.writerow([f"starting tiny epoch_1_iter {prev_iter} ending at {last_iter}"])

    # Loop of scoring
    for run in range(loops + 1):
        start = prev_iter * n_examples

        # Materialise the batch once: the streamed slice is lazy, so iterating
        # it separately for sources and targets would stream the data twice.
        batch = list(data_stream["train"].skip(start).take(n_examples))
        if not batch:
            # Stream exhausted: stop instead of tokenizing an empty list.
            print(f"No more examples at iteration {run}; stopping early.")
            break

        # Vectorize sentences
        src_data = [example["translation"][src] for example in batch]
        tgt_data = [example["translation"][tgt] for example in batch]

        # Progress text
        print(f"Now scoring iteration {run} of {loops}.")

        # Prepare inputs
        inputs = eval_token(src_data, tokenizer, src_tkn)

        # Pass inputs through the model, forcing the target-language token
        # as the first generated token.
        generated_tokens = model.generate(
            **inputs,
            forced_bos_token_id=tokenizer.lang_code_to_id[tkn],
            max_new_tokens=MAX_NEW_TOKENS,
        )
        gen = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

        bleurt_score = bleurt_metric.score(references=tgt_data, candidates=gen)
        mean_score = np.mean(bleurt_score)

        # Append after every batch so partial progress is kept if the run dies.
        with open(results_path, "a", newline="") as bleurt_file:
            wr = csv.writer(bleurt_file, quoting=csv.QUOTE_ALL)
            wr.writerow([pair, run, mean_score])
        print(prev_iter + 1, mean_score)

        prev_iter += 1

    return None


Score the dataset

In [None]:
# Score every configured translation direction in turn; score_dataset appends
# the per-batch results to its CSV file.
shared_kwargs = dict(
    starting_iter=0,
    end_iter=9999,
    n_examples=N_EXAMPLES,
    dataset=DATASET,
    model=model,
    tokenizer=tokenizer,
    bleurt_metric=bleurt_metric,
)
for p, pair_name in enumerate(PAIRS):
    print(f"Evaluating translation {pair_name}")
    score_dataset(pair_index=p, **shared_kwargs)


Evaluating translation en-zh


Downloading builder script:   0%|          | 0.00/5.36k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/116k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Now scoring iteration 0 of 101.
1 0.2399071104824543
Now scoring iteration 1 of 101.
2 0.2192081568390131
Now scoring iteration 2 of 101.
3 0.22437858782708645
Now scoring iteration 3 of 101.
4 0.23636277705430986
Now scoring iteration 4 of 101.
5 0.26452936448156833
Now scoring iteration 5 of 101.
6 0.20898724481463432
Now scoring iteration 6 of 101.
7 0.2312250419706106
Now scoring iteration 7 of 101.
8 0.23323311381042003
Now scoring iteration 8 of 101.
9 0.20888338290154934
Now scoring iteration 9 of 101.
10 0.230626600459218
Now scoring iteration 10 of 101.
11 0.22267891220748426
Now scoring iteration 11 of 101.
12 0.24702425062656402
Now scoring iteration 12 of 101.
13 0.2527615723013878
Now scoring iteration 13 of 101.
14 0.23696478344500066
Now scoring iteration 14 of 101.
15 0.19910964407026768
Now scoring iteration 15 of 101.
16 0.1965870313346386
Now scoring iteration 16 of 101.
17 0.2110183569788933
Now scoring iteration 17 of 101.
18 0.23797212786972521
Now scoring iterati

In [None]:
# Shell escape: list the contents of the current working directory.
!ls