In [None]:
!pip install transformers --quiet
!pip install sentencepiece --quiet
!pip install datasets --quiet
!pip install evaluate --quiet

!pip install git+https://github.com/google-research/bleurt.git -q
#!wget -N https://storage.googleapis.com/bleurt-oss-21/BLEURT-20.zip . -q
#!unzip -q -n BLEURT-20.zip
!wget https://storage.googleapis.com/bleurt-oss-21/BLEURT-20-D12.zip . -q
!unzip -q -n BLEURT-20-D12.zip

[K     |████████████████████████████████| 5.5 MB 7.6 MB/s 
[K     |████████████████████████████████| 7.6 MB 45.5 MB/s 
[K     |████████████████████████████████| 182 kB 73.5 MB/s 
[K     |████████████████████████████████| 1.3 MB 6.6 MB/s 
[K     |████████████████████████████████| 451 kB 7.5 MB/s 
[K     |████████████████████████████████| 115 kB 62.7 MB/s 
[K     |████████████████████████████████| 212 kB 63.3 MB/s 
[K     |████████████████████████████████| 127 kB 67.5 MB/s 
[K     |████████████████████████████████| 72 kB 1.3 MB/s 
[K     |████████████████████████████████| 352 kB 6.7 MB/s 
[?25h  Building wheel for BLEURT (setup.py) ... [?25l[?25hdone


### 2 Import libraries

In [None]:
from datasets import load_dataset, load_metric
from transformers import BertTokenizer, TFBertModel, BertGenerationEncoder, BertGenerationDecoder, EncoderDecoderModel
import evaluate
import numpy as np
import tensorflow as tf
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
import pandas as pd
from csv import writer
import math

from bleurt import score

In [None]:
# Authenticate and mount Google Drive at /content/drive (Colab only).
# The data and model paths used later are relative ('drive/MyDrive/...'), so they
# resolve against this mount only when the working directory is /content.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load BLEURT
# Directory name of the checkpoint unzipped from BLEURT-20-D12.zip by the setup cell;
# given as a relative path, so it is looked up from the current working directory.
bleurt_checkpoint = "BLEURT-20-D12"

# Scorer object used by compute_metrics to rate predictions against references.
bleurt_metric = score.BleurtScorer(bleurt_checkpoint)

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

### 3 Data Acquisition

In [None]:
# Source language code: used in the data/model directory names, as the key into
# data_size ('<orig>-<target>') and as the source-text column name of the CSV files.
orig = "zh"
# Target language code: same uses as `orig`, for the translation side.
target = "es"
# Copied into bert2bert.config.no_repeat_ngram_size before training.
ngram_size = 3
# Copied into bert2bert.config.min_length; a falsy value (0 / None) leaves the
# model default untouched and selects the output directory without 'min_length_*'.
min_length = 50

In [None]:
# Data paths
# One CSV of sentence pairs per split, all stored in the language-pair folder.
train_file, val_file, test_file = (
    f'drive/MyDrive/MIDS/W266/Final_Project/bert2bert-finetuned/{orig}_{target}/{split}_pairs.csv'
    for split in ('train', 'val', 'test')
)

In [None]:
# Example counts per translation direction ('<source>-<target>') and split.
# These are passed as `n_examples` when the data iterators are created.
_pair_counts = [
    # (direction, train rows, validation rows)
    ('en-zh', 48444, 10381),
    ('en-es', 167210, 35831),
    ('es-en', 167210, 35831),
    ('es-zh', 45796, 9814),
    ('zh-es', 45796, 9814),
    ('zh-en', 48444, 10381),
]
data_size = {
    direction: {'train': n_train, 'val': n_val}
    for direction, n_train, n_val in _pair_counts
}

### 4 Model instantiation

In [None]:
# define tokenizer and encoder/decoder
# A single pretrained checkpoint name is used for the tokenizer and for both
# halves of the encoder-decoder model below.
model_checkpoint = "bert-base-multilingual-uncased"
tokenizer = BertTokenizer.from_pretrained(model_checkpoint)


# define sequence to sequence model
# Encoder and decoder are both initialised from `model_checkpoint`.
bert2bert = EncoderDecoderModel.from_encoder_decoder_pretrained(model_checkpoint, model_checkpoint)

Downloading:   0%|          | 0.00/872k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/672M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertLMHeadModel: ['cls.seq_relationship

### 5 Data Processing

In [None]:
def preprocess_data(text_pair, tokenizer, max_length=100):
    """Tokenize one (source, target) text pair into model inputs.

    Args:
        text_pair: Two-element sequence ``(orig_text, target_text)``.
        tokenizer: Object exposing ``batch_encode_plus`` and ``pad_token_id``.
        max_length: Length to which both texts are padded / truncated.

    Returns:
        dict with
          * ``'input_ids'``      - token ids of the source text,
          * ``'attention_mask'`` - attention mask of the source text,
          * ``'labels'``         - token ids of the target text in which every
            padding id has been replaced by -100.
    """
    orig_text, target_text = text_pair

    def _encode(text):
        # Encode a single text as a batch of one, returned as PyTorch tensors.
        return tokenizer.batch_encode_plus(
            [text],
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

    orig_encoded = _encode(orig_text)

    # Only the target ids are needed; they become the labels.
    label_ids = _encode(target_text)['input_ids'][0]
    # We have to make sure that the PAD token is ignored: padding positions are
    # overwritten with -100 (the value the original code uses to mark them).
    label_ids[label_ids == tokenizer.pad_token_id] = -100

    return {'input_ids': orig_encoded['input_ids'][0],
            'attention_mask': orig_encoded['attention_mask'][0],
            'labels': label_ids}

In [None]:
class TranslationDataIterator:
    """Serve tokenized translation pairs from a CSV file, loading it in chunks.

    Only ``max_load_at_once`` rows of the CSV are held in memory at a time.
    Items are handed out sequentially from the chunk that is currently loaded;
    when the chunk is exhausted a new one is read from disk.

    NOTE(review): ``__getitem__(idx)`` does not return row ``idx``. ``idx`` is
    only used as the start offset into ``row_order`` when a new chunk has to be
    loaded. With a sampler that requests random indices the chunks are therefore
    picked at random offsets - confirm this is the intended behaviour.

    Args:
        tokenizer: Tokenizer forwarded to ``preprocess_data``.
        n_examples: Number of data rows in the CSV (reported by ``__len__``).
        max_load_at_once: Maximum number of rows kept in memory.
        data_filename: Path of the CSV file; it must contain the columns named
            by the notebook-level ``orig`` and ``target`` variables.
        max_length: Sequence length forwarded to ``preprocess_data``.
        shuffle: Whether the row order is permuted (at construction and by
            ``on_epoch_end``).
    """

    def __init__(self,
                 tokenizer,
                 n_examples,
                 max_load_at_once,
                 data_filename,
                 max_length=100,
                 shuffle=True):

        self.tokenizer = tokenizer
        self.n_examples = n_examples
        self.max_load_at_once = max_load_at_once
        self.data_filename = data_filename
        self.max_length = max_length
        self.shuffle = shuffle

        # Data rows are numbered 1..n_examples; line 0 of the file is the header
        # that pd.read_csv consumes. on_epoch_end shuffles the order if requested.
        self.row_order = np.arange(1, self.n_examples+1)
        self.on_epoch_end()

        # Load first chunk of max_load_at_once examples
        self.df_curr_loaded = self._load_next_chunk(0)
        self.curr_idx_in_load = 0

    def _load_next_chunk(self, idx):
        """Read rows ``row_order[idx : idx + max_load_at_once]`` from the CSV.

        The chunk is stored in ``self.df_curr_loaded`` and also returned, so the
        assignment in ``__init__`` no longer replaces it with ``None``.
        """
        load_start = idx
        load_end = idx + self.max_load_at_once

        # Indices to skip are the ones in the (possibly shuffled) row_order
        # before and after the slice used for this chunk. row_order is turned
        # into a list first: it is still a numpy array when shuffle=False, and
        # "+" on arrays would add element-wise instead of concatenating.
        order = list(self.row_order)
        load_idx_skip = order[:load_start] + order[load_end:]
        self.df_curr_loaded = pd.read_csv(self.data_filename, skiprows=load_idx_skip)
        return self.df_curr_loaded

    def __len__(self):
        """Return the number of examples declared at construction."""
        return self.n_examples

    def __getitem__(self, idx):
        """Return the next preprocessed example of the current chunk.

        ``idx`` is used only as the chunk start offset when a reload is needed.
        """
        if self.df_curr_loaded is None or self.curr_idx_in_load >= len(self.df_curr_loaded):
            self._load_next_chunk(idx)
            self.curr_idx_in_load = 0

        # Convert just the requested row to strings instead of the whole chunk.
        text_pair = (
            self.df_curr_loaded[[f'{orig}', f'{target}']]
            .iloc[self.curr_idx_in_load]
            .to_numpy()
            .astype(str)
        )
        self.curr_idx_in_load += 1

        item_data = preprocess_data(
            text_pair,
            self.tokenizer,
            self.max_length
        )

        return item_data

    def __call__(self):
        """Yield every example once, then trigger ``on_epoch_end``."""
        for i in range(self.__len__()):
            yield self.__getitem__(i)

            if i == self.__len__()-1:
                self.on_epoch_end()

    def on_epoch_end(self):
        """Re-permute ``row_order`` when shuffling is enabled."""
        if self.shuffle:
            self.row_order = list(np.random.permutation(self.row_order))

In [None]:
# Build the chunked data iterators for the training and validation splits.

max_length = 100          # tokens per sequence after padding / truncation
max_load_at_once = 1000   # CSV rows held in memory at a time

# Settings shared by both iterators
_iterator_kwargs = dict(
    tokenizer=tokenizer,
    max_load_at_once=max_load_at_once,
    max_length=max_length,
)
# Row counts of the selected language direction
_split_sizes = data_size[f'{orig}-{target}']

train_data_generator = TranslationDataIterator(
    n_examples=_split_sizes['train'],
    data_filename=train_file,
    **_iterator_kwargs
)

valid_data_generator = TranslationDataIterator(
    n_examples=_split_sizes['val'],
    data_filename=val_file,
    **_iterator_kwargs
)

### 6 Model Training

In [None]:
# Training hyper-parameters and output location

# Per-device batch size, used for both training and evaluation
batch_size = 16

# Where the fine-tuned model is saved. Edit these paths to change the location.
# The 'min_length_*' path component is only present when min_length is set.
dir_path = (
    f'drive/MyDrive/MIDS/W266/Final_Project/bert2bert-finetuned/{orig}_{target}/max_length_100/min_length_{min_length}/ngram_{ngram_size}'
    if min_length
    else f'drive/MyDrive/MIDS/W266/Final_Project/bert2bert-finetuned/{orig}_{target}/max_length_100/ngram_{ngram_size}'
)
file_path = dir_path

args = Seq2SeqTrainingArguments(
    output_dir=file_path,
    num_train_epochs=3,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy='epoch',
    predict_with_generate=True,
)

In [None]:
# Display the resolved output directory as a sanity check before training.
file_path

'drive/MyDrive/MIDS/W266/Final_Project/bert2bert-finetuned/zh_es/max_length_100/min_length_50/ngram_3'

In [None]:
# Define metrics
def compute_metrics(pred):
    """Compute the mean BLEURT score of generated predictions.

    Args:
        pred: Evaluation output exposing ``predictions`` (generated token ids)
            and ``label_ids`` (reference token ids in which ignored positions
            are marked with -100), both as numpy arrays.

    Returns:
        dict ``{"bleurt": <mean score over all examples, rounded to 4 digits>}``.
    """
    pad_id = tokenizer.pad_token_id

    # -100 is not a valid token id, so it is swapped for the pad id before
    # decoding. np.where builds new arrays, leaving the caller's arrays intact.
    labels_ids = np.where(pred.label_ids == -100, pad_id, pred.label_ids)
    # Defensive: apply the same replacement to the predictions in case they
    # were padded with -100 as well.
    pred_ids = np.where(pred.predictions == -100, pad_id, pred.predictions)

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    # The scorer is called on lists of references/candidates; average its
    # per-example scores rather than reporting only the first one.
    bleurt_scores = bleurt_metric.score(references=label_str, candidates=pred_str)

    return {
        "bleurt" : round(float(np.mean(bleurt_scores)), 4)
    }
  

In [None]:
# Show the model's no_repeat_ngram_size before it is overridden with ngram_size.
print(bert2bert.config.no_repeat_ngram_size)

0


In [None]:
# Configure the encoder-decoder model, then build the trainer.
model_config = bert2bert.config

# Special tokens are taken from the tokenizer: [CLS] starts decoding, [SEP] ends it.
model_config.decoder_start_token_id = tokenizer.cls_token_id
model_config.eos_token_id = tokenizer.sep_token_id
model_config.pad_token_id = tokenizer.pad_token_id
model_config.vocab_size = model_config.encoder.vocab_size

# Length and repetition settings stored on the model config
model_config.max_length = 100
model_config.no_repeat_ngram_size = ngram_size
# min_length is only overridden when a (truthy) value was configured
if min_length:
    model_config.min_length = min_length

# Trainer combining the model, the training arguments, the data iterators
# and the BLEURT metric function.
trainer = Seq2SeqTrainer(
    model=bert2bert,
    args=args,
    train_dataset=train_data_generator,
    eval_dataset=valid_data_generator,
    compute_metrics=compute_metrics,
)

In [None]:
# Confirm that no_repeat_ngram_size now holds the configured ngram_size.
print(bert2bert.config.no_repeat_ngram_size)

3


In [None]:
# Confirm the min_length value stored on the model config.
print(bert2bert.config.min_length)

50


In [None]:
# Start fine-tuning with the arguments in `args` (3 epochs, evaluation after each epoch).
trainer.train()