In [None]:
!pip install transformers --quiet
!pip install sentencepiece --quiet
!pip install datasets --quiet
!pip install evaluate --quiet

!pip install git+https://github.com/google-research/bleurt.git -q
#!wget -N https://storage.googleapis.com/bleurt-oss-21/BLEURT-20.zip . -q
#!unzip -q -n BLEURT-20.zip
!wget https://storage.googleapis.com/bleurt-oss-21/BLEURT-20-D12.zip . -q
!unzip -q -n BLEURT-20-D12.zip

### 2 Import libraries

In [None]:
from datasets import load_dataset, load_metric
from transformers import BertTokenizer, TFBertModel, BertGenerationEncoder, BertGenerationDecoder, EncoderDecoderModel, GPT2Tokenizer, GPT2Model
import evaluate
import numpy as np
import tensorflow as tf
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
import pandas as pd
from csv import writer
import math

from bleurt import score

In [None]:
# Authenticate and mount Google Drive at /content/drive (Colab only).
# The data and checkpoint paths used later are relative ('drive/MyDrive/...').
# NOTE(review): they resolve against this mount only if the working directory is /content — confirm.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load BLEURT
# Directory name of the checkpoint extracted from BLEURT-20-D12.zip by the install cell.
bleurt_checkpoint = "BLEURT-20-D12"

# Scorer reused by compute_metrics during training and by the test-set evaluation loop.
bleurt_metric = score.BleurtScorer(bleurt_checkpoint)

### 3 Data Acquisition

In [None]:
# select pair languages and min_length
# `orig` / `target` pick the CSV columns, the data_size entry and the Drive sub-folder for this run.
orig = "en"
target = "zh"
# Minimum generated length; when truthy it is copied into the model config and
# appended to the checkpoint directory name.
min_length = 50

In [None]:
# Data paths: the three splits live side by side in the folder of the selected language pair.
_pair_dir = f'drive/MyDrive/MIDS/W266/Final_Project/bert2bert-finetuned/{orig}_{target}'
train_file, val_file, test_file = (
    f'{_pair_dir}/{_split}_pairs.csv' for _split in ('train', 'val', 'test')
)

In [None]:
# Number of examples per split, keyed by "<orig>-<target>".
# Both directions of a language pair have identical sizes, so each pair is
# listed once as (train, val) and registered under both key orders.
_split_sizes = {
    ('en', 'zh'): (48444, 10381),
    ('en', 'es'): (167210, 35831),
    ('es', 'zh'): (45796, 9814),
}

data_size = {}
for (_lang_a, _lang_b), (_n_train, _n_val) in _split_sizes.items():
    for _pair_key in (f'{_lang_a}-{_lang_b}', f'{_lang_b}-{_lang_a}'):
        data_size[_pair_key] = {'train': _n_train, 'val': _n_val}

### 4 Model instantiation

In [None]:
# define tokenizer and encoder/decoder
# One checkpoint name is used for the tokenizer and for both halves of the model below.
model_checkpoint = "bert-base-multilingual-uncased"
tokenizer = BertTokenizer.from_pretrained(model_checkpoint)


# define sequence to sequence model
# Encoder and decoder are each initialized from `model_checkpoint`.
bert2bert = EncoderDecoderModel.from_encoder_decoder_pretrained(model_checkpoint, model_checkpoint)

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertLMHeadModel: ['cls.seq_relationship

### 5 Data Processing

In [None]:
def preprocess_data(text_pair, tokenizer, max_length=100):
    """Tokenize one (source, target) text pair into model inputs and labels.

    Args:
        text_pair: Two-element sequence ``(orig_text, target_text)``.
        tokenizer: Tokenizer exposing ``batch_encode_plus`` and ``pad_token_id``.
        max_length: Both texts are padded/truncated to exactly this many tokens.

    Returns:
        dict with three 1-D PyTorch tensors of length ``max_length``:
        ``input_ids`` and ``attention_mask`` for the source text, and ``labels``
        holding the target token ids with every padding position set to -100.
    """
    orig_text, target_text = text_pair

    # Source and target are encoded with identical settings.
    encode_kwargs = dict(
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    orig_encoded = tokenizer.batch_encode_plus([orig_text], **encode_kwargs)
    target_encoded = tokenizer.batch_encode_plus([target_text], **encode_kwargs)

    # Each batch holds a single example; index 0 drops the batch dimension.
    label_ids = target_encoded['input_ids'][0]
    # We have to make sure that the PAD token is ignored
    label_ids[label_ids == tokenizer.pad_token_id] = -100

    return {'input_ids': orig_encoded['input_ids'][0],
            'attention_mask': orig_encoded['attention_mask'][0],
            'labels': label_ids}

In [None]:
class TranslationDataIterator:
    """Dataset that streams tokenized translation pairs from a CSV file in chunks.

    Only ``max_load_at_once`` rows are held in memory at a time. Items are served
    in the order of an internal cursor over ``row_order``: the ``idx`` passed to
    ``__getitem__`` does not select a row. One pass of ``n_examples`` calls
    therefore yields every row exactly once, whatever order the indices arrive in.

    Args:
        tokenizer: Tokenizer forwarded to ``preprocess_data``.
        n_examples: Number of data rows (header excluded) to serve per epoch.
        max_load_at_once: Maximum number of rows read from disk per chunk.
        data_filename: Path of the CSV file holding the text pairs.
        max_length: Token length forwarded to ``preprocess_data``.
        shuffle: When true, the row order is re-permuted before every epoch.
        orig_col: Source-text column; defaults to the module-level ``orig``.
        target_col: Target-text column; defaults to the module-level ``target``.

    Raises:
        ValueError: If ``max_load_at_once`` is smaller than 1.
    """

    def __init__(self,
                 tokenizer,
                 n_examples,
                 max_load_at_once,
                 data_filename,
                 max_length=100,
                 shuffle=True,
                 orig_col=None,
                 target_col=None):

        if max_load_at_once < 1:
            raise ValueError("max_load_at_once must be a positive integer")

        self.tokenizer = tokenizer
        self.n_examples = n_examples
        self.max_load_at_once = max_load_at_once
        self.data_filename = data_filename
        self.max_length = max_length
        self.shuffle = shuffle
        self.orig_col = orig_col
        self.target_col = target_col

        # File line numbers of the data rows (line 0 is the CSV header);
        # on_epoch_end shuffles them when requested.
        self.row_order = np.arange(1, self.n_examples + 1)
        self.on_epoch_end()

        # Load first chunk of max_load_at_once examples
        self.df_curr_loaded = None
        self.curr_idx_in_load = 0
        self._next_chunk_start = 0
        self._advance_chunk()

    def _load_next_chunk(self, idx):
        """Read rows ``row_order[idx : idx + max_load_at_once]`` and return them.

        The chunk is also stored in ``self.df_curr_loaded``. Rows come back in
        file order, not in ``row_order`` order.
        """
        load_start = idx
        load_end = idx + self.max_load_at_once

        # row_order may be an ndarray (no shuffle) or a list (after shuffling);
        # normalizing avoids ndarray "+" being treated as element-wise addition.
        order = np.asarray(self.row_order)
        keep_count = len(order[load_start:load_end])

        # Indices to skip are the ones in row_order before and after this chunk.
        load_idx_skip = np.concatenate((order[:load_start], order[load_end:])).tolist()

        # nrows stops parsing once the chunk is complete, so file rows beyond
        # n_examples are never appended to it.
        self.df_curr_loaded = pd.read_csv(
            self.data_filename,
            skiprows=load_idx_skip,
            nrows=keep_count
        )
        return self.df_curr_loaded

    def _advance_chunk(self):
        """Load the chunk at the cursor, wrapping (and reshuffling) after a full pass."""
        if self._next_chunk_start >= self.n_examples:
            self.on_epoch_end()
            self._next_chunk_start = 0

        self._load_next_chunk(self._next_chunk_start)
        self._next_chunk_start += self.max_load_at_once
        self.curr_idx_in_load = 0

    def __len__(self):
        return self.n_examples

    def __getitem__(self, idx):
        """Return the next preprocessed example; ``idx`` does not select the row.

        Raises:
            IndexError: If the freshly loaded chunk contains no rows.
        """
        if self.df_curr_loaded is None or self.curr_idx_in_load >= len(self.df_curr_loaded):
            self._advance_chunk()
            if len(self.df_curr_loaded) == 0:
                raise IndexError(
                    f"no rows could be loaded from {self.data_filename}; "
                    "check that n_examples matches the number of data rows"
                )

        src_col = self.orig_col if self.orig_col is not None else orig
        tgt_col = self.target_col if self.target_col is not None else target

        # Convert only the requested row to str rather than the whole chunk.
        row = self.df_curr_loaded.iloc[self.curr_idx_in_load]
        text_pair = (str(row[src_col]), str(row[tgt_col]))
        self.curr_idx_in_load += 1

        return preprocess_data(
            text_pair,
            self.tokenizer,
            self.max_length
        )

    def __call__(self):
        """Yield one epoch of examples.

        The reshuffle for the next epoch happens when the cursor wraps on the
        following access.
        """
        for i in range(len(self)):
            yield self[i]

    def on_epoch_end(self):
        """Re-permute the row order when shuffling is enabled."""
        if self.shuffle:
            self.row_order = list(np.random.permutation(self.row_order))

In [None]:
# Build the streaming datasets for the training and validation splits.

max_length = 100          # token length every example is padded/truncated to
max_load_at_once = 1000   # rows held in memory per chunk

_pair_sizes = data_size[f'{orig}-{target}']
_shared_iterator_kwargs = dict(
    tokenizer=tokenizer,
    max_load_at_once=max_load_at_once,
    max_length=max_length,
)

train_data_generator = TranslationDataIterator(
    n_examples=_pair_sizes['train'],
    data_filename=train_file,
    **_shared_iterator_kwargs
)

valid_data_generator = TranslationDataIterator(
    n_examples=_pair_sizes['val'],
    data_filename=val_file,
    **_shared_iterator_kwargs
)

### 6 Model Training

In [None]:
# Batch size used per device for both training and evaluation.
batch_size = 16

# Where checkpoints are written after fine-tuning (modify to relocate them).
# Runs with a minimum generation length get their own sub-folder.
dir_path = f'drive/MyDrive/MIDS/W266/Final_Project/bert2bert-finetuned/{orig}_{target}/max_length_100'
if min_length:
    dir_path = f'{dir_path}/min_length_{min_length}'
file_path = dir_path

_training_options = {
    'predict_with_generate': True,
    'evaluation_strategy': 'epoch',
    'per_device_train_batch_size': batch_size,
    'per_device_eval_batch_size': batch_size,
    'num_train_epochs': 3,
}
args = Seq2SeqTrainingArguments(file_path, **_training_options)

In [None]:
# Define metrics
def compute_metrics(pred):
    """Compute the mean BLEURT score of generated translations against references.

    Args:
        pred: Prediction object exposing ``predictions`` (generated token ids)
            and ``label_ids`` (reference token ids, with -100 at ignored positions).

    Returns:
        dict ``{"bleurt": score}``: the BLEURT score averaged over all evaluated
        examples and rounded to 4 decimals (NaN when there are no examples).
    """
    pad_id = tokenizer.pad_token_id

    # -100 marks ignored positions and is not a real token id; map it back to
    # PAD before decoding. np.where builds new arrays, so `pred` is left untouched.
    label_ids = np.asarray(pred.label_ids)
    label_ids = np.where(label_ids == -100, pad_id, label_ids)
    pred_ids = np.asarray(pred.predictions)
    pred_ids = np.where(pred_ids == -100, pad_id, pred_ids)

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # The scorer returns one value per (reference, candidate) pair; report their
    # average instead of the first example's score only.
    bleurt_scores = bleurt_metric.score(references=label_str, candidates=pred_str)
    mean_score = float(np.mean(bleurt_scores)) if len(bleurt_scores) else float('nan')

    return {
        "bleurt": round(mean_score, 4)
    }
  

In [None]:
# Special tokens: generation starts from the tokenizer's CLS token, stops at its
# SEP token and pads with its PAD token; the vocabulary size is taken from the encoder.
_model_config = bert2bert.config
_model_config.decoder_start_token_id = tokenizer.cls_token_id
_model_config.eos_token_id = tokenizer.sep_token_id
_model_config.pad_token_id = tokenizer.pad_token_id
_model_config.vocab_size = _model_config.encoder.vocab_size

# Generation length limits
_model_config.max_length = 100

# Only override the minimum length when one was requested.
if min_length:
    _model_config.min_length = min_length

print(bert2bert.config.min_length)

# Trainer wiring: model, training arguments, metric function and the two datasets.
trainer = Seq2SeqTrainer(
    model=bert2bert,
    args=args,
    train_dataset=train_data_generator,
    eval_dataset=valid_data_generator,
    compute_metrics=compute_metrics,
)

50


In [None]:
# Fine-tune bert2bert; the training arguments request an evaluation pass
# (validation loss and BLEURT metric) at the end of every epoch.
trainer.train()

***** Running training *****
  Num examples = 48444
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 9084
  Number of trainable parameters = 363187095


Epoch,Training Loss,Validation Loss,Bleurt
1,2.912,2.829103,0.2576
2,2.3711,2.462857,0.4469
3,2.3113,2.344123,0.4262


Saving model checkpoint to drive/MyDrive/MIDS/W266/Final_Project/bert2bert-finetuned/en_zh/max_length_100/min_length_50/checkpoint-500
Configuration saved in drive/MyDrive/MIDS/W266/Final_Project/bert2bert-finetuned/en_zh/max_length_100/min_length_50/checkpoint-500/config.json
Model weights saved in drive/MyDrive/MIDS/W266/Final_Project/bert2bert-finetuned/en_zh/max_length_100/min_length_50/checkpoint-500/pytorch_model.bin
Saving model checkpoint to drive/MyDrive/MIDS/W266/Final_Project/bert2bert-finetuned/en_zh/max_length_100/min_length_50/checkpoint-1000
Configuration saved in drive/MyDrive/MIDS/W266/Final_Project/bert2bert-finetuned/en_zh/max_length_100/min_length_50/checkpoint-1000/config.json
Model weights saved in drive/MyDrive/MIDS/W266/Final_Project/bert2bert-finetuned/en_zh/max_length_100/min_length_50/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to drive/MyDrive/MIDS/W266/Final_Project/bert2bert-finetuned/en_zh/max_length_100/min_length_50/checkpoint-1500
Configu

TrainOutput(global_step=9084, training_loss=2.745795566990532, metrics={'train_runtime': 17661.538, 'train_samples_per_second': 8.229, 'train_steps_per_second': 0.514, 'total_flos': 1.74195615644424e+16, 'train_loss': 2.745795566990532, 'epoch': 3.0})

### 7 Evaluation

In [None]:
# Keep only the source and target text columns of the held-out split,
# then expose each one as a NumPy array of strings.
test_df = pd.read_csv(test_file)[[orig, target]]
test_orig, test_labels = (
    test_df[_col].values.astype(str) for _col in (orig, target)
)

In [None]:
# Inspect the reference translations (NumPy abbreviates the repr of a long array).
test_labels

array(['股市过热恰逢人口趋势不利于退休基金。比如，在德国，20%的人口年龄超过65岁，工作年龄成年人将从今天的5,000万左右下降到2060年的3,400万。而在你新兴市场，预期寿命的快速增长和生育率的下降可能让中国60岁以上人口比例在2050年翻一番——这意味着五亿不事生产的退休者需要支持。',
       '然而这么一个世界意味着以色列无法再利用对纳粹大屠杀的愧疚感来影响那些主要势力。那是一个一神论宗教之间的敌意逐渐被多元信仰的汪洋大海所稀释的世界，而在那些仅仅以自身国家利益作为评判标准的，多疑且现实的大国眼中，以色列只能依靠其比较竞争优势来赢得青睐。',
       '对美国冠军费舍尔来说，这场对局是他从以神童身份出道以来二十年追求头衔生涯的高潮。对一个超级巨星来说（他甚至经常出现在主要刊物的封面上），费舍尔的生活可谓穷困潦倒，如今，他终于坐在了价值250,000美元的对局边上。当然，这与1971年阿里和弗雷泽（Frazier）拳王争霸战双方都能保证获得的250万美元来说只是九牛一毛。但费舍尔知道，在美国文化中，一切不产生大钱的运动项目都会被边缘化，因此他将这六位数的现金奖金视为国际象棋运动取得进展的终极标志。',
       ...,
       '在新年来临之际，瑞典通过小步快跑创造了历史，在刚刚结束半年轮换的的外交安全欧盟主席之后，我们将这个职位交给了欧盟新的固定体系 —— 里斯本条约的框架下在布鲁塞尔建立了的体系。',
       '像以色列这样缺少强大的人口基础和有利地缘政治条件的小国不可能永久维持占领区，塔尔蒙说。因此，以色列的危险在于徒劳地试图征服巴勒斯坦人。“领导人瞎了眼，看不见前方等待我们的是种族战争。”他写道。',
       '例如，联合国难民署帮助解决难民问题；世界粮食计划署为营养不良的儿童提供帮助；而世界卫生组织则支持公共卫生信息系统。这些系统对应对来自于禽流感等流行性疾病的威胁至关重要。联合国没有资源解决如艾滋病或全球气候变化等新问题，但它在敦促各国政府采取行动方面可以起到重要的召集者的作用。'],
      dtype='<U471')

In [None]:
# Load the saved fine-tuned model from a Trainer checkpoint directory.
# NOTE(review): the training cell writes checkpoints under
# '<...>/{orig}_{target}/max_length_100[/min_length_<n>]/checkpoint-*', while this
# cell loads '<...>/{orig}_{target}/checkpoint-9000'. Confirm this is the run
# that should be evaluated.
# This cell also rebinds dir_path / file_path from the training cell.
dir_path = f'drive/MyDrive/MIDS/W266/Final_Project/bert2bert-finetuned/{orig}_{target}'
file_path = f'{dir_path}'
bert2bert_saved = EncoderDecoderModel.from_pretrained(file_path + '/checkpoint-9000')

In [None]:
# Test-set evaluation is done in batches of `num_examples` source sentences.
num_examples = 100
# Bounds of the current batch window (start inclusive, end exclusive).
start_index = 0
end_index = num_examples
test_size = len(test_orig)
# The last batch may be smaller, hence the ceiling.
num_batches = math.ceil(test_size/num_examples)
# CSV file that receives the BLEURT scores of each batch.
test_bleurt_scores_file = f'drive/MyDrive/MIDS/W266/Final_Project/bert2bert-finetuned/{orig}_{target}/test_bleurt_scores.csv'


In [None]:
# Score the whole test set batch by batch, appending one CSV row of BLEURT
# scores per batch. newline='' is the file mode the csv module expects.
with open(test_bleurt_scores_file, 'a', newline='') as f_object:
    writer_object = writer(f_object)

    # The batch bounds are derived from the loop itself, so re-running this cell
    # always starts from the first example instead of continuing from stale indices.
    for start_index in range(0, test_size, num_examples):
        end_index = min(start_index + num_examples, test_size)
        batch_sources = [str(text) for text in test_orig[start_index:end_index]]
        batch_references = [str(text) for text in test_labels[start_index:end_index]]

        # Get predictions. The batch is padded to a common length, so the
        # attention mask is passed along to keep the encoder from attending
        # to padding tokens.
        test_input_ids = tokenizer.batch_encode_plus(
            batch_sources,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=100
        )
        test_output_token_ids = bert2bert_saved.generate(
            test_input_ids.input_ids,
            attention_mask=test_input_ids.attention_mask
        )
        test_decoded = tokenizer.batch_decode(
            test_output_token_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )

        # Compute Bleurt scores (one per reference/candidate pair)
        bleurt_scores = bleurt_metric.score(references=batch_references, candidates=test_decoded)

        # One CSV row per batch
        writer_object.writerow(bleurt_scores)




In [None]:
# Generate translations for the first five test sentences as a quick sanity check.
test_input_ids = tokenizer.batch_encode_plus(
    [str(text) for text in test_orig[:5]],
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=50
)
# The inputs are padded to a common length; the attention mask keeps the
# encoder from attending to the padding tokens.
test_output_token_ids = bert2bert_saved.generate(
    test_input_ids.input_ids,
    attention_mask=test_input_ids.attention_mask
)
# Decoded with special tokens kept, to inspect the raw model output.
test_decoded = tokenizer.batch_decode(test_output_token_ids)


In [None]:
# Readable version of the sample outputs: special tokens removed, tokenization
# spaces left as produced. (`max_length` is not a decoding option and was dropped.)
tokenizer.batch_decode(
    test_output_token_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False
)

In [None]:
# Raw generated token ids for the five sample sentences.
test_output_token_ids

In [None]:
# Decoded sample outputs, special tokens included.
test_decoded