<a href="https://colab.research.google.com/github/leonrafael29/W266_Final_Project/blob/main/mBART/Teensy_Tuner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Mount Google Drive so that files can be saved to and loaded from it.

In [None]:
# Colab-only setup: mount Google Drive so data, checkpoints and the BLEURT
# checkpoint can be read from / written to persistent storage.
# NOTE(review): `files` is imported but not used in the visible notebook.
from google.colab import files, drive
# force_remount=True re-mounts even when the drive is already mounted.
drive.mount('/content/gdrive/', force_remount=True)
# Relative cd; the recorded output shows it lands in /content/gdrive/MyDrive.
# All relative 'Mbart/...' paths used later resolve against this directory.
%cd gdrive/MyDrive

Mounted at /content/gdrive/
/content/gdrive/MyDrive


Install requirements

In [None]:
# Install the third-party packages imported below (-q keeps pip quiet).
# NOTE(review): versions are not pinned, so a fresh runtime may resolve
# different releases than the ones this notebook was developed against.
!pip install sentencepiece -q
!pip install transformers -q
!pip install datasets -q
# BLEURT is installed straight from its GitHub repository.
!pip install git+https://github.com/google-research/bleurt.git -q

# Optional: download the full BLEURT-20 checkpoint (currently disabled).
# !wget -N https://storage.googleapis.com/bleurt-oss-21/BLEURT-20.zip . -q
# !unzip -q -n BLEURT-20.zip

#Downloads the 3-layer distilled model, which is much smaller.
# NOTE(review): BLEURT_CHECKPOINT points at ./BLEURT-20-D3, so these two lines
# must have been run at least once in the working directory — confirm.
# !wget https://storage.googleapis.com/bleurt-oss-21/BLEURT-20-D3.zip .
# !unzip BLEURT-20-D3.zip

[K     |████████████████████████████████| 1.3 MB 32.5 MB/s 
[K     |████████████████████████████████| 5.5 MB 29.1 MB/s 
[K     |████████████████████████████████| 7.6 MB 86.0 MB/s 
[K     |████████████████████████████████| 182 kB 77.3 MB/s 
[K     |████████████████████████████████| 451 kB 32.2 MB/s 
[K     |████████████████████████████████| 212 kB 101.3 MB/s 
[K     |████████████████████████████████| 115 kB 93.6 MB/s 
[K     |████████████████████████████████| 127 kB 90.7 MB/s 
[K     |████████████████████████████████| 352 kB 32.7 MB/s 
[?25h  Building wheel for BLEURT (setup.py) ... [?25l[?25hdone


Imports

In [None]:
import csv
import numpy as np
import pandas as pd
import torch
from bleurt import score
from datasets import load_dataset
from transformers import MBartForConditionalGeneration, \
    MBart50TokenizerFast, MBartConfig,\
    TrainingArguments, Trainer
    

Global variables

In [None]:
# Hub id of the pretrained mBART-50 many-to-many model; used below to load the
# config, the weights and the tokenizer.
ORIGINAL_MODEL_CHECKPOINT = 'facebook/mbart-large-50-many-to-many-mmt'
# NOTE(review): not referenced anywhere else in the visible notebook; note the
# 'Tiny' directory differs from the 'Teensy' output directory used for training.
MODEL_CHECKPOINT = 'Mbart/Model/Tiny/checkpoint-1000'

# Translation directions written as '<src>-<tgt>'. The training loop iterates
# over this list in order and looks each entry up in MBART_DATA.
PAIRS = [
    'en-zh',
    'zh-en',
    'en-es',
    'es-zh',
    'es-en',
    'zh-es',
    ]
# mBART-50 language-code token for each two-letter language id used here.
_LANG_TOKENS = {'en': 'en_XX', 'zh': 'zh_CN', 'es': 'es_XX'}

# One record per parallel corpus on disk. 'train' and 'val' are passed to the
# data iterators as their number of examples; 'size' is presumably the total
# row count of the corpus (verify against the CSV files).
_CORPUS_STATS = {
    'en-zh': {'size': 69020, 'train': 48444, 'val': 10381},
    'en-es': {'size': 238511, 'train': 167210, 'val': 35831},
    'es-zh': {'size': 65408, 'train': 45796, 'val': 9814},
}


def _build_pair_entry(src, tgt):
    """Return the MBART_DATA record for translating `src` -> `tgt`.

    A direction whose own '<src>-<tgt>' corpus does not exist reuses the
    files of the opposite direction and is flagged with 'reverse': True.

    Raises:
        KeyError: if neither direction has a corpus, or a language id has
            no entry in _LANG_TOKENS.
    """
    forward = f'{src}-{tgt}'
    reverse = forward not in _CORPUS_STATS
    corpus = f'{tgt}-{src}' if reverse else forward
    stats = _CORPUS_STATS[corpus]
    return {
        'size': stats['size'],
        'train': stats['train'],
        'val': stats['val'],
        'src': src,
        'tgt': tgt,
        'src_tkn': _LANG_TOKENS[src],
        'tgt_tkn': _LANG_TOKENS[tgt],
        # 'tkn' duplicates the target-language token.
        'tkn': _LANG_TOKENS[tgt],
        'reverse': reverse,
        'train_path': f'Mbart/Data/{corpus}-train_pairs.csv',
        'val_path': f'Mbart/Data/{corpus}-val_pairs.csv',
        'test_path': f'Mbart/Data/{corpus}-test_pairs.csv',
    }


# Per-direction metadata keyed by '<src>-<tgt>'. Built from the tables above so
# that a direction and its reverse can never drift apart.
MBART_DATA = {
    f'{src_lang}-{tgt_lang}': _build_pair_entry(src_lang, tgt_lang)
    for src_lang, tgt_lang in (
        ('en', 'zh'),
        ('zh', 'en'),
        ('en', 'es'),
        ('es', 'zh'),
        ('es', 'en'),
        ('zh', 'es'),
    )
}

# NOTE(review): DATASET, MAX_NEW_TOKENS, PADDING and N_EXAMPLES are not
# referenced by any cell in the visible notebook.
DATASET = 'news_commentary'
# Token length every tokenized source/target is padded or truncated to by the
# preprocessors.
MAX_LENGTH = 50
MAX_NEW_TOKENS = 50
# Forwarded to the tokenizer as `truncation=`.
TRUNCATION = True
PADDING = True
# Forwarded to the tokenizer as `return_tensors=` ('pt' = PyTorch tensors).
RETURN_TENSORS = 'pt'
# Directory of the BLEURT checkpoint loaded by the scorer, relative to the
# current working directory.
BLEURT_CHECKPOINT = './BLEURT-20-D3'
N_EXAMPLES = 100

# Sets a CUDA caching-allocator option for PyTorch via the environment.
# NOTE(review): presumably this must be set before the first CUDA allocation
# to take effect — confirm.
%env PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512

env: PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512


Load Model, Metrics and Tokenizer

In [None]:
# Start from the published mBART-50 configuration and shrink it to a
# one-layer encoder/decoder with smaller feed-forward blocks and fewer heads.
config = MBartConfig.from_pretrained(ORIGINAL_MODEL_CHECKPOINT)

# Overrides are applied in this order, one attribute at a time.
for field_name, field_value in (
    ('encoder_layers', 1),
    ('decoder_layers', 1),
    ('num_hidden_layers', 1),
    ('decoder_ffn_dim', 2048),
    ('encoder_ffn_dim', 2048),
    ('encoder_attention_heads', 8),
    ('decoder_attention_heads', 8),
):
    setattr(config, field_name, field_value)

# Display the resulting configuration as the cell output.
config

MBartConfig {
  "_name_or_path": "/home/suraj/projects/mbart-50/hf_models/mbart-50-large-many-to-many/",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "relu",
  "add_bias_logits": false,
  "add_final_layer_norm": true,
  "architectures": [
    "MBartForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 8,
  "decoder_ffn_dim": 2048,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 1,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 8,
  "encoder_ffn_dim": 2048,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 1,
  "eos_token_id": 2,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LA

In [None]:
# Load the BLEURT scorer, the shrunken mBART model and the tokenizer; these
# become notebook-level globals used by later cells.

# BLEURT scorer built from the checkpoint directory in BLEURT_CHECKPOINT.
bleurt_metric = score.LengthBatchingBleurtScorer(BLEURT_CHECKPOINT)
# Instantiate the model with the reduced `config` while loading weights from
# the original checkpoint. The recorded output lists the checkpoint weights
# that were not used. ignore_mismatched_sizes=True is passed so loading does
# not fail on tensors whose shape differs from the new config (presumably the
# feed-forward matrices, since the ffn dims were changed — verify).
model = MBartForConditionalGeneration.from_pretrained(ORIGINAL_MODEL_CHECKPOINT, config=config, ignore_mismatched_sizes=True)
# Tokenizer of the original checkpoint, unchanged.
tokenizer = MBart50TokenizerFast.from_pretrained(ORIGINAL_MODEL_CHECKPOINT)
# Disabled alternative: restore a whole pickled model saved by torch.save.
#model = torch.load('Mbart/Model/epoch-1')

Some weights of the model checkpoint at facebook/mbart-large-50-many-to-many-mmt were not used when initializing MBartForConditionalGeneration: ['model.encoder.layers.5.fc2.weight', 'model.encoder.layers.9.self_attn.k_proj.bias', 'model.decoder.layers.10.self_attn_layer_norm.bias', 'model.decoder.layers.10.self_attn.out_proj.bias', 'model.encoder.layers.8.fc2.weight', 'model.encoder.layers.2.final_layer_norm.bias', 'model.decoder.layers.6.self_attn.q_proj.weight', 'model.decoder.layers.9.fc2.weight', 'model.decoder.layers.7.encoder_attn.q_proj.weight', 'model.decoder.layers.3.self_attn.q_proj.weight', 'model.decoder.layers.9.encoder_attn.q_proj.weight', 'model.encoder.layers.11.fc1.weight', 'model.decoder.layers.5.encoder_attn.v_proj.bias', 'model.encoder.layers.10.fc2.bias', 'model.decoder.layers.11.self_attn.v_proj.weight', 'model.encoder.layers.4.final_layer_norm.weight', 'model.decoder.layers.10.encoder_attn.k_proj.bias', 'model.decoder.layers.4.self_attn.out_proj.bias', 'model.enc

In [None]:
def _plain_text(data):
  """Return `data` as a plain string.

  The data iterators pass one-element numpy string arrays. Rendering those
  with np.array2string yields the array repr ("['some text']"), which put
  brackets and quotes into every tokenized sentence. Here the element(s)
  are converted with str() instead; several elements are joined by a space.
  """
  if isinstance(data, str):
    return data
  flat = np.asarray(data).ravel()
  return ' '.join(str(item) for item in flat)


def supervised_preprocessor(src_data, tgt_data, tokenizer, src_tkn, tgt_tkn):
  """Tokenize one source/target pair into a training example.

  Args:
    src_data: source sentence, as a str or a one-element string array.
    tgt_data: target sentence, same accepted types as `src_data`.
    tokenizer: mBART tokenizer; its `src_lang`/`tgt_lang` are overwritten.
    src_tkn: source language code assigned to `tokenizer.src_lang`.
    tgt_tkn: target language code assigned to `tokenizer.tgt_lang`.

  Returns:
    Dict with 'input_ids', 'attention_mask' and 'labels', each the first
    (only) row of the tokenizer output, padded/truncated to MAX_LENGTH.
    Padded label positions are set to -100 so the loss ignores them.
  """
  tokenizer.src_lang = src_tkn
  tokenizer.tgt_lang = tgt_tkn

  encoded = tokenizer(
        text=[_plain_text(src_data)],
        text_target=[_plain_text(tgt_data)],
        max_length=MAX_LENGTH,
        padding='max_length',
        truncation=TRUNCATION,
        return_tensors=RETURN_TENSORS,
        )

  labels = encoded.labels[0]
  pad_id = getattr(tokenizer, 'pad_token_id', None)
  # Mask padding in the labels; -100 is the index the HF loss skips.
  if pad_id is not None and torch.is_tensor(labels):
    labels = labels.masked_fill(labels == pad_id, -100)

  return {'input_ids':encoded.input_ids[0],
           'attention_mask':encoded.attention_mask[0],
           'labels':labels,
          }

def eval_preprocessor(src_data, tgt_data, tokenizer, src_tkn, tgt_tkn):
  """Tokenize one source/target pair for generation-time inspection.

  Args:
    src_data: source sentence, as a str or a one-element string array.
    tgt_data: target sentence, same accepted types as `src_data`.
    tokenizer: mBART tokenizer; its `src_lang`/`tgt_lang` are overwritten.
    src_tkn: source language code assigned to `tokenizer.src_lang`.
    tgt_tkn: target language code; also looked up in `lang_code_to_id`.

  Returns:
    {'inputs': {'input_ids', 'attention_mask', 'decoder_input_ids'},
     'labels': ...}. 'input_ids', 'attention_mask' and 'labels' keep their
    batch dimension and are placed on the GPU when one is available,
    otherwise on the CPU.
  """
  tokenizer.src_lang = src_tkn
  tokenizer.tgt_lang = tgt_tkn
  # Fall back to the CPU instead of crashing on runtimes without CUDA.
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  src_tkns = tokenizer(
        text=[_plain_text(src_data)],
        max_length=MAX_LENGTH,
        padding='max_length',
        truncation=TRUNCATION,
        return_tensors=RETURN_TENSORS,
        )

  tgt_tkns = tokenizer(
        text_target=[_plain_text(tgt_data)],
        max_length=MAX_LENGTH,
        padding='max_length',
        truncation=TRUNCATION,
        return_tensors=RETURN_TENSORS,
        return_attention_mask=False,
        )

  # NOTE(review): this is a 0-d tensor that stays on the CPU, exactly as
  # before; it would need a batch dimension and the right device before being
  # passed to the model as decoder_input_ids.
  tgt_tkn_id = torch.tensor(tokenizer.lang_code_to_id[tgt_tkn])
  return {'inputs' : {'input_ids':src_tkns.input_ids.to(device),
                      'attention_mask':src_tkns.attention_mask.to(device),
                      'decoder_input_ids':tgt_tkn_id},
          'labels' : tgt_tkns.input_ids.to(device),}

In [None]:
 class TranslationDataIterator:
    """Stream tokenized training examples from a CSV of sentence pairs.

    Only `max_load_at_once` rows are held in memory at a time. Rows are
    served sequentially from the currently loaded chunk: the `idx` passed to
    `__getitem__` does NOT select a row, it only decides where in
    `row_order` the next chunk starts once the current one is used up.

    Args:
        tokenizer: tokenizer handed to `supervised_preprocessor`.
        n_examples: number of data rows assumed to exist in the CSV.
        max_load_at_once: number of rows read per chunk.
        data_filename: path of the CSV file; it must contain one column named
            after `src` and one named after `tgt`.
        src, tgt: column names of the source and target text.
        src_tkn, tgt_tkn: language codes handed to the preprocessor.
        max_length: stored on the instance; the visible code does not read it
            (the preprocessors use the MAX_LENGTH global).
        shuffle: if True, `row_order` is permuted at construction and at the
            end of every pass made through `__call__`.
    """

    def __init__(self,
                 tokenizer,
                 n_examples,
                 max_load_at_once,
                 data_filename,
                 src,
                 tgt,
                 src_tkn,
                 tgt_tkn,
                 max_length=MAX_LENGTH,
                 shuffle=True):

        self.tokenizer = tokenizer
        self.n_examples = n_examples
        self.max_load_at_once = max_load_at_once
        self.data_filename = data_filename
        self.src = src
        self.tgt = tgt
        self.src_tkn = src_tkn
        self.tgt_tkn = tgt_tkn
        self.max_length = max_length
        self.shuffle = shuffle

        # File row numbers of the data rows (row 0 is the CSV header).
        self.row_order = np.arange(1, self.n_examples+1)
        self.on_epoch_end()

        # Load the first chunk and keep it (it used to be discarded because
        # the loader returned None and the result was assigned back).
        self.df_curr_loaded = None
        self.curr_idx_in_load = 0
        self._load_next_chunk(0)

    def _load_next_chunk(self, idx):
        """Read rows `row_order[idx : idx + max_load_at_once]` from the CSV.

        Stores the frame in `self.df_curr_loaded` and also returns it. The
        rows come back in file order, not in `row_order` order.
        """
        order = np.asarray(self.row_order)
        load_start = idx
        load_end = idx + self.max_load_at_once

        # Skip every row outside the chunk. Concatenate explicitly: `+` on
        # numpy arrays (the shuffle=False case) would add element-wise.
        load_idx_skip = np.concatenate([order[:load_start], order[load_end:]]).tolist()
        self.df_curr_loaded = pd.read_csv(self.data_filename, skiprows=load_idx_skip)
        return self.df_curr_loaded

    def __len__(self):
        """Return the configured number of examples."""
        return self.n_examples

    def __getitem__(self, idx):
        """Return the next tokenized example of the current chunk.

        Raises:
            IndexError: if the chunk loaded for `idx` contains no rows.
        """
        if self.df_curr_loaded is None or self.curr_idx_in_load >= len(self.df_curr_loaded):
            self._load_next_chunk(idx)
            self.curr_idx_in_load = 0

        if len(self.df_curr_loaded) == 0:
            raise IndexError(
                f"No rows loaded from {self.data_filename!r} for index {idx}; "
                f"n_examples={self.n_examples} may exceed the rows in the file."
            )

        # Use the columns configured on the instance rather than whatever
        # notebook globals named `src`/`tgt` happen to exist.
        row = self.curr_idx_in_load
        src_data = self.df_curr_loaded[[self.src]].values.astype(str)[row]
        tgt_data = self.df_curr_loaded[[self.tgt]].values.astype(str)[row]
        self.curr_idx_in_load += 1

        return supervised_preprocessor(
            src_data,
            tgt_data,
            self.tokenizer,
            self.src_tkn,
            self.tgt_tkn,
        )

    def __call__(self):
        """Yield one example per index, then reshuffle for the next pass."""
        total = len(self)
        for i in range(total):
            yield self[i]
        if total:
            self.on_epoch_end()

    def on_epoch_end(self):
        """Permute `row_order` when shuffling is enabled."""
        if self.shuffle:
            self.row_order = list(np.random.permutation(self.row_order))

In [None]:
# class CustomTrainer(Trainer):
#   def compute_loss(self,model,inputs, return_outputs=False):
#     labels=inputs.get("labels")
#     outputs = model(**inputs, decoder_start_token_id=output_tkn)
#     logits = outputs.get("logits")
#     loss_fct = nn.CrossEntropyLoss()
#     loss = loss_fct(logits.view(-1,self.model.config.vocab_size), labels.view(-1))
#     return (loss, outputs) if return_outputs else loss

def compute_metrics(eval_prediction):
  """Score decoded predictions against decoded labels with BLEURT.

  Args:
    eval_prediction: object with `predictions` and `label_ids` attributes.
      `predictions` may be token ids, logits (a trailing vocabulary axis) or
      a tuple whose first element is one of those.

  Returns:
    {"bleurt": mean BLEURT score as a float} (NaN when there are no rows).
  """
  label_ids = np.asarray(eval_prediction.label_ids)
  generated_ids = eval_prediction.predictions
  if isinstance(generated_ids, tuple):
    generated_ids = generated_ids[0]
  generated_ids = np.asarray(generated_ids)
  # A 3-D array is taken to be per-token logits; reduce it to token ids.
  if generated_ids.ndim == 3:
    generated_ids = generated_ids.argmax(axis=-1)

  # -100 marks ignored label positions and is not a valid token id.
  pad_id = tokenizer.pad_token_id
  label_ids = np.where(label_ids == -100, pad_id, label_ids)
  generated_ids = np.where(generated_ids == -100, pad_id, generated_ids)

  generated = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
  label = tokenizer.batch_decode(label_ids, skip_special_tokens=True)
  # `score` is the bleurt module and is not callable; the scorer instance
  # created at load time is what exposes .score(...).
  bleurt_scores = bleurt_metric.score(references=label, candidates=generated)

  mean_score = float(np.mean(bleurt_scores)) if len(bleurt_scores) else float('nan')
  return {
      "bleurt" : mean_score
  }


In [None]:
 class PredictIterator(TranslationDataIterator):
    """TranslationDataIterator variant that yields generation-ready examples.

    Construction, chunk loading, length, iteration and shuffling are all
    inherited from TranslationDataIterator (they were verbatim copies). Only
    item retrieval differs: rows go through `eval_preprocessor` instead of
    `supervised_preprocessor`.
    """

    def __getitem__(self, idx):
        """Return the next example of the current chunk, tokenized for eval.

        As in the parent class, `idx` only decides where the next chunk
        starts when the current one is exhausted; rows are served in sequence.

        Raises:
            IndexError: if the chunk loaded for `idx` contains no rows.
        """
        if self.df_curr_loaded is None or self.curr_idx_in_load >= len(self.df_curr_loaded):
            self._load_next_chunk(idx)
            self.curr_idx_in_load = 0

        if len(self.df_curr_loaded) == 0:
            raise IndexError(
                f"No rows loaded from {self.data_filename!r} for index {idx}; "
                f"n_examples={self.n_examples} may exceed the rows in the file."
            )

        # Read the columns configured on the instance, not the notebook
        # globals `src`/`tgt`.
        row = self.curr_idx_in_load
        src_data = self.df_curr_loaded[[self.src]].values.astype(str)[row]
        tgt_data = self.df_curr_loaded[[self.tgt]].values.astype(str)[row]
        self.curr_idx_in_load += 1

        return eval_preprocessor(
            src_data,
            tgt_data,
            self.tokenizer,
            self.src_tkn,
            self.tgt_tkn,
        )

In [None]:
# Directory (relative to the working directory) where the Trainer writes
# checkpoints, and the per-device batch size for training and evaluation.
MODEL_PATH = f"Mbart/Model/Teensy"
BATCH_SIZE = 32

# One epoch, no evaluation during training, incomplete final batch dropped.
# NOTE(review): resume_from_checkpoint set here is only stored on the
# arguments object; per the Transformers docs the Trainer does not act on it
# unless it is also passed to Trainer.train(...) — confirm the intent.
TRAINER_PARAMS = TrainingArguments(
    output_dir=MODEL_PATH,
    num_train_epochs=1,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    evaluation_strategy='no',
    dataloader_drop_last=True,
    resume_from_checkpoint=True,
)

In [None]:
# Pickle the whole model object (not just its state_dict) to the working
# directory. In notebook order this cell comes before the training loop.
# NOTE(review): assumes the 'Mbart/Model/Teensy' directory already exists.
torch.save(model,'Mbart/Model/Teensy/base2')

In [None]:
# Build the data iterators for every translation direction and fine-tune the
# shared `model` on each direction in turn with the Hugging Face Trainer.

max_length = MAX_LENGTH
max_load_at_once = 200  # CSV rows held in memory per chunk

# One training run per direction; the same `model` object is reused, so each
# run continues from the weights left by the previous one.
for pair in PAIRS:
    pair_cfg = MBART_DATA[pair]
    train_len, val_len = pair_cfg["train"], pair_cfg["val"]
    src, tgt = pair_cfg["src"], pair_cfg["tgt"]
    src_tkn, tgt_tkn = pair_cfg["src_tkn"], pair_cfg["tgt_tkn"]
    train_file, val_file = pair_cfg["train_path"], pair_cfg["val_path"]

    print(f"Training translation {pair} mini model")

    # Arguments shared by all three iterators of this direction.
    shared_kwargs = dict(
        tokenizer=tokenizer,
        max_load_at_once=max_load_at_once,
        max_length=max_length,
        src=src,
        tgt=tgt,
        src_tkn=src_tkn,
        tgt_tkn=tgt_tkn,
    )

    train_data_generator = TranslationDataIterator(
        n_examples=train_len,
        data_filename=train_file,
        **shared_kwargs,
    )

    # Built but not handed to the Trainer below (eval_dataset is None).
    valid_data_generator = TranslationDataIterator(
        n_examples=val_len,
        data_filename=val_file,
        **shared_kwargs,
    )

    # Validation rows tokenized for generation; inspected by later cells.
    troubleshooting_data_generator = PredictIterator(
        n_examples=val_len,
        data_filename=val_file,
        **shared_kwargs,
    )

    mbart_trainer = Trainer(
        model=model,
        args=TRAINER_PARAMS,
        train_dataset=train_data_generator,
        eval_dataset=None,
        compute_metrics=None,
    )
    mbart_trainer.train()
    

Training translation en-zh mini model


***** Running training *****
  Num examples = 48444
  Num Epochs = 1
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1513
  Number of trainable parameters = 279164928


Step,Training Loss
500,6.7714
1000,5.5268


Saving model checkpoint to Mbart/Model/Teensy/checkpoint-500
Configuration saved in Mbart/Model/Teensy/checkpoint-500/config.json
Model weights saved in Mbart/Model/Teensy/checkpoint-500/pytorch_model.bin
Saving model checkpoint to Mbart/Model/Teensy/checkpoint-1000
Configuration saved in Mbart/Model/Teensy/checkpoint-1000/config.json
Model weights saved in Mbart/Model/Teensy/checkpoint-1000/pytorch_model.bin


IndexError: ignored

In [None]:
# Pickle the whole model object after training.
# NOTE(review): this writes under 'Mbart/Model/Single', not under the
# 'Mbart/Model/Teensy' directory used by the Trainer — confirm the location.
torch.save(model, 'Mbart/Model/Single/epoch-1')

In [None]:
# torch.save(model.state_dict(), 'Mbart/Model/Tiny/epoch-2-state')

In [None]:
# Debug: list the contents of the current working directory.
!ls

sample_data


AttributeError: ignored

In [None]:
# Debug: show one eval-formatted example. Every call advances the iterator's
# position in its current chunk, so repeated calls return different rows even
# though the index is always 0.
troubleshooting_data_generator[0]

{'inputs': {'input_ids': tensor([[250025,    378,     25,   1189,  37415,   5714,      4,  26871,  55899,
            17989,  60353, 226934,    264,  48616, 123843,   1988,     30, 124920,
             1535,  37928,    213,   4617,   2920,   3300,  13447,     37,   5250,
             2100,    213, 156455,   1420,  12590,  23470,   7447,   1493,  95539,
                4,   1288,  27839,   1635,   1288,  48616,   4491,  27043,   5525,
            11608,  53989,  32870,  72926,      2]], device='cuda:0'),
  'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
           1, 1]], device='cuda:0'),
  'decoder_input_ids': tensor(250005)},
 'labels': tensor([[250005,    378,     25,    284,  26897,      8,    576, 110245,      7,
               4,    576,  28214,  26850,      7,   6805,   1005, 190037,      7,
             121,   1011,    377,  40578,     10,     21

In [None]:
# Decode the source tokens of the next example. 'input_ids' keeps a batch
# dimension, so select row 0; passing the 2-D tensor raised the TypeError
# recorded in this cell's output.
tokenizer.decode(troubleshooting_data_generator[0]['inputs']['input_ids'][0])

TypeError: ignored

In [None]:
# Decode the target tokens of the next example. 'labels' keeps a batch
# dimension, so select row 0 before decoding.
tokenizer.decode(troubleshooting_data_generator[0]['labels'][0])

In [None]:
# Debug: show the next eval-formatted example (each call moves on by one row).
troubleshooting_data_generator[0]

In [None]:
# Decode the source sentence of the next example (row 0 of the batch).
# NOTE(review): the generate cells below each fetch a further example, so
# `original` is not the sentence they translate.
original = tokenizer.decode(troubleshooting_data_generator[0]['inputs']['input_ids'][0])
original

In [None]:
# Generate without forcing a target-language token, up to 50 tokens.
# NOTE(review): this fetches a new example from the iterator, and the
# attention mask is not passed to generate().
generated_none = model.generate(input_ids=troubleshooting_data_generator[0]['inputs']['input_ids'], max_length=50)
generated_none

In [None]:
# Decode the first generated sequence; special tokens are left in the text.
tokenizer.decode(generated_none[0])

In [None]:
# Generate with the Chinese language code forced as the first token.
# NOTE(review): this fetches yet another example from the iterator, so the
# output is not comparable with the other generate cells.
generated_zh = model.generate(input_ids=troubleshooting_data_generator[0]['inputs']['input_ids'], forced_bos_token_id=tokenizer.lang_code_to_id['zh_CN'], max_length=50)
generated_zh

In [None]:
# Decode the first sequence generated with the forced zh_CN token.
tokenizer.decode(generated_zh[0])

In [None]:
# Generate with the Spanish language code forced as the first token.
# NOTE(review): this fetches yet another example from the iterator, so the
# output is not comparable with the other generate cells.
generated_es = model.generate(input_ids=troubleshooting_data_generator[0]['inputs']['input_ids'], forced_bos_token_id=tokenizer.lang_code_to_id['es_XX'], max_length=50)
generated_es

In [None]:
# Decode the first sequence generated with the forced es_XX token.
tokenizer.decode(generated_es[0])

In [None]:
# Debug: display the configuration attached to the loaded model.
model.config

In [None]:
# Debug: display the source language code last assigned by the training loop.
src_tkn

In [None]:
    # Debug: train again with the Trainer left over from the training loop's
    # last executed iteration (its language pair and data iterator).
    mbart_trainer.train()

In [None]:
# Debug: print PyTorch's CUDA memory report (requires a CUDA device).
print(torch.cuda.memory_summary())

In [None]:
# Switch the model to training mode. No optimisation step runs here; as the
# last expression in the cell, the call's return value is displayed.
model.train()

In [None]:
# Debug: print the CUDA memory report again for comparison.
print(torch.cuda.memory_summary())

In [None]:
 # Wait for outstanding CUDA work, then release cached allocator blocks.
 # Tensors that are still referenced are not freed by this.
 torch.cuda.synchronize()
 torch.cuda.empty_cache()