## FineTuning Multi-task Supervised Pre-training for Natural Language Generation (MVP) for generating BBE text from KJV text

### Import packages

In [None]:
import os
# os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'

In [None]:
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import torch
from tqdm import tqdm
from transformers import MvpTokenizer, MvpForConditionalGeneration, AdamW, MvpConfig

In [None]:
pd.set_option('display.max_colwidth', None)

### Import Transformer Model & Tokenizer and set device

In [None]:
model = MvpForConditionalGeneration.from_pretrained("RUCAIBox/mvp")
tokenizer = MvpTokenizer.from_pretrained("RUCAIBox/mvp")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/908 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/41.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

MvpForConditionalGeneration(
  (model): MvpModel(
    (shared): Embedding(50267, 1024, padding_idx=1)
    (encoder): MvpEncoder(
      (embed_tokens): Embedding(50267, 1024, padding_idx=1)
      (embed_positions): MvpLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x MvpEncoderLayer(
          (self_attn): MvpAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerNorm((1

### Read in cleaned data & make training dataset (300 samples) & testing dataset (100 samples)

**Note: Not using all 1000+ samples due to computational constraints**

In [None]:
df = pd.read_csv('bible_cleaned_and_short_data.csv')
df = df.sample(400)
train_df, test_df = train_test_split(df, train_size=300, random_state=42)

train_df.to_csv('train_data.csv', index=False)
test_df.to_csv('test_data.csv', index=False)

### Data Preprocessing: Tokenizing and Preparing Training and Testing Datasets

In [None]:
def preprocess_data(df):
    inputs = tokenizer(df['KJV'].tolist(), padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    labels = tokenizer(df['BBE'].tolist(), padding="max_length", truncation=True, max_length=512, return_tensors="pt")['input_ids']
    # Replace padding token id's in the labels with -100 so they are not considered in the loss computation, forces model to learn meaningful parts of input data as opposed to the padding
    labels[labels == tokenizer.pad_token_id] = -100
    return inputs, labels

train_inputs, train_labels = preprocess_data(train_df)
test_inputs, test_labels = preprocess_data(test_df)

### Training and Evaluation Loop for Finetuning MVP

In [None]:
# Setup the optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)

# Training settings
model.train()
epochs = 3
batch_size = 3

# Training and evaluation loop
for epoch in range(epochs):
    total_train_loss = 0
    progress_bar = tqdm(range(0, len(train_inputs['input_ids']), batch_size), desc=f'Epoch {epoch + 1}', leave=False)
    for i in progress_bar:
        optimizer.zero_grad()

        # Load batches into GPU
        batch_input_ids = train_inputs['input_ids'][i:i+batch_size].to(device)
        batch_attention_mask = train_inputs['attention_mask'][i:i+batch_size].to(device)
        batch_labels = train_labels[i:i+batch_size].to(device)

        outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        del batch_input_ids, batch_attention_mask, batch_labels, outputs
        torch.cuda.empty_cache()

        total_train_loss += loss.item()
        progress_bar.set_description(f"Epoch {epoch + 1} [Train Loss: {total_train_loss / (i + 1):.4f}]")
        progress_bar.refresh()

    avg_train_loss = total_train_loss / len(progress_bar)
    print(f"Epoch {epoch + 1}, Average Train Loss: {avg_train_loss:.4f}")

    # Evaluate on the test set
    model.eval()
    total_test_loss = 0
    with torch.no_grad():
        for i in range(0, len(test_inputs['input_ids']), batch_size):
            batch_input_ids = test_inputs['input_ids'][i:i+batch_size].to(device)
            batch_attention_mask = test_inputs['attention_mask'][i:i+batch_size].to(device)
            batch_labels = test_labels[i:i+batch_size].to(device)

            outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask, labels=batch_labels)
            loss = outputs.loss

            del batch_input_ids, batch_attention_mask, batch_labels, outputs
            torch.cuda.empty_cache()

            total_test_loss += loss.item()

    avg_test_loss = total_test_loss / (len(test_inputs['input_ids']) // batch_size)
    print(f"Epoch {epoch + 1}, Average Test Loss: {avg_test_loss:.4f}")

    # Prepare for the next epoch
    model.train()

# Save the model and tokenizer
model.save_pretrained('./mvp_finetuned_bible_translation')
tokenizer.save_pretrained('./mvp_finetuned_bible_translation')



Epoch 1, Average Train Loss: 1.7209
Epoch 1, Average Test Loss: 1.5445




Epoch 2, Average Train Loss: 1.0882
Epoch 2, Average Test Loss: 1.5835




Epoch 3, Average Train Loss: 0.7477


Non-default generation parameters: {'early_stopping': True, 'num_beams': 5, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


Epoch 3, Average Test Loss: 1.7019


('./mvp_finetuned_bible_translation/tokenizer_config.json',
 './mvp_finetuned_bible_translation/special_tokens_map.json',
 './mvp_finetuned_bible_translation/vocab.json',
 './mvp_finetuned_bible_translation/merges.txt',
 './mvp_finetuned_bible_translation/added_tokens.json')

### Setup for Text Generation Using Fine-Tuned Model on Test Data


In [11]:
model = MvpForConditionalGeneration.from_pretrained('./mvp_finetuned_bible_translation')
tokenizer = MvpTokenizer.from_pretrained('./mvp_finetuned_bible_translation')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

mvp_test_df = test_df.copy()
mvp_test_df['mvp_bbe'] = ''

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### Generate and Store Model Predictions for BBE text on Test Dataset


In [12]:
progress = tqdm(test_df.iterrows(), total=len(test_df), desc="Generating text")
for index, row in progress:
    # Tokenize the text from 'KJV'
    inputs = tokenizer(row['KJV'], return_tensors="pt", max_length=512, truncation=True)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Generate text
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_length=350)

    # Decode the generated IDs to text and store in the DataFrame
    generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    mvp_test_df.at[index, 'mvp_bbe'] = generated_text

mvp_test_df.to_csv('mvp_test_data.csv', index=False)

Generating text: 100%|██████████| 100/100 [02:46<00:00,  1.67s/it]
