<a href="https://colab.research.google.com/github/jeanlucjackson/w266_final_project/blob/main/code/BB/bb_t5_memory.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

import json

# Make longer output readable without scrolling
from pprint import pprint

# Stop warning messages from showing
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
!pip install -q sentencepiece

[K     |████████████████████████████████| 1.3 MB 4.5 MB/s 
[?25h

In [3]:
!pip install -q transformers

[K     |████████████████████████████████| 5.3 MB 5.3 MB/s 
[K     |████████████████████████████████| 163 kB 37.8 MB/s 
[K     |████████████████████████████████| 7.6 MB 43.8 MB/s 
[?25h

In [4]:
!pip install -q datasets

[K     |████████████████████████████████| 441 kB 5.4 MB/s 
[K     |████████████████████████████████| 115 kB 53.3 MB/s 
[K     |████████████████████████████████| 212 kB 50.3 MB/s 
[K     |████████████████████████████████| 127 kB 43.7 MB/s 
[K     |████████████████████████████████| 115 kB 45.7 MB/s 
[?25h

In [5]:
!pip install -q evaluate
import evaluate

[?25l[K     |████▌                           | 10 kB 15.4 MB/s eta 0:00:01[K     |█████████                       | 20 kB 17.9 MB/s eta 0:00:01[K     |█████████████▌                  | 30 kB 22.5 MB/s eta 0:00:01[K     |██████████████████              | 40 kB 7.2 MB/s eta 0:00:01[K     |██████████████████████▌         | 51 kB 6.6 MB/s eta 0:00:01[K     |███████████████████████████     | 61 kB 7.7 MB/s eta 0:00:01[K     |███████████████████████████████▌| 71 kB 6.3 MB/s eta 0:00:01[K     |████████████████████████████████| 72 kB 831 kB/s 
[?25h

In [36]:
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
from datasets import list_datasets, load_dataset_builder, get_dataset_config_names, load_dataset, load_from_disk


In [8]:
def summarize_dataset(dataset, config=None):
    """Print the description and feature schema of a dataset.

    The dataset builder is obtained from ``load_dataset_builder`` and the
    ``description`` and ``features`` fields of its ``info`` are written to
    standard output.

    Args:
        dataset: Dataset identifier forwarded to ``load_dataset_builder``.
        config: Optional configuration forwarded to ``load_dataset_builder``.

    Returns:
        None. The summary is only printed.
    """
    info = load_dataset_builder(dataset, config).info
    print(f"Description:\n {info.description}")
    print("Features:")
    # pprint keeps large feature dictionaries readable.
    pprint(info.features)

### Load SQuAD dataset from Google Drive

In [9]:
data_squad = load_from_disk("/content/drive/MyDrive/w266 NLP Final Project/Data/squad.hf")

In [10]:
data_squad

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [11]:
# Look at first example
pprint(data_squad['train'][0])

{'answers': {'answer_start': [515], 'text': ['Saint Bernadette Soubirous']},
 'context': 'Architecturally, the school has a Catholic character. Atop the '
            "Main Building's gold dome is a golden statue of the Virgin Mary. "
            'Immediately in front of the Main Building and facing it, is a '
            'copper statue of Christ with arms upraised with the legend '
            '"Venite Ad Me Omnes". Next to the Main Building is the Basilica '
            'of the Sacred Heart. Immediately behind the basilica is the '
            'Grotto, a Marian place of prayer and reflection. It is a replica '
            'of the grotto at Lourdes, France where the Virgin Mary reputedly '
            'appeared to Saint Bernadette Soubirous in 1858. At the end of the '
            'main drive (and in a direct line that connects through 3 statues '
            'and the Gold Dome), is a simple, modern stone statue of Mary.',
 'id': '5733be284776f41900661182',
 'question': 'To whom did t

### T5v1.1 Model

Sources:

- https://huggingface.co/docs/transformers/model_doc/t5
- https://huggingface.co/docs/transformers/model_doc/t5v1.1

In [17]:
from transformers import T5Tokenizer, TFT5ForConditionalGeneration

t5_model = TFT5ForConditionalGeneration.from_pretrained("google/t5-v1_1-base")
t5_tokenizer = T5Tokenizer.from_pretrained("google/t5-v1_1-base")

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at google/t5-v1_1-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [31]:
t5_model.summary()

Model: "tft5_for_conditional_generation_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 shared (TFSharedEmbeddings)  multiple                 24674304  
                                                                 
 encoder (TFT5MainLayer)     multiple                  84954240  
                                                                 
 decoder (TFT5MainLayer)     multiple                  113275008 
                                                                 
 lm_head (Dense)             multiple                  24674304  
                                                                 
Total params: 247,577,856
Trainable params: 247,577,856
Non-trainable params: 0
_________________________________________________________________


### Train model with limited RAM

Prep data

In [24]:
# Build the training dataframe from a shuffled copy of the SQuAD train split.
# The fixed seed keeps the shuffled order reproducible across runs.
data_shuffle = data_squad['train'].shuffle(seed=1962)

# Only the first listed answer text of each example is kept.
df = pd.DataFrame({
    'answer': [entry['text'][0] for entry in data_shuffle['answers']],
    'context': data_shuffle['context'],
    'question': data_shuffle['question'],
})

df.head()



Unnamed: 0,answer,context,question
0,biotech companies,"Prior to moving its headquarters to Chicago, a...",What type of businesses did Nickles want to at...
1,Tytus Woyciechowski,Four boarders at his parents' apartments becam...,To whom did Chopin reveal in letters which par...
2,the Endangered Species Committee,The question to be answered is whether a liste...,"If a species may be harmed, who holds final sa..."
3,China,"In Asian countries such as China, Korea, and J...",What country has the dog as part of its 12 ani...
4,45 years,Saint Athanasius of Alexandria (/ˌæθəˈneɪʃəs/;...,How long did his episcopate last?


In [25]:
# Save train data to csv
df.to_csv("/content/drive/MyDrive/w266 NLP Final Project/Data/data_squad_train.csv", index=False)

In [26]:
# Read in csv of train data
train_filename = "/content/drive/MyDrive/w266 NLP Final Project/Data/data_squad_train.csv"

df_train = pd.read_csv(train_filename)

df_train.shape

(87599, 3)

Preprocess data

In [27]:
# Model inputs follow the prompt pattern "gq answer: <answer> context: <context>".
input_txt = [
    f"gq answer: {ans} context: {ctx}"
    for ans, ctx in zip(df.answer, df.context)
]

# Targets are the questions the model should learn to generate.
output_txt = df.question.to_list()

# Show one formatted input as a sanity check.
input_txt[1]

'gq answer: Tytus Woyciechowski context: Four boarders at his parents\' apartments became Chopin\'s intimates: Tytus Woyciechowski, Jan Nepomucen Białobłocki, Jan Matuszyński and Julian Fontana; the latter two would become part of his Paris milieu. He was friendly with members of Warsaw\'s young artistic and intellectual world, including Fontana, Józef Bohdan Zaleski and Stefan Witwicki. He was also attracted to the singing student Konstancja Gładkowska. In letters to Woyciechowski, he indicated which of his works, and even which of their passages, were influenced by his fascination with her; his letter of 15 May 1830 revealed that the slow movement (Larghetto) of his Piano Concerto No. 1 (in E minor) was secretly dedicated to her – "It should be like dreaming in beautiful springtime – by moonlight." His final Conservatory report (July 1829) read: "Chopin F., third-year student, exceptional talent, musical genius."'

In [None]:
# # Training

# # Encode inputs
# inputs = tokenizer(input_txt, max_length=1024, padding=True, truncation=True, return_tensors="tf")
# #.input_ids
# # Identify input ids and attention mask
# input_ids, attention_mask = inputs.input_ids, inputs.attention_mask

# # Encode outputs/targets
# labels = tokenizer(output_txt, max_length=1024, padding=True, truncation=True, return_tensors="tf").input_ids

# outputs = model(input_ids=input_ids, 
#                 attention_mask=attention_mask, 
#                 labels=labels)
# loss = outputs.loss
# logits = outputs.logits

In [28]:
# Create function to preprocess data

def preprocess_data(input_txt, output_txt, tokenizer, max_length=1024):
    """Tokenize source and target texts for a sequence-to-sequence model.

    Both text lists are passed through ``tokenizer`` with identical options
    (``max_length``, ``padding=True``, ``truncation=True``,
    ``return_tensors="tf"``).

    Args:
        input_txt: Source texts given to the tokenizer for the model inputs.
        output_txt: Target texts given to the tokenizer for the labels.
        tokenizer: Callable tokenizer whose result exposes ``input_ids`` and
            ``attention_mask`` attributes.
        max_length: Value forwarded as the tokenizer's ``max_length``.

    Returns:
        A pair ``([input_ids, attention_mask], labels)`` where the first
        element holds the encoded source ids and mask, and ``labels`` is the
        ``input_ids`` of the encoded targets.
    """
    # Source and target texts are tokenized with exactly the same settings.
    tokenize_opts = dict(
        max_length=max_length,
        padding=True,
        truncation=True,
        return_tensors="tf",
    )

    # Source side: both the token ids and the attention mask are kept.
    source = tokenizer(input_txt, **tokenize_opts)

    # Target side: only the token ids are used, as labels.
    target = tokenizer(output_txt, **tokenize_opts)

    return [source.input_ids, source.attention_mask], target.input_ids

In [15]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [29]:
class SNLIDataGeneratorFromFile(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that reads question-generation batches from a CSV.

    Each batch is read from ``data_filename`` on demand, so the whole dataset
    never has to be tokenized in memory at once. The CSV must have a header
    line and ``answer``, ``context`` and ``question`` columns. Despite the
    class name, rows are formatted as
    ``"gq answer: <answer> context: <context>"`` inputs with the question as
    the target.

    Args:
        tokenizer: Tokenizer forwarded to ``preprocess_data``.
        n_examples: Number of data rows (counted from the top of the file,
            after the header) that the generator may draw from.
        data_filename: Path of the CSV file to read.
        max_length: ``max_length`` forwarded to ``preprocess_data``.
        batch_size: Number of rows per batch.
        shuffle: If true, the row order is re-permuted when the generator is
            created and at the end of every epoch.
    """

    def __init__(self,
                 tokenizer,
                 n_examples,
                 data_filename,
                 max_length=128,
                 batch_size=32,
                 shuffle=True):

        self.tokenizer = tokenizer
        self.n_examples = n_examples
        self.data_filename = data_filename
        self.max_length = max_length
        self.batch_size = batch_size
        self.shuffle = shuffle

        # Line 0 of the CSV is the header, so data rows are numbered
        # 1..n_examples. A plain list is used so that slicing behaves the same
        # whether or not shuffling is enabled.
        self.row_order = list(range(1, self.n_examples + 1))
        self.on_epoch_end()

    def __len__(self):
        """Return the number of full batches; a trailing partial batch is dropped."""
        return self.n_examples // self.batch_size

    def __getitem__(self, idx):
        """Read, format and tokenize batch number ``idx``.

        Returns:
            Whatever ``preprocess_data`` returns for the batch's input and
            target texts.

        Raises:
            IndexError: If ``idx`` is outside ``range(len(self))``.
        """
        if not 0 <= idx < len(self):
            raise IndexError(
                f"batch index {idx} out of range for {len(self)} batches")

        batch_start = idx * self.batch_size
        batch_end = batch_start + self.batch_size

        # File line numbers that make up this batch.
        batch_rows = set(self.row_order[batch_start:batch_end])

        # Keep only the header (line 0) and this batch's rows. Selecting by
        # membership also excludes rows beyond n_examples, which a skip list
        # built from row_order alone would let through. Rows come back in
        # file order, not in shuffled order.
        df = pd.read_csv(
            self.data_filename,
            skiprows=lambda row: row != 0 and row not in batch_rows,
        )

        input_txt = [f"gq answer: {answer} context: {context}" for answer, context in zip (df.answer, df.context)]
        output_txt = df.question.to_list()

        batch_data = preprocess_data(
            input_txt,
            output_txt,
            self.tokenizer,
            self.max_length
        )

        return batch_data

    def on_epoch_end(self):
        """Re-shuffle the row order (only when ``shuffle`` is enabled)."""
        if self.shuffle:
            # tolist() yields plain Python ints and keeps row_order a list.
            self.row_order = np.random.permutation(self.row_order).tolist()

In [32]:
train_data_generator = SNLIDataGeneratorFromFile(
    tokenizer=t5_tokenizer,
    n_examples=5000,
    data_filename="/content/drive/MyDrive/w266 NLP Final Project/Data/data_squad_train.csv"
)

In [37]:
# Path in drive where we want to save checkpoints
!ls "/content/drive/MyDrive/w266 NLP Final Project/Checkpoints/"

In [38]:
# CHANGE checkpoint_dir TO THE PATH IN YOUR OWN DRIVE WHERE YOU WANT TO SAVE CHECKPOINTS

checkpoint_dir = "/content/drive/MyDrive/w266 NLP Final Project/Checkpoints/"
# The filename is a template: `{epoch}` and `{val_accuracy}` are filled in
# from the training logs when a checkpoint is written.
# NOTE(review): `val_accuracy` is presumably only logged when training runs
# with validation data and an accuracy metric; neither is configured anywhere
# in this notebook -- confirm before training, or drop the placeholder.
checkpoint_filepath = checkpoint_dir + 'weights.{epoch:02d}-{val_accuracy:.2f}.hdf5'
# Save weights only (not the full model) to the templated path.
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True)

In [40]:
# FIXME: this cell does not train the model. `generate()` is the
# text-generation (inference) entry point, and passing it the batch generator
# raised the ValueError recorded in this cell's output.
# NOTE(review): training presumably needs a compile()/fit()-style call fed by
# train_data_generator, with model_checkpoint_callback passed as a callback;
# the batch format returned by the generator may need to change for that --
# confirm against the Transformers TF training docs.
t5_model.generate(train_data_generator,
                  # num_beams=2,
                  # no_repeat_ngram_size=1,
                  max_length=40
               )

ValueError: ignored