In [2]:
from transformers import T5Tokenizer, BertTokenizer

In [43]:
text="This is an introduction to: tokenizers"
t5_tokenizer=T5Tokenizer.from_pretrained("t5-base")
bert_tokenizer=BertTokenizer.from_pretrained("bert-base-uncased")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [44]:
t5_tokens=t5_tokenizer.tokenize(text)
bert_tokens=bert_tokenizer.tokenize(text)

In [45]:
print("\tT5\n --------------\n")
print(t5_tokens)
print("\n")
print("\tBert\n --------------\n")
print(bert_tokens)

	T5
 --------------

['▁This', '▁is', '▁an', '▁introduction', '▁to', ':', '▁token', 'izer', 's']


	Bert
 --------------

['this', 'is', 'an', 'introduction', 'to', ':', 'token', '##izer', '##s']


In [46]:
t5_ids=t5_tokenizer.convert_tokens_to_ids(t5_tokens)
bert_ids=bert_tokenizer.convert_tokens_to_ids(bert_tokens)

In [47]:
print("\tT5\n --------------\n")
print(t5_ids)
print("\n")
print("\tBert\n --------------\n")
print(bert_ids)

	T5
 --------------

[100, 19, 46, 5302, 12, 10, 14145, 8585, 7]


	Bert
 --------------

[2023, 2003, 2019, 4955, 2000, 1024, 19204, 17629, 2015]


In [48]:
print(bert_tokenizer.convert_ids_to_tokens(bert_ids))
t5_tokenizer.convert_ids_to_tokens(t5_ids)

['this', 'is', 'an', 'introduction', 'to', ':', 'token', '##izer', '##s']


['▁This', '▁is', '▁an', '▁introduction', '▁to', ':', '▁token', 'izer', 's']

In [49]:
t5_tokenizer.decode(t5_ids)
bert_tokenizer.decode(bert_ids)

'this is an introduction to : tokenizers'

In [50]:
t5_tokenizer.encode(text)

[100, 19, 46, 5302, 12, 10, 14145, 8585, 7, 1]

In [51]:
t5_tokenizer.decode(t5_tokenizer.encode(text))

'This is an introduction to: tokenizers</s>'

In [52]:
source = t5_tokenizer.batch_encode_plus([text], max_length= 16, pad_to_max_length=False,return_tensors='pt')

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [53]:
source

{'input_ids': tensor([[  100,    19,    46,  5302,    12,    10, 14145,  8585,     7,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [54]:
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

# Importing the T5 modules from huggingface/transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration

# WandB – Import the wandb library
#import wandb

In [55]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, source_len, summ_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = summ_len
        self.text = self.data.text
        self.ctext = self.data.ctext

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        ctext = str(self.ctext[index])
        ctext = ' '.join(ctext.split())

        text = str(self.text[index])
        text = ' '.join(text.split())

        source = self.tokenizer.batch_encode_plus([ctext], max_length= self.source_len, pad_to_max_length=True,return_tensors='pt')
        target = self.tokenizer.batch_encode_plus([text], max_length= self.summ_len, pad_to_max_length=True,return_tensors='pt')

        source_ids = source['input_ids'].squeeze()
        source_mask = source['attention_mask'].squeeze()
        target_ids = target['input_ids'].squeeze()
        target_mask = target['attention_mask'].squeeze()

        return {
            'source_ids': source_ids.to(dtype=torch.long), 
            'source_mask': source_mask.to(dtype=torch.long), 
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long)
        }

In [56]:
TRAIN_BATCH_SIZE = 2    # input batch size for training (default: 64)
VALID_BATCH_SIZE = 2    # input batch size for testing (default: 1000)
TRAIN_EPOCHS = 2        # number of epochs to train (default: 10)
VAL_EPOCHS = 1 
LEARNING_RATE = 1e-4    # learning rate (default: 0.01)
SEED = 42               # random seed (default: 42)
MAX_LEN = 512
SUMMARY_LEN = 150 
# Set random seeds and deterministic pytorch for reproducibility
torch.manual_seed(SEED) # pytorch random seed
np.random.seed(SEED) # numpy random seed
torch.backends.cudnn.deterministic = True
# tokenzier for encoding the text
tokenizer = T5Tokenizer.from_pretrained("t5-base")

# Importing and Pre-Processing the domain data
# Selecting the needed columns only. 
# Adding the summarzie text in front of the text. This is to format the dataset similar to how T5 model was trained for summarization task. 
df = pd.read_csv('~/Downloads/news_summary.csv',encoding='latin-1')
df = df[['text','ctext']]
df.ctext = 'summarize: ' + df.ctext
print(df.head())


# Creation of Dataset and Dataloader
# Defining the train size. So 80% of the data will be used for training and the rest will be used for validation. 
train_size = 0.8
train_dataset=df.sample(frac=train_size,random_state = SEED)
val_dataset=df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(val_dataset.shape))


  # Creating the Training and Validation dataset for further creation of Dataloader
training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN, SUMMARY_LEN)
val_set = CustomDataset(val_dataset, tokenizer, MAX_LEN, SUMMARY_LEN)

                                                text  \
0  The Administration of Union Territory Daman an...   
1  Malaika Arora slammed an Instagram user who tr...   
2  The Indira Gandhi Institute of Medical Science...   
3  Lashkar-e-Taiba's Kashmir commander Abu Dujana...   
4  Hotels in Maharashtra will train their staff t...   

                                               ctext  
0  summarize: The Daman and Diu administration on...  
1  summarize: From her special numbers to TV?appe...  
2  summarize: The Indira Gandhi Institute of Medi...  
3  summarize: Lashkar-e-Taiba's Kashmir commander...  
4  summarize: Hotels in Mumbai and other Indian c...  
FULL Dataset: (4514, 2)
TRAIN Dataset: (3611, 2)
TEST Dataset: (903, 2)


For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [68]:
train_params = {
        'batch_size': TRAIN_BATCH_SIZE,
        'shuffle': True,
        'num_workers': 0
        }

val_params = {
        'batch_size': VALID_BATCH_SIZE,
        'shuffle': False,
        'num_workers': 0
        }

    # Creation of Dataloaders for testing and validation. This will be used down for training and validation stage for the model.
training_loader = DataLoader(training_set, **train_params)
val_loader = DataLoader(val_set, **val_params)


In [69]:
itr=iter(training_loader)

In [70]:
x=next(itr)



In [73]:
y=x['target_ids']

In [74]:
y_ids = y[:, :-1].contiguous()

In [76]:
y_ids.shape

torch.Size([2, 149])

In [77]:
y.shape

torch.Size([2, 150])

In [80]:
type(y)

torch.Tensor