### OM NAMO NARAYANA

### Preparation of dataset
**Output:** A Pytorch Dataset class returning sequence of root words in English <br/>
**Input:** Preprocessed English sentences <br/>
- Dataset: Wikipedia dataset <br/>
- Lemmatizer: Spacy <br/>
- Model: T5 <br/>
- Pretrained on: C4 Dataset <br/>
- Loss: ?



In [1]:
!pip install datasets
!pip install transformers
!pip install sentencepiece
!pip install wandb



In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [1]:
import wandb
wandb.init(project='root2seq')

[34m[1mwandb[0m: Currently logged in as: [33mteam-knitts[0m (use `wandb login --relogin` to force relogin)


In [2]:
# root_dir = '/content/drive/My Drive/'
root_dir = '/home/ubuntu/Context-Comprehension-Enhancement-Tamil/'
checkpoint_dir = root_dir + 'checkpoints/'

In [9]:
import torch
from torch.utils.data import Dataset
import spacy
from datasets import load_dataset
from torch.utils.data import DataLoader 

class T5Dataset(Dataset):
    """T5 root2seq dataset."""

    def __init__(self, dataset, tokenizer, transform=lambda k:k):
        """
        Args:
            dataset (dataset): Dataloader from datasets.
            tokenizer (Tokenizer): To tokenizer input and output sentence.
            transform (function): Any transformation function
        """
        self.dataset=dataset
        self.nlp = spacy.load('en_core_web_sm')
        self.tokenizer = tokenizer
        self.transform = transform


    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        sentence = self.dataset[idx]["sentence"]
        doc = self.nlp(sentence)
        
        li=[]
        for token in doc:
            li.append(token.lemma_)

        input_sentence = ''
        for input_word in li: input_sentence += input_word + ' ' 


        # input_tokens = tokenizer(self.transform(input_sentence), max_length=1024, return_tensors="pt")
        # output_tokens = tokenizer(self.transform(sentence), max_length=1024, return_tensors="pt")
        input_tokens = tokenizer(self.transform(input_sentence), max_length=256, padding='max_length', return_tensors="pt")
        output_tokens = tokenizer(self.transform(sentence), max_length=256, padding='max_length', return_tensors="pt")

        # print('input_tokens.shape: ', input_tokens.input_ids.shape, 'output_tokens.shape: ', output_tokens.input_ids.shape)

        return {"input_tokens": input_tokens, "output_tokens": output_tokens}

In [None]:
import torch
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import get_scheduler
import os
from datetime import datetime
import re


wandb.config.update({
    'epochs': 10,
    'tokenizer': 't5-small',
    'optimizer': 'nn.NLLLoss',
})

load_model = True
config = wandb.config
config.epochs = 10
config.TOKENIZER = AutoTokenizer.from_pretrained("t5-small") 

def train(model, tokenizer, model_optimizer, criterion, dataloader, epochs = 10, debug=False, checkpoint_dir = "/", model_name = "unknown", **kwargs):


  num_training_steps = epochs * len(dataloader)

  lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)


  device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
  model.to(device)

  print_freq = 200
  save_freq = 200
  model_optimizer.zero_grad()

  for epoch in range(epochs):
    epoch_loss = 0
    t = 0
    if(epoch % 2 == 1): 
      if(not os.path.isdir(checkpoint_dir + model_name + '/')):
        os.makedirs(checkpoint_dir + model_name + '/')

      torch.save(model.state_dict(), checkpoint_dir + model_name + '/' +datetime.now().strftime("%Y-%m-%d_%H-%M-%S")+'.pt')
      print('checkpoint saved')
    for data in tqdm(dataloader):
      t = t + 1
      input_ids = data['input_tokens']['input_ids'].squeeze(1)
      attention_mask = data['input_tokens']['attention_mask'].squeeze(1)
      output = data['output_tokens']['input_ids'].squeeze(1)
      input_ids= input_ids.to(device)
      output = output.to(device)
      attention_mask = attention_mask.to(device)
      if(debug): print('input_ids.shape: ', input_ids.shape)
      if(debug): print('device:', device, ' input_ids.device: ', input_ids.device, ' output.device: ', output.device, ' attention_mask.device: ', attention_mask.device)

      if(debug):predicted_tokens = model(input_ids = input_ids, attention_mask = attention_mask,labels = output)
      else: loss = model(input_ids = input_ids, labels = output).loss
      if(debug): print('predicted_tokens: ', predicted_tokens, 'logits.shape: ', predicted_tokens['logits'].shape) # logits.shape [1, x, 32128]

      if(not debug): wandb.log({'loss': loss.item()})

      if(t % print_freq == 1):
        print('t:', t, 'loss: ', loss.item(), '\ntarget_sentence:', tokenizer.decode(output[0]))
        predicted_sentence = model.generate(torch.unsqueeze(input_ids[0], 0), num_beams=4, max_length=10,)
        print('input_sentence: ', tokenizer.decode(input_ids[0], skip_special_tokens=True), '\npredicted_sentence: ', tokenizer.decode(predicted_sentence[0], skip_special_tokens=True))

      # epoch_loss += predicted_tokens['loss']

      loss.backward()

      model_optimizer.step()
      lr_scheduler.step()
      optimizer.zero_grad()
      
# model_name = "csebuetnlp/mT5_multilingual_XLSum"      
model_name = "t5-small"      


model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer_transform = lambda k: re.sub('\s+', ' ', re.sub('\n+', ' ', k.strip()))
tokenizer_transform = lambda k: k
optimizer = optim.SGD(model.parameters(), lr=0.01)
criterion = nn.NLLLoss()
dataset = load_dataset('glue', 'cola', split='train')
# t5dataset = T5Dataset(dataset, tokenizer, transform = tokenizer_transform)
dataloader = torch.utils.data.DataLoader(t5dataset, batch_size = 4, shuffle=True)
if(load_model):
  try:
    ls = os.listdir(checkpoint_dir + model_name + '/')
    ls.sort()
    latest_file = checkpoint_dir + model_name + '/' + ls[-1]
    model.load_state_dict(torch.load(latest_file))
    print("loaded latest checkpoint")
  except:
    print("can't load model")
train(model, tokenizer, optimizer, criterion, t5dataset, 10, checkpoint_dir = checkpoint_dir, model_name=model_name)

Reusing dataset glue (/home/ubuntu/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


loaded latest checkpoint


  0%|                                                      | 0/8551 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


t: 1 loss:  1.247257947921753 
target_sentence: Our friends won't buy this analysis, let alone the next one we propose.</s>


  0%|                                              | 1/8551 [00:00<34:25,  4.14it/s]

input_sentence:  our friend will not buy this analysis, let alone the next one we propose. 
predicted_sentence:  Unser Freund wird diese Analyse nicht


  2%|▊                                           | 152/8551 [00:11<10:25, 13.42it/s]


### Archeive

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("t5-small")

model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

In [None]:
ARTICLE_TO_SUMMARIZE = "summarize: My friends are cool but they eat too many carbs."
inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors="pt")

# Generate Summary
summary_ids = model.generate(inputs["input_ids"], num_beams=4, max_length=6,)
print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False))

In [None]:
# # training
# input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
# labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
# outputs = model(input_ids=input_ids, labels=labels)
# loss = outputs.loss
# logits = outputs.logits

# inference
input_ids = tokenizer(
    "summarize: studies have shown that owning a dog is good for you", return_tensors="pt"
).input_ids  # Batch size 1
outputs = model.generate(input_ids, num_beams=4, max_length=5,)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
# studies have shown that owning a dog is good for you.

In [None]:
# teacher_forcing_ratio = 0.5

# def T5seq(model, input_tensor, target_tensor, model_optimizer, criterion):
#    model_optimizer.zero_grad()

#    input_length = input_tensor.size(0)
#    loss = 0
#    epoch_loss = 0
#    # print(input_tensor.shape)

#    output = model(input_tensor, target_tensor)

#    num_iter = output.size(0)
#    print(num_iter)

# #calculate the loss from a predicted sentence with the expected result
#    for ot in range(num_iter):
#        loss += criterion(output[ot], target_tensor[ot])

#    loss.backward()
#    model_optimizer.step()
#    epoch_loss = loss.item() / num_iter

#    return epoch_loss

# def trainModel(model, source, target, pairs, num_iteration=20000):
#    model.train()

#    optimizer = optim.SGD(model.parameters(), lr=0.01)
#    criterion = nn.NLLLoss()
#    total_loss_iterations = 0

#    training_pairs = [tensorsFromPair(source, target, random.choice(pairs))
#                      for i in range(num_iteration)]
  
#    for iter in range(1, num_iteration+1):
#        training_pair = training_pairs[iter - 1]
#        input_tensor = training_pair[0]
#        target_tensor = training_pair[1]

#        loss = clacModel(model, input_tensor, target_tensor, optimizer, criterion)

#        total_loss_iterations += loss

#        if iter % 5000 == 0:
#            avarage_loss= total_loss_iterations / 5000
#            total_loss_iterations = 0
#            print('%d %.4f' % (iter, avarage_loss))
          
#    torch.save(model.state_dict(), 'mytraining.pt')
#    return model

In [None]:
 
# import pandas as pd
# from google.colab import drive
# drive.mount('/content/drive/')

# df = pd.read_csv("")

dataset = load_dataset('glue', 'cola', split='train')
print(dataset[0])
# dataset[0].pop("label")
# dataset[0].pop("idx")
text = [dataset[0]["sentence"]]
print(text)

nlp = spacy.load('en')
# text=("Our friends won't buy this analysis, let alone the next one we propose.")
print(text)
pass
doc = [nlp(t) for t in text]
print(doc)

total_li=[]
li=[]
for lines in doc:
  li=[]
  for token in lines:
    li.append(token.lemma_)
  total_li.append(li)
print(total_li)

In [None]:
input_list = ['-PRON-', 'friend', 'will', 'not', 'buy', 'this', 'analysis', ',', 'let', 'alone', 'the', 'next', 'one', '-PRON-', 'propose', '.']
input_sentence = ''
for input_word in input_list: input_sentence += input_word + ' '
input_sentence = input_sentence.strip()
tokenizer(input_sentence , return_tensors="pt")

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

dataset = load_dataset('glue', 'cola', split='train')
tokenizer = AutoTokenizer.from_pretrained("t5-small")
dataset = T5Dataset(dataset, tokenizer)
print('dataset[0]: ', dataset[0])
# dataloader = torch.utils.data.DataLoader(dataset, batch_size=2, shuffle=True)
# print(next(iter(dataloader)))

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import os
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
ls = os.listdir(checkpoint_dir)
ls.sort()
print('ls: ', ls)
latest_file = checkpoint_dir + ls[-1]
model.load_state_dict(torch.load(latest_file))
print("loaded latest file")
print("can't load model")

In [None]:
from datetime import datetime

x = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
print(x)

In [None]:
from transformers import BartForConditionalGeneration, BartTokenizer

tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
example_english_phrase = "UN Chief Says There Is No Military Solution in Syria"

model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
inputs = tokenizer(example_english_phrase, return_tensors="pt")

summary_ids = model.generate(inputs["input_ids"], num_beams=4, max_length=5)
x = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)[0]
print(x)

In [None]:
import re
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

WHITESPACE_HANDLER = lambda k: re.sub('\s+', ' ', re.sub('\n+', ' ', k.strip()))

article_text = """Videos that say approved vaccines are dangerous and cause autism, cancer or infertility are among those that will be taken down, the company said.  The policy includes the termination of accounts of anti-vaccine influencers.  Tech giants have been criticised for not doing more to counter false health information on their sites.  In July, US President Joe Biden said social media platforms were largely responsible for people's scepticism in getting vaccinated by spreading misinformation, and appealed for them to address the issue.  YouTube, which is owned by Google, said 130,000 videos were removed from its platform since last year, when it implemented a ban on content spreading misinformation about Covid vaccines.  In a blog post, the company said it had seen false claims about Covid jabs "spill over into misinformation about vaccines in general". The new policy covers long-approved vaccines, such as those against measles or hepatitis B.  "We're expanding our medical misinformation policies on YouTube with new guidelines on currently administered vaccines that are approved and confirmed to be safe and effective by local health authorities and the WHO," the post said, referring to the World Health Organization."""

model_name = "csebuetnlp/mT5_multilingual_XLSum"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

input_ids = tokenizer(
    [WHITESPACE_HANDLER(article_text)],
    return_tensors="pt",
    padding="max_length",
    truncation=True,
    max_length=512
)["input_ids"]

output_ids = model.generate(
    input_ids=input_ids,
    max_length=84,
    no_repeat_ngram_size=2,
    num_beams=4
)[0]

summary = tokenizer.decode(
    output_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False
)

print(summary)