### Week 4: Reccurent Neural Networks (LSTM, GRU, ATTENTION, Transformer, BERT)

```
- Advanced Machine Learning, Innopolis University 
- Professor: Muhammad Fahim 
- Teaching Assistant: Gcinizwe Dlamini
```
<hr>


```
Lab Plan
1. Simple and staked LSTM
2. Transformer based models
3. Homework 1 presentation
4. Lab Task
```

<hr>

In [2]:
import torch
from torch import nn
import torch.optim as optim
import pandas as pd
import numpy as np

# Preliminaries for processing the text
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab
import torchtext
from torchtext.legacy.data import Field, TabularDataset, BucketIterator, Iterator

from torch.utils.data import DataLoader, TensorDataset

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Prepare the data

In [3]:
!pip install wget 
import wget 

Collecting wget
  Downloading https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-cp37-none-any.whl size=9681 sha256=bab85b500916e363920800eb124b7927d973326a41323839c6177f51d0abd451
  Stored in directory: /root/.cache/pip/wheels/40/15/30/7d8f7cea2902b4db79e3fea550d7d7b85ecb27ef992b618f3f
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [4]:
#Download and unzip dataset
wget.download("http://alt.qcri.org/semeval2016/task6/data/uploads/stancedataset.zip")

!unzip stancedataset.zip

Archive:  stancedataset.zip
   creating: StanceDataset/
  inflating: StanceDataset/test.csv  
   creating: __MACOSX/
   creating: __MACOSX/StanceDataset/
  inflating: __MACOSX/StanceDataset/._test.csv  
  inflating: StanceDataset/train.csv  
  inflating: __MACOSX/StanceDataset/._train.csv  


In [5]:
#Read dataset to dataframe

train_data = pd.read_csv("./StanceDataset/train.csv", header=0, engine='python' ,encoding = "latin-1", usecols=["Tweet","Target"])
test_data = pd.read_csv("./StanceDataset/test.csv", header=0, engine='python' ,encoding = "latin-1", usecols=["Tweet","Target"])

test_data.query("Target != 'Donald Trump'",inplace=True)

labels_keys = {value: i for i, (value, count) in enumerate(train_data.Target.value_counts().items())}

train_data['Target'] = train_data['Target'].apply(lambda x: labels_keys.get(x))
test_data['Target'] = test_data['Target'].apply(lambda x: labels_keys.get(x))

In [6]:
#TODO: preprocess each an every sentence (tweet text)
def clean_data(text):
  import pandas as pd
  import numpy as np
  import re
  import nltk
  from nltk.corpus import stopwords

  from keras.preprocessing.text import one_hot
  from keras.preprocessing.sequence import pad_sequences
  from keras.models import Sequential
  from keras.layers.core import Activation, Dropout, Dense
  from keras.layers import Flatten, Conv1D, LSTM
  from keras.layers import GlobalMaxPooling1D
  from keras.layers.embeddings import Embedding
  from sklearn.model_selection import train_test_split
  from keras.preprocessing.text import Tokenizer

  TAG_RE = re.compile(r'<[^>]+>')

  def remove_tags(text):
    return TAG_RE.sub('', text)

  # Removing tags
  sentence = remove_tags(text)

  # Remove punctuations and numbers
  sentence = re.sub('[^a-zA-Z]', ' ', sentence)

  # Single character removal
  sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

  # Removing multiple spaces
  sentence = re.sub(r'\s+', ' ', sentence)

  return sentence



train_data['Tweet'] = train_data['Tweet'].apply(clean_data)
test_data['Tweet'] = test_data['Tweet'].apply(clean_data)

In [7]:
#tokenize and create Vocab
tokenizer = get_tokenizer('basic_english')
counter = Counter()

for _, row in train_data.iterrows():
  counter.update(tokenizer(row["Tweet"]))

vocab = Vocab(counter,specials=("<pad>","<unk>"), min_freq=1)

In [8]:
# Do padding
def data_process(raw_text_iter,max_len=64):
  batch = []
  for item in raw_text_iter:
    res = [vocab[token] for token in tokenizer(item)]
    if len(res) > max_len : 
      res = res[:max_len]
    if len(res) < max_len :
      res += ([vocab["<pad>"]] * (max_len-len(res)))
    batch.append(res)
  pad_data = torch.tensor(batch, dtype=torch.long)
  return pad_data

In [9]:
max_len = 64
embedding_size = 10
n_classes = len(np.unique(train_data.Target.values))

#Create Dataloader
train_tensor = data_process(train_data.Tweet.values)
tgts_tensor = torch.nn.functional.one_hot(torch.from_numpy(train_data.Target.values), n_classes) #torch.from_numpy(train_data.Target.values)

dataset = TensorDataset(train_tensor, tgts_tensor)

loader = DataLoader(dataset, batch_size=2, shuffle=True, pin_memory=True)

## Simple LSTM

In [10]:
class SimpleLstm(nn.Module):
  def __init__(self, embedding_dim ,vocab_size , hidden_dim=10, output_dim=1, n_layers=1):
    super().__init__()
    self.hidden_dim = hidden_dim
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.lstm_layer = nn.LSTM(embedding_dim, hidden_dim, num_layers = n_layers,batch_first = True, bidirectional=True)
    
    self.output_layer = nn.Linear(hidden_dim, output_dim)
      
  def forward(self, x):
    batch_size = x.size(0)
    embedded = self.embedding(x)
    outputs, (hidden, cell) = self.lstm_layer(embedded)
    
    pred = self.output_layer(hidden[-1])
    return pred

vocab_size = len(vocab)
embedding_size = 64
output_dim = len(np.unique(train_data['Target']))
model = SimpleLstm(embedding_dim=embedding_size, vocab_size=vocab_size, hidden_dim=10,output_dim=output_dim).to(device).float()

## Model summary



In [11]:
print(model)

SimpleLstm(
  (embedding): Embedding(8993, 64)
  (lstm_layer): LSTM(64, 10, batch_first=True, bidirectional=True)
  (output_layer): Linear(in_features=10, out_features=5, bias=True)
)


## Train Model

In [25]:
#TODO: Implement Model train function which will return epoch loss and accuracy
def train(model, data_loader, optimizer, criterion, device):
  epoch_loss = 0
  epoch_accuracy = 0

  def accuracy_calculator(preds, y):
    spreds = []
    for pred in preds.to('cpu'):
      spreds.append(np.argmax(torch.softmax(pred)))
    sy = []  
    for tr in y:
      sy.append(np.argmax(tr))
    accuracy = (spreds==sy).float()
    return np.mean(accuracy)

  model.train()
  
  for batch in data_loader:
    x, y = batch
    x = x.to(device)
    y = y.to(device).float()
    optimizer.zero_grad()
    predictions = model(x).squeeze(1)
    loss = criterion(predictions, y)

    acc = accuracy_calculator(predictions, y)
    
    loss.backward()
    optimizer.step()
    
    epoch_loss += loss.item()
    epoch_accuracy += acc.item()

  return epoch_loss / len(data_loader) , epoch_accuracy / len(data_loader) 

In [26]:
# Train loop 
criterion = torch.nn.BCEWithLogitsLoss() 
optimizer = optim.SGD(model.parameters(), lr=1e-3)

criterion = criterion.to(device)
n_epochs = 2

for epoch in range(n_epochs):
  train_loss, train_acc = train(model, loader, optimizer, criterion, device=device)
  print(f"Loss : {train_loss}")

TypeError: ignored

## Transformers & Bert

Sentiment analysis task. We are going to use the [transformers library](https://github.com/huggingface/transformers) to get pre-trained transformers and use them as embedding layers. Its possible to implement from scratch. Bert is one of the popular transformer based models, <br>
* **Name other transformer based state-of-the-art models** <br>

The transformers library can be easily installed with pip <br>`pip install transformers`

In [31]:
!pip install transformers
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

init_token = tokenizer.cls_token
eos_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token

print(init_token, eos_token, pad_token, unk_token)

max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased']

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/81/91/61d69d58a1af1bd81d9ca9d62c90a6de3ab80d77f27c5df65d9a2c1f5626/transformers-4.5.0-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.2MB 5.6MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/08/cd/342e584ee544d044fb573ae697404ce22ede086c9e87ce5960772084cad0/sacremoses-0.0.44.tar.gz (862kB)
[K     |████████████████████████████████| 870kB 26.0MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 32.8MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.44-cp37-none-any.whl size=886084 sha256=7411b

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…


[CLS] [SEP] [PAD] [UNK]


In [33]:
def tokenize_and_trim(sentence):
  tokens = tokenizer.tokenize(sentence) 
  tokens = tokens[:max_input_length-2]
  return tokens

In [34]:
import torch.nn as nn

class BertBasedSentiment(nn.Module):
  def __init__(self, transform_based_model, hidden_dim, output_dim, n_layers, bidirectional, dropout):
    super().__init__()
    self.transform_based_model = transform_based_model

    self.embedding_dim = transform_based_model.config.to_dict()['hidden_size']
    self.gru = nn.GRU(self.embedding_dim, hidden_dim, num_layers = n_layers,bidirectional = bidirectional,batch_first = True,dropout = 0 if n_layers < 2 else dropout)
    self.output_layer = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
    self.dropout = nn.Dropout(dropout)
        
  def forward(self, text):
    # First pass the input text through bert. The output of bert is like embedding Remember: Bert is set to not trainable mode
    with torch.no_grad():
      embed = self.transform_based_model(text)[0]
      
    _, hidden = self.gru(embed)

    if self.gru.bidirectional:
      hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
    else:
      hidden = self.dropout(hidden[-1,:,:])
      
    return self.output_layer(hidden)


In [35]:
from transformers import BertTokenizer, BertModel

bert_model = BertModel.from_pretrained('bert-base-uncased')

hidden_dim = 256
out_dim = len(np.unique(train_data['Target']))
bi_directional = True
dropout_rate = 0.25
n_layers = 2

model = BertBasedSentiment(bert_model, hidden_dim, out_dim, n_layers, bi_directional, dropout_rate).to(device).float()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




In [36]:
#Set all the bert weights non-trainable
for name, param in model.named_parameters():
  if name.startswith('bert'):
    param.requires_grad = False

## Train Bert model

First, we will specify the algorithm to update the model we parameters in the training process - optimizer. The most common is stochastic gradient descent (SGD). Secondly, we will specify the loss calculation function which is selected based on the learning objective (regression, classification, ..). Lastly, all the specified algorithms must be placed in the same training device where the model is

In [37]:
import torch.optim as optim

# TODO: Select the optimizer and loss function/criterion
optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)

In [39]:
epochs = 5

for epoch in range(epochs):
  train_loss, train_acc = train(model, loader, optimizer, criterion, device)
  print(f"Epoch: {epoch}, Loss :{train_loss}, Accuracy: {train_acc}")

Epoch: 0, Loss :0.5324328982858684, Accuracy: 0.7908716649839791
Epoch: 1, Loss :0.526932493803349, Accuracy: 0.7909402991290279
Epoch: 2, Loss :0.5205061641677079, Accuracy: 0.7969801075751436
Epoch: 3, Loss :0.5242551810318424, Accuracy: 0.79396020314754
Epoch: 4, Loss :0.5225017451429989, Accuracy: 0.7948524475752203


## Lab Task 

```
1. Write a predict function that takes in a trained net, a plain tweet text and prints out a tweet topic (label).
2. Add make bi-directional LSTM for the classification of tweet topic. (Modify the simple LSTM model example)
3. Create a validation set from the training data and log the models loss (training and validation) on tensorboard
4. Visialize simple LSTM and transformer based model (bert) perfomence using confussion matrix 

```

<center>Don't forget to commit</center>

## References
1. [Illustrated Guide to LSTM’s and GRU’s: A step by step explanation](http://www.kurious.pub/blog/Illustrated-Guide-to-LSTMs-and-GRUs-A-step-by-step-explanation-6)

2. [BERT Explained: What You Need to Know About Google’s New Algorithm](https://www.searchenginejournal.com/bert-explained-what-you-need-to-know-about-googles-new-algorithm/337247/#close)

3. [Understanding searches better than ever before](https://www.blog.google/products/search/search-language-understanding-bert/)

4. [BERT Explained: State of the art language model for NLP](https://towardsdatascience.com/bert-explained-state-of-the-art-language-model-for-nlp-f8b21a9b6270)

5. [How do Transformers Work in NLP? A Guide to the Latest State-of-the-Art Models](https://www.analyticsvidhya.com/blog/2019/06/understanding-transformers-nlp-state-of-the-art-models/)

6. [A deep dive into BERT: How BERT launched a rocket into natural language understanding](https://searchengineland.com/a-deep-dive-into-bert-how-bert-launched-a-rocket-into-natural-language-understanding-324522)

7. [The Illustrated BERT, ELMo, and co. (How NLP Cracked Transfer Learning)](http://jalammar.github.io/illustrated-bert/)

8. [The Illustrated Transformer](https://jalammar.github.io/illustrated-transformer/)

9. [BERT Word Embeddings Tutorial](https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/)

10. [Universal Language Model Fine-tuning for Text Classification](https://arxiv.org/pdf/1801.06146.pdf)

11. [Efficient multi-lingual language model fine-tuning](https://nlp.fast.ai/)

12. [BERT Text Classification in 3 Lines of Code Using Keras](https://towardsdatascience.com/bert-text-classification-in-3-lines-of-code-using-keras-264db7e7a358)

13. [QUASI-RECURRENT NEURAL NETWORKS](https://arxiv.org/pdf/1611.01576.pdf)

