In [None]:
# Pin torch/torchtext to a matching release pair; the notebook imports `torchtext.legacy`,
# which newer torchtext releases no longer provide.
# NOTE(review): the recorded output of this cell shows torch 1.8.0 / torchtext 0.9.0 being
# installed, so it seems to come from an earlier run with different pins - re-run to refresh it.
!pip install -U torch==1.10.0 torchtext==0.11.0

# Reload environment: end the current interpreter session so that the freshly installed
# versions are the ones imported afterwards (run the remaining cells after the restart).
exit()

Collecting torch==1.8.0
  Downloading torch-1.8.0-cp37-cp37m-manylinux1_x86_64.whl (735.5 MB)
[K     |████████████████████████████████| 735.5 MB 14 kB/s 
[?25hCollecting torchtext==0.9.0
  Downloading torchtext-0.9.0-cp37-cp37m-manylinux1_x86_64.whl (7.1 MB)
[K     |████████████████████████████████| 7.1 MB 33.5 MB/s 
Installing collected packages: torch, torchtext
  Attempting uninstall: torch
    Found existing installation: torch 1.11.0+cu113
    Uninstalling torch-1.11.0+cu113:
      Successfully uninstalled torch-1.11.0+cu113
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.12.0
    Uninstalling torchtext-0.12.0:
      Successfully uninstalled torchtext-0.12.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.12.0+cu113 requires torch==1.11.0, but you have torch 1.8.0 which is incompatible.
torchaudio 0.11.0

In [None]:
import random
import re
import pandas as pd
import spacy
import torch
import torch.optim as optim
import torch.nn as nn
from torchtext.legacy import data

In [None]:
# Mount Google Drive inside the Colab VM (force_remount=True is passed, so an existing mount is redone).
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
# Base folder on Drive; the dataset path and the model checkpoint path are appended to it later.
g_path = "/content/drive/My Drive/pytorch/"

Mounted at /content/drive


In [None]:
# CSV file with the reviews, relative to g_path (the two strings are concatenated when loading).
data_fl = 'data/IMDB_review_sentiment_small.csv'

In [None]:
#reproducing the same result
# SEED is shared by PyTorch (here) and by Python's `random` module (seeded where the data is split).
SEED = 2021
# Seeds PyTorch's global random number generator; the call returns the generator object,
# which is what the notebook displays as this cell's output.
torch.manual_seed(SEED)

<torch._C.Generator at 0x7fdc52ed3a30>

In [None]:
# English spaCy pipeline; in this notebook only its tokenizer is used (see `tokenizer`).
# NOTE(review): the 'en' shortcut is not accepted by spaCy v3+ (full name: 'en_core_web_sm') -
# confirm which spaCy version the runtime provides.
spacy_en = spacy.load('en')
def clean_data(texts):
    """Normalise a sequence of strings and drop the ones that end up empty.

    ``texts`` may hold whole sentences or single tokens (torchtext calls this
    function with the token list of one example). Every string is processed
    independently:

    1. complete HTML line-break tags (``<br>``, ``<br/>``, ``<br />``) are removed;
    2. every character that is not an ASCII letter or digit (punctuation,
       newlines, ...) is replaced by a space;
    3. a stand-alone ``br`` - what is left of a break tag that the tokenizer
       split apart - is removed;
    4. runs of spaces are collapsed, the string is stripped and lower-cased.

    Unlike a plain ``str.replace('br', '')``, steps 1 and 3 never touch "br"
    inside a real word, so "brutality" stays "brutality" instead of "utality".

    Returns a new list with the cleaned, non-empty strings in their original
    order. A cleaned string can contain spaces (e.g. "don't" -> "don t").
    """
    cleaned_text = []
    for text in texts:
        # remove whole break tags first, while the angle brackets still exist
        text = re.sub(r'<\s*br\s*/?\s*>', ' ', text, flags=re.IGNORECASE)
        # remove punctuation (this also turns newlines into spaces)
        text = re.sub(r'[^a-zA-Z0-9]', ' ', text)
        # remove leftovers of split break tags, but only as a complete word
        text = re.sub(r'\bbr\b', ' ', text)
        # remove multiple spaces, strip and lower the text
        text = re.sub(r' +', ' ', text).strip().lower()

        if text != '':
            cleaned_text.append(text)
    return cleaned_text

def tokenizer(text):
    """Split ``text`` with the tokenizer of the loaded spaCy pipeline and
    return the tokens as a list of plain strings."""
    tokens = []
    for tok in spacy_en.tokenizer(text):
        tokens.append(tok.text)
    return tokens

# Review text: split into tokens by `tokenizer`, then the token list goes through `clean_data`.
# batch_first=True puts the batch dimension first; include_lengths=True makes each batch
# expose a (token ids, lengths) pair, which the training/validation code unpacks.
TEXT = data.Field(preprocessing=clean_data,tokenize=tokenizer,batch_first=True,include_lengths=True)
# Sentiment label, stored as float so it can be compared with the model's sigmoid output.
LABEL = data.LabelField(dtype = torch.float,batch_first=True)
# (attribute name, field) pairs handed to TabularDataset;
# presumably matched to the CSV columns in this order - verify against the file.
fields = [('text',TEXT),('label', LABEL)]

In [None]:
#loading the entire data
def load_data():
  """Read the review CSV located at ``g_path + data_fl`` into a torchtext
  TabularDataset (header row skipped, columns described by ``fields``)."""
  return data.TabularDataset(
      path=g_path + data_fl,
      format='csv',
      fields=fields,
      skip_header=True,
  )

# Load the dataset once; every later cell works on this object.
imdb_data = load_data() 
# Peek at the first example to check the result of tokenizing + cleaning:
# vars(...) prints the example's attributes as a dict, the second print shows text and label.
print(vars(imdb_data.examples[0]))
print(imdb_data.examples[0].text, imdb_data.examples[0].label)

{'text': ['one', 'of', 'the', 'other', 'reviewers', 'has', 'mentioned', 'that', 'after', 'watching', 'just', '1', 'oz', 'episode', 'you', 'll', 'be', 'hooked', 'they', 'are', 'right', 'as', 'this', 'is', 'exactly', 'what', 'happened', 'with', 'me', 'the', 'first', 'thing', 'that', 'struck', 'me', 'about', 'oz', 'was', 'its', 'utality', 'and', 'unflinching', 'scenes', 'of', 'violence', 'which', 'set', 'in', 'right', 'from', 'the', 'word', 'go', 'trust', 'me', 'this', 'is', 'not', 'a', 'show', 'for', 'the', 'faint', 'hearted', 'or', 'timid', 'this', 'show', 'pulls', 'no', 'punches', 'with', 'regards', 'to', 'drugs', 'sex', 'or', 'violence', 'its', 'is', 'hardcore', 'in', 'the', 'classic', 'use', 'of', 'the', 'word', 'it', 'is', 'called', 'oz', 'as', 'that', 'is', 'the', 'nickname', 'given', 'to', 'the', 'oswald', 'maximum', 'security', 'state', 'penitentary', 'it', 'focuses', 'mainly', 'on', 'emerald', 'city', 'an', 'experimental', 'section', 'of', 'the', 'prison', 'where', 'all', 'the',

In [None]:
#splitting the data into training and validation dataset
def split_data(imdb_data, split_ratio=0.7, seed=None):
  """Split the dataset reproducibly into a training and a validation part.

  imdb_data   : dataset object providing ``split(split_ratio=..., random_state=...)``
  split_ratio : share of the examples that goes to the training part (default 0.7)
  seed        : seed for Python's ``random`` module; ``None`` means the notebook-wide SEED

  Returns ``(train_data, valid_data)``.

  ``random.seed()`` returns None, so it cannot be passed as ``random_state`` directly.
  The generator is seeded first and its state (``random.getstate()``) is handed over
  explicitly. That is the state the splitter previously picked up only as a fallback.
  """
  if seed is None:
    seed = SEED
  random.seed(seed)
  train_data, valid_data = imdb_data.split(split_ratio=split_ratio,
                                           random_state=random.getstate())
  return train_data, valid_data

# Create the train/validation split used for the vocabulary, the iterators and training.
train_data, valid_data = split_data(imdb_data)

In [None]:
#generate vocabulary
# Both vocabularies are built from the training split only.
# min_freq=3 is the minimum count a token needs to get its own entry;
# vectors="glove.6B.100d" attaches the 100-dimensional GloVe vectors to TEXT.vocab
# (the recorded output shows them being downloaded on first use).
TEXT.build_vocab(train_data,min_freq=3,vectors = "glove.6B.100d")  
LABEL.build_vocab(train_data)

#No. of unique tokens in text
print("Size of TEXT vocabulary:",len(TEXT.vocab))
#No. of unique tokens in label
print("Size of LABEL vocabulary:",len(LABEL.vocab))

.vector_cache/glove.6B.zip: 862MB [02:40, 5.36MB/s]                           
100%|█████████▉| 399999/400000 [00:16<00:00, 24099.28it/s]


Size of TEXT vocabulary: 466
Size of LABEL vocabulary: 2


In [None]:
#preparing batches for training the model
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  

#set batch size
BATCH_SIZE = 5

#Load an iterator
# sort_key: the examples are compared by their number of tokens.
# sort_within_batch=True additionally sorts the examples inside each batch by that key;
# presumably longest first, the order pack_padded_sequence expects by default - TODO confirm.
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data), 
    batch_size = BATCH_SIZE,
    sort_key = lambda x: len(x.text),
    sort_within_batch=True,
    device = device)

In [None]:
class Attention(nn.Module):
  '''
  Attention over the time steps of a sequence of hidden states.

  For hidden states x the layer computes the scores  e = tanh(x @ W) @ u,
  turns them into weights with a softmax along the sequence axis and returns
  the hidden states scaled by those weights (plus the weights themselves).
  Padded time steps are excluded from the softmax, so the weights of the
  real time steps of every sequence add up to 1.
  '''

  def __init__(self, feature_dim, batch_first, **kwargs):
    '''
      feature_dim is the size of the last axis of the input
                  (2 * hidden_dim for a bidirectional lstm)
      batch_first tells whether the input is [batch_size, seq_len, feature_dim] (True)
                  or [seq_len, batch_size, feature_dim] (False)
    '''
    #Constructor
    super(Attention, self).__init__(**kwargs)

    #variables
    self.batch_first = batch_first

    #attention parameters (will be learned via back propagation)
    self.W = nn.Parameter(torch.empty(feature_dim, feature_dim))
    nn.init.kaiming_uniform_(self.W)

    self.u = nn.Parameter(torch.empty(feature_dim, 1))
    nn.init.kaiming_uniform_(self.u)


  def forward(self, x, x_len, mask=None):
    '''
      x     is the hidden states (output) from lstm layer
            ~ [batch_size, seq_len, feature_dim] (axes 0 and 1 swapped if not batch_first)
      x_len contains the length of each row (sentence) in the batch ~ [batch_size];
            pass None to attend over every time step
      mask  optional boolean tensor laid out like the first two axes of x,
            True marks a real time step; if given it is used instead of x_len

      returns (weighted_ip, a_ij)
            weighted_ip ~ same shape as x, every hidden state scaled by its weight
            a_ij        ~ shape of x with a last axis of size 1, the attention weights

      note: a sequence without any real time step gets NaN weights
            (softmax over an empty set)
    '''

    # the softmax has to run over the time steps, wherever that axis sits
    seq_dim = 1 if self.batch_first else 0
    max_len = x.size(seq_dim)

    #[batch_size, seq_len, 1]
    e_ij = torch.matmul(
                torch.tanh(
                      torch.matmul(x, #[batch_size, seq_len, feature_dim]
                                   self.W #[feature_dim, feature_dim]
                                  ) #[batch_size, seq_len, feature_dim]
                ), #[batch_size, seq_len, feature_dim]
                self.u #[feature_dim, 1]
            ) #[batch_size, seq_len, 1]

    # derive the padding mask from the lengths when the caller gave none
    if mask is None and x_len is not None:
      lengths = torch.as_tensor(x_len, device=x.device)
      positions = torch.arange(max_len, device=x.device)
      #[batch_size, seq_len] ~ True where position < length of that row
      mask = positions.unsqueeze(0) < lengths.unsqueeze(1)
      if not self.batch_first:
        mask = mask.transpose(0, 1)

    if mask is not None:
      mask = mask.to(device=x.device, dtype=torch.bool)
      if mask.dim() == e_ij.dim() - 1:
        mask = mask.unsqueeze(-1)
      # a score of -inf becomes a weight of exactly 0 after the softmax
      e_ij = e_ij.masked_fill(~mask, float('-inf'))

    #[batch_size, seq_len, 1]
    a_ij = torch.softmax(e_ij, dim=seq_dim)

    # multiply each hidden state with the attention weights
    #[batch_size, seq_len, feature_dim]
    weighted_ip = x * a_ij

    return weighted_ip, a_ij
    

class Classifier(nn.Module):
  '''
  Binary text classifier:
  embedding -> bidirectional LSTM -> attention -> sum over time -> linear -> sigmoid
  '''

  def __init__(self, vocab_size, embedding_dim=100, hidden_dim=32, num_layers=1, **kwargs):
    '''
      vocab_size    number of rows of the embedding table
      embedding_dim size of each word vector (default 100)
      hidden_dim    hidden size of the lstm per direction (default 32)
      num_layers    number of stacked lstm layers (default 1)
    '''
    #Constructor
    super(Classifier, self).__init__(**kwargs)

    # variables
    self.embedding_dim = embedding_dim
    self.hidden_dim = hidden_dim
    self.num_layers = num_layers
    self.bidirectional = True
    self.batch_first = True
    self.output_dim = 1

    #embedding layer
    self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                  embedding_dim=self.embedding_dim)

    #lstm layer
    self.lstm = nn.LSTM(input_size=self.embedding_dim,
                        hidden_size=self.hidden_dim,
                        num_layers=self.num_layers,
                        bidirectional=self.bidirectional,
                        batch_first=self.batch_first)

    # the lstm is bidirectional, so every time step has 2 * hidden_dim features
    self.attn = Attention(self.hidden_dim * 2, batch_first=self.batch_first)

    #dense layer / linear layer
    self.fc = nn.Linear(self.hidden_dim * 2, self.output_dim)

    #activation function
    self.act = nn.Sigmoid()

  def forward(self, txt, txt_len):
    '''
    # txt [batch_size, seq_len]
    ~ seq_len is the max sequence length among all the rows in the batch
    ~ rows shorter than seq_len are padded (padding is done batchwise)
    # txt_len [batch_size]
    ~ contains the sequence length of each row in the batch

    returns out ~ [batch_size, output_dim], the sigmoid output in (0, 1)
    '''

    '''
    Step 1: pass through the embedding layer to convert text into vectors
    '''
    # embed_txt ~ [batch_size, seq_len, embedding_dim]
    embed_txt = self.embedding(txt)

    '''
    Step 2.1: pack the embeddings so the lstm skips the padded positions
    '''
    # pack_padded_sequence needs the lengths on the cpu, even if the batch is on the gpu;
    # enforce_sorted=False lets the rows come in any order (they are returned in the same order)
    cpu_lengths = torch.as_tensor(txt_len).cpu()
    packed_embed = nn.utils.rnn.pack_padded_sequence(embed_txt, cpu_lengths,
                                                     batch_first=self.batch_first,
                                                     enforce_sorted=False)

    '''
    Step 2.2: passing the packed input to LSTM layer
    '''
    # the final hidden / cell states are not needed, only the per-step outputs
    packed_out, _ = self.lstm(packed_embed)

    '''
    Step 2.3: retrieving back the lstm output with zero padding
    '''
    # lstm_out ~ [batch_size, seq_len, (2 * hidden_dim)]
    # lengths  ~ [batch_size]
    lstm_out, lengths = nn.utils.rnn.pad_packed_sequence(packed_out,
                                                         batch_first=self.batch_first)

    '''
    Step 3: passing the lstm output to Attention layer to get weighted output sequence
    '''
    # attn_out ~ [batch_size, seq_len, (2 * hidden_dim)]
    attn_out, _ = self.attn(lstm_out, lengths, mask=None)

    '''
    sum all the weighted hidden states (modified by attention)
    '''
    # [batch_size, (2 * hidden_dim)]
    weighted_sum_ip = attn_out.sum(dim=1)

    '''
    feeding the weighted value to a linear layer
    '''
    # fc_out ~ [batch_size, output_dim]
    fc_out = self.fc(weighted_sum_ip)

    '''
    feeding the linear output to activation function
    '''
    # out ~ [batch_size, output_dim]
    out = self.act(fc_out)

    return out

In [None]:
#define hyperparameters
vocab_size = len(TEXT.vocab)

#instantiate the model
train_model = Classifier(vocab_size)

# Start from the pre-trained GloVe vectors that were attached to the vocabulary when it was
# built; without this copy they are never used and the embedding layer trains from random
# weights. The rows of TEXT.vocab.vectors follow the vocabulary indices, so the table can be
# copied as a whole (the vector size must equal the model's embedding_dim).
pretrained_vectors = TEXT.vocab.vectors
if pretrained_vectors is not None:
  with torch.no_grad():
    train_model.embedding.weight.copy_(pretrained_vectors)

train_model = train_model.to(device)

In [None]:
#define metric
def binary_accuracy(preds, y):
    """Share of predictions that match the targets.

    preds : tensor of probabilities; each value is rounded to the closest integer
    y     : tensor of targets, compared element-wise with the rounded predictions

    Returns a scalar tensor: number of matches divided by the size of the first axis.
    """
    hits = torch.eq(torch.round(preds), y).float()
    return hits.sum() / len(hits)

In [None]:
def valid_model(valid_iterator, train_model, criterion):
  """Run the model over every validation batch without tracking gradients.

  valid_iterator : iterable of batches; each batch exposes ``text`` as a
                   (token ids, lengths) pair and ``label``
  train_model    : the model, called as ``train_model(text, lengths)``
  criterion      : loss function, called as ``criterion(preds, labels)``

  Returns ``(loss, accuracy)``, each the mean of the per-batch values.
  The function does not switch the model between train and eval mode.
  """
  batch_losses = []
  batch_accs = []

  with torch.no_grad():
    for valid_batch in valid_iterator:
      # the text field carries the padded token ids together with the true lengths
      text, text_lengths = valid_batch.text

      # model output flattened to one probability per example
      preds = train_model(text, text_lengths).squeeze(-1)

      batch_losses.append(criterion(preds, valid_batch.label).item())
      batch_accs.append(binary_accuracy(preds, valid_batch.label).item())

  n_batches = len(valid_iterator)
  valid_epoc_loss = sum(batch_losses) / n_batches
  valid_epoch_acc = sum(batch_accs) / n_batches

  return valid_epoc_loss, valid_epoch_acc

In [None]:
#training the model

#define the optimizer
optimizer = optim.Adam(train_model.parameters())

#define the loss (binary cross entropy on the sigmoid output of the model)
criterion = nn.BCELoss()
criterion = criterion.to(device)

#set the model in training phase
train_model.train()

N_EPOCHS = 6
VALIDATION_EPOCH = 2  # validate and report every VALIDATION_EPOCH epochs

# epochs are numbered 0 .. N_EPOCHS (inclusive) so that the last one is reported as well
for epoch in range(N_EPOCHS+1):

  #initialize every epoch
  epoch_loss = 0
  epoch_acc = 0

  for batch in train_iterator:
    #resets the gradients after every batch
    optimizer.zero_grad()

    #retrieve text and no. of words
    text, text_lengths = batch.text

    #get prediction
    predictions = train_model(text, text_lengths)
    preds = predictions.squeeze(-1) #convert to 1D tensor

    #compute the loss
    loss = criterion(preds, batch.label)

    #compute the binary accuracy
    acc = binary_accuracy(preds, batch.label)

    #backpropagate the loss and compute the gradients
    loss.backward()

    #update the weights
    optimizer.step()

    # accumulate loss and accuracy (averaged over the batches when reported)
    epoch_loss += loss.item()
    epoch_acc += acc.item()

  if epoch%VALIDATION_EPOCH == 0:
    train_model.eval() # set the model in eval phase
    valid_epoc_loss, valid_epoch_acc = valid_model(valid_iterator, train_model, criterion)
    train_model.train() # return back to training phase

    print("epoch:- ",epoch)
    print("training===> ","loss:- ", epoch_loss / len(train_iterator), "  accuracy:- ", epoch_acc / len(train_iterator))
    print("validation===> ","loss:- ", valid_epoc_loss, "  accuracy:- ", valid_epoch_acc)

# Save once, after the final epoch. Saving at epoch N_EPOCHS-1 (as before) stored weights
# that were one epoch behind the model whose metrics are reported last.
torch.save(train_model.state_dict(), g_path+"model/classification_model.pt")

epoch:-  0
training===>  loss:-  0.6995312401226589   accuracy:-  0.4857142950807299
validation===>  loss:-  0.6951013008753458   accuracy:-  0.4000000059604645
epoch:-  2
training===>  loss:-  0.639090929712568   accuracy:-  0.8571428656578064
validation===>  loss:-  0.7025701999664307   accuracy:-  0.46666667858759564
epoch:-  4
training===>  loss:-  0.5566349625587463   accuracy:-  0.9428571462631226
validation===>  loss:-  0.703237255414327   accuracy:-  0.5333333412806193
epoch:-  6
training===>  loss:-  0.4354144334793091   accuracy:-  0.9714285731315613
validation===>  loss:-  0.7122084498405457   accuracy:-  0.5333333412806193


In [None]:
###  Inference  ###

#define hyperparameters
vocab_size = len(TEXT.vocab)

#instantiate the model
test_model = Classifier(vocab_size)
test_model = test_model.to(device)

#loading the model
model_path = g_path+"model/classification_model.pt"
# map_location lets a checkpoint written on a GPU runtime be loaded on a CPU runtime (and vice versa)
test_model.load_state_dict(torch.load(model_path, map_location=device))

test_model.eval() # set the model in eval phase


# sentences to try; the last entry is the one that gets scored
example_sentences = [
    "Are there any sports that you don't like?",
    "I love the movie",
    "I dislike the movie",
    "I don't like the movie",
]
test_sentence = example_sentences[-1]

test_data = " ".join(clean_data(test_sentence.split(" "))) # clean the data
tokenized_test_data = tokenizer(test_data)  #tokenize the sentence

indexed_test_data = [TEXT.vocab.stoi[t] for t in tokenized_test_data]  #convert to integer sequence
txt_tensor = torch.LongTensor(indexed_test_data).to(device) #convert to tensor
txt_tensor_ip = txt_tensor.unsqueeze(0) #reshape to [batch_size = 1, no. of words]

length = [len(indexed_test_data)]  #compute no. of words
length_tensor_ip = torch.LongTensor(length) #convert to tensor

# no gradients are needed for a prediction
with torch.no_grad():
  prediction = test_model(txt_tensor_ip, length_tensor_ip) #prediction

print(prediction.item())

0.45333701372146606


**Example: computing the attention weights step by step on a toy batch**

---



In [None]:
# Toy batch as a plain nested list: 5 sentences with 2, 3, 3, 1 and 3 words,
# every word represented by a vector of 4 numbers.
# The sentences have different lengths, so this list is ragged.
#[batch_size, (seq_len ~ variable), embed_dim]
x = [
        [
         [0.16, 0.57, 0.12, 0.84],
         [0.64, 0.28, 0.42, 0.86]
        ],

        [
         [0.20, 0.91, 0.26, 0.16],
         [0.75, 0.32, 0.25, 0.75],
         [0.15, 0.16, 0.70, 0.48]
        ],

        [
         [0.91, 0.10, 0.74, 0.22],
         [0.25, 0.42, 0.29, 0.26],
         [0.51, 0.70, 0.12, 0.26]
        ],

        [
         [0.17, 0.91, 0.77, 0.88]
        ],

        [
         [0.35, 0.90, 0.18, 0.46],
         [0.44, 0.33, 0.16, 0.43],
         [0.10, 0.97, 0.10, 0.70]
        ]
      ]

In [None]:
# First define the input to the attention: a tensor of shape
# [batch_size, seq_len, embed_dim]
#   batch_size : the number of input sentences at a time
#   seq_len    : the (max) number of words among the input sentences
#   embed_dim  : the vector dimension of each word (depends on the RNN/LSTM hidden_dim)
batch_size, seq_len, embed_dim = 5, 3, 4

# one line per sentence, one inner list per word
#[batch_size, seq_len, embed_dim]
x_padded = torch.Tensor([
    [[0.16, 0.57, 0.12, 0.84], [0.64, 0.28, 0.42, 0.86], [0.41, 0.40, 0.12, 0.82]],
    [[0.20, 0.91, 0.26, 0.16], [0.75, 0.32, 0.25, 0.75], [0.15, 0.16, 0.70, 0.48]],
    [[0.91, 0.10, 0.74, 0.22], [0.25, 0.42, 0.29, 0.26], [0.51, 0.70, 0.12, 0.26]],
    [[0.17, 0.91, 0.77, 0.88], [0.10, 0.14, 0.60, 0.74], [0.94, 0.43, 0.77, 0.95]],
    [[0.35, 0.90, 0.18, 0.46], [0.44, 0.33, 0.16, 0.43], [0.10, 0.97, 0.10, 0.70]],
])

In [None]:
# Let's define the weights to be learned.
# These weights are what produces the attention weight of each word in each sentence.
# --> they should be randomly initialized at the beginning
# --> they should be defined as nn.Parameter so they can be learned by backward propagation
# For this walkthrough they are fixed numbers, so the results can be checked by hand.

#[embed_dim, embed_dim]
W = torch.Tensor([[-0.88, 0.09, 0.04, 0.00],
                  [ 0.37, 0.00, 0.37, 0.83],
                  [ 0.94, 0.37, 0.40, 0.93],
                  [ 0.72, 0.28, 0.09, 0.00]])

#[embed_dim, 1]
u = torch.Tensor([[0.72], [0.3], [0.64], [0.00]])

In [None]:
# Step by step: project every word vector with W ...
#[batch_size, seq_len, embed_dim]
tmp_1 = x_padded @ W
print("first matmul shape:- \n", tmp_1.shape)
print("first matmul:- \n", tmp_1)

# ... squash the projection into (-1, 1) ...
#[batch_size, seq_len, embed_dim]
tmp_2 = tmp_1.tanh()
print("tanh shape:- \n", tmp_2.shape)
print("tanh:- \n", tmp_2)

# ... and reduce every word to a single raw score with u
#[batch_size, seq_len, 1]
tmp_3 = tmp_2 @ u
print("second matmul shape:- \n", tmp_3.shape)
print("second matmul:- \n", tmp_3)


# The same three steps written as one expression
#[batch_size, seq_len, 1]
attn = (x_padded @ W).tanh() @ u

first matmul shape:- 
 torch.Size([5, 3, 4])
first matmul:- 
 tensor([[[0.7877, 0.2940, 0.3409, 0.5847],
         [0.5544, 0.4538, 0.3746, 0.6230],
         [0.4904, 0.3109, 0.2862, 0.4436]],

        [[0.5203, 0.1590, 0.4631, 0.9971],
         [0.2334, 0.3700, 0.3159, 0.4981],
         [0.9308, 0.4069, 0.3884, 0.7838]],

        [[0.0902, 0.4173, 0.3892, 0.7712],
         [0.3952, 0.2026, 0.3048, 0.6183],
         [0.1102, 0.1631, 0.3508, 0.6926]],

        [[1.5445, 0.5466, 0.7307, 1.4714],
         [1.0606, 0.4382, 0.3624, 0.6742],
         [0.7397, 0.6355, 0.5902, 1.0730]],

        [[0.5254, 0.2269, 0.4604, 0.9144],
         [0.1949, 0.2192, 0.2424, 0.4227],
         [0.8689, 0.2420, 0.4659, 0.8981]]])
tanh shape:- 
 torch.Size([5, 3, 4])
tanh:- 
 tensor([[[0.6571, 0.2858, 0.3283, 0.5261],
         [0.5038, 0.4250, 0.3580, 0.5532],
         [0.4545, 0.3013, 0.2786, 0.4166]],

        [[0.4779, 0.1577, 0.4326, 0.7604],
         [0.2293, 0.3540, 0.3058, 0.4606],
         [0.7310, 0.

In [None]:
# Softmax along the word axis (dim=1): inside every sentence the weights add up to 1
#[batch_size, seq_len, 1]
attn_score = attn.softmax(dim=1)
print("attn_score shape:- \n", attn_score.shape)
print("attn_score:- \n", attn_score)

attn_score shape:- 
 torch.Size([5, 3, 1])
attn_score:- 
 tensor([[[0.3581],
         [0.3407],
         [0.3012]],

        [[0.3277],
         [0.2679],
         [0.4044]],

        [[0.3300],
         [0.3645],
         [0.3055]],

        [[0.3956],
         [0.2948],
         [0.3096]],

        [[0.3454],
         [0.2474],
         [0.4073]]])


In [None]:
# Scale every word vector by its weight; the size-1 last axis of attn_score
# is broadcast over the embed_dim axis
#[batch_size, seq_len, embed_dim]
weighted_x = torch.mul(x_padded, attn_score)
print("weighted_x shape:- \n", weighted_x.shape)
print("weighted_x:- \n", weighted_x)

weighted_x shape:- 
 torch.Size([5, 3, 4])
weighted_x:- 
 tensor([[[0.0573, 0.2041, 0.0430, 0.3008],
         [0.2181, 0.0954, 0.1431, 0.2930],
         [0.1235, 0.1205, 0.0361, 0.2470]],

        [[0.0655, 0.2982, 0.0852, 0.0524],
         [0.2009, 0.0857, 0.0670, 0.2009],
         [0.0607, 0.0647, 0.2831, 0.1941]],

        [[0.3003, 0.0330, 0.2442, 0.0726],
         [0.0911, 0.1531, 0.1057, 0.0948],
         [0.1558, 0.2139, 0.0367, 0.0794]],

        [[0.0672, 0.3600, 0.3046, 0.3481],
         [0.0295, 0.0413, 0.1769, 0.2182],
         [0.2910, 0.1331, 0.2384, 0.2941]],

        [[0.1209, 0.3108, 0.0622, 0.1589],
         [0.1088, 0.0816, 0.0396, 0.1064],
         [0.0407, 0.3950, 0.0407, 0.2851]]])


In [None]:
# Add up the weighted word vectors of each sentence: one vector per sentence
#[batch_size, embed_dim]
weighted_sum_x = weighted_x.sum(dim=1)
print("weighted_sum_x shape:- \n", weighted_sum_x.shape)
print("weighted_sum_x:- \n", weighted_sum_x)

weighted_sum_x shape:- 
 torch.Size([5, 4])
weighted_sum_x:- 
 tensor([[0.3989, 0.4200, 0.2222, 0.8408],
        [0.3271, 0.4486, 0.4353, 0.4475],
        [0.5472, 0.3999, 0.3866, 0.2468],
        [0.3877, 0.5344, 0.7199, 0.8604],
        [0.2704, 0.7875, 0.1425, 0.5503]])


**Resources**

---
**Link** <br>
1) https://www.kaggle.com/code/dannykliu/lstm-with-attention-clr-in-pytorch/notebook <br>
2) https://discuss.pytorch.org/t/self-attention-on-words-and-masking/5671/9 <br>
3) https://mlwhiz.com/blog/2019/03/09/deeplearning_architectures_text_classification/ <br>
4) https://www.analyticsvidhya.com/blog/2019/11/comprehensive-guide-attention-mechanism-deep-learning/ <br>
5) https://www.kaggle.com/code/robertke94/pytorch-bi-lstm-attention/notebook <br>
6) https://towardsdatascience.com/sequence-2-sequence-model-with-attention-mechanism-9e9ca2a613a <br>
7) https://towardsdatascience.com/attention-and-its-different-forms-7fc3674d14dc <br>
8) https://www.kaggle.com/code/pavelvod/transformer-cnn-lstm-attention-heads <br>
9) https://www.analyticsvidhya.com/blog/2019/11/comprehensive-guide-attention-mechanism-deep-learning/ <br>
10) https://analyticsindiamag.com/hands-on-guide-to-bi-lstm-with-attention/ <br>
11) https://colab.research.google.com/drive/1HmegzNQR6g5_Xt37BMgV0kX7wQklT7dD?usp=sharing#scrollTo=Vh9bXvzHkmfi <br>
12) https://github.com/prakashpandey9/Text-Classification-Pytorch <br>
13) https://www.programmerall.com/article/51852224642/ <br>
14) https://richliao.github.io/supervised/classification/2016/12/26/textclassifier-RNN/ <br>
15) https://richliao.github.io/supervised/classification/2016/12/26/textclassifier-HATN/ <br>
16) https://lilianweng.github.io/posts/2018-06-24-attention/ <br>
17) https://github.com/WHLYA/text-classification/blob/master/text%20classification/LSTM%2BAttention.ipynb <br>

**Video** <br>
1) https://www.youtube.com/watch?v=Bp-_DatyUCY <br>
2) https://www.youtube.com/watch?v=oaV_Fv5DwUM <br>
3) https://www.youtube.com/watch?v=KmAISyVvE1Y <br>
4) https://www.youtube.com/watch?v=oUhGZMCTHtI <br>
5) https://www.youtube.com/watch?v=MN__lSncZBs <br>
6) https://www.coursera.org/lecture/nlp-sequence-models/attention-model-intuition-RDXpX

**Paper** <br>
1) https://arxiv.org/pdf/1904.02874.pdf <br>
2) https://arxiv.org/pdf/1409.0473.pdf <br>
3) https://arxiv.org/pdf/1804.06659.pdf <br>
4) https://arxiv.org/pdf/1703.03130.pdf <br>
5) https://arxiv.org/ftp/arxiv/papers/1902/1902.02181.pdf <br>
6) https://mdpi-res.com/d_attachment/applsci/applsci-11-03883/article_deploy/applsci-11-03883.pdf?version=1619361335 <br>
7) https://www.cs.cmu.edu/~./hovy/papers/16HLT-hierarchical-attention-networks.pdf <br>
8) https://colinraffel.com/publications/iclr2016feed.pdf <br>
9) http://univagora.ro/jour/index.php/ijccc/article/download/3142/1185/ 