In [2]:
'''
import statements
'''
import torch
from torch.nn.functional import softmax

# Transformer

# **Transformer Architecture**

---



The most populer attention architectures are as below <br>
1) ELMo (from Allen Institute for AI) <br>
2) GPT/GPT-2/GPT-3  (from OpenAI) <br>
3) BERT (from Google)  <br> 
4) Transformer-XL (from Google/CMU)  <br>
5) XLNet (from Google/CMU)  <br>
6) XLM (from Facebook)  <br>
7) RoBERTa (from Facebook) <br>
8) DistilBERT (from Hugging Face)

# **Transformer Components**

---

1) input Embedding and positional embedding <br>
2) self attention <br>
3) feed forward

# **Transformer Equation**

---

#  **Self Attention**

---

What is self-attention?

A self-attention module takes in n inputs, and returns n outputs. What happens in this module?
In layman’s terms, the self-attention mechanism allows the inputs to interact with each other (“self”) and find out who they should pay more attention to (“attention”). The outputs are aggregates of these interactions and attention scores.


**--- INPUT ---**


--> let's say we have a single sentence with three words => <br>
['I', 'love', 'you']

--> Assume, we get 4 (embed_dim) for each word. <br>
so input is [seq_len, embed_dim] ~ [3, 4]

In [21]:
x = [
  [1, 0, 1, 0], # word 1 ~ I
  [0, 2, 2, 2], # word 2 ~ love
  [1, 1, 1, 1]  # word 3 ~ you
 ]
x = torch.tensor(x, dtype=torch.float32)
x

tensor([[1., 0., 1., 0.],
        [0., 2., 2., 2.],
        [1., 1., 1., 1.]])

--> Next, we have to create **Query (Q)**, **Key (K)** and **Value (V)** <br>
--> Below is linear transformation on the input (x) <br>
$ Q = W_Q X + b_Q$ <br>
$ K = W_K X + b_K$ <br>
$ V = W_V X + b_V$ <br>

--> What should be the dimension od $W_Q, W_K, W_V$ <br>
[embed_dim, model_op_dim] ~ [4, 3] <br>

-> Note: In a neural network setting, these weights are usually small numbers, initialised randomly using an appropriate random distribution like Gaussian, Xavier and Kaiming distributions.

**--- Weights (W_Key, W_Query, W_Value) ---**


In [27]:
w_query = [
  [0, 0, 1],
  [1, 1, 0],
  [0, 1, 0],
  [1, 1, 0]
]
w_key = [
  [1, 0, 1],
  [1, 0, 0],
  [0, 1, 0],
  [1, 0, 1]
]
w_value = [
  [1, 0, 1],
  [1, 1, 0],
  [0, 1, 1],
  [0, 0, 1]
]

w_query = torch.tensor(w_query, dtype=torch.float32)
w_key = torch.tensor(w_key, dtype=torch.float32)
w_value = torch.tensor(w_value, dtype=torch.float32)


print("Weights for query: \n", w_query)
print("Weights for key: \n", w_key)
print("Weights for value: \n", w_value)

Weights for query: 
 tensor([[0., 0., 1.],
        [1., 1., 0.],
        [0., 1., 0.],
        [1., 1., 0.]])
Weights for key: 
 tensor([[1., 0., 1.],
        [1., 0., 0.],
        [0., 1., 0.],
        [1., 0., 1.]])
Weights for value: 
 tensor([[1., 0., 1.],
        [1., 1., 0.],
        [0., 1., 1.],
        [0., 0., 1.]])


--- Derive **Query (Q)**, **Key (K)** and **Value (V)** ---

In [28]:
# dot product = @
Q = x @ w_query
K = x @ w_key
V = x @ w_value

print("Querys: \n", Q)
# tensor([[0., 1., 1.],
#         [4., 6., 0.],
#         [2., 3., 1.]])

print("Keys: \n", K)
# tensor([[0., 1., 1.],
#         [4., 4., 0.],
#         [2., 3., 1.]])


print("Values: \n", V)
# tensor([[1., 2., 3.],
#         [2., 8., 0.],
#         [2., 6., 3.]])

Querys: 
 tensor([[0., 1., 1.],
        [4., 6., 0.],
        [2., 3., 1.]])
Keys: 
 tensor([[1., 1., 1.],
        [4., 2., 2.],
        [3., 1., 2.]])
Values: 
 tensor([[1., 1., 2.],
        [2., 4., 4.],
        [2., 2., 3.]])


**--- Calculate attention scores ---** <br>
$Attention(Q, K, V) = softmax(\frac{QK^T}{\sqrt{d_k}})V$ <br>
$softmax(\frac{QK^T}{\sqrt{d_k}})$ is attention score for each value ($V$) <br>
weighted value is multiplication of attention scores and $V$

In [30]:
# calculate attention score by dot product of queries and keys
attn_scores = Q @ K.T
print(attn_scores)

# tensor([[ 2.,  4.,  3.],  # attention scores from Query 1 ~ [[Q₁ x K₁], [Q₁ x K₂], [Q₁ x K₃]]
#         [ 10., 28., 18.], # attention scores from Query 2 ~ [[Q₂ x K₁], [Q₂ x K₂], [Q₂ x K₃]]
#         [ 6., 16., 11.]]) # attention scores from Query 3 ~ [[Q₃ x K₁], [Q₃ x K₂], [Q₃ x K₃]]

tensor([[ 2.,  4.,  3.],
        [10., 28., 18.],
        [ 6., 16., 11.]])


In [32]:
# dimension of the key vectors is d_k ~ 3, so sqrt(d_k) ~ 1 
attn_scores_scaled = attn_scores / 1
print(attn_scores_scaled)

# tensor([[ 2.,  4.,  4.],  # attention scores from Query 1
#         [ 4., 16., 12.],  # attention scores from Query 2
#         [ 4., 12., 10.]]) # attention scores from Query 3

tensor([[ 2.,  4.,  3.],
        [10., 28., 18.],
        [ 6., 16., 11.]])


In [33]:
# apply softmax to normalize
attn_scores_softmax = softmax(attn_scores_scaled, dim=-1)
print(attn_scores_softmax)
# tensor([[6.3379e-02, 4.6831e-01, 4.6831e-01],
#         [6.0337e-06, 9.8201e-01, 1.7986e-02],
#         [2.9539e-04, 8.8054e-01, 1.1917e-01]])

# For readability, approximate the above as follows
attn_scores_softmax = torch.round(attn_scores_softmax, decimals=1)
print(attn_scores_softmax)
# tensor([[0.1000, 0.7000, 0.2000],
#         [0.0000, 1.0000, 0.0000],
#         [0.0000, 1.0000, 0.0000]])

tensor([[9.0031e-02, 6.6524e-01, 2.4473e-01],
        [1.5229e-08, 9.9995e-01, 4.5398e-05],
        [4.5094e-05, 9.9326e-01, 6.6925e-03]])
tensor([[0.1000, 0.7000, 0.2000],
        [0.0000, 1.0000, 0.0000],
        [0.0000, 1.0000, 0.0000]])


**--- Calculate Weighted Values ---**

In [35]:
'''
[0.1000, 0.7000, 0.2000] ~ [0.1, 0.7, 0.2]
V
0.1 * [1, 1, 2] = [0.1, 0.1, 0.2] # Value for input 1
0.7 * [2, 4, 4] = [1.4, 2.8, 2.8] # Value for input 2
0.2 * [2, 2, 3] = [0.4, 0.4, 0.6] # Value for input 3
                  ---------------
                  [1.9, 3.3, 3.6]

[0.0000, 1.0000, 0.0000] ~ [0.0, 1.0, 0.0]
V
0.0 * [1, 1, 2] = [0., 0., 0.]  # Value for input 1
1.0 * [2, 4, 4] = [2., 4., 4.]  # Value for input 2
0.0 * [2, 2, 3] = [0., 0., 0.]  # Value for input 3
                  -------------
                  [2., 4., 4.]

[0.0000, 1.0000, 0.0000] ~ [0.0, 1.0, 0.0]
V
0.0 * [1, 1, 2] = [0., 0., 0.]  # Value for input 1
1.0 * [2, 4, 4] = [2., 4., 4.]  # Value for input 2
0.0 * [2, 2, 3] = [0., 0., 0.]  # Value for input 3
                  -------------
                  [2., 4., 4.]

'''
output = torch.matmul(attn_scores_softmax, V)
output

tensor([[1.9000, 3.3000, 3.6000],
        [2.0000, 4.0000, 4.0000],
        [2.0000, 4.0000, 4.0000]])

# PyTorch Implementation

In [117]:
import math
import torch
from torch import nn, Tensor

In [229]:
class SelfAttention(nn.Module):

    def __init__(self, conf):
        super(SelfAttention, self).__init__()

        self.embed_dim = conf["embed_dim"]
        self.heads = conf["heads"] #number of heads
        self.head_dim = conf["embed_dim"] // conf["heads"]

        assert (
            self.head_dim * self.heads == conf["embed_dim"]
        ), "Embedding size needs to be divisible by heads"

        self.values = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.keys = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.queries = nn.Linear(self.head_dim, self.head_dim, bias=False)

        self.fc_out = nn.Linear(self.heads * self.head_dim,  self.embed_dim)

    def forward(self, values, keys, query, mask):
        
        '''
        values ~ # [batch, seq_len, embed_dim]
        keys ~ # [batch, seq_len, embed_dim]
        query ~ # [batch, seq_len, embed_dim]
        mask ~ # [batch, 1, 1, seq_len]
        '''

        # Get number of training examples
        batch = query.shape[0]

        '''
        value_len ~ the max seq_len in values 
        key_len ~ the max seq_len in keys 
        query_len ~ the max seq_len in query 
        here all of them are same
        '''
        value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]

        # Split the embedding into self.heads different pieces
        # [batch, seq_len, embed_dim] --converted--> [batch, seq_len, heads, head_dim]
        # [batch, value_len, heads, head_dim]
        values = values.reshape(batch, value_len, self.heads, self.head_dim)

        # [batch, seq_len, embed_dim] --converted--> [batch, seq_len, heads, head_dim]
        # [batch, key_len, heads, head_dim]
        keys = keys.reshape(batch, key_len, self.heads, self.head_dim)

        # [batch, seq_len, embed_dim] --converted--> [batch, seq_len, heads, head_dim]
        # [batch, query_len, heads, head_dim]
        query = query.reshape(batch, query_len, self.heads, self.head_dim)



        # [batch, value_len, heads, head_dim]
        values = self.values(values  # [batch, value_len, heads, head_dim]
                             )
        # [batch, key_len, heads, head_dim]
        keys = self.keys(keys  # [batch, key_len, heads, head_dim]
                         )
        # [batch, query_len, heads, head_dim]
        queries = self.queries(query  # [batch, query_len, heads, head_dim]
                               )

        
        # Einsum does matrix mult. for query*keys for each training example
        # with every other training example, don't be confused by einsum
        # it's just how I like doing matrix multiplication & bmm
        # [batch, head, query_len, key_len]
        energy = torch.einsum("nqhd,nkhd->nhqk", 
                              [queries, # [batch, query_len, heads, head_dim]
                               keys # [batch, key_len, heads, head_dim]
                               ])

        # Mask padded indices so their weights become 0
        if mask is not None:
            # energy = energy.masked_fill(mask == 0, float("-1e20"))
            energy = energy.masked_fill(mask == 0, float("0"))

        # Normalize energy values similarly to seq2seq + attention
        # so that they sum to 1. Also divide by scaling factor for
        # better stability ~ sqrt(d_k)
        # attention shape: [batch, heads, query_len, key_len]
        attention = torch.softmax(energy / (self.embed_dim ** (1 / 2)), dim=3)

        '''
        here key_len and value_len are same ~ denoted by l 
        '''
        # [batch, query_len, heads, head_dim]
        out = torch.einsum("nhql,nlhd->nqhd", [attention,  # [batch, heads, query_len, key_len]
                                               values  # [batch, value_len, heads, head_dim]
                                               ])
        # [batch, query_len, heads, head_dim] --converted--> [batch, query_len, embed_dim]
        out = out.reshape(batch, query_len, self.heads * self.head_dim)


        # Linear layer doesn't modify the shape, final shape will be
        # [batch, query_len, embed_dim]
        out = self.fc_out(out # [batch, query_len, embed_dim]
                          )

        return out

In [230]:
class TransformerBlock(nn.Module):
    def __init__(self, conf):
        super(TransformerBlock, self).__init__()
        self.attention = SelfAttention(conf)
        self.norm1 = nn.LayerNorm(conf["embed_dim"])
        self.norm2 = nn.LayerNorm(conf["embed_dim"])

        self.feed_forward = nn.Sequential(
            nn.Linear(conf["embed_dim"], 
                      conf["forward_expansion"] * conf["embed_dim"]),
            nn.ReLU(),
            nn.Linear(conf["forward_expansion"] * conf["embed_dim"], 
                      conf["embed_dim"]),
        )

        self.dropout = nn.Dropout(conf["dropout"])

    def forward(self, value, key, query, mask):

      '''
      value ~ # [batch, seq_len, embed_dim]
      key ~ # [batch, seq_len, embed_dim]
      query ~ # [batch, seq_len, embed_dim]
      mask ~ # [batch, 1, 1, seq_len]
      '''

      '''
      Step 1: passing the value, key, and query to self attention layer
      '''
      # [batch, seq_len, embed_dim] 
      attention = self.attention(value,  # [batch, seq_len, embed_dim]
                                 key,  # [batch, seq_len, embed_dim]
                                 query,  # [batch, seq_len, embed_dim]
                                 mask  # [batch, 1, 1, seq_len]
                                 )

      '''
      Step 2: normalizing the output
      '''
      # [batch, seq_len, embed_dim]   
      # Add skip connection, run through normalization and finally dropout
      x = self.dropout(self.norm1(attention + query))
      forward = self.feed_forward(x)
      out = self.dropout(self.norm2(forward + x))

      return out

In [231]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, max_len: int = 5000, dropout: float = 0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def pos_mask(self, src, conf):
        src_mask = (src != conf["src_pad_idx"])
        # [batch, seq_len, embed_dim]
        # the values are True or False
        return src_mask

    def forward(self, x: Tensor, conf: dict) -> Tensor:
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]
        """
        pos_mask_val = self.pos_mask(x, conf)

        x = x + self.pe[:x.size(0)]
        x = x.masked_fill(pos_mask_val == 0, float("0"))

        return self.dropout(x)

In [232]:
class Transformer(nn.Module):
    def __init__(
        self,
        src_vocab_size,
        conf
    ):

        super(Transformer, self).__init__()
        
        self.word_embedding = nn.Embedding(num_embeddings = src_vocab_size, 
                                           embedding_dim = conf["embed_dim"],
                                           padding_idx=0)
        
        #should be same, unleass you do not want to consider all the vocabularies
        max_length = src_vocab_size 
        self.position_embedding = PositionalEncoding(conf["embed_dim"],
                                                      max_length)

        self.layers = nn.ModuleList(
            [
                TransformerBlock(conf)
                for _ in range(conf["num_Tlayers"])
            ]
        )
        self.dropout = nn.Dropout(conf["dropout"])

    def forward(self, x, mask):

        '''
        x ~ # [batch, seq_len]
        mask ~ # [batch, 1, 1, seq_len]
        '''

        '''
        Step 1: pass through the embedding layer to convert text into vectors
        '''
        # x_embed ~ [batch, seq_len, embed_dim] 
        x_embed = self.word_embedding(x # [batch, seq_len]
                                      )  
        
        '''
        Step 2: position_embedding incorporates the position information
        '''
        if conf["ps_embed_enc_ind"]:
          x_embed_with_position = self.position_embedding(x_embed, conf)
          x_embed = x_embed_with_position

        out = self.dropout(x_embed)

        '''
        Step 3: passing the embeddings to the transformer block
        '''
        # In the Encoder the query, key, value are all the same
        for t_layer in self.layers:
          # [batch, seq_len, embed_dim]
          out = t_layer(out,  # [batch, seq_len, embed_dim]
                        out,  # [batch, seq_len, embed_dim]
                        out,  # [batch, seq_len, embed_dim]
                        mask  # [batch, 1, 1, seq_len]
                      )

        return out

In [233]:
class TransformerClassifier(nn.Module):
    def __init__(
        self, src_vocab_size, conf
    ):

        super(TransformerClassifier, self).__init__()

        self.src_pad_idx = conf["src_pad_idx"]
        self.device = conf["device"]

        self.encoder = Transformer(
            src_vocab_size,
            conf
        )

        self.output_dim = 1
        #dense layer / linear layer
        self.fc = nn.Linear(conf["embed_dim"], self.output_dim)

        #activation function
        self.act = nn.Sigmoid()

    def make_src_mask(self, src):
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        # [batch, 1, 1, seq_len]
        # the values are True or False
        return src_mask.to(self.device)

    def forward(self, src):
      
      '''
      src ~ # [batch, seq_len]
      '''

      '''
      Step 1: create mask for the attention layer
      '''
      # [batch, 1, 1, seq_len]
      # the values are True or False
      src_mask = self.make_src_mask(src # [batch, seq_len]
                                    )

      '''
      Step 2: passing the input to transformer
      '''
      # [batch, seq_len, embed_dim]
      enc_out = self.encoder(src,  # [batch, seq_len]
                             src_mask  # [batch, 1, 1, seq_len]
                             )
      '''
      Step 3: sum all the embed dim of each word in a sentence
      '''
      # [batch_size, embed_dim]
      weighted_out = enc_out.sum(dim=1)

      '''
      Step 4: feeding the weighted value to a linear layer
      '''
      # fc_out ~ [batch_size, output_dim]
      fc_out = self.fc(weighted_out  # [batch_size, embed_dim]
                       )

      '''
      Step 5: feeding the linear output to activation function
      '''
      # out ~ [batch_size, output_dim]
      out = self.act(fc_out)

      return out

In [234]:
conf = {
    "device" : torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    "src_pad_idx" : 0,
    "embed_dim" : 6,
    "ps_embed_enc_ind": True,
    "num_Tlayers" : 2,
    "heads" : 2,
    "dropout" : 0.1,
    "forward_expansion" : 4
}
'''
In this example the highest value of word index is 10
'''
src_vocab_size = 10 
'''
<start> ~ 1
<end> ~ 2
<pad> ~ 0
other words ~ [3 <--> 9]
'''
# [batch, seq_len] ~ torch.Size([4, 9])
x_batch = torch.tensor([
                  [
                    [1, 5, 6, 4, 3, 9, 5, 6, 2], 
                    [1, 8, 7, 3, 4, 5, 8, 2, 0],
                    [1, 8, 7, 6, 4, 7, 3, 2, 0],
                    [1, 5, 6, 4, 3, 2, 0, 0, 0]
                  ],
                  [
                    [1, 5, 6, 4, 2, 0, 0, 0, 0], 
                    [1, 8, 7, 2, 0, 0, 0, 0, 0],
                    [1, 8, 7, 2, 0, 0, 0, 0, 0],
                    [1, 2, 0, 0, 0, 0, 0, 0, 0]
                  ]
                ])

classifier = TransformerClassifier(src_vocab_size, conf)
for x in x_batch:
  # [batch, 1] ~ torch.Size([4, 1])
  out = classifier(x)
  print(out)

tensor([[0.9806],
        [0.8411],
        [0.8522],
        [0.9928]], grad_fn=<SigmoidBackward0>)
tensor([[0.9987],
        [0.9020],
        [0.7238],
        [0.9970]], grad_fn=<SigmoidBackward0>)


# Hugging Face Implementation

In [111]:
!pip install transformers 

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 13.8 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 66.3 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 6.4 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 71.9 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 48.6 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml

**GPT-2**

---



In [112]:
from transformers import pipeline, set_seed
'''
Text generation
'''
generator = pipeline('text-generation', model='gpt2')
generator("Hello, I like to play cricket,", max_length=60, num_return_sequences=7)

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/523M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Hello, I like to play cricket, just for fun.\n\n\nI would also like to thank my dad for making him a hero for that time. He has led the fight for the survival of the tribe. That was my greatest strength.\n\n\nAlso, we must mention one big thanks to'},
 {'generated_text': 'Hello, I like to play cricket, I like swimming. If that doesn\'t make the difference, just to relax and think about the future for the players and their future, I\'m sure that people will say I am a bit of a fan," he said.\n\nThat might be the way'},
 {'generated_text': "Hello, I like to play cricket, my friends call me 'the hula'); that's a bit too much for the average girl to handle. It reminds me a bit of my early days in India, when I learned to play cricket. But now I just sit there and watch.\n\n"},
 {'generated_text': 'Hello, I like to play cricket, but I never felt as good as I felt at any given moment. At the time, everyone expected Pakistan to win there. In fact, I had just lost because 

In [113]:
from transformers import pipeline, set_seed
'''
Sentiment analysis
'''
# Allocate a pipeline for sentiment-analysis
classifier = pipeline('sentiment-analysis')
classifier('The secret of getting ahead is getting started.')

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)


Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

[{'label': 'POSITIVE', 'score': 0.9970657229423523}]

In [116]:
from transformers import pipeline, set_seed
'''
Question Answering
'''
# Allocate a pipeline for question-answering
question_answerer = pipeline('question-answering')
question_answerer({
    'question': 'What is Newton\'s third law of motion?',
    'context': 'Newton\'s third law of motion states that, "For every action there is equal and opposite reaction"'})

No model was supplied, defaulted to distilbert-base-cased-distilled-squad (https://huggingface.co/distilbert-base-cased-distilled-squad)


Downloading:   0%|          | 0.00/473 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/249M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

{'answer': '"For every action there is equal and opposite reaction"',
 'end': 97,
 'score': 0.6134567856788635,
 'start': 42}

**BERT**

---



In [None]:
from transformers import pipeline, set_seed
'''
Text prediction
'''
unmasker = pipeline('fill-mask', model='bert-base-cased')
unmasker("Hello, My name is [MASK].")

In [None]:
from transformers import pipeline, set_seed
'''
Text Summarization
'''
#Summarization is currently supported by Bart and T5.
summarizer = pipeline("summarization")

ARTICLE = """The Apollo program, also known as Project Apollo, was the third United States human spaceflight program carried out by the National Aeronautics and Space Administration (NASA), which accomplished landing the first humans on the Moon from 1969 to 1972.
First conceived during Dwight D. Eisenhower's administration as a three-man spacecraft to follow the one-man Project Mercury which put the first Americans in space,
Apollo was later dedicated to President John F. Kennedy's national goal of "landing a man on the Moon and returning him safely to the Earth" by the end of the 1960s, which he proposed in a May 25, 1961, address to Congress.
Project Mercury was followed by the two-man Project Gemini (1962-66).
The first manned flight of Apollo was in 1968.
Apollo ran from 1961 to 1972, and was supported by the two-man Gemini program which ran concurrently with it from 1962 to 1966.
Gemini missions developed some of the space travel techniques that were necessary for the success of the Apollo missions.
Apollo used Saturn family rockets as launch vehicles.
Apollo/Saturn vehicles were also used for an Apollo Applications Program, which consisted of Skylab, a space station that supported three manned missions in 1973-74, and the Apollo-Soyuz Test Project, a joint Earth orbit mission with the Soviet Union in 1975.
"""

summary=summarizer(ARTICLE, max_length=130, min_length=30, do_sample=False)[0]

print(summary['summary_text'])

In [None]:
from transformers import pipeline, set_seed
'''
English to German translation
'''
# English to German
translator_ger = pipeline("translation_en_to_de")
print("German: ",translator_ger("Joe Biden became the 46th president of U.S.A.", max_length=40)[0]['translation_text'])

In [None]:
from transformers import pipeline, set_seed
'''
Conversation/ Chatbot
'''
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")

# Let's chat for 5 lines
for step in range(5):
   # encode the new user input, add the eos_token and return a tensor in Pytorch
   new_user_input_ids = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='pt')

   # append the new user input tokens to the chat history
   bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids

   # generated a response while limiting the total chat history to 1000 tokens,
   chat_history_ids = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id)

   # pretty print last output tokens from bot
   print("DialoGPT: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))

In [None]:
from transformers import pipeline, set_seed
'''
Named Entity Recognition
'''
nlp_token_class = pipeline('ner')
nlp_token_class('Ronaldo was born in 1985, he plays for Juventus and Portugal. ')

In [None]:
from transformers import pipeline, set_seed
'''
Zero-shot Learning
'''
classifier_zsl = pipeline("zero-shot-classification")

sequence_to_classify = "Bill gates founded a company called Microsoft in the year 1975"
candidate_labels = ["Europe", "Sports",'Leadership','business', "politics","startup"]
classifier_zsl(sequence_to_classify, candidate_labels)

In [None]:
from transformers import pipeline, set_seed
'''
Features Extraction
'''
import numpy as np
nlp_features = pipeline('feature-extraction')
output = nlp_features(output = nlp_features("Deep learning is a branch of Machine learning"))
np.array(output).shape # (Samples, Tokens, Vector Size)

In [None]:
# Code
# https://www.kaggle.com/code/nageshsingh/huggingface-transformer-basic-usage/notebook

**Resources**

---
**Blog** <br>
1) http://peterbloem.nl/blog/transformers <br>
2) https://www.programmerall.com/article/51852224642/ <br>
3) https://towardsdatascience.com/illustrated-self-attention-2d627e33b20a <br>
4) https://jalammar.github.io/illustrated-transformer/ <br>
5) https://n8henrie.com/2021/08/writing-a-transformer-classifier-in-pytorch/ <br>
6) https://pytorch.org/tutorials/beginner/transformer_tutorial.html <br>
7) https://github.com/ShivamRajSharma/Transformer-Architectures-From-Scratch/blob/master/TRANSFORMERS.py <br>
8) https://www.kdnuggets.com/2021/02/hugging-face-transformer-basics.html <br>
9) https://towardsdatascience.com/a-deep-dive-into-the-transformer-architecture-the-development-of-transformer-models-acbdf7ca34e0 <br>
10) https://www.analyticsvidhya.com/blog/2019/06/understanding-transformers-nlp-state-of-the-art-models/ <br>
11) https://www.analyticsvidhya.com/blog/2021/09/a-deep-dive-into-transformers-library/ <br>
12) https://thevatsalsaglani.medium.com/question-classification-using-self-attention-transformer-part-2-910b89c7116a <br>
13) https://github.com/huggingface/transformers <br>
14) https://github.com/ThilinaRajapakse/pytorch-transformers-classification <br>
15) https://neptune.ai/blog/how-to-code-bert-using-pytorch-tutorial <br>
16) https://colab.research.google.com/github/gmihaila/ml_things/blob/master/notebooks/pytorch/gpt2_finetune_classification.ipynb#scrollTo=7Sifp6ocoSng <br>
17) https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb#scrollTo=_NCzbHCHNAox <br>
18) https://jamesmccaffrey.wordpress.com/2021/03/02/pytorch-transformer-model-for-classification-input-output/ <br>
19) https://hyugen-ai.medium.com/transformers-in-pytorch-from-scratch-for-nlp-beginners-ff3b3d922ef7 <br>
20) https://medium.com/@gauravghati/comparison-between-bert-gpt-2-and-elmo-9ad140cd1cda <br>
21) https://allenai.org/allennlp/software/elmo <br>

**Paper** <br>
[Attention Is All You Need] <br>
1) https://arxiv.org/pdf/1706.03762.pdf <br>
[Open AI] <br>
2) https://openai.com/api/ <br>
[BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding] <br>
3) https://arxiv.org/pdf/1810.04805.pdf <br>

**Video** <br>
1) https://www.youtube.com/watch?v=U0s0f995w14 <br>
2) https://www.youtube.com/c/MakeesyAI/videos <br>
3) https://www.youtube.com/watch?v=jVPd7lEvjtg <br>
4) https://www.youtube.com/watch?v=Ck9-0YkJD_Q 