## Downgrade pytorch (please restart runtime after run this cell)


In [4]:
!pip install torch==1.9.0
!pip install torchtext==0.10.0
!pip install einops

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting einops
  Downloading einops-0.6.0-py3-none-any.whl (41 kB)
[K     |████████████████████████████████| 41 kB 499 kB/s 
[?25hInstalling collected packages: einops
Successfully installed einops-0.6.0


## Model Definition

In [1]:
import math
import torch
import torch.nn as nn
from einops import rearrange

class MultiHeadAttention(nn.Module):
  def __init__(self, d_model, num_heads, d_k):
    super().__init__()
    self.d_k = d_k
    self.num_heads = num_heads
    self.d_model = d_model
    self.softargmax = nn.Softmax(-1)
    self.split_head = nn.Linear(d_model, self.d_k*num_heads)
    self.W_o = nn.Linear(self.num_heads*d_k,d_model)

  def split(self, x):
    x = self.split_head(x)
    x = rearrange(x, "seq_length batch_size (heads d_k) -> seq_length batch_size heads d_k", heads = self.num_heads)
    return x  
  
  def forward(self, query, key, value):
    query, key, value = self.split(query), self.split(key), self.split(value)
    score = torch.einsum(
        "q b h d, k b h d -> q k b h",query,key) 
    score /= math.sqrt(self.d_k)
    attn = self.softargmax(score)
    out = torch.einsum("q k b h, k b h d-> q b h d", attn, value)
    out = rearrange(out, "seq_length batch_size heads d_k -> seq_length batch_size (heads d_k)", heads = self.num_heads)
    out = self.W_o(out)
    return out

In [2]:
import torch
import torch.nn as nn

class FeedForward(nn.Module):
    def __init__(self, d_model, d_hidden, dropout_prob):
        super(FeedForward, self).__init__()
        self.layer1 = nn.Linear(d_model, d_hidden)
        self.layer2 = nn.Linear(d_hidden, d_model)
        self.dropout = nn.Dropout(p=dropout_prob)
        self.relu = nn.ReLU()
      
    def forward(self, x):
        x = self.layer1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.layer2(x)
        return x

In [3]:
import torch
import torch.nn as nn

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len, dropout_prob):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout_prob)
        position = torch.arange(0, max_len)
        position = position.float().unsqueeze(dim=1)
        
        pe = torch.zeros(max_len, d_model)
        div_term = torch.exp(torch.arange(0, d_model, step=2) * (-math.log(10000.0) / d_model))
        
        pe[:, 0::2] = torch.sin(position*div_term)
        pe[:, 1::2] = torch.cos(position*div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        batch_size, seq_len, d_model = x.size()
        return self.dropout(self.pe[:seq_len, :].unsqueeze(0)+x)

In [4]:
import torch
class EncoderLayer(torch.nn.Module):
    def __init__(self,d_model,head_size,mlp_hidden_dim,dropout_prob = 0.1):
        super().__init__()
       
        self.attention = MultiHeadAttention(d_model, head_size,d_model)
        self.layer_norm1 = nn.LayerNorm(normalized_shape=d_model, eps=1e-6)
        self.layer_norm2 = nn.LayerNorm(normalized_shape=d_model, eps=1e-6)
        self.mlp = FeedForward(d_model,mlp_hidden_dim, dropout_prob)

    def forward(self, x):
        # 1. compute self attention
        _x = x
        x = self.attention(x,x,x)
        # 2. add and norm
        x = self.layer_norm1(x + _x)

        
        # 3. positionwise feed forward network
        _x = x
        x = self.mlp(x)
      
        # 4. add and norm
        x = self.layer_norm2(x + _x)
        return x

In [5]:
class Embeddings(nn.Module):
    def __init__(self, d_model, vocab_size, max_position_embeddings, p):
        super().__init__()
        self.word_embeddings = nn.Embedding(vocab_size, d_model, padding_idx=1)
        self.positional_encoding = PositionalEncoding( d_model,max_position_embeddings,p)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-12)

    def forward(self, input_ids):
        seq_length = input_ids.size(1)
        
        # Get word embeddings for each input id
        word_embeddings = self.word_embeddings(input_ids)                   # (bs, max_seq_length, dim)
        
        
        embeddings = self.positional_encoding(word_embeddings)
        # Layer norm 
        embeddings = self.layer_norm(embeddings)             # (bs, max_seq_length, dim)
        return embeddings

In [6]:
class Encoder(nn.Module):
    def __init__(self, num_layers, d_model, head_size, mlp_hidden_dim, input_vocab_size,
               maximum_position_encoding, p=0.1):
        super().__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = Embeddings(d_model, input_vocab_size,maximum_position_encoding, p)

        self.enc_layers = nn.ModuleList()
        for _ in range(num_layers):
            self.enc_layers.append(EncoderLayer(d_model, head_size, mlp_hidden_dim, p))
        
    def forward(self, x):
        x = self.embedding(x) # Transform to (batch_size, input_seq_length, d_model)

        for i in range(self.num_layers):
            x = self.enc_layers[i](x)

        return x  # (batch_size, input_seq_len, d_model)

## Dataset Processing

In [7]:
from torchtext import legacy
from torchtext.legacy import data
import torchtext.datasets as datasets

In [8]:
max_len = 200
text = legacy.data.Field(sequential=True, fix_length=max_len, batch_first=True, lower=True, dtype=torch.long)
label = legacy.data.LabelField(sequential=False, dtype=torch.long)
ds_train, ds_test = legacy.datasets.IMDB.splits(text, label, root='./')
print('train : ', len(ds_train))
print('test : ', len(ds_test))
print('train.fields :', ds_train.fields)

train :  25000
test :  25000
train.fields : {'text': <torchtext.legacy.data.field.Field object at 0x7f009e8fc250>, 'label': <torchtext.legacy.data.field.LabelField object at 0x7f009e8fc2b0>}


In [9]:
ds_train, ds_valid = ds_train.split(0.9)
print('train : ', len(ds_train))
print('valid : ', len(ds_valid))
print('test : ', len(ds_test))

train :  22500
valid :  2500
test :  25000


In [10]:
num_words = 50_000
text.build_vocab(ds_train, max_size=num_words)
label.build_vocab(ds_train)
vocab = text.vocab

In [11]:
batch_size = 164
train_loader, valid_loader, test_loader = data.BucketIterator.splits(
    (ds_train, ds_valid, ds_test), batch_size=batch_size, sort_key=lambda x: len(x.text), repeat=False)

In [12]:
class TransformerClassifier(nn.Module):
    def __init__(self, num_layers, d_model, head_size, conv_hidden_dim, input_vocab_size, num_answers):
        super().__init__()
        
        self.encoder = Encoder(num_layers, d_model, head_size, conv_hidden_dim, input_vocab_size,
                         maximum_position_encoding=10000)
        self.dense = nn.Linear(d_model, num_answers)

    def forward(self, x):
        x = self.encoder(x)
        
        x, _ = torch.max(x, dim=1)
        x = self.dense(x)
        return x

In [13]:
model = TransformerClassifier(num_layers=1, d_model=16, head_size=2, 
                         conv_hidden_dim=128, input_vocab_size=50002, num_answers=2)
model.cuda()

TransformerClassifier(
  (encoder): Encoder(
    (embedding): Embeddings(
      (word_embeddings): Embedding(50002, 16, padding_idx=1)
      (positional_encoding): PositionalEncoding(
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (layer_norm): LayerNorm((16,), eps=1e-12, elementwise_affine=True)
    )
    (enc_layers): ModuleList(
      (0): EncoderLayer(
        (attention): MultiHeadAttention(
          (softargmax): Softmax(dim=-1)
          (split_head): Linear(in_features=16, out_features=32, bias=True)
          (W_o): Linear(in_features=32, out_features=16, bias=True)
        )
        (layer_norm1): LayerNorm((16,), eps=1e-06, elementwise_affine=True)
        (layer_norm2): LayerNorm((16,), eps=1e-06, elementwise_affine=True)
        (mlp): FeedForward(
          (layer1): Linear(in_features=16, out_features=128, bias=True)
          (layer2): Linear(in_features=128, out_features=16, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (relu)

## Training Loop

In [14]:
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)
epochs = 50
t_total = len(train_loader) * epochs

In [15]:
import torch.nn.functional as F
def train(train_loader, valid_loader):
    
    for epoch in range(epochs):
        train_iterator, valid_iterator = iter(train_loader), iter(valid_loader)
        nb_batches_train = len(train_loader)
        train_acc = 0
        model.train()
        losses = 0.0

        for batch in train_iterator:
            x = batch.text.cuda()
            y = batch.label.cuda()
            
            out = model(x)  # ①

            loss = F.cross_entropy(out, y)  # ②
            
            model.zero_grad()  # ③

            loss.backward()  # ④
            losses += loss.item()

            optimizer.step()  # ⑤
                        
            train_acc += (out.argmax(1) == y).cpu().numpy().mean()
        
        print(f"Training loss at epoch {epoch} is {losses / nb_batches_train}")
        print(f"Training accuracy: {train_acc / nb_batches_train}")
        print('Evaluating on validation:')
        evaluate(valid_loader)

In [16]:
def evaluate(data_loader):
    data_iterator = iter(data_loader)
    nb_batches = len(data_loader)
    model.eval()
    acc = 0 
    for batch in data_iterator:
        x = batch.text.cuda()
        y = batch.label.cuda()
                
        out = model(x)
        acc += (out.argmax(1) == y).cpu().numpy().mean()

    print(f"Eval accuracy: {acc / nb_batches}")

In [17]:
train(train_loader, valid_loader)

Training loss at epoch 0 is 0.6962967545226
Training accuracy: 0.5029548868858253
Evaluating on validation:
Eval accuracy: 0.5047637195121952
Training loss at epoch 1 is 0.6949725513872893
Training accuracy: 0.5049376988335101
Evaluating on validation:
Eval accuracy: 0.504763719512195
Training loss at epoch 2 is 0.6935595045055168
Training accuracy: 0.507815261576529
Evaluating on validation:
Eval accuracy: 0.49641768292682936
Training loss at epoch 3 is 0.6933628450269285
Training accuracy: 0.5137581742665253
Evaluating on validation:
Eval accuracy: 0.5349466463414634
Training loss at epoch 4 is 0.6914480566114619
Training accuracy: 0.5229873630258042
Evaluating on validation:
Eval accuracy: 0.5259527439024391
Training loss at epoch 5 is 0.6860236266384954
Training accuracy: 0.5461293743372214
Evaluating on validation:
Eval accuracy: 0.5575838414634147
Training loss at epoch 6 is 0.6756551520548005
Training accuracy: 0.5763741604807353
Evaluating on validation:
Eval accuracy: 0.596112