## Downgrade pytorch (please restart runtime after run this cell)


In [11]:
!pip install torch==1.9.0
!pip install torchtext==0.10.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torch==1.9.0
  Downloading torch-1.9.0-cp37-cp37m-manylinux1_x86_64.whl (831.4 MB)
[K     |████████████████████████████████| 831.4 MB 2.6 kB/s 
Installing collected packages: torch
  Attempting uninstall: torch
    Found existing installation: torch 1.12.0+cu113
    Uninstalling torch-1.12.0+cu113:
      Successfully uninstalled torch-1.12.0+cu113
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.13.0+cu113 requires torch==1.12.0, but you have torch 1.9.0 which is incompatible.
torchtext 0.13.0 requires torch==1.12.0, but you have torch 1.9.0 which is incompatible.
torchaudio 0.12.0+cu113 requires torch==1.12.0, but you have torch 1.9.0 which is incompatible.[0m
Successfully installed torch-1.9.0


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchtext==0.10.0
  Downloading torchtext-0.10.0-cp37-cp37m-manylinux1_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 24.8 MB/s 
Installing collected packages: torchtext
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.13.0
    Uninstalling torchtext-0.13.0:
      Successfully uninstalled torchtext-0.13.0
Successfully installed torchtext-0.10.0


## Model Definition

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math 

class ScaledDotProductAttention(nn.Module):
    def __init__(self):
        super(ScaledDotProductAttention,self).__init__()
        self.softargmax = F.softmax
    def forward(self,Q,K,V):
        batch_size, head_size, sequnce_length, d_k = K.size()
        K_T = K.transpose(2,3)
        A = self.softargmax((Q@K_T)/math.sqrt(d_k), dim = -1)
        H = A@V
        return H

class MultiHeadAttention(torch.nn.Module):
    def __init__(self,input_dim, head_size,d_model):
        super(MultiHeadAttention, self).__init__()
        d_q= d_k= d_v = d_model
        self.head_size = head_size
        self.scaled_dot_product = ScaledDotProductAttention()

        self.W_q = nn.Linear(input_dim,head_size*d_q)
        self.W_k = nn.Linear(input_dim,head_size*d_k)
        self.W_v = nn.Linear(input_dim,head_size*d_v)

        self.W_h = nn.Linear(head_size*d_model,d_model)

    def split(self,X):
        batch_size, sequence_length, num_head_times_d_model = X.size()
        d_model = num_head_times_d_model//self.head_size
        X = X.view(batch_size, sequence_length, self.head_size,d_model).transpose(1,2)
        return X

    def concat(self,X):
        batch_size, head_size, sequence_length, d_model = X.size()
        assert(head_size == self.head_size)
        X = X.transpose(1,2).contiguous().view(batch_size, sequence_length,head_size*d_model)
        return X

    def forward(self,X_query,X_key,X_value):
        Q,K,V = self.W_q(X_query), self.W_k(X_key), self.W_v(X_value)
        Q,K,V = self.split(Q), self.split(K), self.split(V)
        H = self.scaled_dot_product(Q,K,V)
        H = self.concat(H)
        out = self.W_h(H)
        return out

In [2]:
import torch
import torch.nn as nn

class FeedForward(nn.Module):
    def __init__(self, d_model, d_hidden, dropout_prob):
        super(FeedForward, self).__init__()
        self.layer1 = nn.Linear(d_model, d_hidden)
        self.layer2 = nn.Linear(d_hidden, d_model)
        self.dropout = nn.Dropout(p=dropout_prob)
        self.relu = nn.ReLU()
      
    def forward(self, x):
        x = self.layer1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.layer2(x)
        return x

In [3]:
import torch
import torch.nn as nn

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len, dropout_prob):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout_prob)
        position = torch.arange(0, max_len)
        position = position.float().unsqueeze(dim=1)
        
        pe = torch.zeros(max_len, d_model)
        div_term = torch.exp(torch.arange(0, d_model, step=2) * (-math.log(10000.0) / d_model))
        
        pe[:, 0::2] = torch.sin(position*div_term)
        pe[:, 1::2] = torch.cos(position*div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        batch_size, seq_len, d_model = x.size()
        return self.dropout(self.pe[:seq_len, :].unsqueeze(0)+x)

In [4]:
import torch
class EncoderLayer(torch.nn.Module):
    def __init__(self,d_model,head_size,mlp_hidden_dim,dropout_prob = 0.1):
        super().__init__()
        input_dim = d_model
        self.attention = MultiHeadAttention(input_dim, head_size, d_model)
        self.layer_norm1 = nn.LayerNorm(normalized_shape=d_model, eps=1e-6)
        self.layer_norm2 = nn.LayerNorm(normalized_shape=d_model, eps=1e-6)
        self.mlp = FeedForward(d_model,mlp_hidden_dim, dropout_prob)

    def forward(self, x):
        # 1. compute self attention
        _x = x
        x = self.attention(x,x,x)
        # 2. add and norm
        x = self.layer_norm1(x + _x)
        
        # 3. positionwise feed forward network
        _x = x
        x = self.mlp(x)
      
        # 4. add and norm
        x = self.layer_norm2(x + _x)
        return x

In [5]:
class Embeddings(nn.Module):
    def __init__(self, d_model, vocab_size, max_position_embeddings, p):
        super().__init__()
        self.word_embeddings = nn.Embedding(vocab_size, d_model, padding_idx=1)
        self.positional_encoding = PositionalEncoding( d_model,max_position_embeddings,p)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-12)

    def forward(self, input_ids):
        seq_length = input_ids.size(1)
        
        # Get word embeddings for each input id
        word_embeddings = self.word_embeddings(input_ids)                   # (bs, max_seq_length, dim)
        
        
        embeddings = self.positional_encoding(word_embeddings)
        # Layer norm 
        embeddings = self.layer_norm(embeddings)             # (bs, max_seq_length, dim)
        return embeddings

In [6]:
class Encoder(nn.Module):
    def __init__(self, num_layers, d_model, head_size, mlp_hidden_dim, input_vocab_size,
               maximum_position_encoding, p=0.1):
        super().__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = Embeddings(d_model, input_vocab_size,maximum_position_encoding, p)

        self.enc_layers = nn.ModuleList()
        for _ in range(num_layers):
            self.enc_layers.append(EncoderLayer(d_model, head_size, mlp_hidden_dim, p))
        
    def forward(self, x):
        x = self.embedding(x) # Transform to (batch_size, input_seq_length, d_model)

        for i in range(self.num_layers):
            x = self.enc_layers[i](x)

        return x  # (batch_size, input_seq_len, d_model)

## Dataset Processing

In [7]:
from torchtext import legacy
from torchtext.legacy import data
import torchtext.datasets as datasets

In [8]:
max_len = 200
text = legacy.data.Field(sequential=True, fix_length=max_len, batch_first=True, lower=True, dtype=torch.long)
label = legacy.data.LabelField(sequential=False, dtype=torch.long)
ds_train, ds_test = legacy.datasets.IMDB.splits(text, label, root='./')
print('train : ', len(ds_train))
print('test : ', len(ds_test))
print('train.fields :', ds_train.fields)

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:08<00:00, 9.95MB/s]


train :  25000
test :  25000
train.fields : {'text': <torchtext.legacy.data.field.Field object at 0x7f685ef928d0>, 'label': <torchtext.legacy.data.field.LabelField object at 0x7f685ef92950>}


In [9]:
ds_train, ds_valid = ds_train.split(0.9)
print('train : ', len(ds_train))
print('valid : ', len(ds_valid))
print('test : ', len(ds_test))

train :  22500
valid :  2500
test :  25000


In [10]:
num_words = 50_000
text.build_vocab(ds_train, max_size=num_words)
label.build_vocab(ds_train)
vocab = text.vocab

In [11]:

batch_size = 164
train_loader, valid_loader, test_loader = data.BucketIterator.splits(
    (ds_train, ds_valid, ds_test), batch_size=batch_size, sort_key=lambda x: len(x.text), repeat=False)

In [12]:
class TransformerClassifier(nn.Module):
    def __init__(self, num_layers, d_model, head_size, conv_hidden_dim, input_vocab_size, num_answers):
        super().__init__()
        
        self.encoder = Encoder(num_layers, d_model, head_size, conv_hidden_dim, input_vocab_size,
                         maximum_position_encoding=10000)
        self.dense = nn.Linear(d_model, num_answers)

    def forward(self, x):
        x = self.encoder(x)
        
        x, _ = torch.max(x, dim=1)
        x = self.dense(x)
        return x

In [13]:
model = TransformerClassifier(num_layers=1, d_model=32, head_size=2, 
                         conv_hidden_dim=128, input_vocab_size=50002, num_answers=2)
model.cuda()

TransformerClassifier(
  (encoder): Encoder(
    (embedding): Embeddings(
      (word_embeddings): Embedding(50002, 32, padding_idx=1)
      (positional_encoding): PositionalEncoding(
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (layer_norm): LayerNorm((32,), eps=1e-12, elementwise_affine=True)
    )
    (enc_layers): ModuleList(
      (0): EncoderLayer(
        (attention): MultiHeadAttention(
          (scaled_dot_product): ScaledDotProductAttention()
          (W_q): Linear(in_features=32, out_features=64, bias=True)
          (W_k): Linear(in_features=32, out_features=64, bias=True)
          (W_v): Linear(in_features=32, out_features=64, bias=True)
          (W_h): Linear(in_features=64, out_features=32, bias=True)
        )
        (layer_norm1): LayerNorm((32,), eps=1e-06, elementwise_affine=True)
        (layer_norm2): LayerNorm((32,), eps=1e-06, elementwise_affine=True)
        (mlp): FeedForward(
          (layer1): Linear(in_features=32, out_features=128, b

## Training Loop

In [14]:
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)
epochs = 10
t_total = len(train_loader) * epochs

In [15]:
def train(train_loader, valid_loader):
    
    for epoch in range(epochs):
        train_iterator, valid_iterator = iter(train_loader), iter(valid_loader)
        nb_batches_train = len(train_loader)
        train_acc = 0
        model.train()
        losses = 0.0

        for batch in train_iterator:
            x = batch.text.cuda()
            y = batch.label.cuda()
            
            out = model(x)  # ①

            loss = F.cross_entropy(out, y)  # ②
            
            model.zero_grad()  # ③

            loss.backward()  # ④
            losses += loss.item()

            optimizer.step()  # ⑤
                        
            train_acc += (out.argmax(1) == y).cpu().numpy().mean()
        
        print(f"Training loss at epoch {epoch} is {losses / nb_batches_train}")
        print(f"Training accuracy: {train_acc / nb_batches_train}")
        print('Evaluating on validation:')
        evaluate(valid_loader)



In [16]:
def evaluate(data_loader):
    data_iterator = iter(data_loader)
    nb_batches = len(data_loader)
    model.eval()
    acc = 0 
    for batch in data_iterator:
        x = batch.text.cuda()
        y = batch.label.cuda()
                
        out = model(x)
        acc += (out.argmax(1) == y).cpu().numpy().mean()

    print(f"Eval accuracy: {acc / nb_batches}")

In [17]:
train(train_loader, valid_loader)

Training loss at epoch 0 is 0.737967532614003
Training accuracy: 0.535894971721456
Evaluating on validation:
Eval accuracy: 0.5721417682926829
Training loss at epoch 1 is 0.6543457499448804
Training accuracy: 0.6233927624602336
Evaluating on validation:
Eval accuracy: 0.6350990853658536
Training loss at epoch 2 is 0.5996039017387058
Training accuracy: 0.6872459349593496
Evaluating on validation:
Eval accuracy: 0.7
Training loss at epoch 3 is 0.5204477003519086
Training accuracy: 0.7426431601272536
Evaluating on validation:
Eval accuracy: 0.7453887195121952
Training loss at epoch 4 is 0.4486882384272589
Training accuracy: 0.79110330505479
Evaluating on validation:
Eval accuracy: 0.791158536585366
Training loss at epoch 5 is 0.3915546417668246
Training accuracy: 0.8251977288794629
Evaluating on validation:
Eval accuracy: 0.8052591463414634
Training loss at epoch 6 is 0.3570093303055003
Training accuracy: 0.8435677359490986
Evaluating on validation:
Eval accuracy: 0.8117378048780487
Train

In [18]:
evaluate(test_loader)

Eval accuracy: 0.8081216678179853
