# Network Design Alternatives to RNNs

## Loading Libraries

In [9]:
#Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd

# Data Visualization
import seaborn as sns
import matplotlib
import matplotlib_inline
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow
import matplotlib.patches as patches

# Dataset's Iteration Performance
from tqdm import tqdm

# Time
import time

# OS
import re
import sys
import json
import math
import string
import unicodedata
from glob import glob
from io import BytesIO
from imageio import imread
from zipfile import ZipFile
import requests, zipfile, io
from collections import Counter 
from urllib.request import urlopen


# SciPy
from scipy.signal import convolve

# PyTorch
import torch
import torchvision
import torch.nn as nn
from torch.utils.data import *
from torchvision.ops import nms
import torch.nn.functional as F
from torchtext.vocab import Vocab 
from torchvision import transforms
from torchtext.datasets import AG_NEWS
from torchtext.data.utils import get_tokenizer
from torch.utils.data import Dataset, DataLoader
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator


# IDLMAM Libraries
from idlmam import moveTo, run_epoch, set_seed, View, pad_and_pack
from idlmam import train_simple_network, set_seed, Flatten, weight_reset, train_network
from idlmam import LanguageNameDataset, pad_and_pack, EmbeddingPackable, LastTimeStep, LambdaLayer
from idlmam import AttentionAvg, GeneralScore, DotScore, AdditiveAttentionScore, ApplyAttention, getMaskByFill


# Scikit-Learn
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

#  IPython Display
from IPython.display import Latex
from IPython.display import display_pdf
from IPython.display import set_matplotlib_formats

### Visualization Set-Up

In [None]:
# Render matplotlib figures inline in the notebook output
%matplotlib inline

# Ask the inline backend to emit figures in both PNG and PDF formats
matplotlib_inline.backend_inline.set_matplotlib_formats('png', 'pdf')

### Setting Seeds & Device

In [None]:
# Request deterministic cuDNN kernels and seed through idlmam's set_seed helper
torch.backends.cudnn.deterministic = True
set_seed(42)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## TorchText: Tools for Text

### Installing TorchText

In [None]:
# !pip install  torchtext 
# !pip install  sentencepiece 

### Loading Dataset in TorchText

In [None]:
# Open the AG_NEWS train and test splits, rooted at ./data
train_iter, test_iter = AG_NEWS(split=('train', 'test'), root='./data')

# Materialise both iterators as lists so they can be indexed and re-iterated
train_dataset, test_dataset = list(train_iter), list(test_iter)

In [None]:
# Tokenizer used for both vocabulary building and text encoding
tokenizer = get_tokenizer('basic_english')

# Count every token that occurs in the training texts
counter = Counter(
    token
    for _, text in train_dataset
    for token in tokenizer(text)
)

# Keep tokens seen at least 10 times, plus the four special symbols
vocab = Vocab(counter, specials=('<unk>', '<BOS>', '<EOS>', '<PAD>'), min_freq=10)



In [None]:
def text_transform(x):
    """Encode a raw text as vocabulary ids wrapped in <BOS> ... <EOS>.

    Args:
        x: raw text string; it is split with the notebook-level ``tokenizer``.

    Returns:
        List of ids looked up in the notebook-level ``vocab``: the ``<BOS>``
        id, one id per token, then the ``<EOS>`` id.
    """
    ids = [vocab['<BOS>']]
    ids.extend(vocab[token] for token in tokenizer(x))
    ids.append(vocab['<EOS>'])
    return ids

def label_transform(x):
    """Return the label shifted down by one (e.g. 1-based -> 0-based)."""
    shifted = x - 1
    return shifted

# Sanity check: show the ids text_transform produces for the first training text
print(text_transform(train_dataset[0][1])) 

In [None]:
# Vocabulary size and number of distinct labels found in the training set
VOCAB_SIZE = len(vocab)
NUM_CLASS = len(np.unique([sample[0] for sample in train_dataset]))

# Id used to pad shorter texts inside a batch
padding_idx = vocab["<PAD>"]

# Embedding Dimension
embed_dim = 128

# Batch Size
B = 64

# Epochs
epochs = 15

print("Vocab: ", VOCAB_SIZE)
print("Num Classes: ", NUM_CLASS)

In [None]:
def pad_batch(batch):
    """Collate raw samples into a padded id tensor and a label tensor.

    Each sample's text (item 1) is encoded with ``text_transform`` and
    right-padded with ``padding_idx`` to the longest text of the batch; each
    label (item 0) goes through ``label_transform``.

    Args:
        batch: sequence of samples indexable as ``sample[0]`` (label) and
            ``sample[1]`` (text).

    Returns:
        Tuple ``(x, y)`` of int64 tensors: ``x`` stacks the padded id
        sequences (one row per sample), ``y`` holds the transformed labels.
    """
    targets = []
    sequences = []
    for sample in batch:
        targets.append(label_transform(sample[0]))
        sequences.append(torch.tensor(text_transform(sample[1]), dtype=torch.int64))

    # Pad on the right only, up to the longest sequence in this batch
    longest = max(seq.size(0) for seq in sequences)
    padded = [
        F.pad(seq, (0, longest - seq.size(0)), value=padding_idx)
        for seq in sequences
    ]

    return torch.stack(padded), torch.tensor(targets, dtype=torch.int64)

In [None]:
# Data loaders: both use pad_batch to build padded batches of size B;
# only the training loader shuffles.
train_loader = DataLoader(train_dataset, collate_fn=pad_batch, batch_size=B, shuffle=True)
test_loader = DataLoader(test_dataset, collate_fn=pad_batch, batch_size=B)

## Defining a Baseline Model

In [None]:
# Baseline: embedding -> 3-layer bidirectional GRU -> last time step -> linear classifier
gru = nn.Sequential(
    nn.Embedding(num_embeddings=VOCAB_SIZE, embedding_dim=embed_dim, padding_idx=padding_idx),
    nn.GRU(input_size=embed_dim, hidden_size=embed_dim, num_layers=3,
           batch_first=True, bidirectional=True),
    LastTimeStep(rnn_layers=3, bidirectional=True),
    # Bidirectional output concatenates both directions, hence embed_dim*2 inputs
    nn.Linear(in_features=embed_dim*2, out_features=NUM_CLASS),
)

# Loss function shared by every model in this notebook
loss_func = nn.CrossEntropyLoss()

In [None]:
# Train the GRU baseline, using test_loader as validation data and tracking accuracy
gru_results = train_network(
    gru,
    loss_func,
    train_loader,
    epochs=epochs,
    device=device,
    val_loader=test_loader,
    score_funcs={'Accuracy': accuracy_score},
)

In [None]:
# Validation accuracy per epoch for the GRU baseline
sns.lineplot(data=gru_results, x='epoch', y='val Accuracy', label='GRU')

plt.grid(True)
plt.show()

In [None]:
# Order-agnostic model: per-token layers, then an average over the time axis
simpleEmbdAvg = nn.Sequential(
    # (B, T) -> (B, T, D)
    nn.Embedding(num_embeddings=VOCAB_SIZE, embedding_dim=embed_dim, padding_idx=padding_idx),
    # Three Linear + LeakyReLU pairs applied to every token independently
    *[
        layer
        for _ in range(3)
        for layer in (nn.Linear(embed_dim, embed_dim), nn.LeakyReLU())
    ],
    # (B, T, D) -> (B, 1, D)
    nn.AdaptiveAvgPool2d(output_size=(1, embed_dim)),
    # (B, 1, D) -> (B, D)
    nn.Flatten(),
    nn.Linear(in_features=embed_dim, out_features=embed_dim),
    nn.LeakyReLU(),
    nn.BatchNorm1d(num_features=embed_dim),
    nn.Linear(in_features=embed_dim, out_features=NUM_CLASS)
)

In [None]:
# Train the averaged-embedding model with the same settings as the baseline
simpleEmbdAvg_results = train_network(
    simpleEmbdAvg,
    loss_func,
    train_loader,
    epochs=epochs,
    device=device,
    val_loader=test_loader,
    score_funcs={'Accuracy': accuracy_score},
)

In [None]:
# Validation accuracy per epoch: GRU baseline vs. averaged embeddings
for curve_data, curve_label in (
    (gru_results, 'GRU'),
    (simpleEmbdAvg_results, 'Average Embedding'),
):
    sns.lineplot(data=curve_data, x='epoch', y='val Accuracy', label=curve_label)

plt.grid(True)
plt.show()

In [None]:
# Validation accuracy against total training time: GRU baseline vs. averaged embeddings
for curve_data, curve_label in (
    (gru_results, 'GRU'),
    (simpleEmbdAvg_results, 'Average Embedding'),
):
    sns.lineplot(data=curve_data, x='total time', y='val Accuracy', label=curve_label)

plt.grid(True)
plt.show()

### Weighted Average over Time with Attention

In [None]:
class EmbeddingAttentionBag(nn.Module):
    """Embed a padded batch of token ids and pool it over time with attention.

    Tokens are embedded, passed through per-token layers, and combined by
    ``AttentionAvg`` using the average of the non-padding token features as
    the attention context.

    Args:
        vocab_size: number of rows in the embedding table.
        D: embedding size; also the width of the default per-token layers.
        embd_layers: either an int (number of Linear + LeakyReLU pairs to
            build) or an already constructed module applied to the embedded
            tokens.
        padding_idx: id of the padding token, or ``None`` when the input
            contains no padding.
    """

    def __init__(self, vocab_size, D, embd_layers=3, padding_idx=None):
        super(EmbeddingAttentionBag, self).__init__()
        self.padding_idx = padding_idx
        self.embd = nn.Embedding(vocab_size, D, padding_idx=padding_idx)
        if isinstance(embd_layers, int):
            # Size the hidden layers from D (the constructor argument), not
            # from a notebook-level global, so any embedding size works.
            self.embd_layers = nn.Sequential(
                *[nn.Sequential(nn.Linear(D, D), nn.LeakyReLU())
                  for _ in range(embd_layers)]
            )
        else:
            self.embd_layers = embd_layers
        self.attn = AttentionAvg(AdditiveAttentionScore(D))

    def forward(self, input):
        """Pool a (B, T) batch of token ids into one feature vector per item.

        Args:
            input: integer tensor of token ids, padded with ``padding_idx``.

        Returns:
            Whatever ``self.attn`` returns for the token features, the
            average-based context and the padding mask.
        """
        # True where the position holds a real token, False where it is padding
        if self.padding_idx is not None:
            mask = input != self.padding_idx
        else:
            mask = input == input

        x = self.embd(input)
        x = self.embd_layers(x)

        # Average over real tokens only: padded positions are non-zero after
        # the per-token layers (bias terms), so they must be zeroed before the
        # sum to match the denominator, which counts only real tokens.
        valid = mask.unsqueeze(-1).to(x.dtype)
        context = (x * valid).sum(dim=1) / (mask.sum(dim=1).unsqueeze(1) + 1e-5)
        return self.attn(x, context, mask=mask)

In [None]:
# Attention-pooled embeddings followed by a small classification head
attnEmbd = nn.Sequential(
    EmbeddingAttentionBag(VOCAB_SIZE, embed_dim, padding_idx=padding_idx),
    nn.Linear(in_features=embed_dim, out_features=embed_dim),
    nn.LeakyReLU(),
    nn.BatchNorm1d(num_features=embed_dim),
    nn.Linear(in_features=embed_dim, out_features=NUM_CLASS)
)

In [None]:
# Train the attention-pooled embedding model
attnEmbd_results = train_network(
    attnEmbd,
    loss_func,
    train_loader,
    epochs=epochs,
    device=device,
    val_loader=test_loader,
    score_funcs={'Accuracy': accuracy_score},
)

In [None]:
# Validation accuracy against total training time for the models trained so far
for curve_data, curve_label in (
    (gru_results, 'GRU'),
    (simpleEmbdAvg_results, 'Average Embedding'),
    (attnEmbd_results, 'Attention Embedding'),
):
    sns.lineplot(x='total time', y='val Accuracy', data=curve_data, label=curve_label)

# Same finishing calls as the other comparison plots; plt.show() also keeps the
# cell from echoing the Axes repr of the last lineplot call.
plt.grid(True)
plt.show()

## Pooling over Time & 1D CNNs

In [None]:
def cnnLayer(in_size, out_size, kernel_size=None):
    """Build one Conv1d -> LeakyReLU -> BatchNorm1d block.

    Args:
        in_size: number of input channels.
        out_size: number of output channels.
        kernel_size: width of the convolution kernel. When omitted, the
            notebook-level ``k_size`` is used, which is the original behaviour.

    Returns:
        ``nn.Sequential`` holding the three layers. The convolution pads by
        ``kernel_size // 2`` on both sides, which keeps the sequence length
        unchanged for odd kernel sizes.
    """
    if kernel_size is None:
        # Fall back to the global so existing two-argument calls still work
        kernel_size = k_size
    return nn.Sequential(
        nn.Conv1d(in_size, out_size, kernel_size=kernel_size, padding=kernel_size//2),
        nn.LeakyReLU(),
        nn.BatchNorm1d(out_size))

# Kernel width read by cnnLayer
k_size = 3

# 1D CNN over the token sequence, max-pooled over time before the classifier head
cnnOverTime = nn.Sequential(
    nn.Embedding(num_embeddings=VOCAB_SIZE, embedding_dim=embed_dim, padding_idx=padding_idx),
    # Swap the last two axes so the embedding axis becomes Conv1d's channel axis
    LambdaLayer(lambda tokens: tokens.permute(0, 2, 1)),
    cnnLayer(embed_dim, embed_dim),
    cnnLayer(embed_dim, embed_dim),
    nn.AvgPool1d(kernel_size=2),
    cnnLayer(embed_dim, embed_dim*2),
    cnnLayer(embed_dim*2, embed_dim*2),
    nn.AvgPool1d(kernel_size=2),
    cnnLayer(embed_dim*2, embed_dim*4),
    cnnLayer(embed_dim*4, embed_dim*4),
    # Collapse the remaining time axis to length 1, then drop it
    nn.AdaptiveMaxPool1d(output_size=1),
    nn.Flatten(),
    nn.Linear(in_features=4*embed_dim, out_features=embed_dim),
    nn.LeakyReLU(),
    nn.BatchNorm1d(num_features=embed_dim),
    nn.Linear(in_features=embed_dim, out_features=NUM_CLASS)
)

In [None]:
# Train the 1D CNN model
cnn_results = train_network(
    cnnOverTime,
    loss_func,
    train_loader,
    epochs=epochs,
    device=device,
    val_loader=test_loader,
    score_funcs={'Accuracy': accuracy_score},
)

In [None]:
# Validation accuracy against total training time, now including the CNN
for curve_data, curve_label in (
    (gru_results, 'GRU'),
    (simpleEmbdAvg_results, 'Average Embedding'),
    (attnEmbd_results, 'Attention Embedding'),
    (cnn_results, 'CNN Adaptive Pooling'),
):
    sns.lineplot(data=curve_data, x='total time', y='val Accuracy', label=curve_label)

plt.grid(True)
plt.show()

## Positional Embeddings Add Sequence Information to Any Model

In [None]:
# Positional Range
position = np.arange(0, 100)

# Sine Position
# x and y are passed by keyword: current seaborn releases no longer accept
# them positionally, and the other plot cells already use keywords.
sns.lineplot(x=position, y=np.sin(position), label="sin(position)")

plt.grid(True)
plt.show()

In [None]:
# Positional Range
position = np.arange(0, 100)

# Plot the raw sine next to a sine ten times slower
for curve_values, curve_label in (
    (np.sin(position), "sin(position)"),
    (np.sin(position/10), "sin(position/10)"),
):
    sns.lineplot(x=position, y=curve_values, label=curve_label)

plt.grid(True)
plt.show()

In [None]:
# Dimensionality
dimensions = 6 

# Positional Range, shaped (100, 1) so it broadcasts against the frequencies
position = np.expand_dims(np.arange(0, 100), 1)

# One frequency per plotted dimension. np.log is used because the math module
# is not imported by this notebook's import cell.
div = np.exp(np.arange(0, dimensions*2, 2) * (-np.log(10000.0) / (dimensions*2)))

# (100, dimensions) table of sines, computed once instead of once per curve
encodings = np.sin(position*div)

for i in range(dimensions):
    sns.lineplot(x=position[:,0], y=encodings[:,i], label="Dim-"+str(i))

plt.grid(True)
plt.show()

### Implementing a Positional Encoding Module

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a sequence of embeddings.

    Even feature columns hold ``sin(position * freq)`` and odd columns hold
    ``cos(position * freq)``. The table is precomputed for ``max_len``
    positions and stored as a buffer, so it is not a trainable parameter.

    Args:
        d_model: size of the feature (last) axis of the input. Odd values are
            supported; the last column then holds a sine.
        dropout: dropout probability applied after the encoding is added.
        max_len: number of positions to precompute; longer inputs are rejected.
        batch_first: if True the input is (B, T, D), otherwise (T, B, D).
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000, batch_first=False):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        self.d_model = d_model
        self.batch_first = batch_first

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one odd column fewer than even columns, so
        # trim the cosine table to the number of odd columns.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        pe = pe.unsqueeze(0).transpose(0, 1)  # (max_len, 1, d_model)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Scale ``x`` by sqrt(d_model), add the encodings, apply dropout.

        Args:
            x: tensor of shape (B, T, D) when ``batch_first`` else (T, B, D).

        Returns:
            Tensor with the same layout and shape as ``x``.

        Raises:
            ValueError: if the sequence is longer than ``max_len``.
        """
        if self.batch_first:
            # The table is laid out time-first, so work in (T, B, D)
            x = x.permute(1, 0, 2)

        seq_len = x.size(0)
        if seq_len > self.pe.size(0):
            raise ValueError(
                "sequence length %d exceeds the %d precomputed positions"
                % (seq_len, self.pe.size(0))
            )

        x = x * math.sqrt(self.d_model) + self.pe[:seq_len, :]
        x = self.dropout(x)

        if self.batch_first:
            x = x.permute(1, 0, 2)

        return x

In [None]:
# Averaged-embedding model with positional encodings added right after the embedding
simplePosEmbdAvg = nn.Sequential(
    nn.Embedding(num_embeddings=VOCAB_SIZE, embedding_dim=embed_dim, padding_idx=padding_idx),
    PositionalEncoding(embed_dim, batch_first=True),
    # Three Linear + LeakyReLU pairs applied to every token independently
    *[
        layer
        for _ in range(3)
        for layer in (nn.Linear(embed_dim, embed_dim), nn.LeakyReLU())
    ],
    # Average over the time axis, keeping the feature axis as is
    nn.AdaptiveAvgPool2d(output_size=(1, None)),
    nn.Flatten(),
    nn.Linear(in_features=embed_dim, out_features=embed_dim),
    nn.LeakyReLU(),
    nn.BatchNorm1d(num_features=embed_dim),
    nn.Linear(in_features=embed_dim, out_features=NUM_CLASS)
)

In [None]:
# Per-token stack for the attention model: positional encoding first,
# then three Linear + LeakyReLU pairs
embd_layers = nn.Sequential(
    PositionalEncoding(embed_dim, batch_first=True),
    *[nn.Sequential(nn.Linear(embed_dim, embed_dim), nn.LeakyReLU()) for _ in range(3)]
)

# Attention-pooled model that uses the positional stack above
attnPosEmbd = nn.Sequential(
    # (B, T) -> (B, D)
    EmbeddingAttentionBag(VOCAB_SIZE, embed_dim, embd_layers=embd_layers, padding_idx=padding_idx),
    nn.Linear(in_features=embed_dim, out_features=embed_dim),
    nn.LeakyReLU(),
    nn.BatchNorm1d(num_features=embed_dim),
    nn.Linear(in_features=embed_dim, out_features=NUM_CLASS)
)

# Train the averaged-embedding model that has positional encodings
posEmbdAvg_results = train_network(
    simplePosEmbdAvg,
    loss_func,
    train_loader,
    epochs=epochs,
    device=device,
    val_loader=test_loader,
    score_funcs={'Accuracy': accuracy_score},
)

In [None]:
# Train the attention model that has positional encodings
attnPosEmbd_results = train_network(
    attnPosEmbd,
    loss_func,
    train_loader,
    epochs=epochs,
    device=device,
    val_loader=test_loader,
    score_funcs={'Accuracy': accuracy_score},
)

Positional Encoding Results:

In [None]:
# Effect of positional encodings: each model with and without them
for curve_data, curve_label in (
    (simpleEmbdAvg_results, 'Average Embedding'),
    (posEmbdAvg_results, 'Average Positional Embedding'),
    (attnEmbd_results, 'Attention Embedding'),
    (attnPosEmbd_results, 'Attention Positional Embedding'),
):
    sns.lineplot(data=curve_data, x='total time', y='val Accuracy', label=curve_label)

plt.grid(True)
plt.show()

In [None]:
# Validation accuracy against total training time: GRU vs. both attention models
for curve_data, curve_label in (
    (gru_results, 'GRU'),
    (attnEmbd_results, 'Attention Embedding'),
    (attnPosEmbd_results, 'Attention Positional Embedding'),
):
    sns.lineplot(x='total time', y='val Accuracy', data=curve_data, label=curve_label)

# Same finishing calls as the other comparison plots; plt.show() also keeps the
# cell from echoing the Axes repr of the last lineplot call.
plt.grid(True)
plt.show()

## Transformers: Big Models for Big Data

### Multihead Attention

Transformer Blocks:

In [None]:
class SimpleTransformerClassifier(nn.Module):
    """Transformer-encoder text classifier with attention pooling over time.

    Pipeline: embedding -> positional encoding -> 3-layer transformer encoder
    -> attention-weighted average over tokens -> small classification head.

    Args:
        vocab_size: number of rows in the embedding table.
        D: embedding / model width (must be divisible by the 8 attention heads).
        padding_idx: id of the padding token, or ``None`` when the input
            contains no padding.
        num_classes: number of output classes. When omitted, the
            notebook-level ``NUM_CLASS`` is used, which is the original
            behaviour.
    """

    def __init__(self, vocab_size, D, padding_idx=None, num_classes=None):
        super(SimpleTransformerClassifier, self).__init__()
        if num_classes is None:
            # Fall back to the global so existing three-argument calls still work
            num_classes = NUM_CLASS
        self.padding_idx = padding_idx
        self.embd = nn.Embedding(vocab_size, D, padding_idx=padding_idx)
        self.position = PositionalEncoding(D, batch_first=True)
        self.transformer = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model=D, nhead=8),num_layers=3)
        self.attn = AttentionAvg(AdditiveAttentionScore(D))
        self.pred = nn.Sequential(
            nn.Flatten(), 
            nn.Linear(D, D),
            nn.LeakyReLU(),
            nn.BatchNorm1d(D),
            nn.Linear(D, num_classes)
        )
    
    def forward(self, input):
        """Classify a (B, T) batch of token ids.

        Args:
            input: integer tensor of token ids, padded with ``padding_idx``.

        Returns:
            Output of the prediction head applied to the attention-pooled
            features (one row of class scores per batch item).
        """
        # True where the position holds a real token, False where it is padding
        if self.padding_idx is not None:
            mask = input != self.padding_idx
        else:
            mask = input == input 
        x = self.embd(input) 
        x = self.position(x)  
        # The encoder layers are time-first, so go (B, T, D) -> (T, B, D) and
        # back. src_key_padding_mask marks positions to ignore with True,
        # i.e. the inverse of `mask`, so self-attention skips padding tokens.
        x = self.transformer(x.permute(1,0,2), src_key_padding_mask=~mask) 
        x = x.permute(1,0,2) 
        # Average over real tokens only; clamp guards against an all-padding row
        valid = mask.unsqueeze(-1).to(x.dtype)
        context = (x * valid).sum(dim=1) / mask.sum(dim=1, keepdim=True).clamp(min=1)
        return self.pred(self.attn(x, context, mask=mask))

In [None]:
# Build the transformer classifier for this vocabulary and embedding size
simpleTransformer = SimpleTransformerClassifier(VOCAB_SIZE, embed_dim, padding_idx=padding_idx)

# Train it with the same settings as every other model
transformer_results = train_network(
    simpleTransformer,
    loss_func,
    train_loader,
    epochs=epochs,
    device=device,
    val_loader=test_loader,
    score_funcs={'Accuracy': accuracy_score},
)

In [None]:
# Final comparison: validation accuracy against total training time
for curve_data, curve_label in (
    (gru_results, 'GRU'),
    (attnEmbd_results, 'Attention Embedding'),
    (attnPosEmbd_results, 'Attention Positional Embedding'),
    (cnn_results, 'CNN Adaptive Pooling'),
    (transformer_results, 'Transformer'),
):
    sns.lineplot(x='total time', y='val Accuracy', data=curve_data, label=curve_label)

# Same finishing calls as the other comparison plots; plt.show() also keeps the
# cell from echoing the Axes repr of the last lineplot call.
plt.grid(True)
plt.show()

NameError: name 'sns' is not defined

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=1d6ef229-9840-40af-b62b-b2ab55589447' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>