# Setup

## Imports

In [1]:
import torch
from datasets import load_dataset
from torch import nn, Tensor
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizerFast

from IPython.display import clear_output
import matplotlib.pyplot as plt
from tqdm import tqdm
import math
import time
import numpy as np
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


## Constants

In [None]:
# NOTE(review): the seed below is commented out, so nothing in this notebook
# pins the random state - runs are not reproducible as written.
# RANDOM_STATE = 42
# Mini-batch size handed to every DataLoader built in the "To Tensor" section.
BATCH_SIZE = 32

## Hardware

In [None]:
# Ask PyTorch to release its cached, currently unused GPU memory.
torch.cuda.empty_cache()

# Cell output: names of all CUDA devices PyTorch can see (empty list if none).
[torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())]

In [None]:
# Use the first CUDA device when one is available, otherwise fall back to CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
# Bare expression so the cell displays the selected device.
device

## Functions

In [2]:
def count_parameters(model):
    """Return how many scalar values in ``model`` are trainable.

    Walks ``model.parameters()`` and adds up ``numel()`` of every parameter
    whose ``requires_grad`` flag is set; frozen parameters are ignored.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

In [3]:
def tokenize(texts, tokenizer, max_length=512):
    """Tokenize ``texts`` into fixed-length PyTorch tensors.

    Args:
        texts: A string, a list/tuple of strings, or any other iterable of
            strings (e.g. a pandas Series or a generator). Iterables that are
            not already a ``str``/``list``/``tuple`` are materialised into a
            list before being handed to the tokenizer.
        tokenizer: Callable accepting ``(texts, return_tensors=..., padding=...,
            max_length=..., truncation=...)`` and returning a mapping with
            ``'input_ids'`` and ``'attention_mask'`` entries.
        max_length: Length every sequence is padded / truncated to.
            Defaults to 512, the value previously hard-coded here.

    Returns:
        Tuple ``(input_ids, attention_mask)`` taken from the tokenizer result.
    """
    # Series / generators are not list-like for the tokenizer, so normalise
    # them. Strings must be left alone: list("abc") would split characters.
    if not isinstance(texts, (str, list, tuple)):
        texts = list(texts)

    encoded = tokenizer(
        texts,
        return_tensors="pt",
        padding='max_length',
        max_length=max_length,
        truncation=True
    )
    return encoded['input_ids'], encoded['attention_mask']

In [4]:
def _plot_training_progress(history, train_history=None, valid_history=None):
    """Redraw the live training plots: per-batch loss (left), per-epoch curves (right)."""
    fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 8))

    # wait=True: the previous figure is only cleared once the new one is ready.
    clear_output(True)
    ax[0].plot(history, label='train loss')
    ax[0].set_xlabel('Batch')
    ax[0].set_title('Train loss')
    if train_history is not None:
        ax[1].plot(train_history, label='general train history')
        ax[1].set_xlabel('Epoch')
    if valid_history is not None:
        ax[1].plot(valid_history, label='general valid history')

    # A bare plt.legend() only touches the current (last) axes; add a legend to
    # every panel that actually has something plotted on it.
    for axis in ax:
        if axis.lines:
            axis.legend()
    plt.show()
    # Close the figure so repeated redraws do not pile up open figures.
    plt.close(fig)


def train(device, model, iterator, optimizer, criterion, clip, train_history=None, valid_history=None, n_step=100):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        device: Device the batch tensors are moved to.
        model: Module to train; called as ``model(X)`` and its output is
            flattened with ``.view(-1)`` before the loss is computed.
        iterator: Sized iterable of batches where ``batch[0]`` is the input
            and ``batch[1]`` the target.
        optimizer: Optimizer stepping ``model``'s parameters.
        criterion: Loss callable ``criterion(output, y)``.
        clip: Maximum gradient norm passed to ``clip_grad_norm_``.
        train_history: Optional per-epoch train losses drawn on the right plot.
        valid_history: Optional per-epoch validation losses drawn on the right plot.
        n_step: Redraw the progress plots every ``n_step`` batches.

    Returns:
        Sum of the batch losses divided by ``len(iterator)``.

    Raises:
        ZeroDivisionError: If ``iterator`` is empty.
    """
    model.train()
    epoch_loss = 0
    history = []
    for i, batch in enumerate(iterator):
        X = batch[0].to(device)
        y = batch[1].to(device)
        optimizer.zero_grad()

        output = model(X).view(-1)
        loss = criterion(output, y)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        # Read the scalar once; it feeds both the running total and the plot.
        batch_loss = loss.item()
        epoch_loss += batch_loss
        history.append(batch_loss)

        if (i + 1) % n_step == 0:
            _plot_training_progress(history, train_history, valid_history)
    return epoch_loss / len(iterator)

In [5]:
def evaluate(device, model, iterator, criterion):
    """Compute the mean per-batch loss of ``model`` over ``iterator``.

    The model is switched to eval mode and gradients are disabled. For each
    batch, ``batch[0]`` is the input and ``batch[1]`` the target; the model
    output is flattened with ``.view(-1)`` before ``criterion`` is applied.

    Returns:
        Accumulated batch losses divided by ``len(iterator)``.
    """
    model.eval()
    running_loss = 0
    with torch.no_grad():
        for batch in iterator:
            inputs, targets = batch[0].to(device), batch[1].to(device)
            predictions = model(inputs).view(-1)
            running_loss += criterion(predictions, targets).item()
    return running_loss / len(iterator)

In [6]:
def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and seconds.

    Returns:
        ``(minutes, seconds)`` as ints, both truncated toward zero.
    """
    duration = end_time - start_time
    minutes = int(duration / 60)
    seconds = int(duration - (minutes * 60))
    return minutes, seconds

In [7]:
def predict(device, model, iterator):
    """Run ``model`` over ``iterator`` and collect targets and predictions.

    Args:
        device: Device the input tensors are moved to.
        model: Module called as ``model(X)``; its output is flattened with
            ``.view(-1)``.
        iterator: Iterable of batches where ``batch[0]`` is the input and
            ``batch[1]`` the target tensor.

    Returns:
        Tuple ``(y_true, y_pred)`` of Python lists, in iteration order.
    """
    model.eval()
    y_true = []
    y_pred = []
    with torch.no_grad():
        # Wrap the iterator itself (not enumerate(...)) so tqdm can read its
        # length and show a total / ETA.
        for batch in tqdm(iterator):
            X = batch[0].to(device)
            output = model(X).view(-1)
            y_pred.extend(output.cpu().tolist())
            # Targets are only copied out, so they never need to visit `device`.
            y_true.extend(batch[1].tolist())
    return y_true, y_pred

## Transformer modules

### Default modules

In [None]:
class PositionalEmbedding(nn.Module):
    """Fixed sinusoidal positional encoding added to token embeddings.

    For position ``pos`` and pair index ``k`` (columns ``2k`` and ``2k + 1``):

        pe[pos, 2k]     = sin(pos / 10000^(2k / embed_dim))
        pe[pos, 2k + 1] = cos(pos / 10000^(2k / embed_dim))

    so each sine/cosine pair shares one frequency. The table is stored as the
    non-trainable buffer ``pe`` of shape ``(1, max_seq_len, embed_dim)``.

    Args:
        max_seq_len: Number of positions to precompute.
        embed_model_dim: Embedding width; odd widths are supported (the last
            column then holds a sine without a paired cosine).
    """

    def __init__(self, max_seq_len, embed_model_dim):
        super(PositionalEmbedding, self).__init__()
        self.embed_dim = embed_model_dim

        positions = torch.arange(max_seq_len, dtype=torch.float32).unsqueeze(1)
        # Even column indices 0, 2, 4, ... already equal 2k, so the exponent is
        # simply column / embed_dim. Computed in log space for stability.
        inv_freq = torch.exp(
            torch.arange(0, self.embed_dim, 2, dtype=torch.float32)
            * (-math.log(10000.0) / self.embed_dim)
        )
        angles = positions * inv_freq  # (max_seq_len, ceil(embed_dim / 2))

        pe = torch.zeros(max_seq_len, self.embed_dim)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd embed_dim there is one fewer cosine column than sine columns.
        pe[:, 1::2] = torch.cos(angles[:, : self.embed_dim // 2])
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Scale ``x`` by ``sqrt(embed_dim)`` and add the encodings for its length.

        ``x`` is indexed as ``x.size(1)`` for the sequence length, and the
        first ``seq_len`` rows of the table are broadcast over dimension 0.
        """
        x = x * math.sqrt(self.embed_dim)
        seq_len = x.size(1)
        # Buffers never require grad, so no Variable wrapper is needed.
        return x + self.pe[:, :seq_len]

In [None]:
class ScaleDotProductAttention(nn.Module):
    """
    Scaled dot-product attention: softmax(Q K^T / sqrt(d)) V

    Query : what each position is looking for
    Key   : what each position is matched against
    Value : what is mixed together according to the attention weights
    """
    def __init__(self):
        super(ScaleDotProductAttention, self).__init__()
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, q, k, v, mask=None):
        """Return ``(attended_values, attention_weights)``.

        ``k`` must be 4-dimensional; its last dimension sets the scaling
        factor. Positions where ``mask == 0`` get a score of -10000 before
        the softmax.
        """
        _, _, _, head_dim = k.size()
        scale = torch.sqrt(torch.tensor(head_dim))
        logits = torch.matmul(q, k.transpose(2, 3)) / scale
        if mask is not None:
            logits = logits.masked_fill(mask == 0, -10000)
        weights = self.softmax(logits)
        return torch.matmul(weights, v), weights

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head attention built from linear projections.

    Queries, keys and values are projected from ``input_size`` to
    ``hidden_size``, split into ``n_heads`` heads, attended with
    ``ScaleDotProductAttention`` and merged again. The final projection maps
    back to ``input_size`` so the result can be added to the layer input in a
    residual connection even when ``hidden_size != input_size``.

    Args:
        input_size: Width of the incoming (and outgoing) features.
        hidden_size: Total width of the projected q/k/v across all heads.
        n_heads: Number of attention heads; must divide ``hidden_size``.

    Raises:
        ValueError: If ``hidden_size`` is not divisible by ``n_heads``.
    """

    def __init__(self, input_size, hidden_size, n_heads):
        super(MultiHeadAttention, self).__init__()
        if hidden_size % n_heads != 0:
            # Fail at construction instead of with an opaque view() error in forward.
            raise ValueError(
                "hidden_size ({}) must be divisible by n_heads ({})".format(hidden_size, n_heads)
            )
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_heads = n_heads
        self.attention = ScaleDotProductAttention()
        self.w_q = nn.Linear(self.input_size, self.hidden_size)
        self.w_k = nn.Linear(self.input_size, self.hidden_size)
        self.w_v = nn.Linear(self.input_size, self.hidden_size)
        # Project back to the input width (identical to the previous
        # hidden -> hidden layer whenever hidden_size == input_size).
        self.w_concat = nn.Linear(self.hidden_size, self.input_size)

    def forward(self, q, k, v, mask=None):
        """Attend ``q`` over ``k``/``v`` and return a tensor of width ``input_size``."""
        q, k, v = self.w_q(q), self.w_k(k), self.w_v(v)
        q, k, v = self.split(q), self.split(k), self.split(v)
        # The attention weights are returned by the inner module but not used here.
        out, _ = self.attention(q, k, v, mask=mask)
        out = self.concat(out)
        out = self.w_concat(out)
        return out

    def split(self, tensor):
        """
        split tensor by number of head

        :param tensor: [batch_size, length, d_model]
        :return: [batch_size, head, length, d_tensor]
        """
        batch_size, length, d_model = tensor.size()

        d_tensor = d_model // self.n_heads
        # it is similar to group convolution (split by number of heads)
        return tensor.view(batch_size, length, self.n_heads, d_tensor).transpose(1, 2)

    def concat(self, tensor):
        """
        inverse function of self.split(tensor : torch.Tensor)

        :param tensor: [batch_size, head, length, d_tensor]
        :return: [batch_size, length, d_model]
        """
        batch_size, head, length, d_tensor = tensor.size()
        d_model = head * d_tensor

        # transpose() yields a non-contiguous tensor; view() needs contiguous memory.
        return tensor.transpose(1, 2).contiguous().view(batch_size, length, d_model)

In [None]:
class EncoderLayer(nn.Module):
    """One post-norm encoder block: self-attention, then a two-layer feed-forward net.

    Each sub-layer output goes through dropout, is added to the sub-layer
    input, and the sum is layer-normalised. A single ``nn.Dropout`` instance
    is shared by all three dropout sites.
    """

    def __init__(self, input_size, hidden_size, n_heads, drop_prob=0.1):
        super().__init__()
        self.attention = MultiHeadAttention(input_size, hidden_size, n_heads)
        self.norm1 = nn.LayerNorm(input_size)
        self.dropout = nn.Dropout(p=drop_prob)

        # position-wise feed-forward: input_size -> hidden_size -> input_size
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, input_size)
        self.relu = nn.ReLU()

        self.norm2 = nn.LayerNorm(input_size)

    def forward(self, x, src_mask):
        """Apply the block to ``x``; ``src_mask`` is forwarded to the attention module."""
        # --- self-attention sub-layer ---
        attended = self.attention(q=x, k=x, v=x, mask=src_mask)
        x = self.norm1(self.dropout(attended) + x)

        # --- feed-forward sub-layer ---
        hidden = self.dropout(self.relu(self.linear1(x)))
        projected = self.dropout(self.linear2(hidden))
        return self.norm2(projected + x)

In [None]:
class Encoder(nn.Module):
    """Stack of ``n_layers`` identically configured ``EncoderLayer`` blocks."""

    def __init__(self, input_size, hidden_size, n_heads, n_layers, drop_prob=0.1):
        super().__init__()
        self.layers = nn.ModuleList([
            EncoderLayer(
                input_size=input_size,
                hidden_size=hidden_size,
                n_heads=n_heads,
                drop_prob=drop_prob
            )
            for _ in range(n_layers)
        ])

    def forward(self, x, src_mask):
        """Feed ``x`` through every layer in order, passing ``src_mask`` to each."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, src_mask)
        return hidden

In [None]:
# TODO: Decoder

In [None]:
# TODO: rewrite
class BinaryClassificationTransformerModel(nn.Module):
    """Transformer encoder with a mean-pooled, single-output sigmoid head.

    Token ids are embedded, positionally encoded, run through the encoder
    stack, averaged over dimension 1 and mapped to one value per sequence,
    which is squashed with a sigmoid.
    """

    def __init__(
        self,
        ntoken: int,
        model_size: int = 128,
        n_heads: int = 4,
        nlayers: int = 1,
        dropout: float = 0.1,
        maxlen: int = 512,
    ):
        super().__init__()
        self.model_size = model_size
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEmbedding(maxlen, model_size)
        self.emb = nn.Embedding(ntoken, model_size)
        # The encoder's hidden width is tied to model_size.
        self.transformer_encoder = Encoder(
            input_size=self.model_size,
            hidden_size=self.model_size,
            n_heads=n_heads,
            n_layers=nlayers,
            drop_prob=dropout
        )
        self.decoder = nn.Linear(model_size, 1)  # Bin classifier
        self.init_weights()

    def init_weights(self) -> None:
        """Draw embedding and head weights from U(-0.1, 0.1); zero the head bias."""
        bound = 0.1
        nn.init.uniform_(self.emb.weight, -bound, bound)
        nn.init.zeros_(self.decoder.bias)
        nn.init.uniform_(self.decoder.weight, -bound, bound)

    def forward(self, src: Tensor, src_mask: Tensor = None) -> Tensor:
        """Return one sigmoid output per sequence.

        ``src_mask`` is only forwarded to the encoder; the mean over
        dimension 1 is taken over every position of the encoder output.
        """
        embedded = self.pos_encoder(self.emb(src))
        encoded = self.transformer_encoder(embedded, src_mask)
        logits = self.decoder(encoded.mean(dim=1))
        return torch.sigmoid(logits)

### Hypercube Transformer

In [None]:
class HyperCubeLayer(nn.Module):
    """Linear-style layer that treats the feature vector as a square grid.

    The last dimension of the input (``in_features``, a perfect square) is
    reshaped to ``sqrt(in_features) x sqrt(in_features)``, multiplied by a
    weight of shape ``(out_sqrt_features, sqrt(in_features), sqrt(in_features))``
    and flattened again.

    NOTE(review): ``forward`` swaps dims 1 and 2, so it appears to be written
    for inputs of shape ``(batch, seq, in_features)``; and the matmul / bias
    broadcasting appears to require ``out_sqrt_features`` to equal
    ``sqrt(in_features)`` (or 1) - confirm before relying on other shapes.

    Args:
        in_features: Size of the last input dimension; must be a perfect square.
        out_sqrt_features: Leading dimension of the weight and length of the bias.
        bias: When False, no bias parameter is created and none is added.
        device, dtype: Forwarded to the parameter factories.
    """
    __constants__ = ['in_features', 'out_sqrt_features']
    in_features: int
    out_sqrt_features: int
    weight: torch.Tensor

    def __init__(self, in_features: int, out_sqrt_features: int, bias: bool = True,
                 device=None, dtype=None) -> None:
        factory_kwargs = {'device': device, 'dtype': dtype}
        super().__init__()
        hc_input_size = np.sqrt(in_features)
        assert hc_input_size % 1 == 0, "in_features must be a perfect square"
        self.hc_input_size = hc_input_size = int(hc_input_size)
        self.in_features = in_features
        self.out_sqrt_features = out_sqrt_features  # No. of output features = out_sqrt_features * sqrt(in_features)
        self.weight = nn.Parameter(torch.empty((out_sqrt_features, hc_input_size, hc_input_size), **factory_kwargs))
        if bias:
            self.bias = nn.Parameter(torch.empty((out_sqrt_features,), **factory_kwargs))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self) -> None:
        """Initialise weight (Kaiming uniform) and bias (uniform in +-1/sqrt(fan_in))."""
        nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        if self.bias is not None:
            fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight)
            bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
            nn.init.uniform_(self.bias, -bound, bound)

    def extra_repr(self) -> str:
        return 'in_features={}, hc_input_size={}, out_sqrt_features={}, bias={}'.format(
            self.in_features, self.hc_input_size, self.out_sqrt_features, self.bias is not None
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the grid-wise matmul and return a tensor flattened over the last two dims."""
        # reshape (not view) so non-contiguous inputs are accepted as well.
        x = x.reshape((*x.shape[:-1], self.hc_input_size, self.hc_input_size))
        x = (x.movedim(1, 2) @ self.weight).movedim(2, 1)
        # The layer may have been built with bias=False.
        if self.bias is not None:
            x = x + self.bias
        return x.flatten(start_dim=-2)

In [None]:
class HyperCubeBlock(nn.Module):
    """Two ``HyperCubeLayer`` modules with a ReLU in between.

    Args:
        input_size: Feature width fed to both layers.
        out_sqrt_features: ``out_sqrt_features`` of the second layer.
            Defaults to ``int(sqrt(input_size))``, the same value the first
            layer always uses.
    """

    def __init__(self, input_size, out_sqrt_features=None):
        super(HyperCubeBlock, self).__init__()
        side = int(np.sqrt(input_size))
        if out_sqrt_features is None:
            # Falling back to input_size here would give the second layer a
            # leading weight dimension that cannot broadcast against the
            # sqrt(input_size)-sized grid in HyperCubeLayer.forward.
            out_sqrt_features = side
        self.hc_layers_1 = HyperCubeLayer(input_size, side)  # TODO: fix
        self.hc_layers_2 = HyperCubeLayer(input_size, out_sqrt_features)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return ``hc_layers_2(relu(hc_layers_1(x)))``."""
        x = self.hc_layers_1(x)
        # !Check if needed: transposing the sqrt x sqrt grid between the two layers
        # sq = int(np.sqrt(x.shape[-1]))
        # x = x.view((*x.shape[:-1], sq, sq))
        # x = x.transpose(-1,-2)
        # x = x.flatten(start_dim=-2)
        x = self.relu(x)
        x = self.hc_layers_2(x)
        return x

In [None]:
class MultiHeadAttentionHC(nn.Module):
    """Multi-head attention whose projections are ``HyperCubeBlock`` modules.

    The ``hidden_size`` argument is currently ignored: the stored
    ``hidden_size`` is overwritten with ``input_size`` (marked TMP), and every
    block is built with ``int(sqrt(hidden_size))`` as its second argument.
    """

    def __init__(self, input_size, hidden_size, n_heads):
        super().__init__()
        self.input_size = input_size
        # TMP: second param is also input_size instead of hidden_size
        self.hidden_size = self.input_size
        self.n_heads = n_heads
        self.attention = ScaleDotProductAttention()

        side = int(np.sqrt(self.hidden_size))
        self.w_q = HyperCubeBlock(self.input_size, side)
        self.w_k = HyperCubeBlock(self.input_size, side)
        self.w_v = HyperCubeBlock(self.input_size, side)
        self.w_concat = HyperCubeBlock(self.hidden_size, side)

    def forward(self, q, k, v, mask=None):
        """Project q/k/v, attend per head, merge the heads and project the result."""
        heads_q = self.split(self.w_q(q))
        heads_k = self.split(self.w_k(k))
        heads_v = self.split(self.w_v(v))
        # the attention module also returns the weights; only the values are used
        attended, _ = self.attention(heads_q, heads_k, heads_v, mask=mask)
        return self.w_concat(self.concat(attended))

    def split(self, tensor):
        """
        Cut the feature dimension into ``n_heads`` equal chunks.

        :param tensor: [batch_size, length, d_model]
        :return: [batch_size, head, length, d_tensor]
        """
        bsz, seq_len, width = tensor.size()
        per_head = width // self.n_heads
        # comparable to a group convolution: one group per head
        return tensor.view(bsz, seq_len, self.n_heads, per_head).transpose(1, 2)

    def concat(self, tensor):
        """
        Undo ``split``: merge the heads back into one feature dimension.

        :param tensor: [batch_size, head, length, d_tensor]
        :return: [batch_size, length, d_model]
        """
        bsz, n_heads, seq_len, per_head = tensor.size()
        merged = tensor.transpose(1, 2).contiguous()
        return merged.view(bsz, seq_len, n_heads * per_head)

In [None]:
class EncoderLayerHC(nn.Module):
    """Encoder block using ``MultiHeadAttentionHC`` and ``HyperCubeBlock`` feed-forward layers.

    Layout matches the plain encoder layer: each sub-layer is followed by
    dropout, a residual add and a LayerNorm. For the feed-forward blocks the
    ``hidden_size`` argument is replaced by ``input_size`` (marked TMP).
    """

    def __init__(self, input_size, hidden_size, n_heads, drop_prob=0.1):
        super().__init__()
        self.attention = MultiHeadAttentionHC(input_size, hidden_size, n_heads)
        self.norm1 = nn.LayerNorm(input_size)
        self.dropout = nn.Dropout(p=drop_prob)

        # TMP: second param is also input_size instead of hidden_size
        hidden_size = input_size
        self.linear1 = HyperCubeBlock(input_size, int(np.sqrt(hidden_size)))
        self.linear2 = HyperCubeBlock(hidden_size, int(np.sqrt(input_size)))
        self.relu = nn.ReLU()

        self.norm2 = nn.LayerNorm(input_size)

    def forward(self, x, src_mask):
        """Apply the block to ``x``; ``src_mask`` is forwarded to the attention module."""
        # --- self-attention sub-layer ---
        attended = self.attention(q=x, k=x, v=x, mask=src_mask)
        x = self.norm1(self.dropout(attended) + x)

        # --- feed-forward sub-layer ---
        hidden = self.dropout(self.relu(self.linear1(x)))
        projected = self.dropout(self.linear2(hidden))
        return self.norm2(projected + x)

In [None]:
class EncoderHC(nn.Module):
    """Stack of ``n_layers`` identically configured ``EncoderLayerHC`` blocks."""

    def __init__(self, input_size, hidden_size, n_heads, n_layers, drop_prob=0.1):
        super().__init__()
        self.layers = nn.ModuleList([
            EncoderLayerHC(
                input_size=input_size,
                hidden_size=hidden_size,
                n_heads=n_heads,
                drop_prob=drop_prob
            )
            for _ in range(n_layers)
        ])

    def forward(self, x, src_mask):
        """Feed ``x`` through every layer in order, passing ``src_mask`` to each."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, src_mask)
        return hidden

In [None]:
# TODO: EncoderLayerHC

In [None]:
# TODO: rewrite transformer

# Data

## Load

In [8]:
WMT14_DATASET = load_dataset('wmt14', 'de-en')

Downloading and preparing dataset wmt14/de-en to C:/Users/yaram/.cache/huggingface/datasets/wmt14/de-en/1.0.0/2de185b074515e97618524d69f5e27ee7545dcbed4aa9bc1a4235710ffca33f4...


Downloading data: 100%|██████████| 658M/658M [01:32<00:00, 7.14MB/s]
Downloading data: 100%|██████████| 919M/919M [02:07<00:00, 7.19MB/s]]
Downloading data: 100%|██████████| 80.5M/80.5M [00:11<00:00, 6.76MB/s]
Downloading data: 100%|██████████| 38.7M/38.7M [00:05<00:00, 7.37MB/s]
Downloading data files: 100%|██████████| 5/5 [04:02<00:00, 48.44s/it]
Extracting data files: 100%|██████████| 5/5 [00:33<00:00,  6.76s/it]
Extracting data files: 0it [00:00, ?it/s]
                                                                                           

Dataset wmt14 downloaded and prepared to C:/Users/yaram/.cache/huggingface/datasets/wmt14/de-en/1.0.0/2de185b074515e97618524d69f5e27ee7545dcbed4aa9bc1a4235710ffca33f4. Subsequent calls will reuse this data.


100%|██████████| 3/3 [00:00<00:00, 10.33it/s]


In [9]:
WMT14_DATASET

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 4508785
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 3003
    })
})

In [10]:
WMT14_DATASET['train'][0]

{'translation': {'de': 'Wiederaufnahme der Sitzungsperiode',
  'en': 'Resumption of the session'}}

In [36]:
%%time
# Turn each split's 'translation' column into a DataFrame. The sample record
# displayed above is a dict with 'de' and 'en' keys, which become the columns.
# NOTE(review): the train split has 4,508,785 rows and the recorded run of this
# cell ended in KeyboardInterrupt - consider working on a subset of rows.
df_train = pd.DataFrame(WMT14_DATASET['train']['translation'])
df_val = pd.DataFrame(WMT14_DATASET['validation']['translation'])
df_test = pd.DataFrame(WMT14_DATASET['test']['translation'])

KeyboardInterrupt: 

In [29]:
df_train

Unnamed: 0,de,en
0,Wiederaufnahme der Sitzungsperiode,Resumption of the session
1,"Ich erkläre die am Freitag, dem 17. Dezember u...",I declare resumed the session of the European ...
2,"Wie Sie feststellen konnten, ist der gefürchte...","Although, as you will have seen, the dreaded '..."
3,Im Parlament besteht der Wunsch nach einer Aus...,You have requested a debate on this subject in...
4,Heute möchte ich Sie bitten - das ist auch der...,"In the meantime, I should like to observe a mi..."
...,...,...
4508780,Das bleibt eine der größten Errungenschaften i...,Their achievement remains one of the greatest ...
4508781,Gleichzeitig scheint sich Zumas revolutionäre ...,"At the same time, Zuma’s revolutionary generat..."
4508782,"In einer Region, wo die älteren Menschen sehr ...","In a region that reveres the elderly, Zuma’s a..."
4508783,Drei von zehn Südafrikanern sind jünger als 15...,Three in ten South Africans are younger than 1...


In [35]:
df_train['en']

TypeError: 'int' object is not subscriptable

In [34]:
%%time
tokenized_en_input_train = tokenize(df_train['en'])
tokenized_en_input_val = tokenize(df_val['en'])
tokenized_en_input_test = tokenize(df_test['en'])

tokenized_de_output_train = tokenize(df_train['de'])
tokenized_de_output_val = tokenize(df_val['de'])
tokenized_de_output_test = tokenize(df_test['de'])

TypeError: 'int' object is not subscriptable

In [26]:
TensorDataset(WMT14_DATASET['validation']['translation'])

AttributeError: 'list' object has no attribute 'size'

In [None]:
# Pair English inputs with German targets. The tokenized_* variables hold
# (input_ids, attention_mask) tuples, and TensorDataset only accepts tensors,
# so index 0 (the input ids) is used on both sides.
train_dataset = TensorDataset(
        tokenized_en_input_train[0],
        tokenized_de_output_train[0]
    )
val_dataset = TensorDataset(
        tokenized_en_input_val[0],
        tokenized_de_output_val[0]
    )
test_dataset = TensorDataset(
        tokenized_en_input_test[0],
        tokenized_de_output_test[0]
    )

## Preprocessing

In [13]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

Downloading: 100%|██████████| 29.0/29.0 [00:00<00:00, 27.8kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading: 100%|██████████| 213k/213k [00:00<00:00, 721kB/s] 
Downloading: 100%|██████████| 436k/436k [00:00<00:00, 1.04MB/s]
Downloading: 100%|██████████| 570/570 [00:00<00:00, 528kB/s]


In [None]:
%%time
# Tokenize each split; tokenize() returns an (input_ids, attention_mask) pair.
# NOTE(review): the frames shown above only display 'de' and 'en' columns - a
# 'text' column is not visible anywhere in this notebook; confirm which column
# is intended here.
df_train_inputs, df_train_mask = tokenize(list(df_train['text']), tokenizer)
df_val_inputs, df_val_mask = tokenize(list(df_val['text']), tokenizer)
df_test_inputs, df_test_mask = tokenize(list(df_test['text']), tokenizer)

## To Tensor

In [None]:
# TODO: fix for without labelling
%%time

# convert the data to torch tensors
# train_labels = torch.tensor(df_train['label'].to_numpy(), dtype=torch.float32)
# valid_labels = torch.tensor(df_val['label'].to_numpy(), dtype=torch.float32)
# test_labels = torch.tensor(df_test['label'].to_numpy(), dtype=torch.float32)

# create TensorDataset
train_dataset = TensorDataset(df_train_inputs, train_labels)
valid_dataset = TensorDataset(df_val_inputs, valid_labels)
test_dataset = TensorDataset(df_test_inputs, test_labels)

# create dataloader
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

loaders = {
    "train": train_dataloader,
    "val": valid_dataloader,
}

# Train