In [None]:
from transformers import BertTokenizerFast
from torch.utils.data import Dataset
from datasets import load_dataset
import torch
import torch.nn.functional as F
import math

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-multilingual-cased')
sentence = "By the way, we ball"
tokens = tokenizer.tokenize(sentence)
print(tokens)



['By', 'the', 'way', ',', 'we', 'ball']


In [4]:
mnli_dataset = load_dataset("multi_nli")

In [5]:
print(mnli_dataset['train'][0])

{'promptID': 31193, 'pairID': '31193n', 'premise': 'Conceptually cream skimming has two basic dimensions - product and geography.', 'premise_binary_parse': '( ( Conceptually ( cream skimming ) ) ( ( has ( ( ( two ( basic dimensions ) ) - ) ( ( product and ) geography ) ) ) . ) )', 'premise_parse': '(ROOT (S (NP (JJ Conceptually) (NN cream) (NN skimming)) (VP (VBZ has) (NP (NP (CD two) (JJ basic) (NNS dimensions)) (: -) (NP (NN product) (CC and) (NN geography)))) (. .)))', 'hypothesis': 'Product and geography are what make cream skimming work. ', 'hypothesis_binary_parse': '( ( ( Product and ) geography ) ( ( are ( what ( make ( cream ( skimming work ) ) ) ) ) . ) )', 'hypothesis_parse': '(ROOT (S (NP (NN Product) (CC and) (NN geography)) (VP (VBP are) (SBAR (WHNP (WP what)) (S (VP (VBP make) (NP (NP (NN cream)) (VP (VBG skimming) (NP (NN work)))))))) (. .)))', 'genre': 'government', 'label': 1}


In [6]:
class MNLIDataset(Dataset):
    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
    def __len__(self):
        return len(self.data)
    def  __getitem__(self, idx):
        example = self.data[idx]
        premise = example['premise']
        hypothesis = example['hypothesis']
        label = example['label']
        encoded_pair = self.tokenizer.encode_plus(premise, hypothesis, max_length=self.max_length, padding='max_length', truncation=True,return_tensors='pt')
        input_ids = encoded_pair['input_ids'].squeeze(0)
        attention_mask = encoded_pair['attention_mask'].squeeze(0)
        # Keeping this commented out for now, maybe not very essential for encoder only models? Investigate further...
        # token_type_ids = encoded_pair.get('token_type_ids', torch.zeros_like(input_ids))
        return {'input_ids': input_ids,'attention_mask': attention_mask,# 'token_type_ids': token_type_ids,
'labels': torch.tensor(label)}


In [None]:
train_data = mnli_dataset["train"]
max_seq_length = 128
train_dataset = MNLIDataset(train_data, tokenizer, max_seq_length)
print(f"Size of training dataset: {len(train_dataset)}")

Size of training dataset: 392702


In [8]:
sample = train_dataset[0]
print(sample['input_ids'].shape)
print(sample['attention_mask'].shape)
print(sample['labels'])

torch.Size([128])
torch.Size([128])
tensor(1)


In [None]:
class MultiHeadSelfAttention(nn.Module):
    def __init__(self, hidden_size, num_attention_heads, dropout_rate):
        super().__init__()
        self.num_heads = num_attention_heads
        self.head_dim = hidden_size // num_attention_heads
        assert self.head_dim * self.num_heads == hidden_size

        self.query = nn.Linear(hidden_size, hidden_size)
        self.key = nn.Linear(hidden_size, hidden_size)
        self.value = nn.Linear(hidden_size, hidden_size)

        self.dropout = nn.Dropout(dropout_rate)
        self.output = nn.Linear(hidden_size, hidden_size)
    def forward(self, query, key, value, mask=None):
        batch_size = query.size(0)
        seq_len_q, seq_len_k, seq_len_v = query.size(1), key.size(1), value.size(1)
        query = self.query(query)
        key = self.query(key)
        value = self.value(value)
        query = query.view(batch_size, seq_len_q, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
        key = key.view(batch_size, seq_len_k, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
        value = value.view(batch_size, seq_len_v, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
        attention_scores = torch.matmul(query, key.transpose(-2,-1))
        attention_scores = attention_scores/(self.head_dim**0.5)
        if mask is not None:
            attention_scores = attention_scores.masked_fill(mask==0, float('-inf'))
        attention_weights = F.softmax(attention_scores,dim=-1)
        scaled_attention = torch.matmul(attention_weights, value)
        scaled_attention = scaled_attention.permute(0, 2, 1, 3).contiguous().view(batch_size, seq_len_q, self.num_heads * self.head_dim)
        output = self.output(scaled_attention)
        return output


            


NameError: name 'nn' is not defined

In [None]:
class FeedForwardNetwork(nn.Module):
    def __init__(self, hidden_size, intermediate_size, dropout_rate):
        super().__init__()
        self.dense1 = nn.Linear(hidden_size, intermediate_size)
        self.relu = nn.ReLU()
        self.dense2 = nn.Linear(intermediate_size, hidden_size)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        x = F.relu(self.dense1(x))
        x = self.dropout(x)
        x = self.dense2(x)
        return x

In [None]:
class TransformerEncoderLayer(nn.Module):
    def __init__(self, hidden_size, num_attention_heads, intermediate_size, dropout_rate):
        super().__init__()
        self.self_attention = MultiHeadSelfAttention(hidden_size, num_attention_heads, dropout_rate)
        self.feed_forward = FeedForwardNetwork(hidden_size, intermediate_size, dropout_rate)
        self.norm1 = nn.LayerNorm(hidden_size)
        self.norm2 = nn.LayerNorm(hidden_size)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x, mask):
        attention_output = self.self_attention(x,x, x, mask)
        normed_output1 = self.norm1(attention_output + x)
        ff_output = self.feed_forward(normed_output1)
        final_output = self.norm2(ff_output + normed_output1)
        return final_output

In [None]:
class FactorizedEmbedding(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size):
        super().__init__()
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.projection = nn.Linear(embedding_dim, hidden_size)

    def forward(self, input_ids):
        factor_embeds = self.word_embeddings(input_ids)
        project_embeds = self.projection(factor_embeds)
        return project_embeds

In [None]:
class PositionalEncoding(nn.Module):
    def __init__(self, hidden_size, max_seq_length, dropout_rate):
        super().__init__()
        self.dropout = nn.Dropout(dropout_rate)

        position = torch.arange(0, max_seq_length).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, hidden_size, 2) * -(math.log(10000.0) / hidden_size))
        self.pe = torch.zeros(max_seq_length, 1, hidden_size)
        self.pe[:, 0, 0::2] = torch.sin(position * div_term)
        self.pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', self.pe) # Store as a buffer (not a learnable parameter)

    def forward(self, x):
        seq_length = x.size(1)
        pe = self.pe[:seq_length].squeeze(1)
        x = x + pe.unsqeeze(0)
        x = self.dropout(x)
        return x

In [None]:
class AtomBERT(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers, num_attention_heads, intermediate_size, num_classes, max_seq_length, dropout_rate):
        super().__init__()
        self.embedding = FactorizedEmbedding(vocab_size, embedding_dim, hidden_size)
        self.positional_encoding = PositionalEncoding(hidden_size, max_seq_length, dropout_rate)
        self.encoder_layer = TransformerEncoderLayer(hidden_size, num_attention_heads, intermediate_size, dropout_rate)
        self.num_layers = num_layers
        self.dropout = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        embeddings = self.embedding(input_ids)
        embeddings = self.positional_encoding(embeddings)

        # Share the encoder layer across all layers
        encoder_output = embeddings
        for _ in range(self.num_layers):
            encoder_output = self.encoder_layer(encoder_output, attention_mask)

        # You might want to pool or take the [CLS] token representation here
        # For simplicity, let's take the mean of the sequence
        pooled_output = torch.mean(encoder_output, dim=1)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits