In [None]:
# Install necessary packages
!pip install datasets tokenizers

import torch
from torch import nn
from torch.optim import Adam
from torch.utils.data import DataLoader
from datasets import load_dataset
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, decoders, trainers
from tokenizers.processors import BertProcessing
import numpy as np
from math import sqrt

# Load the IMDb dataset; the 'train' and 'test' splits are used below.
ds = load_dataset("imdb")

# Collect the raw review strings of the training split; they are the corpus
# the tokenizer vocabulary is learned from.
training_data = [item["text"] for item in ds['train']]

# Build a WordPiece tokenizer from scratch. Text is NFD-normalized and
# lowercased, then pre-split by the Whitespace and Punctuation pre-tokenizers.
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))
tokenizer.normalizer = normalizers.Sequence([
    normalizers.NFD(),
    normalizers.Lowercase()
])
tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
    pre_tokenizers.Whitespace(),
    pre_tokenizers.Punctuation()
])
# The special tokens are registered with the trainer so that they receive ids
# in the learned vocabulary ([PAD] is used for padding further down).
trainer = trainers.WordPieceTrainer(
    vocab_size=30000,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
)
tokenizer.train_from_iterator(training_data, trainer=trainer)
# The post-processor must be set after training: it needs the ids that
# training assigned to [SEP] and [CLS]. BertProcessing takes the [SEP] pair
# first and the [CLS] pair second.
tokenizer.post_processor = BertProcessing(
    ("[SEP]", tokenizer.token_to_id("[SEP]")),
    ("[CLS]", tokenizer.token_to_id("[CLS]"))
)
tokenizer.decoder = decoders.WordPiece()

# Persist the trained tokenizer to a JSON file.
tokenizer.save("custom_tokenizer.json")

# Reload it from that file, so the rest of the notebook uses the saved version.
tokenizer = Tokenizer.from_file("custom_tokenizer.json")

# Size the embedding table from the largest id actually present in the
# vocabulary, so that every token id is a valid embedding row index.
max_token_id = max(tokenizer.get_vocab().values())
vocab_size = max_token_id + 1

# Collate function for the custom tokenizer
def custom_collate_fn(batch, max_len=400):
    """Convert a list of dataset rows into fixed-length model inputs.

    Each row is a mapping with a 'text' string and an integer 'label'. The
    texts are encoded with the module-level ``tokenizer``; every id sequence
    is cut to ``max_len`` tokens and right-padded with the id of "[PAD]".

    Args:
        batch: Sequence of rows, as handed over by a ``DataLoader``.
        max_len: Length every id sequence is truncated/padded to. Defaults to
            400, the value that was previously hard-coded.

    Returns:
        A tuple ``(input_ids, labels)`` of ``torch.LongTensor``s;
        ``input_ids`` has one row of ``max_len`` ids per input row.
    """
    # The padding id is the same for every sample, so look it up once per
    # batch instead of once per sequence.
    pad_id = tokenizer.token_to_id("[PAD]")

    texts = [row['text'] for row in batch]
    labels = [row['label'] for row in batch]

    input_ids = []
    for encoding in tokenizer.encode_batch(texts):
        # NOTE: plain slicing drops whatever sits at the tail of a long
        # sequence, including any trailing special token added by the
        # tokenizer's post-processor.
        ids = encoding.ids[:max_len]
        # A non-positive repeat count yields an empty list, so full-length
        # sequences get no padding.
        input_ids.append(ids + [pad_id] * (max_len - len(ids)))

    return torch.LongTensor(input_ids), torch.LongTensor(labels)

# Batch both splits with the custom collate function (64 rows per batch).
# Only the training loader shuffles; the test loader keeps dataset order.
train_loader = DataLoader(ds['train'], batch_size=64, shuffle=True, collate_fn=custom_collate_fn)
test_loader = DataLoader(ds['test'], batch_size=64, shuffle=False, collate_fn=custom_collate_fn)

# Positional Encoding
def positional_encoding(position, d_model):
    """Build a sinusoidal positional-encoding table.

    Args:
        position: Number of positions (rows) to generate.
        d_model: Width of each encoding vector.

    Returns:
        A ``torch.FloatTensor`` of shape ``(1, position, d_model)`` whose
        even columns hold sines and odd columns hold cosines.
    """
    positions = np.arange(position)[:, np.newaxis]
    dims = np.arange(d_model)[np.newaxis, :]

    # Columns 2i and 2i + 1 share the same frequency exponent 2i / d_model.
    exponents = (2 * (dims // 2)) / np.float32(d_model)
    angles = positions / np.power(10000, exponents)

    table = np.empty_like(angles)
    table[:, 0::2] = np.sin(angles[:, 0::2])  # even columns
    table[:, 1::2] = np.cos(angles[:, 1::2])  # odd columns

    # Add a leading axis of size 1 so the table broadcasts over a batch.
    return torch.FloatTensor(table[np.newaxis, ...])

# Multi-Head Attention
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Queries, keys and values are all projected from the same input ``x``.
    ``d_model`` is split evenly across ``n_heads`` heads of width ``depth``.
    """

    def __init__(self, d_model, n_heads):
        super().__init__()
        assert d_model % n_heads == 0, "d_model must be divisible by n_heads"
        self.n_heads = n_heads
        self.depth = d_model // n_heads

        # Input projections for queries, keys and values, then the output
        # projection applied after the heads are merged back together.
        self.wq = nn.Linear(d_model, d_model)
        self.wk = nn.Linear(d_model, d_model)
        self.wv = nn.Linear(d_model, d_model)
        self.dense = nn.Linear(d_model, d_model)
        self.softmax = nn.Softmax(dim=-1)

    def _split_heads(self, projected):
        """Reshape (batch, seq, d_model) into (batch, n_heads, seq, depth)."""
        batch, length, _ = projected.size()
        per_head = projected.view(batch, length, self.n_heads, self.depth)
        return per_head.transpose(1, 2)

    def forward(self, x, mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor of shape (batch_size, seq_len, d_model).
            mask: Optional tensor broadcastable to
                (batch_size, n_heads, seq_len, seq_len); positions where it
                equals 0 are excluded from attention.

        Returns:
            Tensor of shape (batch_size, seq_len, d_model).
        """
        batch_size, seq_len, d_model = x.size()

        queries = self._split_heads(self.wq(x))
        keys = self._split_heads(self.wk(x))
        values = self._split_heads(self.wv(x))

        # Similarity of every query with every key, scaled by sqrt(depth).
        scores = torch.matmul(queries, keys.transpose(-2, -1)) / sqrt(self.depth)

        if mask is not None:
            # A large negative score becomes a ~0 weight after the softmax.
            scores = scores.masked_fill(mask == 0, -1e9)

        context = torch.matmul(self.softmax(scores), values)

        # Merge the heads back into a single d_model-wide vector per position.
        merged = context.transpose(1, 2).contiguous().view(batch_size, seq_len, d_model)
        return self.dense(merged)

# Transformer Layer
class TransformerLayer(nn.Module):
    """One encoder layer: self-attention followed by a feed-forward network.

    Each sub-layer's output goes through dropout, is added back to the
    sub-layer's input (residual connection), and is then layer-normalized.
    """

    def __init__(self, d_model, dff, n_heads, dropout_rate=0.1):
        super().__init__()
        self.mha = MultiHeadAttention(d_model, n_heads)
        # Position-wise feed-forward network: d_model -> dff -> d_model.
        self.ffn = nn.Sequential(
            nn.Linear(d_model, dff),
            nn.ReLU(),
            nn.Linear(dff, d_model),
        )
        self.layernorm1 = nn.LayerNorm(d_model)
        self.layernorm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout_rate)
        self.dropout2 = nn.Dropout(dropout_rate)

    def forward(self, x, mask=None):
        """Run the layer on ``x``; ``mask`` is forwarded to the attention."""
        # Attention sub-layer: dropout, residual connection, normalization.
        attended = self.layernorm1(x + self.dropout1(self.mha(x, mask)))

        # Feed-forward sub-layer, wrapped the same way.
        return self.layernorm2(attended + self.dropout2(self.ffn(attended)))

# Full Transformer-based Text Classifier
class TransformerTextClassifier(nn.Module):
    """Transformer encoder producing one classification logit per sequence.

    Token ids are embedded, scaled by sqrt(d_model), combined with fixed
    sinusoidal positional encodings, and passed through ``n_layers``
    ``TransformerLayer`` blocks. The hidden state at position 0 is fed to a
    linear layer that outputs a single logit.

    Args:
        vocab_size: Number of rows in the embedding table.
        d_model: Embedding / hidden width.
        n_layers: Number of stacked ``TransformerLayer`` blocks.
        dff: Inner width of each layer's feed-forward network.
        n_heads: Number of attention heads per layer.
        max_len: Number of positions in the positional-encoding table.
        dropout_rate: Dropout probability used inside every layer.
        pad_token_id: Id of the padding token. When ``None`` (the default) it
            is looked up as "[PAD]" in the module-level ``tokenizer``, which
            is what the constructor always did before this parameter existed.
    """

    def __init__(self, vocab_size, d_model, n_layers, dff, n_heads, max_len=400,
                 dropout_rate=0.1, pad_token_id=None):
        super().__init__()
        if pad_token_id is None:
            # Backward-compatible default: rely on the global tokenizer.
            pad_token_id = tokenizer.token_to_id("[PAD]")

        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=d_model,
            padding_idx=pad_token_id
        )
        # Fixed (non-trainable) table of shape (1, max_len, d_model).
        self.pos_encoding = nn.Parameter(positional_encoding(max_len, d_model), requires_grad=False)
        self.layers = nn.ModuleList([
            TransformerLayer(d_model, dff, n_heads, dropout_rate) for _ in range(n_layers)
        ])
        self.classifier = nn.Linear(d_model, 1)

    def forward(self, x):
        """Compute logits for a batch of token-id sequences.

        Args:
            x: Integer tensor of shape (batch_size, seq_len). ``seq_len`` is
                expected not to exceed ``max_len``; the positional table has
                only ``max_len`` rows to slice from.

        Returns:
            Tensor of shape (batch_size,) holding one raw logit per sequence.
        """
        # True where the token is real, False where it is padding.
        # Shape: (batch_size, 1, 1, seq_len), broadcast over heads and queries.
        mask = (x != self.embedding.padding_idx).unsqueeze(1).unsqueeze(2)

        seq_len = x.size(1)
        x = self.embedding(x)
        x = x * sqrt(self.embedding.embedding_dim)
        x = x + self.pos_encoding[:, :seq_len, :]

        for layer in self.layers:
            x = layer(x, mask)

        x = x[:, 0, :]  # Use the representation of [CLS] token
        x = self.classifier(x)
        return x.squeeze(-1)

# Model initialization: use the GPU when one is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# vocab_size comes from the trained tokenizer, so every token id has an
# embedding row. max_len and dropout_rate keep their constructor defaults.
model = TransformerTextClassifier(
    vocab_size=vocab_size,
    d_model=128,
    n_layers=5,
    dff=512,
    n_heads=4
).to(device)

# Optimizer and loss function. The model returns one raw logit per example,
# hence BCEWithLogitsLoss rather than a loss that expects probabilities.
optimizer = Adam(model.parameters(), lr=0.0001)
loss_fn = nn.BCEWithLogitsLoss()

# Accuracy Calculation
def accuracy(model, dataloader):
    """Return the fraction of examples in ``dataloader`` classified correctly.

    The model is switched to evaluation mode and left in it; callers that
    continue training must call ``model.train()`` again. Batches are moved to
    the module-level ``device`` before the forward pass.

    Args:
        model: Module returning one logit per example.
        dataloader: Iterable of ``(inputs, labels)`` batches with 0/1 labels.

    Returns:
        Accuracy as a float in [0, 1]; 0.0 when the dataloader yields no
        examples (previously this case raised ``ZeroDivisionError``).
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            # Logit -> probability -> hard 0/1 prediction.
            preds = torch.round(torch.sigmoid(outputs))
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    if total == 0:
        # Nothing was evaluated; avoid dividing by zero.
        return 0.0
    return correct / total

# Training loop: one full pass over train_loader per epoch, followed by an
# accuracy evaluation on both the training and the test loader.
n_epochs = 10  # Adjust epochs as needed
for epoch in range(1, n_epochs + 1):
    # accuracy() leaves the model in eval mode, so training mode has to be
    # re-enabled at the start of every epoch.
    model.train()
    total_loss = 0
    for inputs, labels in train_loader:
        # BCEWithLogitsLoss needs float targets; the collate function yields
        # integer labels.
        inputs, labels = inputs.to(device), labels.to(device).float()
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    train_acc = accuracy(model, train_loader)
    test_acc = accuracy(model, test_loader)
    # The reported "Loss" is the sum of the per-batch losses over the epoch,
    # not their average.
    print(f"Epoch {epoch}/{n_epochs}, Loss: {total_loss:.4f}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}")


Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K 

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Epoch 1/10, Loss: 227.1973, Train Acc: 0.7516, Test Acc: 0.7320
Epoch 2/10, Loss: 164.7910, Train Acc: 0.8514, Test Acc: 0.8130
Epoch 3/10, Loss: 137.0807, Train Acc: 0.8747, Test Acc: 0.8218
Epoch 4/10, Loss: 117.6618, Train Acc: 0.9140, Test Acc: 0.8329
Epoch 5/10, Loss: 99.4767, Train Acc: 0.9405, Test Acc: 0.8300
Epoch 6/10, Loss: 83.1747, Train Acc: 0.9564, Test Acc: 0.8293
Epoch 7/10, Loss: 65.0518, Train Acc: 0.9731, Test Acc: 0.8266
Epoch 8/10, Loss: 54.5704, Train Acc: 0.9796, Test Acc: 0.8232
Epoch 9/10, Loss: 40.0646, Train Acc: 0.9850, Test Acc: 0.8167
Epoch 10/10, Loss: 35.3603, Train Acc: 0.9838, Test Acc: 0.8155
