In [1]:
import torch
torch.__version__


'2.4.0+rocm6.3.4.git7cecbf6d'

In [2]:
from datasets import load_dataset
from torch.utils.data import random_split
import torch

# Step 1: Load and prepare the datasets
raw_dataset = load_dataset("imdb")  # returns a DatasetDict with 'train' and 'test' splits

train_full = raw_dataset["train"]   # 25,000 labeled examples
test_dataset = raw_dataset["test"]  # 25,000 labeled examples

# Convert to list of dicts if needed (for compatibility with PyTorch Dataset)
train_full = list(train_full)
test_dataset = list(test_dataset)

# Step 2: Split train into train/validation
torch.manual_seed(1)
train_dataset, valid_dataset = random_split(train_full, [20000, 5000])


In [4]:
import re
from collections import Counter, OrderedDict

token_counts = Counter()

def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub(r'[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-', '')
    tokenized = text.split()
    return tokenized

for sample in train_dataset:
    text = sample['text']
    tokens = tokenizer(text)
    token_counts.update(tokens)

print('Vocab-size:', len(token_counts))

Vocab-size: 69006


In [5]:
class Vocab:
    def __init__(self, stoi, itos):
        self.stoi = stoi
        self.itos = itos
        self.pad_index = stoi["<pad>"]
        self.unk_index = stoi["<unk>"]

    def __getitem__(self, token):
        return self.stoi.get(token, self.unk_index)

    def __len__(self):
        return len(self.itos)


In [6]:
from collections import OrderedDict
from torch.nn import Module
from torch import tensor

# Step 3: build vocabulary
sorted_by_freq_tuples = sorted(token_counts.items(), key=lambda x: x[1], reverse=True)
ordered_dict = OrderedDict(sorted_by_freq_tuples)

# Manually build vocab mapping
itos = ["<pad>", "<unk>"] + list(ordered_dict.keys())
stoi = {token: idx for idx, token in enumerate(itos)}

# Define lookup function
def lookup(token):
    return stoi.get(token, stoi["<unk>"])

# Example usage
print([lookup(token) for token in ['this', 'is', 'an', 'example']])

vocab = Vocab(stoi, itos)

print([vocab[token] for token in ['this', 'is', 'an', 'example']])

[11, 7, 35, 457]
[11, 7, 35, 457]


In [9]:
## Step 3-A: define the functions for transformation
import torch.nn as nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]

# No need for torchtext version check
# Label is already 0 (neg) or 1 (pos) in HuggingFace IMDB
label_pipeline = lambda x: float(x)

## Step 3-B: wrap the encode and transformation function
def collate_batch(batch):
    label_list, text_list, lengths = [], [], []
    for _label, _text in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), 
                                      dtype=torch.int64)
        text_list.append(processed_text)
        lengths.append(processed_text.size(0))
    label_list = torch.tensor(label_list)
    lengths = torch.tensor(lengths)
    padded_text_list = nn.utils.rnn.pad_sequence(
        text_list, batch_first=True)
    return padded_text_list.to(device), label_list.to(device), lengths.to(device)

In [13]:
## Step 3-B: wrap the encode and transformation function
from torch.nn.utils.rnn import pad_sequence

def collate_batch(batch):
    text_list = []
    label_list = []
    length_list = []

    for sample in batch:
        label = torch.tensor(label_pipeline(sample['label']), dtype=torch.float32)
        processed_text = torch.tensor(text_pipeline(sample['text']), dtype=torch.int64)
        text_list.append(processed_text)
        label_list.append(label)
        length_list.append(len(processed_text))

    # Pad all sequences to max length in batch
    text_batch = pad_sequence(text_list, batch_first=True, padding_value=vocab["<pad>"])
    label_batch = torch.stack(label_list)
    length_batch = torch.tensor(length_list, dtype=torch.int64)

    return text_batch.to(device), label_batch.to(device), length_batch.to(device)

In [14]:
## Take a small batch

from torch.utils.data import DataLoader
dataloader = DataLoader(train_dataset, batch_size=4, shuffle=False, collate_fn=collate_batch)
text_batch, label_batch, length_batch = next(iter(dataloader))
print(text_batch)
print(label_batch)
print(length_batch)
print(text_batch.shape)

tensor([[   35,  1739,     7,   449,   721,     6,   301,     4,   787,     9,
             4,    18,    44,     2,  1705,  2460,   186,    25,     7,    24,
           100,  1874,  1739,    25,     7, 34414,  3568,  1103,  7517,   787,
             5,     2,  4991, 12401,    36,     7,   148,   111,   939,     6,
         11598,     2,   172,   135,    62,    25,  3199,  1602,     3,   928,
          1500,     9,     6,  4601,     2,   155,    36,    14,   274,     4,
         42944,     9,  4991,     3,    14, 10296,    34,  3568,     8,    51,
           148,    30,     2,    58,    16,    11,  1893,   125,     6,   420,
          1214,    27, 14542,   940,    11,     7,    29,   951,    18,    17,
         15994,   459,    34,  2480, 15211,  3713,     2,   840,  3200,     9,
          3568,    13,   107,     9,   175,    94,    25,    51, 10297,  1796,
            27,   712,    16,     2,   220,    17,     4,    54,   722,   238,
           395,     2,   787,    32,    27,  5236,  

In [15]:
## Step 4: batching the datasets

batch_size = 32  

train_dl = DataLoader(train_dataset, batch_size=batch_size,
                      shuffle=True, collate_fn=collate_batch)
valid_dl = DataLoader(valid_dataset, batch_size=batch_size,
                      shuffle=False, collate_fn=collate_batch)
test_dl = DataLoader(test_dataset, batch_size=batch_size,
                     shuffle=False, collate_fn=collate_batch)

In [16]:

embedding = nn.Embedding(num_embeddings=10, 
                         embedding_dim=3, 
                         padding_idx=0)
 
# a batch of 2 samples of 4 indices each
text_encoded_input = torch.LongTensor([[1,2,4,5],[4,3,2,0]])
print(embedding(text_encoded_input))

tensor([[[-0.4651, -0.3203,  2.2408],
         [ 0.3824, -0.3446, -0.3531],
         [-0.0251, -0.5973, -0.2959],
         [ 0.8356,  0.4025, -0.6924]],

        [[-0.0251, -0.5973, -0.2959],
         [ 0.9124, -0.4643,  0.3046],
         [ 0.3824, -0.3446, -0.3531],
         [ 0.0000,  0.0000,  0.0000]]], grad_fn=<EmbeddingBackward0>)


Building the RNN (GRU) for the sentiment analysis task

In [21]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, 
                                      embed_dim, 
                                      padding_idx=0) 
        self.rnn = nn.GRU(embed_dim, rnn_hidden_size, 
                           batch_first=True)
        self.fc1 = nn.Linear(rnn_hidden_size, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, lengths):
        out = self.embedding(text)
        out = nn.utils.rnn.pack_padded_sequence(out, lengths.cpu().numpy(), enforce_sorted=False, batch_first=True)
        out, hidden = self.rnn(out)
        out = hidden[-1, :, :]
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out
         
vocab_size = len(vocab)
embed_dim = 20
rnn_hidden_size = 64
fc_hidden_size = 64

torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size) 
model = model.to(device)

In [22]:
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [23]:
def train(dataloader):
    model.train()
    total_acc, total_loss = 0, 0
    for text_batch, label_batch, lengths in dataloader:
        optimizer.zero_grad()
        pred = model(text_batch, lengths)[:, 0]
        loss = loss_fn(pred, label_batch)
        loss.backward()
        optimizer.step()
        total_acc += ((pred>=0.5).float() == label_batch).float().sum().item()
        total_loss += loss.item()*label_batch.size(0)
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)
 
def evaluate(dataloader):
    model.eval()
    total_acc, total_loss = 0, 0
    with torch.no_grad():
        for text_batch, label_batch, lengths in dataloader:
            pred = model(text_batch, lengths)[:, 0]
            loss = loss_fn(pred, label_batch)
            total_acc += ((pred>=0.5).float() == label_batch).float().sum().item()
            total_loss += loss.item()*label_batch.size(0)
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)

In [24]:
num_epochs = 10 

torch.manual_seed(1)
 
for epoch in range(num_epochs):
    acc_train, loss_train = train(train_dl)
    acc_valid, loss_valid = evaluate(valid_dl)
    print(f'Epoch {epoch} accuracy: {acc_train:.4f} val_accuracy: {acc_valid:.4f}')

Epoch 0 accuracy: 0.6004 val_accuracy: 0.6836
Epoch 1 accuracy: 0.7604 val_accuracy: 0.7934
Epoch 2 accuracy: 0.8470 val_accuracy: 0.8418
Epoch 3 accuracy: 0.8934 val_accuracy: 0.8570
Epoch 4 accuracy: 0.9223 val_accuracy: 0.8632
Epoch 5 accuracy: 0.9457 val_accuracy: 0.8668
Epoch 6 accuracy: 0.9609 val_accuracy: 0.8602
Epoch 7 accuracy: 0.9734 val_accuracy: 0.8562
Epoch 8 accuracy: 0.9835 val_accuracy: 0.8546
Epoch 9 accuracy: 0.9901 val_accuracy: 0.8658


In [26]:
print(f'Device name [0]:', 
      torch.cuda.get_device_name(0))

Device name [0]: AMD Radeon RX 7900 XTX
