In [1]:
from datasets import load_dataset
dataset = load_dataset("imdb")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [5]:
import random

random.seed(42)
def create_datatset(dataset):
    all_texts = dataset['train']['text'] + dataset['test']['text']
    all_labels = dataset['train']['label'] + dataset['test']['label']
    combined = list(zip(all_texts, all_labels))
    sampled = random.sample(combined, k=30000)
    sampled_texts, sampled_labels = zip(*sampled)
    return sampled_texts, sampled_labels

text, labels = create_datatset(dataset)

In [10]:
from torch.utils.data import Dataset, DataLoader

class IMBDdataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, index):
        return self.texts[index], self.labels[index]
        

In [11]:
import re
from nltk.tokenize import word_tokenize
import nltk
nltk.download("punkt")

[nltk_data] Downloading package punkt to C:\Users\Nkris/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:
def preprocess_text(text):
    text = text.lower()
    ## remove all special character and punctuation from the text
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    ## tokenise the text
    return word_tokenize(text)

In [14]:
tokenised_texts = [preprocess_text(t) for t in text]
tokenised_texts

[['i',
  'just',
  'returned',
  'from',
  'viewing',
  'this',
  'academy',
  'awardnominated',
  'doc',
  'and',
  'i',
  'was',
  'thoroughly',
  'touched',
  'and',
  'interested',
  'in',
  'exploring',
  'the',
  'works',
  'of',
  'this',
  'fellow',
  'id',
  'never',
  'heard',
  'of',
  'before',
  'of',
  'course',
  'im',
  'someone',
  'whos',
  'captivated',
  'with',
  'beautiful',
  'architecture',
  'so',
  'i',
  'realize',
  'others',
  'wont',
  'carebr',
  'br',
  'we',
  'can',
  'only',
  'imagine',
  'if',
  'there',
  'had',
  'been',
  'a',
  'couple',
  'more',
  'visionaries',
  'in',
  'philadelphia',
  'back',
  'in',
  'the',
  'late',
  '60s',
  'when',
  'kahns',
  'plans',
  'were',
  'a',
  'possibility',
  'what',
  'a',
  'wonderful',
  'city',
  'center',
  'there',
  'would',
  'be',
  'if',
  'you',
  'wonder',
  'whether',
  'youll',
  'see',
  'more',
  'about',
  'the',
  'bangladesh',
  'building',
  'at',
  'the',
  'beginning',
  'of',
  't

In [25]:
from collections import defaultdict, Counter

In [None]:
# counter = defaultdict(int)
# for text in tokenised_texts:
#     for token in text:
#         counter[token] += 1

In [32]:
counter = Counter([tok for sent in tokenised_texts for tok in sent])

In [35]:
counter

Counter({'the': 398850,
         'a': 192633,
         'and': 192632,
         'of': 173338,
         'to': 160347,
         'is': 125709,
         'in': 110996,
         'it': 91902,
         'i': 91589,
         'this': 89516,
         'that': 81790,
         'br': 68586,
         'was': 57303,
         'as': 54455,
         'for': 52100,
         'with': 51924,
         'movie': 50012,
         'but': 49160,
         'film': 44530,
         'on': 39918,
         'not': 37043,
         'you': 35914,
         'are': 35074,
         'his': 34534,
         'have': 32998,
         'be': 31571,
         'he': 31332,
         'one': 30592,
         'its': 29517,
         'at': 27847,
         'all': 27118,
         'by': 26388,
         'an': 25725,
         'they': 25116,
         'from': 24085,
         'who': 24041,
         'so': 23726,
         'like': 23592,
         'or': 21228,
         'just': 21159,
         'her': 20533,
         'if': 20239,
         'about': 20181,
         'h

In [36]:
vocab = ["<PAD>", "<UNK>"] + [w for w, c in counter.items() if c >= 2]
vocab

['<PAD>',
 '<UNK>',
 'i',
 'just',
 'returned',
 'from',
 'viewing',
 'this',
 'academy',
 'doc',
 'and',
 'was',
 'thoroughly',
 'touched',
 'interested',
 'in',
 'exploring',
 'the',
 'works',
 'of',
 'fellow',
 'id',
 'never',
 'heard',
 'before',
 'course',
 'im',
 'someone',
 'whos',
 'captivated',
 'with',
 'beautiful',
 'architecture',
 'so',
 'realize',
 'others',
 'wont',
 'carebr',
 'br',
 'we',
 'can',
 'only',
 'imagine',
 'if',
 'there',
 'had',
 'been',
 'a',
 'couple',
 'more',
 'visionaries',
 'philadelphia',
 'back',
 'late',
 '60s',
 'when',
 'kahns',
 'plans',
 'were',
 'possibility',
 'what',
 'wonderful',
 'city',
 'center',
 'would',
 'be',
 'you',
 'wonder',
 'whether',
 'youll',
 'see',
 'about',
 'bangladesh',
 'building',
 'at',
 'beginning',
 'movie',
 'patient',
 'for',
 'it',
 'will',
 'provide',
 'climax',
 'film',
 'end',
 'his',
 'sons',
 'personal',
 'discoveries',
 'process',
 'making',
 'are',
 'quite',
 'interesting',
 'sometimes',
 'touching',
 'eve

In [37]:
word2idx = {w: i for i, w in enumerate(vocab)}
idx2word = {i: w for w, i in word2idx.items()}

In [46]:
### glove embeddings
import numpy as np
glove_path = "./glove.6B.100d.txt"

def process_glove(path):
    glove = {}
    with open(path, "r", encoding="utf-8") as f :
        for line in f:
            # print(line)
            word_embedding = line.strip().split()
            # print(word_embedding)
            word = word_embedding[0]
            embedding = np.array(word_embedding[1:], dtype=np.float32)
            # print(word)
            # print(embedding)
            glove[word] = embedding
            
    
    return glove

glove = process_glove(path=glove_path)

In [47]:
glove

{'the': array([-0.038194, -0.24487 ,  0.72812 , -0.39961 ,  0.083172,  0.043953,
        -0.39141 ,  0.3344  , -0.57545 ,  0.087459,  0.28787 , -0.06731 ,
         0.30906 , -0.26384 , -0.13231 , -0.20757 ,  0.33395 , -0.33848 ,
        -0.31743 , -0.48336 ,  0.1464  , -0.37304 ,  0.34577 ,  0.052041,
         0.44946 , -0.46971 ,  0.02628 , -0.54155 , -0.15518 , -0.14107 ,
        -0.039722,  0.28277 ,  0.14393 ,  0.23464 , -0.31021 ,  0.086173,
         0.20397 ,  0.52624 ,  0.17164 , -0.082378, -0.71787 , -0.41531 ,
         0.20335 , -0.12763 ,  0.41367 ,  0.55187 ,  0.57908 , -0.33477 ,
        -0.36559 , -0.54857 , -0.062892,  0.26584 ,  0.30205 ,  0.99775 ,
        -0.80481 , -3.0243  ,  0.01254 , -0.36942 ,  2.2167  ,  0.72201 ,
        -0.24978 ,  0.92136 ,  0.034514,  0.46745 ,  1.1079  , -0.19358 ,
        -0.074575,  0.23353 , -0.052062, -0.22044 ,  0.057162, -0.15806 ,
        -0.30798 , -0.41625 ,  0.37972 ,  0.15006 , -0.53212 , -0.2055  ,
        -1.2526  ,  0.071624,  

In [53]:
glove["the"].shape

(100,)

In [49]:
len(vocab)

63518

In [48]:
embedding_dim = 100
embedding_matrix = np.random.normal(0, 1, (len(vocab), embedding_dim))
embedding_matrix

array([[-0.75215897,  1.61261614,  0.78739896, ...,  0.20726466,
        -0.22466549,  0.63442069],
       [-0.67082148, -1.1048605 , -2.25099976, ..., -0.89555749,
         0.89793664, -0.98103915],
       [-0.64846656, -1.11928296,  0.66483168, ...,  0.52446186,
        -2.11512301,  1.74715298],
       ...,
       [ 0.26297405,  1.41409903, -0.0403448 , ..., -1.11299695,
        -0.4140877 ,  0.75009344],
       [-0.02509004, -0.53370055, -0.94729238, ...,  0.46937312,
        -0.13902506, -0.3385523 ],
       [ 0.74695314, -0.50925771, -1.6681548 , ...,  0.92972998,
        -2.25671149,  0.81824374]])

In [50]:
embedding_matrix.shape

(63518, 100)

In [51]:
word2idx

{'<PAD>': 0,
 '<UNK>': 1,
 'i': 2,
 'just': 3,
 'returned': 4,
 'from': 5,
 'viewing': 6,
 'this': 7,
 'academy': 8,
 'doc': 9,
 'and': 10,
 'was': 11,
 'thoroughly': 12,
 'touched': 13,
 'interested': 14,
 'in': 15,
 'exploring': 16,
 'the': 17,
 'works': 18,
 'of': 19,
 'fellow': 20,
 'id': 21,
 'never': 22,
 'heard': 23,
 'before': 24,
 'course': 25,
 'im': 26,
 'someone': 27,
 'whos': 28,
 'captivated': 29,
 'with': 30,
 'beautiful': 31,
 'architecture': 32,
 'so': 33,
 'realize': 34,
 'others': 35,
 'wont': 36,
 'carebr': 37,
 'br': 38,
 'we': 39,
 'can': 40,
 'only': 41,
 'imagine': 42,
 'if': 43,
 'there': 44,
 'had': 45,
 'been': 46,
 'a': 47,
 'couple': 48,
 'more': 49,
 'visionaries': 50,
 'philadelphia': 51,
 'back': 52,
 'late': 53,
 '60s': 54,
 'when': 55,
 'kahns': 56,
 'plans': 57,
 'were': 58,
 'possibility': 59,
 'what': 60,
 'wonderful': 61,
 'city': 62,
 'center': 63,
 'would': 64,
 'be': 65,
 'you': 66,
 'wonder': 67,
 'whether': 68,
 'youll': 69,
 'see': 70,
 'abou

In [63]:
for word, idx in word2idx.items():
    if word in glove:
        embedding_matrix[idx] = glove[word]

In [64]:
embedding_matrix

array([[-0.75215897,  1.61261614,  0.78739896, ...,  0.20726466,
        -0.22466549,  0.63442069],
       [-0.67082148, -1.1048605 , -2.25099976, ..., -0.89555749,
         0.89793664, -0.98103915],
       [-0.046539  ,  0.61966002,  0.56647003, ..., -0.37616   ,
        -0.032502  ,  0.80620003],
       ...,
       [ 0.26297405,  1.41409903, -0.0403448 , ..., -1.11299695,
        -0.4140877 ,  0.75009344],
       [-0.02509004, -0.53370055, -0.94729238, ...,  0.46937312,
        -0.13902506, -0.3385523 ],
       [ 0.74695314, -0.50925771, -1.6681548 , ...,  0.92972998,
        -2.25671149,  0.81824374]])

In [65]:
embedding_matrix[17]

array([-0.038194  , -0.24487001,  0.72812003, -0.39961001,  0.083172  ,
        0.043953  , -0.39140999,  0.3344    , -0.57545   ,  0.087459  ,
        0.28786999, -0.06731   ,  0.30906001, -0.26383999, -0.13231   ,
       -0.20757   ,  0.33395001, -0.33848   , -0.31742999, -0.48335999,
        0.1464    , -0.37303999,  0.34577   ,  0.052041  ,  0.44946   ,
       -0.46970999,  0.02628   , -0.54154998, -0.15518001, -0.14106999,
       -0.039722  ,  0.28277001,  0.14393   ,  0.23464   , -0.31020999,
        0.086173  ,  0.20397   ,  0.52623999,  0.17163999, -0.082378  ,
       -0.71787   , -0.41531   ,  0.20334999, -0.12763   ,  0.41367   ,
        0.55186999,  0.57907999, -0.33476999, -0.36559001, -0.54856998,
       -0.062892  ,  0.26583999,  0.30204999,  0.99774998, -0.80480999,
       -3.0243001 ,  0.01254   , -0.36941999,  2.21670008,  0.72201002,
       -0.24978   ,  0.92136002,  0.034514  ,  0.46744999,  1.10790002,
       -0.19358   , -0.074575  ,  0.23353   , -0.052062  , -0.22

In [66]:
assert (embedding_matrix[17] == glove["the"]).all()

In [67]:
### Padding

import torch
from torch.nn.utils.rnn import pad_sequence

def tokens_to_indices(tokens, word2idx):
    return torch.tensor([word2idx.get(t, word2idx["<UNK>"]) for t in tokens])

indexed_seqs = [tokens_to_indices(seq, word2idx) for seq in tokenised_texts]
padded_seqs = pad_sequence(indexed_seqs, batch_first=True, padding_value=word2idx["<PAD>"])
labels_tensor = torch.tensor(labels)

In [71]:
padded_seqs

tensor([[   2,    3,    4,  ...,    0,    0,    0],
        [ 139,  140,  141,  ...,    0,    0,    0],
        [   7,   83,   40,  ...,    0,    0,    0],
        ...,
        [ 966,   17,  193,  ...,    0,    0,    0],
        [   2,  225,  290,  ...,    0,    0,    0],
        [1704,  243,  134,  ...,    0,    0,    0]])

In [68]:
labels_tensor

tensor([1, 0, 0,  ..., 0, 1, 1])

In [None]:
### creating dataset
from torch.utils.data import TensorDataset

dataset = TensorDataset(padded_seqs, labels_tensor)

<torch.utils.data.dataset.TensorDataset at 0x2359271deb0>

In [74]:
for seq, label in dataset:
    print(seq, label)
    break

tensor([2, 3, 4,  ..., 0, 0, 0]) tensor(1)


In [76]:
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

print(f"Training size = {train_size} and Testing size = {test_size}")

Training size = 24000 and Testing size = 6000


In [77]:
### use sampling to randomly assign text and labels to train and test
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

In [78]:
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64)

In [79]:
### since we created an embedding matrix, we have to create a custom embedding layer class
import torch 
import torch.nn as nn 
import torch.nn.functional as F

class EmbeddingLayer(nn.Module):
    def __init__(self, embedding_matrix):
        super(EmbeddingLayer, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(torch.tensor(embedding_matrix, dtype=torch.float))
    
    def forward(self, x):
        return self.embedding(x)

In [81]:
embedding_matrix.shape[1]

100

In [80]:
### base models

In [94]:
class VanillaRNN(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim, num_classes=2):
        super(VanillaRNN, self).__init__()
        self.embedding = EmbeddingLayer(embedding_matrix)
        ## embedding_matrix.shape = 100 (because of glove embeddings)
        self.rnn = nn.RNN(embedding_matrix.shape[1], hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        # print(x.shape) # torch.Size([64, 2273, 100])
        outputs, h_n = self.rnn(x) ## h_n contains the last hidden state
        # print(h_n.shape) # torch.Size([1, 64, 128])
        # print(outputs.shape) # torch.Size([64, 2273, 128])
        h_n = h_n.squeeze(0)
        return self.fc(h_n)
        


In [95]:
import torch.optim as optim 

def train_model(model, train_loader, epochs=5, lr=0.001):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss() ## applies softmax internally

    model.train()
    for epch in range(epochs):
        total_loss = 0
        for texts, labels in train_loader:
            texts, labels = texts.to(device) , labels.to(device)
            optimizer.zero_grad()
            outputs = model(texts)
            # print(f"Outputs shape {outputs.shape}") Outputs shape torch.Size([64, 2])
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        print(f"Epoch {epch+1} Loss: {total_loss / len(train_loader):.4f}")




In [96]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

def evaluate_model(model, test_loader):
    model.eval()
    all_preds = []
    all_targets = []
    device = next(model.parameters()).device

    with torch.no_grad():
        for texts, labels in test_loader:
            texts, labels = texts.to(device), label.to(device)
            outputs = model(texts)
            y_pred = torch.argmax(outputs, dim=1)

            ### keep track to later analyse the model
            all_preds.extend(y_pred.cpu().numpy())
            all_targets.extend(labels.numpy())

    acc = accuracy_score(all_targets, all_preds)
    p, r, f1, _ = precision_recall_fscore_support(all_targets, all_preds, average="macro")
    print(f"Accuracy: {acc:.4f}, Precision: {p:.4f}, Recall: {r:.4f}, F1: {f1:.4f}")
    print("Confusion Matrix:\n", confusion_matrix(all_targets, all_preds))

In [97]:
model = VanillaRNN(embedding_matrix, hidden_dim=128)
train_model(model, train_loader)
evaluate_model(model, test_loader)

Epoch 1 Loss: 0.6969


KeyboardInterrupt: 