### Project 1 - Predicting Sentiment of IMDb movie reviews

In [12]:
from datasets import load_dataset

# Load the IMDb dataset (slow on the first call; it is cached locally afterwards).
imdb = load_dataset("imdb")


In [13]:
# Pull the training and test splits out of the loaded dataset dictionary.
train_dataset, test_dataset = imdb['train'], imdb['test']

In [14]:
from torch.utils.data import random_split
import torch

# Seed the global RNG so the train/validation split is reproducible.
torch.manual_seed(1)
# Split from imdb['train'] instead of from `train_dataset`: this cell rebinds
# `train_dataset`, so reading it back would make a second run of the cell try
# to split the 20,000-item subset into 20,000 + 5,000 items and fail.
train_dataset, valid_dataset = random_split(list(imdb['train']), [20000, 5000])

In [15]:
# Peek at one training example; each item is a dict with 'text' and 'label' keys.
train_dataset[0]

{'text': 'An extra is called upon to play a general in a movie about the Russian Revolution. However, he is not any ordinary extra. He is Serguis Alexander, former commanding general of the Russia armies who is now being forced to relive the same scene, which he suffered professional and personal tragedy in, to satisfy the director who was once a revolutionist in Russia and was humiliated by Alexander. It can now be the time for this broken man to finally "win" his penultimate battle. This is one powerful movie with meticulous direction by Von Sternberg, providing the greatest irony in Alexander\'s character in every way he can. Jannings deserved his Oscar for the role with a very moving performance playing the general at his peak and at his deepest valley. Powell lends a sinister support as the revenge minded director and Brent is perfect in her role with her face and movements showing so much expression as Jannings\' love. All around brilliance. Rating, 10.',
 'label': 1}

In [16]:
import re
from collections import Counter

def tokenizer(text):
    """Split a raw review into lowercase word tokens followed by emoticon tokens.

    HTML-like tags are removed, emoticons such as ``:)``, ``;-(`` or ``:D`` are
    collected, every run of non-word characters is replaced by a space, and the
    collected emoticons (with the ``-`` nose removed) are appended at the end.

    Args:
        text: The raw review string.

    Returns:
        A list of string tokens: the lowercased words in order of appearance,
        then the emoticons in order of appearance.
    """
    text = re.sub(r'<[^>]*>', '', text)
    # Search the original-case text: the pattern's `D` and `P` alternatives are
    # uppercase, so matching against a lowercased copy can never find ":D"/":P".
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub(r'[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-', '')
    return text.split()

# Count every token in the training split. Each dataset item is a dict (not a
# tuple), so the review text is read through its "text" key.
token_counts = Counter(
    token
    for example in train_dataset
    for token in tokenizer(example["text"])
)

print("Vocab-size:", len(token_counts))


Vocab-size: 69006


In [17]:
from collections import Counter

# Cap on vocabulary size: only the `max_vocab_size` most frequent tokens are kept.
max_vocab_size = 100000
most_common = token_counts.most_common(max_vocab_size)

# Reserved entries take the lowest indices: 0 for "<PAD>", 1 for "<UNK>".
specials = ["<PAD>", "<UNK>"]

# A token's position in this list is its index, so ordinary tokens are numbered
# from len(specials) upward in order of decreasing frequency.
vocab_tokens = specials + [word for word, _ in most_common]
word2index = {word: idx for idx, word in enumerate(vocab_tokens)}
index2word = dict(enumerate(vocab_tokens))


In [18]:
def text_pipeline(text):
    """Convert a raw review string into a list of vocabulary indices.

    The text is tokenized with `tokenizer`; each token is looked up in
    `word2index`, and tokens missing from it map to the "<UNK>" index.
    """
    indices = []
    for token in tokenizer(text):
        indices.append(word2index.get(token, word2index["<UNK>"]))
    return indices

def label_pipeline(label):
    """Map a dataset label to a float target: 1.0 for positive, 0.0 otherwise.

    The examples in this notebook carry integer labels (the sample shown above
    has ``'label': 1``), so comparing only against the string "pos" would turn
    every label into 0.0. Both the integer 1 and the string "pos" are accepted
    as the positive class.

    Args:
        label: The raw label, either an integer (1 = positive) or a string
            ("pos" = positive).

    Returns:
        1.0 for a positive label, 0.0 for anything else.
    """
    return 1.0 if label in (1, "pos") else 0.0


In [19]:
import torch
import torch.nn as nn

def collate_batch(batch):
    """Turn a list of dataset examples into padded tensors for one batch.

    Args:
        batch: Iterable of examples, each indexable by "text" and "label".

    Returns:
        A tuple ``(padded_texts, labels, lengths)``:
        - padded_texts: int64 tensor, batch-first, right-padded with the
          "<PAD>" index up to the longest sequence in the batch.
        - labels: float32 tensor with one target per example.
        - lengths: int64 tensor with the unpadded length of each sequence.
    """
    # Encode every example once, keeping its token ids next to its target.
    encoded = [
        (text_pipeline(example["text"]), label_pipeline(example["label"]))
        for example in batch
    ]

    sequences = [torch.tensor(ids, dtype=torch.int64) for ids, _ in encoded]
    targets = torch.tensor([target for _, target in encoded], dtype=torch.float32)
    seq_lengths = torch.tensor([len(ids) for ids, _ in encoded], dtype=torch.int64)

    padded = nn.utils.rnn.pad_sequence(
        sequences,
        batch_first=True,
        padding_value=word2index["<PAD>"],
    )
    return padded, targets, seq_lengths


In [20]:
from torch.utils.data import DataLoader

batch_size = 32

# Settings shared by every loader: same batch size and the padding collate function.
loader_kwargs = dict(batch_size=batch_size, collate_fn=collate_batch)

# Only the training data is shuffled; validation and test keep a fixed order.
train_dl = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
valid_dl = DataLoader(valid_dataset, shuffle=False, **loader_kwargs)
test_dl = DataLoader(test_dataset, shuffle=False, **loader_kwargs)


In [21]:
# Draw a single batch from the training loader to sanity-check the collate output.
first_batch = next(iter(train_dl))
text_batch, label_batch, length_batch = first_batch

# Expected: texts are (batch_size, longest sequence), labels and lengths are (batch_size,).
for caption, value in (
    ("Tekst-batch shape:", text_batch.shape),
    ("Label-batch shape:", label_batch.shape),
    ("Sekvenslengder:", length_batch),
):
    print(caption, value)


Tekst-batch shape: torch.Size([32, 975])
Label-batch shape: torch.Size([32])
Sekvenslengder: tensor([253, 330, 527, 111, 390, 188, 860, 271, 911, 190, 134, 111, 116, 102,
        129,  43, 140, 246, 631,  98, 219, 403,  44, 166, 116, 130, 975, 400,
        295,  65, 140, 178])


In [22]:
seq_idx = 0  # which sequence of the batch to inspect (here: the first)
row = text_batch[seq_idx]
true_length = length_batch[seq_idx]
# Everything before `true_length` is real token ids; the remainder is padding.
tokens, padding = row[:true_length], row[true_length:]

print("Tokens (indekser):", tokens.tolist())
print("Padding:", padding.tolist())
print("Label:", label_batch[seq_idx].item())


Tokens (indekser): [3288, 304, 1659, 1072, 8568, 461, 45, 5, 163, 10824, 2289, 6418, 1344, 606, 38, 4, 107, 936, 8872, 9, 60, 656, 38, 11, 74, 16963, 11264, 3, 17880, 416, 40, 4, 368, 5, 50, 673, 3, 4, 543, 2763, 1130, 32, 207, 5, 2, 595, 5, 19020, 34, 15804, 14400, 19, 2, 18, 153, 21, 65, 81, 4, 50, 291, 5, 5016, 2, 899, 3, 2, 2745, 10, 51, 64, 1837, 2, 856, 3, 839, 49, 5, 2, 114, 15, 6, 137, 69, 89, 21, 1316, 388, 2, 107, 13, 3, 68, 1171, 10, 122, 8, 234, 30, 4, 116, 98, 74, 6, 959, 5, 4, 606, 976, 16, 2, 3254, 5, 2, 673, 3, 2, 1092, 19, 57, 11, 173, 162, 21, 726, 45, 4, 176, 16, 71, 2, 1147, 773, 7, 44, 35, 618, 243, 563, 6, 19020, 6, 3487, 2, 421, 44, 16905, 1845, 184, 69, 169, 45, 306, 12, 2, 595, 7, 166, 2, 1141, 14082, 40, 7, 4, 506, 3416, 847, 202, 2, 595, 13, 13038, 314, 258, 34, 5343, 18523, 4, 7758, 15, 35, 5935, 213, 22, 3, 4, 1103, 1370, 12, 2734, 49, 7059, 19, 165, 65, 6379, 1342, 150, 8832, 8809, 15, 2, 293, 670, 107, 3558, 7, 628, 19, 24, 12, 1082, 347, 2, 595, 2384, 49

In [23]:
# Compare the padding index with the set of ids that actually occur in the batch.
pad_index = word2index["<PAD>"]
unique_ids = torch.unique(text_batch)
print("Pad-indeks:", pad_index)
print("Unike verdier i hele batchen:", unique_ids)


Pad-indeks: 0
Unike verdier i hele batchen: tensor([    0,     2,     3,  ..., 62602, 62603, 62604])


In [26]:
# Toy lookup table: 10 possible token ids, each mapped to a 3-dimensional vector.
# Index 0 is declared as the padding index.
embedding = nn.Embedding(10, 3, padding_idx=0)

# Two sequences of four token ids; the second one ends with the padding id 0.
text_encoded_input = torch.tensor([[1, 2, 4, 5], [4, 3, 2, 0]], dtype=torch.long)
print(embedding(text_encoded_input))

tensor([[[ 0.3430, -0.5329, -0.7423],
         [-0.3842,  0.4307, -0.5028],
         [ 0.5857, -0.2052,  2.7972],
         [ 1.0885,  0.5652,  0.2847]],

        [[ 0.5857, -0.2052,  2.7972],
         [ 0.4700,  1.9600, -0.3665],
         [-0.3842,  0.4307, -0.5028],
         [ 0.0000,  0.0000,  0.0000]]], grad_fn=<EmbeddingBackward0>)


In [29]:
class RNN(nn.Module):
    """Two-layer `nn.RNN` followed by a linear layer that emits one value per sequence.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the recurrent hidden state.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.rnn = nn.RNN(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=2,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=1)

    def forward(self, x):
        """Return the linear projection of the last layer's final hidden state."""
        # The per-step outputs are discarded; only the final hidden states are used.
        _, final_states = self.rnn(x)
        # Index -1 on the first axis selects the last of the stacked layers.
        last_layer_state = final_states[-1, :, :]
        return self.fc(last_layer_state)
    
# Smoke test: 64 input features per time step, 32 hidden units.
model = RNN(64, 32)
print(model)

# Random input of shape (5, 3, 64); with batch_first=True that is 5 sequences of 3 steps.
model(torch.randn(5, 3, 64))

RNN(
  (rnn): RNN(64, 32, num_layers=2, batch_first=True)
  (fc): Linear(in_features=32, out_features=1, bias=True)
)


tensor([[0.3162],
        [0.4356],
        [0.4306],
        [0.2232],
        [0.2070]], grad_fn=<AddmmBackward0>)

In [None]:
class RNN(nn.Module):
    def __init__(self, vocab_size)