In [6]:
x_seq = torch.tensor([[1.0]*5, [2.0]*5, [3.0]*5]).float()

# output of the simple RNN
output, hn = rnn_layer(torch.reshape(x_seq, (1, 3, 5)))

# manually computing the output
out_man = []
for t in range(3):
    xt = torch.reshape(x_seq[t], (1, 5))
    print(f'Time step {t} =>')
    print('   Input       :', xt.numpy())

    ht = torch.matmul(xt, torch.transpos(w_xh, 0, 1)) + b_xh
    print('   Hidden      :', ht.detach().numpy())

    if t > 0:
        prev_h = out_man[t-1]
    else:
        prev_h = torch.zeros((ht.shape))

    ot = ht + torch.matmul(prev_h, torch.transpose(w_hh, 0, 1)) + b_hh
    ot = torch.tanh(ot)
    out_man.append(ot)

    print('    Output (manual) :', ot.detach().numpy())
    print('    RNN output      :', output[:, t].detach().numpy)
    print()


Time step 0 =>
   Input       : [[1. 1. 1. 1. 1.]]
   Hidden      : [[-0.4701929  0.5863904]]
    Output (manual) : [[-0.3519801   0.52525216]]
    RNN output      : [[-0.3519801   0.52525216]]

Time step 1 =>
   Input       : [[2. 2. 2. 2. 2.]]
   Hidden      : [[-0.88883156  1.2364397 ]]
    Output (manual) : [[-0.68424344  0.76074266]]
    RNN output      : [[-0.68424344  0.76074266]]

Time step 2 =>
   Input       : [[3. 3. 3. 3. 3.]]
   Hidden      : [[-1.3074702  1.8864892]]
    Output (manual) : [[-0.8649416  0.9046636]]
    RNN output      : [[-0.8649416  0.9046636]]



### Project 1 - Predicting Sentiment of IMDb movie reviews

In [20]:
from datasets import load_dataset

# Laster datasettet (tar første gang litt tid – det cacher lokalt)
imdb = load_dataset("imdb")


In [21]:
train_dataset = imdb['train']
test_dataset = imdb['test']

In [25]:
import torch
from torch.utils.data.dataset import random_split
torch.manual_seed(1)
train_dataset, val_dataset = random_split(list(train_dataset), [20000, 5000])

In [27]:
train_dataset[0]

{'text': 'An extra is called upon to play a general in a movie about the Russian Revolution. However, he is not any ordinary extra. He is Serguis Alexander, former commanding general of the Russia armies who is now being forced to relive the same scene, which he suffered professional and personal tragedy in, to satisfy the director who was once a revolutionist in Russia and was humiliated by Alexander. It can now be the time for this broken man to finally "win" his penultimate battle. This is one powerful movie with meticulous direction by Von Sternberg, providing the greatest irony in Alexander\'s character in every way he can. Jannings deserved his Oscar for the role with a very moving performance playing the general at his peak and at his deepest valley. Powell lends a sinister support as the revenge minded director and Brent is perfect in her role with her face and movements showing so much expression as Jannings\' love. All around brilliance. Rating, 10.',
 'label': 1}

In [28]:
import re
from collections import Counter

def tokenizer(text):
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub(r'[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-', '')
    tokenized = text.split()
    return tokenized

token_counts = Counter()

for example in train_dataset:  # ikke unpack som tuple!
    tokens = tokenizer(example["text"])
    token_counts.update(tokens)

print("Vocab-size:", len(token_counts))


Vocab-size: 69006


In [32]:
from collections import Counter

# Behold de N vanligste ordene (valgfritt, f.eks. 10 000)
max_vocab_size = 100000
most_common = token_counts.most_common(max_vocab_size)

# Spesielle tokens
specials = ["<PAD>", "<UNK>"]
word2index = {word: idx for idx, word in enumerate(specials)}
index2word = {idx: word for idx, word in enumerate(specials)}

# Start telling videre etter spesial-tokens
for idx, (word, _) in enumerate(most_common, start=len(specials)):
    word2index[word] = idx
    index2word[idx] = word  


In [35]:
def text_pipeline(text):
    tokens = tokenizer(text)
    ids = [word2index.get(token, word2index["<UNK>"]) for token in tokens]
    return ids

def label_pipeline(label):
    return 1 if label == "pos" else 0


In [38]:
sample = train_dataset[0]
x = text_pipeline(sample["text"])
y = label_pipeline(sample["label"])

print("Input IDs:", x[:10])
print("Label:", y)


Input IDs: [35, 1739, 7, 449, 721, 6, 301, 4, 787, 9]
Label: 0


In [43]:
import torch.nn as nn

def collate_batch(batch):
    label_list, text_list, lengths = [], [], []
    for _label, _text in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        lengths.append(processed_text.size(0))

    label_list = torch.tensor(label_list)
    lengths = torch.tensor(lengths)
    padded_text_list = nn.utils.rnn.pad_sequence(
        text_list, batch_first = True
    )
    return padded_text_list, label_list, lengths

from torch.utils.data import DataLoader
dataloader = DataLoader(train_dataset, batch_size = 4, shuffle=False, collate_fn = collate_batch)

In [45]:
text_batch, label_batch, length_batch = next(iter(dataloader))
print(text_batch, label_batch, length_batch)

tensor([[6357],
        [6357],
        [6357],
        [6357]]) tensor([0, 0, 0, 0]) tensor([1, 1, 1, 1])
