In [7]:
import torch

from tokenizers import Tokenizer, models, pre_tokenizers, decoders
from tokenizers.trainers import BpeTrainer

import pandas as pd


## Initialise tokenizer

In [8]:
# Start from a fresh byte-pair-encoding model and wrap it in a Tokenizer.
bpe_model = models.BPE()
tokenizer = Tokenizer(bpe_model)

# Byte-level pre-tokenization (with a prefix space added to the input) and the
# matching byte-level decoder for turning tokens back into text.
byte_level_pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
byte_level_decoder = decoders.ByteLevel()

tokenizer.pre_tokenizer = byte_level_pre_tokenizer
tokenizer.decoder = byte_level_decoder


## Train tokenizer

In [15]:
# Train the tokenizer on the IMDB corpus and save it to disk.

# NOTE: the training code below is commented out so that re-running the notebook does not re-train the tokenizer.
# trainer = BpeTrainer(special_tokens=["<PAD>", "<UNK>", "<BOS>", "<EOS>"])
# tokenizer.train(files=["data/imdb_text.txt"], trainer=trainer)

# # Save the tokenizer
# tokenizer.save("tut/sentiment_analysis/models/tokenizer.json")


## How to load in the tokenizer elsewhere:

In [14]:
# Restore the trained tokenizer from its JSON file. The import is repeated
# here, presumably so this cell can be copied into another script as-is.
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("tut/sentiment_analysis/models/tokenizer.json")

# Encode a sample sentence and keep both views of the result:
# the integer ids and the token strings they stand for.
sentence = "this movie fricken sucks!"
output = tokenizer.encode(sentence)
token_ids, tokens = output.ids, output.tokens

for line in (f"Token IDs: {token_ids}", f"Tokens: {tokens}"):
    print(line)


Token IDs: [242, 275, 322, 12348, 4528, 4]
Tokens: ['Ġthis', 'Ġmovie', 'Ġfr', 'icken', 'Ġsucks', '!']


In [4]:
# Round-trip check: decode the sample's ids back into text. The leading space
# in the result presumably comes from add_prefix_space=True (TODO confirm).
tokenizer.decode(token_ids)


' this movie fricken sucks!'

## Prepare training data

In [16]:
# Load the raw IMDB data (review text in "review", label in "sentiment").
all_data = pd.read_csv("tut/sentiment_analysis/data/imdb_data.csv")

# Wrap every review in <BOS>/<EOS> markers and replace the text with its list
# of token ids (done in a single pass over the column).
all_data["review"] = all_data["review"].map(
    lambda text: tokenizer.encode("<BOS>" + text + "<EOS>").ids
)

# Id used for padding. token_to_id returns None for a token that is not in the
# vocabulary, so fail here with a clear message instead of later inside torch.
pad_token = tokenizer.token_to_id("<PAD>")
if pad_token is None:
    raise ValueError(
        "tokenizer has no '<PAD>' token; was it trained with the special tokens?"
    )

# Token count of every review, and the longest one as a plain Python int;
# the latter is used as the width of the padded tensor.
lengths = all_data["review"].map(len)
max_len = int(lengths.max())


## Pack reviews into a padded rectangular tensor

In [17]:
# Pack the variable-length reviews into one right-padded 2-D tensor.
def pack_reviews(reviews: pd.Series, pad_token: int, max_len: int) -> torch.Tensor:
    """Pack variable-length token-id sequences into a right-padded tensor.

    Args:
        reviews: Series whose elements are sequences of integer token ids.
        pad_token: Id written into every position past the end of a review.
        max_len: Number of columns of the result. Reviews longer than this
            are truncated to their first ``max_len`` ids.

    Returns:
        A ``torch.long`` tensor of shape ``(len(reviews), max_len)``; row ``i``
        holds review ``i`` followed by ``pad_token``.
    """
    # int() so NumPy integers (e.g. the result of Series.max()) are accepted.
    max_len = int(max_len)
    # Start from an all-padding matrix and overwrite the leading part of each row.
    packed = torch.full((len(reviews), max_len), pad_token, dtype=torch.long)
    for row, review in enumerate(reviews):
        # Truncate so an over-long review cannot cause a shape-mismatch error;
        # the explicit dtype keeps an empty review from defaulting to float.
        ids = torch.tensor(review[:max_len], dtype=torch.long)
        packed[row, : ids.numel()] = ids
    return packed


# Build the padded token-id matrix for the whole dataset (one row per review).
packed_reviews = pack_reviews(
    reviews=all_data["review"],
    pad_token=pad_token,
    max_len=max_len,
)


## Calculate labels

In [18]:
# Binary sentiment targets: 1 where the label is "positive", 0 otherwise
# (the column is expected to hold only "positive" and "negative").
# Convert to a NumPy array first so torch.tensor receives a plain array rather
# than having to iterate a pandas Series element by element.
is_positive = (all_data["sentiment"] == "positive").to_numpy()
labels = torch.tensor(is_positive, dtype=torch.long)

In [19]:
# Per-review token counts as an int32 tensor. Computed directly from the
# tokenized reviews instead of re-binding the existing `lengths` variable, so
# this cell produces the same result when it is run more than once.
lengths = torch.tensor(all_data["review"].map(len).to_numpy(), dtype=torch.int32)

In [20]:
# Sanity check before saving: reviews, labels and lengths must be row-aligned.
assert packed_reviews.shape[0] == labels.shape[0] == lengths.shape[0]
packed_reviews.shape, labels.shape

(torch.Size([50000, 3113]), torch.Size([50000]))

In [21]:
# Bundle the three tensors under fixed keys and write them to a single file.
dataset_path = "tut/sentiment_analysis/data/imdb_data.pt"
dataset = {"reviews": packed_reviews, "labels": labels, "lengths": lengths}
torch.save(dataset, dataset_path)
