In [2]:
from pathlib import Path
from typing import Optional

import pandas as pd
import torch
from tokenizers import Tokenizer, models, pre_tokenizers, decoders
from tokenizers.trainers import BpeTrainer


## Initialise tokenizer

In [3]:
# Initialize an (untrained) BPE tokenizer.
# `unk_token` is set to the same "<UNK>" string that is registered as a
# special token when the tokenizer is trained, so symbols missing from the
# learned vocabulary map to <UNK> instead of being silently dropped.
tokenizer = Tokenizer(models.BPE(unk_token="<UNK>"))

# Byte-level pre-tokenization; `add_prefix_space=True` prepends a space to the
# input so the first word is tokenized the same way as words later in the text.
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)

# Matching byte-level decoder so `tokenizer.decode` can turn tokens back into text.
tokenizer.decoder = decoders.ByteLevel()


## Train tokenizer

In [12]:
# Train the tokenizer on the IMDB corpus.
# The special tokens are registered with the trainer so they are part of the
# vocabulary and can be looked up later with `tokenizer.token_to_id(...)`.
trainer = BpeTrainer(special_tokens=["<PAD>", "<UNK>", "<BOS>", "<EOS>"])
tokenizer.train(files=["data/imdb_text.txt"], trainer=trainer)

# Save the tokenizer. Create the output directory first so saving does not
# fail on a fresh checkout where `models/` does not exist yet.
Path("models").mkdir(parents=True, exist_ok=True)
tokenizer.save("models/tokenizer.json")







## How to load the tokenizer elsewhere

In [4]:
# Restore the trained tokenizer from disk.
# The import is repeated here so this snippet can be copied elsewhere as-is.
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("models/tokenizer.json")

# Encode a sample sentence
sentence = "this movie fricken sucks!"
output = tokenizer.encode(sentence)

# Two views of the same encoding: integer ids and their string pieces
token_ids, tokens = output.ids, output.tokens

print(f"Token IDs: {token_ids}")
print(f"Tokens: {tokens}")


Token IDs: [242, 275, 322, 12348, 4528, 4]
Tokens: ['Ġthis', 'Ġmovie', 'Ġfr', 'icken', 'Ġsucks', '!']


In [5]:
# Round-trip check: decode the ids of the sample sentence back into text.
# The recorded result starts with a space, consistent with the pre-tokenizer
# having been configured with `add_prefix_space=True`.
tokenizer.decode(output.ids)


' this movie fricken sucks!'

## Prepare training data

In [6]:
all_data = pd.read_csv("data/imdb_data.csv")

# Wrap every review in <BOS> ... <EOS> and tokenize the whole column with one
# batched call instead of one Python-level `encode` call per row.
# After this step the "review" column holds lists of token ids, not text.
all_data["review"] = pd.Series(
    [
        encoding.ids
        for encoding in tokenizer.encode_batch(
            ("<BOS>" + all_data["review"] + "<EOS>").tolist()
        )
    ],
    index=all_data.index,
)

# Get the id used for padding. `token_to_id` returns None for a token that is
# not in the vocabulary, so fail here with a clear message rather than later
# when the id is used to fill a tensor.
pad_token = tokenizer.token_to_id("<PAD>")
assert pad_token is not None, "'<PAD>' is missing from the tokenizer vocabulary"
all_data.head()

Unnamed: 0,review,sentiment
0,"[2, 1473, 198, 174, 487, 5139, 397, 3082, 234,...",positive
1,"[2, 269, 1572, 680, 1455, 17, 299, 211, 256, 2...",positive
2,"[2, 210, 867, 242, 253, 171, 1572, 586, 200, 3...",positive
3,"[2, 7618, 448, 239, 171, 1056, 667, 171, 680, ...",negative
4,"[2, 2123, 294, 20215, 239, 297, 9789, 206, 174...",positive


In [20]:
# Length (in tokens, <BOS>/<EOS> included) of the longest tokenized review
max_len = all_data["review"].map(len).max()


In [1]:


def pack_reviews(
    reviews: pd.Series, pad_token: int, max_len: Optional[int] = None
) -> torch.Tensor:
    """Pack variable-length token-id sequences into one right-padded tensor.

    Args:
        reviews: Series whose elements are sequences of integer token ids.
        pad_token: Id written into every position not covered by a review.
        max_len: Number of columns of the result. When omitted, the length of
            the longest review is used, so the function does not depend on a
            value computed in another cell. Reviews longer than ``max_len``
            are truncated to their first ``max_len`` ids.

    Returns:
        A ``torch.long`` tensor of shape ``(len(reviews), max_len)``.
    """
    sequences = list(reviews)
    if max_len is None:
        max_len = max((len(seq) for seq in sequences), default=0)
    # Accept NumPy integers (e.g. the result of `Series.max()`) as well.
    max_len = int(max_len)

    # Start from a tensor that is all padding, then overwrite the prefixes.
    packed = torch.full((len(sequences), max_len), pad_token, dtype=torch.long)
    for row, seq in enumerate(sequences):
        kept = seq[:max_len]
        if len(kept) > 0:  # nothing to copy for empty reviews
            packed[row, : len(kept)] = torch.as_tensor(kept, dtype=torch.long)
    return packed


# Pad every tokenized review to a common length and stack them into one tensor
packed_reviews = pack_reviews(all_data["review"], pad_token, max_len)

# One-hot encode the sentiment column as a tensor:
# "positive" -> [1, 0]; every other value -> [0, 1]
labels = torch.tensor(
    [
        [1, 0] if sentiment == "positive" else [0, 1]
        for sentiment in all_data["sentiment"]
    ]
)


NameError: name 'max_len' is not defined