## Experiments with tokenization stuff

## Preparing Tokenizer

In [1]:
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders

In [2]:
tokenizer = Tokenizer(models.WordPiece())

# Create a new WordPiece tokenizer

In [3]:
trainer = trainers.WordPieceTrainer(
    vocab_size=50000,  # Set the vocabulary size
    special_tokens=["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"]  # Special tokens
)

# Training the tokenizer on a corpus

In [4]:
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# Pre-tokenization (e.g., whitespace tokenization)

## Loading data

In [6]:
import pandas as pd

In [11]:
df = pd.read_csv('../data/preprocessed/preprocessed.csv')

In [13]:
data = df['0'].to_list()

In [14]:
len(data)

110208

In [15]:
tokenizer.train_from_iterator(data)

In [18]:
# tokenizer.save("../data/tokenizer/tokens.json")

# commented out to avoid accidental run

In [19]:
import numpy as np

In [None]:
def pad(tokens:list[int], sequenceLength:int):
    """
    Adds padding to vector, so it has the same length of `sequenceLength`
    """
    if len(tokens) > sequenceLength:
        raise ValueError(f"The given text should contain less than {sequenceLength} not {len(tokens)}")
    
    if len(tokens) < sequenceLength:
        tokens = np.pad(tokens, (sequenceLength-len(tokens), 0), 'constant')

    return tokens

In [20]:
def tokenize(text:str, tokenizer:Tokenizer, sequenceLength:int, shouldPad=True):
    """
    Tokenizes a text and return a matrix of shape, (n, sequenceLength)

    Here `n` is dependant on the length of text
    `n = len(tokenizer.encode(text).ids) % sequenceLength`
    """

    tokens = tokenizer.encode(text).ids

    tokens = [tokens[i:i+sequenceLength] for i in range(0, len(tokens), sequenceLength)]

    if shouldPad:
        return np.stack( list( map( lambda x: pad(x, sequenceLength), tokens ) ), axis=0)
    else:
        return tokens
    

In [23]:
max(map(lambda x:len(tokenizer.encode(x).ids), data))

# seeing the maximum amout of tokens 

144

Every vectorizer models will be consisting of max 150 tokens