<a href="https://colab.research.google.com/github/jyotidabass/-Training-and-Customizing-Tokenizers/blob/main/Training_and_Customizing_Tokenizers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# Import necessary libraries
import pandas as pd
import torch
from transformers import AutoTokenizer

# Create dummy data
data = {
    "text": [
        "This is a sample sentence.",
        "This is another sample sentence.",
        "I love playing football.",
        "I love playing basketball.",
        "This is a new sentence with [NEW_TOKEN]."
    ]
}

# Convert dummy data to pandas dataframe
df = pd.DataFrame(data)

# Create a tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Print the original tokens
print("Original Tokens:")
print(tokenizer.encode(df["text"][0]))

# Customize the tokenizer by adding new tokens
tokenizer.add_tokens(["[NEW_TOKEN]"])

# Print the customized tokens
print("\nCustomized Tokens:")
print(tokenizer.encode(df["text"][4]))

# Train the tokenizer on the dummy data using train_new_from_iterator
# This method is used to train the tokenizer on new data.
tokenizer.train_new_from_iterator(df["text"], vocab_size=len(tokenizer)) #vocab_size can be adjusted
#Print the trained tokens
print("\nTrained Tokens:")
print(tokenizer.encode(df["text"][0]))

Original Tokens:
[101, 2023, 2003, 1037, 7099, 6251, 1012, 102]

Customized Tokens:
[101, 2023, 2003, 1037, 2047, 6251, 2007, 30522, 1012, 102]

Trained Tokens:
[101, 2023, 2003, 1037, 7099, 6251, 1012, 102]


# Here's a simple real-life application using Python code for Training  Tokenizers:

In [6]:
!pip install transformers datasets
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader

# Load the dummy dataset
df = pd.DataFrame({
    "review": [
        "I loved the movie!",
        "It was a terrible film.",
        "The acting was great.",
        "The special effects were amazing.",
        "I didn't like it."
    ],
    "sentiment": [1, 0, 1, 1, 0]
})

# Create a tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Preprocess the text data
preprocessed_reviews = []
for review in df["review"]:
    preprocessed_review = tokenizer.encode(review, add_special_tokens=True)
    preprocessed_reviews.append(preprocessed_review)

# Create a custom dataset class
class MovieReviewDataset(torch.utils.data.Dataset):
    def __init__(self, preprocessed_reviews, labels):
        self.preprocessed_reviews = preprocessed_reviews
        self.labels = labels

    def __getitem__(self, idx):
        return {"input_ids": self.preprocessed_reviews[idx], "labels": self.labels[idx]}

    def __len__(self):
        return len(self.preprocessed_reviews)

# Create a dataset instance
dataset = MovieReviewDataset(preprocessed_reviews, df["sentiment"])

# Create a DataLoader for batching
dataloader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=lambda batch: tokenizer.pad(batch, padding='longest', return_tensors='pt'))
# Added collate_fn to pad sequences to the longest length in the batch

# Train a model for sentiment analysis
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Train the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

for epoch in range(5):
    model.train()
    total_loss = 0
    for batch in dataloader:
        # Move input_ids and labels to device within the loop
        input_ids = batch["input_ids"].to(device)
        labels = batch["labels"].to(device)
        optimizer.zero_grad()

        # Pass attention_mask to the model
        outputs = model(input_ids, labels=labels, attention_mask=batch['attention_mask'].to(device)) # Pass attention_mask to model
        loss = outputs.loss  # Access loss using outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(dataloader)}")  # Divide by the number of batches

# Evaluate the model
model.eval()
test_review = "I loved the movie!"
test_review = tokenizer.encode(test_review, add_special_tokens=True)
test_review = torch.tensor(test_review).unsqueeze(0).to(device)  # Add batch dimension
output = model(test_review)
print(output)



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch 1, Loss: 0.7540531555811564
Epoch 2, Loss: 0.6956837773323059
Epoch 3, Loss: 0.5696997046470642
Epoch 4, Loss: 0.46580002705256146
Epoch 5, Loss: 0.5019958118597666
SequenceClassifierOutput(loss=None, logits=tensor([[0.0422, 0.6374]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


# Here's a Python code with simplest explanation for Customizing Tokenizers:
Customizing Tokenizers
Goal: Customize a tokenizer to recognize specific words or phrases.

In [8]:
import pandas as pd
import torch
from transformers import AutoTokenizer

# Load the dummy dataset
df = pd.DataFrame({
    "text": [
        "I loved the movie!",
        "It was a terrible film.",
        "The acting was great.",
        "The special effects were amazing.",
        "I didn't like it."
    ]
})

# Create a tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Customize the tokenizer
tokenizer.add_tokens(["[MOVIE]", "[ACTING]"])

# Preprocess the text data
preprocessed_text = []
for text in df["text"]:
    preprocessed_text.append(tokenizer.encode(text, add_special_tokens=True))

# Print the preprocessed text
print(preprocessed_text)

[[101, 1045, 3866, 1996, 3185, 999, 102], [101, 2009, 2001, 1037, 6659, 2143, 1012, 102], [101, 1996, 3772, 2001, 2307, 1012, 102], [101, 1996, 2569, 3896, 2020, 6429, 1012, 102], [101, 1045, 2134, 1005, 1056, 2066, 2009, 1012, 102]]


# Customizing Tokenizers with Special Tokens

Goal: Customize a tokenizer to recognize special tokens.

In [10]:
import pandas as pd
import torch
from transformers import AutoTokenizer

# Load the dummy dataset
df = pd.DataFrame({
    "text": [
        "I loved the movie!",
        "It was a terrible film.",
        "The acting was great.",
        "The special effects were amazing.",
        "I didn't like it."
    ]
})

# Create a tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


# Customize the tokenizer with special tokens
tokenizer.add_special_tokens({'additional_special_tokens': ["[CLS]", "[SEP]"]})

# Preprocess the text data
preprocessed_text = []
for text in df["text"]:
    preprocessed_text.append(tokenizer.encode(text, add_special_tokens=True))

# Print the preprocessed text
print(preprocessed_text)

[[101, 1045, 3866, 1996, 3185, 999, 102], [101, 2009, 2001, 1037, 6659, 2143, 1012, 102], [101, 1996, 3772, 2001, 2307, 1012, 102], [101, 1996, 2569, 3896, 2020, 6429, 1012, 102], [101, 1045, 2134, 1005, 1056, 2066, 2009, 1012, 102]]


