# training a pre trained model with data for sentiment analysis

* Glue -- sst2 dataset
* train and tests dataset
* ver interesting table: sentence - tokens - ids - n_tokens - essential_tokens
* predicting labels from other datasets different fron sst2

# libraries

In [1]:
pip install datasets transformers evaluate torch scikit-learn accelerate -U

Note: you may need to restart the kernel to use updated packages.


In [1]:
from datasets import load_dataset

from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments

import evaluate
import torch
import pandas as pd
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# importing the model - tokenizer - dataset

In [4]:
# 'num_labels=2' specifies that this is a binary classification task
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2)
# DistilBERT is a smaller, faster version of BERT. It has already been
# pre-trained on general language tasks

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Load the tokenizer for DistilBERT (or any other model to fine-tune).
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")



In [5]:
tokenizer?

[0;31mSignature:[0m     
[0mtokenizer[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mtext[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m][0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtext_pair[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m][0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtext_target[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m][0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m


In [6]:
# Load the SST-2 dataset from the GLUE benchmark.
# The 'sst2' configuration is used for sentiment classification tasks.
dataset = load_dataset('glue', 'sst2')

# The SST-2 dataset is a binary classification dataset for sentiment
# analysis. It contains sentences with labels 0 (negative) and 1 (positive)
# the test set has label = -1 wich means it is unlabelled

README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [8]:
# Sampling a smaller subset for training and testing
n_train = 600   # Number of training samples
n_test = 150    # Number of testing samples

# Shuffle the training and test datasets and select a subset.
#the test dataset is unlabelled: labels = -1, we use the validation set
train_dataset = dataset['train'].shuffle(seed=42).select(range(n_train))
test_dataset = dataset['validation'].shuffle(seed=17).select(range(n_test))

## tokenizing train - test subsets

In [9]:
#function to tokenize the sentences in the dataset.
def tokenize_function(examples):
    return tokenizer(
        examples['sentence'],           # The text field in SST-2 is 'sentence'
        padding="longest",              # Pad to the longest sentence
        truncation=True,                # Truncate if the sentence is longer than 512 tokens
        max_length=512                  # Max token length
    )

In [10]:
# Tokenize the training dataset
tokenized_train = train_dataset.map(tokenize_function, batched=True)

# Tokenize the test dataset
tokenized_test = test_dataset.map(tokenize_function, batched=True)

# Explanation:
# The tokenizer converts the sentences into token IDs that the
# model can process. We use padding and truncation to ensure all
# inputs have the same lenght in each batch. The function 'map' applies the
# tokenization to all examples in the dataset in batches for speed.

In [11]:
tokenized_train?

[0;31mType:[0m        Dataset
[0;31mString form:[0m
Dataset({
    features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
    num_rows: 600
})
[0;31mLength:[0m      600
[0;31mFile:[0m        ~/.conda/envs/default/lib/python3.9/site-packages/datasets/arrow_dataset.py
[0;31mDocstring:[0m   A Dataset backed by an Arrow table.


### showing sentences - tokens  ids


In [12]:
# Define the special token ID for padding, usually tokenizer.pad_token_id
pad_token_id = tokenizer.pad_token_id

In [13]:
pad_token_id?

[0;31mType:[0m        int
[0;31mString form:[0m 0
[0;31mDocstring:[0m  
int([x]) -> integer
int(x, base=10) -> integer

Convert a number or string to an integer, or return 0 if no arguments
are given.  If x is a number, return x.__int__().  For floating point
numbers, this truncates towards zero.

If x is not a number or if base is given, then x must be a string,
bytes, or bytearray instance representing an integer literal in the
given base.  The literal can be preceded by '+' or '-' and be surrounded
by whitespace.  The base defaults to 10.  Valid bases are 0 and 2-36.
Base 0 means to interpret the base from the string as an integer literal.
>>> int('0b100', base=0)
4


In [14]:
# Function to process the tokenized dataset and extract necessary fields
def extract_token_info_with_essential_tokens(example):
    # Get the original sentence
    original_sentence = example['sentence'] if 'sentence' in example else None

    # Get the tokenized sentence by converting token IDs back to tokens
    tokenized_sentence = tokenizer.convert_ids_to_tokens(example['input_ids'])

    # Count the number of tokens excluding padding
    essential_tokens = sum(1 for token_id in example['input_ids'] if token_id != pad_token_id)

    # Return original sentence, tokenized sentence, token IDs, total tokens, and essential tokens
    return {
        'sentence': original_sentence,                       # The original sentence
        'tokenized_sentence': " ".join(tokenized_sentence),  # Tokenized sentence as a string
        'token_ids': example['input_ids'],                   # List of token IDs
        'num_tokens': len(example['input_ids']),             # Total number of tokens (including padding)
        'essential_tokens': essential_tokens                 # Number of tokens excluding padding
    }


In [15]:
# Apply the extraction function to the already tokenized dataset
processed_test = tokenized_test.map(extract_token_info_with_essential_tokens, batched=False)

In [16]:
processed_test?

[0;31mType:[0m        Dataset
[0;31mString form:[0m
Dataset({
    features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask', 'tokenized_sentence', 'token_ids', 'num_tokens', 'essential_tokens'],
    num_rows: 150
})
[0;31mLength:[0m      150
[0;31mFile:[0m        ~/.conda/envs/default/lib/python3.9/site-packages/datasets/arrow_dataset.py
[0;31mDocstring:[0m   A Dataset backed by an Arrow table.


In [17]:
# Create a DataFrame from the processed dataset
test_data = [
    {
        "sentence": ex['sentence'],
        "tokenized_sentence": ex['tokenized_sentence'], # Tokenized sentence as a string
        "token_ids": ex['token_ids'],                   #Tokens id
        "essential_tokens": ex['essential_tokens'],     # Number of tokens without padding
        "num_tokens": ex['num_tokens'],                 # Total tokens (with padding)
    }
    for ex in processed_test
]

In [18]:
test_data?

[0;31mType:[0m        list
[0;31mString form:[0m [{'sentence': 'at least one scene is so disgusting that viewers may be hard pressed to retain the <...> 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'essential_tokens': 37, 'num_tokens': 54}]
[0;31mLength:[0m      150
[0;31mDocstring:[0m  
Built-in mutable sequence.

If no argument is given, the constructor creates a new empty list.
The argument must be an iterable if specified.


In [19]:
# Create a pandas DataFrame
df_test = pd.DataFrame(test_data)

# Sort the DataFrame by the number of essential tokens in descending order
df_test = df_test.sort_values(by="essential_tokens", ascending=False)

In [20]:
df_test

Unnamed: 0,sentence,tokenized_sentence,token_ids,essential_tokens,num_tokens
13,"for all its technical virtuosity , the film is...",[CLS] for all its technical vi ##rt ##uo ##sit...,"[101, 2005, 2035, 2049, 4087, 6819, 5339, 1909...",54,54
34,the special effects and many scenes of weightl...,[CLS] the special effects and many scenes of w...,"[101, 1996, 2569, 3896, 1998, 2116, 5019, 1997...",47,54
47,but the power of these ( subjects ) is obscure...,[CLS] but the power of these ( subjects ) is o...,"[101, 2021, 1996, 2373, 1997, 2122, 1006, 5739...",47,54
113,"it does nothing new with the old story , excep...","[CLS] it does nothing new with the old story ,...","[101, 2009, 2515, 2498, 2047, 2007, 1996, 2214...",47,54
54,"the tale of tok ( andy lau ) , a sleek sociopa...","[CLS] the tale of to ##k ( andy lau ) , a slee...","[101, 1996, 6925, 1997, 2000, 2243, 1006, 5557...",46,54
...,...,...,...,...,...
97,big fat waste of time .,[CLS] big fat waste of time . [SEP] [PAD] [PAD...,"[101, 2502, 6638, 5949, 1997, 2051, 1012, 102,...",8,54
66,it treats women like idiots .,[CLS] it treats women like idiots . [SEP] [PAD...,"[101, 2009, 18452, 2308, 2066, 28781, 1012, 10...",8,54
109,a deep and meaningful film .,[CLS] a deep and meaningful film . [SEP] [PAD]...,"[101, 1037, 2784, 1998, 15902, 2143, 1012, 102...",8,54
88,a wildly inconsistent emotional experience .,[CLS] a wildly inconsistent emotional experien...,"[101, 1037, 13544, 20316, 6832, 3325, 1012, 10...",8,54


## labelling (predicting) with the pretrained model

### labelling a single sentence

In [21]:
def predict_label_sentence(sentence):
    """
    Takes a sentence and returns the original sentence, the tokenized sentence,
    the token IDs, the softmax probabilities, and the predicted label.

    Args:
    - sentence (str): The input sentence for sentiment analysis.

    Returns:
    - dict: A dictionary containing:
        - 'original_sentence': The original sentence.
        - 'tokenized_sentence': The tokenized version of the sentence.
        - 'input_ids': The token IDs (numerical representation).
        - 'softmax_probs': The softmax probabilities for each class.
        - 'predicted_label': The predicted class label.
        - 'sentiment': Sentiment as 'positive' or 'negative'.
    """

    # Step 1: Tokenize the sentence
    tokens = tokenizer(sentence, return_tensors="pt", padding="longest", truncation=True, max_length=512)

    # Step 2: Get the token IDs and tokenized sentence
    input_ids = tokens['input_ids']
    tokenized_sentence = tokenizer.convert_ids_to_tokens(input_ids[0])

    # Step 3: Pass the tokenized input to the model to get logits
    with torch.no_grad():  # Disable gradient computation for evaluation
        output = model(**tokens)
        logits = output.logits

    # Step 4: Apply softmax using torch to get probabilities
    softmax_probs = torch.softmax(logits, dim=-1)

    # Step 5: Get the predicted label (argmax of softmax output)
    predicted_label = torch.argmax(softmax_probs, dim=-1).item()

    # Step 6: Determine sentiment based on the predicted label
    sentiment_label = "positive" if predicted_label == 1 else "negative"

    # Step 7: Prepare result dictionary
    result = {
        "original_sentence": sentence,
        "tokenized_sentence": tokenized_sentence,
        "input_ids": input_ids[0].tolist(),
        "softmax_probs": softmax_probs[0].tolist(),  # Convert tensor to list
        "predicted_label": predicted_label,
        "sentiment": sentiment_label
    }

    return result


In [22]:
sentence_label = predict_label_sentence("I am happy")

In [23]:
sentence_label

{'original_sentence': 'I am happy',
 'tokenized_sentence': ['[CLS]', 'i', 'am', 'happy', '[SEP]'],
 'input_ids': [101, 1045, 2572, 3407, 102],
 'softmax_probs': [0.46126559376716614, 0.5387344360351562],
 'predicted_label': 1,
 'sentiment': 'positive'}

### labelling a whole dataset

In [24]:
def predict_label_dataset(new_data, text_columns=["text", "sentence", "content", "title"]):
    """
    Evaluates the model on a subset of data and returns a DataFrame with
    all the sentences, true labels, predicted labels, number of tokens,
    softmax probabilities, tokenized sentences, and input IDs, along with accuracy.

    Args:
    - new_data: The subset of the dataset to evaluate (already selected).
    - text_columns: A list of possible text columns to use (default: ['text', 'sentence', 'content', 'title']).

    Returns:
    - df: DataFrame containing detailed prediction information.
    - accuracy: Accuracy of the model on the dataset (None if labels are unavailable).
    """
    
    # Step 1: Find the appropriate text column
    for col in text_columns:
        if col in new_data.column_names:
            text_column = col
            break
    else:
        raise ValueError(f"None of the specified text columns {text_columns} were found in the dataset.")
    
    # Initialize results dictionary
    results = {
        "Sentence": [],
        "Tokenized Sentence": [],
        "Input IDs": [],
        "Number of Tokens": [],
        "Softmax Probs": [],
        "Predicted Label": [],
        "True Label": [],
        "Sentiment": []
    }
    
    # Iterate through each example in the dataset with a progress bar
    for i in tqdm(range(len(new_data)), desc="Labelling Sentences"):
        sentence = new_data[i][text_column]
        true_label = new_data[i].get('label', -1)  # Use -1 if label is missing
        
        # Use predict_label_sentence function to get predictions and other details
        prediction = predict_label_sentence(sentence)
        
        # Count non-padding tokens
        input_ids = prediction["input_ids"]
        pad_token_id = tokenizer.pad_token_id
        num_tokens = sum([1 for token_id in input_ids if token_id != pad_token_id])
        
        # Append the details to the results dictionary
        results["Sentence"].append(prediction["original_sentence"])
        results["Tokenized Sentence"].append(prediction["tokenized_sentence"])
        results["Input IDs"].append(prediction["input_ids"])
        results["Number of Tokens"].append(num_tokens)
        results["Softmax Probs"].append(prediction["softmax_probs"])
        results["Predicted Label"].append(prediction["predicted_label"])
        results["True Label"].append(true_label)
        results["Sentiment"].append(prediction["sentiment"])
    
    # Convert results to a DataFrame
    df = pd.DataFrame(results)
    
    # Check if true labels are available (i.e., not all -1)
    if df["True Label"].isin([-1]).all():
        accuracy = None
    else:
        # Calculate Accuracy
        accuracy = accuracy_score(df["True Label"], df["Predicted Label"])
        
    # Return the DataFrame and accuracy
    return df, accuracy


In [25]:
df_test_non_trained, accuracy_test_non_trained = predict_label_dataset(test_dataset)

Labelling Sentences: 100%|██████████| 150/150 [00:09<00:00, 15.71it/s]


In [26]:
accuracy_test_non_trained

0.5066666666666667

In [27]:
df_test_non_trained

Unnamed: 0,Sentence,Tokenized Sentence,Input IDs,Number of Tokens,Softmax Probs,Predicted Label,True Label,Sentiment
0,at least one scene is so disgusting that viewe...,"[[CLS], at, least, one, scene, is, so, disgust...","[101, 2012, 2560, 2028, 3496, 2003, 2061, 1942...",20,"[0.49043747782707214, 0.5095624923706055]",1,0,positive
1,even the finest chef ca n't make a hotdog into...,"[[CLS], even, the, finest, chef, ca, n, ', t, ...","[101, 2130, 1996, 10418, 10026, 6187, 1050, 10...",44,"[0.4584480822086334, 0.541551947593689]",1,0,positive
2,collateral damage finally delivers the goods f...,"[[CLS], collateral, damage, finally, delivers,...","[101, 24172, 4053, 2633, 18058, 1996, 5350, 20...",14,"[0.47445327043533325, 0.525546669960022]",1,1,positive
3,"exciting and direct , with ghost imagery that ...","[[CLS], exciting, and, direct, ,, with, ghost,...","[101, 10990, 1998, 3622, 1010, 2007, 5745, 134...",20,"[0.443885862827301, 0.5561141967773438]",1,1,positive
4,and when you 're talking about a slapstick com...,"[[CLS], and, when, you, ', re, talking, about,...","[101, 1998, 2043, 2017, 1005, 2128, 3331, 2055...",22,"[0.4730995297431946, 0.5269004702568054]",1,0,positive
...,...,...,...,...,...,...,...,...
145,it 's a bit disappointing that it only manages...,"[[CLS], it, ', s, a, bit, disappointing, that,...","[101, 2009, 1005, 1055, 1037, 2978, 15640, 200...",20,"[0.4603593051433563, 0.5396407246589661]",1,0,positive
146,a breezy romantic comedy that has the punch of...,"[[CLS], a, bree, ##zy, romantic, comedy, that,...","[101, 1037, 21986, 9096, 6298, 4038, 2008, 203...",24,"[0.46655434370040894, 0.5334456562995911]",1,1,positive
147,the film tries too hard to be funny and tries ...,"[[CLS], the, film, tries, too, hard, to, be, f...","[101, 1996, 2143, 5363, 2205, 2524, 2000, 2022...",18,"[0.4715810716152191, 0.5284189581871033]",1,0,positive
148,thanks to scott 's charismatic roger and eisen...,"[[CLS], thanks, to, scott, ', s, charismatic, ...","[101, 4283, 2000, 3660, 1005, 1055, 23916, 507...",35,"[0.4877658486366272, 0.512234091758728]",1,1,positive


## prepare datasets for pytorch

In [28]:
# Remove the 'sentence' and 'idx' columns from the tokenized_train dataset
tokenized_train = tokenized_train.remove_columns(["sentence", "idx"])

# Remove the 'sentence' and 'idx' columns from the tokenized_test dataset
tokenized_test = tokenized_test.remove_columns(["sentence", "idx"])

# Check the columns after removal (optional, for confirmation)
print(tokenized_train.column_names)
print(tokenized_test.column_names)


['label', 'input_ids', 'attention_mask']
['label', 'input_ids', 'attention_mask']


In [29]:
tokenized_train?

[0;31mType:[0m        Dataset
[0;31mString form:[0m
Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 600
})
[0;31mLength:[0m      600
[0;31mFile:[0m        ~/.conda/envs/default/lib/python3.9/site-packages/datasets/arrow_dataset.py
[0;31mDocstring:[0m   A Dataset backed by an Arrow table.


In [30]:
tokenized_train.shape

(600, 3)

In [31]:
# Hugging Face's Trainer API expects the data in PyTorch format.
tokenized_train.set_format("torch")
tokenized_test.set_format("torch")

In [32]:
tokenized_train?

[0;31mType:[0m        Dataset
[0;31mString form:[0m
Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 600
})
[0;31mLength:[0m      600
[0;31mFile:[0m        ~/.conda/envs/default/lib/python3.9/site-packages/datasets/arrow_dataset.py
[0;31mDocstring:[0m   A Dataset backed by an Arrow table.


# Finetuning the model

In [33]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [34]:
model?

[0;31mSignature:[0m      [0mmodel[0m[0;34m([0m[0;34m*[0m[0margs[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mType:[0m           DistilBertForSequenceClassification
[0;31mString form:[0m   
DistilBertForSequenceClassification(
           (distilbert): DistilBertModel(
           (embeddings): Embedding <...> : Linear(in_features=768, out_features=2, bias=True)
           (dropout): Dropout(p=0.2, inplace=False)
           )
[0;31mFile:[0m           ~/.conda/envs/default/lib/python3.9/site-packages/transformers/models/distilbert/modeling_distilbert.py
[0;31mDocstring:[0m     
DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the
pooled output) e.g. for GLUE tasks.


This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings,

In [35]:
model.config

DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.44.2",
  "vocab_size": 30522
}

## Detailed Explanation of `DistilBertConfig`


This configuration describes the architecture and hyperparameters for the `DistilBERT` model. Below is an in-depth explanation of each field in the configuration:

1. **`_name_or_path`: "distilbert-base-uncased"`  
   - This specifies the name or path of the pretrained model.
   - `"distilbert-base-uncased"` is a smaller, lighter version of BERT that removes the case sensitivity of text (i.e., it treats "Hello" and "hello" the same way).
   
2. **`activation`: "gelu"`  
   - This defines the activation function used in the model.  
   - `"gelu"` stands for **Gaussian Error Linear Unit**, which is a smoother version of ReLU and commonly used in transformer models.

3. **`architectures`: ["DistilBertForMaskedLM"]**  
   - This indicates the type of architecture being used.  
   - `DistilBertForMaskedLM` is the architecture for **Masked Language Modeling**, where the model predicts missing or masked words in sentences. This is used for pretraining BERT-based models.

4. **`attention_dropout`: 0.1**  
   - Dropout rate for the attention layers.  
   - Dropout is a regularization technique used to prevent overfitting by randomly setting a fraction of the attention scores to zero during training. In this case, the rate is 10% (0.1).

5. **`dim`: 768**  
   - The dimensionality of the hidden representations in the model.  
   - Each input token is represented by a vector of size 768 in this version of DistilBERT.

6. **`dropout`: 0.1**  
   - The general dropout rate applied throughout the model.  
   - This helps prevent overfitting by randomly dropping 10% of the neurons during training.

7. **`hidden_dim`: 3072**  
   - This represents the size of the hidden layer in the feedforward neural network part of the transformer model.  
   - Specifically, this is the size of the intermediate layer in each transformer block, which typically has a larger dimension (3072) compared to the input/output dimension (768).
8. **`initializer_range`: 0.02**  
   - This defines the range used to initialize the weights in the model.  
   - The model’s weights are initialized using a uniform distribution in the range [-0.02, 0.02].

9. **`max_position_embeddings`: 512**  
   - The maximum number of tokens or positions that the model can handle.  
   - For DistilBERT, this is capped at 512 tokens. Any input longer than 512 tokens will be truncated.

10. **`model_type`: "distilbert"`  
   - This defines the type of model being used.  
   - `distilbert` is a distilled version of the BERT model, which retains 97% of BERT’s performance but is 60% faster and smaller in size.

11. **`n_heads`: 12**  
   - The number of attention heads in the multi-head attention mechanism.  
   - In transformer architectures like BERT, the attention mechanism is split into multiple "heads" that focus on different parts of the input sequence. DistilBERT uses 12 attention heads.

12. **`n_layers`: 6**  
   - The number of layers (transformer blocks) in the model.  
   - DistilBERT has 6 layers, as opposed to the 12 layers in BERT. This reduction is one reason why DistilBERT is faster and smaller.

13. **`pad_token_id`: 0**  
   - The token ID used to represent padding in the input sequence.  
   - Padding tokens are added to make all sequences in a batch the same length, and `0` is the ID for the padding token.

14. **`qa_dropout`: 0.1**  
   - Dropout rate applied during the Question Answering (QA) head of the model.  
   - This is used in tasks like SQuAD (Stanford Question Answering Dataset), where a 10% dropout rate is applied.

15. **`seq_classif_dropout`: 0.2**  
   - Dropout rate used in the sequence classification head of the model.  
   - This is applicable for tasks like text classification, where a 20% dropout rate is applied to prevent overfitting.

16. **`sinusoidal_pos_embds`: false**  
   - This flag indicates whether sinusoidal positional embeddings are used.  
   - DistilBERT uses learned positional embeddings (as in the original BERT) instead of sinusoidal ones.

17. **`tie_weights_`: true**  
   - This indicates whether the weights of the embeddings and the output layer are tied.  
   - Weight tying reduces the number of parameters in the model and ensures that the input and output embeddings are similar.

18. **`transformers_version`: "4.44.0"**  
   - This specifies the version of the Hugging Face Transformers library used to configure the model.  
   - In this case, the version is 4.44.0.

19. **`vocab_size`: 30522**  
   - The size of the vocabulary used by the tokenizer and the model.  
   - DistilBERT inherits the BERT tokenizer, which uses a vocabulary of 30,522 tokens. This includes words, subwords, and special tokens (like [PAD], [CLS], etc.).


## training arguments

In [36]:
# Load accuracy as the evaluation metric. This will be used to compute
# the accuracy of the model on the validation dataset during evaluation.
accuracy_metric = evaluate.load("accuracy")

# Define the function to compute metrics (accuracy in this case).
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=1)
    return accuracy_metric.compute(predictions=predictions, references=labels)


In [37]:
# Define the training arguments, which control how the model will be trained.
# Each argument has a direct or indirect impact on both the computation time
# and the model's final performance.
training_args = TrainingArguments(
    output_dir="./results",          # Directory where the model's checkpoints
                                     # and outputs will be saved.
                                     # (Doesn't directly affect training time)

    eval_steps=5,                    # Evaluate the model every  eval_steps.
                                     # Frequent evaluations can slow down training,
                                     # but provide insights into model performance
                                     # during training

    learning_rate=2e-5,              # Learning rate controls the speed at which
                                     # the model updates weights during training.
                                     # A higher rate may lead to faster convergence,
                                     # but could also risk overshooting optima,
                                     # while a lower rate results in slower but
                                     # potentially more stable training.

    per_device_train_batch_size=16,  # Batch size for training on each device (GPU/CPU).
                                     # A larger batch size speeds up training by
                                     # processing more data per step, but uses more memory.
                                     # If you run out of memory, reduce this value.
                                     # Smaller batch sizes mean more updates per epoch.

    per_device_eval_batch_size=64,   # Batch size for evaluation (validation/test set).
                                     # Larger batch sizes can make evaluation faster
                                     # but require more memory. Evaluation only happens
                                     # during the validation phase, so it doesn't affect
                                     # the training speed.

    num_train_epochs=4,              # Number of training epochs. Each epoch is one full
                                     # pass through the training dataset. More epochs
                                     # increase training time but give the model more
                                     # chances to learn. Fewer epochs result in faster
                                     # training but risk underfitting the model.

    gradient_accumulation_steps=3,   # Accumulate gradients over multiple steps before
                                     # updating model weights. This simulates a larger
                                     # batch size (e.g., with batch_size=16 and
                                     # gradient_accumulation_steps=3, the model behaves
                                     # like batch_size=48). This reduces memory usage
                                     # but slows down training because updates happen
                                     # less frequently.

    weight_decay=0.01,               # Weight decay applies regularization during training
                                     # to prevent overfitting by penalizing large weights.
                                     # It improves generalization and helps ensure that
                                     # the model performs well on unseen data.

    logging_dir="./logs",            # Directory for saving logs. Logging doesn't directly
                                     # affect training speed, but frequent logging
                                     # (e.g., at every step) can slow down the process.
                                     # Set appropriate intervals for logging to balance
                                     # information and speed.

    logging_steps=100,               # Log metrics every 100 steps. Too frequent logging
                                     # can slow training down, while infrequent logging
                                     # might not provide enough insight into the model's
                                     # performance during training. Adjust based on your
                                     # need for monitoring.

    save_strategy="epoch",           # Save the model's checkpoints at the end of each
                                     # epoch. This is generally efficient and safe
                                     # unless you need more frequent saving (e.g., "steps").
                                     # More frequent saving can slow down training,
                                     # as saving checkpoints takes time.

    load_best_model_at_end=True,     # Load the best model based on the evaluation
                                     # metric after training finishes. While this
                                     # doesn't affect training speed, it ensures the
                                     # best-performing model (usually evaluated on
                                     # validation accuracy or loss) is kept.

    metric_for_best_model="accuracy",# Monitor accuracy to select the best model.
                                     # This defines the metric used to determine
                                     # which model is considered the best when
                                     # `load_best_model_at_end` is set to True.

    evaluation_strategy="epoch",     # Run evaluation at the end of each epoch.
                                     # This balances training and evaluation time,
                                     # allowing for regular checks on validation
                                     # performance without frequent interruptions.

    report_to="none",                # No need to report results to external platforms
                                     # like TensorBoard or Weights & Biases. This keeps
                                     # overhead minimal and speeds up the training process
                                     # if you're not interested in reporting metrics
                                     # elsewhere.

    seed=42                          # Sets a fixed random seed to ensure reproducibility.
                                     # Doesn't affect computation time but helps ensure
                                     # the same results on re-runs.
)




In [38]:
trainer = Trainer(
    model=model,                     # The model to fine-tune
    args=training_args,              # Training arguments (from TrainingArguments)
    train_dataset=tokenized_train,   # The tokenized training dataset
    eval_dataset=tokenized_test,     # The tokenized evaluation dataset
    compute_metrics=compute_metrics, # Function to compute evaluation metrics

    # Additional Arguments
    tokenizer=tokenizer,             # The tokenizer to use (optional, but useful if you
                                     # want to use it for decoding or processing inputs).

    data_collator=None,              # A function to prepare batches of data. This is
                                     # typically left as `None`, and the default
                                     # collator is used, but you can define your own
                                     # data collator if necessary (e.g., for dynamic
                                     # padding).

    optimizers=(None, None),         # You can provide your own optimizer and scheduler
                                     # (learning rate scheduler). If `None`, the default
                                     # AdamW optimizer and linear scheduler are used.

    callbacks=None,                  # List of callbacks, such as `EarlyStoppingCallback`,
                                     # to run during training. Callbacks can be used
                                     # to perform additional actions during training.

    preprocess_logits_for_metrics=None,  # If you want to pre-process logits before
                                         # computing metrics, define a function here.
)


In [39]:
#trainer?

In [40]:
trainer.train()
#trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
0,No log,0.675232,0.506667
1,No log,0.598589,0.68
3,No log,0.489973,0.873333


TrainOutput(global_step=48, training_loss=0.5745113690694174, metrics={'train_runtime': 326.155, 'train_samples_per_second': 7.358, 'train_steps_per_second': 0.147, 'total_flos': 31264375885920.0, 'train_loss': 0.5745113690694174, 'epoch': 3.7894736842105265})

In [41]:
# Run evaluation on both training and validation datasets
train_results = trainer.evaluate(eval_dataset=tokenized_train)  # Evaluate on training set
test_results = trainer.evaluate(eval_dataset=tokenized_test)    # Evaluate on test set


In [42]:
train_results

{'eval_loss': 0.4245406687259674,
 'eval_accuracy': 0.8816666666666667,
 'eval_runtime': 19.1689,
 'eval_samples_per_second': 31.301,
 'eval_steps_per_second': 0.522,
 'epoch': 3.7894736842105265}

In [43]:
test_results

{'eval_loss': 0.48997294902801514,
 'eval_accuracy': 0.8733333333333333,
 'eval_runtime': 4.4629,
 'eval_samples_per_second': 33.611,
 'eval_steps_per_second': 0.672,
 'epoch': 3.7894736842105265}

## labelling with the trained model

In [44]:
sentence_label = predict_label_sentence("I am happy")

In [45]:
sentence_label

{'original_sentence': 'I am happy',
 'tokenized_sentence': ['[CLS]', 'i', 'am', 'happy', '[SEP]'],
 'input_ids': [101, 1045, 2572, 3407, 102],
 'softmax_probs': [0.19336703419685364, 0.8066329956054688],
 'predicted_label': 1,
 'sentiment': 'positive'}

In [46]:
df_test_trained, accuracy_test_trained = predict_label_dataset(test_dataset)

Labelling Sentences: 100%|██████████| 150/150 [00:10<00:00, 14.60it/s]


In [47]:
df_test_trained

Unnamed: 0,Sentence,Tokenized Sentence,Input IDs,Number of Tokens,Softmax Probs,Predicted Label,True Label,Sentiment
0,at least one scene is so disgusting that viewe...,"[[CLS], at, least, one, scene, is, so, disgust...","[101, 2012, 2560, 2028, 3496, 2003, 2061, 1942...",20,"[0.6496649980545044, 0.350335031747818]",0,0,negative
1,even the finest chef ca n't make a hotdog into...,"[[CLS], even, the, finest, chef, ca, n, ', t, ...","[101, 2130, 1996, 10418, 10026, 6187, 1050, 10...",44,"[0.564052939414978, 0.4359470307826996]",0,0,negative
2,collateral damage finally delivers the goods f...,"[[CLS], collateral, damage, finally, delivers,...","[101, 24172, 4053, 2633, 18058, 1996, 5350, 20...",14,"[0.5163972973823547, 0.48360273241996765]",0,1,negative
3,"exciting and direct , with ghost imagery that ...","[[CLS], exciting, and, direct, ,, with, ghost,...","[101, 10990, 1998, 3622, 1010, 2007, 5745, 134...",20,"[0.1598498672246933, 0.8401501178741455]",1,1,positive
4,and when you 're talking about a slapstick com...,"[[CLS], and, when, you, ', re, talking, about,...","[101, 1998, 2043, 2017, 1005, 2128, 3331, 2055...",22,"[0.6049067974090576, 0.39509323239326477]",0,0,negative
...,...,...,...,...,...,...,...,...
145,it 's a bit disappointing that it only manages...,"[[CLS], it, ', s, a, bit, disappointing, that,...","[101, 2009, 1005, 1055, 1037, 2978, 15640, 200...",20,"[0.5640332102775574, 0.4359667897224426]",0,0,negative
146,a breezy romantic comedy that has the punch of...,"[[CLS], a, bree, ##zy, romantic, comedy, that,...","[101, 1037, 21986, 9096, 6298, 4038, 2008, 203...",24,"[0.18238702416419983, 0.8176130056381226]",1,1,positive
147,the film tries too hard to be funny and tries ...,"[[CLS], the, film, tries, too, hard, to, be, f...","[101, 1996, 2143, 5363, 2205, 2524, 2000, 2022...",18,"[0.614990770816803, 0.3850092589855194]",0,0,negative
148,thanks to scott 's charismatic roger and eisen...,"[[CLS], thanks, to, scott, ', s, charismatic, ...","[101, 4283, 2000, 3660, 1005, 1055, 23916, 507...",35,"[0.2577976882457733, 0.7422022819519043]",1,1,positive


In [48]:
accuracy_test_trained

0.8733333333333333

# compatible datasasets

In [49]:
# Define the list of compatible datasets
compatible_datasets = {
    "imdb": "imdb",
    "yelp": "yelp_polarity",
    "amazon": "amazon_polarity"
}

In [50]:
def download_and_select_samples(dataset_name, n_samples):
    """
    Downloads a dataset for sentiment analysis and selects a random subset of n_samples.

    Args:
    - dataset_name: Name of the dataset (must be one of the compatible datasets).
    - n_samples: Number of random samples to select.

    Returns:
    - new_data: A subset of the dataset with n_samples randomly selected.
    """

    # Step 1: Load the dataset
    if dataset_name not in compatible_datasets:
        raise ValueError(f"Dataset '{dataset_name}' not found. Choose from {list(compatible_datasets.keys())}")

    dataset_info = compatible_datasets[dataset_name]

    # Some datasets require specifying a subset
    if isinstance(dataset_info, tuple):
        dataset = load_dataset(*dataset_info)
    else:
        dataset = load_dataset(dataset_info)

    # Use the test split if available, otherwise use the train split
    split = 'test' if 'test' in dataset else 'train'
    data = dataset[split]

    # Step 2: Select a random sample of n_samples from the dataset
    new_data = data.shuffle(seed=17).select(range(n_samples))

    return new_data


### imdb

In [51]:
imdb_data = download_and_select_samples(dataset_name = "imdb", n_samples = 150)

README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

In [52]:
imdb_data

Dataset({
    features: ['text', 'label'],
    num_rows: 150
})

In [53]:
# Evaluate the model on the selected new_data
imdb_df, imdb_accuracy = predict_label_dataset(imdb_data)

Labelling Sentences: 100%|██████████| 150/150 [00:38<00:00,  3.93it/s]


In [54]:
print(f"Accuracy imdb: {imdb_accuracy}")
#print(imdb_df.head())  # Display the first few rows of the DataFrame

Accuracy imdb: 0.8533333333333334


In [55]:
imdb_df

Unnamed: 0,Sentence,Tokenized Sentence,Input IDs,Number of Tokens,Softmax Probs,Predicted Label,True Label,Sentiment
0,I wish I had read the comments on IMDb before ...,"[[CLS], i, wish, i, had, read, the, comments, ...","[101, 1045, 4299, 1045, 2018, 3191, 1996, 7928...",159,"[0.5610942244529724, 0.4389057457447052]",0,0,negative
1,I loved this movie! So worth the long running ...,"[[CLS], i, loved, this, movie, !, so, worth, t...","[101, 1045, 3866, 2023, 3185, 999, 2061, 4276,...",149,"[0.4507291316986084, 0.5492709279060364]",1,1,positive
2,I actually went to see this movie with low exp...,"[[CLS], i, actually, went, to, see, this, movi...","[101, 1045, 2941, 2253, 2000, 2156, 2023, 3185...",222,"[0.39913123846054077, 0.6008687615394592]",1,1,positive
3,For anyone who cares to know something about t...,"[[CLS], for, anyone, who, cares, to, know, som...","[101, 2005, 3087, 2040, 14977, 2000, 2113, 224...",201,"[0.522199273109436, 0.4778006672859192]",0,0,negative
4,"Eric Idle, Robbie Coltraine, Janet Suzman - it...","[[CLS], eric, idle, ,, robbie, colt, ##raine, ...","[101, 4388, 18373, 1010, 12289, 9110, 26456, 1...",125,"[0.5400406122207642, 0.45995938777923584]",0,0,negative
...,...,...,...,...,...,...,...,...
145,I bought this Chuck Norris DVD knowing that it...,"[[CLS], i, bought, this, chuck, norris, dvd, k...","[101, 1045, 4149, 2023, 8057, 15466, 4966, 420...",178,"[0.4604353904724121, 0.5395646095275879]",1,0,positive
146,"This movie is based on the book, ""A Many Splen...","[[CLS], this, movie, is, based, on, the, book,...","[101, 2023, 3185, 2003, 2241, 2006, 1996, 2338...",266,"[0.31596124172210693, 0.6840387582778931]",1,1,positive
147,"I must say, when I saw this film at a 6.5 on t...","[[CLS], i, must, say, ,, when, i, saw, this, f...","[101, 1045, 2442, 2360, 1010, 2043, 1045, 2387...",500,"[0.5218430757522583, 0.4781569540500641]",0,0,negative
148,I really enjoyed this movie. I have a real sen...,"[[CLS], i, really, enjoyed, this, movie, ., i,...","[101, 1045, 2428, 5632, 2023, 3185, 1012, 1045...",350,"[0.4625762701034546, 0.5374237298965454]",1,1,positive


In [56]:
imdb_df['Sentence'][0]

"I wish I had read the comments on IMDb before I saw this movie. The first 1 hour was OK, though it did make me wonder why everything was centered at Chicago and why no one reported any weather anomaly from outside US. Isolated acts of nature (of this magnitude) are unthinkable. But beyond the first 60 minutes, the movie just drags on like a never-ending story. The screenplay is horrible. As for the actors, very poor choice. Only the people hired to run in panic stick to their roles. But I do have to agree that this movie has got some good 'special effects'. If you rented it on a DVD and would want to watch the movie, despite the reviews, then play it on maximum speed your player would allow!"

### yelp

In [57]:
yelp_data = download_and_select_samples(dataset_name = "yelp", n_samples = 150)

README.md:   0%|          | 0.00/8.93k [00:00<?, ?B/s]

In [58]:
yelp_data

Dataset({
    features: ['text', 'label'],
    num_rows: 150
})

In [59]:
yelp_df, yelp_accuracy= predict_label_dataset(yelp_data)

Labelling Sentences: 100%|██████████| 150/150 [00:22<00:00,  6.65it/s]


In [60]:
yelp_accuracy

0.9266666666666666

In [61]:
yelp_df

Unnamed: 0,Sentence,Tokenized Sentence,Input IDs,Number of Tokens,Softmax Probs,Predicted Label,True Label,Sentiment
0,Service and food were awesome! Highly recommen...,"[[CLS], service, and, food, were, awesome, !, ...","[101, 2326, 1998, 2833, 2020, 12476, 999, 3811...",23,"[0.2670975625514984, 0.732902467250824]",1,1,positive
1,"The food was OK, it was kind of slow so the fi...","[[CLS], the, food, was, ok, ,, it, was, kind, ...","[101, 1996, 2833, 2001, 7929, 1010, 2009, 2001...",163,"[0.5392808318138123, 0.46071913838386536]",0,0,negative
2,The gym is dirty and old and the whole place i...,"[[CLS], the, gym, is, dirty, and, old, and, th...","[101, 1996, 9726, 2003, 6530, 1998, 2214, 1998...",279,"[0.5794830918312073, 0.4205169379711151]",0,0,negative
3,"Just arrived from the overnight train, arrived...","[[CLS], just, arrived, from, the, overnight, t...","[101, 2074, 3369, 2013, 1996, 11585, 3345, 101...",68,"[0.275282621383667, 0.724717378616333]",1,1,positive
4,So just in case this is the first review you'v...,"[[CLS], so, just, in, case, this, is, the, fir...","[101, 2061, 2074, 1999, 2553, 2023, 2003, 1996...",309,"[0.4868742823600769, 0.5131257176399231]",1,1,positive
...,...,...,...,...,...,...,...,...
145,Usually I am not a big stickler for customer s...,"[[CLS], usually, i, am, not, a, big, stick, ##...","[101, 2788, 1045, 2572, 2025, 1037, 2502, 6293...",512,"[0.4426237642765045, 0.5573763251304626]",1,1,positive
146,"Of the cheaper casinos on the Strip, Bally's h...","[[CLS], of, the, cheaper, casinos, on, the, st...","[101, 1997, 1996, 16269, 27300, 2006, 1996, 61...",185,"[0.44811585545539856, 0.551884114742279]",1,1,positive
147,Extradinarilly big for a cafe! They've got eve...,"[[CLS], extra, ##dina, ##rill, ##y, big, for, ...","[101, 4469, 18979, 24714, 2100, 2502, 2005, 10...",140,"[0.4032253921031952, 0.5967746376991272]",1,1,positive
148,"The serving is good, but the steak dinner is n...","[[CLS], the, serving, is, good, ,, but, the, s...","[101, 1996, 3529, 2003, 2204, 1010, 2021, 1996...",32,"[0.5420543551445007, 0.45794567465782166]",0,0,negative


In [62]:
yelp_df['Sentence'][0]

"Service and food were awesome! Highly recommend the French onion soup. Can't wait to come back."

### amazon

In [63]:
amazon_data = download_and_select_samples(dataset_name = "amazon", n_samples = 150)

README.md:   0%|          | 0.00/6.81k [00:00<?, ?B/s]

In [64]:
amazon_data

Dataset({
    features: ['label', 'title', 'content'],
    num_rows: 150
})

In [65]:
amazon_df, amazon_accuracy= predict_label_dataset(amazon_data)

Labelling Sentences: 100%|██████████| 150/150 [00:17<00:00,  8.47it/s]


In [66]:
amazon_accuracy

0.8533333333333334

In [67]:
amazon_df

Unnamed: 0,Sentence,Tokenized Sentence,Input IDs,Number of Tokens,Softmax Probs,Predicted Label,True Label,Sentiment
0,Ben Harper was brought to my attention through...,"[[CLS], ben, harper, was, brought, to, my, att...","[101, 3841, 8500, 2001, 2716, 2000, 2026, 3086...",132,"[0.5244153141975403, 0.47558465600013733]",0,0,negative
1,I think I'm one of the few folks that recieved...,"[[CLS], i, think, i, ', m, one, of, the, few, ...","[101, 1045, 2228, 1045, 1005, 1049, 2028, 1997...",60,"[0.5591208338737488, 0.44087913632392883]",0,1,negative
2,"First, why did I read this book? Do I have an ...","[[CLS], first, ,, why, did, i, read, this, boo...","[101, 2034, 1010, 2339, 2106, 1045, 3191, 2023...",224,"[0.4386332631111145, 0.5613666772842407]",1,1,positive
3,"if they taught this kind of history in school,...","[[CLS], if, they, taught, this, kind, of, hist...","[101, 2065, 2027, 4036, 2023, 2785, 1997, 2381...",69,"[0.48176589608192444, 0.518234133720398]",1,1,positive
4,"Well, well ,well the latest in the o'malley sa...","[[CLS], well, ,, well, ,, well, the, latest, i...","[101, 2092, 1010, 2092, 1010, 2092, 1996, 6745...",66,"[0.24162133038043976, 0.7583786249160767]",1,1,positive
...,...,...,...,...,...,...,...,...
145,"I highly recommend Gary Chapman's ""5 Love Lang...","[[CLS], i, highly, recommend, gary, chapman, '...","[101, 1045, 3811, 16755, 5639, 11526, 1005, 10...",71,"[0.2884887158870697, 0.7115112543106079]",1,1,positive
146,I purchased one of the HP 540 series PDA's (in...,"[[CLS], i, purchased, one, of, the, hp, 540, s...","[101, 1045, 4156, 2028, 1997, 1996, 6522, 2026...",91,"[0.5197734832763672, 0.4802265465259552]",0,0,negative
147,"After reading the many rave reviews, I was exp...","[[CLS], after, reading, the, many, rave, revie...","[101, 2044, 3752, 1996, 2116, 23289, 4391, 101...",192,"[0.5477862358093262, 0.45221370458602905]",0,0,negative
148,"For most of my 7th and 8th grade year, a few o...","[[CLS], for, most, of, my, 7th, and, 8th, grad...","[101, 2005, 2087, 1997, 2026, 5504, 1998, 5893...",211,"[0.45704326033592224, 0.5429567098617554]",1,1,positive


In [68]:
amazon_df['Sentence'][0]

"Ben Harper was brought to my attention through his association with Jack Johnson. Then Direct TV showed Ben Harper this month on their free concert. I only saw part of the show and decided to buy Live from Mars as my first (and last) Ben Harper CD. I can't get into his music...it doesn't have any flow. His guitar playing is mediocre at best, and his vocals even worse. At times I thought Tiny Tim had come back from the dead. I'll stick with Jack Johnson. Ben Harper was not what I expected, and I utterly fail to see what all the hype is about."

# Comparing tokenizers

In [69]:
# Define the tokenizers to be compared
tokenizers = {
    "BART": "facebook/bart-base",
    "DistilBERT": "distilbert-base-uncased",
    "GPT-2": "gpt2",
    "T5": "t5-small",
    "Albert": "albert-base-v2",
    "XLM-Roberta": "xlm-roberta-base"
}

In [70]:
def load_tokenizer(model_name):
    """
    Load and return a tokenizer based on the provided model name.
    """
    return AutoTokenizer.from_pretrained(model_name)

In [71]:

def get_vocab_size(tokenizer):
    """
    Return the vocabulary size of the provided tokenizer.
    """
    return tokenizer.vocab_size

In [72]:
def tokenize_sentence(tokenizer, sentence):
    """
    Tokenize the sentence using the provided tokenizer and return the tokens and token IDs.
    """
    tokens = tokenizer.tokenize(sentence)
    token_ids = tokenizer.encode(sentence, add_special_tokens=True)
    return tokens, token_ids

In [73]:
def print_tokenizer_info(tokenizer_name, tokens, token_ids, vocab_size):
    """
    Print information about the tokenizer, including tokens, token IDs, and vocabulary size.
    """
    print(f"\n{tokenizer_name} Vocabulary Size: {vocab_size}")
    print(f"{tokenizer_name} Tokenized Sentence:")
    print(tokens)
    print(f"{tokenizer_name} Token IDs:")
    print(token_ids)


In [74]:

# Define the tokenizers to be compared
tokenizers = {
    "BART": "facebook/bart-base",
    "DistilBERT": "distilbert-base-uncased",
    "GPT-2": "gpt2",
    "T5": "t5-small",
    "Albert": "albert-base-v2",
    "XLM-Roberta": "xlm-roberta-base"
}

In [75]:
# Sentence to tokenize
sentence = "This is how a tokenized expression looks. En español es distinto"

In [76]:
# Load tokenizers and compare them
for name, model_name in tokenizers.items():
    tokenizer = load_tokenizer(model_name)
    vocab_size = get_vocab_size(tokenizer)
    tokens, token_ids = tokenize_sentence(tokenizer, sentence)
    print_tokenizer_info(name, tokens, token_ids, vocab_size)




BART Vocabulary Size: 50265
BART Tokenized Sentence:
['This', 'Ġis', 'Ġhow', 'Ġa', 'Ġtoken', 'ized', 'Ġexpression', 'Ġlooks', '.', 'ĠEn', 'Ġes', 'pa', 'Ã±', 'ol', 'Ġes', 'Ġdist', 'into']
BART Token IDs:
[0, 713, 16, 141, 10, 19233, 1538, 8151, 1326, 4, 2271, 2714, 6709, 6303, 1168, 2714, 7018, 12473, 2]

DistilBERT Vocabulary Size: 30522
DistilBERT Tokenized Sentence:
['this', 'is', 'how', 'a', 'token', '##ized', 'expression', 'looks', '.', 'en', 'es', '##pan', '##ol', 'es', 'di', '##sti', '##nto']
DistilBERT Token IDs:
[101, 2023, 2003, 2129, 1037, 19204, 3550, 3670, 3504, 1012, 4372, 9686, 9739, 4747, 9686, 4487, 16643, 13663, 102]

GPT-2 Vocabulary Size: 50257
GPT-2 Tokenized Sentence:
['This', 'Ġis', 'Ġhow', 'Ġa', 'Ġtoken', 'ized', 'Ġexpression', 'Ġlooks', '.', 'ĠEn', 'Ġes', 'pa', 'Ã±', 'ol', 'Ġes', 'Ġdist', 'into']
GPT-2 Token IDs:
[1212, 318, 703, 257, 11241, 1143, 5408, 3073, 13, 2039, 1658, 8957, 12654, 349, 1658, 1233, 20424]

T5 Vocabulary Size: 32100
T5 Tokenized Sentence:
