<a href="https://colab.research.google.com/github/kanovotn/sentiment-analysis-model-trainer/blob/master/notebooks/inference_and_evaluation_sentiment_analysis_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Inference and evaluation of sentiment analysis model

Find the full description of this project on my blog: https://kanovotn.github.io/2024-05-01-sentiment-analysis-with-hugging-face/

In [1]:
!pip install datasets evaluate transformers[sentencepiece]

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━

In [2]:
!pip install transformers[torch]

Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[torch])
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->transformers[torch])
  Using cached nvidia_cublas_cu

# Inference on test "imdb" dataset

In [7]:
from datasets import load_dataset, concatenate_datasets


def load_and_split_dataset(dataset_name, train_split=0.6, test_split=0.2, seed=42):
    """Build reproducible train / validation / test subsets of a dataset.

    The dataset's "train" and "test" splits are loaded, concatenated and
    shuffled with a fixed seed. The shuffled data is then cut into three
    contiguous slices: the first ``train_split`` fraction is the training set,
    the next ``test_split`` fraction is the test set, and everything that
    remains is the validation set.

    Args:
        dataset_name: Name passed to ``load_dataset``; the dataset must expose
            "train" and "test" splits.
        train_split: Fraction of all samples used for training (default 0.6).
        test_split: Fraction of all samples used for testing (default 0.2).
        seed: Seed passed to ``shuffle`` so the split is reproducible
            (default 42).

    Returns:
        A tuple ``(dataset_train, dataset_validation, dataset_test)``.
        Note the order: validation comes second, test comes last.

    Raises:
        ValueError: If a ratio is negative or the two ratios sum to more than 1.
    """
    # Validate before loading so a bad ratio does not cost a download.
    if train_split < 0 or test_split < 0:
        raise ValueError("train_split and test_split must be non-negative")
    if train_split + test_split > 1:
        raise ValueError("train_split + test_split must not exceed 1")

    # Load both published splits and merge them into one pool of samples.
    dataset_full = concatenate_datasets(
        [
            load_dataset(dataset_name, split="train"),
            load_dataset(dataset_name, split="test"),
        ]
    )

    # Shuffle with a fixed seed for reproducibility, then materialise the new
    # order so the selections below operate on a flat dataset.
    dataset_full = dataset_full.shuffle(seed=seed).flatten_indices()

    # Sizes are truncated with int(); the remainder goes to the validation set.
    total_samples = len(dataset_full)
    train_size = int(total_samples * train_split)
    test_size = int(total_samples * test_split)
    test_end = train_size + test_size

    # Contiguous slices: [0, train) -> train, [train, test_end) -> test, rest -> validation.
    dataset_train = dataset_full.select(range(train_size))
    dataset_test = dataset_full.select(range(train_size, test_end))
    dataset_validation = dataset_full.select(range(test_end, total_samples))

    return dataset_train, dataset_validation, dataset_test

In [8]:
import pandas as pd
from datasets import load_dataset
from evaluate import evaluator
from transformers import AutoTokenizer

# Hub checkpoints that are benchmarked against each other
models = [
    "lyrisha/distilbert-base-finetuned-imdb-sentiment",
    "lyrisha/distilbert-base-finetuned-sentiment",
    "distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    "siebert/sentiment-roberta-large-english",
]

# Re-create the custom IMDB split; only the test portion is evaluated below
dataset_train, dataset_validation, dataset_test = load_and_split_dataset("imdb")

task_evaluator = evaluator("sentiment-analysis")

# One metrics dict per model, collected in the same order as `models`
results = []
for model in models:
    model_metrics = task_evaluator.compute(
        model_or_pipeline=model,
        data=dataset_test,
        # Map the string labels NEGATIVE / POSITIVE to 0 / 1
        label_mapping={"NEGATIVE": 0, "POSITIVE": 1},
    )
    results.append(model_metrics)

# Rows of the summary table are labelled with the model IDs
df = pd.DataFrame(results, index=models)

Downloading readme:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label'],
    num_rows: 50000
})


Flattening the indices:   0%|          | 0/50000 [00:00<?, ? examples/s]



# Results on IMDB test set

In [9]:
# Display the per-model metrics table built from the IMDB test-split evaluation
df

Unnamed: 0,accuracy,total_time_in_seconds,samples_per_second,latency_in_seconds
lyrisha/distilbert-base-finetuned-imdb-sentiment,0.9257,72.652311,137.641872,0.007265
lyrisha/distilbert-base-finetuned-sentiment,0.9092,72.201125,138.501997,0.00722
distilbert/distilbert-base-uncased-finetuned-sst-2-english,0.8868,72.295551,138.321098,0.00723
siebert/sentiment-roberta-large-english,0.9497,309.411738,32.319394,0.030941


# Inference on test "rotten_tomatoes" dataset

In [3]:
import pandas as pd
from datasets import load_dataset
from evaluate import evaluator
from transformers import pipeline

# Model checkpoints to compare
models = [
    "lyrisha/distilbert-base-finetuned-imdb-sentiment",
    "lyrisha/distilbert-base-finetuned-sentiment",
    "distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    "siebert/sentiment-roberta-large-english",
]

# Test split of the Rotten Tomatoes dataset
data = load_dataset("rotten_tomatoes", split="test")

# Evaluator for the sentiment-analysis task
task_evaluator = evaluator("sentiment-analysis")

# Arguments that are identical for every model under evaluation
eval_kwargs = {
    "data": data,
    "label_mapping": {"NEGATIVE": 0, "POSITIVE": 1},
}

results = []
for model in models:
    # Score the current model on the shared test data
    model_results = task_evaluator.compute(model_or_pipeline=model, **eval_kwargs)
    results.append(model_results)

# Summary table with one row per model ID
df3 = pd.DataFrame(results, index=models)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/7.46k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/699k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/90.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/92.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8530 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1066 [00:00<?, ? examples/s]



config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


tokenizer_config.json:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

# Results on Rotten Tomatoes test set

In [4]:
# Display the per-model metrics table built from the Rotten Tomatoes test-split evaluation
df3

Unnamed: 0,accuracy,total_time_in_seconds,samples_per_second,latency_in_seconds
lyrisha/distilbert-base-finetuned-imdb-sentiment,0.822702,7.1254,149.605642,0.006684
lyrisha/distilbert-base-finetuned-sentiment,0.880863,107.658127,9.901714,0.100993
distilbert/distilbert-base-uncased-finetuned-sst-2-english,0.896811,6.529148,163.267869,0.006125
siebert/sentiment-roberta-large-english,0.920263,21.053129,50.633803,0.01975
