# Sentiment Analysis of Movie Reviews using a Pretrained LLM

In this notebook, I fine-tune a pretrained LLM, DistilBERT base model (uncased), on the IMDB movie-review dataset. I deploy the fine-tuned model to Hugging Face.

## Installs and Imports

In [1]:
!pip install datasets
!pip install transformers[torch]
!pip install evaluate
!pip install -U accelerate

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00

In [2]:
import numpy as np
import torch

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
import accelerate
import evaluate

In [3]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Load the Data

In [4]:
ds_movies = load_dataset('imdb')

Downloading readme:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [5]:
# Select a subset of movie reviews on which to investigate hyperparameters.
from numpy import random

rng = random.default_rng()
n_samples = 1_000
indices = rng.choice(len(ds_movies['train']), size=n_samples, replace=False)
ds_movies['train'] = ds_movies['train'].select(indices)

## Pretrained Model and Tokenizer

In [6]:
model_name = 'distilbert-base-uncased'
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [7]:
def tokenize(dataset):
    """Tokenize the movie reviews of dataset."""
    return tokenizer(dataset["text"], padding="max_length", truncation=True)

In [8]:
tokenized_ds_movies = ds_movies.map(tokenize, batched=True)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

## Train and Test

In [9]:
f1_score = evaluate.load('f1')

def compute_metrics(eval_pred):
    """Compute metrics for evaluation."""
    # raw outputs, actual labels
    logits, labels = eval_pred

    # predictions is the highest output probability
    predictions = np.argmax(logits, axis=-1)

    f1 = f1_score.compute(predictions=predictions, references=labels)
    return {'f1': f1['f1']}

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

In [10]:
training_args = TrainingArguments(
    output_dir='movie-review-classifier',
    eval_strategy='epoch',
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    push_to_hub=True,
    hub_model_id='derek-harnett/movie-review-classifier',
    hub_strategy='end'
)

In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds_movies['train'],
    eval_dataset=tokenized_ds_movies['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [12]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,No log,0.367029,0.864276
2,No log,0.297319,0.882769


TrainOutput(global_step=126, training_loss=0.41722146291581413, metrics={'train_runtime': 1040.6948, 'train_samples_per_second': 1.922, 'train_steps_per_second': 0.121, 'total_flos': 264934797312000.0, 'train_loss': 0.41722146291581413, 'epoch': 2.0})

In [13]:
trainer.evaluate()

{'eval_loss': 0.2973193824291229,
 'eval_f1': 0.882769156549007,
 'eval_runtime': 466.4348,
 'eval_samples_per_second': 53.598,
 'eval_steps_per_second': 3.351,
 'epoch': 2.0}

## Push to Hugging Face

In [14]:
trainer.push_to_hub(commit_message='test push_to_hub()')

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

events.out.tfevents.1722641252.c7531b552757.174.1:   0%|          | 0.00/399 [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

events.out.tfevents.1722639745.c7531b552757.174.0:   0%|          | 0.00/5.73k [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/derek-harnett/movie-review-classifier/commit/98aabe03cf00e988bdf8ada8cd3cd002f1ffaf5a', commit_message='test push_to_hub()', commit_description='', oid='98aabe03cf00e988bdf8ada8cd3cd002f1ffaf5a', pr_url=None, pr_revision=None, pr_num=None)