# Fine-tune a BERT classifier for sentiment classification
Example from https://huggingface.co/docs/transformers/training

# Development environment


In [1]:
# Use the %pip magic (not `!pip`) so packages are installed into the environment
# of the running kernel; the extras spec is quoted so the shell cannot glob it.
%pip install -U "transformers[torch]"
%pip install -U accelerate
%pip install datasets
%pip install evaluate
%pip install scikit-learn
%pip install wandb


Collecting transformers[torch]
  Downloading transformers-4.48.3-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0->transformers[torch])
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0->transformers[torch])
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0->transformers[torch])
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0->transformers[torch])
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0->trans

In [2]:
import warnings
warnings.filterwarnings("ignore")

import transformers
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification
import wandb
import time

import numpy as np
import evaluate


# Login to Weights and Biases


In [3]:
# Authenticate with Weights & Biases (prompts for an API key when none is stored).
wandb.login()


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjohnlohjy[0m ([33mjohnlohjy-sutd-singapore-university-of-technology-design[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [4]:
# Start a W&B run; the hyperparameters below are read back through `config`
# by the later cells, so they are defined in exactly one place.
wandb.init(
      # Set the project where this run will be logged
      project="sutd-mlops-project",
      # We pass a run name (otherwise it'll be randomly assigned, like sunshine-lollypop-10)
      # Plain string: there are no placeholders, so no f-string prefix is needed.
      name="experiment_session3_run_1",
      # Track hyperparameters and run metadata
      config={
          "learning_rate": 3e-5, # 2e-5
          "weight_decay": 0.01,
          "num_train_epochs": 5, # 2
          "train_subsample_size": 1000,
          "architecture": "distilbert",
          "dataset_name": "rotten_tomatoes",
          "model_name": "distilbert-base-uncased"
      })
# Attribute-style handle on the values passed in `config=` above.
config = wandb.config

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


# Prepare data


In [5]:
# Load the dataset named in the run config.
dataset = load_dataset(config.dataset_name)
# Show the first training example (a dict with 'text' and 'label') as the cell output.
dataset["train"][0]

README.md:   0%|          | 0.00/7.46k [00:00<?, ?B/s]

train.parquet:   0%|          | 0.00/699k [00:00<?, ?B/s]

validation.parquet:   0%|          | 0.00/90.0k [00:00<?, ?B/s]

test.parquet:   0%|          | 0.00/92.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8530 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1066 [00:00<?, ? examples/s]

{'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .',
 'label': 1}

In [6]:
# Tokenizer that matches the checkpoint named in the run config.
tokenizer = AutoTokenizer.from_pretrained(config.model_name)


def tokenize_function(examples):
    """Tokenize the "text" field of a batch with truncation and max-length padding."""
    texts = examples["text"]
    return tokenizer(texts, truncation=True, padding="max_length")


# Apply the tokenizer to every split, one batch of examples at a time.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/8530 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

In [7]:
def _shuffled_head(split_name, size):
    """Shuffle the named tokenized split with seed 42 and keep its first `size` rows."""
    shuffled = tokenized_datasets[split_name].shuffle(seed=42)
    return shuffled.select(range(size))


# Training subsample size comes from the run config; validation/test use 100 examples each.
small_train_dataset = _shuffled_head("train", config.train_subsample_size)
small_eval_dataset = _shuffled_head("validation", 100)
small_test_dataset = _shuffled_head("test", 100)

# Train the model


In [8]:
# Number of distinct label values present in the training split.
num_labels = np.unique(dataset["train"]["label"]).size
# Load the pretrained checkpoint with a classification head sized to `num_labels`.
model = AutoModelForSequenceClassification.from_pretrained(
    config.model_name,
    num_labels=num_labels,
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Accuracy metric object; compute_metrics calls its .compute() method.
metric = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [10]:
def compute_metrics(eval_pred):
    """Score a (logits, labels) pair with the module-level `metric`.

    The predicted class of each example is the index of its largest logit
    along the last axis.
    """
    scores, gold_labels = eval_pred
    return metric.compute(
        references=gold_labels,
        predictions=np.argmax(scores, axis=-1),
    )

In [11]:
# Training configuration; hyperparameters come from the W&B run config so the
# logged values and the values actually used cannot drift apart.
training_args = TrainingArguments(
    output_dir=".",
    report_to="wandb",
    # `evaluation_strategy` was renamed to `eval_strategy`; the old keyword is
    # deprecated and rejected by newer transformers releases, which is what
    # `pip install -U transformers` installs.
    eval_strategy="epoch",
    learning_rate=config.learning_rate,
    weight_decay=config.weight_decay,
    num_train_epochs=config.num_train_epochs,
    logging_steps=20)

In [12]:
# Wire the model, the training configuration, the subsampled data and the metric callback together.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=small_eval_dataset,
    train_dataset=small_train_dataset,
)

In [13]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,0.4818,0.485985,0.79
2,0.1864,0.64222,0.81
3,0.1023,0.78225,0.82
4,0.0718,1.015839,0.79
5,0.0122,0.965361,0.8


TrainOutput(global_step=625, training_loss=0.19017729359865188, metrics={'train_runtime': 254.8849, 'train_samples_per_second': 19.617, 'train_steps_per_second': 2.452, 'total_flos': 662336993280000.0, 'train_loss': 0.19017729359865188, 'epoch': 5.0})

# Test the model


In [14]:
# Loss and accuracy on the training subsample the model was fine-tuned on
trainer.evaluate(small_train_dataset)

{'eval_loss': 0.00889758300036192,
 'eval_accuracy': 0.998,
 'eval_runtime': 14.901,
 'eval_samples_per_second': 67.109,
 'eval_steps_per_second': 8.389,
 'epoch': 5.0}

In [15]:
# Loss and accuracy on the 100-example validation subsample
trainer.evaluate(small_eval_dataset)

{'eval_loss': 0.9653609395027161,
 'eval_accuracy': 0.8,
 'eval_runtime': 1.5892,
 'eval_samples_per_second': 62.925,
 'eval_steps_per_second': 8.18,
 'epoch': 5.0}

In [16]:
# Loss and accuracy on the 100-example test subsample
trainer.evaluate(small_test_dataset)


{'eval_loss': 1.1884403228759766,
 'eval_accuracy': 0.76,
 'eval_runtime': 1.5452,
 'eval_samples_per_second': 64.715,
 'eval_steps_per_second': 8.413,
 'epoch': 5.0}

In [17]:
# accuracy of the whole test set - for fair comparison with the classification performance achieved by SGD in previous sessions
def predict(tokenized_test_data, trainer):
    """Return softmax class probabilities and predicted class indices.

    Args:
        tokenized_test_data: Dataset passed unchanged to ``trainer.predict``.
        trainer: Object whose ``predict`` method returns a sequence whose
            first element is a 2-D array of logits (one row per example).

    Returns:
        A ``(pred_prob, pred)`` tuple: ``pred_prob`` holds the row-wise
        softmax of the logits and ``pred`` the argmax index of each row.
    """
    logits = np.asarray(trainer.predict(tokenized_test_data)[0])
    # Softmax is invariant to subtracting a per-row constant. Shifting by the
    # row maximum keeps np.exp from overflowing to inf (and inf/inf = nan)
    # when the logits are large.
    shifted = logits - np.max(logits, axis=1, keepdims=True)
    exp_logits = np.exp(shifted)
    pred_prob = exp_logits / np.sum(exp_logits, axis=1, keepdims=True)
    pred = np.argmax(pred_prob, axis=1)
    return pred_prob, pred

# Score the complete tokenized test split rather than the 100-example subsample.
pred_prob, pred = predict(tokenized_datasets["test"], trainer)
test_labels = dataset["test"]["label"]
n_correct = np.sum(pred == test_labels)
accuracy = n_correct / len(test_labels)
print(f"Accuracy: {accuracy}")
# Log a precision-recall plot for the full test split to the active W&B run.
wandb.sklearn.plot_precision_recall(test_labels, pred_prob, ["negative", "positive"])

Accuracy: 0.8086303939962477


In [None]:
# Close the W&B run started by wandb.init(); the run summary is printed as the cell output.
wandb.finish()


0,1
eval/accuracy,▂▁▃▃█▃▁
eval/loss,▄▄▆▆▁▆█
eval/runtime,▁▁▁▁█▁▁
eval/samples_per_second,█▃▅▃▅▁▂
eval/steps_per_second,█▃▅▃▃▁▂
test/accuracy,▁
test/loss,▁
test/runtime,▁
test/samples_per_second,▁
test/steps_per_second,▁

0,1
eval/accuracy,0.76
eval/loss,0.99287
eval/runtime,1.4998
eval/samples_per_second,66.677
eval/steps_per_second,8.668
test/accuracy,0.80488
test/loss,0.75848
test/runtime,15.4681
test/samples_per_second,68.916
test/steps_per_second,8.663


# What to try next

- train and evaluate with the complete training and test dataset instead of a sample
- experiment with different training parameters (number of epochs, optimizers, batch size, learning rate schedule, ...)
- compare DistilBERT vs the full BERT model: https://huggingface.co/bert-base-uncased
- compare the results with the scikit-learn model from the previous notebook. What is the cost-benefit trade-off between deep learning and traditional ML?
- check out this more detailed sentiment analysis tutorial from Hugging Face: https://huggingface.co/blog/sentiment-analysis-python