# Information

Author: Thanh Liem (liemkg1234@gmail.com)

Task: Sentiment Analysis

Spoce: Text Classification

Model: DistilBERT (https://huggingface.co/distilbert/distilbert-base-uncased)

Dataset: imdb (https://huggingface.co/datasets/imdb)

Metric: Accuracy, Precision, Recall, F1

Result: Weight and ONNX Quantized (https://github.com/huggingface/optimum)

Hours Used: 8

# Package

In [8]:
!pip install transformers datasets evaluate accelerate 
!pip uninstall -y wandb 
!pip install --quiet optimum[exporters,onnxruntime-gpu]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


# Hyper Parameters

In [9]:
model_name = "distilbert/distilbert-base-uncased"
folder_save = "/kaggle/working/distilbert_imdb"
epochs = 10
batch_size = 16
max_input_length = 512

# Load dataset

Dataset Summary

Large Movie Review Dataset. This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.

In [10]:
from datasets import load_dataset, DatasetDict

raw_datasets = load_dataset("imdb")

raw_datasets = DatasetDict({
    'train': raw_datasets['train'].select(range(5000, raw_datasets['train'].num_rows)),
    'validation': raw_datasets['train'].select(range(5000)),
    'test': raw_datasets['test']
})

print(raw_datasets)
print(raw_datasets['test'][0])

  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
})
{'text': 'I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn\'t match the background, and painfully one-dimensional characters cannot be overcome with a \'sci-fi\' setting. (I\'m sure there are those of you out there who think Babylon 5 is good sci-fi TV. It\'s not. It\'s clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat impor

# Preprocess

In [11]:
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding


tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_function(examples):
    return tokenizer(examples["text"], max_length=max_input_length, truncation=True, padding='max_length')

tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

print(tokenized_datasets)
print(tokenized_datasets['test'][0])

  0%|          | 0/20 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
})
{'text': 'I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn\'t match the background, and painfully one-dimensional characters cannot be overcome with a \'sci-fi\' setting. (I\'m sure there are those of you out there who think Babylon 5 is good sci-fi TV. It\'s not. It\'s clichéd and uninspiring.) While US viewers might like emotion and character developme

# Metrics

In [12]:
import evaluate
import numpy as np

accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
    accuracy_result = accuracy.compute(predictions=predictions, references=labels)
    precision_result = precision.compute(predictions=predictions, references=labels, average='macro')
    recall_result = recall.compute(predictions=predictions, references=labels, average='macro')
    f1_result = f1.compute(predictions=predictions, references=labels, average='macro')

    return {
        'accuracy': accuracy_result["accuracy"],
        'precision': precision_result["precision"],
        'recall': recall_result["recall"],
        'f1': f1_result["f1"],
    }

# Model

In [13]:
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=2, id2label=id2label, label2id=label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Fine-tune

In [14]:
args = TrainingArguments(
    output_dir=folder_save,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    push_to_hub=False,
    report_to=None,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2777,0.315077,0.8798,0.5,0.4399,0.468029
2,0.1708,0.282552,0.9054,0.5,0.4527,0.475176
3,0.1189,0.26705,0.9314,0.5,0.4657,0.482241
4,0.0594,0.586662,0.8412,0.5,0.4206,0.456876
5,0.0316,0.743903,0.8574,0.5,0.4287,0.461613
6,0.0235,0.779007,0.8618,0.5,0.4309,0.462885
7,0.0167,0.615299,0.901,0.5,0.4505,0.473961
8,0.0104,1.112934,0.8552,0.5,0.4276,0.460975


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


KeyboardInterrupt: 

Do validation dataset chỉ có 1 class nên metrics không tốt => cần shuffle trước khi split ra cho validation

Tuy nhiên trên test dataset vẫn tốt: > 90% ở các chỉ số

In [None]:
%cd /kaggle/working/distilbert_imdb
!ls
!find . -mindepth 1 | grep -v '^./checkpoint-1875' | xargs rm -rf
!ls

In [22]:
model_checkpoint = "/kaggle/input/distilbert-imdb-finetuned/checkpoint-1875"
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)

print(test_dataset[0]['label'])
pipe(test_dataset[0]['text'])

0


[{'label': 'NEGATIVE', 'score': 0.9980358481407166}]

# Eval

In [30]:
test_dataset = raw_datasets['test']
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# Original
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
model_checkpoint = "/kaggle/input/distilbert-imdb-finetuned/checkpoint-1875"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
model = model.to(device)

# # ONNX + Quantized
# from optimum.onnxruntime import ORTModelForSequenceClassification
# from transformers import AutoTokenizer
# model_checkpoint = "/kaggle/input/distilbert-imdb-finetuned/distilbert_imdb_onnx_quantized"
# tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# model = ORTModelForSequenceClassification.from_pretrained(model_checkpoint, use_cache=False)


pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, max_length=max_input_length)

texts = []
labels = []
predictions = []
for i in range(len(test_dataset['text'])):
    prediction_label = pipe(test_dataset[i]['text'])
    prediction = label2id[prediction_label[0]['label']]
    # append
    texts.append(test_dataset[i]['text'])
    labels.append(test_dataset[i]['label'])
    predictions.append(prediction)


cuda


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


NameError: name 'distilbert_imdb' is not defined

In [31]:
import pandas as pd
data = {
    'text': texts,
    'label': labels,
    'predict': predictions,
}

df = pd.DataFrame(data)
df.to_csv(f'eval_distilbert_imdb.csv', index=False)
df.to_excel(f'eval_distilbert_imdb.xlsx', index=False)

import evaluate

# Metric
accuracy_test = evaluate.load("accuracy").compute(predictions=predictions, references=labels)
precision_test = evaluate.load("precision").compute(predictions=predictions, references=labels, average='macro')
recall_test = evaluate.load("recall").compute(predictions=predictions, references=labels, average='macro')
f1_test = evaluate.load("f1").compute(predictions=predictions, references=labels, average='macro')

print(f"accuracy: {accuracy_test}, precision: {precision_test}, recall: {recall_test}, f1: {f1_test}")

accuracy: {'accuracy': 0.92656}, precision: {'precision': 0.9265711823076015}, recall: {'recall': 0.92656}, f1: {'f1': 0.9265595187004618}


# Interface

In [None]:
# Original
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
model_checkpoint = "/kaggle/input/distilbert-imdb-finetuned/checkpoint-1875"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)

# # ONNX + Quantized
# from optimum.onnxruntime import ORTModelForSequenceClassification
# from transformers import AutoTokenizer
# model_checkpoint = "/kaggle/input/distilbert-imdb-finetuned/distilbert_imdb_onnx_quantized"
# tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# model = ORTModelForSequenceClassification.from_pretrained(model_checkpoint, use_cache=False)


pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, max_length=max_input_length)
pipe("Hello")

# ONNX + Quantized

In [None]:
%cd /kaggle/working/distilbert_imdb

# Export
!optimum-cli export onnx \
  --task text-classification \
  -m checkpoint-1875 \
  --optimize O1 \
  distilbert_imdb_onnx

In [None]:
# Quantize
!optimum-cli onnxruntime quantize \
  --avx512 \
  --onnx_model distilbert_imdb_onnx \
  --output distilbert_imdb_onnx_quantized

In [None]:
!zip -r /kaggle/working/kaggle.zip /kaggle/working

In [None]:
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import AutoTokenizer

model_interface = "models/distilbert_imdb/distilbert_imdb_onnx_quantized"
tokenizer = AutoTokenizer.from_pretrained(model_interface)
model = ORTModelForSequenceClassification.from_pretrained(model_interface, use_cache=False)
