In [1]:
import evaluate
import pandas as pd
import torch
from datasets import load_dataset
from dotenv import load_dotenv
from evaluate import evaluator
from transformers import BitsAndBytesConfig, AutoTokenizer, AutoModelForSequenceClassification, pipeline


In [2]:
# Read variables from a .env file into the process environment before any
# dataset/model access. Presumably this supplies Hugging Face credentials or
# cache settings — TODO confirm which variables the notebook relies on.
load_dotenv()

True

## Load the model. Use the pretrained model with quantization and test its performance.

In [11]:
# Checkpoints to benchmark. Every identifier shares the 'peft/' prefix
# (presumably local PEFT fine-tuned checkpoints — TODO confirm).
models = [
    f'peft/{checkpoint}'
    for checkpoint in (
        'bert-base-uncased',
        'bert-large-uncased',
        # 'roberta-base',    # currently disabled
        # 'roberta-large',   # currently disabled
        'distilbert-base-uncased',
    )
]

In [5]:
# Evaluation set: a fixed, reproducible 1000-example sample of the IMDB test
# split (shuffled with seed 42 before taking the first 1000 rows).
data = (
    load_dataset("imdb", split="test")
    .shuffle(seed=42)
    .select(range(1000))
)

# Evaluator from `evaluate`, reused by every benchmark run below.
task_evaluator = evaluator(task='sentiment-analysis')

In [None]:
%%time

# Baseline run: evaluate each checkpoint without any quantization config.
results = []

for model_id in models:
    # Load the tokenizer and a two-label sequence-classification model.
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)

    # Wrap both in a pipeline so the evaluator can call it on raw text.
    classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)

    # The pipeline emits LABEL_0 / LABEL_1; map them to the dataset's 0 / 1.
    eval_results = task_evaluator.compute(
        model_or_pipeline=classifier,
        data=data,
        label_mapping={"LABEL_0": 0, "LABEL_1": 1},
        metric=evaluate.combine(["accuracy", "recall", "precision", "f1"]),
    )

    print(f'{model_id=} {eval_results=}')
    # One entry per model, in the same order as `models`.
    results.append(eval_results)

In [7]:
# 4-bit bitsandbytes quantization settings, passed to `from_pretrained` via
# `quantization_config=` in the quantized benchmark run.
#
# `bnb_4bit_weight` / `bnb_4bit_activation` were removed: they are not
# BitsAndBytesConfig parameters, so they were swallowed by **kwargs and had no
# effect on how the model is quantized.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                       # load weights in 4-bit precision
    bnb_4bit_quant_type="nf4",               # use the "nf4" 4-bit quantization type
    bnb_4bit_use_double_quant=True,          # enable double (nested) quantization
    bnb_4bit_compute_dtype=torch.bfloat16,   # run computation in bfloat16
)

In [12]:
%%time

# With quantization: evaluate each checkpoint loaded through `bnb_config`.
quantization_results = []

for model_id in models:
    # `num_labels=2` is passed explicitly so the model is built exactly as in
    # the non-quantized run; the only intended difference between the two
    # benchmarks is `quantization_config`. Two labels also match the
    # LABEL_0 / LABEL_1 mapping used below.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_id,
        num_labels=2,
        quantization_config=bnb_config,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Wrap model + tokenizer in a pipeline so the evaluator can run it on text.
    classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)

    eval_results = task_evaluator.compute(
        model_or_pipeline=classifier,
        data=data,
        # Map the pipeline's string labels onto the dataset's integer labels.
        label_mapping={"LABEL_0": 0, "LABEL_1": 1},
        metric=evaluate.combine(["accuracy", "recall", "precision", "f1"]),
    )
    print(f'{model_id=} {eval_results=}')
    # One entry per model, in the same order as `models`.
    quantization_results.append(eval_results)

`low_cpu_mem_usage` was None, now set to True since model is quantized.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model_id='peft/bert-base-uncased' eval_results={'accuracy': 0.916, 'recall': 0.9057377049180327, 'precision': 0.9208333333333333, 'f1': 0.9132231404958677, 'total_time_in_seconds': 37.72987239994109, 'samples_per_second': 26.504197771990384, 'latency_in_seconds': 0.037729872399941085}


`low_cpu_mem_usage` was None, now set to True since model is quantized.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


model_id='peft/bert-large-uncased' eval_results={'accuracy': 0.924, 'recall': 0.9282786885245902, 'precision': 0.917004048582996, 'f1': 0.9226069246435845, 'total_time_in_seconds': 66.75712840002961, 'samples_per_second': 14.979673691290118, 'latency_in_seconds': 0.06675712840002962}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model_id='peft/distilbert-base-uncased' eval_results={'accuracy': 0.856, 'recall': 0.8278688524590164, 'precision': 0.8706896551724138, 'f1': 0.8487394957983193, 'total_time_in_seconds': 18.67440090002492, 'samples_per_second': 53.54924130383564, 'latency_in_seconds': 0.01867440090002492}
CPU times: total: 52 s
Wall time: 2min 21s


In [None]:
# Tabulate the non-quantized metrics: one row per checkpoint, indexed by model id.
df = pd.DataFrame(results, index=models)

# Show quality metrics first, then the timing columns reported by the evaluator.
df.loc[:, [
    "accuracy",
    "recall",
    "precision",
    "f1",
    "total_time_in_seconds",
    'samples_per_second',
    'latency_in_seconds',
]]

In [13]:
# Tabulate the quantized-run metrics: one row per checkpoint, indexed by model id.
dfq = pd.DataFrame(data=quantization_results, index=models)

# Display the columns in a fixed order: quality metrics, then timing.
dfq.loc[
    :,
    ["accuracy", "recall", "precision", "f1",
     "total_time_in_seconds", 'samples_per_second', 'latency_in_seconds'],
]

Unnamed: 0,accuracy,recall,precision,f1,total_time_in_seconds,samples_per_second,latency_in_seconds
peft/bert-base-uncased,0.916,0.905738,0.920833,0.913223,37.729872,26.504198,0.03773
peft/bert-large-uncased,0.924,0.928279,0.917004,0.922607,66.757128,14.979674,0.066757
peft/distilbert-base-uncased,0.856,0.827869,0.87069,0.848739,18.674401,53.549241,0.018674
