In [1]:
import evaluate
import pandas as pd
import torch
from datasets import load_dataset
from dotenv import load_dotenv
from evaluate import evaluator
from transformers import BitsAndBytesConfig, AutoTokenizer, AutoModelForSequenceClassification, pipeline

from peft import PeftConfig, PeftModel

In [2]:
# Read key/value pairs from a .env file into the process environment.
# The call's return value is this cell's displayed output.
# NOTE(review): presumably this supplies credentials/settings for the Hugging Face
# libraries used below — confirm which variables are actually required.
load_dotenv()

True

## Load the models. Use the pretrained models with quantization and test their performance.

In [3]:
# 4-bit bitsandbytes quantization settings used for the "with quantization" run.
#
# Only arguments that BitsAndBytesConfig actually defines are passed here.
# The former `bnb_4bit_weight` / `bnb_4bit_activation` entries are not
# BitsAndBytesConfig parameters: they were swallowed by **kwargs and ignored,
# so they had no effect on how the model was quantized.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # store linear-layer weights in 4 bit
    bnb_4bit_quant_type="nf4",              # NormalFloat4 quantization data type
    bnb_4bit_use_double_quant=True,         # also quantize the quantization constants
    bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for the matmul computation
)

In [4]:
# Identifiers of the PEFT checkpoints to evaluate; each one is the base model
# name under the common 'peft/' prefix.
models = [
    f'peft/{base_name}'
    for base_name in (
        'bert-base-uncased',
        'bert-large-uncased',
        'roberta-base',
        'roberta-large',
        'distilbert-base-uncased',
    )
]

In [5]:
%%time

# Without quantization 
data = load_dataset("imdb", split="test").shuffle(seed=42).select(range(1000))
task_evaluator = evaluator('sentiment-analysis')
results = []

for model_id in models:
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_id,
        num_labels=2,
    )
    eval_results = task_evaluator.compute(
        model_or_pipeline=pipeline('sentiment-analysis', model=model, tokenizer=tokenizer),
        data=data,
        label_mapping={"LABEL_0": 0, "LABEL_1": 1},
        metric=evaluate.combine(["accuracy", "recall", "precision", "f1"]),
    )
    print(f'{model_id=} {eval_results=}')
    results.append(eval_results)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model_id='peft/bert-base-uncased' eval_results={'accuracy': 0.921, 'recall': 0.9262295081967213, 'precision': 0.9131313131313131, 'f1': 0.9196337741607324, 'total_time_in_seconds': 263.3712404000107, 'samples_per_second': 3.7969217841750322, 'latency_in_seconds': 0.2633712404000107}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model_id='peft/bert-large-uncased' eval_results={'accuracy': 0.923, 'recall': 0.9200819672131147, 'precision': 0.9219712525667351, 'f1': 0.921025641025641, 'total_time_in_seconds': 910.1546687999507, 'samples_per_second': 1.0987143551309926, 'latency_in_seconds': 0.9101546687999508}


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model_id='peft/roberta-base' eval_results={'accuracy': 0.937, 'recall': 0.9139344262295082, 'precision': 0.9550321199143469, 'f1': 0.9340314136125655, 'total_time_in_seconds': 259.067527599982, 'samples_per_second': 3.859997465772972, 'latency_in_seconds': 0.259067527599982}


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model_id='peft/roberta-large' eval_results={'accuracy': 0.965, 'recall': 0.9487704918032787, 'precision': 0.9788583509513742, 'f1': 0.963579604578564, 'total_time_in_seconds': 885.460542100016, 'samples_per_second': 1.1293558012515552, 'latency_in_seconds': 0.885460542100016}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model_id='peft/distilbert-base-uncased' eval_results={'accuracy': 0.861, 'recall': 0.8442622950819673, 'precision': 0.8673684210526316, 'f1': 0.8556593977154725, 'total_time_in_seconds': 130.0178044999484, 'samples_per_second': 7.691254315868693, 'latency_in_seconds': 0.1300178044999484}
CPU times: total: 3h 51min 20s
Wall time: 41min 41s


In [12]:
%%time

# With quantization
quantization_results = []

for model_id in models:
    peft_config = PeftConfig.from_pretrained(model_id)
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    base_model = AutoModelForSequenceClassification.from_pretrained(model_id.split('/')[1])
    model = PeftModel.from_pretrained(
        model=base_model,
        model_id=model_id,
        quantization_config=bnb_config,
        num_labels=2,
    )
    eval_results = task_evaluator.compute(
        model_or_pipeline=pipeline('sentiment-analysis', model=model, tokenizer=tokenizer),
        data=data,
        label_mapping={"LABEL_0": 0, "LABEL_1": 1},
        metric=evaluate.combine(["accuracy", "recall", "precision", "f1"]),
    )
    print(f'{model_id=} {eval_results=}')
    quantization_results.append(eval_results)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The model 'PeftModelForSequenceClassification' is not supported for sentiment-analysis. Supported models are ['AlbertForSequenceClassification', 'BartForSequenceClassification', 'BertForSequenceClassification', 'BigBirdForSequenceClassification', 'BigBirdPegasusForSequenceClassification', 'BioGptForSequenceClassification', 'BloomForSequenceClassification', 'CamembertForSequenceClassification', 'CanineForSequenceClassification', 'LlamaForSequenceClassification', 'ConvBertForSequenceClassification', 'CTRLForSequenceClassification', 'Data2VecTextForSequenceClassification', 'DebertaForSequenceClassification', 'DebertaV2ForSequenceClassification', 'DistilBertForSequenceClassification', 'ElectraForSequen

model_id='peft/bert-base-uncased' eval_results={'accuracy': 0.923, 'recall': 0.9262295081967213, 'precision': 0.9168356997971603, 'f1': 0.9215086646279307, 'total_time_in_seconds': 261.37462470005266, 'samples_per_second': 3.8259261056714147, 'latency_in_seconds': 0.2613746247000526}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The model 'PeftModelForSequenceClassification' is not supported for sentiment-analysis. Supported models are ['AlbertForSequenceClassification', 'BartForSequenceClassification', 'BertForSequenceClassification', 'BigBirdForSequenceClassification', 'BigBirdPegasusForSequenceClassification', 'BioGptForSequenceClassification', 'BloomForSequenceClassification', 'CamembertForSequenceClassification', 'CanineForSequenceClassification', 'LlamaForSequenceClassification', 'ConvBertForSequenceClassification', 'CTRLForSequenceClassification', 'Data2VecTextForSequenceClassification', 'DebertaForSequenceClassification', 'DebertaV2ForSequenceClassification', 'DistilBertForSequenceClassification', 'ElectraForSeque

model_id='peft/bert-large-uncased' eval_results={'accuracy': 0.923, 'recall': 0.9200819672131147, 'precision': 0.9219712525667351, 'f1': 0.921025641025641, 'total_time_in_seconds': 904.6657828000607, 'samples_per_second': 1.1053805935987402, 'latency_in_seconds': 0.9046657828000606}


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The model 'PeftModelForSequenceClassification' is not supported for sentiment-analysis. Supported models are ['AlbertForSequenceClassification', 'BartForSequenceClassification', 'BertForSequenceClassification', 'BigBirdForSequenceClassification', 'BigBirdPegasusForSequenceClassification', 'BioGptForSequenceClassification', 'BloomForSequenceClassification', 'CamembertForSequenceClassification', 'CanineForSequenceClassification', 'LlamaForSequenceClassification', 'ConvBertForSequenceClassification', 'CTRLForSequenceClassification', 'Data2VecTextForSequenceClassification', 'DebertaForSequenceClassification', 'DebertaV2ForSequenceClass

model_id='peft/roberta-base' eval_results={'accuracy': 0.937, 'recall': 0.9098360655737705, 'precision': 0.958963282937365, 'f1': 0.9337539432176656, 'total_time_in_seconds': 252.01753239997197, 'samples_per_second': 3.9679779040647065, 'latency_in_seconds': 0.25201753239997193}


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The model 'PeftModelForSequenceClassification' is not supported for sentiment-analysis. Supported models are ['AlbertForSequenceClassification', 'BartForSequenceClassification', 'BertForSequenceClassification', 'BigBirdForSequenceClassification', 'BigBirdPegasusForSequenceClassification', 'BioGptForSequenceClassification', 'BloomForSequenceClassification', 'CamembertForSequenceClassification', 'CanineForSequenceClassification', 'LlamaForSequenceClassification', 'ConvBertForSequenceClassification', 'CTRLForSequenceClassification', 'Data2VecTextForSequenceClassification', 'DebertaForSequenceClassification', 'DebertaV2ForSequenceClas

model_id='peft/roberta-large' eval_results={'accuracy': 0.965, 'recall': 0.9487704918032787, 'precision': 0.9788583509513742, 'f1': 0.963579604578564, 'total_time_in_seconds': 871.5159360000398, 'samples_per_second': 1.1474259490763394, 'latency_in_seconds': 0.8715159360000398}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The model 'PeftModelForSequenceClassification' is not supported for sentiment-analysis. Supported models are ['AlbertForSequenceClassification', 'BartForSequenceClassification', 'BertForSequenceClassification', 'BigBirdForSequenceClassification', 'BigBirdPegasusForSequenceClassification', 'BioGptForSequenceClassification', 'BloomForSequenceClassification', 'CamembertForSequenceClassification', 'CanineForSequenceClassification', 'LlamaForSequenceClassification', 'ConvBertForSequenceClassification', 'CTRLForSequenceClassification', 'Data2VecTextForSequenceClassification', 'DebertaForSequenceClassification', 'DebertaV2ForSequenceClassificatio

model_id='peft/distilbert-base-uncased' eval_results={'accuracy': 0.862, 'recall': 0.8463114754098361, 'precision': 0.8676470588235294, 'f1': 0.8568464730290456, 'total_time_in_seconds': 143.15038430003915, 'samples_per_second': 6.9856606036350435, 'latency_in_seconds': 0.14315038430003915}
CPU times: total: 3h 49min 23s
Wall time: 41min 5s


In [14]:
# Baseline (no quantization) metrics: one row per checkpoint, columns in a fixed order.
df = pd.DataFrame(data=results, index=models)
df.loc[
    :,
    [
        "accuracy",
        "recall",
        "precision",
        "f1",
        "total_time_in_seconds",
        'samples_per_second',
        'latency_in_seconds',
    ],
]

Unnamed: 0,accuracy,recall,precision,f1,total_time_in_seconds,samples_per_second,latency_in_seconds
peft/bert-base-uncased,0.921,0.92623,0.913131,0.919634,263.37124,3.796922,0.263371
peft/bert-large-uncased,0.923,0.920082,0.921971,0.921026,910.154669,1.098714,0.910155
peft/roberta-base,0.937,0.913934,0.955032,0.934031,259.067528,3.859997,0.259068
peft/roberta-large,0.965,0.94877,0.978858,0.96358,885.460542,1.129356,0.885461
peft/distilbert-base-uncased,0.861,0.844262,0.867368,0.855659,130.017804,7.691254,0.130018


In [15]:
# Metrics from the quantization run: one row per checkpoint, columns in a fixed order.
dfq = pd.DataFrame(data=quantization_results, index=models)
dfq.loc[
    :,
    [
        "accuracy",
        "recall",
        "precision",
        "f1",
        "total_time_in_seconds",
        'samples_per_second',
        'latency_in_seconds',
    ],
]

Unnamed: 0,accuracy,recall,precision,f1,total_time_in_seconds,samples_per_second,latency_in_seconds
peft/bert-base-uncased,0.923,0.92623,0.916836,0.921509,261.374625,3.825926,0.261375
peft/bert-large-uncased,0.923,0.920082,0.921971,0.921026,904.665783,1.105381,0.904666
peft/roberta-base,0.937,0.909836,0.958963,0.933754,252.017532,3.967978,0.252018
peft/roberta-large,0.965,0.94877,0.978858,0.96358,871.515936,1.147426,0.871516
peft/distilbert-base-uncased,0.862,0.846311,0.867647,0.856846,143.150384,6.985661,0.14315
