In [None]:
import evaluate
import pandas as pd
import torch
from datasets import load_dataset
from dotenv import load_dotenv
from evaluate import evaluator
from transformers import BitsAndBytesConfig, AutoTokenizer, AutoModelForSequenceClassification, pipeline

from peft import PeftConfig, PeftModel

In [None]:
# Read key/value pairs from a .env file into the process environment.
# NOTE(review): presumably this supplies credentials (e.g. a Hugging Face token)
# needed for downloads later in the notebook — confirm which variables are required.
load_dotenv()

## Load the models. Evaluate the pretrained models with and without quantization and compare their performance.

In [None]:
# 4-bit bitsandbytes quantization settings used when loading the base models.
# Only options that BitsAndBytesConfig actually defines are passed: unknown
# keyword arguments are not applied to the quantization and merely obscure
# what the configuration really is.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # store linear-layer weights in 4 bits
    bnb_4bit_quant_type="nf4",              # NormalFloat4 data type for the stored weights
    bnb_4bit_use_double_quant=True,         # also quantize the quantization constants
    bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for the matmul computation
)

In [None]:
# Adapter checkpoints to benchmark, in evaluation order. The same list is used
# as the row index of the result tables further down.
# Currently disabled: 'saved_model/distilbert-base-peft'
models = [
    'peft/' + checkpoint_name
    for checkpoint_name in (
        'bert-base-uncased',
        'bert-large-uncased',
        'roberta-base',
        'roberta-large',
    )
]

In [None]:
%%time

# Baseline run: every checkpoint is loaded without a quantization config.
# `data` and `task_evaluator` are created here and reused by the quantized run.
imdb_test = load_dataset("imdb", split="test")
data = imdb_test.shuffle(seed=42).select(range(1000))  # fixed 1000-review sample
task_evaluator = evaluator('sentiment-analysis')
results = []

for model_id in models:
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)

    classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)
    combined_metric = evaluate.combine(["accuracy", "recall", "precision", "f1"])

    # Map the pipeline's label strings onto the integer labels of the dataset.
    eval_results = task_evaluator.compute(
        model_or_pipeline=classifier,
        data=data,
        label_mapping={"LABEL_0": 0, "LABEL_1": 1},
        metric=combined_metric,
    )
    print(f'{model_id=} {eval_results=}')
    results.append(eval_results)

In [None]:
%%time

# With quantization
# Each adapter is attached to its base model loaded in 4-bit via `bnb_config`.
# Reuses `data` and `task_evaluator` from the un-quantized run so both runs
# score exactly the same samples with the same evaluator.
quantization_results = []

for model_id in models:
    # The adapter config records which base checkpoint the adapter belongs to.
    peft_config = PeftConfig.from_pretrained(model_id)
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Quantization and the classification-head size are properties of the base
    # model, so they are passed here rather than to PeftModel.from_pretrained.
    base_model = AutoModelForSequenceClassification.from_pretrained(
        peft_config.base_model_name_or_path,
        quantization_config=bnb_config,
        num_labels=2,
    )
    # PeftModel.from_pretrained needs the instantiated base model as its first
    # argument and the adapter location as its second.
    model = PeftModel.from_pretrained(base_model, model_id)

    eval_results = task_evaluator.compute(
        model_or_pipeline=pipeline('sentiment-analysis', model=model, tokenizer=tokenizer),
        data=data,
        # Map the pipeline's label strings onto the integer labels of the dataset.
        label_mapping={"LABEL_0": 0, "LABEL_1": 1},
        metric=evaluate.combine(["accuracy", "recall", "precision", "f1"]),
    )
    print(f'{model_id=} {eval_results=}')
    quantization_results.append(eval_results)

In [None]:
# Results of the un-quantized run: one row per checkpoint, quality metrics
# first, then the timing figures reported by the evaluator.
df = pd.DataFrame(results, index=models)
df.loc[
    :,
    [
        "accuracy",
        "recall",
        "precision",
        "f1",
        "total_time_in_seconds",
        'samples_per_second',
        'latency_in_seconds',
    ],
]

In [None]:
# Results of the quantized run: one row per checkpoint, quality metrics
# first, then the timing figures reported by the evaluator.
dfq = pd.DataFrame(quantization_results, index=models)
dfq.loc[
    :,
    [
        "accuracy",
        "recall",
        "precision",
        "f1",
        "total_time_in_seconds",
        'samples_per_second',
        'latency_in_seconds',
    ],
]