In [None]:
# import core pytorch + computer vision libraries
import torch
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torchvision.models import resnet18, ResNet18_Weights
from functools import partial

# import transformer model + tokenizer + dataset utilities
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset

# import custom modules
import quantizer
import utils

# pick the compute device: CUDA when a GPU is visible to torch, CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")


In [None]:
# checkpoint name shared by the tokenizer and the classifier
model_name = "bert-base-uncased"

# tokenizer first, then the 2-label sequence classifier, moved onto `device`
bert_tokenizer = AutoTokenizer.from_pretrained(model_name)
model_bert_fp32_clf = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)
model_bert_fp32_clf = model_bert_fp32_clf.to(device)

# fetch GLUE SST-2 and keep a handle on its validation split
dataset = load_dataset("glue", "sst2")
sst2_val = dataset["validation"]

# tokenizer helper for SST-2 sentences
def tokenize_sst2(batch):
    """Tokenize the ``'sentence'`` entry of ``batch`` with ``bert_tokenizer``.

    Sequences are truncated and padded to a fixed length of 128 tokens.
    Returns whatever ``bert_tokenizer`` returns for that call.
    """
    tokenizer_options = {"padding": "max_length", "truncation": True, "max_length": 128}
    return bert_tokenizer(batch["sentence"], **tokenizer_options)

# tokenize the whole validation split in batches, then rename the label
# column to "labels" (the key the HF model call expects)
sst2_val = sst2_val.map(tokenize_sst2, batched=True).rename_column("label", "labels")

# expose only the model inputs and the labels, as torch tensors
sst2_val.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'labels'],
)

# evaluation loader: fixed order, 32 examples per batch
val_dataloader = DataLoader(sst2_val, shuffle=False, batch_size=32)

# label ids used as a reference when plotting: 0 = negative, 1 = positive
sst2_labels = [0, 1]


In [None]:
# utilities
import copy
from tqdm import tqdm

# run the model over a whole dataloader and collect labels + predictions
@torch.no_grad()
def get_all_predictions(model, dataloader, device):
    """Collect the true labels and argmax predictions for every batch.

    Each batch is expected to be a dict of tensors that includes a
    ``"labels"`` entry; the entire dict (labels included) is moved to
    ``device`` and passed to ``model`` as keyword arguments.  The predicted
    class is the argmax of ``outputs.logits`` along dimension 1.

    Returns:
        ``(all_labels, all_preds)`` -- two lists with one entry per example,
        in dataloader order.
    """
    model.eval()
    true_labels, predicted = [], []

    print("Gathering all predictions for confusion matrix...")
    for raw_batch in tqdm(dataloader, desc="Predicting"):
        # move every tensor of the batch onto the target device
        on_device = {}
        for key, tensor in raw_batch.items():
            on_device[key] = tensor.to(device)

        # forward pass, then pick the class with the largest logit
        logits = model(**on_device).logits
        batch_pred = logits.argmax(dim=1)

        predicted.extend(batch_pred.cpu().numpy())
        true_labels.extend(on_device["labels"].cpu().numpy())

    return true_labels, predicted


In [None]:
# optimizer import
from torch.optim import AdamW
from tqdm import tqdm

# load + prepare SST-2 training set (same tokenization / column naming as the
# validation split)
sst2_train = dataset['train'].map(tokenize_sst2, batched=True)
sst2_train = sst2_train.rename_column("label", "labels")   # match eval naming
sst2_train.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
train_dataloader = DataLoader(sst2_train, batch_size=16, shuffle=True)

# configure optimizer + training mode
model_bert_fp32_clf.train()
optimizer = AdamW(model_bert_fp32_clf.parameters(), lr=2e-5)
num_epochs = 2

print("--- Fine-Tuning FP32 BERT on SST-2 ---")

for epoch in range(num_epochs):
    print(f"Starting Epoch {epoch + 1}/{num_epochs}")

    # Accumulate the loss over the epoch: the loss of the final mini-batch
    # alone is a noisy summary of how training is going.
    epoch_loss_sum = 0.0
    num_batches = 0

    for batch in tqdm(train_dataloader, desc="Training"):
        optimizer.zero_grad()

        # move batch to device
        batch = {k: v.to(device) for k, v in batch.items()}

        # forward + loss (the model computes the loss because "labels" is in the batch)
        outputs = model_bert_fp32_clf(**batch)
        loss = outputs.loss

        # backward + optimize
        loss.backward()
        optimizer.step()

        epoch_loss_sum += loss.item()
        num_batches += 1

    # epoch summary print; guarded so an empty dataloader does not raise
    if num_batches:
        mean_loss = epoch_loss_sum / num_batches
        print(f"Epoch {epoch + 1} complete. Mean loss: {mean_loss:.4f} (last batch: {loss.item():.4f})")
    else:
        print(f"Epoch {epoch + 1} complete. No batches were processed.")

print("--- Fine-Tuning Complete ---")
# switch back to eval mode for all the evaluation that follows
model_bert_fp32_clf.eval()


In [None]:
# imports for dataset subsets and accuracy evaluation
from torch.utils.data import Subset, DataLoader
from sklearn.metrics import accuracy_score

# AdaptivFloat quantizers: bind the bit widths once, then give each partial a
# readable __name__
quant_af4_func = partial(quantizer.quantize_to_adaptivfloat, total_bits=4, exponent_bits=2)
quant_af8_func = partial(quantizer.quantize_to_adaptivfloat, total_bits=8, exponent_bits=3)
quant_af4_func.__name__ = "AdaptivFloat_4bit"
quant_af8_func.__name__ = "AdaptivFloat_8bit"

# "quick" loader: every 5th validation example (1/5th = 20% of the split),
# batched with the same batch size as the full validation loader
val_dataset = val_dataloader.dataset
subset_indices = [idx for idx in range(len(val_dataset)) if idx % 5 == 0]
quick_dataset = Subset(val_dataset, subset_indices)
quick_loader = DataLoader(quick_dataset, batch_size=val_dataloader.batch_size)

print(f"Using full validation set of {len(val_dataset)} samples.")
print(f"Using quick analysis subset of {len(quick_dataset)} samples.")

# helper function to compute accuracy on a dataloader
@torch.no_grad()
def get_accuracy(model, dataloader, device):
    """Return the classification accuracy of ``model`` over ``dataloader``.

    Two batch layouts are accepted:

    * a ``(data, labels)`` pair (list or tuple) -- ``data`` is moved to
      ``device`` and passed to the model as its single positional argument;
    * a dict with a ``'labels'`` entry -- every other entry is moved to
      ``device`` and passed to the model as a keyword argument.  The batch
      dict itself is left unmodified.

    The model output may be a plain logits tensor or an object exposing
    ``.logits``; the predicted class is the argmax along dimension 1.

    Args:
        model: the model to evaluate; it is switched to eval mode.
        dataloader: iterable yielding batches in one of the layouts above.
        device: device the tensors are moved to before the forward pass.

    Returns:
        The result of ``accuracy_score(all_labels, all_preds)``.

    Raises:
        TypeError: if a batch is neither a list/tuple pair nor a dict.
    """
    model.eval()
    all_preds = []
    all_labels = []

    for batch in dataloader:
        if isinstance(batch, (list, tuple)):
            data, labels = batch
            labels = labels.to(device)
            outputs = model(data.to(device))
        elif isinstance(batch, dict):
            labels = batch['labels'].to(device)
            # build the model kwargs without popping from the caller's dict
            model_input = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
            outputs = model(**model_input)
        else:
            # fail loudly instead of hitting an unbound (or stale) model_input
            raise TypeError(
                f"Unsupported batch type {type(batch).__name__}; "
                "expected a (data, labels) pair or a dict with a 'labels' entry."
            )

        # plain tensors are treated as logits; anything else must expose .logits
        logits = outputs if isinstance(outputs, torch.Tensor) else outputs.logits
        pred = torch.argmax(logits, dim=1)

        all_preds.extend(pred.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

    return accuracy_score(all_labels, all_preds)

# FP32 reference accuracy, measured on the quick subset
print("Calculating baseline FP32 accuracy on quick loader...")
baseline_quick_accuracy = get_accuracy(model_bert_fp32_clf, quick_loader, device)
print(f"Baseline FP32 (Quick): {baseline_quick_accuracy * 100:.2f}%")

# every parameter whose name contains 'weight', in named_parameters() order
layer_names = [
    param_name
    for param_name in dict(model_bert_fp32_clf.named_parameters())
    if 'weight' in param_name
]
print(f"Found {len(layer_names)} layers with weights to analyze.")


In [None]:
# imports for data handling, JSON, copy, progress bar, and torch
import pandas as pd
import json
import copy
from tqdm import tqdm
import torch

# per-layer accuracy drop (baseline - quantized) on the quick subset
sensitivity_results = {}
baseline_acc = baseline_quick_accuracy

print("\n--- Starting Sensitivity Analysis (Quantizing 1 layer at a time to INT4) ---")

# One working copy is enough: each layer is quantized in place, evaluated and
# then restored, instead of deep-copying the entire model once per layer.
temp_model = copy.deepcopy(model_bert_fp32_clf).to(device)
temp_params = dict(temp_model.named_parameters())

# loop through each layer to test its sensitivity to 4-bit quantization
for layer_to_quantize in tqdm(layer_names, desc="Analyzing Layers"):

    param = temp_params.get(layer_to_quantize)
    if param is None:
        print(f"Warning: Layer {layer_to_quantize} not found in model params.")
        continue

    # keep the full-precision values so the layer can be put back afterwards
    original_weights = param.data.clone()
    try:
        # quantize only this layer and evaluate the otherwise-FP32 model
        param.data.copy_(quant_af4_func(param.data))
        current_accuracy = get_accuracy(temp_model, quick_loader, device)
    finally:
        # always restore, so the next layer starts from unquantized weights
        param.data.copy_(original_weights)
        del original_weights

    # compute and log accuracy drop
    accuracy_drop = baseline_acc - current_accuracy
    sensitivity_results[layer_to_quantize] = accuracy_drop

# clean up GPU memory
del temp_model, temp_params
torch.cuda.empty_cache()

print("--- Analysis Complete ---")

# convert results to a dataframe and sort by accuracy drop (largest first)
sensitivity_df = pd.DataFrame(
    list(sensitivity_results.items()),
    columns=['Layer', 'Accuracy Drop']
).sort_values(by='Accuracy Drop', ascending=False)

# display most sensitive layers (largest drop)
print("\nMost Sensitive Layers (Largest Accuracy Drop when set to INT4):")
print(sensitivity_df.head(10))

# display most robust layers (smallest drop)
print("\nMost Robust Layers (Smallest Accuracy Drop when set to INT4):")
print(sensitivity_df.tail(10))

# save results to CSV; the profile-building cell reads this file back
sensitivity_df.to_csv("sensitivity_analysis_results.csv", index=False)
print("\nFull sensitivity results saved to 'sensitivity_analysis_results.csv'")


In [None]:
# imports for data handling and JSON
import pandas as pd
import json

# reload the per-layer results written by the sensitivity analysis
sensitivity_df = pd.read_csv("sensitivity_analysis_results.csv")
print(f"Loaded sensitivity results for {len(sensitivity_df)} layers.")

# a layer counts as sensitive when its accuracy drop exceeds 1%
SENSITIVITY_THRESHOLD = 0.01

# sensitive layers are assigned 'INT8', every other layer 'INT4'
precision_profile = {}
num_sensitive_layers = 0
for layer_name, accuracy_drop in zip(sensitivity_df['Layer'], sensitivity_df['Accuracy Drop']):
    is_sensitive = accuracy_drop > SENSITIVITY_THRESHOLD
    precision_profile[layer_name] = 'INT8' if is_sensitive else 'INT4'
    if is_sensitive:
        num_sensitive_layers += 1

print(f"\nCreated profile with {num_sensitive_layers} layers at INT8")
print(f"and {len(precision_profile) - num_sensitive_layers} layers at INT4.")

# persist the layer -> precision mapping as JSON for later cells
with open('bert_precision_profile.json', 'w') as f:
    json.dump(precision_profile, f, indent=4)

print("Precision profile saved to 'bert_precision_profile.json'")


In [None]:
# imports for evaluation metrics and plotting
import sklearn.metrics as metrics
import matplotlib.pyplot as plt

# helper function to plot a confusion matrix
def plot_confusion_matrix(y_true, y_pred, labels, title):
    """Draw and show the confusion matrix of ``y_pred`` against ``y_true``.

    ``labels`` is used both to order the matrix and as the tick labels of
    the plot; ``title`` becomes the plot title.  The figure is 6x6 inches
    and uses the 'Blues' colour map.
    """
    # build the matrix, then draw it onto a dedicated figure
    matrix = metrics.confusion_matrix(y_true, y_pred, labels=labels)
    display = metrics.ConfusionMatrixDisplay(
        confusion_matrix=matrix,
        display_labels=labels,
    )
    _figure, axis = plt.subplots(figsize=(6, 6))
    display.plot(ax=axis, cmap='Blues')

    # title + render
    plt.title(title)
    plt.show()


In [None]:
# imports for JSON handling and deep copy
import json
import copy

# reload the layer -> precision mapping saved by the profile-building cell
with open('bert_precision_profile.json', 'r') as f:
    precision_profile = json.load(f)
print(f"Loaded precision profile with {len(precision_profile)} layers.")

# relies on names defined in earlier cells:
# model_bert_fp32_clf, val_dataloader, sst2_labels, utils.apply_quantization_to_model


def _evaluate_and_plot(model, title_prefix):
    """Predict on the full validation loader, plot the confusion matrix and
    return ``(labels, predictions, accuracy)`` for ``model``."""
    labels, predictions = get_all_predictions(model, val_dataloader, device)
    accuracy = accuracy_score(labels, predictions)
    plot_confusion_matrix(labels, predictions, sst2_labels, f"{title_prefix} (Acc: {accuracy*100:.2f}%)")
    return labels, predictions, accuracy


# Test 1: FP32 Baseline evaluation on full dataset
print("\n--- 1. Evaluating FP32 Baseline (Full Dataset) ---")
y_true, y_pred_fp32, acc_fp32 = _evaluate_and_plot(model_bert_fp32_clf, "BERT - FP32 Baseline")

# Test 2: every layer quantized with the 8-bit function
print("\n--- 2. Evaluating Uniform INT8 ---")
model_int8 = copy.deepcopy(model_bert_fp32_clf).to(device)
utils.apply_quantization_to_model(model_int8, quant_af8_func)
y_true, y_pred_int8, acc_int8 = _evaluate_and_plot(model_int8, "BERT - Uniform INT8")

# Test 3: every layer quantized with the 4-bit function (expected poor performance)
print("\n--- 3. Evaluating Uniform INT4 ---")
model_int4 = copy.deepcopy(model_bert_fp32_clf).to(device)
utils.apply_quantization_to_model(model_int4, quant_af4_func)
y_true, y_pred_int4, acc_int4 = _evaluate_and_plot(model_int4, "BERT - Uniform INT4")

# Test 4: per-layer precision taken from the loaded profile dictionary
print("\n--- 4. Evaluating MIXED PRECISION (INT4/INT8) ---")
model_mixed = copy.deepcopy(model_bert_fp32_clf).to(device)
utils.apply_quantization_to_model(model_mixed, precision_profile)
y_true, y_pred_mixed, acc_mixed = _evaluate_and_plot(model_mixed, "BERT - Mixed-Precision")

# Final Accuracy Report
print("\n--- Final Accuracy Report ---")
print(f"FP32 Baseline:       {acc_fp32 * 100:.2f}%")
print(f"Uniform INT8:        {acc_int8 * 100:.2f}%")
print(f"Uniform INT4:        {acc_int4 * 100:.2f}%")
print(f"Mixed-Precision (New): {acc_mixed * 100:.2f}%")


In [None]:
# import copy module
import copy

# build a copy of the FP32 model with the 8-bit quantizer applied to it
model_int8_baseline = copy.deepcopy(model_bert_fp32_clf)
model_int8_baseline = model_int8_baseline.to(device)
utils.apply_quantization_to_model(model_int8_baseline, quant_af8_func)

# measure it on the quick validation subset; this becomes the INT8 reference
print("Calculating baseline Uniform INT8 accuracy on quick loader...")
baseline_int8_accuracy = get_accuracy(model_int8_baseline, quick_loader, device)
print(f"Baseline Uniform INT8 (Quick): {baseline_int8_accuracy * 100:.2f}%")

# the copy is only needed for this one measurement; release it
del model_int8_baseline
torch.cuda.empty_cache()


In [None]:
# imports for data handling, JSON, copy, progress bar, and torch
import pandas as pd
import json
import copy
from tqdm import tqdm
import torch

# per-layer accuracy drop, measured against the uniform INT8 model
new_sensitivity_results = {}
baseline_acc = baseline_int8_accuracy # baseline is now the INT8 model

print("\n--- Starting New Analysis (Demoting 1 layer at a time from INT8 to INT4) ---")

# demote one layer at a time from the 8-bit to the 4-bit quantizer
for layer_to_demote in tqdm(layer_names, desc="Analyzing Layers (INT8->INT4)"):

    # start from a fresh copy with the 8-bit quantizer applied everywhere
    temp_model = copy.deepcopy(model_bert_fp32_clf).to(device)
    utils.apply_quantization_to_model(temp_model, quant_af8_func)

    # first parameter whose name matches the target layer, or None
    target_param = next(
        (param for name, param in temp_model.named_parameters() if name == layer_to_demote),
        None,
    )
    if target_param is None:
        print(f"Warning: Layer {layer_to_demote} not found.")
        continue

    # apply the 4-bit quantizer on top of the already-quantized weights
    target_param.data.copy_(quant_af4_func(target_param.data))

    # evaluate, then record the drop relative to the INT8 baseline
    current_accuracy = get_accuracy(temp_model, quick_loader, device)
    accuracy_drop = baseline_acc - current_accuracy
    new_sensitivity_results[layer_to_demote] = accuracy_drop

    # clean up GPU memory
    del temp_model
    torch.cuda.empty_cache()

print("--- New Analysis Complete ---")

# tabulate the results, smallest accuracy drop first
new_sensitivity_df = pd.DataFrame(
    list(new_sensitivity_results.items()),
    columns=['Layer', 'Accuracy Drop'],
)
new_sensitivity_df = new_sensitivity_df.sort_values(by='Accuracy Drop', ascending=True)

# head of the table: layers with the smallest drop
print("\nMost Robust Layers (Safest to demote to INT4):")
print(new_sensitivity_df.head(20))

# tail of the table: layers with the largest drop
print("\nMost Sensitive Layers (Keep these at INT8):")
print(new_sensitivity_df.tail(10))

# persist the V2 results; the V2 profile cell reads this file back
new_sensitivity_df.to_csv("sensitivity_analysis_results_v2.csv", index=False)
print("\nFull V2 sensitivity results saved to 'sensitivity_analysis_results_v2.csv'")


In [None]:
# imports for data handling and JSON
import pandas as pd
import json

# read back the INT8 -> INT4 demotion results
new_sensitivity_df = pd.read_csv("sensitivity_analysis_results_v2.csv")
print(f"Loaded new V2 sensitivity results for {len(new_sensitivity_df)} layers.")

# a layer may be demoted when its accuracy drop is at most 0.1%
DEMOTION_THRESHOLD = 0.001 # 0.1% drop

# layers at or under the threshold get 'INT4', all others stay 'INT8'
new_precision_profile = {}
num_int4_layers = 0
layer_rows = new_sensitivity_df[['Layer', 'Accuracy Drop']].itertuples(index=False, name=None)
for layer_name, accuracy_drop in layer_rows:
    can_demote = accuracy_drop <= DEMOTION_THRESHOLD
    new_precision_profile[layer_name] = 'INT4' if can_demote else 'INT8'
    if can_demote:
        num_int4_layers += 1

print(f"\nCreated new profile with {num_int4_layers} layers at INT4")
print(f"and {len(new_precision_profile) - num_int4_layers} layers at INT8.")

# persist the V2 layer -> precision mapping as JSON
with open('bert_precision_profile_v2.json', 'w') as f:
    json.dump(new_precision_profile, f, indent=4)

print("New V2 precision profile saved to 'bert_precision_profile_v2.json'")


In [None]:
# imports for JSON handling and deep copy
import json
import copy

# read back the V2 layer -> precision mapping
with open('bert_precision_profile_v2.json', 'r') as f:
    new_precision_profile = json.load(f)
print(f"Loaded V2 precision profile with {len(new_precision_profile)} layers.")

# Test 1: unquantized FP32 model on the full validation loader
print("\n--- 1. Evaluating FP32 Baseline (Full Dataset) ---")
y_true_fp32, y_pred_fp32 = get_all_predictions(model_bert_fp32_clf, val_dataloader, device)
acc_fp32 = accuracy_score(y_true_fp32, y_pred_fp32)
plot_confusion_matrix(
    y_true_fp32, y_pred_fp32, sst2_labels,
    f"BERT - FP32 Baseline (Acc: {acc_fp32*100:.2f}%)",
)

# Test 2: copy of the model with the 8-bit quantizer applied
print("\n--- 2. Evaluating Uniform INT8 ---")
model_int8 = copy.deepcopy(model_bert_fp32_clf)
model_int8 = model_int8.to(device)
utils.apply_quantization_to_model(model_int8, quant_af8_func)
y_true_int8, y_pred_int8 = get_all_predictions(model_int8, val_dataloader, device)
acc_int8 = accuracy_score(y_true_int8, y_pred_int8)
plot_confusion_matrix(
    y_true_int8, y_pred_int8, sst2_labels,
    f"BERT - Uniform INT8 (Acc: {acc_int8*100:.2f}%)",
)

# Test 3: copy of the model with the 4-bit quantizer applied (expected poor performance)
print("\n--- 3. Evaluating Uniform INT4 ---")
model_int4 = copy.deepcopy(model_bert_fp32_clf)
model_int4 = model_int4.to(device)
utils.apply_quantization_to_model(model_int4, quant_af4_func)
y_true_int4, y_pred_int4 = get_all_predictions(model_int4, val_dataloader, device)
acc_int4 = accuracy_score(y_true_int4, y_pred_int4)
plot_confusion_matrix(
    y_true_int4, y_pred_int4, sst2_labels,
    f"BERT - Uniform INT4 (Acc: {acc_int4*100:.2f}%)",
)

# Test 4: copy of the model quantized according to the V2 profile dictionary
print("\n--- 4. Evaluating NEW Mixed-Precision (V2) ---")
model_mixed_v2 = copy.deepcopy(model_bert_fp32_clf)
model_mixed_v2 = model_mixed_v2.to(device)
utils.apply_quantization_to_model(model_mixed_v2, new_precision_profile)
y_true_mixed_v2, y_pred_mixed_v2 = get_all_predictions(model_mixed_v2, val_dataloader, device)
acc_mixed_v2 = accuracy_score(y_true_mixed_v2, y_pred_mixed_v2)
plot_confusion_matrix(
    y_true_mixed_v2, y_pred_mixed_v2, sst2_labels,
    f"BERT - Mixed-Precision V2 (Acc: {acc_mixed_v2*100:.2f}%)",
)

# Final Accuracy Report (V2)
print("\n--- Final Accuracy Report (V2) ---")
print(f"FP32 Baseline:         {acc_fp32 * 100:.2f}%")
print(f"Uniform INT8:          {acc_int8 * 100:.2f}%")
print(f"Uniform INT4:          {acc_int4 * 100:.2f}%")
print(f"Mixed-Precision (V2):  {acc_mixed_v2 * 100:.2f}%")


In [None]:
# imports for data handling, JSON, copy, progress bar, and torch
import pandas as pd
import json
import copy
from tqdm import tqdm
import torch

# --- 1. Load Initial V2 Sensitivity Results ---
# Loading the file confirms the demotion analysis has been run.  The search
# below re-measures every candidate itself and does not read these values.
try:
    new_sensitivity_df = pd.read_csv("sensitivity_analysis_results_v2.csv")
except FileNotFoundError:
    print("Error: 'sensitivity_analysis_results_v2.csv' not found.")
    print("Please re-run the 'New Cell 2: Run the INT8 -> INT4 Demotion Analysis' first.")
    raise

# --- 2. Setup the Greedy Search ---
print("--- Starting Greedy Iterative Mixed-Precision Search ---")
ACCURACY_TARGET = 0.91 # stop condition (91% accuracy on the quick subset)
baseline_acc = baseline_int8_accuracy # baseline from INT8 model

# start with all layers at INT8
final_precision_profile = {layer: 'INT8' for layer in layer_names}
# layers still available to be demoted
layers_to_test = set(layer_names)

# history of (iteration, accuracy) for every accepted profile
search_history = []

# Nothing to revert until a demotion has been locked in.  Initialising this
# here avoids a NameError when the very first check already fails.
last_layer_demoted = None

# One pass more than there are layers, so that the profile produced by the
# final demotion is also checked against the target.
total_passes = len(layer_names) + 1

for i in range(total_passes):
    print(f"\n--- Iteration {i+1} / {total_passes} ---")

    # 1. Build the model for the current profile and measure it
    current_baseline_model = copy.deepcopy(model_bert_fp32_clf).to(device)
    utils.apply_quantization_to_model(current_baseline_model, final_precision_profile)
    current_baseline_acc = get_accuracy(current_baseline_model, quick_loader, device)

    # the model is only needed for that one measurement; free it before the
    # candidate scan allocates its own copies
    del current_baseline_model
    torch.cuda.empty_cache()

    print(f"Current baseline accuracy: {current_baseline_acc * 100:.2f}%")
    search_history.append((i, current_baseline_acc)) # log the result

    # 2. Check stop condition
    if current_baseline_acc < ACCURACY_TARGET:
        print(f"Accuracy dropped below {ACCURACY_TARGET*100}%. Stopping search.")
        if last_layer_demoted is not None:
            # revert the last change and drop the entry that describes it
            final_precision_profile[last_layer_demoted] = 'INT8'
            layers_to_test.add(last_layer_demoted)
            search_history.pop()
        else:
            print("The all-INT8 profile is already below the target; no layer was demoted.")
        break

    if not layers_to_test:
        print("All layers have been demoted to INT4.")
        break

    # 3. Find the next best layer to demote
    best_layer_to_demote = None
    best_layer_accuracy = -1.0

    # Walk the candidates in layer_names order rather than set order: set
    # iteration order for strings can change between interpreter runs, which
    # would make tie-breaking (and so the final profile) non-reproducible.
    candidates = [name for name in layer_names if name in layers_to_test]

    for layer_name in tqdm(candidates, desc="Finding next best layer"):

        # current profile plus one additional INT4 layer
        test_profile = final_precision_profile.copy()
        test_profile[layer_name] = 'INT4'

        # build and evaluate the candidate model
        test_model = copy.deepcopy(model_bert_fp32_clf).to(device)
        utils.apply_quantization_to_model(test_model, test_profile)
        acc = get_accuracy(test_model, quick_loader, device)

        # strict '>' keeps the first candidate among equals
        if acc > best_layer_accuracy:
            best_layer_accuracy = acc
            best_layer_to_demote = layer_name

        # clean up memory
        del test_model
        torch.cuda.empty_cache()

    # 4. Lock in the best layer; the next pass validates the resulting profile
    final_precision_profile[best_layer_to_demote] = 'INT4'
    layers_to_test.remove(best_layer_to_demote)
    last_layer_demoted = best_layer_to_demote

    print(f"Demoting layer: {best_layer_to_demote} (New Acc: {best_layer_accuracy * 100:.2f}%)")

print("--- Greedy Search Complete ---")

# --- 5. Save the resulting profile ---
num_int4 = list(final_precision_profile.values()).count('INT4')
num_int8 = len(final_precision_profile) - num_int4
print(f"Created OPTIMAL profile with {num_int4} layers at INT4 and {num_int8} at INT8")

with open('bert_precision_profile_OPTIMAL.json', 'w') as f:
    json.dump(final_precision_profile, f, indent=4)
print("Optimal profile saved to 'bert_precision_profile_OPTIMAL.json'")


In [None]:
# imports for JSON handling and deep copy
import json
import copy

# --- 1. Read back the profile written by the greedy search ---
with open('bert_precision_profile_OPTIMAL.json', 'r') as f:
    optimal_profile = json.load(f)
print(f"Loaded OPTIMAL profile with {len(optimal_profile)} layers.")

# --- 2. Quantize a copy of the model with that profile and evaluate it ---
print("\n--- Evaluating OPTIMAL Mixed-Precision ---")
model_optimal = copy.deepcopy(model_bert_fp32_clf)
model_optimal = model_optimal.to(device)
utils.apply_quantization_to_model(model_optimal, optimal_profile)
y_true_optimal, y_pred_optimal = get_all_predictions(model_optimal, val_dataloader, device)
acc_optimal = accuracy_score(y_true_optimal, y_pred_optimal)
plot_confusion_matrix(
    y_true_optimal, y_pred_optimal, sst2_labels,
    f"BERT - Optimal Mixed-Precision (Acc: {acc_optimal*100:.2f}%)",
)

# --- 3. Final Accuracy Report ---
# acc_fp32 / acc_int8 / acc_int4 are reused from an earlier evaluation cell
print("\n--- Final Accuracy Report (OPTIMAL) ---")
print(f"FP32 Baseline:         {acc_fp32 * 100:.2f}%") # from previous run
print(f"Uniform INT8:          {acc_int8 * 100:.2f}%") # from previous run
print(f"Uniform INT4:          {acc_int4 * 100:.2f}%") # from previous run
print(f"Optimal Mixed-Precision: {acc_optimal * 100:.2f}%")
