In [1]:
import torch
from typing import Tuple

In [2]:
def _calculate_scale_and_zeropoint(
    min_val: float, max_val: float, num_bits: int) -> Tuple[float, int]:
    qmin = 0.
    qmax = 2.**num_bits - 1.

    scale = (max_val - min_val) / (qmax - qmin)

    initial_zero_point = qmin - min_val / scale

    zero_point = 0
    if initial_zero_point < qmin:
        zero_point = int(qmin)
    elif initial_zero_point > qmax:
        zero_point = int(qmax)
    else:
        zero_point = int(initial_zero_point)
    
    return scale, zero_point

In [3]:
def quantize(x: torch.Tensor, scale: float, zero_point: int, dtype=torch.uint8):
    q_x = zero_point + x / scale
    q_x.clamp_(0, 255).round_()
    q_x = q_x.to(dtype)
    return q_x

def dequantize(x: torch.Tensor, scale: float, zero_point: int):
    return scale * (x.float() - zero_point)

In [4]:
from copy import deepcopy

def test_case_0():
  torch.manual_seed(999)
  test_input = torch.randn((4,4))

  min_val, max_val = torch.min(test_input), torch.max(test_input)
  scale, zero_point = _calculate_scale_and_zeropoint(min_val, max_val, 8)

  your_quant = quantize(test_input, scale, zero_point)
  your_dequant = dequantize(your_quant, scale, zero_point)

  test_case_0 = torch.Tensor([
      [-0.2623,  1.3991,  0.2842,  1.0275],
      [-0.9838, -3.4104,  1.4866,  0.2405],
      [ 1.4866, -0.3716,  0.0874,  2.1424],
      [ 0.6340, -1.1587, -0.7870,  0.0656]])

  assert torch.allclose(your_dequant, test_case_0, atol=1e-4)
  assert torch.allclose(your_dequant, test_input, atol=5e-2)

  return test_input, your_dequant, your_quant



### Test Case 1
def test_case_1():
  torch.manual_seed(999)
  test_input = torch.randn((8,8))

  min_val, max_val = torch.min(test_input), torch.max(test_input)
  scale, zero_point = _calculate_scale_and_zeropoint(min_val, max_val, 8)

  your_quant = quantize(test_input, scale, zero_point)
  your_dequant = dequantize(your_quant, scale, zero_point)

  test_case_1 = torch.Tensor(
      [[-0.2623,  1.3991,  0.2842,  1.0275, -0.9838, -3.4104,  1.4866,  0.2405],
      [ 1.4866, -0.3716,  0.0874,  2.1424,  0.6340, -1.1587, -0.7870,  0.0656],
      [ 0.0000, -0.6558, -1.0056,  0.3061,  0.6340, -1.0931, -1.6178,  1.5740],
      [-1.7927,  0.6121, -0.7214,  0.6121,  0.3279, -1.5959, -0.5247,  0.3498],
      [-1.3773,  1.1149, -0.7870,  0.2842,  0.9182, -1.1805, -0.7433, -1.5522],
      [ 1.0056, -0.1093,  1.3991, -0.9182, -1.1805, -0.6777, -0.3061,  0.9838],
      [ 0.2186,  1.6396,  1.0712,  1.7489,  0.0874,  0.3498,  0.9838,  1.2024],
      [-0.3935, -0.6340,  1.9238,  1.2898,  0.0219,  0.3935,  1.4866, -0.9401]])

  assert torch.allclose(your_dequant, test_case_1, atol=1e-4)
  assert torch.allclose(your_dequant, test_input, atol=5e-2)

  return test_input, your_dequant, your_quant

In [5]:
# Empirically, report the average and maximum quantization error for the test cases
def test():
  test_input, your_dequant, your_quant = test_case_0()
  test_input, your_dequant, your_quant = test_case_1()

  avg_error = torch.mean(torch.abs(test_input - your_dequant))
  max_error = torch.max(torch.abs(test_input - your_dequant))

  return avg_error, max_error

test()

(tensor(0.0059), tensor(0.0115))

In [6]:
# Save the original fp32 tensor and quantized tensor to disk with torch.save. Report the difference in disk utilization
output_folder = "data/lab3"

def save_to_disk(test_input, your_quant, output_folder):
    torch.save(test_input, f"{output_folder}/test_input.pt")
    torch.save(your_quant, f"{output_folder}/your_quant.pt")
    
    test_input_size = test_input.element_size() * test_input.nelement()
    your_quant_size = your_quant.element_size() * your_quant.nelement()
    
    return test_input_size, your_quant_size

test_input, your_dequant, your_quant = test_case_1()
save_to_disk(test_input, your_quant, output_folder)

(256, 64)

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import time
import os
import matplotlib.pyplot as plt
from torchvision import transforms
from itertools import product

import torch.ao.quantization as quantization
from torch.ao.quantization import get_default_qconfig
from torch.ao.quantization.quantize_fx import prepare_fx, convert_fx, prepare_qat_fx

In [8]:
class MNISTDataset(Dataset):
    def __init__(self, csv_file, transform=None):
        data = pd.read_csv(csv_file)
        self.labels = data.iloc[:, 0].values
        self.pixels = data.iloc[:, 1:].values.astype('float32')
        self.pixels = self.pixels.reshape(-1, 28, 28)  # Reshape to 28x28 images

        # Normalize the pixel values
        self.pixels_mean = self.pixels.mean()
        self.pixels_std = self.pixels.std()
        self.pixels = (self.pixels - self.pixels_mean) / self.pixels_std

        self.transform = transform

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        image = self.pixels[idx]
        label = self.labels[idx]

        if self.transform:
            image = self.transform(torch.tensor(image).unsqueeze(0))

        return image.squeeze(0), torch.tensor(label)
    

class FFNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes, num_hidden_layers):
        super(FFNN, self).__init__()
        self.layers = nn.ModuleList()
        self.layers.append(nn.Linear(input_size, hidden_size))
        for _ in range(num_hidden_layers - 1):
            self.layers.append(nn.Linear(hidden_size, hidden_size))
        self.layers.append(nn.Linear(hidden_size, num_classes))
        self.relu = nn.ReLU()

    def forward(self, x):
        x = x.view(x.size(0), -1)  # Flatten the input
        for layer in self.layers[:-1]:
            x = self.relu(layer(x))
        return self.layers[-1](x)

In [9]:
def create_dataloader(dataset_path, batch_size, is_train=True):
    # Create center crop transform
    transform = transforms.Compose([
        transforms.CenterCrop(20)  # Crop to 20x20 as specified
    ])
    
    # Create dataset and dataloader
    dataset = MNISTDataset(dataset_path, transform=transform)
    return DataLoader(dataset, batch_size=batch_size, shuffle=is_train)

def print_size_of_model(model, label=""):
    torch.save(model.state_dict(), "temp.p")
    size=os.path.getsize("temp.p")
    print("model: ",label,' \t','Size (MB):', size/1e6)
    os.remove('temp.p')
    return size

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


In [10]:
def train_model(model, train_loader, val_loader, epochs, learning_rate, device):
    print(f"Training normal precision model for {epochs} epochs")
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    
    for epoch in range(epochs):
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            
        # Validation
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for data, target in val_loader:
                data, target = data.to(device), target.to(device)
                outputs = model(data)
                _, predicted = torch.max(outputs.data, 1)
                total += target.size(0)
                correct += (predicted == target).sum().item()
        
        print(f'Epoch {epoch+1}, Accuracy: {100 * correct / total:.2f}%')
    
    return model

In [11]:
def train_model_mixed_precision(model, train_loader, val_loader, epochs, learning_rate, device):
    print(f"Training model with mixed precision for {epochs} epochs")
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Do mixed precision training with torch.autocast and GradScaler
    scaler = torch.cuda.amp.GradScaler()
    
    for epoch in range(epochs):
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            
            with torch.cuda.amp.autocast():
                output = model(data)
                loss = criterion(output, target)
            
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            
        # Validation
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for data, target in val_loader:
                data, target = data.to(device), target.to(device)
                outputs = model(data)
                _, predicted = torch.max(outputs.data, 1)
                total += target.size(0)
                correct += (predicted == target).sum().item()
        
        print(f'Epoch {epoch+1}, Accuracy: {100 * correct / total:.2f}%')
    
    return model

In [12]:
def measure_inference_time(model, test_loader, batch_size, num_runs=5):
    model.eval()
    times = []
    
    with torch.no_grad():
        for _ in range(num_runs):
            data, _ = next(iter(test_loader))
            if batch_size == 1:
                data = data[0:1]
                
            start_time = time.time()
            _ = model(data)
            end_time = time.time()
            times.append(end_time - start_time)
    
    mean_time = np.mean(times)
    std_time = np.std(times)
    return mean_time, std_time

In [13]:
def evaluate_model(model, test_loader, mixed_precision=False):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data, target in test_loader:
            if mixed_precision:
                with torch.cuda.amp.autocast():
                    outputs = model(data)
            else:
                outputs = model(data)
            _, predicted = torch.max(outputs.data, 1)
            total += target.size(0)
            correct += (predicted == target).sum().item()
    return 100 * correct / total

In [14]:
# Dynamic Quantization
def apply_dynamic_quantization(model):
    return torch.quantization.quantize_dynamic(
        model,
        {nn.Linear},
        dtype=torch.qint8
    )

# Static Quantization
def apply_static_quantization(model, calibration_loader):
    model.eval()
    
    # Set the qconfig
    qconfig = get_default_qconfig("fbgemm")
    qconfig_dict = {"": qconfig}
    
    example_input = torch.randn(1, 1, 20, 20)  # 1-channel (grayscale), 20x20
    
    # Prepare the model for static quantization
    prepared_model = prepare_fx(model, qconfig_dict, example_input)
    
    # Calibrate with the training data
    with torch.no_grad():
        for data, _ in calibration_loader:
            prepared_model(data.unsqueeze(1))  # Ensure correct input shape for calibration
    
    # Convert to quantized model
    quantized_model = convert_fx(prepared_model)
    
    return quantized_model


# Quantization Aware Training
def apply_qat(model, train_loader, val_loader, epochs=2):
    model.train()
    
    # Set the qconfig
    qconfig = get_default_qconfig("fbgemm")
    qconfig_dict = {"": qconfig}

    example_input = torch.randn(1, 1, 20, 20)  # 1-channel (grayscale), 20x20
    
    # Prepare the model for QAT
    prepared_model = prepare_qat_fx(model, qconfig_dict, example_input)
    
    # Train the model
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(prepared_model.parameters(), lr=0.001)
    
    for epoch in range(epochs):
        for data, target in train_loader:
            optimizer.zero_grad()
            output = prepared_model(data.unsqueeze(1))  # Ensure correct input shape
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
    
    # Convert to final quantized model
    quantized_model = convert_fx(prepared_model)
    
    return quantized_model


In [15]:
def main():
    # Hyperparameters
    input_size = 20 * 20  # 20x20 pixels
    hidden_size = 1024
    num_classes = 10
    num_hidden_layers = 2
    batch_size = 64
    learning_rate = 0.001
    epochs = 2
    
    # Create model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = FFNN(input_size, hidden_size, num_classes, num_hidden_layers)
    mixed_model = deepcopy(model)
    
    # Create dataloaders
    train_loader = create_dataloader('data/mnist_train.csv', batch_size, True)
    test_loader = create_dataloader('data/mnist_test.csv', batch_size, False)
    
    # Train base model
    
    model = model.to(device)
    model = train_model(model, train_loader, test_loader, epochs, learning_rate, device)
    model = model.to("cpu")

    # Mixed Precision Training
    mixed_model = mixed_model.to(device)
    mixed_precision_model = train_model_mixed_precision(mixed_model, train_loader, test_loader, epochs, learning_rate, device)
    mixed_precision_model = mixed_precision_model.to("cpu")
    
    # Baseline evaluation
    base_acc = evaluate_model(model, test_loader)
    base_size = print_size_of_model(model, "Baseline")
    base_latency_b1, base_std_b1 = measure_inference_time(model, test_loader, 1)
    base_latency_b64, base_std_b64 = measure_inference_time(model, test_loader, 64)
    base_params = count_parameters(model)

    # Mixed Precision evaluation
    mixed_precision_acc = evaluate_model(mixed_precision_model, test_loader, mixed_precision=True)
    mixed_precision_size = print_size_of_model(mixed_precision_model, "Mixed Precision")
    mixed_precision_latency_b1, mixed_precision_std_b1 = measure_inference_time(mixed_precision_model, test_loader, 1)
    mixed_precision_latency_b64, mixed_precision_std_b64 = measure_inference_time(mixed_precision_model, test_loader, 64)
    mixed_precision_params = count_parameters(mixed_precision_model)
    
    # Dynamic Quantization
    model = model.cpu()
    quantized_dynamic = apply_dynamic_quantization(model)
    dynamic_acc = evaluate_model(quantized_dynamic, test_loader)
    dynamic_size = print_size_of_model(quantized_dynamic, "Dynamic Quantized")
    dynamic_latency_b1, dynamic_std_b1 = measure_inference_time(quantized_dynamic, test_loader, 1)
    dynamic_latency_b64, dynamic_std_b64 = measure_inference_time(quantized_dynamic, test_loader, 64)
    dynamic_params = count_parameters(quantized_dynamic)
    
    # Static Quantization
    static_quantized = apply_static_quantization(model, test_loader)
    static_acc = evaluate_model(static_quantized, test_loader)
    static_size = print_size_of_model(static_quantized, "Static Quantized")
    static_latency_b1, static_std_b1 = measure_inference_time(static_quantized, test_loader, 1)
    static_latency_b64, static_std_b64 = measure_inference_time(static_quantized, test_loader, 64)
    static_params = count_parameters(static_quantized)
    
    # QAT
    qat_model = apply_qat(model, train_loader, test_loader)
    qat_acc = evaluate_model(qat_model, test_loader)
    qat_size = print_size_of_model(qat_model, "QAT")
    qat_latency_b1, qat_std_b1 = measure_inference_time(qat_model, test_loader, 1)
    qat_latency_b64, qat_std_b64 = measure_inference_time(qat_model, test_loader, 64)
    qat_params = count_parameters(qat_model)
    
    # Print results
    print("\nResults Summary:")
    print(f"{'Model Type':<15} {'Accuracy':<10} {'Size (MB)':<12} {'Latency B1':<15} {'Latency B64':<15} {'Parameters':<12}")
    print("-" * 90)
    print(f"{'Baseline':<15} {base_acc:.2f}% {base_size/1e6:.2f} {base_latency_b1*1000:.2f}±{base_std_b1*1000:.2f}ms {base_latency_b64*1000:.2f}±{base_std_b64*1000:.2f}ms {base_params}")
    print(f"{'Mixed Precision':<15} {mixed_precision_acc:.2f}% {mixed_precision_size/1e6:.2f} {mixed_precision_latency_b1*1000:.2f}±{mixed_precision_std_b1*1000:.2f}ms {mixed_precision_latency_b64*1000:.2f}±{mixed_precision_std_b64*1000:.2f}ms {mixed_precision_params}")
    print(f"{'Dynamic':<15} {dynamic_acc:.2f}% {dynamic_size/1e6:.2f} {dynamic_latency_b1*1000:.2f}±{dynamic_std_b1*1000:.2f}ms {dynamic_latency_b64*1000:.2f}±{dynamic_std_b64*1000:.2f}ms {dynamic_params}")
    print(f"{'Static':<15} {static_acc:.2f}% {static_size/1e6:.2f} {static_latency_b1*1000:.2f}±{static_std_b1*1000:.2f}ms {static_latency_b64*1000:.2f}±{static_std_b64*1000:.2f}ms {static_params}")
    print(f"{'QAT':<15} {qat_acc:.2f}% {qat_size/1e6:.2f} {qat_latency_b1*1000:.2f}±{qat_std_b1*1000:.2f}ms {qat_latency_b64*1000:.2f}±{qat_std_b64*1000:.2f}ms {qat_params}")

In [16]:
main()

Training normal precision model for 2 epochs
Epoch 1, Accuracy: 97.17%
Epoch 2, Accuracy: 97.41%
Training model with mixed precision for 2 epochs


  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():


Epoch 1, Accuracy: 96.74%
Epoch 2, Accuracy: 97.14%
model:  Baseline  	 Size (MB): 5.884202


  with torch.cuda.amp.autocast():


model:  Mixed Precision  	 Size (MB): 5.884202
model:  Dynamic Quantized  	 Size (MB): 1.480898


  prepared = prepare(


model:  Static Quantized  	 Size (MB): 1.515474
model:  QAT  	 Size (MB): 1.515474

Results Summary:
Model Type      Accuracy   Size (MB)    Latency B1      Latency B64     Parameters  
------------------------------------------------------------------------------------------
Baseline        97.41% 5.88 0.90±0.20ms 1.54±0.50ms 1470474
Mixed Precision 97.14% 5.88 0.20±0.40ms 1.75±0.45ms 1470474
Dynamic         97.39% 1.48 0.60±0.49ms 1.20±0.50ms 0
Static          97.28% 1.52 1.00±0.55ms 0.80±0.40ms 0
QAT             97.42% 1.52 0.60±0.49ms 0.79±0.40ms 0


In [17]:
def sensitivity_analysis(calibration_loader):
    """
    Analyze accuracy and inference time in the SST/MNIST model from quantizing just one module or layer at a time. 
    Your results will depend in part on the structure of your model; looping through the named_modules as in the documentation code will include modules as well as "leaf" layers.
    """

    # Hyperparameters
    input_size = 20 * 20  # 20x20 pixels
    hidden_size = 1024
    num_classes = 10
    num_hidden_layers = 2
    batch_size = 64
    learning_rate = 0.001
    epochs = 2
    
    # Create model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = FFNN(input_size, hidden_size, num_classes, num_hidden_layers)
    model = model.to(device)
    
    # Create dataloaders
    train_loader = create_dataloader('data/mnist_train.csv', batch_size, True)
    test_loader = create_dataloader('data/mnist_test.csv', batch_size, False)
    
    # Train base model
    model = train_model(model, train_loader, test_loader, epochs, learning_rate, device)
    
    # Evaluate base model
    model = model.to("cpu")
    base_acc = evaluate_model(model, test_loader)
    base_latency_b1, base_std_b1 = measure_inference_time(model, test_loader, 1)
    base_latency_b64, base_std_b64 = measure_inference_time(model, test_loader, 64)
    
    # Quantize one module at a time
    results = []
    model = model.cpu()
    example_input = torch.randn(1, 1, 20, 20)  # 1-channel (grayscale), 20x20
    for name, module in model.named_modules():
        
        print("Only quantizing part: ", name)

        # The module_name key allows module-specific qconfigs. 
        qconfig_dict = {"": None, 
                        "module_name": [(name, torch.ao.quantization.get_default_qconfig("fbgemm"))]}

        model_prepared = prepare_fx(model, qconfig_dict, example_input)
        # Calibrate with the training data
        model_prepared.eval()
        with torch.no_grad():
            for data, _ in calibration_loader:
                model_prepared(data.unsqueeze(1))  # Ensure correct input shape for calibration

        model_quantized = convert_fx(model_prepared)
        
        # Evaluate quantized model
        acc = evaluate_model(model_quantized, test_loader)
        latency_b1, std_b1 = measure_inference_time(model_quantized, test_loader, 1)
        latency_b64, std_b64 = measure_inference_time(model_quantized, test_loader, 64)
        
        results.append({
            "layer": name,
            "accuracy": acc,
            "latency_b1": latency_b1,
            "std_b1": std_b1,
            "latency_b64": latency_b64,
            "std_b64": std_b64
        })
    
    # Print results
    print("\nSensitivity Analysis Results:")
    print(f"{'Layer':<20} {'Accuracy':<10} {'Latency B1':<15} {'Std B1':<15} {'Latency B64':<15} {'Std B64':<15}")
    print("-" * 90)
    for result in results:
        print(f"{result['layer']:<20} {result['accuracy']:.2f}% {result['latency_b1']*1000:.2f}±{result['std_b1']*1000:.2f}ms {result['latency_b64']*1000:.2f}±{result['std_b64']*1000:.2f}ms")


In [18]:
test_loader = create_dataloader('data/mnist_test.csv', 64, False)
sensitivity_analysis(test_loader)

Training normal precision model for 2 epochs
Epoch 1, Accuracy: 97.05%
Epoch 2, Accuracy: 97.17%
Only quantizing part:  
Only quantizing part:  layers
Only quantizing part:  layers.0
Only quantizing part:  layers.1
Only quantizing part:  layers.2
Only quantizing part:  relu

Sensitivity Analysis Results:
Layer                Accuracy   Latency B1      Std B1          Latency B64     Std B64        
------------------------------------------------------------------------------------------
                     97.17% 0.00±0.00ms 1.22±0.23ms
layers               97.07% 0.70±0.40ms 0.60±0.49ms
layers.0             97.17% 1.10±0.49ms 5.00±3.04ms
layers.1             97.13% 1.03±0.32ms 1.26±0.39ms
layers.2             97.06% 1.00±0.00ms 2.91±0.74ms
relu                 97.17% 1.00±0.00ms 6.72±2.73ms


In [19]:
def sensitivity_analysis_exclude(calibration_loader):
    """
    Analyze accuracy and inference time in the SST/MNIST model by quantizing all but one module at a time.
    Only modules that do not have child modules will be looped through for this analysis.
    """

    # Hyperparameters
    input_size = 20 * 20  # 20x20 pixels
    hidden_size = 1024
    num_classes = 10
    num_hidden_layers = 2
    batch_size = 64
    learning_rate = 0.001
    epochs = 2
    
    # Create model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = FFNN(input_size, hidden_size, num_classes, num_hidden_layers)
    model = model.to(device)
    
    # Create dataloaders
    train_loader = create_dataloader('data/mnist_train.csv', batch_size, True)
    test_loader = create_dataloader('data/mnist_test.csv', batch_size, False)
    
    # Train base model
    model = train_model(model, train_loader, test_loader, epochs, learning_rate, device)
    
    # Evaluate base model
    model = model.to("cpu")
    base_acc = evaluate_model(model, test_loader)
    base_latency_b1, base_std_b1 = measure_inference_time(model, test_loader, 1)
    base_latency_b64, base_std_b64 = measure_inference_time(model, test_loader, 64)
    
    # Quantize all but one module at a time
    results = []
    model = model.cpu()
    example_input = torch.randn(1, 1, 20, 20)  # 1-channel (grayscale), 20x20
    
    # Identify leaf layers (layers without child modules)
    leaf_layers = [name for name, module in model.named_modules() if len(list(module.children())) == 0]
    
    for exclude_layer in leaf_layers:
        print(f"Quantizing all except layer: {exclude_layer}")

        # Set up qconfig dictionary
        qconfig_dict = {"": torch.ao.quantization.get_default_qconfig("fbgemm")}
        for layer in leaf_layers:
            if layer == exclude_layer:
                qconfig_dict["module_name"] = [(layer, None)]  # Skip quantization for this layer

        # Prepare and convert the model for quantization
        model_prepared = prepare_fx(model, qconfig_dict, example_input)

        # Calibrate
        model_prepared.eval()
        with torch.no_grad():
            for data, _ in calibration_loader:
                model_prepared(data.unsqueeze(1))

        model_quantized = convert_fx(model_prepared)
        
        # Evaluate the quantized model
        acc = evaluate_model(model_quantized, test_loader)
        latency_b1, std_b1 = measure_inference_time(model_quantized, test_loader, 1)
        latency_b64, std_b64 = measure_inference_time(model_quantized, test_loader, 64)
        
        results.append({
            "excluded_layer": exclude_layer,
            "accuracy": acc,
            "latency_b1": latency_b1,
            "std_b1": std_b1,
            "latency_b64": latency_b64,
            "std_b64": std_b64
        })
    
    # Print results
    print("\nSensitivity Analysis Results:")
    print(f"{'Excluded Layer':<20} {'Accuracy':<10} {'Latency B1':<15} {'Latency B64':<15}")
    print("-" * 60)
    for result in results:
        print(f"{result['excluded_layer']:<20} {result['accuracy']:.2f}% {result['latency_b1']*1000:.2f}±{result['std_b1']*1000:.2f}ms {result['latency_b64']*1000:.2f}±{result['std_b64']*1000:.2f}ms")


In [20]:
test_loader = create_dataloader('data/mnist_test.csv', 64, False)
sensitivity_analysis_exclude(test_loader)

Training normal precision model for 2 epochs
Epoch 1, Accuracy: 97.54%
Epoch 2, Accuracy: 97.55%
Quantizing all except layer: layers.0
Quantizing all except layer: layers.1
Quantizing all except layer: layers.2
Quantizing all except layer: relu

Sensitivity Analysis Results:
Excluded Layer       Accuracy   Latency B1      Latency B64    
------------------------------------------------------------
layers.0             97.43% 1.10±0.20ms 1.40±0.49ms
layers.1             97.44% 0.60±0.49ms 0.90±0.49ms
layers.2             97.53% 0.60±0.49ms 1.30±0.40ms
relu                 97.44% 0.80±0.40ms 1.09±0.64ms
