In [1]:
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from tqdm import tqdm

In [2]:
from layers import CompressionLayer, QuantizationLayer, HardQuantizationThresholdRoundingLayer, HardQuantizationLayer, HardQuantizationThresholdLayer
from models import MultiLayerPerceptron
from datasets import get_dataloader
from training_utils import train_model, eval_val, eval_quantization

In [3]:
from joblib import Parallel, delayed

## Load Dataset

In [4]:
# Dataset identifier; it is passed to get_dataloader here and reused later in the results file path.
dataset = 'California_Housing'
# get_dataloader returns three loaders, bound here as the train / validation / test splits.
train_loader, val_loader, test_loader = get_dataloader(dataset = dataset)

### Utils

In [5]:
def estimate_quantile(train_loader, quantiles):
    """Compute empirical quantiles of the inputs yielded by a loader.

    Args:
        train_loader: Iterable of ``(inputs, target)`` pairs. Only ``inputs`` is used;
            all batches are concatenated along dim 0 before the quantiles are taken.
        quantiles: 1-D tensor of quantile levels, forwarded to ``torch.quantile``.

    Returns:
        The ``torch.quantile`` result (taken along dim 0) with its first two axes
        swapped. For 2-D inputs of shape ``(n_samples, n_features)`` this is a
        ``(n_features, n_quantiles)`` tensor.
    """
    # Gather the input half of every batch.
    batches = [inputs for inputs, _ in train_loader]

    # One big sample matrix, then quantiles over the sample axis.
    stacked = torch.cat(batches, dim=0)
    per_level = torch.quantile(stacked, quantiles, dim=0)

    # Swap axes so the quantile levels end up on the second axis.
    return per_level.transpose(0, 1)

def get_quantization_thresholds(train_loader, n_bits):
    """Derive quantization thresholds from evenly spaced data quantiles.

    The quantile levels are ``k / 2**n_bits`` for ``k = 1 .. 2**n_bits - 1``; the
    thresholds are whatever ``estimate_quantile`` returns for those levels.

    Args:
        train_loader: Iterable of ``(inputs, target)`` batches, forwarded to
            ``estimate_quantile``.
        n_bits: Bit budget; yields ``2**n_bits - 1`` quantile levels.

    Returns:
        The tensor returned by ``estimate_quantile`` for the generated levels.
    """
    n_levels = 2 ** n_bits
    step = 1 / n_levels
    # Levels step, 2*step, ... strictly below 1.
    quantile_levels = torch.arange(step, 1, step)
    return estimate_quantile(train_loader, quantile_levels)

def get_min_max_values(train_loader, num_features):
    """Track the element-wise minimum and maximum of the inputs over all batches.

    Args:
        train_loader: Iterable of ``(inputs, target)`` pairs; only ``inputs`` is used
            and it is reduced along dim 0 for every batch.
        num_features: Length of the running min/max vectors.

    Returns:
        ``(min_values, max_values)``. If the loader yields nothing, these stay at
        ``+inf`` and ``-inf`` respectively.
    """
    running_min = torch.tensor([float('inf') for _ in range(num_features)])
    running_max = torch.tensor([float('-inf') for _ in range(num_features)])

    for inputs, _ in train_loader:
        batch_min = inputs.min(dim=0).values
        batch_max = inputs.max(dim=0).values
        running_min = torch.min(running_min, batch_min)
        running_max = torch.max(running_max, batch_max)

    return running_min, running_max

### Random Search for DNNs

In [6]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU so the
# notebook still runs (more slowly) on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [7]:
def train_mlp_model(architecture, min_values, max_values, thresholds,
                    num_epochs=100, learning_rate=0.001, weight_decay=0.0001, add_noise=False,
                    n_bits=8):
    """Train a MultiLayerPerceptron without quantization and validate it three ways.

    The network is fitted once through ``train_model``. It is then scored on
    ``val_loader`` as-is, behind a ``HardQuantizationLayer`` (built from ``n_bits``
    and the min/max values), and behind a ``HardQuantizationThresholdRoundingLayer``
    (built from ``thresholds``). Both quantizers are attached only after training.

    Reads the notebook-level ``train_loader``, ``test_loader``, ``val_loader`` and
    ``device``.

    Args:
        architecture: Layer specification handed to ``MultiLayerPerceptron``.
        min_values, max_values: Forwarded to ``HardQuantizationLayer``.
        thresholds: Forwarded to ``HardQuantizationThresholdRoundingLayer``.
        num_epochs: Forwarded to ``train_model``.
        learning_rate, weight_decay: Adam settings.
        add_noise: Forwarded to ``train_model``.
        n_bits: Forwarded to ``HardQuantizationLayer``.

    Returns:
        ``(val_loss_mlp, val_loss_hard_post_mlp, val_loss_hard_thr_post_mlp)`` as
        returned by ``eval_val``.
    """
    mlp = MultiLayerPerceptron(architecture)
    mlp.to(device)

    loss_fn = nn.MSELoss()
    adam = torch.optim.Adam(mlp.parameters(), lr=learning_rate, weight_decay=weight_decay)

    # The value returned by train_model is not used by this function.
    train_model(mlp, num_epochs=num_epochs,
                train_loader=train_loader, test_loader=test_loader,
                optimizer=adam, criterion=loss_fn, has_quantization_layer=False,
                train_quantization_layer=False, print_result=False,
                add_noise=add_noise, device=device)

    # Quantizers are built after training and put in front of the already fitted network.
    minmax_quantizer = HardQuantizationLayer(n_bits=n_bits, min_values=min_values, max_values=max_values)
    threshold_quantizer = HardQuantizationThresholdRoundingLayer(thresholds=thresholds)

    post_minmax = nn.Sequential(minmax_quantizer, mlp)
    post_minmax.to(device)

    post_threshold = nn.Sequential(threshold_quantizer, mlp)
    post_threshold.to(device)

    def validate(candidate):
        return eval_val(model=candidate, val_dataloader=val_loader, criterion=loss_fn, device=device)

    # Evaluated left to right: bare MLP, min/max quantizer, threshold quantizer.
    return validate(mlp), validate(post_minmax), validate(post_threshold)
    
def train_mlp_pre_model(architecture, min_values, max_values, thresholds,
                        num_epochs=100, learning_rate=0.001, weight_decay=0.0001, add_noise=False,
                        n_bits=8):
    """Train two MLPs that each sit behind a fixed hard quantizer from the start.

    One network is paired with a ``HardQuantizationLayer`` (``n_bits``, min/max
    values), the other with a ``HardQuantizationThresholdLayer`` (``thresholds``).
    Each pair is trained with its own Adam optimizer through ``train_model`` and
    then scored on ``val_loader`` with ``eval_val``.

    Reads the notebook-level ``train_loader``, ``test_loader``, ``val_loader`` and
    ``device``.

    Args:
        architecture: Layer specification handed to ``MultiLayerPerceptron`` (used twice).
        min_values, max_values: Forwarded to ``HardQuantizationLayer``.
        thresholds: Forwarded to ``HardQuantizationThresholdLayer``.
        num_epochs: Forwarded to ``train_model``.
        learning_rate, weight_decay: Adam settings.
        add_noise: Forwarded to ``train_model``.
        n_bits: Forwarded to ``HardQuantizationLayer``.

    Returns:
        ``(val_loss_hard_pre_mlp, val_loss_hard_pre_thr_mlp)``.
    """
    # Construction order is kept: both quantizers first, then both networks.
    minmax_quantizer = HardQuantizationLayer(n_bits=n_bits, min_values=min_values, max_values=max_values)
    threshold_quantizer = HardQuantizationThresholdLayer(thresholds=thresholds)
    net_minmax = MultiLayerPerceptron(architecture)
    net_threshold = MultiLayerPerceptron(architecture)

    pipelines = []
    for front_end, net in ((minmax_quantizer, net_minmax), (threshold_quantizer, net_threshold)):
        pipeline = nn.Sequential(front_end, net)
        pipeline.to(device)
        pipelines.append(pipeline)

    # Train and validate one pipeline at a time, min/max variant first.
    losses = []
    for pipeline in pipelines:
        loss_fn = nn.MSELoss()
        adam = torch.optim.Adam(pipeline.parameters(), lr=learning_rate, weight_decay=weight_decay)
        train_model(pipeline, num_epochs=num_epochs,
                    train_loader=train_loader, test_loader=test_loader,
                    optimizer=adam, criterion=loss_fn, has_quantization_layer=False,
                    train_quantization_layer=False, print_result=False,
                    add_noise=add_noise, device=device)
        losses.append(eval_val(model=pipeline, val_dataloader=val_loader, criterion=loss_fn, device=device))

    val_loss_hard_pre_mlp, val_loss_hard_pre_thr_mlp = losses
    return val_loss_hard_pre_mlp, val_loss_hard_pre_thr_mlp

def train_soft_mlp(architecture, min_values, max_values, thresholds,
                   num_epochs=100, learning_rate=0.001, weight_decay=0.0001, add_noise=False,
                   n_bits=8, decrease_factor=0.001):
    """Train an MLP jointly with a trainable ``QuantizationLayer`` and validate it.

    The quantization layer is initialised from ``thresholds`` and trained together
    with the network. Afterwards the same network is also scored behind a
    ``HardQuantizationThresholdLayer`` built from the quantization layer's
    ``thresholds`` attribute as it stands after training.

    Reads the notebook-level ``train_loader``, ``test_loader``, ``val_loader`` and
    ``device``.

    Args:
        architecture: Layer specification handed to ``MultiLayerPerceptron``.
        min_values, max_values: Not used here; accepted so the signature matches
            the other ``train_*`` helpers.
        thresholds: Initial thresholds; ``shape[0]`` is used as the number of
            features and ``shape[1]`` as the number of thresholds per feature.
        num_epochs: Forwarded to ``train_model``.
        learning_rate, weight_decay: Adam settings.
        add_noise: Forwarded to ``train_model``.
        n_bits: Not used here; accepted for signature parity.
        decrease_factor: Forwarded to ``train_model``.

    Returns:
        ``(val_loss_soft_mlp, val_loss_soft_hard_mlp)`` as returned by ``eval_val``.
    """
    num_features = thresholds.shape[0]
    num_thresholds_per_feature = thresholds.shape[1]

    quantization_model = QuantizationLayer(num_features=num_features,
                                           num_thresholds_per_feature=num_thresholds_per_feature,
                                           tau=1)
    quantization_model.set_thresholds(thresholds)
    mlp = MultiLayerPerceptron(architecture)

    model_soft_mlp = nn.Sequential(quantization_model, mlp)
    model_soft_mlp.to(device)

    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model_soft_mlp.parameters(), lr=learning_rate, weight_decay=weight_decay)
    # The value returned by train_model is not used by this function.
    train_model(model_soft_mlp, num_epochs=num_epochs,
                train_loader=train_loader, test_loader=test_loader,
                optimizer=optimizer, criterion=criterion, has_quantization_layer=True,
                train_quantization_layer=True, print_result=False, decrease_factor=decrease_factor,
                add_noise=add_noise, device=device)

    # Swap the trained quantization layer for a hard threshold layer using its thresholds.
    quantization_thr_model = HardQuantizationThresholdLayer(thresholds=quantization_model.thresholds)
    model_soft_hard_mlp = nn.Sequential(quantization_thr_model, mlp)
    # Move to the device like every other composite model in this notebook, so the
    # evaluation does not depend on how the threshold layer stores its tensor.
    model_soft_hard_mlp.to(device)

    val_loss_soft_mlp = eval_val(model=model_soft_mlp, val_dataloader=val_loader, criterion=criterion, device=device)
    val_loss_soft_hard_mlp = eval_val(model=model_soft_hard_mlp, val_dataloader=val_loader, criterion=criterion, device=device)

    return val_loss_soft_mlp, val_loss_soft_hard_mlp

def train_soft_comp_mlp(architecture, min_values, max_values, thresholds,
                        num_epochs=100, learning_rate=0.001, weight_decay=0.0001, add_noise=False,
                        n_bits=8, decrease_factor=0.001):
    """Train an MLP jointly with a trainable ``CompressionLayer`` and validate it.

    The compression layer is initialised from the flattened ``thresholds``. The
    model is scored on ``val_loader`` once right after training and once more
    after ``set_round_quantization(True)`` has been called on the compression layer.

    Reads the notebook-level ``train_loader``, ``test_loader``, ``val_loader`` and
    ``device``.

    Args:
        architecture: Layer sizes for ``MultiLayerPerceptron``. The first entry is
            replaced by ``num_features * num_thresholds_per_feature`` on a private
            copy; the caller's sequence is left untouched.
        min_values, max_values: Not used here; accepted so the signature matches
            the other ``train_*`` helpers.
        thresholds: Initial thresholds; ``shape[0]`` is used as the number of
            features and ``shape[1]`` as the number of thresholds per feature.
        num_epochs: Forwarded to ``train_model``.
        learning_rate, weight_decay: Adam settings.
        add_noise: Forwarded to ``train_model``.
        n_bits: Not used here; accepted for signature parity.
        decrease_factor: Forwarded to ``train_model``.

    Returns:
        ``(val_loss_soft_thr_mlp, val_loss_soft_hard_thr_mlp)`` as returned by ``eval_val``.
    """
    num_features = thresholds.shape[0]
    num_thresholds_per_feature = thresholds.shape[1]

    # One entry per threshold, each tagged with the index of the feature it belongs to.
    comp_model = CompressionLayer(a_init=thresholds.flatten(),
                                  a_index=torch.repeat_interleave(torch.arange(num_features), num_thresholds_per_feature),
                                  tau=1)

    # Work on a copy: assigning into the caller's list would silently change the
    # input width seen by any model built from the same list afterwards.
    mlp_architecture = list(architecture)
    # NOTE(review): presumably the CompressionLayer emits one value per threshold,
    # hence this input width -- confirm against layers.py.
    mlp_architecture[0] = num_features * num_thresholds_per_feature
    mlp = MultiLayerPerceptron(mlp_architecture)

    model_soft_thr_mlp = nn.Sequential(comp_model, mlp)
    model_soft_thr_mlp.to(device)

    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model_soft_thr_mlp.parameters(), lr=learning_rate, weight_decay=weight_decay)
    # The value returned by train_model is not used by this function.
    train_model(model_soft_thr_mlp, num_epochs=num_epochs,
                train_loader=train_loader, test_loader=test_loader,
                optimizer=optimizer, criterion=criterion, has_quantization_layer=True,
                train_quantization_layer=True, print_result=False, decrease_factor=decrease_factor,
                add_noise=add_noise, device=device)

    val_loss_soft_thr_mlp = eval_val(model=model_soft_thr_mlp, val_dataloader=val_loader, criterion=criterion, device=device)
    # Same model object, re-evaluated after enabling round quantization on the compression layer.
    comp_model.set_round_quantization(True)
    val_loss_soft_hard_thr_mlp = eval_val(model=model_soft_thr_mlp, val_dataloader=val_loader, criterion=criterion, device=device)
    return val_loss_soft_thr_mlp, val_loss_soft_hard_thr_mlp

In [8]:
def random_search_soft_quantization_threshold(n_steps=10, n_bits=8, num_features=8, optimize_dict={}, device='cpu'):
    """Random search over MLP hyperparameters for every quantization variant.

    For each of ``n_steps`` trials one value is drawn with ``random.choice`` for
    every hyperparameter listed in ``optimize_dict``; unlisted ones keep their
    defaults. The four ``train_*`` helpers are then run with the same settings and
    their validation losses are recorded.

    Reads the notebook-level ``train_loader`` to derive the quantization thresholds
    and the per-feature min/max values.

    Args:
        n_steps: Number of random trials.
        n_bits: Bit budget used for the thresholds and passed to the trainers.
        num_features: Number of input features; sizes the min/max vectors and the
            first layer of the MLP architecture.
        optimize_dict: Maps a hyperparameter name to the list of candidate values.
            Accepted keys: ``weight_decay``, ``learning_rate``, ``hidden_layers``,
            ``hidden_neurons``, ``num_epochs``, ``add_noise``, ``decrease_factor``.
            Only read, never modified.
        device: NOTE(review): currently unused. The ``train_*`` helpers read the
            notebook-level ``device`` global instead of taking it as an argument.

    Returns:
        ``pandas.DataFrame`` with one row per trial, holding the hyperparameter
        columns followed by the nine validation-loss columns, sorted by
        ``val_loss_mlp`` ascending. An ``add_noise`` column is included only when
        ``add_noise`` is part of ``optimize_dict``.

    Raises:
        ValueError: If ``optimize_dict`` contains an unknown hyperparameter name
            (raised during the first trial).
    """
    thresholds = get_quantization_thresholds(train_loader, n_bits)
    min_values, max_values = get_min_max_values(train_loader, num_features=num_features)

    # Current value of every hyperparameter; starts at the defaults.
    hyperparameters = {
        'weight_decay': 0,
        'learning_rate': 0.001,
        'hidden_layers': 2,
        'hidden_neurons': 256,
        'num_epochs': 30,
        'add_noise': False,
        'decrease_factor': 0.001,
    }

    hyperparameter_columns = ['weight_decay', 'learning_rate', 'hidden_layers',
                              'hidden_neurons', 'num_epochs', 'decrease_factor']
    if 'add_noise' in optimize_dict:
        # Record add_noise only when it is actually searched over, so the result
        # layout stays unchanged for existing callers.
        hyperparameter_columns.append('add_noise')

    loss_columns = ['val_loss_mlp', 'val_loss_hard_post_mlp', 'val_loss_hard_thr_post_mlp',
                    'val_loss_hard_pre_mlp', 'val_loss_hard_thr_pre_mlp',
                    'val_loss_soft_mlp', 'val_loss_soft_hard_mlp',
                    'val_loss_soft_comp_mlp', 'val_loss_soft_hard_comp_mlp']

    # Column name -> values, one entry appended per completed trial.
    columns = {name: [] for name in hyperparameter_columns + loss_columns}

    # Perform random search
    for _ in tqdm(range(n_steps)):
        for key, choices in optimize_dict.items():
            if key not in hyperparameters:
                raise ValueError(f"Unknown hyperparameter: {key}")
            hyperparameters[key] = random.choice(choices)

        # Input width follows num_features instead of a hard-coded 8.
        architecture = ([num_features]
                        + [hyperparameters['hidden_neurons']] * hyperparameters['hidden_layers']
                        + [1])

        # Settings shared by all four trainers. add_noise is passed on so that a
        # sampled value actually reaches the training runs.
        shared = dict(min_values=min_values, max_values=max_values, thresholds=thresholds,
                      num_epochs=hyperparameters['num_epochs'],
                      learning_rate=hyperparameters['learning_rate'],
                      weight_decay=hyperparameters['weight_decay'],
                      add_noise=hyperparameters['add_noise'],
                      n_bits=n_bits)

        row = {name: hyperparameters[name] for name in hyperparameter_columns}

        # Every trainer receives its own copy of the architecture list, so none of
        # them can affect another through a shared list.

        # Plain MLP, with post-training quantization variants
        (row['val_loss_mlp'],
         row['val_loss_hard_post_mlp'],
         row['val_loss_hard_thr_post_mlp']) = train_mlp_model(architecture=list(architecture), **shared)

        # MLPs trained behind a fixed hard quantizer
        (row['val_loss_hard_pre_mlp'],
         row['val_loss_hard_thr_pre_mlp']) = train_mlp_pre_model(architecture=list(architecture), **shared)

        # MLP trained jointly with a trainable quantization layer
        (row['val_loss_soft_mlp'],
         row['val_loss_soft_hard_mlp']) = train_soft_mlp(
            architecture=list(architecture), decrease_factor=hyperparameters['decrease_factor'], **shared)

        # MLP trained jointly with a trainable compression layer
        (row['val_loss_soft_comp_mlp'],
         row['val_loss_soft_hard_comp_mlp']) = train_soft_comp_mlp(
            architecture=list(architecture), decrease_factor=hyperparameters['decrease_factor'], **shared)

        for name, value in row.items():
            columns[name].append(value)

    # Create DataFrame with results, best baseline MLP first
    results_df = pd.DataFrame(columns)
    results_df = results_df.sort_values('val_loss_mlp')  # Sort by loss ascending
    return results_df

In [9]:
# Bit budget for this run; it is also part of the results file name written later.
n_bits = 4

# 150 random trials; each trial draws one value per hyperparameter listed below.
results_df_all = random_search_soft_quantization_threshold(
    n_bits=n_bits,
    n_steps=150,
    optimize_dict={
        'weight_decay': [0, 0.0001],
        'learning_rate': [0.001, 0.0001],
        'hidden_layers': [3, 4, 5, 6],
        'hidden_neurons': [128, 256, 512, 1024, 2048, 4096],
        'num_epochs': [30, 50, 70],
        'decrease_factor': [0.001, 0.0001],
    },
    device=device,
)

100%|██████████| 150/150 [11:59:25<00:00, 287.77s/it]  


In [10]:
# Show the first rows. The search function returns the frame sorted by val_loss_mlp
# (ascending), so these are the trials with the lowest baseline MLP validation loss.
results_df_all.head()

Unnamed: 0,weight_decay,learning_rate,hidden_layers,hidden_neurons,num_epochs,decrease_factor,val_loss_mlp,val_loss_hard_post_mlp,val_loss_hard_thr_post_mlp,val_loss_hard_pre_mlp,val_loss_hard_thr_pre_mlp,val_loss_soft_mlp,val_loss_soft_hard_mlp,val_loss_soft_comp_mlp,val_loss_soft_hard_comp_mlp
55,0.0,0.001,3,256,50,0.001,0.266625,6.056498,0.345149,0.461592,0.347203,0.288408,0.291408,0.296838,0.298835
147,0.0001,0.001,3,256,70,0.001,0.266889,5.323131,0.333658,0.451262,0.344442,0.304309,0.306016,0.289422,0.293439
146,0.0001,0.001,6,128,50,0.0001,0.267646,4.266414,0.353401,0.463285,0.344008,0.295517,0.295759,0.296939,0.297019
120,0.0001,0.001,5,2048,70,0.001,0.272618,6.882819,0.353799,0.454359,0.334896,0.295002,0.296485,0.279243,0.282143
10,0.0001,0.0001,5,256,70,0.0001,0.272845,5.71034,0.346443,0.458922,0.360545,0.33501,0.334952,0.29207,0.291813


In [12]:
# Persist the search results. The target directory is created first so that a
# missing folder cannot cost us the outcome of a many-hour search.
results_path = f'results/{dataset}/random_search_results_all_{n_bits}bits.csv'
os.makedirs(os.path.dirname(results_path), exist_ok=True)
results_df_all.to_csv(results_path, index=False)