# Neural Network LC-Model Compression with PyTorch

## Environment Setup

In [None]:
# restart runtime after running cell
! git clone https://github.com/UCMerced-ML/LC-model-compression
! pip3 install -e ./LC-model-compression

In [2]:
# uc merced's lc-model compression algorithms
import lc
from lc.torch import ParameterTorch as Param, AsVector, AsIs
from lc.compression_types import ConstraintL0Pruning, LowRank, RankSelection, AdaptiveQuantization
from lc.models.torch import lenet300_classic, lenet300_modern_drop, lenet300_modern

# data science libraries
import torch
from torch import nn, optim
from torch.utils.data import TensorDataset, DataLoader, random_split
from torchvision import datasets

import numpy as np
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

# limit the number of CPU threads torch uses
torch.set_num_threads(4)
# GPU device handle used by the L-step and the LC compression tasks below
device = torch.device('cuda') 

## Reference Network and Utility Functions

In [None]:
class Net(nn.Module):
    """Reference CNN: two conv/max-pool stages followed by two linear layers.

    The network takes single-channel images and produces 5 class scores.
    ``fc1`` expects 4*4*50 flattened features, which is what two 5x5
    convolutions and two 2x2 poolings yield for 28x28 inputs.
    The loss function is stored on the model as ``self.loss`` so that
    training and evaluation code can call ``net.loss(out, target)``.
    """

    def __init__(self):
        super().__init__()
        self.loss = nn.CrossEntropyLoss()

        # Feature extractor.
        self.conv1 = nn.Conv2d(1, 20, kernel_size=5)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(20, 50, kernel_size=5)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Classifier head.
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(4 * 4 * 50, 500)
        self.fc2 = nn.Linear(500, 5)

    def forward(self, x):
        """Return the raw class scores (logits) for a batch of images."""
        relu = nn.functional.relu
        features = self.pool1(relu(self.conv1(x)))
        features = self.pool2(relu(self.conv2(features)))
        # Note: no non-linearity is applied between fc1 and fc2.
        hidden = self.fc1(self.flatten(features))
        return self.fc2(hidden)

In [None]:
def train_net(net, parameters, final_model=False):
    """Train ``net`` with SGD and a StepLR schedule, with early stopping.

    Args:
        net: model exposing ``net.loss(out, target)``. It is trained in
            place; batches are moved to the GPU with ``.cuda()``.
        parameters: dict providing ``batch_size``, ``lr``, ``momentum``,
            ``weight_decay``, ``nesterov``, ``step_size``, ``gamma`` and
            ``epochs_per_early_stop_check``.
        final_model: when True, train/validation accuracy and loss are
            recorded after every SGD step (for the final report); when
            False, accuracies are recorded once per epoch, which is enough
            during hyperparameter tuning.

    Returns:
        None. The recorded curves are plotted with ``visualize_accs`` and
        printed together with the hyperparameters and the last metrics.
    """
    train_accs, val_accs = [], []
    train_losses, val_losses = [], []

    # Early stopping: every `epochs_per_early_stop_check` epochs the current
    # validation accuracy must beat the best one seen so far by more than
    # `early_stop_thresh`, otherwise one unit of patience is consumed.
    max_val_acc = 0
    epochs_per_early_stop_check = parameters["epochs_per_early_stop_check"]
    early_stop_thresh = 1e-5
    initial_early_stop_patience = 3
    early_stop_patience = initial_early_stop_patience

    # Evaluation switches the model to eval mode; remember the caller's mode
    # so it can be restored before the next optimisation step.
    was_training = net.training

    train_loader, _, _ = data_loader(parameters["batch_size"])
    params = [p for p in net.parameters() if p.requires_grad]
    optimizer = optim.SGD(params,
                          lr=parameters["lr"],
                          momentum=parameters["momentum"],
                          weight_decay=parameters["weight_decay"],
                          nesterov=parameters["nesterov"])
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=parameters["step_size"], gamma=parameters["gamma"])
    max_epochs = 100
    for epoch in range(max_epochs):
        avg_loss = []
        for x, target in train_loader:
            optimizer.zero_grad()
            x = x.cuda()
            target = target.cuda().to(dtype=torch.long)
            out = net(x)
            loss = net.loss(out, target)
            avg_loss.append(loss.item())
            loss.backward()
            optimizer.step()
            # Final model: record metrics after every SGD step.
            if final_model:
                acc_train, loss_train, acc_val, loss_val = train_val_acc_eval_f(net.eval(), tuning=(not final_model))
                net.train(was_training)
                train_accs.append(acc_train)
                val_accs.append(acc_val)
                train_losses.append(loss_train)
                val_losses.append(loss_val)
        scheduler.step()

        print(f"\tepoch #{epoch} is finished.")
        print(f"\t  avg. train loss: {np.mean(avg_loss):.6f}")
        # Tuning: record metrics once per epoch only (losses are not stored).
        if not final_model:
            acc_train, loss_train, acc_val, loss_val = train_val_acc_eval_f(net.eval(), tuning=(not final_model))
            net.train(was_training)
            train_accs.append(acc_train)
            val_accs.append(acc_val)
        print(f"\t#Train err: {100-acc_train*100:.2f}%, train loss: {loss_train}")
        print(f"\t#Validation err: {100-acc_val*100:.5f}%, validation loss: {loss_val}\n")

        if epoch % epochs_per_early_stop_check == epochs_per_early_stop_check - 1:
            if max_val_acc + early_stop_thresh < acc_val:
                max_val_acc = acc_val
                early_stop_patience = initial_early_stop_patience
            else:
                early_stop_patience -= 1
            if early_stop_patience == 0:
                break

    # Row 0 holds the training curve, row 1 the validation curve.
    accs = np.array([train_accs, val_accs], dtype=float)
    losses = np.array([train_losses, val_losses], dtype=float) if final_model else None
    visualize_accs(accs, losses, final_model)
    print(accs)
    print(losses)
    print("#" + str(parameters))
    print(f"\t#Train err: {100-acc_train*100:.2f}%, train loss: {loss_train}")
    print(f"\t#Validation err: {100-acc_val*100:.5f}%, validation loss: {loss_val}\n")

In [None]:
def data_loader(batch_size=2048, n_workers=2, tuning=False):
    """Build train/validation/test loaders for a five-digit MNIST subset.

    Only the digits 0, 2, 5, 6 and 7 are kept and relabelled to 0..4 (the
    cross-entropy loss needs labels in the range 0..C-1). Pixels are scaled
    to [0, 1] and centred with the per-pixel mean of the selected training
    images. 30% of the training images are split off as the validation set.

    Args:
        batch_size: batch size used by all three loaders.
        n_workers: number of DataLoader worker processes.
        tuning: when True, only 40% of the train and validation sets are
            kept (for faster hyperparameter tuning).

    Returns:
        (train_loader, val_loader, test_loader); only the train loader
        shuffles.
    """
    kept_digits = [0, 2, 5, 6, 7]

    def _seeded():
        # Fixed seed so that every call produces the same splits.
        return torch.Generator().manual_seed(1778)

    def _select_and_relabel(mnist):
        labels = mnist.targets
        keep = labels == kept_digits[0]
        for digit in kept_digits[1:]:
            keep = keep | (labels == digit)
        images, labels = mnist.data[keep], labels[keep]
        for new_label, digit in enumerate(kept_digits):
            labels = torch.where(labels == digit, new_label, labels)
        return images, labels

    def _as_float_images(images):
        return np.array(images).reshape(-1, 1, 28, 28).astype(np.float32)

    mnist_train = datasets.MNIST(root='./datasets', download=True, train=True)
    mnist_test = datasets.MNIST(root='./datasets', download=True, train=False)

    train_images, train_targets = _select_and_relabel(mnist_train)
    test_images, test_targets = _select_and_relabel(mnist_test)

    # Scale to [0, 1], then centre both sets with the training mean.
    data_train = _as_float_images(train_images) / 255
    dtrain_mean = data_train.mean(axis=0)
    data_train -= dtrain_mean
    data_test = (_as_float_images(test_images) / 255).astype(np.float32)
    data_test -= dtrain_mean

    full_train = TensorDataset(torch.from_numpy(data_train), train_targets)
    test_data = TensorDataset(torch.from_numpy(data_test), test_targets)

    # Hold out 30% of the training data for validation.
    n_val = int(0.3 * len(full_train))
    train_data, val_data = random_split(full_train, [len(full_train) - n_val, n_val], generator=_seeded())

    if tuning:
        # Keep only a fraction of each set for hyperparameter tuning.
        subset_proportion = 0.4
        n_train_kept = int(len(train_data) * subset_proportion)
        n_val_kept = int(len(val_data) * subset_proportion)
        train_data, _ = random_split(train_data, [n_train_kept, len(train_data) - n_train_kept], generator=_seeded())
        val_data, _ = random_split(val_data, [n_val_kept, len(val_data) - n_val_kept], generator=_seeded())

    loader_args = dict(num_workers=n_workers, batch_size=batch_size)
    train_loader = DataLoader(train_data, shuffle=True, **loader_args)
    val_loader = DataLoader(val_data, shuffle=False, **loader_args)
    test_loader = DataLoader(test_data, shuffle=False, **loader_args)
    return train_loader, val_loader, test_loader

In [None]:
def compute_acc_loss(forward_func, data_loader):
    """Compute accuracy and average loss of ``forward_func`` over a loader.

    Args:
        forward_func: callable ``(x, target) -> (score, loss)`` where
            ``score`` holds one row of class scores per sample and ``loss``
            is a scalar tensor (treated as the mean loss of the batch).
        data_loader: iterable of ``(x, target)`` batches that also exposes
            ``.dataset`` (its length is used as the sample count).

    Returns:
        (accuracy, average_loss): the fraction of correctly classified
        samples and the sample-weighted mean of the batch losses.

    Raises:
        ZeroDivisionError: if ``data_loader.dataset`` is empty.
    """
    # Batches go to the GPU when one is available; on a CPU-only host they
    # stay on the CPU instead of failing in ``.cuda()``.
    eval_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    correct_cnt, ave_loss = 0, 0
    with torch.no_grad():
        for x, target in data_loader:
            x = x.to(eval_device)
            target = target.to(eval_device)
            score, loss = forward_func(x, target)
            _, pred_label = torch.max(score, 1)
            correct_cnt += (pred_label == target).sum().item()
            # Weight each batch loss by the batch size so that a smaller
            # last batch does not skew the average.
            ave_loss += loss.item() * len(x)

    n_samples = len(data_loader.dataset)
    accuracy = correct_cnt * 1.0 / n_samples
    ave_loss /= n_samples
    return accuracy, ave_loss

In [None]:
def compute_compression_ratio(lc_alg):
    """Return uncompressed model size divided by compressed model size (in bits).

    The compressed size is ``lc_alg.count_param_bits()`` plus the 575 bias
    values (20 + 50 + 500 + 5) stored at 32 bits each. The uncompressed size
    counts all conv and linear weights and biases at 32 bits each.
    Parameter counting follows
    https://towardsdatascience.com/understanding-and-calculating-the-number-of-parameters-in-convolution-neural-networks-cnns-fc88790d530d
    """
    bits_per_value = 32
    n_biases = 20 + 50 + 500 + 5
    # Convolutional layers: (in_channels * k * k + 1) * out_channels.
    n_conv = (1 * 5 * 5 + 1) * 20 + (20 * 5 * 5 + 1) * 50
    # Linear layers: weights of both layers plus their biases.
    n_linear = 4 * 4 * 50 * 500 + 500 * 5 + 500 + 5

    compressed_bits = lc_alg.count_param_bits() + n_biases * bits_per_value
    uncompressed_bits = (n_conv + n_linear) * bits_per_value
    return uncompressed_bits / compressed_bits

In [None]:
def forward_func(x, target):
    """Return ``(scores, loss)`` of the module-level ``net`` for one batch.

    The forward pass is evaluated once and its output reused for the loss,
    instead of running the network twice per batch.
    """
    out = net(x)
    return out, net.loss(out, target)

def train_test_acc_eval_f(net):
    """Print the train and test error/loss of ``net``.

    Args:
        net: model to evaluate; must expose ``net.loss(out, target)``.
    """
    train_loader, _, test_loader = data_loader()

    # Evaluate the model that was passed in rather than whichever model the
    # module-level ``net`` happens to refer to.
    def forward_func(x, target):
        out = net(x)
        return out, net.loss(out, target)

    with torch.no_grad():
        acc_train, loss_train = compute_acc_loss(forward_func, train_loader)
        acc_test, loss_test = compute_acc_loss(forward_func, test_loader)
    print(f"Train err: {100-acc_train*100:.2f}%, train loss: {loss_train}")
    print(f"TEST ERR: {100-acc_test*100:.2f}%, test loss: {loss_test}")

In [None]:
def train_test_acc_eval_f(net):
    """Evaluate ``net`` on the train and test sets and print error and loss.

    Used as the ``evaluation_func`` of the LC algorithm runs below.
    """
    def _score_and_loss(inputs, labels):
        logits = net(inputs)
        return logits, net.loss(logits, labels)

    loaders = data_loader()
    train_batches, test_batches = loaders[0], loaders[2]
    with torch.no_grad():
        acc_train, loss_train = compute_acc_loss(_score_and_loss, train_batches)
        acc_test, loss_test = compute_acc_loss(_score_and_loss, test_batches)

    print(f"train err: {100-acc_train*100:.2f}%, train loss: {loss_train}")
    print(f"test err: {100-acc_test*100:.2f}%, test loss: {loss_test}")

def test_acc_eval_f(net):
    """Return the test-set accuracy (a fraction in [0, 1]) of ``net``.

    Args:
        net: model to evaluate; must expose ``net.loss(out, target)``.
    """
    _, _, test_loader = data_loader()

    # Evaluate the model that was passed in rather than whichever model the
    # module-level ``net`` happens to refer to.
    def forward_func(x, target):
        out = net(x)
        return out, net.loss(out, target)

    with torch.no_grad():
        acc_test, _ = compute_acc_loss(forward_func, test_loader)
    return acc_test

def train_val_acc_eval_f(net, tuning):
    """Return ``(acc_train, loss_train, acc_val, loss_val)`` for ``net``.

    Args:
        net: model to evaluate; must expose ``net.loss(out, target)``.
        tuning: currently unused; the full train/validation sets are always
            evaluated.
            NOTE(review): ``data_loader`` accepts a ``tuning`` flag —
            confirm whether it should be forwarded here.
    """
    train_loader, val_loader, _ = data_loader()

    # Evaluate the model that was passed in rather than whichever model the
    # module-level ``net`` happens to refer to.
    def forward_func(x, target):
        out = net(x)
        return out, net.loss(out, target)

    with torch.no_grad():
        acc_train, loss_train = compute_acc_loss(forward_func, train_loader)
        acc_val, loss_val = compute_acc_loss(forward_func, val_loader)
    return acc_train, loss_train, acc_val, loss_val


In [None]:
def visualize_accs(accs, losses, final_model):
    """Plot train/validation accuracy curves and, for the final model, loss curves.

    Args:
        accs: array-like with two rows; row 0 is the training accuracy and
            row 1 the validation accuracy (fractions, as produced by
            ``compute_acc_loss``).
        losses: array-like with the same layout holding losses; only read
            when ``final_model`` is True.
        final_model: True when the curves were recorded per SGD step, False
            when they were recorded per epoch.
    """
    # 1-based positions: a value of 0 cannot be drawn on the log-scaled
    # x axis, which would hide the first recorded point.
    steps = np.arange(1, len(accs[0]) + 1)
    unit = 'SGD Step' if final_model else 'Epoch'

    def _plot_curves(curves, quantity):
        # One figure with the train and validation curve on log-log axes.
        plt.figure()
        ax = plt.gca()
        ax.plot(steps, curves[0], "b-", label="Train")
        ax.plot(steps, curves[1], "g-", label="Validation")
        ax.set_xlabel(unit)
        ax.set_title(f'{quantity} per {unit}')
        # The plotted values are accuracies expressed as fractions, so the
        # axis is labelled with the plain quantity name.
        ax.set_ylabel(quantity)
        ax.set_xscale('log')
        ax.set_yscale('log')
        ax.legend()

    _plot_curves(accs, 'Accuracy')
    if final_model:
        _plot_curves(losses, 'Loss')

def visualize_params(test_error, num_params):
    """Plot test error against the number of compressed parameters (log-log).

    Args:
        test_error: test errors in percent, one per compressed model.
        num_params: matching numbers of compressed parameters.
    """
    plt.figure()
    ax = plt.gca()
    ax.plot(num_params, test_error)
    ax.set_xlabel('Number of Compressed Parameters')
    ax.set_title('Number of Compressed Parameters vs. Test Error')
    ax.set_ylabel('Test Error (%)')
    ax.set_xscale('log')
    ax.set_yscale('log')
    # Only the upper limit is set: a lower limit of 0 is invalid on a
    # log-scaled axis.
    ax.set_ylim(top=10)

def visualize_ratios(test_error, ratios):
    """Plot test error (log scale) against compression ratio (linear scale).

    Args:
        test_error: test errors in percent, one per compressed model.
        ratios: matching compression ratios.
    """
    plt.figure()
    ax = plt.gca()
    ax.plot(ratios, test_error)
    ax.set_xlabel('Compression Ratio')
    ax.set_title('Compression Ratio  vs. Test Error')
    ax.set_ylabel('Test Error (%)')
    ax.set_xscale('linear')
    ax.set_yscale('log')
    # Only the upper limit is set: a lower limit of 0 is invalid on a
    # log-scaled axis.
    ax.set_ylim(top=10)

In [None]:
# This should only be used when reporting the final model - hard-coded to evaluate on test set
def report_confumat(net):
    """Plot the confusion matrix of ``net`` on the test set.

    The model is moved to the GPU, the first batch of the test loader is
    classified, and the resulting 5x5 confusion matrix is drawn with the
    original digit names on both axes.

    Args:
        net: trained model returning one row of class scores per sample.
    """
    net.cuda()

    # batch_size=10000 is meant to fetch the whole test subset in one batch;
    # only that first batch is evaluated.
    _, _, test_loader = data_loader(batch_size=10000, n_workers=0)
    test_set, test_labels = next(iter(test_loader))

    # Inference only: evaluate in eval mode without building an autograd
    # graph, then restore the mode the caller had set.
    was_training = net.training
    net.eval()
    with torch.no_grad():
        out = net(test_set.to(torch.device('cuda')))
    net.train(was_training)
    _, preds = out.max(1)

    labels = [0, 2, 5, 6, 7]
    class_ids = list(range(len(labels)))

    # Pin the class list so the matrix is always len(labels) x len(labels),
    # even if some class is absent from both the labels and the predictions.
    conf_matrix = confusion_matrix(y_true=test_labels.numpy(), y_pred=preds.cpu().detach().numpy(), labels=class_ids)

    fig, ax = plt.subplots(figsize=(7.5, 7.5))
    ax.matshow(conf_matrix)
    for i in range(conf_matrix.shape[0]):
        for j in range(conf_matrix.shape[1]):
            ax.text(x=j, y=i, s=conf_matrix[i, j], va='center', ha='center', size='xx-large')

    plt.xlabel('Predictions', fontsize=18)
    plt.ylabel('Actuals', fontsize=18)
    plt.title('Confusion Matrix', fontsize=18)
    # Fix the tick positions before naming them so that each row/column is
    # labelled with its digit regardless of the default tick locator.
    ax.set_xticks(class_ids)
    ax.set_xticklabels(labels)
    ax.set_yticks(class_ids)
    ax.set_yticklabels(labels)
    plt.show()

In [None]:
# Checkpoint (state dict) of the trained reference network; the file name encodes the training hyperparameters.
file_name = "/content/state_dicts/batch_size16__lr0.005__gamma0.95__step_size1__momentum0.9__weight_decay0__nesterovTrue__epochs_per_early_stop_check5.pt"

## L-Step and $\mu$-Schedule Definitions

In [None]:
# mu schedule for the LC algorithm: 30 values growing geometrically from 5e-5 by a factor of 1.15.
mu_s = [5e-5 * 1.15 ** lc_step for lc_step in range(30)]

def my_l_step(model, lc_penalty, step):
    """L-step of the LC algorithm: train ``model`` on loss + LC penalty.

    Args:
        model: network to optimise; must expose ``model.loss(out, target)``.
        lc_penalty: zero-argument callable whose result is added to the
            training loss of every batch.
        step: index of the current L-step. The learning rate decays as
            ``0.2 * 0.98**step`` and step 0 runs twice as many epochs.
    """
    batches, _, _ = data_loader()
    trainable = [p for p in model.parameters() if p.requires_grad]

    # Learning rate decays geometrically with the L-step index.
    lr = 0.2 * 0.98 ** step
    optimizer = optim.SGD(trainable, lr=lr, momentum=0.9, nesterov=True)
    print(f'L-step #{step} with lr: {lr:.5f}')

    # The very first L-step gets a doubled epoch budget.
    n_epochs = 20 if step == 0 else 10
    for epoch in range(n_epochs):
        batch_losses = []
        for inputs, labels in batches:
            optimizer.zero_grad()
            inputs = inputs.to(device)
            labels = labels.to(dtype=torch.long, device=device)
            objective = model.loss(model(inputs), labels) + lc_penalty()
            batch_losses.append(objective.item())
            objective.backward()
            optimizer.step()

        print(f"\tepoch #{epoch} is finished.")
        print(f"\t  avg. train loss: {np.mean(batch_losses):.6f}")

## Pruning

In [None]:
# Reload the trained reference network from its checkpoint.
net = Net().cuda()
net.load_state_dict(torch.load(file_name))

# One zero-argument getter per conv/linear layer, each returning that layer's weight.
layers = [
    lambda m=m: m.weight
    for m in net.modules()
    if isinstance(m, (nn.Linear, nn.Conv2d))
]

# Single pruning task over all layer weights viewed as one vector.
compression_tasks = {
    Param(layers, device): (AsVector, ConstraintL0Pruning(kappa=360000), 'pruning'),
}

lc_settings = dict(
    model=net,                              # model to compress
    compression_tasks=compression_tasks,    # specifications of compression
    l_step_optimization=my_l_step,          # implementation of L-step
    mu_schedule=mu_s,                       # schedule of mu values
    evaluation_func=train_test_acc_eval_f,  # evaluation function
)
lc_alg = lc.Algorithm(**lc_settings)

lc_alg.run()
print('Compressed_params:', lc_alg.count_params())
print('Compression_ratio:', compute_compression_ratio(lc_alg))

### Pruning Results

kappa = 20000:
- train err: 79.89%, train loss: nan
- test err: 79.96%, test loss: nan
- compressed params: 20000
- compression ratio: 19.637840330142076

kappa = 40000:
- train err: 0.00%, train loss: 0.0002841083364256174
- test err: 0.59%, test loss: 0.02454856142310277
- Compressed_params: 40000
- compression ratio: 9.01502218850799

kappa = 80000:
- train err: 0.00%, train loss: 0.00014816065872306467
- test err: 0.53%, test loss: 0.024508386036735372
- compressed params: 80000
- compression ratio: 4.707208436345564

kappa = 120000: 
- train err: 0.00%, train loss: 8.489977729248691e-05
- test err: 0.53%, test loss: 0.027143318327962743
- compressed params: 120000
- compression ratio: 3.206413950135802

kappa = 160000: 
- train err: 0.00%, train loss: 4.5211303780846254e-05
- test err: 0.49%, test loss: 0.029226327829200066
- compressed params: 160000
- compression ratio: 2.438292413031914

kappa = 240000:
- train err: 0.00%, train loss: 2.1779679784315724e-05
- test err: 0.51%, test loss: 0.031638809090864924
- compressed params: 240000
- compression ratio: 1.656376113226443

kappa = 360000:
- train err: 0.00%, train loss: 1.7712191409147578e-05
- test err: 0.53%, test loss: 0.03311194237816797
- compressed params: 360000
- compression ratio: 1.1171031039638002

## Quantization

In [None]:
# Reload the trained reference network from its checkpoint.
net = Net().cuda()
net.load_state_dict(torch.load(file_name))

# One zero-argument getter per conv/linear layer, each returning that layer's weight.
layers = [
    lambda m=m: m.weight
    for m in net.modules()
    if isinstance(m, (nn.Linear, nn.Conv2d))
]

# Every layer gets its own quantization task with a codebook of size k=2
# (k=2 for each layer gave a compression ratio of about 30.7 in the recorded results).
compression_tasks = {
    Param(layer, device): (AsVector, AdaptiveQuantization(k=2), f'layer{idx}_quant')
    for idx, layer in enumerate(layers)
}

lc_settings = dict(
    model=net,                              # model to compress
    compression_tasks=compression_tasks,    # specifications of compression
    l_step_optimization=my_l_step,          # implementation of L-step
    mu_schedule=mu_s,                       # schedule of mu values
    evaluation_func=train_test_acc_eval_f,  # evaluation function
)
lc_alg = lc.Algorithm(**lc_settings)
lc_alg.run()
print('Compressed_params:', lc_alg.count_params())
print('Compression_ratio:', compute_compression_ratio(lc_alg))

### Quantization Results

k=64 for each layer:
- TEST ERR: 0.61%, test loss: 0.03328548995416101
- Compressed_params: 428256
- Compression_ratio: 5.285763619096953

k=32 for each layer:
- TEST ERR: 0.59%, test loss: 0.031655197397087734
- Compressed_params: 428128
- Compression_ratio: 6.341930805883572

k=16 for each layer:
- TEST ERR: 0.57%, test loss: 0.03163435025814852
- Compressed_params: 428064
- Compression_ratio: 7.916197196106319

k=8 for each layer:
- TEST ERR: 0.61%, test loss: 0.03194960453728469
- Compressed_params: 428032
- Compression_ratio: 10.521825591672394

k=4 for each layer:
- TEST ERR: 0.74%, test loss: 0.027538442431905275
- Compressed_params: 428016
- Compression_ratio: 15.67517647489119

k=2 for each layer:
- TEST ERR: 0.76%, test loss: 0.022537321148626888
- Compressed_params: 428008
- Compression_ratio: 30.704613841524573


## Low-Rank

In [None]:
# Reload the trained reference network from its checkpoint.
net = Net().cuda()
net.load_state_dict(torch.load(file_name))

# One zero-argument getter per conv/linear layer, each returning that layer's weight.
layers = [
    lambda m=m: m.weight
    for m in net.modules()
    if isinstance(m, (nn.Linear, nn.Conv2d))
]

# alpha - compression parameter of the rank selection
alpha = 1e-9

# The two conv layers use 'scheme_2', the two linear layers 'scheme_1'.
conv_schemes = ['scheme_2', 'scheme_2', 'scheme_1', 'scheme_1']
compression_tasks = {
    Param(layer, device): (
        AsIs,
        RankSelection(conv_scheme=scheme, alpha=alpha, criterion='storage', module=layer, normalize=True),
        f"layer{number}_lr",
    )
    for number, (layer, scheme) in enumerate(zip(layers, conv_schemes), start=1)
}

lc_settings = dict(
    model=net,                              # model to compress
    compression_tasks=compression_tasks,    # specifications of compression
    l_step_optimization=my_l_step,          # implementation of L-step
    mu_schedule=mu_s,                       # schedule of mu values
    evaluation_func=train_test_acc_eval_f,  # evaluation function
)
lc_alg = lc.Algorithm(**lc_settings)
lc_alg.run()
print('Compressed_params:', lc_alg.count_params())
print('Compression_ratio:', compute_compression_ratio(lc_alg))

### Low-Rank Results

alpha = 1e-9
- TEST ERR: 0.59%, test loss: 0.028039038540883056
- Compressed_params: 100300
- Compression_ratio: 4.2721658476562805

alpha = 1.5e-9
- TEST ERR: 0.59%, test loss: 0.02737808789913152
- Compressed_params: 57450
- Compression_ratio: 7.457632648622194

alpha = 2e-9
- TEST ERR: 1.86%, test loss: 0.07579609481713036
- Compressed_params: 25150
- Compression_ratio: 17.028589166537326

alpha = 2.625e-9
- TEST ERR: 1.53%, test loss: 0.0711624405881988
- Compressed_params: 19450
- Compression_ratio: 22.01436654761427

alpha = 2.5e-9
- TEST ERR: 1.53%, test loss: 0.07167478040287344
- Compressed_params: 18500
- Compression_ratio: 23.143737079694553

## Pruning and Quantization

In [None]:
# Hyperparameters of the reference model; the checkpoint file name is built from
# them as "<key><value>" pairs joined by "__".
parameters = {'batch_size': 16, 'lr': 0.005, 'gamma': 0.95, 'step_size': 1, 'momentum': 0.9, 'weight_decay': 0, 'nesterov': True, 'epochs_per_early_stop_check': 5}
file_name = "__".join(f"{key}{value}" for key, value in parameters.items()) + ".pt"
# NOTE(review): this path is relative ("content/..."), while the first checkpoint
# path in this notebook is absolute ("/content/...") — confirm which one is intended.
net = Net().cuda()
net.load_state_dict(torch.load("content/state_dicts/" + file_name))

# One zero-argument getter per conv/linear layer, each returning that layer's weight.
layers = [
    lambda m=m: m.weight
    for m in net.modules()
    if isinstance(m, (nn.Linear, nn.Conv2d))
]

# Two compressions (pruning and quantization) attached to the same set of weights.
compression_tasks = {
    Param(layers, device): [
        (AsVector, ConstraintL0Pruning(kappa=20000), 'pruning'),
        (AsVector, AdaptiveQuantization(k=2), 'quant'),
    ],
}

lc_settings = dict(
    model=net,                              # model to compress
    compression_tasks=compression_tasks,    # specifications of compression
    l_step_optimization=my_l_step,          # implementation of L-step
    mu_schedule=mu_s,                       # schedule of mu values
    evaluation_func=train_test_acc_eval_f,  # evaluation function
)
lc_alg = lc.Algorithm(**lc_settings)
lc_alg.run()
print('Compressed_params:', lc_alg.count_params())
print('Compression_ratio:', compute_compression_ratio(lc_alg))

## Pruning and Low-Rank

In [None]:
# Hyperparameters of the reference model; the checkpoint file name is built from
# them as "<key><value>" pairs joined by "__".
parameters = {'batch_size': 16, 'lr': 0.005, 'gamma': 0.95, 'step_size': 1, 'momentum': 0.9, 'weight_decay': 0, 'nesterov': True, 'epochs_per_early_stop_check': 5}
file_name = "__".join(f"{key}{value}" for key, value in parameters.items()) + ".pt"
net = Net().cuda()
net.load_state_dict(torch.load("content/state_dicts/" + file_name))

# One zero-argument getter per conv/linear layer, each returning that layer's weight.
layers = [
    lambda m=m: m.weight
    for m in net.modules()
    if isinstance(m, (nn.Linear, nn.Conv2d))
]

# NOTE(review): the section heading says "Pruning and Low-Rank", but the tasks
# configured here are pruning plus quantization — confirm which was intended.
compression_tasks = {
    Param(layers, device): [
        (AsVector, ConstraintL0Pruning(kappa=2662), 'pruning'),
        (AsVector, AdaptiveQuantization(k=2), 'quant'),
    ],
}

lc_settings = dict(
    model=net,                              # model to compress
    compression_tasks=compression_tasks,    # specifications of compression
    l_step_optimization=my_l_step,          # implementation of L-step
    mu_schedule=mu_s,                       # schedule of mu values
    evaluation_func=train_test_acc_eval_f,  # evaluation function
)
lc_alg = lc.Algorithm(**lc_settings)
lc_alg.run()
print('Compressed_params:', lc_alg.count_params())
print('Compression_ratio:', compute_compression_ratio(lc_alg))

## Pruning, Quantization, and Low-Rank

In [None]:
# Hyperparameters of the reference model; the checkpoint file name is built from
# them as "<key><value>" pairs joined by "__".
parameters = {'batch_size': 16, 'lr': 0.005, 'gamma': 0.95, 'step_size': 1, 'momentum': 0.9, 'weight_decay': 0, 'nesterov': True, 'epochs_per_early_stop_check': 5}
file_name = "__".join(f"{key}{value}" for key, value in parameters.items()) + ".pt"
net = Net().cuda()
net.load_state_dict(torch.load("content/state_dicts/" + file_name))

# One zero-argument getter per conv/linear layer, each returning that layer's weight.
layers = [
    lambda m=m: m.weight
    for m in net.modules()
    if isinstance(m, (nn.Linear, nn.Conv2d))
]
conv1_w, conv2_w, fc1_w, fc2_w = layers[0], layers[1], layers[2], layers[3]

# alpha - compression parameter of the rank selection
alpha = 7.5e-8

# Conv layers are pruned, the first linear layer is made low-rank and the
# last linear layer is quantized.
# NOTE(review): both pruning tasks share the name 'pruning' — confirm that
# the library does not require unique task names.
low_rank = RankSelection(conv_scheme='scheme_1', alpha=alpha, criterion='storage', module=fc1_w, normalize=True)
compression_tasks = {
    Param(conv1_w, device): (AsVector, ConstraintL0Pruning(kappa=104), 'pruning'),
    Param(conv2_w, device): (AsVector, ConstraintL0Pruning(kappa=2505), 'pruning'),
    Param(fc1_w, device): (AsIs, low_rank, "layer2_lr"),
    Param(fc2_w, device): (AsVector, AdaptiveQuantization(k=2), 'layer3_quant'),
}

lc_settings = dict(
    model=net,                              # model to compress
    compression_tasks=compression_tasks,    # specifications of compression
    l_step_optimization=my_l_step,          # implementation of L-step
    mu_schedule=mu_s,                       # schedule of mu values
    evaluation_func=train_test_acc_eval_f,  # evaluation function
)
lc_alg = lc.Algorithm(**lc_settings)
lc_alg.run()
print('Compressed_params:', lc_alg.count_params())
print('Compression_ratio:', compute_compression_ratio(lc_alg))

layer 0 kappa = 104
layer 1 kappa = 2505
layer 2 k = 2
layer 3 alpha = 1e-9
- Train err: 0.06%, train loss: 0.0024381824390053057
- TEST ERR: 0.74%, test loss: 0.025427111382986627
- Compressed_params: 405136
- Compression_ratio: 22.971771039394586

layer 0 kappa = 104
layer 1 kappa = 2505
layer 2 k = 2
layer 3 alpha = 2.5e-9
- Train err: 0.09%, train loss: 0.0027759067967801187
- TEST ERR: 0.67%, test loss: 0.025797346877296033
- Compressed_params: 405136
- Compression_ratio: 22.973618130436893

layer 0 kappa = 104
layer 1 kappa = 2505
layer 2 alpha = 2e-9
layer 3 k = 2
- Train err: 0.19%, train loss: 0.005692896033211272
- TEST ERR: 1.17%, test loss: 0.03771758408024755
- Compressed_params: 10311
- Compression_ratio: 48.08948577239336

layer 0 kappa = 104
layer 1 kappa = 2505
layer 2 alpha = 2.5e-9
layer 3 k = 2
- Train err: 0.04%, train loss: 0.0023015917632584425
- TEST ERR: 0.80%, test loss: 0.026643005014989517
- Compressed_params: 10311
- Compression_ratio: 48.08661930842

layer 0 kappa = 104
layer 1 kappa = 2505
layer 2 alpha = 3e-9
layer 3 k = 2
- Train err: 0.09%, train loss: 0.0029669747408118475
- TEST ERR: 0.80%, test loss: 0.029288866328254556
- Compressed_params: 10311
- Compression_ratio: 48.08594489596993

## Low-Rank with Automatic Rank Selection

In [None]:
# Hyperparameters of the reference model; the checkpoint file name is derived from them.
parameters = {'batch_size': 16, 'lr': 0.005, 'gamma': 0.95, 'step_size': 1, 'momentum': 0.9, 'weight_decay': 0, 'nesterov': True, 'epochs_per_early_stop_check': 5}
file_name = str(parameters).replace(' ', '').replace(':', '').replace('\'', '').replace(',', '__').strip('{').strip('}') + ".pt"
net = Net().cuda()
net.load_state_dict(torch.load("content/state_dicts/" + file_name))

# One zero-argument getter per conv/linear layer, each returning that layer's weight.
layers = [lambda x=x: getattr(x, 'weight') for x in net.modules() if isinstance(x, (nn.Linear, nn.Conv2d))]
# ----------------- alpha - compression parameter
alpha=1e-9
# alpha=1e-3
# ------------------------------------------------
# One rank-selection task per layer. Each task receives its own layer as
# `module` and a unique name, matching the configuration of the "Low-Rank"
# cell earlier in this notebook.
compression_tasks = {
    Param(layers[0], device): (AsIs, RankSelection(conv_scheme='scheme_2', alpha=alpha, criterion='storage', module=layers[0], normalize=True), "layer1_lr"),
    Param(layers[1], device): (AsIs, RankSelection(conv_scheme='scheme_2', alpha=alpha, criterion='storage', module=layers[1], normalize=True), "layer2_lr"),
    Param(layers[2], device): (AsIs, RankSelection(conv_scheme='scheme_1', alpha=alpha, criterion='storage', module=layers[2], normalize=True), "layer3_lr"),
    Param(layers[3], device): (AsIs, RankSelection(conv_scheme='scheme_1', alpha=alpha, criterion='storage', module=layers[3], normalize=True), "layer4_lr")
}

lc_alg = lc.Algorithm(
    model=net,                            # model to compress
    compression_tasks=compression_tasks,  # specifications of compression
    l_step_optimization=my_l_step,        # implementation of L-step
    mu_schedule=mu_s,                     # schedule of mu values
    evaluation_func=train_test_acc_eval_f # evaluation function
)
lc_alg.run()
print('Compressed_params:', lc_alg.count_params())
print('Compression_ratio:', compute_compression_ratio(lc_alg))

## Linear Quantization and Convolutional Pruning

In [None]:
# Reload the trained reference network from its checkpoint.
net = Net().cuda()
net.load_state_dict(torch.load(file_name))

# One zero-argument getter per conv/linear layer, each returning that layer's weight.
layers = [
    lambda m=m: m.weight
    for m in net.modules()
    if isinstance(m, (nn.Linear, nn.Conv2d))
]
conv1_w, conv2_w, fc1_w, fc2_w = layers[0], layers[1], layers[2], layers[3]

# The conv layers are pruned and the linear layers are quantized with a
# codebook of size k = 2.
compression_tasks = {
    # parameters of this layer: (1*5*5 + 1)*20 = 520
    Param(conv1_w, device): (AsVector, ConstraintL0Pruning(kappa=104), 'pruning'),
    # parameters of this layer: (20*5*5 + 1)*50 = 25,050
    Param(conv2_w, device): (AsVector, ConstraintL0Pruning(kappa=5010), 'pruning'),
    Param(fc1_w, device): (AsVector, AdaptiveQuantization(k=2), 'layer2_quant'),
    Param(fc2_w, device): (AsVector, AdaptiveQuantization(k=2), 'layer3_quant'),
}

lc_settings = dict(
    model=net,                              # model to compress
    compression_tasks=compression_tasks,    # specifications of compression
    l_step_optimization=my_l_step,          # implementation of L-step
    mu_schedule=mu_s,                       # schedule of mu values
    evaluation_func=train_test_acc_eval_f,  # evaluation function
)
lc_alg = lc.Algorithm(**lc_settings)
lc_alg.run()
print('Compressed_params:', lc_alg.count_params())
print('Compression_ratio:', compute_compression_ratio(lc_alg))

With this compression scheme, we again keep our quantization codebook at 2 for both layers, but apply quantization to the convolutional
layers only. On the linear layers, we apply pruning with different numbers of parameters proportional to the number of starting
parameters for each layer. We saw strange results here, with lower compression ratios giving higher error. The best compressed 
model that we got with this compression scheme had relatively high error of .78%, with a compression ratio of only approx. 17.
We noticed that this scheme gave some of our worst results, and is the inverse of the scheme that gave some of our best results 
(quantization on linear layers and pruning on conv layers). This leads us to believe that, for this model, quantization is more
effective on linear layers and pruning is more effective on convolutional layers.

C1
- layer 0 k = 2
- layer 1 k = 2
- layer 2 kappa = 100000
- layer 3 kappa = 1000
- Train err: 0.11%, train loss: 0.004736287437032822
- TEST ERR: 0.84%, test loss: 0.02545553339999512
- Compressed_params: 126504
- Compression_ratio: 3.766573718079878

C2
- layer 0 k = 2
- layer 1 k = 2
- layer 2 kappa = 80000
- layer 3 kappa = 500
- Train err: 0.14%, train loss: 0.005909851407126863
- TEST ERR: 0.82%, test loss: 0.024643742107235824
- Compressed_params: 106004
- Compression_ratio: 4.660869919845843

C3
- layer 0 k = 2
- layer 1 k = 2
- layer 2 kappa = 40000
- layer 3 kappa = 250
- Train err: 0.10%, train loss: 0.004272402206060383
- TEST ERR: 0.84%, test loss: 0.02374215452476635
- Compressed_params: 65754
- Compression_ratio: 8.876197675713039

C4
- layer 0 k = 2
- layer 1 k = 2
- layer 2 kappa = 20000
- layer 3 kappa = 125
- Train err: 0.06%, train loss: 0.0037158044668766416
- TEST ERR: 0.78%, test loss: 0.022713467773575725
- Compressed_params: 45629
- Compression_ratio: 16.641104294478527

## Linear Quantization and Convolutional Pruning Results

With this compression scheme, we applied pruning to the convolutional layers, and quantization to the linear layers. Our initial
findings were that the number of parameters of the first layer and the codebook size of both linear layers could be pretty
low without much harm to the model. This compression scheme gave us one of our best trade-offs between compression ratio and error, with
a test error of 0.51% and a compression ratio of approx. 26.4. This was obtained by keeping 104 parameters in the first convolutional
layer and 2505 parameters in the second conv layer, with a codebook size of 2 for both linear layers. We tried increasing the compression
a little further, and saw that the error rose enough (0.57% at a ratio of approx. 29.0) that it was not worth pushing further. It looks like we got
lucky with these particular compression parameters, as the test error for the model mentioned above is lower than those of 
models compressed with the same scheme but with more parameters in the second layer. A plot of this compression type's test error vs
number of parameters in the second layer is given in Figure X.

layer 1 kappa =  104
layer 2 kappa = 12525
layer 3 k = 2
layer 4 k = 2
- Train err: 0.01%, train loss: 0.0014465828186132072
- TEST ERR: 0.53%, test loss: 0.020963299728488143
- Compressed_params: 415289
- Compression_ratio: 15.891025551894028

layer 1 kappa =  104
layer 2 kappa = 5010
layer 3 k = 2
layer 4 k = 2
- Train err: 0.00%, train loss: 0.001723080134206964
- TEST ERR: 0.55%, test loss: 0.01985923921677964
- Compressed_params: 407618
- Compression_ratio: 22.635991067346463

layer 1 kappa =  104
layer 2 kappa = 2505
layer 3 k = 2
layer 4 k = 2
- Train err: 0.03%, train loss: 0.0024527820672789343
- TEST ERR: 0.51%, test loss: 0.02095057955792589
- Compressed_params: 405113
- Compression_ratio: 26.42542655375396

layer 1 kappa =  104
layer 2 kappa = 1250
layer 3 k = 2
layer 4 k = 2
- Train err: 0.06%, train loss: 0.003353178411787159
- TEST ERR: 0.57%, test loss: 0.0208197200697502
- Compressed_params: 403858
- Compression_ratio: 28.963396605759566

## Linear Pruning and Convolutional Quantization

In [None]:
# Linear pruning + convolutional quantization.
# Rebuild the network and restore the saved reference weights before compressing.
net = Net().cuda()
net.load_state_dict(torch.load(file_name))

# One zero-argument weight getter per Conv2d/Linear module, in net.modules() order:
# layers[0], layers[1] are the conv layers; layers[2], layers[3] are the linear layers.
layers = [
    (lambda x=x: x.weight)
    for x in net.modules()
    if isinstance(x, (nn.Linear, nn.Conv2d))
]

compression_tasks = {
    # conv layers: quantization with a codebook of size k = 2
    Param(layers[0], device): (
        AsVector,
        AdaptiveQuantization(k=2),
        'layer0_quant',
    ),
    Param(layers[1], device): (
        AsVector,
        AdaptiveQuantization(k=2),
        'layer1_quant',
    ),
    # linear layers: L0-constrained pruning
    # parameters of this layer: 800*500 = 400,000
    Param(layers[2], device): (
        AsVector,
        ConstraintL0Pruning(kappa=100000),
        'pruning',
    ),
    # parameters of this layer: 500*5 = 2,500
    Param(layers[3], device): (
        AsVector,
        ConstraintL0Pruning(kappa=1000),
        'pruning',
    ),
}

lc_alg = lc.Algorithm(
    model=net,                              # network being compressed
    compression_tasks=compression_tasks,    # per-layer compression specification
    mu_schedule=mu_s,                       # sequence of mu values
    l_step_optimization=my_l_step,          # L-step implementation
    evaluation_func=train_test_acc_eval_f,  # train/test evaluation callback
)
lc_alg.run()

# Report the size of the compressed model.
print('Compressed_params:', lc_alg.count_params())
print('Compression_ratio:', compute_compression_ratio(lc_alg))

With this compression scheme, we again keep our quantization codebook size at 2, but apply quantization to the convolutional
layers only. On the linear layers, we apply pruning, keeping a number of parameters in each layer proportional to that layer's
starting parameter count. We saw strange results here: lower compression ratios did not give lower error (0.84% test error at a
compression ratio of about 3.8, versus 0.78% at about 16.6). The best compressed
model that we got with this compression scheme had a relatively high error of 0.78%, with a compression ratio of only approximately 17.
We noticed that this scheme gave some of our worst results, and is the inverse of the scheme that gave some of our best results 
(quantization on linear layers and pruning on conv layers). This leads us to believe that, for this model, quantization is more
effective on linear layers and pruning is more effective on convolutional layers.

layer 0 k = 2
layer 1 k = 2
layer 2 kappa = 100000
layer 3 kappa = 1000
- Train err: 0.11%, train loss: 0.004736287437032822
- TEST ERR: 0.84%, test loss: 0.02545553339999512
- Compressed_params: 126504
- Compression_ratio: 3.766573718079878

layer 0 k = 2
layer 1 k = 2
layer 2 kappa = 80000
layer 3 kappa = 500
- Train err: 0.14%, train loss: 0.005909851407126863
- TEST ERR: 0.82%, test loss: 0.024643742107235824
- Compressed_params: 106004
- Compression_ratio: 4.660869919845843

layer 0 k = 2
layer 1 k = 2
layer 2 kappa = 40000
layer 3 kappa = 250
- Train err: 0.10%, train loss: 0.004272402206060383
- TEST ERR: 0.84%, test loss: 0.02374215452476635
- Compressed_params: 65754
- Compression_ratio: 8.876197675713039

layer 0 k = 2
layer 1 k = 2
layer 2 kappa = 20000
layer 3 kappa = 125
- Train err: 0.06%, train loss: 0.0037158044668766416
- TEST ERR: 0.78%, test loss: 0.022713467773575725
- Compressed_params: 45629
- Compression_ratio: 16.641104294478527

## Linear Pruning and Convolutional Low-Rank

In [None]:
# Linear pruning + convolutional low-rank.
# Rebuild the network and restore the saved reference weights before compressing.
net = Net().cuda()
net.load_state_dict(torch.load(file_name))

# One zero-argument weight getter per Conv2d/Linear module, in net.modules() order:
# layers[0], layers[1] are the conv layers; layers[2], layers[3] are the linear layers.
layers = [
    (lambda x=x: x.weight)
    for x in net.modules()
    if isinstance(x, (nn.Linear, nn.Conv2d))
]

compression_tasks = {
    # conv layers: low-rank with automatic rank selection; `alpha` is read from the notebook scope
    Param(layers[0], device): (
        AsIs,
        RankSelection(
            conv_scheme='scheme_2',
            alpha=alpha,
            criterion='storage',
            module=layers[0],
            normalize=True,
        ),
        "layer1_lr",
    ),
    Param(layers[1], device): (
        AsIs,
        RankSelection(
            conv_scheme='scheme_2',
            alpha=alpha,
            criterion='storage',
            module=layers[1],
            normalize=True,
        ),
        "layer2_lr",
    ),
    # linear layers: L0-constrained pruning
    # parameters of this layer: 800*500 = 400,000
    Param(layers[2], device): (
        AsVector,
        ConstraintL0Pruning(kappa=80000),
        'pruning',
    ),
    # parameters of this layer: 500*5 = 2,500
    Param(layers[3], device): (
        AsVector,
        ConstraintL0Pruning(kappa=500),
        'pruning',
    ),
}

lc_alg = lc.Algorithm(
    model=net,                              # network being compressed
    compression_tasks=compression_tasks,    # per-layer compression specification
    mu_schedule=mu_s,                       # sequence of mu values
    l_step_optimization=my_l_step,          # L-step implementation
    evaluation_func=train_test_acc_eval_f,  # train/test evaluation callback
)
lc_alg.run()

# Report the size of the compressed model.
print('Compressed_params:', lc_alg.count_params())
print('Compression_ratio:', compute_compression_ratio(lc_alg))

With this compression scheme, we again observed that pruning was not working very well on the linear layers. For very low 
compression ratios, we were already seeing error rates higher than we had seen before with much higher compression ratios. When
we pushed the model to a compression ratio close to 20, it collapsed to roughly 80% train and test error, so we decided to stop there. 

alpha = 1e-9
layer 2 kappa = 80000
layer 3 kappa = 500
- Train err: 0.00%, train loss: 0.000249883399526372
- TEST ERR: 0.63%, test loss: 0.022979293916001155
- Compressed_params: 111125
- Compression_ratio: 3.5195712376729418

alpha = 2.625e-9
layer 2 kappa = 40000
layer 3 kappa = 250
- Train err: 0.00%, train loss: 0.0003304009178703896
- TEST ERR: 0.57%, test loss: 0.023337179302011287
- Compressed_params: 56875
- Compression_ratio: 6.693464472083884

alpha = 2.625e-9
layer 2 kappa = 15000
layer 3 kappa = 250
- Train err: 80.15%, train loss: 1.6093867913697117
- TEST ERR: 80.41%, test loss: 1.6099159806296381
- Compressed_params: 17875
- Compression_ratio: 19.249415404136105

## Linear Low-Rank and Convolutional Quantization

In [None]:
# Linear low-rank + convolutional quantization.
# Rebuild the network and restore the saved reference weights before compressing.
net = Net().cuda()
net.load_state_dict(torch.load(file_name))

# One zero-argument weight getter per Conv2d/Linear module, in net.modules() order:
# layers[0], layers[1] are the conv layers; layers[2], layers[3] are the linear layers.
layers = [
    (lambda x=x: x.weight)
    for x in net.modules()
    if isinstance(x, (nn.Linear, nn.Conv2d))
]

compression_tasks = {
    # conv layers: quantization with a codebook of size k = 2
    Param(layers[0], device): (
        AsVector,
        AdaptiveQuantization(k=2),
        'layer0_quant',
    ),
    Param(layers[1], device): (
        AsVector,
        AdaptiveQuantization(k=2),
        'layer1_quant',
    ),
    # linear layers: low-rank with automatic rank selection; `alpha` is read from the notebook scope
    Param(layers[2], device): (
        AsIs,
        RankSelection(
            conv_scheme='scheme_1',
            alpha=alpha,
            criterion='storage',
            module=layers[2],
            normalize=True,
        ),
        "layer3_lr",
    ),
    Param(layers[3], device): (
        AsIs,
        RankSelection(
            conv_scheme='scheme_1',
            alpha=alpha,
            criterion='storage',
            module=layers[3],
            normalize=True,
        ),
        "layer4_lr",
    ),
}

lc_alg = lc.Algorithm(
    model=net,                              # network being compressed
    compression_tasks=compression_tasks,    # per-layer compression specification
    mu_schedule=mu_s,                       # sequence of mu values
    l_step_optimization=my_l_step,          # L-step implementation
    evaluation_func=train_test_acc_eval_f,  # train/test evaluation callback
)
lc_alg.run()

# Report the size of the compressed model.
print('Compressed_params:', lc_alg.count_params())
print('Compression_ratio:', compute_compression_ratio(lc_alg))

With this compression scheme, we decided to keep the codebook size at 2 for the quantization of linear layers for all runs,
since we have noticed that models tend to work pretty well with that type of compression. Here, we varied the amount of low-rank
compression on the convolutional layers by changing the alpha value. There were no stand-out compressed models from this compression
scheme, and the results are shown in Figure X.

 alpha = 1e-9
 k = 2
 - Train err: 0.00%, train loss: 0.0007940481794554047
 - TEST ERR: 0.55%, test loss: 0.018662374498653996
 - Compressed_params: 434179
 - Compression_ratio: 9.559551326197454

 alpha = 2.5e-9
 k = 2
 - Train err: 0.00%, train loss: 0.0007536800849421109
 - TEST ERR: 0.65%, test loss: 0.02101732165169862
 - Compressed_params: 422279
 - Compression_ratio: 13.01388841442816

 alpha = 5e-9
 k = 2
 - Train err: 0.00%, train loss: 0.0006984795862447846
 - TEST ERR: 0.59%, test loss: 0.020446154859168397
 - Compressed_params: 410379
 - Compression_ratio: 20.37716112851174

 alpha = 1e-8
 k = 2
 - TEST ERR: 0.67%, test loss: 0.020718296900665834
 - Train err: 0.00%, train loss: 0.0009552586485285225
 - Compressed_params: 407229
 - Compression_ratio: 23.966670627791718

 alpha = 2.5e-8
 k = 2
 - Train err: 0.01%, train loss: 0.0013898124918792858
 - TEST ERR: 0.74%, test loss: 0.02361567228240226
 - Compressed_params: 406179
 - Compression_ratio: 25.461728688445458

 alpha = 5e-8
 k = 2
- Train err: 78.45%, train loss: 1.60869908425235
- TEST ERR: 78.98%, test loss: 1.6089411411792467
- Compressed_params: 404114
- Compression_ratio: 29.02223689445305

## Linear Low-Rank and Convolutional Pruning

In [None]:
# Linear low-rank + convolutional pruning.
# Rebuild the network and restore the saved reference weights before compressing.
net = Net().cuda()
net.load_state_dict(torch.load(file_name))

# One zero-argument weight getter per Conv2d/Linear module, in net.modules() order:
# layers[0], layers[1] are the conv layers; layers[2], layers[3] are the linear layers.
layers = [lambda x=x: getattr(x, 'weight') for x in net.modules() if isinstance(x, nn.Linear) or isinstance(x, nn.Conv2d)]

# The linear layers use low-rank compression (RankSelection), as this section's heading
# and reported results (which vary alpha) describe. The cell previously applied
# AdaptiveQuantization(k=2) to them, which is the "linear quantization + convolutional
# pruning" scheme instead.
# NOTE: `alpha` is read from the notebook scope, exactly as in the other low-rank cells;
# set it (e.g. 1e-9 or 2.5e-9, the values reported below) before running this cell.
compression_tasks = {
    Param(layers[0], device): (AsVector, ConstraintL0Pruning(kappa=104), 'pruning'), # parameters of this layer: (1*5*5 + 1)*20 = 520
    Param(layers[1], device): (AsVector, ConstraintL0Pruning(kappa=5010), 'pruning'), # parameters of this layer: (20*5*5 + 1)*50 = 25,050
    Param(layers[2], device): (AsIs, RankSelection(conv_scheme='scheme_1', alpha=alpha, criterion='storage', module=layers[2], normalize=True), "layer3_lr"),
    Param(layers[3], device): (AsIs, RankSelection(conv_scheme='scheme_1', alpha=alpha, criterion='storage', module=layers[3], normalize=True), "layer4_lr")
}

lc_alg = lc.Algorithm(
    model=net,                            # model to compress
    compression_tasks=compression_tasks,  # specifications of compression
    l_step_optimization=my_l_step,        # implementation of L-step
    mu_schedule=mu_s,                     # schedule of mu values
    evaluation_func=train_test_acc_eval_f # evaluation function
)
lc_alg.run()

# Report the size of the compressed model.
print('Compressed_params:', lc_alg.count_params())
print('Compression_ratio:', compute_compression_ratio(lc_alg))

This compression scheme gave us our best trade-off between compression ratio and error: with 104 parameters in the first layer, 2505 parameters
in the second layer, and an alpha of $2.5 \times 10^{-9}$ for low-rank compression applied to the linear layers, the resulting error
was 0.53% and the compression ratio was 37.7. We have noticed a trend that some of our best models were compressed with pruning
on the convolutional layers, so when we tried to combine all three compression types into one model, we decided to fix the 
compression on the first two layers to be pruning with 104 parameters for the first layer and 2505 parameters for the second
layer.

layer 0 kappa = 104
layer 1 kappa = 5010
alpha = 1e-9
- Train err: 0.00%, train loss: 0.00023444037877254245
- TEST ERR: 0.57%, test loss: 0.023048661883867103
- Compressed_params: 41439
- Compression_ratio: 10.04371385404743

layer 0 kappa = 104
layer 1 kappa = 5010
alpha = 2.5e-9
- Train err: 0.00%, train loss: 0.00023940207609047666
- TEST ERR: 0.61%, test loss: 0.022994919822138023
- Compressed_params: 12839
- Compression_ratio: 30.45465244869227

layer 0 kappa = 104
layer 1 kappa = 2505
alpha = 2.5e-9
- Train err: 0.00%, train loss: 0.00026679373799296197
- TEST ERR: 0.53%, test loss: 0.023120258530651378
- Compressed_params: 10334
- Compression_ratio: 37.73248519798384

layer 0 kappa = 104
layer 1 kappa = 2505
alpha = 2.75e-9
- Train err: 0.00%, train loss: 0.0002578537257355746
- TEST ERR: 0.57%, test loss: 0.02328361051884653
- Compressed_params: 10334
- Compression_ratio: 37.72926725264447

## Linear Quantization and Convolutional Low-Rank

With this compression scheme, we decided to keep the codebook size at 2 for the quantization of linear layers for all runs,
since we have noticed that models tend to work pretty well with that type of compression. Here, we varied the amount of low-rank
compression on the convolutional layers by changing the alpha value. There were no stand-out compressed models from this compression
scheme, and the results are shown in Figure X.

 alpha = 1e-9
 k = 2
 - Train err: 0.00%, train loss: 0.0007940481794554047
 - TEST ERR: 0.55%, test loss: 0.018662374498653996
 - Compressed_params: 434179
 - Compression_ratio: 9.559551326197454

 alpha = 2.5e-9
 k = 2
 - Train err: 0.00%, train loss: 0.0007536800849421109
 - TEST ERR: 0.65%, test loss: 0.02101732165169862
 - Compressed_params: 422279
 - Compression_ratio: 13.01388841442816

 alpha = 5e-9
 k = 2
 - Train err: 0.00%, train loss: 0.0006984795862447846
 - TEST ERR: 0.59%, test loss: 0.020446154859168397
 - Compressed_params: 410379
 - Compression_ratio: 20.37716112851174

 alpha = 1e-8
 k = 2
 - Train err: 0.00%, train loss: 0.0009552586485285225
 - TEST ERR: 0.67%, test loss: 0.020718296900665834
 - Compressed_params: 407229
 - Compression_ratio: 23.966670627791718

 alpha = 2.5e-8
 k = 2
 - Train err: 0.01%, train loss: 0.0013898124918792858
 - TEST ERR: 0.74%, test loss: 0.02361567228240226
 - Compressed_params: 406179
 - Compression_ratio: 25.461728688445458

 alpha = 5e-8
 k = 2
 - Train err: 78.45%, train loss: 1.60869908425235
 - TEST ERR: 78.98%, test loss: 1.6089411411792467
 - Compressed_params: 404114
- Compression_ratio: 29.02223689445305