In [1]:
# Torch stuff
from torchvision.models import resnet50, ResNet50_Weights
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch
import glob
from PIL import Image
from sklearn.model_selection import train_test_split
import torch.optim as optim

# Compression modules
import numpy as np
from pathlib import Path
from Comp4AI.notebooks.pysz.pysz import SZ
import zfpy

import sys
import time

In [2]:
# Select the compute device once; later cells read the module-level `device`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Report which backend was picked.
if device.type == 'cuda':
    print('CUDA is available. Using GPU.')
else:
    print('CUDA is not available. Using CPU.')

CUDA is available. Using GPU.


In [3]:
# Preprocessing pipeline applied to each image before it reaches the network:
# convert to a tensor, normalise each of the three channels, then resize to 256x256.
# NOTE(review): these mean/std values look like the commonly quoted CIFAR-10
# statistics rather than the ImageNet ones usually paired with torchvision's
# pretrained ResNet-50 -- confirm this is intentional. Any saved fine-tuned
# weights depend on the values used here, so do not change them casually.
resnet_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    transforms.Resize((256,256))])

In [4]:
# Build torchvision's ResNet-50 initialised with its default pretrained weights.
resnet = resnet50(weights=ResNet50_Weights.DEFAULT)
# Put the model in inference mode; as the cell's last expression this also
# echoes the architecture in the notebook output.
resnet.eval()

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [5]:
# Replace the classification head with a fresh, randomly initialised linear
# layer sized for this notebook's label set.
num_classes = 10
# Take the input width from the existing head instead of hard-coding 2048, so
# the line stays correct if the backbone is swapped for another ResNet variant.
resnet.fc = nn.Linear(resnet.fc.in_features, num_classes)

In [6]:
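# Uncomment to load previously fine-tuned weights. While this stays commented
# out, the replacement `fc` head is randomly initialised, so any evaluation
# that follows measures an untrained classifier (expect roughly chance accuracy).
# resnet.load_state_dict(torch.load("finetuned_resnet.pt"))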

In [7]:
# Collect every image laid out as subclasses-tiny-imagenet/<class wnid>/images/<file>.JPEG.
# NOTE(review): `recursive=True` only changes how a `**` component is matched, so it
# has no effect on this pattern. glob also returns matches in no guaranteed order,
# so anything derived from this list (e.g. a seeded split) is only repeatable while
# the directory listing order stays the same -- consider sorting if that matters.
pattern = 'subclasses-tiny-imagenet/*/images/*.JPEG'
jpeg_files = glob.glob(pattern, recursive=True)

In [8]:
class FineTuningDataset(Dataset):
    """In-memory image dataset used to fine-tune the 10-way classification head.

    Every file is opened, converted to RGB when necessary, passed through the
    transform and cached as an ``(image_tensor, label)`` pair at construction
    time, so ``__getitem__`` is a plain list lookup.
    """

    # Class directory name (WordNet id) -> contiguous label index in [0, 9].
    CLASS_TO_INDEX = {'n02410509':0,
                      'n02106662':1,
                      'n07734744':2,
                      'n07873807':3,
                      'n07920052':4,
                      'n09428293':5,
                      'n01910747':6,
                      'n01882714':7,
                      'n04285008':8,
                      'n04146614':9}

    # An earlier, disabled mapping used these indices instead (kept for reference;
    # NOTE(review): they look like ImageNet-1k class ids -- unverified):
    #   n02410509:347, n02106662:235, n07734744:947, n07873807:963, n07920052:967,
    #   n09428293:978, n01910747:107, n01882714:105, n04285008:817, n04146614:779

    def __init__(self, data, transform = resnet_transform):
        """Load and preprocess every image listed in ``data``.

        Args:
            data: sequence of image file paths laid out as
                ``<root>/<class wnid>/images/<file>``; the label is derived
                from the ``<class wnid>`` directory name.
            transform: callable applied to each PIL image. Defaults to
                ``resnet_transform``; ``None`` also falls back to it.

        Raises:
            KeyError: if a path's class directory is not a key of
                ``CLASS_TO_INDEX``.
        """
        # Previously the argument was stored but the global transform was always
        # applied; honour the caller's choice and keep the old result for None.
        if transform is None:
            transform = resnet_transform
        self.transform = transform

        data_and_labels = []
        for file_path in data:
            # The class directory is the grandparent of the image file. Using
            # pathlib instead of split('/')[1] keeps this working for other path
            # separators and for roots that are more than one component deep.
            class_wnid = Path(file_path).parent.parent.name
            label = self.CLASS_TO_INDEX[class_wnid]

            with Image.open(file_path) as img:
                # The normalisation step expects three channels, so convert every
                # non-RGB mode (grayscale, palette, RGBA, CMYK, ...), not just 'L'.
                if img.mode != 'RGB':
                    img = img.convert('RGB')

                img_tensor = transform(img)

            data_and_labels.append((img_tensor, label))

        self.data = data_and_labels

    def __len__(self):
        """Return the number of cached samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(image_tensor, label)`` pair stored at ``idx``."""
        sample, target = self.data[idx]
        return sample, target

In [9]:
# Build the dataset; the constructor opens and preprocesses every listed image
# immediately, so this cell does all of the image loading work.
data = FineTuningDataset(jpeg_files)

In [10]:
# Hold out 20% of the samples for evaluation, with a fixed seed for the shuffle.
# NOTE(review): no `stratify` argument is passed, so class proportions may
# differ between the two parts.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

In [11]:
# Training configuration shared by the cells below.
criterion = nn.CrossEntropyLoss()
# Only the parameters of the final `fc` layer are handed to the optimizer.
# NOTE(review): the optimizer holds the parameters of the `fc` object that exists
# when this cell runs; re-run this cell if `resnet.fc` is replaced afterwards.
optimizer = optim.Adam(resnet.fc.parameters(), lr=0.001)
num_epochs = 10
batch_size = 128

In [12]:
# Batch the two splits: training batches are reshuffled each epoch, evaluation
# batches keep a fixed order.
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

In [13]:
def fine_tune(model=None, train_loader=None, loss_func=None, model_name=None):
    """Train ``model`` on ``train_loader`` and checkpoint the best-scoring weights.

    Every 5 steps the model is evaluated with ``test`` on the notebook-level
    ``test_loader``; whenever that accuracy beats the best seen so far, the
    model's ``state_dict`` is written to ``f'{model_name}.pt'``.

    Relies on the notebook-level globals ``device``, ``optimizer``,
    ``num_epochs`` and ``test_loader``, and on the ``test`` helper.

    Args:
        model: the network to train; it is moved to ``device``.
        train_loader: iterable yielding ``(images, labels)`` batches.
        loss_func: callable ``loss_func(outputs, labels)`` returning a scalar
            tensor that supports ``backward()``.
        model_name: file stem for the checkpoint. When ``None`` no checkpoint
            is written (previously this produced a file literally named
            ``None.pt``).
    """
    model = model.to(device)
    highest_acc = 0

    for epoch in range(1, num_epochs+1):
        train_correct = 0
        train_total = 0
        running_loss = 0.0
        for step, (images, labels) in enumerate(train_loader):
            # test() below switches the model to eval mode, so training mode has
            # to be re-enabled at the start of every step.
            model.train()

            images = images.to(device)  # move the batch to the compute device
            labels = labels.to(device)

            optimizer.zero_grad()  # clear gradients left from the previous step

            outputs = model(images)
            train_loss = loss_func(outputs, labels)
            train_loss.backward()

            # Metrics are kept as plain Python numbers. Accumulating the loss
            # tensor itself would keep autograd history alive across steps and
            # make the printed values tensors rather than numbers.
            _, predicted = torch.max(outputs.detach(), 1)
            running_loss += train_loss.item()
            train_total += labels.size(0)
            train_correct += (predicted == labels).sum().item()
            train_accuracy = 100 * (train_correct/train_total)

            if step % 5 == 0:
                # Evaluation happens before optimizer.step(), i.e. on the weights
                # that produced this step's training loss.
                test_acc, test_loss = test(model=model, test_loader=test_loader, loss_func=loss_func)
                print(f"Epoch: {epoch}, Step: {step}")
                print(f"\tTrain Loss: {running_loss}, Train Accuracy: {train_accuracy}")
                print(f"\tTest Loss: {test_loss}, Test Accuracy: {test_acc}")
                print()
                if test_acc > highest_acc:
                    highest_acc = test_acc
                    if model_name is not None:
                        torch.save(model.state_dict(), f'{model_name}.pt')

            optimizer.step()

In [14]:
def test(model=None, test_loader=None, loss_func=None):
    """Evaluate ``model`` on ``test_loader`` without tracking gradients.

    The model is moved to the notebook-level ``device`` and left in eval mode
    when this function returns.

    Args:
        model: the network to evaluate.
        test_loader: iterable yielding ``(images, labels)`` batches.
        loss_func: callable ``loss_func(outputs, labels)`` returning the batch loss.

    Returns:
        ``(accuracy, total_loss)``. ``accuracy`` is a float percentage in
        [0, 100]. ``total_loss`` is the sum of the per-batch values returned by
        ``loss_func``, in whatever type it returns; if the loader yields no
        batches the result is ``(0.0, 0)`` instead of a ZeroDivisionError.
    """
    total = 0
    correct = 0
    total_loss = 0

    model = model.to(device)
    model.eval()
    with torch.no_grad():
        for images, labels in test_loader:
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)

            # Index of the highest score per sample. No `.data` needed: nothing
            # is tracked by autograd inside no_grad().
            _, predicted = torch.max(outputs, 1)

            total_loss += loss_func(outputs, labels)

            # Count samples and correct predictions as Python ints so the final
            # ratio is computed in double precision.
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    if total == 0:
        # Nothing was evaluated; report zero accuracy rather than dividing by zero.
        return 0.0, total_loss

    accuracy = 100 * (correct/total)
    return accuracy, total_loss

In [15]:
# fine_tune(model=resnet, train_loader=train_loader, loss_func=criterion)

In [16]:
# Baseline run: time one evaluation pass over the test set.
start_time = time.perf_counter()

accuracy, test_loss = test(model=resnet, test_loader=test_loader, loss_func=criterion)

end_time = time.perf_counter()
total_time = end_time - start_time
# Split the elapsed seconds into whole minutes and the remaining seconds.
minutes, seconds = divmod(total_time, 60)

# Seconds are printed without zero padding (e.g. 0:5 means five seconds).
print("Accuracy without compression: {:.2f}%\tTime spent: {}:{}".format(accuracy, int(minutes), int(seconds)))

Accuracy without compression: 10.20%	Time spent: 0:10


In [19]:
# Show the number of bytes of GPU memory PyTorch currently has allocated for tensors.
torch.cuda.memory_allocated()

94362112

In [20]:
# prepare your data in numpy array format
# Base directory under which ExternalDependencies/SZ3 is expected.
# NOTE(review): absolute, user-specific path -- adjust it for your machine.
HOME="/home/jts75596/yuan_projects/feature_maps"

# init SZ (both SZ2 and SZ3 are supported)
# Please change the path to the SZ dynamic library file in your system
# sys.platform reports Windows as "win32" (never "windows"), so key on that
# value; platforms not listed here fall back to the Linux shared-object name.
lib_extention = {
    "darwin": "libSZ3c.dylib",
    "win32": "SZ3c.dll",
}.get(sys.platform, "libSZ3c.so")

sz = SZ("{}/ExternalDependencies/SZ3/install/lib64/{}".format(HOME,lib_extention))

## Register a forward hook to catch the intermediate results, compress them, then decompress them, and see how the model performs on the test set

In [21]:
def print_memory_usage(stage):
    """Print the CUDA memory PyTorch currently has allocated, labelled with ``stage``.

    Args:
        stage: label placed at the start of the printed line.
    """
    bytes_per_mib = 1024 ** 2
    allocated_mib = torch.cuda.memory_allocated() / bytes_per_mib
    print(f"{stage} - Memory allocated: {allocated_mib:.2f} MB")

In [None]:
# Move the model to the selected device and put it in inference mode.
resnet.to(device)
resnet.eval()

In [20]:
# def SZ3_compress_output(module, input, output):
#     output = output.cpu().numpy()
#     data_cmpr, cmpr_ratio = sz.compress(output, 1, 0, 0.001, 0)
#     data_dec = sz.decompress(data_cmpr, output.shape, output.dtype)
#     output_dec = torch.from_numpy(data_dec).to(device)
#     return output_dec
    
# hook = resnet.maxpool.register_forward_hook(SZ3_compress_output)

# start_time = time.perf_counter()
# accuracy, test_loss = test(model=resnet, test_loader=test_loader, loss_func=criterion)
# end_time = time.perf_counter()
# total_time = end_time - start_time
# minutes, seconds = divmod(total_time, 60)

# print("Accuracy with SZ3: {:.2f}%\tTime spent: {}:{}".format(accuracy, int(minutes), int(seconds)))
# hook.remove()

Accuracy with SZ3: 92.20%	Time spent: 0:30


### Layer 1 output takes the longest because intermediate results are largest

### Rate=8 achieves same quality, if not better, in less time than tolerance=1e-3

### is tolerance=8 supposed to work so well???

In [23]:
def compress_intermediate_results(mode="train", layer=0, tolerance=1e-3, model_name=None):
    """Run training or evaluation with one intermediate activation passed through zfp.

    A forward hook is attached to a single module of the notebook-level
    ``resnet``. The hook copies that module's output to the CPU, compresses it
    with ``zfpy`` at the given absolute ``tolerance``, decompresses it and
    feeds the reconstructed tensor to the rest of the network. The hook is
    always removed again, even if the run raises.

    Relies on the notebook-level globals ``resnet``, ``device``,
    ``train_loader``, ``test_loader`` and ``criterion`` and on the
    ``fine_tune`` / ``test`` helpers.

    Args:
        mode: ``"train"`` runs ``fine_tune``; any other value runs ``test``.
        layer: which output to compress -- 0 for ``resnet.maxpool``, 1-4 for
            the last block of ``resnet.layer1`` .. ``resnet.layer4``.
        tolerance: accuracy tolerance handed to ``zfpy.compress_numpy``.
        model_name: checkpoint file stem forwarded to ``fine_tune``.

    Raises:
        ValueError: if ``layer`` is not one of 0, 1, 2, 3, 4.
    """
    # Validate before doing any work. Previously an unknown layer attached no
    # hook, ran the whole job uncompressed and only then failed with a
    # NameError on `hook.remove()`.
    if layer not in (0, 1, 2, 3, 4):
        raise ValueError(f"layer must be one of 0, 1, 2, 3, 4; got {layer!r}")

    def zfpy_compress_output(module, input, output):
        # detach() is required while gradients are tracked (training) and is a
        # no-op under no_grad (testing), so one code path serves both modes.
        # The returned tensor is therefore cut off from the autograd graph: no
        # gradient flows to the modules that come before the hooked one.
        output = output.detach().cpu().numpy()
        compressed_data = zfpy.compress_numpy(output, tolerance=tolerance)
        # Fixed-rate alternative:
        #     compressed_data = zfpy.compress_numpy(output, rate=8)
        decompressed_array = zfpy.decompress_numpy(compressed_data)
        output_dec = torch.from_numpy(decompressed_array).to(device)
        return output_dec

    if layer == 0:
        hooked_module = resnet.maxpool
    elif layer == 1:
        hooked_module = resnet.layer1[-1]
    elif layer == 2:
        hooked_module = resnet.layer2[-1]
    elif layer == 3:
        hooked_module = resnet.layer3[-1]
    else:
        hooked_module = resnet.layer4[-1]

    hook = hooked_module.register_forward_hook(zfpy_compress_output)

    start_time = time.perf_counter()
    try:
        if mode == "train":
            fine_tune(model=resnet, train_loader=train_loader, loss_func=criterion, model_name=model_name)
        else:
            accuracy, test_loss = test(model=resnet, test_loader=test_loader, loss_func=criterion)
        end_time = time.perf_counter()
    finally:
        # Never leave the hook on the shared model: a leaked hook would keep
        # compressing activations in every later run.
        hook.remove()

    total_time = end_time - start_time
    minutes, seconds = divmod(total_time, 60)

    if mode == "train":
        print()
        print("-------------------")
        print("Finished training!!")
        print("Layer: {}, Tolerance: {}, Total training time: {}:{}".format(layer, tolerance, int(minutes), int(seconds)))
        print("-------------------")
    else:
        print("Accuracy with zfpy: {:.2f}%\tTime spent: {}:{}".format(accuracy, int(minutes), int(seconds)))

Accuracy with zfpy: 10.20%	Time spent: 0:20


#### This block below will kill the kernel with 16 GB of memory, likely due to memory overflow issues

## Explore ways of compressing and decompressing without moving elements to the CPU and back. We need a way to compress and decompress tensors while they stay on the GPU. Look at the CuPy library for this...

###### Might still have to move it to cpu to use CuPy though...