In [1]:
import matplotlib.pyplot as plt
import numpy as np
import copy
import torch
from torch import optim
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cdist


import infrastructure as inf

device = inf.device




Using device mps


# Reading in the pretrained models

In [2]:
from Custom_ResNet18 import custom_resnet_18

models = {}

max_superclass = 3

for i in range(0,max_superclass):
    for j in range(i+1,max_superclass):
        path = "results_training_run1/models/model_{}_{}.pt".format(i,j)
        models[(i,j)] = custom_resnet_18(num_classes=10).to(device)
        models[(i,j)].load_state_dict(torch.load(path, map_location=device))
        models[(i,j)].eval()




In [3]:
print(models[(0,1)])

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock2(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock2(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)


# Clustering to Initialization Algorithms

## Sorting into buckets

We start by taking the 64 filters of the first model as the basis for our clustering. We then sort the filters for each model into the buckets based on their cosine similarity, but in such a way that in each round only 1 filters is added to a bucket. We then take the avg filter from each bucket and use it as the basis for the next layer of clustering. We repeat this process until we have 64 clusters.

In [None]:
def calculate_cosine_similarity(cluster_group, vec):
    total_distance = 0

    for cluster_vec in cluster_group:
        
        # Reshape vectors to 2D arrays as required by cosine_similarity function
        vec1 = np.array(cluster_vec).reshape(1, -1)
        vec2 = np.array(vec).reshape(1, -1)

        # Calculate cosine similarity and convert to cosine distance
        similarity = cosine_similarity(vec1, vec2)
        distance = 1 - similarity

        # Since cosine_similarity returns a 2D array, we take the first element
        total_distance += distance[0][0]

    return total_distance

In [None]:
def get_custom_clustering_groups(filters_per_model, num_filters_per_model):
    filter_vectors = []
    
    num_models = len(filters_per_model)
    
    for idx, filters in enumerate(filters_per_model):
        
        filter_vectors.append([])

        for i in range(filters.shape[0]):
            # Flatten and normalize the filter
            filter_vec = filters[i].flatten()
            filter_vec /= np.linalg.norm(filter_vec)
            filter_vectors[idx].append(filter_vec)
            
    new_groups = [[] for _ in range(num_filters_per_model)]
    permutations = [[] for _ in range(num_models)]
    
    # Now we go through all models and assign filters in a greedy manner
    for i in range(num_models):
        model_filters = filter_vectors[i]
        used_indices = set()
        
        # We go through every group of the new clusters and assign the min distance vector from the next model to it
        for cluster_idx, cluster_group in enumerate(new_groups):
            min_distance = float('inf')
            selected_vector_idx = None
            
            for idx, vec in enumerate(model_filters):
                dist = calculate_cosine_similarity(cluster_group, vec)
                if dist < min_distance and idx not in used_indices:
                    min_distance = dist
                    selected_vector_idx = idx
            
            new_groups[cluster_idx].append(model_filters[selected_vector_idx])
            used_indices.add(selected_vector_idx)
            permutations[i].append(selected_vector_idx)

    return new_groups, permutations

In [None]:
def custom_clustering(filters, num_input_channels, num_filters, kernel_size, include_permutations=False):
    
    clustered_groups, permutations = get_custom_clustering_groups(filters, num_filters)
    
    filter_vectors = np.array(clustered_groups)
    average_filters = [np.mean(np.array(vectors), axis=0) for vectors in filter_vectors]
    
    shaped_average_filters = np.array(average_filters).reshape((num_filters, num_input_channels, kernel_size, kernel_size))
    torch_shaped_average_filters = torch.tensor(shaped_average_filters, dtype=torch.float32).to(device)
    
    if include_permutations:
        return torch_shaped_average_filters, permutations
    else:
        return torch_shaped_average_filters

## K-Means

In [None]:
def kmeans_clustering(filters, num_input_channels, num_filters, kernel_size):
    
    filter_vectors = []
        
    num_models = len(filters)
    
    for idx, model_filters in enumerate(filters):
        
        filter_vectors.append([])

        for i in range(model_filters.shape[0]):
            # Flatten and normalize the filter
            filter_vec = model_filters[i].flatten()
            filter_vec /= np.linalg.norm(filter_vec)
            filter_vectors[idx].append(filter_vec)
    
    filter_vectors = np.vstack(filter_vectors)
    kmeans = KMeans(n_clusters=num_filters, n_init='auto')
    cluster_labels = kmeans.fit_predict(filter_vectors)

    average_filters = []

    for cluster_num in range(num_filters):
        
        # Find indices where the cluster label matches the current cluster number
        indices = np.where(cluster_labels == cluster_num)[0]

        # Filters belonging to the current cluster
        cluster_filters = filter_vectors[indices]

        # Compute the average filter for this cluster
        average_filter = np.mean(cluster_filters, axis=0)
        average_filters.append(average_filter)
        
    shaped_average_filters = np.array(average_filters).reshape((num_filters, num_input_channels, kernel_size, kernel_size))
    torch_shaped_average_filters = torch.tensor(shaped_average_filters, dtype=torch.float32).to(device)
    
    return torch_shaped_average_filters

## Permutation

In [None]:
def permute_bias(permuted_model_bias, perm):
    permuted_model_bias.weight = torch.nn.Parameter(permuted_model_bias.weight[perm])
    permuted_model_bias.bias = torch.nn.Parameter(permuted_model_bias.bias[perm])
    permuted_model_bias.running_mean = permuted_model_bias.running_mean[perm]
    permuted_model_bias.running_var = permuted_model_bias.running_var[perm]

def permute_layer(permuted_layer, permuted_layer_next, permuted_bias, perm = None):
    if not perm is None: 
        permuted_layer.weight = torch.nn.Parameter(permuted_layer.weight[perm])
        permute_bias(permuted_bias, perm)
        permuted_layer_next.weight = torch.nn.Parameter(permuted_layer_next.weight.transpose(0,1)[perm].transpose(0,1))
    

def permute_weights(model, permutation):
    
     with torch.no_grad():
        permuted_model = copy.deepcopy(model)
        permute_layer(permuted_model.conv1, permuted_model.layer1[0].conv1, permuted_model.bn1, permutation)
        return permuted_model


## Model Initialization

### Custom Clustering

In [None]:
def create_initialized_model_custom(models_dict):
    pretrained_models = [model for model in models_dict.values()] 
    final_model = custom_resnet_18(num_classes=10).to(device)

    #Apply first layer clustering techniques
    trained_filters_layer1 = [model.conv1.weight.data.cpu().numpy() for model in models]
    sampled_filters_layer1 = kmeans_clustering(filters=trained_filters_layer1,
                                               num_input_channels=3,
                                               num_filters=64,
                                               kernel_size=7)
    
    # Apply second layer clustering techniques
    trained_filters_layer2 = [model.layer1[0].conv1.weight.data.cpu().numpy() for model in models]
    sampled_filters_layer2 = kmeans_clustering(filters=trained_filters_layer2,
                                               num_input_channels=64,
                                               num_filters=64,
                                               kernel_size=3)
    
    # Apply third layer clustering techniques
    trained_filters_layer3 = [model.layer1[0].conv2.weight.data.cpu().numpy() for model in models]
    sampled_filters_layer3 = kmeans_clustering(filters=trained_filters_layer3,
                                               num_input_channels=64,
                                               num_filters=64,
                                               kernel_size=3)
    
    # Apply fourth layer clustering techniques
    trained_filters_layer4 = [model.layer1[1].conv1.weight.data.cpu().numpy() for model in models]
    sampled_filters_layer4 = kmeans_clustering(filters=trained_filters_layer4,
                                               num_input_channels=64,
                                               num_filters=64,
                                               kernel_size=3)
    
    # Apply fifth layer clustering techniques
    trained_filters_layer5 = [model.layer1[1].conv2.weight.data.cpu().numpy() for model in models]
    sampled_filters_layer5 = kmeans_clustering(filters=trained_filters_layer5,
                                               num_input_channels=64,
                                               num_filters=64,
                                               kernel_size=3)
    
    
    final_model.conv1.weight.data = sampled_filters_layer1
    final_model.layer1[0].conv1.weight.data = sampled_filters_layer2
    final_model.layer1[0].conv2.weight.data = sampled_filters_layer3
    final_model.layer1[1].conv1.weight.data = sampled_filters_layer4
    final_model.layer1[1].conv2.weight.data = sampled_filters_layer5
    
    
    return final_model

##  Evaluation Function

In [None]:
def compare_models(final_model, random_model, loaders, num_epochs):
    
    final_optimizer = torch.optim.Adam(final_model.parameters(), lr=0.01)
    random_optimizer = torch.optim.Adam(random_model.parameters(), lr=0.01)
    
    loss_func = torch.nn.CrossEntropyLoss()
    
    cluster_tracked_params = inf.train(final_model) # fill out the rest here. 
    random_tracked_params = inf.train(random_model)

    return cluster_tracked_params, random_tracked_params

# Experiments

## Custom Clustering

In [8]:
# might want to enable batch tracking later, now only the epochs are tracked
#loaders = inf.make_dataloaders(batch_size=128)[0]
loaders = "Loaders import requieres ffcv import"
custom_clustered_model = create_initialized_model_custom(models)
random_model = custom_resnet_18(num_classes=10).to(device)

compare_models(custom_clustered_model, random_model, loaders,5)

starting loop 1


3it [00:00, 2555.42it/s]


starting loop 2


100%|██████████| 3/3 [00:01<00:00,  1.71it/s]


done initializing models,start training


TypeError: train() missing 1 required positional argument: 'loaders'

## Clustering for RGB seperately

Each filters outputs are used for different filters in the next layer, these dependencies can be visualized as a graph. We start clustering on single 3d dimensional filters (for the first layer) and then having a prob distribution over the clusters for each filter in the next layer. We then use a pointer network to predict the cluster for each filter in the next layer.

In [None]:
import matplotlib.pyplot as plt

def clustering_kmeans(models, mode):
    # mode can be one of the following
    # "euclid" 
    # "cosine"
    # "fft" (not implemented yet)
    
    # go over all models and extract the first layer, and store it in a list
    first_layer_weights = []
    for model in models:
        first_layer_weights.append(models[model].conv1.weight.data.cpu().numpy())

    # flatten the list
    first_layer_weights = np.array(first_layer_weights)
    print(first_layer_weights.shape)
        
    # combine the first two dimensions of this np array
    first_layer_weights = first_layer_weights.reshape(-1, *first_layer_weights.shape[2:])
    print(first_layer_weights.shape)
    
    # flatten the list 
    filters_r = first_layer_weights[:,0,:,:]
    filters_r = filters_r.reshape(filters_r.shape[0],-1)
    filters_g = first_layer_weights[:,1,:,:]
    filters_g = filters_g.reshape(filters_g.shape[0],-1)
    filters_b = first_layer_weights[:,2,:,:]
    filters_b = filters_b.reshape(filters_b.shape[0],-1)

    print(filters_r.shape)

    # make this a list of values from 30 to 300 in steps of 10
    choices_num_clusters = list(range(1,100,2))

     # use elbow method to find the best number of clusters
    
    loss_list = {}

    for n_clusters in choices_num_clusters: 
        kmeans_r = KMeans(n_clusters=n_clusters, n_init='auto',)
        cluster_labels = kmeans_r.fit_predict(filters_r)
        loss_list[(0,n_clusters)] = kmeans_r.inertia_

        # repeat for green
        kmeans_g = KMeans(n_clusters=n_clusters, n_init='auto',)
        cluster_labels = kmeans_g.fit_predict(filters_g)
        loss_list[(1,n_clusters)] = kmeans_g.inertia_

        # repeat for blue
        kmeans_b = KMeans(n_clusters=n_clusters, n_init='auto',)
        cluster_labels = kmeans_b.fit_predict(filters_b)
        loss_list[(2,n_clusters)] = kmeans_b.inertia_


    print(loss_list)
    # make 1x3 plot to plot the loss for each color channel
    fig, axs = plt.subplots(1,3,figsize=(15,5))
    # plot the loss for red
    data_r = [loss_list[(0,n_clusters)] for n_clusters in choices_num_clusters]
    axs[0].plot(choices_num_clusters,data_r)
    # add y label
    axs[0].set_ylabel('Loss')
    # add x label
    axs[0].set_xlabel('Number of clusters')
    # add title
    axs[0].set_title('Loss vs number of clusters for red channel')

    # plot the loss for green
    data_g = [loss_list[(1,n_clusters)] for n_clusters in choices_num_clusters]
    axs[1].plot(choices_num_clusters,data_g)
    # add y label
    axs[1].set_ylabel('Loss')
    # add x label
    axs[1].set_xlabel('Number of clusters')
    # add title
    axs[1].set_title('Loss vs number of clusters for green channel')

    # plot the loss for blue
    data_b = [loss_list[(2,n_clusters)] for n_clusters in choices_num_clusters]
    axs[2].plot(choices_num_clusters,data_b)
    # add y label
    axs[2].set_ylabel('Loss')
    # add x label
    axs[2].set_xlabel('Number of clusters')
    # add title
    axs[2].set_title('Loss vs number of clusters for blue channel')

    plt.show()

    return


#clusters = clustering_kmeans(models,mode='euclid')