# Running the experiments

In [7]:
import random
import infrastructure as inf
import numpy as np
import torch.nn as nn

import torch 
from torchvision.models import resnet18
import torchvision

# Seed every random number generator this notebook draws from.
torch.manual_seed(42)
random.seed(1234)
# NumPy's global generator is used further down by np.random.shuffle and
# np.random.choice, and by scikit-learn's KMeans when no random_state is given.
# It was previously left unseeded, which made those steps differ between runs.
np.random.seed(42)

In [None]:
# Evaluation protocol: how many models are averaged per experiment and on how
# many class-pair subsets each of them is trained.
num_models_per_experiment = 5
num_subsets_to_test = 10

# Candidate evaluation subsets: every pair (i, j) with i < j over the class
# indices 10..19. The pretrained models were trained on the first 10 classes.
possible_tuples = [(i, j) for i in range(10, 20) for j in range(i + 1, 20)]

# Draw num_subsets_to_test distinct pairs and keep them in sorted order
sample_tuples = sorted(random.sample(possible_tuples, num_subsets_to_test))

print(sample_tuples)

In [None]:
# use exactly the same hyperparameters that were used during training
# These notebook-level names are read by eval_models (device, epochs, optimizer,
# lr, lr_reduce_patience) and by the data-loader cell (normalization).
device = inf.device
epochs = 15
optimizer = 'SGD'
lr = 0.01
lr_reduce_patience = 5
normalization = True
# Collects the (mean, min, max) result triple of every experiment, keyed by a
# human-readable label; it is what the plotting helper receives.
model_params_dict = {}

In [None]:
# load the data in the same way as during training
# One loader bundle per sampled class pair (i, j), in the order of sample_tuples.
# NOTE(review): 128 and 6 are presumably batch size and number of worker
# processes -- confirm against inf.get_loaders_cifar100_superclass_subsets_pytorch.
data_loaders = [inf.get_loaders_cifar100_superclass_subsets_pytorch(i,j,128,6,normalization) for (i,j) in sample_tuples]

In [None]:
def eval_models(models, data_loaders, save_path, save_final_params=True):
    """Train every model on every data subset and aggregate the tracked curves.

    Each (model, subset) combination is one independent training run that
    starts from the model's initial weights. Training uses the notebook-level
    hyperparameters (``lr``, ``epochs``, ``optimizer``, ``lr_reduce_patience``,
    ``device``).

    :param models: Iterable of initialised ``torch.nn.Module`` instances.
    :param data_loaders: Iterable of loader bundles, one per data subset, in
        the form expected by ``inf.train``.
    :param save_path: Path prefix; ``_mean.npy``, ``_min.npy`` and ``_max.npy``
        are appended when the results are written to disk.
    :param save_final_params: If True, store the aggregated results with
        ``np.save``.
    :return: Tuple ``(p_mean, p_min, p_max)`` as produced by
        ``inf.list_tracked_params_to_avg(..., also_min_max=True)``.
    """
    import copy
    import os

    tracked_params = []
    for model in models:
        model = model.to(device)
        for loaders in data_loaders:
            # Train a fresh copy per subset. Previously the model returned by one
            # training run was fed into the next one, so every subset after the
            # first started from weights already trained on other subsets
            # (assuming inf.train updates the model it is given -- the copy is
            # harmless if it does not).
            run_model = copy.deepcopy(model)
            _, params = inf.train(run_model, loaders, lr=lr, epochs=epochs, tracking_freq=1,
                                  early_stopping_min_epochs=100, device=device, optimizer=optimizer,
                                  momentum=0.9, reduce_patience=lr_reduce_patience)
            tracked_params.append(params)

    # get mean, min and max of the tracked params over all runs
    p_mean, p_min, p_max = inf.list_tracked_params_to_avg(tracked_params, also_min_max=True)

    if save_final_params:
        # np.save does not create missing directories
        out_dir = os.path.dirname(save_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        np.save(save_path + '_mean.npy', p_mean)
        np.save(save_path + "_min.npy", p_min)
        np.save(save_path + "_max.npy", p_max)

    return p_mean, p_min, p_max

## Testing Random Model

In [None]:
# Testing on num_models_per_experiment randomly initialized models
# Baseline: torchvision ResNet-18 without pretrained weights, with the final
# fully connected layer replaced by a 10-output layer.
random_custom_models = []
for i in range(num_models_per_experiment):
    model = resnet18(weights=None).to(device)
    model.fc = nn.Linear(512, 10).to(device)
    random_custom_models.append(model)
    
# train on all sampled subsets and store the aggregated curves under this prefix
path = "./experiment_results/tracked_params/random_model"
r_params = eval_models(random_custom_models, data_loaders, path, True)


# register the result so that later plots can include it
model_params_dict['random initialized'] = r_params

plot_path = "./experiment_results/random_init"
inf.plot_trainings_mean_min_max(model_params_dict, display_train_acc=True, display_only_mean=True, save=True, save_path=plot_path, display=True)

## Testing Pretrained Model

In [None]:
# first we generate all possible tuples of classes that we used to train on
possible_tuples_pretrained = []
for i in range(10):
    for j in range(i+1,10):
        possible_tuples_pretrained.append((i,j))
        
# randomly sample num_models_per_experiment from the list of possible trained tuples
tuples_to_load = random.sample(possible_tuples_pretrained, num_models_per_experiment)
print(tuples_to_load)

pre_trained_models_subset = []

# note that the pretrained models are not contained in the GitHub repository due to their size
# to run this experiment, train the models with the same hyperparameters as used in the paper and save them in the folder ./results_training_run2_Adams/models_sgd
for (i,j) in tuples_to_load:
    model = resnet18(weights=None).to(device)
    model.fc = nn.Linear(512,10).to(device)
    # map_location moves the stored tensors onto the current device, so that
    # checkpoints written on a GPU can also be loaded on a CPU-only machine
    model.load_state_dict(torch.load(f"./results_training_run2_Adams/models_sgd/model_{i}_{j}.pt", map_location=device))

    # we additionally tested with the last layer being randomly initialized, but this did not change the results
    # model.fc = nn.Linear(512,10).to(device)
    
    pre_trained_models_subset.append(model)


# train on all sampled subsets and store the aggregated curves under this prefix
path = "./experiment_results/tracked_params/pre_trained_model"
pre_t_params = eval_models(pre_trained_models_subset, data_loaders, path, True)

model_params_dict['pre trained'] = pre_t_params

plot_path = "./experiment_results/pretrained_init"
inf.plot_trainings_mean_min_max(model_params_dict, display_train_acc=False, display_only_mean=True, save=True, save_path=plot_path, display=True)

## Testing initialization with Gabor Filters

In [None]:
def generate_gabor_filter(size, sigma, theta, Lambda, psi, gamma):
    """
    Generates a Gabor filter with given parameters.

    The filter is sampled on a grid that is symmetric around the origin, so the
    Gaussian envelope peaks at the centre of the returned array for odd sizes.

    :param size: Size of the filter (size x size).
    :param sigma: Standard deviation of the Gaussian envelope.
    :param theta: Orientation of the Gabor filter.
    :param Lambda: Wavelength of the sinusoidal factor.
    :param psi: Phase offset.
    :param gamma: Spatial aspect ratio.
    :return: Gabor filter as a 2D array.
    """
    sigma_x = sigma
    sigma_y = sigma / gamma

    # Prepare grid in x and y.
    # `-size // 2` parses as `(-size) // 2` (e.g. -4 for size 7), which shifted
    # the whole grid off-centre; compute the half-width first and negate it.
    half = size // 2
    x = np.linspace(-half, half, size)
    y = np.linspace(-half, half, size)
    x, y = np.meshgrid(x, y)

    # Rotation
    x_theta = x * np.cos(theta) + y * np.sin(theta)
    y_theta = -x * np.sin(theta) + y * np.cos(theta)

    # Gaussian envelope multiplied by a cosine carrier along the rotated x axis
    envelope = np.exp(-.5 * (x_theta ** 2 / sigma_x ** 2 + y_theta ** 2 / sigma_y ** 2))
    carrier = np.cos(2 * np.pi / Lambda * x_theta + psi)
    return envelope * carrier

def init_gabor_filters(module, random_seed):
    """
    Overwrites the weights of a convolutional layer with random Gabor filters.

    One set of Gabor parameters is drawn per output channel and the resulting
    2D filter is used for every input channel of that output channel. Modules
    that are not ``nn.Conv2d`` are left untouched.

    :param module: Module to initialize.
    :param random_seed: Seed used for the first output channel; it is
        incremented by one for every further output channel.
    :return: The next unused seed (``random_seed`` plus the number of output
        channels that were initialized).
    """
    # Check if the module is a convolutional layer
    if isinstance(module, nn.Conv2d):
        kernel_size = module.kernel_size
        # only the first entry is used, i.e. square kernels are assumed
        filter_size = kernel_size[0] if isinstance(kernel_size, tuple) else kernel_size

        for i in range(module.out_channels):
            # NOTE: this reseeds NumPy's global generator, so the parameters of
            # each output channel depend only on its seed -- and any later
            # np.random call in the notebook continues from this seed.
            np.random.seed(random_seed)
            random_seed += 1

            sigma = np.random.uniform(1.5, 2.5)
            theta = np.random.uniform(0, np.pi)
            Lambda = np.random.uniform(2, 13)
            psi = np.random.uniform(0, 2*np.pi)
            gamma = np.random.uniform(0.9, 1.1)

            # The filter does not depend on the input channel: build it once and
            # let the assignment broadcast it over all input channels.
            gabor_filter = generate_gabor_filter(filter_size, sigma, theta, Lambda, psi, gamma)
            module.weight.data[i] = torch.from_numpy(gabor_filter)

    return random_seed

def initialize_model_gabor(num_layers, random_seed=0):
    """
    Creates a ResNet-18 (10 output classes) whose first convolutional layers
    are initialized with Gabor filters.

    Modules are visited in ``named_modules()`` order; only ``nn.Conv2d`` modules
    whose name contains ``'conv'`` are initialized.

    :param num_layers: Maximum number of convolutional layers to initialize.
    :param random_seed: Seed for the first Gabor filter. With the default,
        every call yields the same Gabor filters; pass different values to get
        different Gabor initializations.
    :return: The initialized model on ``device``.
    """
    model = resnet18(num_classes=10).to(device)

    num_initialized = 0
    for name, module in model.named_modules():
        if num_initialized >= num_layers:
            break
        # count a layer only if it really is a convolution that gets initialized
        if 'conv' in name and isinstance(module, nn.Conv2d):
            random_seed = init_gabor_filters(module, random_seed)
            num_initialized += 1

    return model

In [None]:
# vary this parameter to test different numbers of layers initialized with Gabor filters
num_layers = 17

# Testing on num_models_per_experiment Gabor-initialized models
# NOTE(review): initialize_model_gabor always starts from random seed 0, so the
# Gabor-initialized layers are the same in every model of this list.
gabor_models = []
for i in range(num_models_per_experiment):
    model = initialize_model_gabor(num_layers).to(device)
    gabor_models.append(model)
    
# train on all sampled subsets and store the aggregated curves under this prefix
path = f"./experiment_results/tracked_params/gabor/gabor_model_num_layers_{num_layers}"
gabor_params = eval_models(gabor_models, data_loaders, path, True)

model_params_dict[f'gabor initialized {num_layers} layers'] = gabor_params

plot_path = f"./experiment_results/gabor_init_{num_layers}_layers"
inf.plot_trainings_mean_min_max(model_params_dict, display_train_acc=True, display_only_mean=True, save=True, save_path=plot_path, display=True)

# Setup for fine filter clustering (Euclid & Fourier)

In this approach each 3D filter is split along its input channels into 2D filters. These 2D filters are then clustered using KMeans for every layer separately, completely ignoring which input channel they came from.

## Euclid

In [None]:
# first load all the models to memory that were pretrained

from sklearn.cluster import KMeans
pre_trained_models = []

for i in range(10):
    for j in range(i+1,10):
        # Not every class pair necessarily has a checkpoint: skip the pairs whose
        # file is missing. Any other failure (corrupt file, mismatching state
        # dict, ...) is raised instead of being silently ignored.
        try:
            model = resnet18(weights=None).to(device)
            model.fc = nn.Linear(512,10).to(device)
            # map_location moves the stored tensors onto the current device
            model.load_state_dict(torch.load(f"./results_training_run2_Adams/models_sgd/model_{i}_{j}.pt", map_location=device))
            pre_trained_models.append(model)
        except FileNotFoundError:
            continue
# shuffle the models 
np.random.shuffle(pre_trained_models)

print("Loaded",len(pre_trained_models),"pre trained models")

def models_to_filter_per_layer(models):
    """
    Collects the convolution weights of several ResNet-18 style models, grouped by layer.

    Layer "0" is the stem convolution ``conv1``; layers "1" to "16" are ``conv1``
    and ``conv2`` of blocks 0 and 1 of ``layer1`` .. ``layer4``, in that order.

    :param models: Sequence of models exposing ``conv1`` and ``layer1``..``layer4``.
    :return: Dict mapping the layer index (as string, "0".."16") to an array that
        stacks the weights of that layer over all models along the first axis.
    """
    print("Using",len(models),"models to create filters")

    def conv_layers(net):
        # yields the 17 convolutions in the order of the layer keys
        yield net.conv1
        for stage in (net.layer1, net.layer2, net.layer3, net.layer4):
            for block_idx in (0, 1):
                yield stage[block_idx].conv1
                yield stage[block_idx].conv2

    # weights_by_model[m][k] is the weight array of layer k in model m
    weights_by_model = [
        [conv.weight.data.cpu().numpy() for conv in conv_layers(net)]
        for net in models
    ]

    return {
        str(layer_idx): np.array([weights[layer_idx] for weights in weights_by_model])
        for layer_idx in range(17)
    }


def clustering_for_single_layer(filters_per_layer,layer_key, num_clusters):
    """
    Clusters all 2D filters of one layer with KMeans.

    The entry ``filters_per_layer[layer_key]`` is only read; it is no longer
    replaced by its flattened version as a side effect.

    :param filters_per_layer: Dict mapping layer keys to arrays of filters whose
        axes from index 3 onwards are the spatial axes of a single 2D filter.
    :param layer_key: Key of the layer to cluster.
    :param num_clusters: Number of KMeans clusters.
    :return: Tuple ``(cluster_labels, cluster_centers)``: one label per 2D filter
        and one flattened centroid per cluster.
    """
    layer_filters = np.asarray(filters_per_layer[layer_key])

    # one row per 2D filter, each filter flattened to a 1D vector
    elements_per_filter = int(np.prod(layer_filters.shape[3:]))
    flat_filters = layer_filters.reshape(-1, elements_per_filter)

    # now it has the form #filters x #elements in filter
    kmeans_filters = KMeans(n_clusters=num_clusters, n_init='auto')
    cluster_labels = kmeans_filters.fit_predict(flat_filters)

    return cluster_labels, kmeans_filters.cluster_centers_

def clustering_single_kmeans(models,device, num_clusters=30, num_layers=17):
    """
    Builds a ResNet-18 whose first ``num_layers`` convolutions are filled with
    KMeans centroids of the 2D filters of the given models.

    For every layer the 2D filters of all models are clustered; every 2D filter
    of the new model is a centroid sampled with probability proportional to the
    size of its cluster. Layers beyond ``num_layers`` keep their default
    initialization.

    :param models: Pretrained models the filters are taken from.
    :param device: Device the new model and its weights are placed on.
    :param num_clusters: Number of KMeans clusters per layer.
    :param num_layers: Number of convolutional layers to overwrite (1 to 17).
    :return: The newly initialized model.
    """
    # num layers need to be at least 1 and at most 17
    assert num_layers >= 1 and num_layers <= 17

    # get dict of filters for every layer
    filters_per_layer = models_to_filter_per_layer(models)
    # only the layers that are written into the model need to be clustered
    layer_keys = [str(layer_idx) for layer_idx in range(num_layers)]

    new_filters_per_layer = {}
    for layer_key in layer_keys:
        orig_shape = filters_per_layer[layer_key].shape
        labels, centers = clustering_for_single_layer(filters_per_layer, layer_key, num_clusters)

        # relative cluster sizes, used as sampling probabilities
        pdf = np.bincount(labels, minlength=num_clusters) / labels.shape[0]

        # sample one cluster index per (output channel, input channel) pair and
        # look up the corresponding centroids in a single indexing step
        sampled = np.random.choice(num_clusters, size=orig_shape[1:3], p=pdf)
        new_filters_per_layer[layer_key] = centers[sampled].reshape(orig_shape[1:])

    model = resnet18(num_classes=10).to(device)

    # convolutions in the same order as the layer keys "0", "1", ...
    target_convs = [model.conv1]
    for stage in (model.layer1, model.layer2, model.layer3, model.layer4):
        for block_idx in (0, 1):
            target_convs.append(stage[block_idx].conv1)
            target_convs.append(stage[block_idx].conv2)

    # zip stops after the first num_layers convolutions
    for layer_key, conv in zip(layer_keys, target_convs):
        conv.weight.data = torch.tensor(new_filters_per_layer[layer_key], dtype=torch.float32).to(device)

    return model


## Fourier

In [None]:
from sklearn.cluster import KMeans
def dft(weights):
    """
    Applies a 2D discrete Fourier transform to every entry along the first axis.

    NOTE: a later cell of this notebook defines another function named ``dft``;
    whichever cell ran last determines which one is in scope.

    :param weights: Array whose entries along the first axis are transformed
        one by one with ``np.fft.fft2``.
    :return: Tuple ``(real part, imaginary part)`` of the stacked transforms.
    """
    spectra = np.array([np.fft.fft2(weights[k]) for k in range(weights.shape[0])])
    return np.real(spectra), np.imag(spectra)

def inverse_dft(cluster_results):
    """
    Applies the inverse 2D discrete Fourier transform to every entry along the first axis.

    :param cluster_results: Array whose entries along the first axis are
        transformed one by one with ``np.fft.ifft2``.
    :return: Tuple ``(real part, imaginary part)`` of the stacked results.
    """
    restored = np.array([np.fft.ifft2(cluster_results[k]) for k in range(cluster_results.shape[0])])
    return np.real(restored), np.imag(restored)

def clustering_fourier_single_layer(filters, num_clusters):
    """
    Clusters the 2D filters of one layer in the Fourier domain.

    Only the real part of each filter's 2D DFT is clustered (the imaginary part
    is discarded); the centroids are transformed back and their real part is
    returned.

    :param filters: 5D array; the first three axes enumerate the 2D filters, the
        last two are the spatial axes.
    :param num_clusters: Number of KMeans clusters.
    :return: Tuple ``(cluster_labels, final_weights)``: one label per 2D filter
        and one 2D spatial-domain filter per cluster.
    """
    shape = filters.shape
    # one 2D filter per entry along the first axis
    filters_2d = np.reshape(filters, (shape[0]*shape[1]*shape[2], shape[3], shape[4]))

    real_spectra, _ = dft(filters_2d)
    num_filters, height, width = real_spectra.shape[0], real_spectra.shape[1], real_spectra.shape[2]
    flat_spectra = np.reshape(real_spectra, (num_filters, height*width))

    kmeans = KMeans(n_clusters = num_clusters, n_init='auto')
    cluster_labels = kmeans.fit_predict(flat_spectra)

    # bring the centroids back into 2D form before inverting the transform
    centroids = kmeans.cluster_centers_
    centroid_spectra = np.reshape(centroids, (centroids.shape[0], height, width))
    final_weights, _ = inverse_dft(centroid_spectra)
    return cluster_labels, final_weights

def clustering_single_fourier(models,device, num_clusters=30, num_layers=17):
    """
    Builds a ResNet-18 whose first ``num_layers`` convolutions are filled with
    centroids obtained by clustering the 2D filters of the given models in the
    Fourier domain.

    For every layer the 2D filters of all models are clustered; every 2D filter
    of the new model is a centroid sampled with probability proportional to the
    size of its cluster. Layers beyond ``num_layers`` keep their default
    initialization.

    :param models: Pretrained models the filters are taken from.
    :param device: Device the new model and its weights are placed on.
    :param num_clusters: Number of KMeans clusters per layer.
    :param num_layers: Number of convolutional layers to overwrite (1 to 17).
    :return: The newly initialized model.
    """
    # num layers need to be at least 1 and at most 17
    assert num_layers >= 1 and num_layers <= 17

    # get dict of filters for every layer
    filters_per_layer = models_to_filter_per_layer(models)
    # only the layers that are written into the model need to be clustered
    layer_keys = [str(layer_idx) for layer_idx in range(num_layers)]

    new_filters_per_layer = {}
    for layer_key in layer_keys:
        orig_shape = filters_per_layer[layer_key].shape
        labels, centers = clustering_fourier_single_layer(filters_per_layer[layer_key], num_clusters)

        # relative cluster sizes, used as sampling probabilities
        pdf = np.bincount(labels, minlength=num_clusters) / labels.shape[0]

        # sample one cluster index per (output channel, input channel) pair and
        # look up the corresponding centroids in a single indexing step
        sampled = np.random.choice(num_clusters, size=orig_shape[1:3], p=pdf)
        new_filters_per_layer[layer_key] = centers[sampled].reshape(orig_shape[1:])

    model = resnet18(num_classes=10).to(device)

    # convolutions in the same order as the layer keys "0", "1", ...
    target_convs = [model.conv1]
    for stage in (model.layer1, model.layer2, model.layer3, model.layer4):
        for block_idx in (0, 1):
            target_convs.append(stage[block_idx].conv1)
            target_convs.append(stage[block_idx].conv2)

    # zip stops after the first num_layers convolutions
    for layer_key, conv in zip(layer_keys, target_convs):
        conv.weight.data = torch.tensor(new_filters_per_layer[layer_key], dtype=torch.float32).to(device)

    return model

## Test the number of pretrained models used for clustering

For this experiment the number of clusters is set to 10 and the number of layers is set to all 17.

In [None]:
# numbers of pretrained models whose filters are pooled for the clustering
choices_num_models_used = [2,4,6,10,20]

# results of this experiment only (model_params_dict collects all experiments)
num_models_dict = {}

for num_models_used in choices_num_models_used:
    # create a subset of the pre trained models
    # NOTE(review): slicing silently yields fewer models when fewer than
    # num_models_used checkpoints were loaded.
    subset_models = pre_trained_models[:num_models_used]
    
    # create num_models_per_experiment models with the clustered filters, per method
    clustered_models_euclid = [clustering_single_kmeans(subset_models, device, num_clusters=10, num_layers=17) for i in range(num_models_per_experiment)]
    clustered_models_fourier = [clustering_single_fourier(subset_models, device, num_clusters=10, num_layers=17) for i in range(num_models_per_experiment)]
    
    # evaluate the model
    path = f"./experiment_results/tracked_params/euclid/num_models/euclid_model_num_models_{num_models_used}"
    euclid_params = eval_models(clustered_models_euclid, data_loaders, path, True)
    path = f"./experiment_results/tracked_params/fourier/num_models/fourier_model_num_models_{num_models_used}"
    fourier_params = eval_models(clustered_models_fourier, data_loaders, path, True)
    
    model_params_dict[f'euclid #models{num_models_used}'] = euclid_params
    model_params_dict[f'fourier #models{num_models_used}'] = fourier_params
    
    num_models_dict[f'euclid #models{num_models_used}'] = euclid_params
    num_models_dict[f'fourier #models{num_models_used}'] = fourier_params

# plot only the runs of this experiment
plot_path = "./experiment_results/clustered_num_models"
inf.plot_trainings_mean_min_max(num_models_dict,display_train_acc=False,display_only_mean=True,save=True,save_path=plot_path,display=True)

## Test the number of clusters used for clustering

For this experiment the number of pretrained models used is set to 10 and the number of layers is set to all 17.

In [None]:
# numbers of KMeans clusters to compare
choices_num_clusters = [1,2,3,10,30,50] 

# results of this experiment only (model_params_dict collects all experiments)
num_clusters_dict = {}

for num_clusters in choices_num_clusters:
    # create a subset of the pre trained models
    subset_models = pre_trained_models[:10]
    
    # create num_models_per_experiment models with the clustered filters, per method
    euclid_models = [clustering_single_kmeans(subset_models, device, num_clusters=num_clusters, num_layers=17) for i in range(num_models_per_experiment)]
    fourier_models = [clustering_single_fourier(subset_models, device, num_clusters=num_clusters, num_layers=17) for i in range(num_models_per_experiment)]

    # evaluate the model
    path = f"./experiment_results/tracked_params/euclid/num_clusters/euclid_model_num_clusters_{num_clusters}"
    euclid_params = eval_models(euclid_models, data_loaders, path, True)
    path = f"./experiment_results/tracked_params/fourier/num_clusters/fourier_model_num_clusters_{num_clusters}"
    fourier_params = eval_models(fourier_models, data_loaders, path, True)
    
    
    model_params_dict[f'euclid #clusters{num_clusters}'] = euclid_params
    model_params_dict[f'fourier #clusters{num_clusters}'] = fourier_params
    
    num_clusters_dict[f'euclid #clusters{num_clusters}'] = euclid_params
    num_clusters_dict[f'fourier #clusters{num_clusters}'] = fourier_params

# plot only the runs of this experiment
plot_path = "./experiment_results/clustered_num_clusters"
inf.plot_trainings_mean_min_max(num_clusters_dict, display_train_acc=False, display_only_mean=True, save=True, save_path=plot_path, display=True)

## Test the number of layers used for clustering

For this experiment the number of pretrained models used is set to 10 and the number of clusters is set to 10.

In [None]:
# numbers of convolutional layers that receive clustered filters
choices_num_layers = [1,2,6,10,17]
# results of this experiment only (model_params_dict collects all experiments)
num_filters_dict = {}

# NOTE: the loop variable rebinds the notebook-level name num_layers that the
# Gabor cell also uses.
for num_layers in choices_num_layers:
    # create a subset of the pre trained models
    subset_models = pre_trained_models[:10]

    # create num_models_per_experiment models with the clustered filters, per method
    euclid_models = [clustering_single_kmeans(subset_models, device, num_clusters=10, num_layers=num_layers) for i in range(num_models_per_experiment)]
    fourier_models = [clustering_single_fourier(subset_models, device, num_clusters=10, num_layers=num_layers) for i in range(num_models_per_experiment)]

    # evaluate the model
    path = f"./experiment_results/tracked_params/euclid/num_layers/euclid_model_num_layers_{num_layers}"
    euclid_params = eval_models(euclid_models, data_loaders, path, True)
    path = f"./experiment_results/tracked_params/fourier/num_layers/fourier_model_num_layers_{num_layers}"
    fourier_params = eval_models(fourier_models, data_loaders, path, True)
    
    model_params_dict[f'euclid #layers{num_layers}'] = euclid_params
    model_params_dict[f'fourier #layers{num_layers}'] = fourier_params
    
    num_filters_dict[f'euclid #layers{num_layers}'] = euclid_params
    num_filters_dict[f'fourier #layers{num_layers}'] = fourier_params

# plot only the runs of this experiment
plot_path = "./experiment_results/clustered_num_layers"
inf.plot_trainings_mean_min_max(num_filters_dict,display_train_acc=False,display_only_mean=True,save=True,save_path=plot_path,display=True)

# Testing the alignment algorithm

## Randomly initialized custom ResNet-18

In [None]:
from Custom_ResNet18 import custom_resnet_18

# Baseline for the alignment experiments: num_models_per_experiment randomly
# initialized custom ResNet-18 models.
# NOTE: this rebinds random_custom_models and r_params from the earlier
# "Testing Random Model" cell.
random_custom_models = []
for i in range(num_models_per_experiment):
    model = custom_resnet_18(num_classes=10).to(device)
    random_custom_models.append(model)
    
path = "./experiment_results/tracked_params/alignment/custom_random_model"
r_params = eval_models(random_custom_models, data_loaders, path, True)

# NOTE(review): the key 'random initialized' is the same one the torchvision
# ResNet-18 baseline was stored under, so that earlier entry is replaced here --
# confirm that this is intended.
model_params_dict['random initialized'] = r_params

plot_path = "./experiment_results/custom_random_init"
inf.plot_trainings_mean_min_max(model_params_dict, display_train_acc=True, display_only_mean=True, save=True, save_path=plot_path, display=True)

## Clustered custom ResNet-18 with alignment

In [None]:
# path prefix of the pretrained custom ResNet-18 checkpoints (model_{i}_{j}.pt)
path_to_custom_pretrained = "./"

def load_custom_pre_trained_models():
    """
    Loads all available pretrained custom ResNet-18 models.

    Checkpoints are looked up as ``{path_to_custom_pretrained}model_{i}_{j}.pt``
    for all 0 <= i < j < 10. Missing files are reported and skipped; any other
    loading error (e.g. a state dict that does not match the architecture) is
    raised instead of being reported as "not found".

    :return: List of the loaded models in random order.
    """
    pre_trained_models = []

    for i in range(10):
        for j in range(i+1,10):
            try:
                model = custom_resnet_18(num_classes=10).to(device)
                # map_location moves the stored tensors onto the current device
                model.load_state_dict(torch.load(f'{path_to_custom_pretrained}model_{i}_{j}.pt', map_location=device))
                pre_trained_models.append(model)
            except FileNotFoundError:
                print(f"Pretrained model {(i,j)} not found")
                continue

    # randomly shuffle the models
    np.random.shuffle(pre_trained_models)

    return pre_trained_models

def dft(weights):
    """
    Returns the real part of the 1D discrete Fourier transform of ``weights``
    (taken along the last axis, as done by ``np.fft.fft``).

    NOTE: an earlier cell of this notebook defines a different function with the
    same name (2D transform returning real and imaginary part); this definition
    replaces it once this cell has been run.
    """
    spectrum = np.fft.fft(weights)
    return np.real(spectrum)

def euclidean_distance(vec1, vec2):
    """Returns the Euclidean distance between two arrays of equal shape."""
    difference = vec1 - vec2
    # square root of the sum of the squared element-wise differences
    squared_sum = np.sum(difference ** 2)
    return np.sqrt(squared_sum)

def calculate_distance(cluster_group, vec):
    """
    Sums the distances between ``vec`` and every vector of a cluster group,
    measured in the Fourier domain.

    :param cluster_group: Iterable of vectors already assigned to the group.
    :param vec: Candidate vector.
    :return: Sum of the Euclidean distances between ``dft(vec)`` and ``dft`` of
        each group member; 0 for an empty group.
    """
    fourier_vec = dft(vec)
    return sum(
        euclidean_distance(fourier_vec, dft(member))
        for member in cluster_group
    )

def get_custom_clustering_groups(filters_per_model, num_filters_per_model):
    """
    Greedily aligns the filters of several models into groups.

    Every filter is flattened and scaled to unit norm. Models are processed in
    order; for each group (in order) the not yet used filter of the current
    model with the smallest summed distance to the group's members (see
    ``calculate_distance``) is added. For the first model all groups are
    empty, so its filters are assigned in their original order.

    :param filters_per_model: Sequence with one array of filters per model; the
        first axis of each array enumerates the filters.
    :param num_filters_per_model: Number of groups, i.e. of filters taken from
        each model.
    :return: Tuple ``(new_groups, permutations)``. ``new_groups[g]`` holds one
        normalized filter vector per model; ``permutations[m][g]`` is the index
        of the filter of model ``m`` that was assigned to group ``g``.
    :raises ValueError: If a model has fewer filters than groups, or if no
        filter with a comparable (non-NaN) distance is left for a group.
    """
    filter_vectors = []
    for filters in filters_per_model:
        model_vectors = []
        for i in range(filters.shape[0]):
            # Flatten (this copies) and normalize the filter
            filter_vec = filters[i].flatten()
            filter_vec /= np.linalg.norm(filter_vec)
            model_vectors.append(filter_vec)
        filter_vectors.append(model_vectors)

    new_groups = [[] for _ in range(num_filters_per_model)]
    permutations = []

    # Now we go through all models and assign filters in a greedy manner
    for model_idx, model_filters in enumerate(filter_vectors):
        if len(model_filters) < num_filters_per_model:
            raise ValueError(
                f"model {model_idx} has {len(model_filters)} filters, "
                f"but {num_filters_per_model} groups have to be filled"
            )

        used_indices = set()
        permutation = []

        # We go through every group of the new clusters and assign the min distance vector from the next model to it
        for cluster_group in new_groups:
            min_distance = float('inf')
            selected_vector_idx = None

            for idx, vec in enumerate(model_filters):
                # skip used filters before paying for the distance computation
                if idx in used_indices:
                    continue
                dist = calculate_distance(cluster_group, vec)
                if dist < min_distance:
                    min_distance = dist
                    selected_vector_idx = idx

            if selected_vector_idx is None:
                # happens when all remaining distances are NaN or infinite,
                # e.g. for an all-zero filter that cannot be normalized
                raise ValueError(
                    f"no assignable filter left in model {model_idx}"
                )

            cluster_group.append(model_filters[selected_vector_idx])
            used_indices.add(selected_vector_idx)
            permutation.append(selected_vector_idx)

        permutations.append(permutation)

    return new_groups, permutations

def custom_clustering(filters, num_input_channels, num_filters, kernel_size, include_permutations=False):
    """
    Aligns the filters of several models and averages each aligned group.

    :param filters: Sequence with one array of filters per model.
    :param num_input_channels: Number of input channels of the layer.
    :param num_filters: Number of filters (output channels) of the layer.
    :param kernel_size: Side length of the square kernel.
    :param include_permutations: If True, also return the permutation found for
        every model.
    :return: Float32 tensor of shape (num_filters, num_input_channels,
        kernel_size, kernel_size) on ``device``; followed by the permutations
        if ``include_permutations`` is set.
    """
    groups, permutations = get_custom_clustering_groups(filters, num_filters)

    # mean over the members of every group
    stacked_groups = np.array(groups)
    group_means = []
    for members in stacked_groups:
        group_means.append(np.mean(np.array(members), axis=0))

    # bring the flat mean vectors back into convolution-weight layout
    weight_shape = (num_filters, num_input_channels, kernel_size, kernel_size)
    averaged = np.array(group_means).reshape(weight_shape)
    averaged_tensor = torch.tensor(averaged, dtype=torch.float32).to(device)

    if not include_permutations:
        return averaged_tensor
    return averaged_tensor, permutations
    
def permute_bias(permuted_model_bias, perm):
    """
    Reorders the per-channel entries of a normalization module in place.

    :param permuted_model_bias: Module with ``weight``, ``bias``,
        ``running_mean`` and ``running_var`` attributes.
    :param perm: Index sequence; entry ``k`` of every attribute becomes the old
        entry ``perm[k]``.
    """
    # learnable parameters are wrapped into fresh Parameter objects
    for attr in ("weight", "bias"):
        reordered = getattr(permuted_model_bias, attr)[perm]
        setattr(permuted_model_bias, attr, torch.nn.Parameter(reordered))

    # running statistics are replaced by the reordered tensors directly
    for attr in ("running_mean", "running_var"):
        setattr(permuted_model_bias, attr, getattr(permuted_model_bias, attr)[perm])

def permute_layer(permuted_layer, permuted_layer_next, permuted_bias, perm):
    """
    Applies an output-channel permutation to a layer and propagates it in place.

    :param permuted_layer: Layer whose weight is reordered along axis 0.
    :param permuted_layer_next: Following layer; its weight is reordered along
        axis 1 with the same permutation.
    :param permuted_bias: Normalization module that is reordered with
        ``permute_bias``.
    :param perm: Index sequence describing the new channel order.
    """
    # reorder the output channels of the layer itself
    reordered_outputs = permuted_layer.weight[perm]
    permuted_layer.weight = torch.nn.Parameter(reordered_outputs)

    permute_bias(permuted_bias, perm)

    # reorder the matching input channels (axis 1) of the next layer
    next_weight_by_input = permuted_layer_next.weight.transpose(0,1)
    reordered_inputs = next_weight_by_input[perm].transpose(0,1)
    permuted_layer_next.weight = torch.nn.Parameter(reordered_inputs)

def create_initialized_model(models, num_layers):
    """Build a fresh ``custom_resnet_18`` whose leading convolutions are cluster averages.

    For each of up to five leading convolutions (``conv1``, then ``conv1`` and
    ``conv2`` of ``layer1[0]`` and of ``layer1[1]``), the corresponding weights
    of all ``models`` are clustered with ``custom_clustering`` and the averaged
    filters are written into the new model. Before moving on to the next
    convolution, every source model is reordered through ``permute_layer``
    with the permutation found for the current one.

    NOTE: ``models`` are modified in place by those permutations.

    Args:
        models: Trained models exposing ``conv1``, ``bn1`` and ``layer1``.
        num_layers: How many leading convolutions to initialize. Values 1-4
            stop after that many; any other value initializes all five.

    Returns:
        The newly created model (moved to ``device``).
    """
    final_model = custom_resnet_18(num_classes=10).to(device)

    # One entry per convolution, in forward order:
    # (this conv, conv consuming its output, norm layer following it,
    #  input channels, kernel size).
    # Accessors are lambdas so that only the layers actually needed for the
    # requested depth are touched.
    stages = [
        (lambda m: m.conv1, lambda m: m.layer1[0].conv1, lambda m: m.bn1, 3, 7),
        (lambda m: m.layer1[0].conv1, lambda m: m.layer1[0].conv2, lambda m: m.layer1[0].bn1, 64, 3),
        (lambda m: m.layer1[0].conv2, lambda m: m.layer1[1].conv1, lambda m: m.layer1[0].bn2, 64, 3),
        (lambda m: m.layer1[1].conv1, lambda m: m.layer1[1].conv2, lambda m: m.layer1[1].bn1, 64, 3),
        # Fifth stage targets layer1[1].conv2. The previous code re-clustered
        # layer1[1].conv1 here and never initialized conv2.
        (lambda m: m.layer1[1].conv2, None, None, 64, 3),
    ]

    for depth, (get_conv, get_next_conv, get_norm, in_channels, kernel_size) in enumerate(stages, start=1):
        trained_filters = [get_conv(model).weight.data.cpu().numpy() for model in models]
        sampled_filters, permutations = custom_clustering(filters=trained_filters,
                                                          num_input_channels=in_channels,
                                                          num_filters=64,
                                                          kernel_size=kernel_size,
                                                          include_permutations=True)
        get_conv(final_model).weight.data = sampled_filters

        # Stop once the requested depth is reached or there is nothing left to align.
        if depth == num_layers or get_next_conv is None:
            break

        # Align every source model to the clustering before processing the next conv.
        for idx, model in enumerate(models):
            permute_layer(get_conv(model), get_next_conv(model), get_norm(model), permutations[idx])

    return final_model

In [None]:
# Compare how many leading layers are initialized from the aligned clustering.
custom_pre_trained_models = load_custom_pre_trained_models()
num_models_dict = {}
num_models_used = 6
num_layers_candidates = [1, 2, 4]

# Every run draws from the same leading slice of pre-trained models.
subset_models = custom_pre_trained_models[:num_models_used]

for num_layers in num_layers_candidates:
    # One initialized model per repetition.
    clustered_models_alignment = [
        create_initialized_model(subset_models, num_layers)
        for _ in range(num_models_per_experiment)
    ]

    path = f"./experiment_results/tracked_params/alignment/custom_alignment_num_layers_{num_layers}"
    alignment_params = eval_models(clustered_models_alignment, data_loaders, path, True)
    num_models_dict[f'Alignment {num_layers} layers'] = alignment_params

plot_path = "./experiment_results/clustered_num_models"
inf.plot_trainings_mean_min_max(
    num_models_dict,
    display_train_acc=False,
    display_only_mean=True,
    save=True,
    save_path=plot_path,
    display=True,
)

# Test on Tiny ImageNet

In [None]:
import os

class CustomDataset(torch.utils.data.dataset.Dataset):
    """Tiny ImageNet split yielding ``(normalized image tensor, class index)`` pairs.

    Due to the size of the dataset, it was not included in the GitHub
    repository; it can be downloaded from
    https://www.kaggle.com/datasets/akash2sharma/tiny-imagenet

    Class indices are positions in ``wnids.txt``. For the training split the
    class id is the file-name prefix before the first underscore; otherwise it
    is looked up in ``val_annotations.txt``.
    """

    def __init__(self, path, is_train=True):
        """Index every file ending in ``JPEG`` below ``path``.

        Args:
            path: Split directory. Must contain ``wnids.txt`` and, when
                ``is_train`` is false, ``val_annotations.txt`` plus an
                ``images`` sub-directory.
            is_train: Selects how labels are derived (see class docstring).
        """
        with open(f'{path}/wnids.txt') as f:
            self.labels = [line.strip() for line in f]

        if not is_train:
            # Load the labels and file names from val_annotations.txt for the test set
            self.labels_map = {}
            with open(os.path.join(path, 'val_annotations.txt')) as f:
                for line in f:
                    parts = line.strip().split()
                    if len(parts) < 2:
                        # Tolerate blank or truncated lines instead of raising IndexError.
                        continue
                    self.labels_map[parts[0]] = parts[1]  # map filename to label

            path = os.path.join(path, 'images')  # Update path to images directory

        self.files = [
            os.path.join(root, name)
            for root, _dirs, names in os.walk(path)
            for name in names
            if name.endswith('JPEG')
        ]
        # os.walk yields entries in filesystem order; sort so that an index
        # refers to the same image on every machine and run.
        self.files.sort()

        TIN_MEAN = [0.485, 0.456, 0.406]
        TIN_STD = [0.229, 0.224, 0.225]

        self.transforms = torchvision.transforms.Compose([
            torchvision.transforms.ToTensor(),
            torchvision.transforms.Normalize(TIN_MEAN, TIN_STD)
        ])
        self.is_train = is_train

    def _label_for(self, file):
        """Return the class index for the image stored at ``file``."""
        # basename handles the platform's path separator, unlike split('/').
        file_name = os.path.basename(file)
        if self.is_train:
            label_name = file_name.split('_')[0]
        else:
            label_name = self.labels_map[file_name]
        return self.labels.index(label_name)

    def __getitem__(self, index):
        """Load image ``index`` as RGB, apply the transforms and return ``(img, label)``."""
        file = self.files[index]
        # NOTE(review): `Image` is presumably PIL.Image imported elsewhere in the notebook; confirm.
        img = Image.open(file).convert('RGB')
        img = self.transforms(img)
        return img, self._label_for(file)

    def __len__(self):
        """Return the number of indexed image files."""
        return len(self.files)
    
# Tiny ImageNet splits: the train folder infers labels from file names,
# the val folder reads them from its annotation file.
trainset = CustomDataset('./tiny_imagenet/train', is_train=True)
testset = CustomDataset('./tiny_imagenet/val', is_train=False)

# Single-element list so the loaders can be handed to eval_models like the other loader lists.
tinyIN_loaders = [{
    split: torch.utils.data.DataLoader(dataset, batch_size=128, shuffle=True, num_workers=4)
    for split, dataset in (('train', trainset), ('test', testset))
}]

## Random model

In [None]:
# Start a fresh result collection for the Tiny ImageNet comparison.
model_params_dict = {}

# Baseline: num_models_per_experiment randomly initialized ResNet-18 models (200 classes).
tiny_random_model = [
    resnet18(num_classes=200).to(device)
    for _ in range(num_models_per_experiment)
]

path = "./experiment_results/tracked_params/imagenet/tiny_imagenet_random_models"
r_params = eval_models(tiny_random_model, tinyIN_loaders, path, True)
model_params_dict['ImageNet 5 random initialized models'] = r_params

plot_path = "./experiment_results/random_IN_init"
inf.plot_trainings_mean_min_max(
    model_params_dict,
    display_train_acc=False,
    display_only_mean=True,
    save=True,
    save_path=plot_path,
    display=True,
)

## Pretrained model

In [None]:
# Run the first five pre-trained models through eval_models on Tiny ImageNet.
pre_trained_models_subset = pre_trained_models[:5]

path = './experiment_results/tracked_params/imagenet/tiny_imagenet_pretrained_models'
pretrained_params = eval_models(pre_trained_models_subset, tinyIN_loaders, path, True)
model_params_dict['ImageNet 5 pretrained models'] = pretrained_params

# model_params_dict is not reset here, so the plot shows every entry collected so far.
plot_path = './experiment_results/pretrained_IN_init'
inf.plot_trainings_mean_min_max(
    model_params_dict,
    display_train_acc=False,
    display_only_mean=True,
    save=True,
    save_path=plot_path,
    display=True,
)

## Clustered model

In [None]:
num_models_used = 6
num_clusters = 3
num_layers = 17

# Each run clusters its own consecutive, non-overlapping slice of
# num_models_used pre-trained models. Raise num_models_per_experiment for more runs.
clustered_models = []
for i in range(num_models_per_experiment):
    start_idx = i * num_models_used
    end_idx = start_idx + num_models_used
    subset_models = pre_trained_models[start_idx:end_idx]

    model = clustering_single_fourier(
        subset_models,
        device,
        num_clusters=num_clusters,
        num_layers=num_layers,
    )
    clustered_models.append(model)
    print(f"Finished clustering [{i+1}/{num_models_per_experiment}] models")

In [None]:
# Run the clustered models through eval_models on Tiny ImageNet and add them to the comparison plot.
path = './experiment_results/tracked_params/imagenet/tiny_imagenet_clustered_models'
clustered_params = eval_models(clustered_models, tinyIN_loaders, path, True)
model_params_dict['ImageNet 5 clustered models'] = clustered_params

plot_path = './experiment_results/clustered_IN_init'
inf.plot_trainings_mean_min_max(
    model_params_dict,
    display_train_acc=False,
    display_only_mean=True,
    save=True,
    save_path=plot_path,
    display=True,
)