## Set up

In [2]:
import sys
sys.path.append('./scripts/')

import math
import copy
import random # random seed to reproduce MDS and t-SNE plots
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")
from PIL import Image

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import DBSCAN
from sklearn import manifold # MDS and t-SNE
from sklearn import cluster # k-Means clustering
from sklearn import preprocessing # scaling attributes
from sklearn.metrics import silhouette_score # silhouette width for clustering
from sklearn.decomposition import PCA
import hdbscan

# Re-import the project helper modules so edits made to ./scripts/*.py are
# picked up without restarting the kernel.
# `imp` is deprecated and was removed in Python 3.12; importlib.reload is the
# supported replacement with the same semantics.
import importlib
import my_datasets
import utilities
importlib.reload(my_datasets)
importlib.reload(utilities)

# Small default figure size for the quick scatterplots in this notebook.
plt.rcParams["figure.figsize"] = (3,3)

  import imp


In [3]:
# Size suffix of the cached arrays loaded further down
# (predictions_{lim}.npy and conv_maps_{lim}.npy).
lim = 10

In [4]:
dataset='ilsvrc12'
# Pass the notebook-level `lim` instead of a second literal 10, so the dataset
# subset cannot drift from the predictions_{lim}/conv_maps_{lim} files loaded later.
# NOTE(review): the recorded output of this cell is
# "TypeError: get_dataset() got an unexpected keyword argument 'lim'" - confirm the
# current signature of my_datasets.get_dataset and adjust the keyword if it was renamed.
paths, count, y, idx_to_labels = my_datasets.get_dataset(dataset, lim=lim)

print(count, len(paths))

TypeError: get_dataset() got an unexpected keyword argument 'lim'

In [None]:
# Layer whose saved outputs are analysed; also used as the per-layer sub-folder name.
layer='Mixed_7b.cat_2'
# Root output folder for the selected dataset.
SAVEFOLD0=f'../outputs/{dataset}'

# Per-layer folder. It already ends in "/", so the f"{SAVEFOLD}/..." paths built
# in later cells contain a double slash.
SAVEFOLD=f"{SAVEFOLD0}/{layer}/"

In [None]:
# temp
# Fetches Inception v3 with pretrained weights through torch.hub, pinned to the
# torchvision v0.9.0 tag.
# NOTE(review): `model` is not referenced by any later cell shown in this
# notebook - confirm whether this cell is still needed.
import torch
model = torch.hub.load('pytorch/vision:v0.9.0', 'inception_v3', pretrained=True)
# Put the network in evaluation mode; as the last expression of the cell this
# also displays the model repr.
model.eval()

In [None]:
# gradients_wrt_conv_layer=np.load(f"{SAVEFOLD}/gradients_wrt_conv_layer.npy")
# mmap_mode='r' memory-maps the files read-only instead of reading them fully into RAM.
predictions=np.load(f"{SAVEFOLD}/predictions_{lim}.npy", mmap_mode = 'r')
conv_maps=np.load(f"{SAVEFOLD}/conv_maps_{lim}.npy", mmap_mode = 'r')

# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code - only load files from a trusted source.
# `pvh` is not referenced by any later cell shown in this notebook.
pvh=np.load(f"{SAVEFOLD}/eigenvectors.npy",allow_pickle=True, mmap_mode = 'r')

In [None]:
# Chooses which version of the pooled activations the clustering cells work on.
transforms = "standardise" # None / "standardise" / "normalise"

In [None]:
# Average over axis 3 and then axis 2, leaving a 2D (row, column) matrix.
# Presumably conv_maps is (images, channels, height, width), i.e. this is a
# global average pool per channel - TODO confirm against the code that saved it.
conv_maps_avg = conv_maps.mean(3).mean(2)

In [None]:
# Two alternative column-wise rescalings of the pooled activations; which one is
# used downstream is decided by `transforms`.
scale = StandardScaler()
normalise = MinMaxScaler()

# Both scalers are fitted on, and applied to, the full pooled activation matrix.
standardised_data = scale.fit_transform(conv_maps_avg) 
normalised_data = normalise.fit_transform(conv_maps_avg) # .shape (10000, 2048)

In [None]:
# Bind `activations` to the matrix selected by `transforms`; every later cell
# reads `activations`, never the individual variants.
if transforms == "standardise":
    activations = standardised_data
    print("Standardise")
elif transforms == "normalise":
    activations = normalised_data
    # Message fixed: it previously printed the typo "Normaliseise".
    print("Normalise")
else:
    # None (or any unrecognised value) falls back to the unscaled pooled activations.
    activations = conv_maps_avg
    print("Raw activations")

In [None]:
# pca = PCA(n_components=200)
# activations = pca.fit_transform(activations)

In [None]:
# conv_maps_avg

In [None]:
# activations

In [None]:
# Seed both RNGs. scikit-learn estimators created without `random_state`
# (the MDS / t-SNE calls below) draw from NumPy's global generator, which
# random.seed() alone does not touch.
SEED = 2021
random.seed(SEED)
np.random.seed(SEED)

## Utilities

Scatterplot to visualise clusters

In [None]:
# Colour cycle for clusters; label k uses colors[k % len(colors)].
colors = np.array(['orange', 'blue', 'lime', 'khaki', 'pink', 
                   'green', 'purple', 'yellow'])

def clustering_scatterplot(points, labels, centers, title):
    """Draw a 2D scatterplot of clustered points on the current matplotlib axes.

    Parameters
    ----------
    points : array-like, shape (n_points, 2)
        (x, y) coordinates of the data points.
    labels : array-like of int, shape (n_points,)
        Cluster label of each point. Negative labels (e.g. the -1 that
        DBSCAN / HDBSCAN assign to noise) are drawn in grey and labelled 'noise'.
    centers : array-like, shape (n_clusters, 2), or None
        (x, y) coordinates of cluster centres, drawn as red stars when given.
    title : str
        Title of the plot.
    """
    points = np.asarray(points)
    labels = np.asarray(labels)

    # Iterate over the labels that are actually present. The previous
    # range(number_of_unique_labels) silently dropped noise points (-1) and
    # mis-assigned clusters whenever the labels were not exactly 0..k-1.
    for lab in np.unique(labels):
        lab = int(lab)
        mask = labels == lab
        if lab < 0:
            colour, name = 'grey', 'noise'
        else:
            colour, name = str(colors[lab % colors.size]), 'cluster ' + str(lab)
        plt.scatter(points[mask, 0], points[mask, 1], c=colour, label=name)

    # plot the centers of the clusters
    if centers is not None:
        centers = np.asarray(centers)
        plt.scatter(centers[:, 0], centers[:, 1], c='r', marker='*', s=500)

    plt.title(title)
    plt.legend()
    plt.xlabel('x')
    plt.ylabel('y')

## Clustering

In [None]:
# Direction index passed to utilities.get_activations, and how many images to keep.
neuron = 57
top = 50
# argsort() is ascending, so the last `top` positions are the largest values;
# reversing them puts the strongest first.
direction_scores = utilities.get_activations(activations_avg=activations, direction=neuron)
ascending_order = direction_scores.argsort()
top_ims = ascending_order[-top:][::-1]

In [None]:
# Presumably the rows of `activations` belonging to the images in `top_ims` -
# verify against utilities.get_activations.
top_activations = utilities.get_activations(activations_avg = activations, ims=top_ims)
# top_activations.shape # (50, 2048) with top = 50
# top_activations

In [None]:
# XY_MDS = manifold.MDS(n_components=2).fit_transform(top_activations)
# plt.scatter(x=XY_MDS[:,0],y=XY_MDS[:,1])

In [None]:
# XY_TSNE = manifold.TSNE(n_components=2,perplexity=10).fit_transform(top_activations)
# plt.scatter(x=XY_TSNE[:,0],y=XY_TSNE[:,1])

## Try clustering algorithms

### AgglomerativeClustering

In [None]:
from sklearn.cluster import AgglomerativeClustering
# NOTE(review): dendrogram and linkage are imported but not used in the cells shown.
from scipy.cluster.hierarchy import dendrogram, linkage
# Ward-linkage agglomerative clustering of the rows of top_activations into two clusters.
clusterer = AgglomerativeClustering(n_clusters=2, metric='euclidean', linkage='ward')
# Last expression of the cell: displays one cluster label per row of top_activations.
clusterer.fit_predict(top_activations)

In [None]:
# temp: inspect the pairwise distances to pick a sensible distance_threshold.
from sklearn.metrics.pairwise import pairwise_distances # cosine_distances
# Square matrix of Euclidean distances between every pair of rows of top_activations.
distance_matrix = pairwise_distances(top_activations, metric = 'euclidean')
distance_matrix

In [None]:
# Cut the Ward dendrogram at a merge distance of 50 instead of fixing the cluster count.
# Two constructor constraints of scikit-learn's AgglomerativeClustering apply here:
#  - exactly one of n_clusters / distance_threshold may be set, so n_clusters
#    must be passed as None explicitly (its default is 2);
#  - linkage='ward' only accepts metric='euclidean', so the model is fitted on
#    the feature vectors themselves rather than on a precomputed distance matrix.
clusterer = AgglomerativeClustering(n_clusters=None, metric='euclidean',
                                    linkage='ward', distance_threshold=50)
clusterer.fit(top_activations)
print(clusterer.labels_)
# The Euclidean distance matrix is still printed to help choose the threshold.
print(distance_matrix)
# Smallest non-zero pairwise distance (the zero diagonal is excluded).
print(np.min(distance_matrix[np.nonzero(distance_matrix)]))

In [None]:
# Positions 0..n-1 ordered by cluster label; sorting (label, position) pairs
# keeps the original relative order among items sharing a label.
clu_labs = clusterer.labels_
clu_lab_order = [pos for _, pos in sorted((lab, pos) for pos, lab in enumerate(clu_labs))]

In [None]:
# Show the selected images in a 5-column grid, grouped by cluster label.
n_cols = 5
# True division before ceil: the previous `len(top_ims)//5` floored first, so a
# count that is not a multiple of 5 produced too few rows and an IndexError.
n_rows = max(1, math.ceil(len(top_ims) / n_cols))
# squeeze=False keeps `ax` 2D even for a single row, so flatten() always works.
fig, ax = plt.subplots(n_rows, n_cols, figsize = (10,20), squeeze=False)
ax = ax.flatten()
ordered_labels = clu_labs[clu_lab_order]
for idx, im_id in enumerate(top_ims[clu_lab_order]):
    # Context manager closes the image file once it has been drawn.
    with Image.open(paths[im_id]) as im:
        ax[idx].imshow(im)
    ax[idx].set_title(f"{im_id}: cluster {ordered_labels[idx]}", size = 8)
    ax[idx].axis('off')
# Hide the panels of the last row that received no image.
for unused in ax[len(top_ims):]:
    unused.axis('off')

In [None]:
# Apply multi-dimensional scaling (MDS) to project the data to a 2D space.
# random_state pins the otherwise random initialisation so the plot is
# reproducible across runs.
XYcoordinates = manifold.MDS(n_components=2, random_state=2021).fit_transform(top_activations)
print("transformation complete")
clustering_scatterplot(points=XYcoordinates,
                       labels=clusterer.labels_,
                       centers=None,
                       title='MDS')

In [None]:
# Apply t-SNE to project the data to a 2D space.
# random_state pins the stochastic optimisation so the plot is reproducible;
# perplexity must stay below the number of rows in top_activations.
XYcoordinates = manifold.TSNE(n_components=2, perplexity=10, random_state=2021).fit_transform(top_activations)
print("transformation complete")
clustering_scatterplot(points=XYcoordinates,
                       labels=clusterer.labels_,
                       centers=None,
                       title='TSNE')

### FeatureAgglomeration

In [None]:
from sklearn.cluster import FeatureAgglomeration
from scipy.cluster.hierarchy import dendrogram, linkage
clusterer = FeatureAgglomeration(n_clusters=2, metric='euclidean', linkage='ward')
# FeatureAgglomeration groups *columns* and deliberately has no fit_predict
# (accessing it raises AttributeError). The following cells use labels_ as one
# label per image, so fit on the transpose: the images become the "features"
# being grouped and labels_ has one entry per row of top_activations.
clusterer.fit(top_activations.T)
clusterer.labels_

In [None]:
# Positions 0..n-1 ordered by cluster label; pairing each label with its
# position keeps the original relative order among equal labels.
clu_labs = clusterer.labels_
label_position_pairs = sorted(zip(clu_labs, range(len(clu_labs))))
clu_lab_order = [position for _, position in label_position_pairs]

In [None]:
# Show the selected images in a 5-column grid, grouped by cluster label.
n_cols = 5
# True division before ceil: the previous `len(top_ims)//5` floored first, so a
# count that is not a multiple of 5 produced too few rows and an IndexError.
n_rows = max(1, math.ceil(len(top_ims) / n_cols))
# squeeze=False keeps `ax` 2D even for a single row, so flatten() always works.
fig, ax = plt.subplots(n_rows, n_cols, figsize = (10,20), squeeze=False)
ax = ax.flatten()
ordered_labels = clu_labs[clu_lab_order]
for idx, im_id in enumerate(top_ims[clu_lab_order]):
    # Context manager closes the image file once it has been drawn.
    with Image.open(paths[im_id]) as im:
        ax[idx].imshow(im)
    ax[idx].set_title(f"{im_id}: cluster {ordered_labels[idx]}", size = 8)
    ax[idx].axis('off')
# Hide the panels of the last row that received no image.
for unused in ax[len(top_ims):]:
    unused.axis('off')

### HDBSCAN

!pip install hdbscan

In [None]:
# test
# clusterer = hdbscan.HDBSCAN()
# clusterer.fit(top_activations)
# clusterer.labels_
# clusterer

In [None]:
# activations.shape # (10000, 2048)
# top_activations.shape # (50, 2048)
#hclusterer.labels_.shape

In [None]:
# NOTE(review): this import and the distance matrix repeat the earlier
# AgglomerativeClustering cells; they are recomputed here so the section can be run on its own.
from sklearn.metrics.pairwise import pairwise_distances # cosine_distances
distance_matrix = pairwise_distances(top_activations, metric = 'euclidean')
# HDBSCAN on the precomputed distance matrix. allow_single_cluster=True permits a
# result where all non-noise points form one cluster.
# NOTE(review): hdbscan's precomputed path is known to expect a float64 matrix -
# confirm the dtype of distance_matrix if top_activations is float32.
clusterer = hdbscan.HDBSCAN(metric='precomputed', cluster_selection_epsilon=5, allow_single_cluster = True)
clusterer.fit(distance_matrix)
# One label per row of the matrix; HDBSCAN marks noise points with -1.
print(clusterer.labels_)
print(distance_matrix)
# Smallest non-zero pairwise distance (the zero diagonal is excluded).
print(np.min(distance_matrix[np.nonzero(distance_matrix)]))

In [None]:
# print(np.max(distance_matrix)) # 1.1333303243154806
# distance_matrix.shape # (50, 50)
# 1-distance_matrix
# np.max(1-distance_matrix) # 1.0
# np.min(1-distance_matrix) # -0.13333032431548064

In [None]:
# ax = plt.subplot()
# im = ax.imshow(1-distance_matrix, cmap='viridis', interpolation='nearest') 
# plt.title("Cosine similarities")
# plt.subplots_adjust(right=0.8)
# cbar_ax = plt.axes([0.85, 0.1, 0.075, 0.8])
# plt.colorbar(mappable=(im), cax=cbar_ax)
# plt.show()

In [None]:
# The HDBSCAN model fitted above is bound to `clusterer`; the previous name
# `hdbscan_clusterer` is never defined in this notebook and raised NameError.
clu_labs = clusterer.labels_
print(clu_labs)
# Positions ordered by cluster label (noise, labelled -1, sorts first); sorted()
# is stable, so items sharing a label keep their original relative order.
clu_lab_order = sorted(range(len(clu_labs)), key=lambda k: clu_labs[k])

In [None]:
# Show the selected images in a 5-column grid, grouped by cluster label.
n_cols = 5
# True division before ceil: the previous `len(top_ims)//5` floored first, so a
# count that is not a multiple of 5 produced too few rows and an IndexError.
n_rows = max(1, math.ceil(len(top_ims) / n_cols))
# squeeze=False keeps `ax` 2D even for a single row, so flatten() always works.
fig, ax = plt.subplots(n_rows, n_cols, figsize = (10,20), squeeze=False)
ax = ax.flatten()
ordered_labels = clu_labs[clu_lab_order]
for idx, im_id in enumerate(top_ims[clu_lab_order]):
    # Context manager closes the image file once it has been drawn.
    with Image.open(paths[im_id]) as im:
        ax[idx].imshow(im)
    ax[idx].set_title(f"{im_id}: cluster {ordered_labels[idx]}", size = 8)
    ax[idx].axis('off')
# Hide the panels of the last row that received no image.
for unused in ax[len(top_ims):]:
    unused.axis('off')

In [None]:
# NOTE(review): same import and computation as the earlier distance-matrix cells;
# repeated here to display the matrix before the (commented-out) DBSCAN experiment.
from sklearn.metrics.pairwise import pairwise_distances # cosine_distances
distance_matrix = pairwise_distances(top_activations, metric = 'euclidean')
distance_matrix

In [None]:
# Append the cluster centers to the dataset.
# clustered_data_sklearn = DBSCAN(eps=10).fit(top_activations) # , metric = "cosine"
# print(clustered_data_sklearn.labels_)
# clustered_data_sklearn