## Set up the analysis and load data.

In [1]:
import pandas as pd

# HDF5 dataset files available for this analysis.
IMAGENET_FILE = "imagenet-96-angular.hdf5"
STACKOVERFLOW_FILE = "stackoverflow-512-angular.hdf5"
GLOVE_FILE = "glove-100-angular.hdf5"

# num_queries: how many test vectors are kept as queries when loading.
# NOTE(review): `k` is not referenced by any cell visible here — confirm it is still needed.
num_queries, k = 100, 10

# Quantile levels reported in every summary table (min, 10%, median, 90%, max).
quantiles = (0.0, 0.1, 0.5, 0.9, 1.0)

In [2]:
import h5py
import numpy as np
from sklearn import preprocessing

def load_data(filename):
    """Load and L2-normalise the train/test vectors stored in an HDF5 file.

    Args:
        filename: path of an HDF5 file containing 'train' and 'test' datasets.

    Returns:
        A ``(train_data, test_data)`` tuple of row-normalised arrays. Only the
        first ``num_queries`` (module-level setting) test rows are kept.
    """
    # The context manager guarantees the file handle is released even if a
    # read fails; the arrays are fully materialised before the file closes.
    with h5py.File(filename, 'r') as dataset:
        train_data = np.array(dataset['train'])
        # Slice while reading so only the rows that are kept get loaded.
        test_data = np.asarray(dataset['test'][:num_queries])

    train_data = preprocessing.normalize(train_data, axis=1, norm='l2')
    test_data = preprocessing.normalize(test_data, axis=1, norm='l2')
    return (train_data, test_data)

# Load the dataset named by IMAGENET_FILE and report the training-set shape.
train_data, test_data = load_data(IMAGENET_FILE)
print('train_data.shape:', train_data.shape)

train_data.shape: (1000000, 96)


## How well does the data cluster?

In [12]:
import faiss

def run_kmeans(data, n_centroids):
    """Fit a faiss k-means model on a random half of ``data``.

    Args:
        data: 2-D array with one vector per row.
        n_centroids: number of clusters to fit.

    Returns:
        The trained ``faiss.Kmeans`` object (20 iterations, verbose logging).
    """
    sample_size = len(data) // 2

    # Permute row indices rather than the rows themselves, so the full matrix
    # is not copied just to pick a subset.
    rows = np.random.permutation(len(data))[:sample_size]
    # The sample is converted to contiguous float32 before being handed to faiss.
    sample = np.ascontiguousarray(data[rows], dtype=np.float32)

    kmeans = faiss.Kmeans(data.shape[1], n_centroids, niter=20, verbose=True)
    # Train on the sample: previously the sample was built and then ignored,
    # and the model was trained on the full, unconverted input.
    kmeans.train(sample)
    return kmeans

def compute_cluster_stats(model, data):
    """Collect vector-to-centroid and centroid-to-centroid distances.

    A random half of ``data`` is searched against every centroid of ``model``.

    Args:
        model: object exposing ``centroids`` and ``index.search(x, k)``.
            NOTE(review): the search is assumed to return squared distances
            sorted ascending per row (hence the square roots and the use of
            column 0 as "nearest") — confirm against the index in use.
        data: 2-D array with one vector per row.

    Returns:
        A tuple of three lists:
        ``(dists, other_dists, centroid_dists)`` — column 0 of each sampled
        row, the remaining columns of each sampled row, and columns 1.. of the
        centroid-vs-centroid search, all square-rooted.
    """
    n_centroids = len(model.centroids)
    half = int(len(data) / 2)
    subset = np.random.permutation(data)[:half]

    # Sampled vectors against all centroids: first column vs. the rest.
    sq_to_centroids, _ = model.index.search(subset, n_centroids)
    dists = list(np.sqrt(sq_to_centroids[:, 0]))
    other_dists = list(np.sqrt(sq_to_centroids[:, 1:]).ravel())

    # Centroids against themselves; column 0 is skipped for every row.
    sq_between, _ = model.index.search(model.centroids, n_centroids)
    centroid_dists = list(np.sqrt(sq_between[:, 1:]).ravel())

    return (dists, other_dists, centroid_dists)

### Run k-means on the dataset.

In [13]:
# Row labels for the three series returned by compute_cluster_stats, in order.
index = [
    'dist to nearest centroid',
    'dist to other centroids',
    'dist between centroids',
]
# Number of k-means clusters used for the first experiments.
n_centroids = 1000

In [14]:
# Cluster the real dataset and summarise each distance series by its quantiles.
kmeans = run_kmeans(train_data, n_centroids)
cluster_stats = compute_cluster_stats(kmeans, train_data)

output = list(map(lambda series: np.quantile(series, quantiles), cluster_stats))
df = pd.DataFrame(output, index=index, columns=quantiles)
df.round(3)

Unnamed: 0,0.0,0.1,0.5,0.9,1.0
dist to nearest centroid,0.095,0.506,0.685,0.828,1.009
dist to other centroids,0.217,1.021,1.185,1.295,1.65
dist between centroids,0.252,0.766,0.95,1.09,1.424


### Run k-means on a random matrix.

In [6]:
# Baseline: Gaussian noise with the same shape as the training matrix,
# projected onto the unit sphere and stored as float32.
n, p = train_data.shape[:2]

random_data = preprocessing.normalize(
    np.random.normal(0, 1, size=(n, p)), axis=1, norm='l2'
).astype(np.float32)

In [7]:
# Same clustering summary as for the real data, but on the random baseline.
random_kmeans = run_kmeans(random_data, n_centroids)
random_cluster_stats = compute_cluster_stats(random_kmeans, random_data)

output = list(map(lambda series: np.quantile(series, quantiles), random_cluster_stats))
df2 = pd.DataFrame(output, index=index, columns=quantiles)
df2.round(3)

Unnamed: 0,0.0,0.1,0.5,0.9,1.0
dist to nearest centroid,0.868,0.984,0.99,0.993,0.998
dist to other centroids,0.98,1.005,1.015,1.025,1.054
dist between centroids,0.22,0.239,0.246,0.254,0.273


## How do kNN distances relate to these clusters?

### For a small test sample, calculate the distance to centroids, plus the nearest neighbors at ranks 1, 10, and 100.

In [8]:
# Row labels for the kNN tables: one cluster-distance row followed by the
# seven series returned by compute_test_stats, in order.
# NOTE(review): this rebinds the `index` name used by the earlier k-means tables.
index = [
    'd(c, v)',
    'd(q, c_10)',
    'd(q, c_100)',
    'd(q, c_500)',
    'd(q, c_900)',
    'nn_1',
    'nn_10',
    'nn_100',
]

def compute_test_stats(train_data, test_data, model):
    """Per-query distances to ranked centroids and to exact nearest neighbours.

    For every query the centroid search result is read at ranks 10, 100, 500
    and 900, and the brute-force Euclidean distances to ``train_data`` are
    read at ranks 1, 10 and 100. The ranks are fixed, so a non-empty
    ``test_data`` needs at least 900 centroids and at least 100 training rows.

    Args:
        train_data: 2-D array of indexed vectors, one per row.
        test_data: 2-D array of query vectors, one per row.
        model: object exposing ``centroids`` and ``index.search(x, k)``.
            NOTE(review): the search is assumed to return squared distances
            sorted ascending per row — confirm against the index in use.

    Returns:
        Tuple of seven lists, one entry per query:
        ``(centroid10, centroid100, centroid500, centroid900, nn1, nn10, nn100)``.
    """
    nn1, nn10, nn100 = [], [], []
    centroid10, centroid100, centroid500, centroid900 = [], [], [], []

    D, _ = model.index.search(test_data, len(model.centroids))

    for i, q in enumerate(test_data):
        dists = np.linalg.norm(q - train_data, axis=1)
        # Only three order statistics are read, so a partial selection gives
        # the same values as a full sort without ordering every distance.
        ranked = np.partition(dists, (0, 9, 99))

        centroid10.append(np.sqrt(D[i][9]))
        centroid100.append(np.sqrt(D[i][99]))
        centroid500.append(np.sqrt(D[i][499]))
        centroid900.append(np.sqrt(D[i][899]))

        nn1.append(ranked[0])
        nn10.append(ranked[9])
        nn100.append(ranked[99])

    return (centroid10, centroid100, centroid500, centroid900, nn1, nn10, nn100)

In [9]:
# First row: nearest-centroid distances from the clustering sample;
# remaining rows: the per-query series from compute_test_stats.
test_stats = compute_test_stats(train_data, test_data, kmeans)
output = [
    np.quantile(series, quantiles)
    for series in (cluster_stats[0], *test_stats)
]

df = pd.DataFrame(output, index=index, columns=quantiles)
df.round(3)

Unnamed: 0,0.0,0.1,0.5,0.9,1.0
"d(c, v)",0.187,0.37,0.476,0.631,1.017
"d(q, c_10)",0.451,0.499,0.572,0.721,0.931
"d(q, c_100)",0.587,0.643,0.695,0.817,1.022
"d(q, c_500)",0.706,0.774,0.831,0.928,1.105
"d(q, c_900)",0.871,0.91,0.963,1.031,1.169
nn_1,0.25,0.326,0.431,0.596,0.712
nn_10,0.294,0.383,0.483,0.683,0.826
nn_100,0.355,0.431,0.528,0.719,0.923


### Would more fine-grained clustering help?

In [11]:
more_centroids = 2000
more_kmeans = run_kmeans(train_data, more_centroids)

# Distance from a random 10% of the training vectors to their nearest centroid.
sample_size = int(len(train_data) / 10)
sample = np.random.permutation(train_data)[:sample_size]

# Only the closest centroid is needed here, so request a single neighbour
# instead of materialising a (sample_size x more_centroids) result and
# discarding all but its first column.
D, I = more_kmeans.index.search(sample, 1)
dists = list(np.sqrt(D[:, 0]))

In [12]:
# Same table as before, built from the finer clustering.
# NOTE(review): the d(q, c_N) rows use the same fixed ranks although this model
# has more_centroids (2000) centroids, so they are not rank-for-rank comparable
# with the 1000-centroid table.
more_test_stats = compute_test_stats(train_data, test_data, more_kmeans)
output = [
    np.quantile(series, quantiles)
    for series in (dists, *more_test_stats)
]

df = pd.DataFrame(output, index=index, columns=quantiles)
df.round(3)

Unnamed: 0,0.0,0.1,0.5,0.9,1.0
"d(c, v)",0.171,0.356,0.461,0.615,0.991
"d(q, c_10)",0.418,0.47,0.545,0.702,0.922
"d(q, c_100)",0.56,0.587,0.655,0.786,1.002
"d(q, c_500)",0.647,0.715,0.77,0.878,1.071
"d(q, c_900)",0.702,0.766,0.83,0.925,1.105
nn_1,0.25,0.326,0.431,0.596,0.712
nn_10,0.294,0.383,0.483,0.683,0.826
nn_100,0.355,0.431,0.528,0.719,0.923
