## Set-up analysis and load data.

In [1]:
import pandas as pd

GLOVE_FILE = "glove-100-angular.hdf5"
STACKOVERFLOW_FILE = "stackoverflow-512-angular.hdf5"
IMAGENET_FILE = "imagenet-96-angular.hdf5"

num_queries = 100
k = 10

quantiles = (0.0, 0.1, 0.5, 0.9, 1.0)

In [2]:
import h5py
import numpy as np
from sklearn import preprocessing

def load_data(filename):
    dataset = h5py.File(filename, 'r')

    train_data = np.array(dataset['train'])
    test_data = np.array(dataset['test'])[:num_queries]

    train_data = preprocessing.normalize(train_data, axis=1, norm='l2')
    test_data = preprocessing.normalize(test_data, axis=1, norm='l2')
    return (train_data, test_data)

(train_data, test_data) = load_data(STACKOVERFLOW_FILE)
print('train_data.shape:', train_data.shape)

train_data.shape: (1000000, 512)


In [8]:
import faiss

def run_kmeans(data, n_centroids):
    sample_size = int(len(data) / 10)

    sample = np.random.permutation(data)[:sample_size]
    sample = sample.astype(np.float32)

    kmeans = faiss.Kmeans(data.shape[1], n_centroids, niter=20, verbose=True)
    kmeans.train(data)
    return kmeans

def compute_cluster_stats(model, data):
    sample_size = int(len(data) / 10)
    sample = np.random.permutation(data)[:sample_size]
    
    dists = []
    other_dists = []
    centroid_dists = []
    
    D, I = model.index.search(sample, len(model.centroids))
    for i, v in enumerate(sample):
        dists.append(np.sqrt(D[i][0]))
        other_dists.extend(np.sqrt(D[i][1:]))
        
    D, I = model.index.search(model.centroids, len(model.centroids))
    for i, v in enumerate(model.centroids):
        centroid_dists.extend(np.sqrt(D[i][1:]))
        
    return (dists, other_dists, centroid_dists)

def compute_cluster_counts(model, data):
    counts = [0] * len(model.centroids)
    
    D, I = model.index.search(data, 1)
    for i, v in enumerate(data):
        centroid = I[i][0]
        counts[centroid] += 1
    return counts

## How robust is k-means to outliers (from another distribution)?

In [4]:
from sklearn import decomposition
from sklearn import neighbors

PCA_DIMS = 100
np.random.seed(0)

pca = decomposition.PCA(n_components=PCA_DIMS)
train_data = pca.fit_transform(train_data)

In [5]:
(other_train_data, other_test_data) = load_data(GLOVE_FILE)

In [6]:
def add_outliers(data, other_data, percentage):
    sample_size = int(data.shape[0] * percentage)
    sample = np.random.permutation(other_data)[:sample_size]
    
    indices = tuple([np.random.randint(data.shape[0], size=sample_size)])
    new_data = np.copy(data)
    new_data[indices] = sample
    return new_data

In [9]:
n_centroids = 1000
index = ['dist to nearest centroid', 'dist to other centroids', 'dist between centroids', 'cluster sizes']

percentage = 0.01
new_data = add_outliers(train_data, other_train_data, percentage)

kmeans = run_kmeans(new_data, n_centroids)
(dists, other_dists, centroid_dists) = compute_cluster_stats(kmeans, new_data)
counts = compute_cluster_counts(kmeans, new_data)

output = [np.quantile(dists, quantiles),
          np.quantile(other_dists, quantiles),
          np.quantile(centroid_dists, quantiles),
          np.quantile(counts, quantiles)]
df = pd.DataFrame(output, index=index, columns=quantiles)
print(df.round(3).to_string())

                            0.0      0.1      0.5       0.9       1.0
dist to nearest centroid  0.175    0.343    0.449     0.601     1.011
dist to other centroids   0.223    0.665    0.824     0.983     1.614
dist between centroids    0.178    0.526    0.686     0.827     1.500
cluster sizes             1.000  472.700  950.500  1588.500  3070.000


In [None]:
sum(1 for c in counts if c < 10)

## How robust is k-means to outliers (random vectors)?

In [None]:
n = train_data.shape[0]
p = train_data.shape[1]

random_data = np.random.normal(0, 1, n * p).reshape(n, p)
random_data = preprocessing.normalize(random_data, axis=1, norm='l2')
random_data = random_data.astype(np.float32)

In [None]:
n_centroids = 1000
index = ['dist to nearest centroid', 'dist to other centroids', 'dist between centroids', 'cluster sizes']

percentage = 0.01
new_data = add_outliers(train_data, random_data, percentage)

kmeans = run_kmeans(new_data[0:1000000,:], n_centroids)
(dists, other_dists, centroid_dists) = compute_cluster_stats(kmeans, new_data[0:1000000,:])
counts = compute_cluster_counts(kmeans, new_data)

output = [np.quantile(dists, quantiles),
          np.quantile(other_dists, quantiles),
          np.quantile(centroid_dists, quantiles),
          np.quantile(counts, quantiles)]
df = pd.DataFrame(output, index=index, columns=quantiles)
print(df.round(3).to_string())

In [None]:
sum(1 for c in counts if c < 10)