# Easily export Jupyter cells to a Python module
https://github.com/fastai/course-v3/blob/master/nbs/dl2/notebook2script.py

In [3]:
! python /tf/src/scripts/notebook2script.py clustering.ipynb

Converted clustering.ipynb to exp/nb_clustering.py


In [None]:
cd /tf/src/data/features

In [None]:
! pip3 install scipy

In [None]:
#export
import pickle
import numpy as np
import os
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score, pairwise_distances_argmin_min

# K-means

In [None]:
# Reading in the data
# The relative filename is resolved against the current working directory.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load feature files that come from a trusted source.
with open('hman_features.pickle', 'rb') as f:
    hman_dict = pickle.load(f)

# Stack the dictionary's values into one array and show its shape
hman_features = np.array(list(hman_dict.values()))
print(hman_features.shape)

In [None]:
#export
def load_features(path):
    """Unpickle every file stored directly inside the directory `path`.

    Files are read in sorted filename order so that the returned list has
    the same order on every run; `os.walk` on its own reports names in
    whatever order the filesystem lists them. Sub-directories are not
    descended into.

    Args:
        path: Directory whose files are all pickle files.

    Returns:
        A list holding one unpickled object per file, ordered by filename.

    Raises:
        OSError: If `path` cannot be listed (for example
            `FileNotFoundError` when it does not exist).
    """
    def _reraise(error):
        # os.walk ignores listing errors unless told otherwise; without
        # this hook a missing directory would surface as a bare
        # StopIteration from next() below.
        raise error

    _, _, fnames = next(os.walk(path, onerror=_reraise))

    features = []
    for fname in sorted(fnames):
        # NOTE(security): pickle.load can execute arbitrary code while
        # unpickling; only point this at files from a trusted source.
        with open(os.path.join(path, fname), 'rb') as f:
            features.append(pickle.load(f))

    return features

In [None]:
# Directory whose files are each unpickled by load_features
# (presumably one feature dictionary per model -- verify against the data).
models_path = "/tf/src/data/features/output_space"
models_features = load_features(models_path)

In [None]:
# Testing that the keys and values are aligned inside dictionaries:
# for the first loaded dictionary, the value stored under the first key
# must equal the first value, compared element-wise.
test_1 = list(models_features[0].values())
test_2 = list(models_features[0].keys())

assert (models_features[0][test_2[0]] == test_1[0]).all()

## Failed Attempts to use Simulated Annealing for K-means

In [None]:
def calc_silhouette(feature_vectors, k):
    """Return the mean silhouette score of a K-means clustering.

    Args:
        feature_vectors: Samples to cluster.
        k: Sequence whose first element is the cluster count; it is
            truncated with ``int(k[0])``, so a float entry is accepted.

    Returns:
        The value of `silhouette_score` for the fitted clustering.
    """
    n_clusters = int(k[0])

    # Fit on the samples, then assign those same samples to clusters.
    model = KMeans(n_clusters=n_clusters)
    model.fit(feature_vectors)
    assignments = model.predict(feature_vectors)

    return silhouette_score(feature_vectors, assignments)

In [None]:
# Smoke test: score a 4-cluster K-means on the first 100 values of the
# first loaded dictionary. The cluster count is wrapped in a list because
# calc_silhouette reads it as k[0].
feature_vectors = np.array(list(models_features[0].values())[:100])
calc_silhouette(feature_vectors, [4])

In [None]:
def find_bst_k(feature_vectors, k_bounds=(2, 3), maxiter=100):
    """Search for the cluster count with the highest silhouette score.

    The search uses SciPy's dual annealing, which is a *minimizer*. A
    higher silhouette score means a better clustering, so the objective
    handed to the optimizer is the negated score.

    Note that `calc_silhouette` truncates the candidate with ``int()``,
    so the objective is piecewise constant in k.

    Args:
        feature_vectors: Samples to cluster.
        k_bounds: ``(lowest, highest)`` cluster count to search between.
        maxiter: Maximum number of global search iterations.

    Returns:
        The optimizer's result object. ``res.x`` is a one-element array
        holding the located k (as a float) and ``res.fun`` is the
        NEGATED silhouette score at that point.
    """
    # Imported locally because the notebook's import cell does not
    # bring scipy.optimize into scope.
    from scipy import optimize

    def neg_silhouette(k):
        # Minimizing the negated score maximizes the silhouette.
        return -calc_silhouette(feature_vectors, k)

    res = optimize.dual_annealing(neg_silhouette,
                                  bounds=[tuple(k_bounds)],
                                  maxiter=maxiter)

    return res

In [None]:
# Run the annealing search on the first 100 values of the first loaded
# dictionary and keep the optimizer result for inspection.
feature_vectors = np.array(list(models_features[0].values())[:100])
res = find_bst_k(feature_vectors)

In [None]:
# Report where the optimizer ended up (res.x) and the objective value it
# found there (res.fun), the latter with six decimal places.
print("global minimum: xmin = {0}, f(xmin) = {1:.6f}".format(res.x, res.fun))

In [None]:
#export
def k_means(feature_vectors, k_range=(2, 3)):
    """Fit K-means for each candidate k and keep the best-scoring model.

    "Best" means the highest mean silhouette score; on a tie the earlier
    candidate wins.

    Args:
        feature_vectors: Samples to cluster.
        k_range: Iterable of candidate cluster counts to try.

    Returns:
        A tuple ``(labels, centroids, kmeans, centroid_mthds)`` for the
        best k:
            labels: Cluster index assigned to each sample.
            centroids: Cluster centers of the best model.
            kmeans: The fitted `KMeans` estimator.
            centroid_mthds: Result of `pairwise_distances_argmin_min`
                for the centroids against `feature_vectors`, i.e. for
                each centroid the index of its closest sample and the
                distance to it.

    Raises:
        ValueError: If `k_range` contains no candidates.
    """
    # finding best k
    bst_k          = None
    bst_silhouette = None
    bst_labels     = None
    bst_centroids  = None
    bst_kmeans     = None
    for k in k_range:
        kmeans = KMeans(n_clusters = k)
        kmeans.fit(feature_vectors)

        labels    = kmeans.predict(feature_vectors)
        centroids = kmeans.cluster_centers_

        silhouette_avg = silhouette_score(feature_vectors, labels)
        # The first candidate is always accepted, so a score equal to the
        # theoretical minimum of -1 can no longer leave the "best" slots
        # unset.
        if bst_kmeans is None or silhouette_avg > bst_silhouette:
            bst_k          = k
            bst_silhouette = silhouette_avg
            bst_labels     = labels
            bst_centroids  = centroids
            bst_kmeans     = kmeans

    if bst_kmeans is None:
        raise ValueError("k_range must contain at least one candidate k")

    print("Best K was", bst_k, "with a silhouette score of", bst_silhouette)

    # For every winning centroid, find the sample that lies closest to it.
    centroid_mthds = pairwise_distances_argmin_min(bst_centroids, feature_vectors)
    return bst_labels, bst_centroids, bst_kmeans, centroid_mthds

In [None]:
# Cluster each loaded feature dictionary and show the result shapes.
for model in models_features:
    feature_vectors = np.array(list(model.values()))
    # k_means returns (labels, centroids, kmeans, centroid_mthds); only
    # the first two are needed here, so slice instead of unpacking all
    # four (unpacking into two names raised ValueError).
    labels, centroids = k_means(feature_vectors)[:2]
    print(labels.shape, centroids.shape)

In [None]:
#export
# Uses PCA first and then t-SNE
def reduce_dims(feature_vectors, dims = 2, pca_components = 50,
                perplexity = 40, n_iter = 300):
    """Project feature vectors to `dims` dimensions with PCA, then t-SNE.

    PCA first compresses the input to at most `pca_components`
    dimensions; t-SNE then embeds that intermediate representation.

    Args:
        feature_vectors: 2-D array-like of shape (n_samples, n_features).
        dims: Dimensionality of the final t-SNE embedding.
        pca_components: Upper limit for the intermediate PCA
            dimensionality. It is lowered automatically when the data
            has fewer samples or features than this.
        perplexity: t-SNE perplexity.
        n_iter: Number of t-SNE optimization iterations.

    Returns:
        The t-SNE embedding produced by `TSNE.fit_transform`.
    """
    # default hyperparameters from https://towardsdatascience.com/visualising-high-dimensional-datasets-using-pca-and-t-sne-in-python-8ef87e7915b
    n_samples, n_features = np.shape(feature_vectors)

    # PCA cannot extract more components than min(n_samples, n_features);
    # asking for a fixed 50 raised on smaller inputs.
    n_components = min(pca_components, n_samples, n_features)
    pca = PCA(n_components=n_components)
    pca_features = pca.fit_transform(feature_vectors)

    # NOTE(review): newer scikit-learn releases appear to rename TSNE's
    # `n_iter` argument to `max_iter` -- confirm against the installed
    # version before upgrading.
    tsne = TSNE(n_components=dims, verbose=1, perplexity=perplexity, n_iter=n_iter)
    tsne_features = tsne.fit_transform(pca_features)

    return tsne_features

In [None]:
#export
def cluster(models_features, k_range = (2,), dims = 2, exp_fraction = 0.1):
    """Reduce and cluster the feature vectors of every model.

    For each feature dictionary the values are reduced with
    `reduce_dims`, and K-means is then fitted on only the leading
    `exp_fraction` of the reduced vectors (the "experimental" subset).

    Args:
        models_features: Iterable of dictionaries whose values are the
            feature vectors of one model.
        k_range: Candidate cluster counts forwarded to `k_means`.
        dims: Target dimensionality forwarded to `reduce_dims`.
        exp_fraction: Fraction (0-1) of the reduced vectors, taken from
            the start, that K-means is fitted on.

    Returns:
        A list with one tuple per model:
        ``(feature_vectors, centroid_mthds, labels, centroids, kmeans)``.
        `feature_vectors` holds ALL reduced vectors, whereas `labels`
        and `centroid_mthds` refer to the experimental subset only.
        Because that subset is a prefix, its indices are also valid
        indices into `feature_vectors`.
    """
    clusters = []
    for model in models_features:
        feature_vectors = reduce_dims(np.array(list(model.values())), dims = dims)

        # The slice bound must be an int and must be derived from the
        # full set; the previous code read `experimental_vectors` before
        # assigning it and used a float as the slice index.
        n_experimental = int(len(feature_vectors) * exp_fraction)
        experimental_vectors = feature_vectors[:n_experimental]

        labels, centroids, kmeans, centroid_mthds = k_means(experimental_vectors, k_range = k_range)
        clusters.append((feature_vectors, centroid_mthds, labels, centroids, kmeans))

    return clusters

In [None]:
# Reduce and cluster every loaded feature dictionary using cluster()'s
# default k_range and dims; the result holds one tuple per dictionary.
models_clusters = cluster(models_features)