In [4]:
import argparse
import csv
import operator
import pickle
import random
import sys
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from multiprocessing import Pool

In [5]:
# Notebook parameters, gathered in a single mapping.
# (A seed of 1337 was tried earlier; 1000 is the value currently in use.)
args = {
    'train': "train.npz",
    'seed': 1000,
    'k': 25,
}

# Seed Python's built-in RNG so that runs are replicable.
RANDOM_SEED = args['seed']
random.seed(RANDOM_SEED)

In [6]:
# Read the three arrays needed from the training archive; the context
# manager closes the .npz file once they have been pulled out.
with np.load(args['train']) as npz:
    X_index_train = npz['X_index']
    Y_all_train   = npz['Y_all']
    Z_all_train   = npz['Z_all']

# Pick the rows of Z_all addressed by the first column of X_index.
X_all_train = Z_all_train[X_index_train[:, 0], :]

# Element-wise difference between Y_all and the selected rows; this is the
# data that gets clustered below.
train_offsets = Y_all_train - X_all_train

# When a fixed k is configured, fit a single model and persist it.
if args['k']:
    # NOTE(review): the n_jobs argument of KMeans was deprecated in
    # scikit-learn 0.23 and removed in 1.0 -- confirm the installed version
    # still accepts it.
    km = KMeans(n_clusters=args['k'], n_jobs=-1, random_state=RANDOM_SEED)
    # The predicted labels were discarded, so a plain fit() is sufficient;
    # they remain available as km.labels_.
    km.fit(train_offsets)
    # Write through a context manager so the file is flushed and closed
    # deterministically instead of leaking the handle.
    with open('kmeans.pickle', 'wb') as pickle_file:
        pickle.dump(km, pickle_file)
    print('Just written the k-means result for k=%d.' % (km.n_clusters))
    #sys.exit(0)


Just written the k-means result for k=25.


In [None]:
# It seems that this code is not executed on account of the sys.exit(0)
# at the end of the block above.
# Fitted k-means models, keyed by their number of clusters.
kmeans = {}

# Fit one model for every k from 2 through 20 inclusive.
for k in range(2, 21):
    model = KMeans(n_clusters=k, n_jobs=-1, random_state=RANDOM_SEED)
    kmeans[k] = model
    model.fit_predict(train_offsets)
    print('k-means for k=%d computed.' % k)

def evaluate(k):
    """Compute the silhouette score of the fitted model stored under ``k``.

    Looks up ``kmeans[k]``, scores its ``labels_`` against ``train_offsets``
    using the Euclidean metric, prints the result, and returns it.

    Returns:
        A ``(k, score)`` tuple.
    """
    model = kmeans[k]
    silhouette = silhouette_score(
        train_offsets,
        model.labels_,
        metric='euclidean',
        random_state=RANDOM_SEED,
    )
    print('Silhouette score for k=%d is %f.' % (k, silhouette))
    return k, silhouette

# Silhouette score for each evaluated k.
scores = {}

# Score every fitted model in a pool of worker processes and log the results
# to a tab-separated file as they arrive.
with open('kmeans-scores.txt', 'w', newline='') as f:
    writer = csv.writer(f, dialect='excel-tab', lineterminator='\n')
    writer.writerow(('k', 'silhouette'))
    with Pool(12) as pool:
        # Iterating the dict yields its keys (the k values). Results come
        # back in completion order, so the rows are not sorted by k.
        for k, score in pool.imap_unordered(evaluate, kmeans):
            scores[k] = score
            writer.writerow((k, score))

# Keep the model whose silhouette score is highest.
k, score = max(scores.items(), key=operator.itemgetter(1))
# Write through a context manager so the pickle file is flushed and closed
# deterministically instead of leaking the handle.
with open('kmeans.pickle', 'wb') as pickle_file:
    pickle.dump(kmeans[k], pickle_file)
