In [256]:
import helpers
import numpy as np
from scipy.cluster.hierarchy import fcluster, linkage
from sklearn.metrics import adjusted_rand_score,silhouette_score
from k_medoid_clustering import KMedoid
from bernoulli_mixture_model import BMM
import timeit



In [257]:
# Shared Helper instance; the driver cell further down uses it to read the
# simulated data file and to build the pairwise distance matrix.
helper = helpers.Helper()

In [258]:
# Path handed to helper.read_simulated_data_file() in the driver cell.
# NOTE(review): despite the "_dir" suffix this points at a single .txt file,
# relative to the notebook's working directory.
raw_data_dir = "./populated_data/populated_true_genotypes_5_5_0.01_20.txt"


In [259]:
# Methods compared by the driver loop, in execution order:
#   "bmm"     -> BMM(...).do_clustering()
#   "slc"     -> _do_slc() (scipy linkage + fcluster)
#   "k_means" -> KMedoid(...).do_k_means_using_sklearn()
clustering_methods = ["bmm","slc","k_means"]

In [260]:
# Number of clusters requested from every method (fcluster's `t`, KMedoid's
# "number_of_clusters", BMM's first positional argument).
no_clusters = 5
# Forwarded to KMedoid as "vector_size".
# presumably the length of each genotype vector -- TODO confirm against KMedoid.
vector_size = 5
# Passed as the last positional argument to BMM.
# presumably the number of fitting iterations -- TODO confirm against BMM.
number_of_iterations_for_bmm = 25

In [261]:
def _do_slc(no_clusters, distance_matrix):
    start = timeit.default_timer()
    labels = fcluster(linkage(distance_matrix, method='complete'), t=no_clusters, criterion='maxclust')
    stop = timeit.default_timer()

    new_data = {}

    for cluster_label in labels - 1:
        if new_data.get(cluster_label):
            new_data[cluster_label] += [map(int, unique_rows.keys()[cluster_label].split(","))]
        else:
            new_data[cluster_label] = [map(int, unique_rows.keys()[cluster_label].split(","))]

    new_data_formatetd = {}
    for key, value in new_data.items():
        new_value = [','.join(map(str, el)) for el in value]
        new_data_formatetd[key] = new_value

    return new_data_formatetd, labels, stop, start

In [262]:
# Load the simulated data and build the pairwise distance matrix once; every
# method below is scored against this same matrix.
unique_rows, full_data_dict, full_info = helper.read_simulated_data_file(raw_data_dir)
distance_matrix = np.matrix(helper.find_distance_matrix(unique_rows))

# Run each configured method, timing only the clustering call itself (object
# construction is excluded), then report its silhouette score and wall time.
for clustering_method in clustering_methods:
    if clustering_method == "slc":
        clustered_dict, labels, stop,start = _do_slc(no_clusters, distance_matrix)

        # fcluster labels are 1-based; shift to 0-based like the other methods.
        predicted_labels = labels-1
    elif clustering_method == "k_means":

        predefined_kwargs = {"number_of_clusters": no_clusters, "unique_rows": unique_rows,
                             "full_data_dict": full_data_dict, "full_info": full_info,
                             "vector_size": vector_size}
        k_means_instance = KMedoid(**predefined_kwargs)
        start = timeit.default_timer()
        clustered_dict = k_means_instance.do_k_means_using_sklearn()
        stop = timeit.default_timer()

        predicted_labels = k_means_instance.get_sklearn_predicted_labels()
    elif clustering_method == "bmm":
        bmm = BMM(no_clusters, unique_rows, full_data_dict, full_info, number_of_iterations_for_bmm)
        start = timeit.default_timer()
        clustered_dict,predicted_labels = bmm.do_clustering()
        stop = timeit.default_timer()

        predicted_labels = np.asarray(predicted_labels)
    else:
        # Without this guard an unknown name would either hit a NameError or
        # silently re-report the previous iteration's labels and timings.
        raise ValueError("Unknown clustering method: %r" % (clustering_method,))

    # Single-argument print(...) emits the same space-separated text as the
    # former print statements and parses on both Python 2 and Python 3.
    print(" ".join(["========", clustering_method, "=========="]))
    score = silhouette_score(distance_matrix, predicted_labels, metric="precomputed")
    print(" ".join(["Silhouette score: ", str(score)]))
    print(" ".join(["Time taken: ", str(stop - start), "sec"]))


Silhouette score:  0.790236928105
Time taken:  0.267663955688 sec
Silhouette score:  0.813888888889
Time taken:  0.000236988067627 sec
Silhouette score:  0.925
Time taken:  0.00891304016113 sec
