In [1]:
import timeit

import numpy as np
from scipy.cluster.hierarchy import fcluster, linkage
from scipy.spatial.distance import squareform
from sklearn.metrics import adjusted_rand_score,silhouette_score

import helpers
from bernoulli_mixture_model import BMM
from k_medoid_clustering import KMedoid

In [2]:
# Shared helper instance: later cells use it to read the simulated data file,
# build the distance matrix and look up the true cluster label of each row.
helper = helpers.Helper()

In [3]:
# Path handed to helper.read_simulated_data_file() in the evaluation cell.
# NOTE(review): despite the "_dir" suffix this points at a single .txt file.
# The numbers in the file name presumably encode simulation settings — confirm.
raw_data_dir = "./populated_data/populated_true_genotypes_20_20_0.01_1000.txt"

In [4]:
# Methods compared in the evaluation loop, run in this order.
#   "k_means" -> KMedoid (imported from k_medoid_clustering)
#   "slc"     -> _do_slc, which calls scipy linkage with method='complete'
#   "bmm"     -> BMM (imported from bernoulli_mixture_model)
# NOTE(review): the names "k_means" and "slc" do not obviously match the
# implementations they dispatch to — confirm the labels are intentional.
clustering_methods = ["k_means","slc","bmm"]

In [5]:
# Number of clusters requested from every clustering method.
no_clusters = 40
# Forwarded to KMedoid as its "vector_size" argument.
# presumably the length of each genotype vector — TODO confirm
vector_size = 20
# Forwarded to BMM as its last positional argument.
number_of_iterations_for_bmm = 10

In [6]:
def _do_slc(no_clusters, distance_matrix):
    labels = fcluster(linkage(distance_matrix, method='complete'), t=no_clusters, criterion='maxclust')

    new_data = {}

    for cluster_label in labels - 1:
        if new_data.get(cluster_label):
            new_data[cluster_label] += [map(int, unique_rows.keys()[cluster_label].split(","))]
        else:
            new_data[cluster_label] = [map(int, unique_rows.keys()[cluster_label].split(","))]

    new_data_formatetd = {}
    for key, value in new_data.items():
        new_value = [','.join(map(str, el)) for el in value]
        new_data_formatetd[key] = new_value

    return new_data_formatetd, labels

In [7]:
def get_true_labels(unique_rows, full_data_dict):
    """Build the list of ground-truth labels.

    For every key of ``unique_rows`` the label is looked up with
    ``helper.get_label_of_cluster`` and repeated ``unique_rows[key]`` times,
    in the iteration order of ``unique_rows``.

    Parameters
    ----------
    unique_rows : mapping
        Maps each row key to the number of times its label is repeated.
    full_data_dict : object
        Passed through to ``helper.get_label_of_cluster`` as ``full_dict``.

    Returns
    -------
    list
        The repeated labels, concatenated.
    """
    expanded = []
    for vector in unique_rows.keys():
        cluster_label = helper.get_label_of_cluster(vector=vector, full_dict=full_data_dict)
        repeats = unique_rows[vector]
        expanded.extend(cluster_label for _ in range(repeats))
    return expanded

In [8]:
# Load the simulated data once; every clustering method below is scored on it.
unique_rows, full_data_dict, full_info = helper.read_simulated_data_file(raw_data_dir)
distance_matrix = np.matrix(helper.find_distance_matrix(unique_rows))
true_labels = get_true_labels(unique_rows, full_data_dict)

for clustering_method in clustering_methods:

    if clustering_method == "slc":
        clustered_dict, labels = _do_slc(no_clusters, distance_matrix)
        # fcluster labels start at 1; shift them so they start at 0.
        # NOTE(review): true_labels holds one entry per repetition of each
        # unique row (see get_true_labels), while these labels are one per row
        # of distance_matrix — confirm both sequences have the same length,
        # otherwise adjusted_rand_score will reject them.
        predicted_labels = labels-1

    elif clustering_method == "k_means":
        predefined_kwargs = {"number_of_clusters": no_clusters, "unique_rows": unique_rows,
                             "full_data_dict": full_data_dict, "full_info": full_info,
                             "vector_size": vector_size}
        k_means_instance = KMedoid(**predefined_kwargs)
        clustered_dict = k_means_instance.do_k_means_using_sklearn()
        predicted_labels = k_means_instance.get_sklearn_predicted_labels()

    elif clustering_method == "bmm":
        bmm = BMM(no_clusters, unique_rows, full_data_dict, full_info, number_of_iterations_for_bmm)
        clustered_dict,predicted_labels = bmm.do_clustering()

    else:
        # Without this branch an unknown name would silently be scored with
        # the previous method's predicted_labels.
        raise ValueError("Unknown clustering method: %r" % (clustering_method,))

    # Single-argument print(...) renders identically on Python 2 and Python 3;
    # the format strings reproduce the spacing of the original print statements.
    print("======== %s ==========" % (clustering_method,))
    print("Adjusted Rand Index:  %s" % (adjusted_rand_score(true_labels, predicted_labels),))

[12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 18, 11, 0, 13, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 3, 3, 1, 1, 1, 8, 15, 15, 16, 18, 0, 0, 0, 2, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 9, 4, 5, 5, 8, 4, 15, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 11, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0, 19, 14, 13, 13, 13, 13, 15, 15, 15, 13, 13, 13, 13, 13, 13, 13, 

  from ipykernel import kernelapp as app


[12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 18, 11, 0, 13, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 3, 3, 1, 1, 1, 8, 15, 15, 16, 18, 0, 0, 0, 2, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 9, 4, 5, 5, 8, 4, 15, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 11, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0, 19, 14, 13, 13, 13, 13, 15, 15, 15, 13, 13, 13, 13, 13, 13, 13, 

KeyboardInterrupt: 