This notebook reproduces the results shown in Figure 4.5, where we compare the performance of clustering algorithms, measured by the Adjusted Rand Index, on datasets that differ in size and error rate.

In [25]:
import timeit

import numpy as np
from scipy.cluster.hierarchy import fcluster, linkage
from scipy.spatial.distance import squareform
from sklearn.metrics import adjusted_rand_score, silhouette_score

import helpers
from bernoulli_mixture_model import BMM
from k_medoid_clustering import KMedoid

`Helper` provides commonly used utility methods that we rely on throughout this notebook.

In [26]:
# Shared Helper instance; later cells call its methods to read the data file,
# build the distance matrix and look up true cluster labels.
helper = helpers.Helper()

Set the path to the raw data file obtained through simulation, or keep the existing one.

In [27]:
# Path to the simulated input file (a file, despite the "_dir" suffix), relative
# to the notebook's working directory.
# NOTE(review): the numbers in the file name presumably encode the simulation
# settings (sizes / error rate) - confirm against the data generator.
raw_data_dir = "./populated_data/populated_true_genotypes_10_10_0.01_100.txt"

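Clustering methods to be tested: `"k_means"` (run through the `KMedoid` class), `"slc"` (hierarchical clustering) and `"bmm"` (Bernoulli mixture model).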

In [28]:
# Identifiers dispatched on by the evaluation loop below:
# "k_means" -> KMedoid, "slc" -> _do_slc, "bmm" -> BMM.
clustering_methods = ["k_means","slc","bmm"]

`no_clusters` is the number of clusters to split the data into, `vector_size` is the length of a mutation vector, and `number_of_iterations_for_bmm` applies only to the BMM: it sets how many times the EM step is iterated.

In [29]:
# Number of clusters requested from every clustering method.
no_clusters = 10
# Length of one mutation vector; only passed to KMedoid in this notebook.
vector_size = 10
# Iteration count handed to BMM (number of EM iterations, per the text above).
number_of_iterations_for_bmm = 20

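Single-linkage-clustering ("slc") method. Note that the code below calls `linkage` with `method='complete'`, i.e. it performs complete-linkage rather than single-linkage hierarchical clustering.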

In [30]:
def _do_slc(no_clusters, distance_matrix):
    labels = fcluster(linkage(distance_matrix, method='complete'), t=no_clusters, criterion='maxclust')

    new_data = {}

    for cluster_label in labels - 1:
        if new_data.get(cluster_label):
            new_data[cluster_label] += [map(int, unique_rows.keys()[cluster_label].split(","))]
        else:
            new_data[cluster_label] = [map(int, unique_rows.keys()[cluster_label].split(","))]

    new_data_formatetd = {}
    for key, value in new_data.items():
        new_value = [','.join(map(str, el)) for el in value]
        new_data_formatetd[key] = new_value

    return new_data_formatetd, labels

Returns the true cluster labels of the data so that they can later be compared with the predicted labels.

In [31]:
def get_true_labels(unique_rows, full_data_dict):
    """Build the list of ground-truth labels, one entry per observation.

    Each key of ``unique_rows`` is looked up with
    ``helper.get_label_of_cluster`` and the resulting label is repeated as
    many times as the integer stored under that key.

    Parameters
    ----------
    unique_rows : dict
        Maps a vector key to the number of times it should be repeated.
    full_data_dict : dict
        Passed through to ``helper.get_label_of_cluster`` as ``full_dict``.

    Returns
    -------
    list
        The labels in the iteration order of ``unique_rows``.
    """
    expanded_labels = []
    for vector in unique_rows.keys():
        cluster_label = helper.get_label_of_cluster(vector=vector, full_dict=full_data_dict)
        # Emit the label once per occurrence of this vector.
        expanded_labels.extend(cluster_label for _ in range(unique_rows[vector]))
    return expanded_labels

For each clustering technique in turn: performs the clustering, extracts the predicted labels and compares them with the true labels using the Adjusted Rand Index.

In [32]:
# Load the simulated data once so that every clustering method is scored
# against the same rows and the same ground-truth labels.
unique_rows, full_data_dict, full_info = helper.read_simulated_data_file(raw_data_dir)
distance_matrix = np.matrix(helper.find_distance_matrix(unique_rows))
true_labels = get_true_labels(unique_rows, full_data_dict)

for clustering_method in clustering_methods:

    if clustering_method == "slc":
        clustered_dict, labels = _do_slc(no_clusters, distance_matrix)
        # Shift the returned labels down by one, as _do_slc does internally.
        predicted_labels = labels - 1

    elif clustering_method == "k_means":
        predefined_kwargs = {"number_of_clusters": no_clusters, "unique_rows": unique_rows,
                             "full_data_dict": full_data_dict, "full_info": full_info,
                             "vector_size": vector_size}
        k_means_instance = KMedoid(**predefined_kwargs)
        clustered_dict = k_means_instance.do_k_means_using_sklearn()
        predicted_labels = k_means_instance.get_sklearn_predicted_labels()

    elif clustering_method == "bmm":
        bmm = BMM(no_clusters, unique_rows, full_data_dict, full_info, number_of_iterations_for_bmm)
        clustered_dict, predicted_labels = bmm.do_clustering()

    else:
        # Without this branch a misspelt method name would silently be scored
        # with the labels left over from the previous iteration.
        raise ValueError("Unknown clustering method: %r" % (clustering_method,))

    # Single-argument print calls produce the same text on Python 2 and 3.
    print("======== %s ==========" % (clustering_method,))
    print("Adjusted Rand Index:  %s" % (adjusted_rand_score(true_labels, predicted_labels),))

Adjusted Rand Index:  0.835657873067
Adjusted Rand Index:  0.813922296593


  from ipykernel import kernelapp as app


Adjusted Rand Index:  0.837396040304
