In [1]:
import numpy as np
import pandas as pd
import os
import glob
from collections import OrderedDict
import pickle

from sklearn import metrics
from sklearn.preprocessing import LabelEncoder, scale
from sklearn.decomposition import PCA

import framework.clustering
from framework.common.util import save_data_table

In [28]:
# Sub-directory of "results/" whose experiments are benchmarked.
input_dir = "pollen"
# Number of clusters passed to the clustering routine.
n_clusters = 11
# Suffix selecting the routine "framework.clustering.cluster_<method>"; it is
# also used as a prefix in the cached-clustering and result-table file names.
clustering_method = "hc"

# Column of the latent-representation table holding the reference labels
# that predicted clusters are scored against.
label_col = "cell_subtype"
# Exclusive end position of the latent-space columns; a negative value counts
# from the last column, so -2 leaves out the final two columns.
latent_space_end_col_idx = -2

# File name searched for recursively below the input directory.
latent_space_file = "latent_representations.txt"
# Sub-directory of the input directory that receives the benchmark outputs.
output_dir_name = "benchmark_results"

In [29]:
# Root folder of the experiments being benchmarked, and the sub-folder below
# it that collects every benchmark output.
full_dir = "/".join(("results", input_dir))
output_dir = "{}/{}".format(full_dir, output_dir_name)

def load_latent_space(filepath, latent_space_end_col_idx, label_col):
    """Read a latent-representation table and split it into features and labels.

    The file is parsed as tab-separated text whose first row is the header
    and whose first column is the row index.

    Parameters
    ----------
    filepath : str
        Path of the tab-separated table.
    latent_space_end_col_idx : int
        Exclusive end position of the latent-space columns; the columns from
        position 0 up to this position are taken as features.
    label_col : str
        Name of the column that holds the labels.

    Returns
    -------
    tuple
        ``(df, latent_space, labels, int_labels)``: the full table, the
        feature columns as a float64 array, the raw label values, and the
        labels encoded as integers by ``LabelEncoder``.
    """
    table = pd.read_csv(filepath, sep="\t", header=0, index_col=0)

    feature_columns = table.iloc[:, :latent_space_end_col_idx]
    latent_space = feature_columns.values.astype(dtype=np.float64)

    labels = table[label_col].values
    # Fit the encoder and encode the same labels in a single step.
    int_labels = LabelEncoder().fit_transform(labels)

    return table, latent_space, labels, int_labels

def evaluate_clustering(true_clusters, pred_clusters, features):
    """Score a predicted clustering against reference labels.

    Parameters
    ----------
    true_clusters : array-like
        Reference cluster label of every sample.
    pred_clusters : array-like
        Predicted cluster label of every sample.
    features : array-like
        Feature matrix on which the silhouette score is computed, using
        Euclidean distance.

    Returns
    -------
    OrderedDict
        Metric name -> score, always in this order: ``ari``, ``ami``,
        ``completeness_score``, ``homogeneity_score``, ``v_measure_score``,
        ``silhouette_score``.
    """
    # Build from a sequence of pairs instead of a dict literal: an OrderedDict
    # created from a plain dict inherits that dict's order, which is only
    # guaranteed to be insertion order from Python 3.7 on. Callers that read
    # ``.values()`` positionally rely on the order listed above.
    return OrderedDict([
        ("ari", metrics.adjusted_rand_score(true_clusters, pred_clusters)),
        ("ami", metrics.adjusted_mutual_info_score(true_clusters, pred_clusters)),
        ("completeness_score", metrics.completeness_score(true_clusters, pred_clusters)),
        ("homogeneity_score", metrics.homogeneity_score(true_clusters, pred_clusters)),
        ("v_measure_score", metrics.v_measure_score(true_clusters, pred_clusters)),
        ("silhouette_score", metrics.silhouette_score(features, pred_clusters,
                                                      metric='euclidean')),
    ])


print("Benchmarking Experiment(s) in Directory: {}".format(
        os.path.basename(os.path.normpath(full_dir))))

# Rows of the result table; the first row is the header. The metric columns
# must stay in the same order as the values returned by evaluate_clustering.
clustering_results = []
clustering_results.append(["experiment_name", "model_name", "ari", "ami",
                           "completeness_score", "homogeneity_score",
                           "v_measure_score", "silhouette_score"])

# Folder holding the cached (pickled) clustering objects.
clustering_cache_dir = output_dir + "/clustering"

# Visit every latent-representation file below full_dir, at any depth.
for filepath in glob.iglob(
        full_dir + "/**/" + latent_space_file,
        recursive=True):
    # The file's folder names the model; that folder's parent names the
    # experiment.
    model_dir = os.path.dirname(filepath)
    model_name = os.path.basename(model_dir)
    experiment_name = os.path.basename(
        os.path.dirname(model_dir))

    df, latent_space, labels, int_labels = load_latent_space(
        filepath, latent_space_end_col_idx, label_col)

    # exist_ok=True replaces the separate existence check and avoids the
    # check-then-create race.
    os.makedirs(clustering_cache_dir, exist_ok=True)

    # NOTE(review): the cache file name is built from the clustering method
    # and the experiment name only. model_name and n_clusters are not part of
    # it, so an existing cache is reused even when those differ; remove stale
    # .pkl files after changing them.
    clustering_obj_file = clustering_cache_dir + "/" + \
        clustering_method + "_" + experiment_name + ".pkl"
    if not os.path.exists(clustering_obj_file):
        # Resolve the routine by attribute lookup instead of eval(), so the
        # configured method name is never executed as code.
        clustering_method_ref = getattr(
            framework.clustering, "cluster_" + clustering_method)
        clustering_obj = clustering_method_ref(latent_space, n_clusters)
        with open(clustering_obj_file, "wb") as f:
            pickle.dump(clustering_obj, f)
    else:
        # pickle.load can execute arbitrary code stored in the file; only
        # cache files written by this notebook should live in this folder.
        with open(clustering_obj_file, "rb") as f:
            clustering_obj = pickle.load(f)

    scores = evaluate_clustering(
        int_labels, clustering_obj["clusters"], latent_space)
    clustering_results.append(
        [experiment_name, model_name] + list(scores.values()))

save_data_table(clustering_results,
                output_dir + "/{}_clustering_results.txt".format(clustering_method))

Benchmarking Experiment(s) in Directory: pollen
