In [1]:
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics.cluster import adjusted_mutual_info_score
from sklearn.metrics.cluster import v_measure_score

In [2]:
import pandas as pd
import numpy as np
import os

In [3]:
def ProcessCase(data_path, case_no, output_path = None, rep_count = 20, chain_count = 4, region_count = 3):
    """Score single-region predictions of one simulation case against the truth.

    For every replicate ``rep<r>`` (r = 0..rep_count-1) under ``data_path`` and
    every region ``single_region_genotype<g>`` (g = 1..region_count) inside
    ``case<case_no>``, the chain with the highest log-likelihood is selected
    and its predicted cluster labels and ancestral matrix are compared with
    the ground-truth files stored in the case directory.

    Parameters
    ----------
    data_path : str
        Root directory holding the ``rep<r>`` sub-directories.
    case_no : int
        Case number; selects ``case<case_no>`` inside each replicate.
    output_path : str or None, optional
        When not None, the directory is created if missing and the result is
        written to ``single_region_case<case_no>.csv`` inside it.
    rep_count : int, optional
        Number of replicates to read.
    chain_count : int, optional
        Number of chains (``chain0`` .. ``chain<chain_count-1>``) per region.
    region_count : int, optional
        Number of regions to read, numbered starting at 1.

    Returns
    -------
    pandas.DataFrame
        One row per (replicate, region) pair in iteration order, with columns
        ``AdjRandIndex``, ``AdjMutualInformation``, ``VMeasure`` and
        ``AncestralMetric`` (mean absolute element-wise difference between the
        true and the predicted ancestral matrices).
    """
    columns = ["AdjRandIndex", "AdjMutualInformation", "VMeasure", "AncestralMetric"]
    rows = []
    for rep in range(rep_count):
        case_path = os.path.join(data_path, "rep" + str(rep), "case" + str(case_no))
        truth0 = pd.read_csv(os.path.join(case_path, "cluster_labels.txt"), header=None, names=["ID", "Cluster"])
        ancestral_matrix_truth = np.asarray(pd.read_csv(os.path.join(case_path, "ancestral_matrix.csv"), header=None))

        for region in range(1, region_count + 1):
            region_path = os.path.join(case_path, "single_region_genotype" + str(region))

            # Check which chain achieved the highest likelihood.
            # Each chain is reduced to ONE scalar (its maximum log-likelihood)
            # before argmax. Passing the raw Series to np.argmax would yield an
            # index into the flattened (chain x row) array, which is only a
            # valid chain number when every file holds exactly one value.
            chain_scores = []
            for chain in range(chain_count):
                log_lik_file = os.path.join(region_path, "chain" + str(chain), "joint", "tree0", "log_lik.txt")
                log_likelihood = pd.read_table(log_lik_file, header=None, names=["LogLik"])
                chain_scores.append(log_likelihood["LogLik"].max())
            best_chain = int(np.argmax(chain_scores))

            pred_path = os.path.join(region_path, "chain" + str(best_chain), "joint", "tree0")
            predicted = pd.read_table(os.path.join(pred_path, "cluster_labels.tsv"), header=None, names=["ID", "Cluster"])
            ancestral_matrix = np.asarray(pd.read_csv(os.path.join(pred_path, "ancestral_matrix.csv"), header=None))

            rows.append((
                adjusted_rand_score(truth0["Cluster"], predicted["Cluster"]),
                adjusted_mutual_info_score(truth0["Cluster"], predicted["Cluster"]),
                v_measure_score(truth0["Cluster"], predicted["Cluster"]),
                np.mean(np.abs(ancestral_matrix_truth - ancestral_matrix)),
            ))

    df = pd.DataFrame(rows, columns=columns)
    if output_path is not None:
        # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
        os.makedirs(output_path, exist_ok=True)
        output_file = os.path.join(output_path, "single_region_case" + str(case_no) + ".csv")
        df.to_csv(output_file, index=False)
    return df


In [4]:
# Evaluate our method on the quadternary_multiregion simulation: cases 0-3,
# first region only (region_count=1), one CSV per case written to output_path.
data_path = "/Users/seonghwanjun/data/single_region/simul/quadternary_multiregion/"
output_path = "/Users/seonghwanjun/ScRNACloneEvaluation/data/simul/quadternary_multiregion/snvs/"
df0, df1, df2, df3 = [
    ProcessCase(data_path, case_no, output_path, region_count=1)
    for case_no in range(4)
]


In [5]:
# Evaluate our method on the quadternary_cn_multiregion simulation: cases 0-3,
# first region only (region_count=1), one CSV per case written to output_path.
# NOTE: this rebinds data_path, output_path and df0..df3 from the previous cell.
data_path = "/Users/seonghwanjun/data/single_region/simul/quadternary_cn_multiregion/"
output_path = "/Users/seonghwanjun/ScRNACloneEvaluation/data/simul/quadternary_cn_multiregion/snvs/"
df0, df1, df2, df3 = [
    ProcessCase(data_path, case_no, output_path, region_count=1)
    for case_no in range(4)
]
