In [1]:
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics.cluster import adjusted_mutual_info_score
from sklearn.metrics.cluster import v_measure_score

In [2]:
import pandas as pd
import numpy as np
import os

In [3]:
def ProcessCase(data_path, case_no, output_path = None, rep_count = 20, chain_count = 4):
    """Score the best chain of every replicate against the reference files.

    For each replicate ``rep0 .. rep<rep_count-1>`` under ``data_path`` the
    chain whose ``log_lik.txt`` contains the highest log-likelihood is
    selected. Its ``cluster_labels.tsv`` and ``ancestral_matrix.csv`` are then
    compared with ``cluster_labels.txt`` and ``ancestral_matrix.csv`` stored in
    the case directory.

    Parameters
    ----------
    data_path : str
        Directory containing the ``rep<i>`` sub-directories.
    case_no : int
        Case index; inputs are read from ``rep<i>/case<case_no>``.
    output_path : str, optional
        When given, the metrics are also written to
        ``<output_path>/case<case_no>.csv`` (the directory is created if it
        does not exist yet).
    rep_count : int
        Number of replicates to process.
    chain_count : int
        Number of chains available for every replicate.

    Returns
    -------
    pandas.DataFrame
        One row per replicate with the columns ``AdjRandIndex``,
        ``AdjMutualInformation``, ``VMeasure`` and ``AncestralMetric`` (mean
        absolute element-wise difference between the two ancestral matrices).

    Raises
    ------
    Missing input files are not handled here; the error raised by pandas
    propagates to the caller. ``chain_count == 0`` makes ``np.argmax`` raise
    ``ValueError``.
    """
    columns = ["AdjRandIndex", "AdjMutualInformation", "VMeasure", "AncestralMetric"]
    records = []
    for rep in range(rep_count):
        case_path = os.path.join(data_path, "rep" + str(rep), "case" + str(case_no))
        truth0 = pd.read_csv(os.path.join(case_path, "cluster_labels.txt"), header=None, names=["ID", "Cluster"])
        ancestral_matrix_truth = np.asarray(pd.read_csv(os.path.join(case_path, "ancestral_matrix.csv"), header=None))

        # Reduce every chain to ONE number (its highest recorded
        # log-likelihood) before comparing chains. Taking np.argmax over a
        # list of whole Series returns an index into the flattened
        # (chain x row) array, which is a chain index only when each file
        # holds exactly one value.
        chain_peaks = []
        for chain in range(chain_count):
            tree_path = os.path.join(case_path, "genotype", "chain" + str(chain), "joint", "tree0")
            log_likelihood = pd.read_table(os.path.join(tree_path, "log_lik.txt"), header=None, names=["LogLik"])
            chain_peaks.append(log_likelihood["LogLik"].max())
        best_chain = int(np.argmax(chain_peaks))

        pred_path = os.path.join(case_path, "genotype", "chain" + str(best_chain), "joint", "tree0")
        predicted = pd.read_table(os.path.join(pred_path, "cluster_labels.tsv"), header=None, names=["ID", "Cluster"])
        ancestral_matrix = np.asarray(pd.read_csv(os.path.join(pred_path, "ancestral_matrix.csv"), header=None))

        # NOTE(review): labels are compared row by row, which assumes both
        # files list the same IDs in the same order -- TODO confirm.
        records.append((
            adjusted_rand_score(truth0["Cluster"], predicted["Cluster"]),
            adjusted_mutual_info_score(truth0["Cluster"], predicted["Cluster"]),
            v_measure_score(truth0["Cluster"], predicted["Cluster"]),
            np.mean(np.abs(ancestral_matrix_truth - ancestral_matrix)),
        ))

    df = pd.DataFrame(records, columns=columns)
    if output_path is not None:
        os.makedirs(output_path, exist_ok=True)
        output_file = os.path.join(output_path, "case" + str(case_no) + ".csv")
        df.to_csv(output_file, index=False)
    return df


In [5]:
# PSM: posterior similarity matrix.
def ProcessCasePSM(data_path, case_no, output_path = None, rep_count = 20, chain_count = 4):
    """Score the PSM-based SNV assignment of the best chain of every replicate.

    For each replicate ``rep0 .. rep<rep_count-1>`` under ``data_path`` the
    chain whose ``joint/tree0/log_lik.txt`` contains the highest
    log-likelihood is selected. The ``psm_SNV_assignment.txt`` file found in
    that chain's directory is compared with ``cluster_labels.txt`` from the
    case directory.

    Parameters
    ----------
    data_path : str
        Directory containing the ``rep<i>`` sub-directories.
    case_no : int
        Case index; inputs are read from ``rep<i>/case<case_no>``.
    output_path : str, optional
        When given, the metrics are also written to
        ``<output_path>/case<case_no>_psm.csv`` (the directory is created if
        it does not exist yet).
    rep_count : int
        Number of replicates to process.
    chain_count : int
        Number of chains available for every replicate.

    Returns
    -------
    pandas.DataFrame
        One row per replicate with the columns ``AdjRandIndex``,
        ``AdjMutualInformation`` and ``VMeasure``.

    Raises
    ------
    Missing input files are not handled here; the error raised by pandas
    propagates to the caller. ``chain_count == 0`` makes ``np.argmax`` raise
    ``ValueError``.
    """
    columns = ["AdjRandIndex", "AdjMutualInformation", "VMeasure"]
    records = []
    for rep in range(rep_count):
        case_path = os.path.join(data_path, "rep" + str(rep), "case" + str(case_no))
        truth0 = pd.read_csv(os.path.join(case_path, "cluster_labels.txt"), header=None, names=["ID", "Cluster"])

        # Reduce every chain to ONE number (its highest recorded
        # log-likelihood) before comparing chains. Taking np.argmax over a
        # list of whole Series returns an index into the flattened
        # (chain x row) array, which is a chain index only when each file
        # holds exactly one value.
        chain_peaks = []
        for chain in range(chain_count):
            tree_path = os.path.join(case_path, "genotype", "chain" + str(chain), "joint", "tree0")
            log_likelihood = pd.read_table(os.path.join(tree_path, "log_lik.txt"), header=None, names=["LogLik"])
            chain_peaks.append(log_likelihood["LogLik"].max())
        best_chain = int(np.argmax(chain_peaks))

        pred_path = os.path.join(case_path, "genotype", "chain" + str(best_chain))
        predicted = pd.read_table(os.path.join(pred_path, "psm_SNV_assignment.txt"), header=None, names=["ID", "Cluster"], sep="\t")

        # NOTE(review): labels are compared row by row, which assumes both
        # files list the same IDs in the same order -- TODO confirm.
        records.append((
            adjusted_rand_score(truth0["Cluster"], predicted["Cluster"]),
            adjusted_mutual_info_score(truth0["Cluster"], predicted["Cluster"]),
            v_measure_score(truth0["Cluster"], predicted["Cluster"]),
        ))

    df = pd.DataFrame(records, columns=columns)
    if output_path is not None:
        os.makedirs(output_path, exist_ok=True)
        output_file = os.path.join(output_path, "case" + str(case_no) + "_psm.csv")
        df.to_csv(output_file, index=False)
    return df


In [6]:
# Score our method on the binary simulation, cases 0-3.
output_path = "../../_output/simul/binary/snvs/"
data_path = "../..//data/simulation/binary/"
df0, df1, df2, df3 = [ProcessCase(data_path, case_no, output_path) for case_no in range(4)]

# Same cases scored from the PSM assignment. NOTE: this rebinds df0..df3, so
# the ProcessCase frames computed above remain available only as CSV files.
df0, df1, df2, df3 = [ProcessCasePSM(data_path, case_no, output_path) for case_no in range(4)]


In [5]:
# Score our method on the binary_cn simulation, cases 0-3.
output_path = "../../_output/simul/binary_cn/snvs/"
data_path = "../..//data/simulation/binary_cn/"
df0, df1, df2, df3 = [ProcessCase(data_path, case_no, output_path) for case_no in range(4)]


In [6]:
# Score our method on the quadternary_cn simulation, cases 0-3.
output_path = "../../_output/simul/quadternary_cn/snvs/"
data_path = "../../data/simulation/quadternary_cn/"
df0, df1, df2, df3 = [ProcessCase(data_path, case_no, output_path) for case_no in range(4)]


In [7]:
# Score our method on the quadternary_multiregion simulation, cases 0-3.
output_path = "../../_output/simul/quadternary_multiregion/snvs/"
data_path = "../../data/simulation/quadternary_multiregion/"
df0, df1, df2, df3 = [ProcessCase(data_path, case_no, output_path) for case_no in range(4)]


In [8]:
# Score our method on the quadternary_cn_multiregion simulation, cases 0-3.
output_path = "../../_output/simul/quadternary_cn_multiregion/snvs/"
data_path = "../../data/simulation/quadternary_cn_multiregion/"
df0, df1, df2, df3 = [ProcessCase(data_path, case_no, output_path) for case_no in range(4)]


In [9]:
# Compute the metrics on PhyloWGS.
def ProcessPhyloWGSOutput(data_path, case_no, output_path = None, rep_count = 20):
    """Compare the PhyloWGS results of every replicate with the reference files.

    For each replicate ``rep0 .. rep<rep_count-1>`` the files
    ``phylowgs/clustering.txt`` and ``phylowgs/ancestral_matrix.txt`` (both
    space separated) inside ``rep<i>/case<case_no>`` are compared with
    ``cluster_labels.txt`` and ``ancestral_matrix.csv`` from the same case
    directory. Replicates for which either PhyloWGS file is missing are
    skipped, so the result can have fewer than ``rep_count`` rows and rows
    carry no replicate identifier.

    Parameters
    ----------
    data_path : str
        Directory containing the ``rep<i>`` sub-directories.
    case_no : int
        Case index to evaluate.
    output_path : str, optional
        When given, the metrics are also written to
        ``<output_path>/phylowgs.csv``; the directory is created if needed.
        The file name does not depend on ``case_no``.
    rep_count : int
        Number of replicates to look for.

    Returns
    -------
    pandas.DataFrame
        Columns ``AdjRandIndex``, ``AdjMutualInformation``, ``VMeasure`` and
        ``AncestralMetric`` (mean absolute element-wise difference between
        the two ancestral matrices).
    """
    metric_names = ["AdjRandIndex", "AdjMutualInformation", "VMeasure", "AncestralMetric"]
    label_columns = ["ID", "Cluster"]
    rows = []

    for rep_idx in range(rep_count):
        case_dir = os.path.join(data_path, "rep" + str(rep_idx), "case" + str(case_no))
        labels_file = os.path.join(case_dir, "phylowgs", "clustering.txt")
        matrix_file = os.path.join(case_dir, "phylowgs", "ancestral_matrix.txt")

        # Both PhyloWGS outputs are required; otherwise skip the replicate.
        if not (os.path.exists(labels_file) and os.path.exists(matrix_file)):
            continue

        reference = pd.read_csv(os.path.join(case_dir, "cluster_labels.txt"), header=None, names=label_columns)
        reference_matrix = np.asarray(pd.read_csv(os.path.join(case_dir, "ancestral_matrix.csv"), header=None))
        inferred = pd.read_table(labels_file, header=None, names=label_columns, sep=" ")
        inferred_matrix = np.asarray(pd.read_table(matrix_file, header=None, sep=" "))

        true_labels = reference["Cluster"]
        found_labels = inferred["Cluster"]
        rows.append((
            adjusted_rand_score(true_labels, found_labels),
            adjusted_mutual_info_score(true_labels, found_labels),
            v_measure_score(true_labels, found_labels),
            np.mean(np.abs(reference_matrix - inferred_matrix)),
        ))

    df = pd.DataFrame(rows, columns=metric_names)
    if output_path is None:
        return df

    if not os.path.exists(output_path):
        os.makedirs(output_path)
    df.to_csv(os.path.join(output_path, "phylowgs.csv"), index=False)
    return df


### NOTE: Run the script that extracts the PhyloWGS results before executing the cells below.

In [10]:
# PhyloWGS metrics for case 0 of the binary simulation.
data_path = "../../data/simulation/binary/"
output_path = "../../_output/simul/binary/snvs/"
df = ProcessPhyloWGSOutput(data_path, case_no=0, output_path=output_path)

In [11]:
# PhyloWGS metrics for case 0 of the quadternary_multiregion simulation.
data_path = "../../data/simulation/quadternary_multiregion/"
output_path = "../../_output/simul/quadternary_multiregion/snvs/"
df = ProcessPhyloWGSOutput(data_path, case_no=0, output_path=output_path)

In [12]:
# Compute the metrics on Canopy.
def ProcessCanopyOutput(data_path, case_no, output_path = None, rep_count = 20):
    """Compare the Canopy results of every replicate with the reference files.

    For each replicate ``rep0 .. rep<rep_count-1>`` the files
    ``canopy/predicted.csv`` (read with its header row; a ``Cluster`` column
    is used) and ``canopy/ancestral_matrix.csv`` (read as space separated,
    no header) inside ``rep<i>/case<case_no>`` are compared with
    ``cluster_labels.txt`` and ``ancestral_matrix.csv`` from the same case
    directory. Replicates for which either Canopy file is missing are
    skipped, so the result can have fewer than ``rep_count`` rows and rows
    carry no replicate identifier.

    Parameters
    ----------
    data_path : str
        Directory containing the ``rep<i>`` sub-directories.
    case_no : int
        Case index to evaluate.
    output_path : str, optional
        When given, the metrics are also written to
        ``<output_path>/canopy.csv``; the directory is created if needed.
        The file name does not depend on ``case_no``.
    rep_count : int
        Number of replicates to look for.

    Returns
    -------
    pandas.DataFrame
        Columns ``AdjRandIndex``, ``AdjMutualInformation``, ``VMeasure`` and
        ``AncestralMetric`` (mean absolute element-wise difference between
        the two ancestral matrices).
    """
    metric_names = ["AdjRandIndex", "AdjMutualInformation", "VMeasure", "AncestralMetric"]
    rows = []

    for rep_idx in range(rep_count):
        case_dir = os.path.join(data_path, "rep" + str(rep_idx), "case" + str(case_no))
        labels_file = os.path.join(case_dir, "canopy", "predicted.csv")
        matrix_file = os.path.join(case_dir, "canopy", "ancestral_matrix.csv")

        # Both Canopy outputs are required; otherwise skip the replicate.
        if not (os.path.exists(labels_file) and os.path.exists(matrix_file)):
            continue

        reference = pd.read_csv(os.path.join(case_dir, "cluster_labels.txt"), header=None, names=["ID", "Cluster"])
        reference_matrix = np.asarray(pd.read_csv(os.path.join(case_dir, "ancestral_matrix.csv"), header=None))
        inferred = pd.read_csv(labels_file)
        inferred_matrix = np.asarray(pd.read_table(matrix_file, header=None, sep=" "))

        true_labels = reference["Cluster"]
        found_labels = inferred["Cluster"]
        rows.append((
            adjusted_rand_score(true_labels, found_labels),
            adjusted_mutual_info_score(true_labels, found_labels),
            v_measure_score(true_labels, found_labels),
            np.mean(np.abs(reference_matrix - inferred_matrix)),
        ))

    df = pd.DataFrame(rows, columns=metric_names)
    if output_path is None:
        return df

    if not os.path.exists(output_path):
        os.makedirs(output_path)
    df.to_csv(os.path.join(output_path, "canopy.csv"), index=False)
    return df


In [13]:
# Canopy metrics for case 0 of the binary simulation.
data_path = "../../data/simulation/binary/"
output_path = "../../_output/simul/binary/snvs/"
df = ProcessCanopyOutput(data_path, case_no=0, output_path=output_path)

In [14]:
# Canopy metrics for case 0 of the binary_cn simulation.
data_path = "../../data/simulation/binary_cn/"
output_path = "../../_output/simul/binary_cn/snvs/"
df = ProcessCanopyOutput(data_path, case_no=0, output_path=output_path)

In [15]:
# Canopy metrics for case 0 of the quadternary_cn simulation.
data_path = "../../data/simulation/quadternary_cn/"
output_path = "../../_output/simul/quadternary_cn/snvs/"
df = ProcessCanopyOutput(data_path, case_no=0, output_path=output_path)

In [16]:
# Canopy metrics for case 0 of the quadternary_cn_multiregion simulation.
data_path = "../../data/simulation/quadternary_cn_multiregion//"
output_path = "../../_output/simul/quadternary_cn_multiregion/snvs/"
df = ProcessCanopyOutput(data_path, case_no=0, output_path=output_path)

In [17]:
# Canopy metrics for case 0 of the quadternary_multiregion simulation.
data_path = "../../data/simulation/quadternary_multiregion/"
output_path = "../../_output/simul/quadternary_multiregion/snvs/"
df = ProcessCanopyOutput(data_path, case_no=0, output_path=output_path)