In [1]:
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics.cluster import adjusted_mutual_info_score
from sklearn.metrics.cluster import v_measure_score

In [2]:
import pandas as pd
import numpy as np
import os

In [9]:
# Process ddClone results.
def ProcessDDCloneResults(data_path, case_no, output_path=None, rep_count=20):
    """Score ddClone clusterings against the ground-truth cluster labels.

    For every replicate ``rep0`` ... ``rep<rep_count - 1>`` under ``data_path``
    this reads ``case<case_no>/cluster_labels.txt`` (truth) and
    ``case<case_no>/ddClone/results.txt`` (prediction) and computes three
    clustering-agreement scores between the two labelings.

    Parameters
    ----------
    data_path : str
        Directory that contains the ``rep<i>`` replicate directories.
    case_no : int
        Case number; selects the ``case<case_no>`` directory in each replicate.
    output_path : str, optional
        When given, the score table is also written to
        ``<output_path>/ddClone_case<case_no>.csv``; the directory is created
        if it does not exist.
    rep_count : int, optional
        Number of replicates to process.

    Returns
    -------
    pandas.DataFrame
        One row per replicate with the columns ``AdjRandIndex``,
        ``AdjMutualInformation`` and ``VMeasure``.

    Notes
    -----
    No existence check is made on the input files, so a missing file
    propagates the reader's error. The truth and predicted labels are
    compared by row position, not by ID -- this assumes both files list the
    items in the same order (TODO confirm).
    """
    columns = ["AdjRandIndex", "AdjMutualInformation", "VMeasure"]
    scores = []
    for rep in range(rep_count):
        case_path = os.path.join(data_path, "rep" + str(rep), "case" + str(case_no))
        truth0 = pd.read_csv(os.path.join(case_path, "cluster_labels.txt"), header=None, names=["ID", "Cluster"])
        predicted = pd.read_table(os.path.join(case_path, "ddClone", "results.txt"), sep=" ")

        true_labels = truth0["Cluster"]
        predicted_labels = predicted["clusterID"]
        scores.append((
            adjusted_rand_score(true_labels, predicted_labels),
            adjusted_mutual_info_score(true_labels, predicted_labels),
            v_measure_score(true_labels, predicted_labels),
        ))

    df = pd.DataFrame(scores, columns=columns)
    if output_path is not None:
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(output_path, exist_ok=True)
        output_file = os.path.join(output_path, "ddClone_case" + str(case_no) + ".csv")
        df.to_csv(output_file, index=False)
    return df
    

In [11]:
# ddClone scores for cases 1-3 of the data under data_path; each call also
# saves its table as a CSV in output_path.
data_path = "../../data/simulation/binary/"
output_path = "../../_output/simul/binary/snvs/"
df1 = ProcessDDCloneResults(data_path, case_no=1, output_path=output_path)
df2 = ProcessDDCloneResults(data_path, case_no=2, output_path=output_path)
df3 = ProcessDDCloneResults(data_path, case_no=3, output_path=output_path)


In [16]:
# ddClone scores for cases 1-3 of the data under data_path; each call also
# saves its table as a CSV in output_path.
data_path = "../../data/simulation/binary_cn/"
output_path = "../../_output/simul/binary_cn/snvs/"
df1 = ProcessDDCloneResults(data_path, case_no=1, output_path=output_path)
df2 = ProcessDDCloneResults(data_path, case_no=2, output_path=output_path)
df3 = ProcessDDCloneResults(data_path, case_no=3, output_path=output_path)


In [25]:
# ddClone scores for cases 1-3 of the data under data_path; each call also
# saves its table as a CSV in output_path.
data_path = "../../data/simulation/quadternary_cn/"
output_path = "../../_output/simul/quadternary_cn/snvs/"
df1 = ProcessDDCloneResults(data_path, case_no=1, output_path=output_path)
df2 = ProcessDDCloneResults(data_path, case_no=2, output_path=output_path)
df3 = ProcessDDCloneResults(data_path, case_no=3, output_path=output_path)


In [13]:
# Process B-SCITE results.
def _read_bscite_ancestral_matrix(matrices_file):
    """Parse the ancestral matrix stored in a ``bscite.matrices`` file into an int array."""
    with open(matrices_file, "r") as f:
        # The second whitespace-separated token of the first line is the mutation count.
        mutation_count = int(f.readline().split()[1])
        # The line after the header is not part of the matrix; skip it.
        f.readline()
        rows = [f.readline().split() for _ in range(mutation_count)]
    return np.asarray(rows, dtype=int)


def ProcessBSCITE(data_path, case_no, output_path=None, rep_count=20):
    """Score B-SCITE results against the ground truth.

    For every replicate ``rep0`` ... ``rep<rep_count - 1>`` under ``data_path``
    this reads the truth from ``case<case_no>/cluster_labels.txt`` and
    ``case<case_no>/ancestral_matrix.csv``, and the prediction from
    ``case<case_no>/bscite/results.txt`` and
    ``case<case_no>/bscite/bscite.matrices``.

    Parameters
    ----------
    data_path : str
        Directory that contains the ``rep<i>`` replicate directories.
    case_no : int
        Case number; selects the ``case<case_no>`` directory in each replicate.
    output_path : str, optional
        When given, the score table is also written to
        ``<output_path>/bscite_case<case_no>.csv``; the directory is created
        if it does not exist.
    rep_count : int, optional
        Number of replicates to process.

    Returns
    -------
    pandas.DataFrame
        One row per *scored* replicate with the columns ``AdjRandIndex``,
        ``AdjMutualInformation``, ``VMeasure`` and ``AncestralMetric`` (the
        mean absolute element-wise difference between the predicted and the
        true ancestral matrix).

    Notes
    -----
    A replicate whose ``bscite/results.txt`` is missing is skipped without
    any message, so the table can have fewer than ``rep_count`` rows and the
    row index does not identify the replicate. The truth files are read
    before that check, so they must exist for every replicate. Truth and
    predicted labels are compared by row position, not by ID -- this assumes
    both files list the items in the same order (TODO confirm).
    """
    columns = ["AdjRandIndex", "AdjMutualInformation", "VMeasure", "AncestralMetric"]
    scores = []
    for rep in range(rep_count):
        case_path = os.path.join(data_path, "rep" + str(rep), "case" + str(case_no))
        truth0 = pd.read_csv(os.path.join(case_path, "cluster_labels.txt"), header=None, names=["ID", "Cluster"])
        A0 = np.asarray(pd.read_csv(os.path.join(case_path, "ancestral_matrix.csv"), header=None))

        pred_path = os.path.join(case_path, "bscite")
        clustering_prediction_file = os.path.join(pred_path, "results.txt")
        if not os.path.exists(clustering_prediction_file):
            # No processed B-SCITE output for this replicate: leave it out.
            continue

        predicted = pd.read_table(clustering_prediction_file, sep=" ")
        true_labels = truth0["Cluster"]
        predicted_labels = predicted["Cluster"]
        rand = adjusted_rand_score(true_labels, predicted_labels)
        mutual_info = adjusted_mutual_info_score(true_labels, predicted_labels)
        vmeasure = v_measure_score(true_labels, predicted_labels)

        A = _read_bscite_ancestral_matrix(os.path.join(pred_path, "bscite.matrices"))
        scores.append((rand, mutual_info, vmeasure, np.mean(np.abs(A - A0))))

    df = pd.DataFrame(scores, columns=columns)
    if output_path is not None:
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(output_path, exist_ok=True)
        output_file = os.path.join(output_path, "bscite_case" + str(case_no) + ".csv")
        df.to_csv(output_file, index=False)
    return df


In [None]:
# Prerequisite: run the R scripts that process the B-SCITE output before this cell.
# Scores cases 1-3 of the data under data_path; each call also saves a CSV in output_path.
data_path = "../../data/simulation/binary/"
output_path = "../../_output/simul/binary/snvs/"
df1 = ProcessBSCITE(data_path, case_no=1, output_path=output_path)
df2 = ProcessBSCITE(data_path, case_no=2, output_path=output_path)
df3 = ProcessBSCITE(data_path, case_no=3, output_path=output_path)


In [19]:
# Prerequisite: run the R scripts that process the B-SCITE output before this cell.
# Scores cases 1-3 of the data under data_path; each call also saves a CSV in output_path.
data_path = "../../data/simulation/binary_cn/"
output_path = "../../_output/simul/binary_cn/snvs/"
df1 = ProcessBSCITE(data_path, case_no=1, output_path=output_path)
df2 = ProcessBSCITE(data_path, case_no=2, output_path=output_path)
df3 = ProcessBSCITE(data_path, case_no=3, output_path=output_path)


In [20]:
# Prerequisite: run the R scripts that process the B-SCITE output before this cell.
# Scores cases 1-3 of the data under data_path; each call also saves a CSV in output_path.
data_path = "../../data/simulation/quadternary_cn/"
output_path = "../../_output/simul/quadternary_cn/snvs/"
df1 = ProcessBSCITE(data_path, case_no=1, output_path=output_path)
df2 = ProcessBSCITE(data_path, case_no=2, output_path=output_path)
df3 = ProcessBSCITE(data_path, case_no=3, output_path=output_path)


In [21]:
# Prerequisite: run the R scripts that process the B-SCITE output before this cell.
# Scores cases 1-3 of the data under data_path; each call also saves a CSV in output_path.
data_path = "../../data/simulation/quadternary_cn_multiregion/"
output_path = "../../_output/simul/quadternary_cn_multiregion/snvs/"
df1 = ProcessBSCITE(data_path, case_no=1, output_path=output_path)
df2 = ProcessBSCITE(data_path, case_no=2, output_path=output_path)
df3 = ProcessBSCITE(data_path, case_no=3, output_path=output_path)


In [22]:
# Prerequisite: run the R scripts that process the B-SCITE output before this cell.
# Scores cases 1-3 of the data under data_path; each call also saves a CSV in output_path.
data_path = "../../data/simulation/quadternary_multiregion/"
output_path = "../../_output/simul/quadternary_multiregion/snvs/"
df1 = ProcessBSCITE(data_path, case_no=1, output_path=output_path)
df2 = ProcessBSCITE(data_path, case_no=2, output_path=output_path)
df3 = ProcessBSCITE(data_path, case_no=3, output_path=output_path)
