In [6]:
# Run ExtractPhyloWGSResults.py and Rscripts/ProcessBSCITEResults.R.

from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics.cluster import adjusted_mutual_info_score
from sklearn.metrics.cluster import v_measure_score

import pandas as pd
import numpy as np
import os

# NOTE(review): data_path is not referenced by any cell visible in this
# notebook; the cells below hard-code "../../data/HGSOC_10X/..." paths
# instead. Confirm whether it is still needed.
data_path = "../../_output/HGSOC_10X/"

import sklearn
# Display the installed scikit-learn version next to the results.
sklearn.__version__

'0.23.1'

In [4]:
# Ground-truth table: space-delimited text whose first line is the header.
# The cells below read its "CloneName" column.
gt = pd.read_csv(
    "../../data/HGSOC_10X/gt.txt",
    sep=" ",
    header=0,
)
# Clone names that take part in the evaluation; rows of gt whose CloneName is
# not listed here are filtered out before any metric is computed.
valid_clone_names = [
    "A_B_C_D_E_F_G_H_I",
    "A_B_C_D", "A_B", "C_D", "A", "B", "C", "D",
    "E_F_G_H_I", "E_F", "E", "F",
]

# Evaluate the ancestral metric.
# Build the ground-truth ancestral matrix A, one row/column per row of gt (SNV).
# A[i, j] = 1 when the "_"-separated parts of SNV j's CloneName form a proper
# subset of those of SNV i; every other entry, including the diagonal, stays 0.
snv_count = gt.shape[0]
# Split each clone name once up front instead of on every (i, j) visit; the
# matrix itself is still filled with a plain double loop (not many SNVs).
clone_sets = [set(name.split("_")) for name in gt["CloneName"]]
A = np.zeros(shape=(snv_count, snv_count))
for i, clone_i in enumerate(clone_sets):
    for j, clone_j in enumerate(clone_sets):
        if clone_j < clone_i:  # proper subset: clone_j is contained in clone_i and differs from it
            A[i, j] = 1

# Positions of the SNVs whose ground-truth clone is in valid_clone_names.
idx = np.flatnonzero(gt["CloneName"].isin(valid_clone_names).values)
# Open mesh selecting the (valid x valid) sub-matrix; reused on the predicted matrices.
valid_idx = np.ix_(idx, idx)
A0 = A[valid_idx]


In [22]:
# Our method (PhylEx): score the output of every chain against the ground truth.
# Columns of `metrics`: V-measure, adjusted Rand score, adjusted mutual info,
# ancestral reconstruction error.
chain_count = 20
metrics = np.zeros([chain_count, 4])
for chain in range(chain_count):
    chain_path = "../../data/HGSOC_10X/phylex/chain" + str(chain)
    tree_path = chain_path + "/joint/tree0"

    predicted = pd.read_csv(tree_path + "/cluster_labels.tsv", header=None, sep="\t", names=["ID", "CloneName"])
    # join() aligns on the row index; the shared "CloneName" column is split into
    # CloneName_caller (prediction) and CloneName_other (ground truth).
    # NOTE(review): this assumes cluster_labels.tsv lists SNVs in the same order as gt - confirm.
    tbl_join = predicted.join(gt, lsuffix="_caller", rsuffix="_other")
    # Keep only SNVs whose ground-truth clone is in valid_clone_names
    # (rows without a ground-truth match are NaN and drop out here too).
    ret_valid = tbl_join[tbl_join["CloneName_other"].isin(valid_clone_names)]
    labels_true = ret_valid["CloneName_other"]
    labels_pred = ret_valid["CloneName_caller"]

    ancestral_matrix = pd.read_csv(tree_path + "/ancestral_matrix.csv", header=None)
    ancestral_matrix = np.asarray(ancestral_matrix)
    # Same (valid x valid) sub-matrix that was taken from the ground-truth matrix A.
    ancestral_matrix_0 = ancestral_matrix[valid_idx]

    metrics[chain, 0] = v_measure_score(labels_true, labels_pred)
    metrics[chain, 1] = adjusted_rand_score(labels_true, labels_pred)
    metrics[chain, 2] = adjusted_mutual_info_score(labels_true, labels_pred)
    # Mean absolute entry-wise difference between predicted and true ancestral matrices.
    metrics[chain, 3] = np.mean(np.abs(ancestral_matrix_0 - A0))

# V-measure, adjusted rand score, adjusted mutual info, ancestral metric.
# Mean and standard deviation (numpy default, ddof=0) across chains.
print(metrics.mean(0))
print(metrics.std(0))


[0.35924743 0.20642445 0.26550135 0.23321762]
[0.04175973 0.03233021 0.03756476 0.01285182]


In [23]:
# One row per PhylEx chain, one column per metric, plus a constant "Method" tag.
results_phylex = pd.DataFrame(
    data=metrics,
    columns=["Vmeasure", "AdjRandScore", "AdjMutualInfo", "AncReconErr"],
)
results_phylex["Method"] = "PhylEx"


In [24]:
# TSSB: score the output of every TSSB chain against the ground truth.
# Columns of `metrics_tssb`: V-measure, adjusted Rand score, adjusted mutual info,
# ancestral reconstruction error.
chain_count = 20
metrics_tssb = np.zeros([chain_count, 4])
for chain in range(chain_count):
    chain_path = "../../data/HGSOC_10X/tssb/chain" + str(chain)
    tree_path = chain_path + "/joint/tree0"

    predicted = pd.read_csv(tree_path + "/cluster_labels.tsv", header=None, sep="\t", names=["ID", "CloneName"])
    # join() aligns on the row index; the shared "CloneName" column is split into
    # CloneName_caller (prediction) and CloneName_other (ground truth).
    # NOTE(review): this assumes cluster_labels.tsv lists SNVs in the same order as gt - confirm.
    tbl_join = predicted.join(gt, lsuffix="_caller", rsuffix="_other")
    # Keep only SNVs whose ground-truth clone is in valid_clone_names
    # (rows without a ground-truth match are NaN and drop out here too).
    ret_valid = tbl_join[tbl_join["CloneName_other"].isin(valid_clone_names)]
    labels_true = ret_valid["CloneName_other"]
    labels_pred = ret_valid["CloneName_caller"]

    ancestral_matrix = pd.read_csv(tree_path + "/ancestral_matrix.csv", header=None)
    ancestral_matrix = np.asarray(ancestral_matrix)
    # Same (valid x valid) sub-matrix that was taken from the ground-truth matrix A.
    ancestral_matrix_0 = ancestral_matrix[valid_idx]

    metrics_tssb[chain, 0] = v_measure_score(labels_true, labels_pred)
    metrics_tssb[chain, 1] = adjusted_rand_score(labels_true, labels_pred)
    metrics_tssb[chain, 2] = adjusted_mutual_info_score(labels_true, labels_pred)
    # Mean absolute entry-wise difference between predicted and true ancestral matrices.
    metrics_tssb[chain, 3] = np.mean(np.abs(ancestral_matrix_0 - A0))

# V-measure, adjusted rand score, adjusted mutual info, ancestral metric.
# Mean and standard deviation (numpy default, ddof=0) across chains.
print(metrics_tssb.mean(0))
print(metrics_tssb.std(0))


[0.2031598  0.15447029 0.1564563  0.23805793]
[0.04313064 0.04902003 0.04950668 0.02520797]


In [25]:
# One row per TSSB chain, one column per metric, plus a constant "Method" tag.
results_tssb = pd.DataFrame(
    data=metrics_tssb,
    columns=["Vmeasure", "AdjRandScore", "AdjMutualInfo", "AncReconErr"],
)
results_tssb["Method"] = "TSSB"


In [26]:
# Stack the per-chain rows of both methods (PhylEx first, then TSSB).
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in 2.0; like append, it keeps each frame's original index labels.
results = pd.concat([results_phylex, results_tssb])

In [28]:
# Write the combined per-chain metrics to disk; index=False leaves the
# row index out of the CSV.
results.to_csv(
    "../../data/NatComm/Table3.csv",
    index=False,
)