In [1]:
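# Prerequisite: run ExtractPhyloWGSResults.py and Rscripts/ProcessBSCITEResults.R before this notebook
# (presumably they produce the PhyloWGS and B-SCITE result files read below).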

from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics.cluster import adjusted_mutual_info_score
from sklearn.metrics.cluster import v_measure_score

import pandas as pd
import numpy as np
import os

#data_path = "/Users/seonghwanjun/data/cell-line/HGSOC_SS3/"
# Root directory of the per-method result folders read below (tssb_2_05_05,
# ddClone, B-SCITE, canopy, phylowgs, phylex_*).
# NOTE(review): machine-specific absolute path, whereas gt.txt is loaded through
# a relative path -- consider a single configurable data root.
data_path = "/Users/seonghwanjun/PhylExAnalysis/data/HGSOC_SS3/"

# Display the scikit-learn version that provides the clustering scores.
import sklearn
sklearn.__version__

'0.23.1'

In [2]:
#gt = pd.read_csv("/Users/seonghwanjun/data/cell-line/bulk/OV2295/genotype/gt.txt", header=0, sep=" ")
# Ground-truth table: space-separated with a header row. Its row count is used
# as the SNV count below, and its "CloneName" column holds underscore-joined
# clone labels (e.g. "A_B").
gt = pd.read_csv("../../data/HGSOC_SS3/gt.txt", header=0, sep=" ")
# Only SNVs whose ground-truth CloneName appears in this list are scored.
valid_clone_names = ["A_B_C_D_E_F_G_H_I", "A_B_C_D", "A_B", "C_D", "A", "B", "C", "D", "E_F_G_H_I", "E_F", "E", "F"]

# Evaluate the ancestral metric.
# Ground-truth ancestral matrix: A[i, j] = 1 when the clone set of SNV j is a
# proper subset of the clone set of SNV i, and 0 otherwise (diagonal included).
# There are not many SNVs, so a plain double loop is fine; the clone sets are
# parsed once up front rather than re-reading gt rows inside the inner loop.
clone_sets = [set(name.split("_")) for name in gt["CloneName"]]
snv_count = gt.shape[0]
A = np.zeros(shape=(snv_count, snv_count))
for i, clone_i in enumerate(clone_sets):
    for j, clone_j in enumerate(clone_sets):
        # `<` between sets is the proper-subset test (subset and not equal).
        if clone_j < clone_i:
            A[i, j] = 1

# Row positions of the SNVs assigned to one of the valid clones. np.ix_ turns
# them into a (rows, cols) selector so the same SNVs are kept on both axes.
idx = np.where(gt["CloneName"].isin(valid_clone_names))[0]
valid_idx = np.ix_(idx, idx)
A0 = A[valid_idx]

In [28]:
# TSSB.
# Number of TSSB chains to score. Defined here so the cell does not depend on a
# value left over from another cell.
# NOTE(review): 10 matches the 10-row TSSB results table stored in this
# notebook -- confirm against the number of chain directories.
rep_count = 10
# One row per chain; columns: V-measure, adjusted Rand score, adjusted mutual
# info, ancestral reconstruction error.
metrics = np.zeros([rep_count, 4])
for rep in range(rep_count):
    rep_path = os.path.join(data_path, "tssb_2_05_05", "tssb", "chain" + str(rep))
    tree_path = os.path.join(rep_path, "joint", "tree0")
    predicted = pd.read_csv(os.path.join(tree_path, "cluster_labels.tsv"), header=None, sep="\t", names=["ID", "CloneName"])
    # join() aligns on the row index (no key column is given), so this relies on
    # the predictions being listed in the same row order as gt -- TODO confirm.
    tbl_join = predicted.join(gt, lsuffix='_caller', rsuffix='_other')
    # Score only the SNVs whose ground-truth clone is one of the valid clones.
    ret_valid = tbl_join[tbl_join["CloneName_other"].isin(valid_clone_names)]
    labels_true = ret_valid["CloneName_other"]
    labels_pred = ret_valid["CloneName_caller"]

    ancestral_matrix = np.asarray(pd.read_csv(os.path.join(tree_path, "ancestral_matrix.csv"), header=None))
    ancestral_matrix_0 = ancestral_matrix[valid_idx]
    metrics[rep, 0] = v_measure_score(labels_true, labels_pred)
    metrics[rep, 1] = adjusted_rand_score(labels_true, labels_pred)
    metrics[rep, 2] = adjusted_mutual_info_score(labels_true, labels_pred)
    # Mean absolute difference between the predicted and true ancestral
    # matrices, restricted to the valid SNVs.
    metrics[rep, 3] = np.mean(np.abs(ancestral_matrix_0 - A0))

# Mean and standard deviation over chains, in the column order given above.
print(metrics.mean(0))
print(metrics.std(0))


[0.24742038 0.29172581 0.1922902  0.19820416]
[0.05455369 0.05858822 0.05845756 0.02063189]


In [30]:
# Tabulate the scores currently held in `metrics` (one row per chain) and tag
# every row with the method name.
results_tssb = pd.DataFrame(
    metrics,
    columns=["Vmeasure", "AdjRandScore", "AdjMutualInfo", "AncReconErr"],
).assign(Method="TSSB")
results_tssb

Unnamed: 0,Vmeasure,AdjRandScore,AdjMutualInfo,AncReconErr,Method
0,0.123168,0.172215,0.058058,0.247637,TSSB
1,0.251525,0.284054,0.197022,0.195652,TSSB
2,0.27207,0.314759,0.21874,0.186673,TSSB
3,0.338829,0.395765,0.289923,0.16966,TSSB
4,0.355595,0.424069,0.307744,0.151229,TSSB
5,0.252659,0.307073,0.196813,0.191871,TSSB
6,0.251525,0.284054,0.197022,0.195652,TSSB
7,0.222078,0.29689,0.164536,0.198015,TSSB
8,0.205211,0.220939,0.148273,0.220227,TSSB
9,0.278078,0.318328,0.225189,0.190454,TSSB


In [25]:
# ddClone
pred_path = os.path.join(data_path, "ddClone")
# Space-separated results file; its three columns are renamed to ID, phi and
# the predicted cluster label.
predicted = pd.read_table(os.path.join(pred_path, "results.txt"), sep=" ")
predicted.columns = ["ID", "phi", "CloneName"]

# join() aligns on the row index (no key column is given), so this relies on
# the predictions being listed in the same row order as gt -- TODO confirm.
tbl_join = predicted.join(gt, lsuffix='_caller', rsuffix='_other')
# Score only the SNVs whose ground-truth clone is one of the valid clones.
ret_valid = tbl_join[tbl_join["CloneName_other"].isin(valid_clone_names)]
labels_true = ret_valid["CloneName_other"]
labels_pred = ret_valid["CloneName_caller"]

# V-measure, adjusted rand score, adjusted mutual info.
print(v_measure_score(labels_true, labels_pred))
print(adjusted_rand_score(labels_true, labels_pred))
print(adjusted_mutual_info_score(labels_true, labels_pred))


0.4829870174461928
0.04580008626266983
0.0744216195098232


In [7]:
# B-SCITE:
pred_path = os.path.join(data_path, "B-SCITE")
clustering_prediction_file = os.path.join(pred_path, "results.txt")
predicted = pd.read_table(clustering_prediction_file, sep=" ")

# join() aligns on the row index (no key column is given), so this relies on
# the predictions being listed in the same row order as gt -- TODO confirm.
tbl_join = predicted.join(gt, lsuffix='_caller', rsuffix='_other')
# Score only the SNVs whose ground-truth clone is one of the valid clones.
ret_valid = tbl_join[tbl_join["CloneName_other"].isin(valid_clone_names)]
labels_true = ret_valid["CloneName_other"]
labels_pred = ret_valid["CloneName_caller"]

# Read the ancestral matrix line-by-line.
with open(os.path.join(pred_path, "bscite.matrices"), "r") as f:
    # Header line: its second whitespace-separated token is the mutation count.
    mutation_count = int(f.readline().split()[1])
    # The line after the header is skipped.
    f.readline()
    # The next `mutation_count` lines are the whitespace-separated matrix rows.
    bscite_rows = [f.readline().split() for _ in range(mutation_count)]

# Kept under its own name so the ground-truth matrix `A` built earlier in the
# notebook is not overwritten by B-SCITE's prediction.
bscite_ancestral = np.asarray(bscite_rows, dtype=int)[valid_idx]

# V-measure, adjusted rand score, adjusted mutual info, ancestral metric.
print(v_measure_score(labels_true, labels_pred))
print(adjusted_rand_score(labels_true, labels_pred))
print(adjusted_mutual_info_score(labels_true, labels_pred))
print(np.mean(np.abs(bscite_ancestral - A0)))


0.4301579869830684
0.08168476438575224
0.20509209020698457
0.2556710775047259


In [8]:
# Compute the metrics on Canopy.
clustering_file = os.path.join(data_path, "canopy", "predicted.csv")
ancestral_matrix_file = os.path.join(data_path, "canopy", "ancestral_matrix.csv")

predicted = pd.read_csv(clustering_file)
predicted.columns = ["ID", "CloneName"]

# join() aligns on the row index (no key column is given), so this relies on
# the predictions being listed in the same row order as gt -- TODO confirm.
tbl_join = predicted.join(gt, lsuffix='_caller', rsuffix='_other')
# Score only the SNVs whose ground-truth clone is one of the valid clones.
ret_valid = tbl_join[tbl_join["CloneName_other"].isin(valid_clone_names)]
labels_true = ret_valid["CloneName_other"]
labels_pred = ret_valid["CloneName_caller"]

# Space-separated matrix without a header, restricted to the valid SNVs on
# both axes.
ancestral_matrix = np.asarray(pd.read_table(ancestral_matrix_file, header=None, sep=" "))
ancestral_matrix = ancestral_matrix[valid_idx]

# V-measure, adjusted rand score, adjusted mutual info, ancestral metric.
print(v_measure_score(labels_true, labels_pred))
print(adjusted_rand_score(labels_true, labels_pred))
print(adjusted_mutual_info_score(labels_true, labels_pred))
print(np.mean(np.abs(ancestral_matrix - A0)))


0.447844219827592
0.2823002055523911
0.26238513702386707
0.19092627599243855


In [9]:
# PhyloWGS:
clustering_file = os.path.join(data_path, "phylowgs", "clustering.txt")
ancestral_matrix_file = os.path.join(data_path, "phylowgs", "ancestral_matrix.txt")

# Both files are space-separated and have no header row.
predicted = pd.read_table(clustering_file, header=None, names=["ID", "CloneName"], sep=" ")
ancestral_matrix = np.asarray(pd.read_table(ancestral_matrix_file, header=None, sep=" "))

# join() aligns on the row index (no key column is given), so this relies on
# the predictions being listed in the same row order as gt -- TODO confirm.
tbl_join = predicted.join(gt, lsuffix='_caller', rsuffix='_other')
# Score only the SNVs whose ground-truth clone is one of the valid clones.
ret_valid = tbl_join[tbl_join["CloneName_other"].isin(valid_clone_names)]
labels_true = ret_valid["CloneName_other"]
labels_pred = ret_valid["CloneName_caller"]

# V-measure, adjusted rand score, adjusted mutual info, ancestral metric.
print(v_measure_score(labels_true, labels_pred))
print(adjusted_rand_score(labels_true, labels_pred))
print(adjusted_mutual_info_score(labels_true, labels_pred))
print(np.mean(np.abs(A0 - ancestral_matrix[valid_idx])))


2.60577903472376e-16
0.0
2.6057790347237605e-16
0.2684310018903592


In [71]:
# Our method (PhylEx), run directory "phylex_5_1_02".
rep_count = 20
# One row per chain; columns: V-measure, adjusted Rand score, adjusted mutual
# info, ancestral reconstruction error.
metrics = np.zeros([rep_count, 4])
for rep in range(rep_count):
    rep_path = os.path.join(data_path, "phylex_5_1_02", "phylex", "chain" + str(rep))
    tree_path = os.path.join(rep_path, "joint", "tree0")
    predicted = pd.read_csv(os.path.join(tree_path, "cluster_labels.tsv"), header=None, sep="\t", names=["ID", "CloneName"])
    # join() aligns on the row index (no key column is given), so this relies on
    # the predictions being listed in the same row order as gt -- TODO confirm.
    tbl_join = predicted.join(gt, lsuffix='_caller', rsuffix='_other')
    # Score only the SNVs whose ground-truth clone is one of the valid clones.
    ret_valid = tbl_join[tbl_join["CloneName_other"].isin(valid_clone_names)]
    labels_true = ret_valid["CloneName_other"]
    labels_pred = ret_valid["CloneName_caller"]

    ancestral_matrix = np.asarray(pd.read_csv(os.path.join(tree_path, "ancestral_matrix.csv"), header=None))
    ancestral_matrix_0 = ancestral_matrix[valid_idx]
    metrics[rep, 0] = v_measure_score(labels_true, labels_pred)
    metrics[rep, 1] = adjusted_rand_score(labels_true, labels_pred)
    metrics[rep, 2] = adjusted_mutual_info_score(labels_true, labels_pred)
    # Mean absolute difference between the predicted and true ancestral
    # matrices, restricted to the valid SNVs.
    metrics[rep, 3] = np.mean(np.abs(ancestral_matrix_0 - A0))

# Mean and standard deviation over chains, in the column order given above.
print(metrics.mean(0))
print(metrics.std(0))

[0.8679923  0.88393576 0.83608845 0.03936673]
[0.01875681 0.02273957 0.02529606 0.00985336]


In [72]:
# Start the Supplementary Table 3 frame from the scores currently held in
# `metrics`, adding one constant column per hyperparameter limit.
supp_tbl3 = pd.DataFrame(
    metrics,
    columns=["Vmeasure", "AdjRandScore", "AdjMutualInfo", "AncReconErr"],
).assign(**{"lambda0 max": 5, "lambda max": 1, "gamma max": 0.2})
supp_tbl3.shape

(20, 7)

In [73]:
# Our method (PhylEx), run directory "phylex_5_1_1".
rep_count = 20
# One row per chain; columns: V-measure, adjusted Rand score, adjusted mutual
# info, ancestral reconstruction error.
metrics = np.zeros([rep_count, 4])
for rep in range(rep_count):
    rep_path = os.path.join(data_path, "phylex_5_1_1", "phylex", "chain" + str(rep))
    tree_path = os.path.join(rep_path, "joint", "tree0")
    predicted = pd.read_csv(os.path.join(tree_path, "cluster_labels.tsv"), header=None, sep="\t", names=["ID", "CloneName"])
    # join() aligns on the row index (no key column is given), so this relies on
    # the predictions being listed in the same row order as gt -- TODO confirm.
    tbl_join = predicted.join(gt, lsuffix='_caller', rsuffix='_other')
    # Score only the SNVs whose ground-truth clone is one of the valid clones.
    ret_valid = tbl_join[tbl_join["CloneName_other"].isin(valid_clone_names)]
    labels_true = ret_valid["CloneName_other"]
    labels_pred = ret_valid["CloneName_caller"]

    ancestral_matrix = np.asarray(pd.read_csv(os.path.join(tree_path, "ancestral_matrix.csv"), header=None))
    ancestral_matrix_0 = ancestral_matrix[valid_idx]
    metrics[rep, 0] = v_measure_score(labels_true, labels_pred)
    metrics[rep, 1] = adjusted_rand_score(labels_true, labels_pred)
    metrics[rep, 2] = adjusted_mutual_info_score(labels_true, labels_pred)
    # Mean absolute difference between the predicted and true ancestral
    # matrices, restricted to the valid SNVs.
    metrics[rep, 3] = np.mean(np.abs(ancestral_matrix_0 - A0))

# Mean and standard deviation over chains, in the column order given above.
print(metrics.mean(0))
print(metrics.std(0))

[0.87029974 0.88764812 0.83896881 0.03792533]
[0.0131808  0.01635229 0.01752512 0.00767558]


In [74]:
# Scores currently held in `metrics`, labelled by method (combined with the
# TSSB results for Table 1 further down).
results_phylex = pd.DataFrame(metrics, columns=["Vmeasure", "AdjRandScore", "AdjMutualInfo", "AncReconErr"])
results_phylex["Method"] = "PhylEx"

# The same scores tagged with their hyperparameter limits for Supplementary Table 3.
temp = pd.DataFrame(metrics, columns=["Vmeasure", "AdjRandScore", "AdjMutualInfo", "AncReconErr"])
temp["lambda0 max"] = 5
temp["lambda max"] = 1
temp["gamma max"] = 1
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in 2.0; the stacked result is the same.
# NOTE: re-running this cell stacks the same rows onto supp_tbl3 again.
supp_tbl3 = pd.concat([supp_tbl3, temp])
supp_tbl3.shape

(40, 7)

In [75]:
# Our method (PhylEx), run directory "phylex_10_1_1".
rep_count = 20
# One row per chain; columns: V-measure, adjusted Rand score, adjusted mutual
# info, ancestral reconstruction error.
metrics = np.zeros([rep_count, 4])
for rep in range(rep_count):
    rep_path = os.path.join(data_path, "phylex_10_1_1", "phylex", "chain" + str(rep))
    tree_path = os.path.join(rep_path, "joint", "tree0")
    predicted = pd.read_csv(os.path.join(tree_path, "cluster_labels.tsv"), header=None, sep="\t", names=["ID", "CloneName"])
    # join() aligns on the row index (no key column is given), so this relies on
    # the predictions being listed in the same row order as gt -- TODO confirm.
    tbl_join = predicted.join(gt, lsuffix='_caller', rsuffix='_other')
    # Score only the SNVs whose ground-truth clone is one of the valid clones.
    ret_valid = tbl_join[tbl_join["CloneName_other"].isin(valid_clone_names)]
    labels_true = ret_valid["CloneName_other"]
    labels_pred = ret_valid["CloneName_caller"]

    ancestral_matrix = np.asarray(pd.read_csv(os.path.join(tree_path, "ancestral_matrix.csv"), header=None))
    ancestral_matrix_0 = ancestral_matrix[valid_idx]
    metrics[rep, 0] = v_measure_score(labels_true, labels_pred)
    metrics[rep, 1] = adjusted_rand_score(labels_true, labels_pred)
    metrics[rep, 2] = adjusted_mutual_info_score(labels_true, labels_pred)
    # Mean absolute difference between the predicted and true ancestral
    # matrices, restricted to the valid SNVs.
    metrics[rep, 3] = np.mean(np.abs(ancestral_matrix_0 - A0))

# Mean and standard deviation over chains, in the column order given above.
print(metrics.mean(0))
print(metrics.std(0))

[0.86341447 0.882625   0.82959099 0.03775992]
[0.02169685 0.018047   0.0275778  0.00460842]


In [76]:
# Tag the scores currently held in `metrics` with their hyperparameter limits
# and stack them onto Supplementary Table 3.
temp = pd.DataFrame(metrics, columns=["Vmeasure", "AdjRandScore", "AdjMutualInfo", "AncReconErr"])
temp["lambda0 max"] = 10
temp["lambda max"] = 1
temp["gamma max"] = 1
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in 2.0; the stacked result is the same.
# NOTE: re-running this cell stacks the same rows onto supp_tbl3 again.
supp_tbl3 = pd.concat([supp_tbl3, temp])
supp_tbl3.shape

(60, 7)

In [77]:
# Our method (PhylEx), run directory "phylex_1_02_05".
rep_count = 20
# One row per chain; columns: V-measure, adjusted Rand score, adjusted mutual
# info, ancestral reconstruction error.
metrics = np.zeros([rep_count, 4])
for rep in range(rep_count):
    rep_path = os.path.join(data_path, "phylex_1_02_05", "phylex", "chain" + str(rep))
    tree_path = os.path.join(rep_path, "joint", "tree0")
    predicted = pd.read_csv(os.path.join(tree_path, "cluster_labels.tsv"), header=None, sep="\t", names=["ID", "CloneName"])
    # join() aligns on the row index (no key column is given), so this relies on
    # the predictions being listed in the same row order as gt -- TODO confirm.
    tbl_join = predicted.join(gt, lsuffix='_caller', rsuffix='_other')
    # Score only the SNVs whose ground-truth clone is one of the valid clones.
    ret_valid = tbl_join[tbl_join["CloneName_other"].isin(valid_clone_names)]
    labels_true = ret_valid["CloneName_other"]
    labels_pred = ret_valid["CloneName_caller"]

    ancestral_matrix = np.asarray(pd.read_csv(os.path.join(tree_path, "ancestral_matrix.csv"), header=None))
    ancestral_matrix_0 = ancestral_matrix[valid_idx]
    metrics[rep, 0] = v_measure_score(labels_true, labels_pred)
    metrics[rep, 1] = adjusted_rand_score(labels_true, labels_pred)
    metrics[rep, 2] = adjusted_mutual_info_score(labels_true, labels_pred)
    # Mean absolute difference between the predicted and true ancestral
    # matrices, restricted to the valid SNVs.
    metrics[rep, 3] = np.mean(np.abs(ancestral_matrix_0 - A0))

# Mean and standard deviation over chains, in the column order given above.
print(metrics.mean(0))
print(metrics.std(0))

[0.85877776 0.85090353 0.82749746 0.04978733]
[0.059001   0.10431945 0.07183817 0.03483031]


In [78]:
# Tag the scores currently held in `metrics` with their hyperparameter limits
# and stack them onto Supplementary Table 3.
temp = pd.DataFrame(metrics, columns=["Vmeasure", "AdjRandScore", "AdjMutualInfo", "AncReconErr"])
temp["lambda0 max"] = 1
temp["lambda max"] = 0.2
temp["gamma max"] = 0.5
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in 2.0; the stacked result is the same.
# NOTE: re-running this cell stacks the same rows onto supp_tbl3 again.
supp_tbl3 = pd.concat([supp_tbl3, temp])
supp_tbl3.shape

(80, 7)

In [37]:
# Source data for Table 1.
# Stack the PhylEx rows on top of the TSSB rows. pd.concat is used because
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
# NOTE(review): results_phylex holds whichever PhylEx run was tabulated last;
# confirm it is the intended setting before regenerating the file.
tbl1 = pd.concat([results_phylex, results_tssb])
# The row index only repeats the per-method chain numbers, so it is not written.
tbl1.to_csv("../../data/NatComm/Table1.csv", index=False)

In [79]:
# Source data for Supplementary Table 3.
# Written without the index: supp_tbl3 is built by stacking one frame per
# hyperparameter setting, so its index values repeat and carry no information.
supp_tbl3.to_csv("../../data/NatComm/SupplementaryTable3.csv", index=False)