In [1]:
# Run ExtractPhyloWGSResults.py and Rscripts/ProcessBSCITEResults.R first.

from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics.cluster import adjusted_mutual_info_score
from sklearn.metrics.cluster import v_measure_score

import pandas as pd
import numpy as np
import os

import sklearn
sklearn.__version__

'0.23.1'

In [2]:
data_path = "/Users/seonghwanjun/data/cell-line/bulk/OV2295/genotype/"

# Ground-truth table for the bulk data; this cell only uses its "CloneName" column.
gt = pd.read_csv(os.path.join(data_path, "gt.txt"), header=0, sep=" ")
valid_clone_names = ["A_B_C_D_E_F_G_H_I", "A_B_C_D", "A_B", "C_D", "A", "B", "C", "D", "E_F_G_H_I", "E_F", "E", "F"]

# Evaluate the ancestral metric.
# Build the true ancestral matrix. Each clone name is split on "_" into a set
# exactly once, instead of re-reading and re-splitting the row inside the
# inner loop (which repeated the same pandas lookup snv_count**2 times).
clone_sets = [set(name.split("_")) for name in gt["CloneName"]]
snv_count = gt.shape[0]
A = np.zeros(shape=(snv_count, snv_count))
for i, clone_i in enumerate(clone_sets):
    for j, clone_j in enumerate(clone_sets):
        # A[i, j] = 1 when SNV j's clone set is a proper subset of SNV i's
        # (`<` on sets means "subset and not equal").
        if clone_j < clone_i:
            A[i, j] = 1

# Row positions of the SNVs whose clone name is in valid_clone_names; np.ix_
# turns them into a (rows, cols) selector so A0 is the square sub-matrix
# restricted to those SNVs.
idx = np.flatnonzero(gt["CloneName"].isin(valid_clone_names).to_numpy())
valid_idx = np.ix_(idx, idx)
A0 = A[valid_idx]


In [3]:
# Our method.
# One row of `metrics` per chain; columns are
# V-measure, adjusted rand score, adjusted mutual info, ancestral metric.
rep_count = 20
metrics = np.zeros([rep_count, 4])
for rep in range(rep_count):
    # Previous location: "/Users/seonghwanjun/data/cell-line/bulk/OV2295/genotype/results/rep" + str(rep)
    rep_path = "/Users/seonghwanjun/PhylExAnalysis/_output/HGSOC/phylex/chain" + str(rep)
    tree_dir = rep_path + "/joint/tree0"

    predicted = pd.read_csv(tree_dir + "/cluster_labels.tsv", header=None, sep="\t", names=["ID", "CloneName"])
    # DataFrame.join matches on the index, so this pairs predictions with the
    # ground truth row by row.
    # NOTE(review): assumes both tables list the SNVs in the same order - confirm.
    tbl_join = predicted.join(gt, lsuffix='_caller', rsuffix='_other')
    ret = tbl_join.loc[tbl_join["CloneName_other"].notnull()]
    ret_valid = tbl_join.loc[tbl_join["CloneName_other"].isin(valid_clone_names)]

    # Restrict the predicted ancestral matrix to the same SNVs as A0.
    ancestral_matrix = np.asarray(pd.read_csv(tree_dir + "/ancestral_matrix.csv", header=None))
    ancestral_matrix_0 = ancestral_matrix[valid_idx]

    labels_true = ret_valid["CloneName_other"]
    labels_pred = ret_valid["CloneName_caller"]
    metrics[rep, 0] = v_measure_score(labels_true, labels_pred)
    metrics[rep, 1] = adjusted_rand_score(labels_true, labels_pred)
    metrics[rep, 2] = adjusted_mutual_info_score(labels_true, labels_pred)
    # Mean absolute entry-wise difference to the true ancestral sub-matrix.
    metrics[rep, 3] = np.abs(ancestral_matrix_0 - A0).mean()

# Per-column mean and standard deviation over the chains, then the raw table.
print(metrics.mean(0))
print(metrics.std(0))
print(metrics)

[0.87046922 0.87925848 0.8427254  0.03993384]
[0.03885913 0.06316747 0.0461935  0.02158111]
[[0.87329505 0.88976462 0.84461452 0.03638941]
 [0.89512617 0.90580255 0.87393241 0.03213611]
 [0.74920556 0.62172987 0.69092814 0.13090737]
 [0.90218254 0.90779706 0.8782858  0.03213611]
 [0.89512617 0.90580255 0.87393241 0.03213611]
 [0.89512617 0.90580255 0.87393241 0.03213611]
 [0.8683263  0.88960707 0.83554984 0.036862  ]
 [0.88968235 0.90566878 0.86488983 0.0326087 ]
 [0.88968235 0.90566878 0.86488983 0.0326087 ]
 [0.88968235 0.90566878 0.86488983 0.0326087 ]
 [0.77857314 0.81919142 0.74447363 0.04914934]
 [0.89512617 0.90580255 0.87393241 0.03213611]
 [0.86540286 0.8889076  0.84151696 0.02835539]
 [0.88968235 0.90566878 0.86488983 0.0326087 ]
 [0.87329505 0.88976462 0.84461452 0.03638941]
 [0.88968235 0.90566878 0.86488983 0.0326087 ]
 [0.87329505 0.88976462 0.84461452 0.03638941]
 [0.83391511 0.84165523 0.80022687 0.05151229]
 [0.88968235 0.90566878 0.86488983 0.0326087 ]
 [0.87329505 0.

In [5]:
# TSSB.
# Uses rep_count from the cell above. One row of `metrics` per replicate;
# columns are V-measure, adjusted rand score, adjusted mutual info, ancestral metric.
metrics = np.zeros([rep_count, 4])
for rep in range(rep_count):
    rep_path = "/Users/seonghwanjun/data/cell-line/bulk/OV2295/genotype/bulk_only/rep" + str(rep)
    tree_dir = rep_path + "/joint/tree0"

    predicted = pd.read_csv(tree_dir + "/cluster_labels.tsv", header=None, sep="\t", names=["ID", "CloneName"])
    # Index-based join: row k of the prediction is paired with row k of gt.
    # NOTE(review): assumes both tables list the SNVs in the same order - confirm.
    tbl_join = predicted.join(gt, lsuffix='_caller', rsuffix='_other')
    ret = tbl_join.loc[tbl_join["CloneName_other"].notnull()]
    ret_valid = tbl_join.loc[tbl_join["CloneName_other"].isin(valid_clone_names)]

    # Restrict the predicted ancestral matrix to the same SNVs as A0.
    ancestral_matrix = np.asarray(pd.read_csv(tree_dir + "/ancestral_matrix.csv", header=None))
    ancestral_matrix_0 = ancestral_matrix[valid_idx]

    labels_true = ret_valid["CloneName_other"]
    labels_pred = ret_valid["CloneName_caller"]
    metrics[rep, 0] = v_measure_score(labels_true, labels_pred)
    metrics[rep, 1] = adjusted_rand_score(labels_true, labels_pred)
    metrics[rep, 2] = adjusted_mutual_info_score(labels_true, labels_pred)
    # Mean absolute entry-wise difference to the true ancestral sub-matrix.
    metrics[rep, 3] = np.abs(ancestral_matrix_0 - A0).mean()

# Per-column mean and standard deviation over the replicates.
print(metrics.mean(0))
print(metrics.std(0))

[0.23721991 0.28376081 0.17966876 0.20370983]
[0.06754353 0.0773308  0.07521317 0.03178864]


In [6]:
# ddClone
pred_path = os.path.join(data_path, "ddClone")
# Space-separated results file; its columns are relabelled as ID, phi, CloneName.
predicted = pd.read_table(os.path.join(pred_path, "results.txt"), sep=" ")
predicted.columns = ["ID", "phi", "CloneName"]

# Index-based join: row k of the prediction is paired with row k of gt.
# NOTE(review): assumes both tables list the SNVs in the same order - confirm.
tbl_join = predicted.join(gt, lsuffix='_caller', rsuffix='_other')
ret = tbl_join.loc[tbl_join["CloneName_other"].notnull()]
ret_valid = tbl_join.loc[tbl_join["CloneName_other"].isin(valid_clone_names)]

# V-measure, adjusted rand score, adjusted mutual info (in that order).
labels_true = ret_valid["CloneName_other"]
labels_pred = ret_valid["CloneName_caller"]
for score in (v_measure_score, adjusted_rand_score, adjusted_mutual_info_score):
    print(score(labels_true, labels_pred))


0.4829870174461928
0.04580008626266983
0.0744216195098232


In [33]:
# B-SCITE:
pred_path = os.path.join(data_path, "B-SCITE")
clustering_prediction_file = os.path.join(pred_path, "results.txt")
# No column names are assigned here, so the file's own header has to provide
# the "CloneName" column (it becomes "CloneName_caller" after the join).
predicted = pd.read_table(clustering_prediction_file, sep=" ")

# Index-based join: row k of the prediction is paired with row k of gt.
# NOTE(review): assumes both tables list the SNVs in the same order - confirm.
tbl_join = predicted.join(gt, lsuffix='_caller', rsuffix='_other')
ret = tbl_join[tbl_join["CloneName_other"].notnull()]
ret_valid = tbl_join[tbl_join["CloneName_other"].isin(valid_clone_names)]

# Read the ancestral matrix line-by-line: the mutation count is the second
# whitespace-separated token of the first line, the second line is skipped,
# and the following mutation_count lines are the matrix rows.
with open(os.path.join(pred_path, "bscite.matrices"), "r") as f:
    mutation_count = int(f.readline().split()[1])
    f.readline()
    bscite_rows = [f.readline().split() for _ in range(mutation_count)]

# Keep the prediction under its own name. The original code stored it in `A`,
# which silently replaced the ground-truth ancestral matrix built in the
# ground-truth cell and made the notebook depend on cell execution order.
bscite_ancestral = np.asarray(bscite_rows, dtype=int)[valid_idx]

# V-measure, adjusted rand score, adjusted mutual info, ancestral metric.
labels_true = ret_valid["CloneName_other"]
labels_pred = ret_valid["CloneName_caller"]
print(v_measure_score(labels_true, labels_pred))
print(adjusted_rand_score(labels_true, labels_pred))
print(adjusted_mutual_info_score(labels_true, labels_pred))
print(np.mean(np.abs(bscite_ancestral - A0)))


0.4301579869830684
0.08168476438575224
0.20509209020698457
0.2556710775047259


In [7]:
# Compute the metrics on Canopy.
canopy_dir = os.path.join(data_path, "canopy")
clustering_file = os.path.join(canopy_dir, "predicted.csv")
ancestral_matrix_file = os.path.join(canopy_dir, "ancestral_matrix.csv")

# The clustering file's two columns are relabelled as ID and CloneName.
predicted = pd.read_csv(clustering_file)
predicted.columns = ["ID", "CloneName"]

# Index-based join: row k of the prediction is paired with row k of gt.
# NOTE(review): assumes both tables list the SNVs in the same order - confirm.
tbl_join = predicted.join(gt, lsuffix='_caller', rsuffix='_other')
ret = tbl_join.loc[tbl_join["CloneName_other"].notnull()]
ret_valid = tbl_join.loc[tbl_join["CloneName_other"].isin(valid_clone_names)]

# Space-separated, header-less matrix, cut down to the same SNVs as A0.
ancestral_matrix = np.asarray(pd.read_table(ancestral_matrix_file, header=None, sep=" "))[valid_idx]

# V-measure, adjusted rand score, adjusted mutual info, then the ancestral metric.
labels_true = ret_valid["CloneName_other"]
labels_pred = ret_valid["CloneName_caller"]
for score in (v_measure_score, adjusted_rand_score, adjusted_mutual_info_score):
    print(score(labels_true, labels_pred))
print(np.mean(np.abs(ancestral_matrix - A0)))


0.447844219827592
0.2823002055523911
0.26238513702386707
0.19092627599243855


In [9]:
# PhyloWGS:
phylowgs_dir = os.path.join(data_path, "phylowgs")
clustering_file = os.path.join(phylowgs_dir, "clustering.txt")
ancestral_matrix_file = os.path.join(phylowgs_dir, "ancestral_matrix.txt")

# Both files are read as space-separated and header-less.
predicted = pd.read_table(clustering_file, header=None, names=["ID", "CloneName"], sep=" ")
ancestral_matrix = np.asarray(pd.read_table(ancestral_matrix_file, header=None, sep=" "))

# Index-based join: row k of the prediction is paired with row k of gt.
# NOTE(review): assumes both tables list the SNVs in the same order - confirm.
tbl_join = predicted.join(gt, lsuffix='_caller', rsuffix='_other')
ret = tbl_join.loc[tbl_join["CloneName_other"].notnull()]
ret_valid = tbl_join.loc[tbl_join["CloneName_other"].isin(valid_clone_names)]

# NOTE(review): the recorded run printed ~2.6e-16 / 0.0 / ~2.6e-16 for the three
# clustering scores, which looks degenerate (e.g. a single predicted cluster, or
# a parsing/alignment problem in clustering.txt) - verify before reporting.
# V-measure, adjusted rand score, adjusted mutual info, then the ancestral metric.
labels_true = ret_valid["CloneName_other"]
labels_pred = ret_valid["CloneName_caller"]
for score in (v_measure_score, adjusted_rand_score, adjusted_mutual_info_score):
    print(score(labels_true, labels_pred))
print(np.mean(np.abs(A0 - ancestral_matrix[valid_idx])))


2.60577903472376e-16
0.0
2.6057790347237605e-16
0.2684310018903592


## Next, we compare the methods on the 10X data
- PhylEx
- TSSB
- Canopy

In [14]:
data_path = "/Users/seonghwanjun/PhylExAnalysis/_output/HGSOC_10X/"

# Ground-truth table for the 10X data; this cell only uses its "CloneName" column.
gt = pd.read_csv("/Users/seonghwanjun/PhylExAnalysis/data/HGSOC_10X_gt.txt", header=0, sep=" ")
valid_clone_names = ["A_B_C_D_E_F_G_H_I", "A_B_C_D", "A_B", "C_D", "A", "B", "C", "D", "E_F_G_H_I", "E_F", "E", "F"]

# Evaluate the ancestral metric.
# Build the true ancestral matrix. Each clone name is split on "_" into a set
# exactly once, instead of re-reading and re-splitting the row inside the
# inner loop (which repeated the same pandas lookup snv_count**2 times).
clone_sets = [set(name.split("_")) for name in gt["CloneName"]]
snv_count = gt.shape[0]
A = np.zeros(shape=(snv_count, snv_count))
for i, clone_i in enumerate(clone_sets):
    for j, clone_j in enumerate(clone_sets):
        # A[i, j] = 1 when SNV j's clone set is a proper subset of SNV i's
        # (`<` on sets means "subset and not equal").
        if clone_j < clone_i:
            A[i, j] = 1

# Row positions of the SNVs whose clone name is in valid_clone_names; np.ix_
# turns them into a (rows, cols) selector so A0 is the square sub-matrix
# restricted to those SNVs.
idx = np.flatnonzero(gt["CloneName"].isin(valid_clone_names).to_numpy())
valid_idx = np.ix_(idx, idx)
A0 = A[valid_idx]


In [16]:
# Our method
# One row of `metrics` per chain; columns are
# V-measure, adjusted rand score, adjusted mutual info, ancestral metric.
rep_count = 20
metrics = np.zeros([rep_count, 4])
for rep in range(rep_count):
    rep_path = "/Users/seonghwanjun/PhylExAnalysis/_output/HGSOC_10X/phylex/chain" + str(rep)
    tree_dir = rep_path + "/joint/tree0"

    predicted = pd.read_csv(tree_dir + "/cluster_labels.tsv", header=None, sep="\t", names=["ID", "CloneName"])
    # Index-based join: row k of the prediction is paired with row k of gt.
    # NOTE(review): assumes both tables list the SNVs in the same order - confirm.
    tbl_join = predicted.join(gt, lsuffix='_caller', rsuffix='_other')
    ret = tbl_join.loc[tbl_join["CloneName_other"].notnull()]
    ret_valid = tbl_join.loc[tbl_join["CloneName_other"].isin(valid_clone_names)]

    # Restrict the predicted ancestral matrix to the same SNVs as A0.
    ancestral_matrix = np.asarray(pd.read_csv(tree_dir + "/ancestral_matrix.csv", header=None))
    ancestral_matrix_0 = ancestral_matrix[valid_idx]

    labels_true = ret_valid["CloneName_other"]
    labels_pred = ret_valid["CloneName_caller"]
    metrics[rep, 0] = v_measure_score(labels_true, labels_pred)
    metrics[rep, 1] = adjusted_rand_score(labels_true, labels_pred)
    metrics[rep, 2] = adjusted_mutual_info_score(labels_true, labels_pred)
    # Mean absolute entry-wise difference to the true ancestral sub-matrix.
    metrics[rep, 3] = np.abs(ancestral_matrix_0 - A0).mean()

# Per-column mean and standard deviation over the chains, then the raw table.
print(metrics.mean(0))
print(metrics.std(0))
print(metrics)

[0.35924743 0.20642445 0.26550135 0.23321762]
[0.04175973 0.03233021 0.03756476 0.01285182]
[[0.3452676  0.19168738 0.25093872 0.23646102]
 [0.32491389 0.20821186 0.25210552 0.22634398]
 [0.36825671 0.21408401 0.26479319 0.22416187]
 [0.29516966 0.18575727 0.20430549 0.24042849]
 [0.37108071 0.19501037 0.28034681 0.22812934]
 [0.34770504 0.17285554 0.26361354 0.2388415 ]
 [0.35978674 0.19345951 0.25519235 0.24042849]
 [0.41007804 0.21194732 0.30368761 0.22912121]
 [0.26746092 0.14573752 0.17549935 0.26304305]
 [0.31072338 0.15471751 0.23509679 0.25609998]
 [0.37312503 0.22830871 0.27131184 0.22832771]
 [0.39147698 0.21611203 0.28906949 0.24320571]
 [0.3584095  0.22231637 0.2654479  0.23090657]
 [0.32185852 0.21407826 0.22319856 0.23606427]
 [0.36133818 0.16616436 0.26669551 0.24439595]
 [0.41767983 0.21659528 0.32312607 0.22733585]
 [0.37850337 0.23514835 0.28956632 0.22475699]
 [0.41098739 0.29310614 0.30836245 0.20273755]
 [0.3345835  0.22231549 0.25170688 0.21920254]
 [0.43654356 0.

In [17]:
# TSSB
# One row of `metrics` per chain; columns are
# V-measure, adjusted rand score, adjusted mutual info, ancestral metric.
rep_count = 20
metrics = np.zeros([rep_count, 4])
for rep in range(rep_count):
    rep_path = "/Users/seonghwanjun/PhylExAnalysis/_output/HGSOC_10X/tssb/chain" + str(rep)
    tree_dir = rep_path + "/joint/tree0"

    predicted = pd.read_csv(tree_dir + "/cluster_labels.tsv", header=None, sep="\t", names=["ID", "CloneName"])
    # Index-based join: row k of the prediction is paired with row k of gt.
    # NOTE(review): assumes both tables list the SNVs in the same order - confirm.
    tbl_join = predicted.join(gt, lsuffix='_caller', rsuffix='_other')
    ret = tbl_join.loc[tbl_join["CloneName_other"].notnull()]
    ret_valid = tbl_join.loc[tbl_join["CloneName_other"].isin(valid_clone_names)]

    # Restrict the predicted ancestral matrix to the same SNVs as A0.
    ancestral_matrix = np.asarray(pd.read_csv(tree_dir + "/ancestral_matrix.csv", header=None))
    ancestral_matrix_0 = ancestral_matrix[valid_idx]

    labels_true = ret_valid["CloneName_other"]
    labels_pred = ret_valid["CloneName_caller"]
    metrics[rep, 0] = v_measure_score(labels_true, labels_pred)
    metrics[rep, 1] = adjusted_rand_score(labels_true, labels_pred)
    metrics[rep, 2] = adjusted_mutual_info_score(labels_true, labels_pred)
    # Mean absolute entry-wise difference to the true ancestral sub-matrix.
    metrics[rep, 3] = np.abs(ancestral_matrix_0 - A0).mean()

# Per-column mean and standard deviation over the chains, then the raw table.
print(metrics.mean(0))
print(metrics.std(0))
print(metrics)

[0.2031598  0.15447029 0.1564563  0.23805793]
[0.04313064 0.04902003 0.04950668 0.02520797]
[[0.23302675 0.16781757 0.18234865 0.23963499]
 [0.17750419 0.10614669 0.12068581 0.24796667]
 [0.13716754 0.05611451 0.06413609 0.30470145]
 [0.26290119 0.22004112 0.22848329 0.20710177]
 [0.17091626 0.15671862 0.13245661 0.23209681]
 [0.15077375 0.10912486 0.09018903 0.2685975 ]
 [0.18207864 0.1107243  0.1281407  0.26046419]
 [0.19414306 0.14720837 0.14037971 0.25709185]
 [0.20755505 0.15229581 0.17162796 0.2269391 ]
 [0.27839628 0.25187088 0.24430484 0.20075382]
 [0.25879597 0.22400507 0.20982058 0.20511803]
 [0.2150326  0.16893296 0.1791509  0.22297163]
 [0.26072911 0.20274493 0.21188146 0.2209879 ]
 [0.124174   0.10814338 0.08383262 0.25431462]
 [0.22559928 0.18185054 0.18992964 0.21821067]
 [0.14955087 0.08376118 0.09396274 0.26502678]
 [0.20755505 0.15229581 0.17162796 0.2269391 ]
 [0.22559928 0.18185054 0.18992964 0.21821067]
 [0.22366269 0.1857212  0.17217865 0.22931958]
 [0.17803435 0.

In [18]:
# Compute the metrics on Canopy.
canopy_dir = os.path.join(data_path, "canopy")
clustering_file = os.path.join(canopy_dir, "predicted.csv")
ancestral_matrix_file = os.path.join(canopy_dir, "ancestral_matrix.csv")

# The clustering file's two columns are relabelled as ID and CloneName.
predicted = pd.read_csv(clustering_file)
predicted.columns = ["ID", "CloneName"]

# Index-based join: row k of the prediction is paired with row k of gt.
# NOTE(review): assumes both tables list the SNVs in the same order - confirm.
tbl_join = predicted.join(gt, lsuffix='_caller', rsuffix='_other')
ret = tbl_join.loc[tbl_join["CloneName_other"].notnull()]
ret_valid = tbl_join.loc[tbl_join["CloneName_other"].isin(valid_clone_names)]

# Space-separated, header-less matrix, cut down to the same SNVs as A0.
ancestral_matrix = np.asarray(pd.read_table(ancestral_matrix_file, header=None, sep=" "))[valid_idx]

# V-measure, adjusted rand score, adjusted mutual info, then the ancestral metric.
labels_true = ret_valid["CloneName_other"]
labels_pred = ret_valid["CloneName_caller"]
for score in (v_measure_score, adjusted_rand_score, adjusted_mutual_info_score):
    print(score(labels_true, labels_pred))
print(np.mean(np.abs(ancestral_matrix - A0)))


0.30457639884702015
0.17605997183999517
0.1684235272613015
0.24479269986113866
