In [4]:
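# Prerequisite: run ExtractPhyloWGSResults.py and Rscripts/ProcessBSCITEResults.R before executing this notebook
# (presumably they produce the PhyloWGS and B-SCITE files read by later cells -- TODO confirm).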

from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics.cluster import adjusted_mutual_info_score
from sklearn.metrics.cluster import v_measure_score

import pandas as pd
import numpy as np
import os

# Root directory of the OV2295 genotype data: the ground truth file and the
# per-method result folders used throughout this notebook are resolved from it.
# Defaults to the original author's location; set the OV2295_DATA_PATH
# environment variable to run the notebook against a copy stored elsewhere.
data_path = os.environ.get("OV2295_DATA_PATH", "/Users/seonghwanjun/data/cell-line/bulk/OV2295/genotype/")

import sklearn
sklearn.__version__

'0.23.1'

In [2]:
# Ground truth: one row per SNV with (at least) a "CloneName" column whose
# value is an underscore-separated list of clone labels, e.g. "A_B_C_D".
# The file sits directly under data_path, so build the path from it rather
# than repeating the absolute location.
gt = pd.read_csv(os.path.join(data_path, "gt.txt"), header=0, sep=" ")
valid_clone_names = ["A_B_C_D_E_F_G_H_I", "A_B_C_D", "A_B", "C_D", "A", "B", "C", "D", "E_F_G_H_I", "E_F", "E", "F"]

# Evaluate the ancestral metric.
# Build the true ancestral matrix: A[i, j] = 1 exactly when the clone set of
# SNV j is a proper subset of the clone set of SNV i. There are not many SNVs,
# so a plain double loop is fine -- but parse every clone name only once
# instead of re-reading the row and re-splitting it inside the inner loop.
snv_count = gt.shape[0]
clone_sets = [set(name.split("_")) for name in gt["CloneName"]]
A = np.zeros(shape=(snv_count, snv_count))
for i, clone_i in enumerate(clone_sets):
    for j, clone_j in enumerate(clone_sets):
        # `<` on sets is the proper-subset test (subset and not equal).
        if clone_j < clone_i:
            A[i, j] = 1

# Positions of the SNVs whose ground-truth clone is one of valid_clone_names;
# valid_idx selects the matching rows AND columns of any SNV x SNV matrix.
idx = np.flatnonzero(gt["CloneName"].isin(valid_clone_names).to_numpy())
valid_idx = np.ix_(idx, idx)
A0 = A[valid_idx]


In [6]:
# Our method.
# For each replicate, compare the predicted clustering and ancestral matrix
# against the ground truth (gt, valid_clone_names, valid_idx and A0 come from
# the ground-truth cell above).
rep_count = 20
# One row per replicate; columns are
# V-measure, adjusted rand score, adjusted mutual info, ancestral metric.
metrics = np.zeros([rep_count, 4])
for rep in range(rep_count):
    # Resolve the replicate folder from data_path, like the later cells do.
    rep_path = os.path.join(data_path, "results", "rep" + str(rep))
    tree_path = os.path.join(rep_path, "joint", "tree0")

    predicted = pd.read_csv(
        os.path.join(tree_path, "cluster_labels.tsv"),
        header=None, sep="\t", names=["ID", "CloneName"],
    )
    # DataFrame.join pairs rows by index, so prediction row k is matched with
    # ground-truth row k. NOTE(review): this relies on both files listing the
    # SNVs in the same order -- confirm.
    tbl_join = predicted.join(gt, lsuffix='_caller', rsuffix='_other')
    # Score only the SNVs whose true clone is one of valid_clone_names.
    ret_valid = tbl_join[tbl_join["CloneName_other"].isin(valid_clone_names)]
    truth = ret_valid["CloneName_other"]
    called = ret_valid["CloneName_caller"]

    ancestral_matrix = pd.read_csv(os.path.join(tree_path, "ancestral_matrix.csv"), header=None)
    ancestral_matrix = np.asarray(ancestral_matrix)
    ancestral_matrix_0 = ancestral_matrix[valid_idx]

    metrics[rep, 0] = v_measure_score(truth, called)
    metrics[rep, 1] = adjusted_rand_score(truth, called)
    metrics[rep, 2] = adjusted_mutual_info_score(truth, called)
    # Mean absolute entry-wise difference to the true ancestral matrix.
    metrics[rep, 3] = np.mean(np.abs(ancestral_matrix_0 - A0))

# V-measure, adjusted rand score, adjusted mutual info, ancestral metric.
# First line: mean over replicates; second line: standard deviation.
print(metrics.mean(0))
print(metrics.std(0))

[0.86594364 0.88898542 0.84137349 0.02906427]
[0.00180256 0.0002348  0.00149649 0.00212665]


In [5]:
# PhyloSub.
# Same evaluation as the previous cell, but on the replicates stored under
# "bulk_only". Reuses rep_count from the previous cell and gt,
# valid_clone_names, valid_idx and A0 from the ground-truth cell.
# One row per replicate; columns are
# V-measure, adjusted rand score, adjusted mutual info, ancestral metric.
metrics = np.zeros([rep_count, 4])
for rep in range(rep_count):
    # Resolve the replicate folder from data_path, like the later cells do.
    rep_path = os.path.join(data_path, "bulk_only", "rep" + str(rep))
    tree_path = os.path.join(rep_path, "joint", "tree0")

    predicted = pd.read_csv(
        os.path.join(tree_path, "cluster_labels.tsv"),
        header=None, sep="\t", names=["ID", "CloneName"],
    )
    # DataFrame.join pairs rows by index, so prediction row k is matched with
    # ground-truth row k. NOTE(review): this relies on both files listing the
    # SNVs in the same order -- confirm.
    tbl_join = predicted.join(gt, lsuffix='_caller', rsuffix='_other')
    # Score only the SNVs whose true clone is one of valid_clone_names.
    ret_valid = tbl_join[tbl_join["CloneName_other"].isin(valid_clone_names)]
    truth = ret_valid["CloneName_other"]
    called = ret_valid["CloneName_caller"]

    ancestral_matrix = pd.read_csv(os.path.join(tree_path, "ancestral_matrix.csv"), header=None)
    ancestral_matrix = np.asarray(ancestral_matrix)
    ancestral_matrix_0 = ancestral_matrix[valid_idx]

    metrics[rep, 0] = v_measure_score(truth, called)
    metrics[rep, 1] = adjusted_rand_score(truth, called)
    metrics[rep, 2] = adjusted_mutual_info_score(truth, called)
    # Mean absolute entry-wise difference to the true ancestral matrix.
    metrics[rep, 3] = np.mean(np.abs(ancestral_matrix_0 - A0))

# V-measure, adjusted rand score, adjusted mutual info, ancestral metric.
# First line: mean over replicates; second line: standard deviation.
print(metrics.mean(0))
print(metrics.std(0))

[0.23721991 0.28376081 0.17966876 0.20370983]
[0.06754353 0.0773308  0.07521317 0.03178864]


In [31]:
# ddClone
# Clustering-only evaluation: no ancestral matrix is read for this method.
pred_path = os.path.join(data_path, "ddClone")
ddclone_results_file = os.path.join(pred_path, "results.txt")
predicted = pd.read_table(ddclone_results_file, sep=" ")
predicted.columns = ["ID", "phi", "CloneName"]

# Rows are paired with the ground truth by index; overlapping column names
# get the "_caller" (prediction) / "_other" (ground truth) suffixes.
tbl_join = predicted.join(gt, lsuffix='_caller', rsuffix='_other')
has_truth = tbl_join["CloneName_other"].notnull()
ret = tbl_join[has_truth]
in_valid_clone = tbl_join["CloneName_other"].isin(valid_clone_names)
ret_valid = tbl_join[in_valid_clone]

# Printed in order: V-measure, adjusted rand score, adjusted mutual info.
truth, called = ret_valid["CloneName_other"], ret_valid["CloneName_caller"]
for score in (v_measure_score, adjusted_rand_score, adjusted_mutual_info_score):
    print(score(truth, called))


0.4829870174461928
0.04580008626266983
0.0744216195098232


In [33]:
# B-SCITE:
# Evaluate B-SCITE's clustering and ancestral matrix against the ground truth
# (gt, valid_clone_names, valid_idx and A0 come from the ground-truth cell).
pred_path = os.path.join(data_path, "B-SCITE")
clustering_prediction_file = os.path.join(pred_path, "results.txt")
predicted = pd.read_table(clustering_prediction_file, sep=" ")

# Rows are paired with the ground truth by index; overlapping column names
# get the "_caller" (prediction) / "_other" (ground truth) suffixes.
tbl_join = predicted.join(gt, lsuffix='_caller', rsuffix='_other')
ret_valid = tbl_join[tbl_join["CloneName_other"].isin(valid_clone_names)]
truth = ret_valid["CloneName_other"]
called = ret_valid["CloneName_caller"]

# Read the ancestral matrix line-by-line.
# The second whitespace-separated token of the first line is taken as the
# number of mutations, the following line is skipped, and the next
# `mutation_count` lines are the matrix rows.
with open(os.path.join(pred_path, "bscite.matrices"), "r") as f:
    header = f.readline()
    mutation_count = int(header.split()[1])
    f.readline()
    bscite_rows = [f.readline().split() for _ in range(mutation_count)]

# Keep B-SCITE's matrix under its own name so the ground-truth matrix `A`
# built earlier in the notebook is not overwritten by this cell.
bscite_ancestral = np.asarray(bscite_rows, dtype=int)[valid_idx]

# Printed in order: V-measure, adjusted rand score, adjusted mutual info,
# ancestral metric (mean absolute difference to the true matrix).
print(v_measure_score(truth, called))
print(adjusted_rand_score(truth, called))
print(adjusted_mutual_info_score(truth, called))
print(np.mean(np.abs(bscite_ancestral - A0)))


0.4301579869830684
0.08168476438575224
0.20509209020698457
0.2556710775047259


In [35]:
# Compute the metrics on Canopy.
canopy_dir = os.path.join(data_path, "canopy")
clustering_file = os.path.join(canopy_dir, "predicted.csv")
ancestral_matrix_file = os.path.join(canopy_dir, "ancestral_matrix.csv")

predicted = pd.read_csv(clustering_file)
predicted.columns = ["ID", "CloneName"]

# Rows are paired with the ground truth by index; overlapping column names
# get the "_caller" (prediction) / "_other" (ground truth) suffixes.
tbl_join = predicted.join(gt, lsuffix='_caller', rsuffix='_other')
has_truth = tbl_join["CloneName_other"].notnull()
ret = tbl_join[has_truth]
in_valid_clone = tbl_join["CloneName_other"].isin(valid_clone_names)
ret_valid = tbl_join[in_valid_clone]

# Space-separated, header-less matrix; keep only the rows/columns of the
# SNVs selected by valid_idx.
canopy_table = pd.read_table(ancestral_matrix_file, header=None, sep=" ")
ancestral_matrix = np.asarray(canopy_table)[valid_idx]

# Printed in order: V-measure, adjusted rand score, adjusted mutual info,
# ancestral metric (mean absolute difference to the true matrix A0).
truth, called = ret_valid["CloneName_other"], ret_valid["CloneName_caller"]
for score in (v_measure_score, adjusted_rand_score, adjusted_mutual_info_score):
    print(score(truth, called))
print(np.mean(np.abs(ancestral_matrix - A0)))


0.4939098826345035
0.3857127534055561
0.32674744749201423
0.1781663516068053


In [36]:
# PhyloWGS:
# NOTE(review): the output recorded for this cell shows clustering scores of
# ~0 (V-measure ~2.6e-16, adjusted rand 0.0). Worth confirming that
# clustering.txt is parsed as intended (separator, header) and that its rows
# are in the same order as gt before trusting these numbers.
phylowgs_dir = os.path.join(data_path, "phylowgs")
clustering_file = os.path.join(phylowgs_dir, "clustering.txt")
ancestral_matrix_file = os.path.join(phylowgs_dir, "ancestral_matrix.txt")

predicted = pd.read_table(clustering_file, header=None, names=["ID", "CloneName"], sep=" ")
phylowgs_table = pd.read_table(ancestral_matrix_file, header=None, sep=" ")
ancestral_matrix = np.asarray(phylowgs_table)

# Rows are paired with the ground truth by index; overlapping column names
# get the "_caller" (prediction) / "_other" (ground truth) suffixes.
tbl_join = predicted.join(gt, lsuffix='_caller', rsuffix='_other')
has_truth = tbl_join["CloneName_other"].notnull()
ret = tbl_join[has_truth]
in_valid_clone = tbl_join["CloneName_other"].isin(valid_clone_names)
ret_valid = tbl_join[in_valid_clone]

# Printed in order: V-measure, adjusted rand score, adjusted mutual info,
# ancestral metric (mean absolute difference to the true matrix A0).
truth, called = ret_valid["CloneName_other"], ret_valid["CloneName_caller"]
for score in (v_measure_score, adjusted_rand_score, adjusted_mutual_info_score):
    print(score(truth, called))
print(np.mean(np.abs(A0 - ancestral_matrix[valid_idx])))


2.60577903472376e-16
0.0
2.6057790347237605e-16
0.2684310018903592
