In [1]:
import pandas as pd
import numpy as np
import pathlib
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import RocCurveDisplay
from sklearn.decomposition import NMF
import warnings
warnings.filterwarnings("ignore")
from sklearn.mixture import GaussianMixture
import os
import ot
import pickle
import argparse
import Levenshtein
import itertools
from helper_functions import *

# ---------------------------------------------------------------------------
# Run identifiers
# ---------------------------------------------------------------------------
PROJECT = "gs-mrd"
release_version = "09122024"
merge_version = "20240914"

# ---------------------------------------------------------------------------
# Filesystem layout (everything hangs off `path_to_main_src`)
# ---------------------------------------------------------------------------
path_to_main_src = "/media/hieunguyen/HNSD01/src/gs-mrd"
path_to_merge_samples = "/".join((path_to_main_src, "all_samples", merge_version))
path_to_save_output = os.path.join(path_to_main_src, "output")

# Collect every CSV located exactly one sub-directory below <output>/02_output.
fold_output_dir = pathlib.Path(path_to_save_output) / "02_output"
all_fold_res = list(fold_output_dir.glob("*/*.csv"))

In [2]:

# (suffixed name, base name) pairs applied in this order to every feature-combination
# string, so that e.g. "NMF_FLEN_1" and "NMF_FLEN_2" both become "NMF_FLEN".
NMF_SUFFIX_ALIASES = (
    ("NMF_FLEN_1", "NMF_FLEN"),
    ("NMF_FLEN_2", "NMF_FLEN"),
    ("NMF_NUCLEOSOME_1", "NMF_NUCLEOSOME"),
    ("NMF_NUCLEOSOME_2", "NMF_NUCLEOSOME"),
    ("NMF_EM_1", "NMF_EM"),
    ("NMF_EM_2", "NMF_EM"),
)


def _normalize_feature_names(combo):
    """Return `combo` with every suffixed NMF feature name replaced by its base name."""
    for suffixed, base in NMF_SUFFIX_ALIASES:
        combo = combo.replace(suffixed, base)
    return combo


def _load_fold_result(csv_path):
    """Read one per-fold result CSV.

    Parameters
    ----------
    csv_path : pathlib.Path
        File named ``resdf_fold<id>.csv``; its first column is used as the index and
        the remaining three columns are renamed positionally.

    Returns
    -------
    pandas.DataFrame
        Columns ``feat``, ``SEN_fold<id>``, ``SPEC_fold<id>``, with the NMF feature
        names in ``feat`` normalized.
    """
    fold_id = csv_path.name.replace("resdf_fold", "").replace(".csv", "")
    fold_res = pd.read_csv(csv_path, index_col=[0])
    fold_res["feature_combinations"] = fold_res["feature_combinations"].apply(_normalize_feature_names)
    fold_res.columns = ["feat", f"SEN_fold{fold_id}", f"SPEC_fold{fold_id}"]
    return fold_res


# The glob result has no guaranteed order; sorting fixes which file seeds the merge,
# so the row order and index labels of `folddf` are the same on every run.
fold_files = sorted(all_fold_res)
if not fold_files:
    raise FileNotFoundError("all_fold_res is empty: no per-fold result CSV files were found")

folddf = _load_fold_result(fold_files[0])
for fold_file in fold_files[1:]:
    # Default inner join: a feature combination missing from any fold is dropped.
    # NOTE(review): if normalization makes two combinations within one fold identical,
    # this join becomes many-to-many and duplicates rows -- confirm `feat` is unique per fold.
    folddf = folddf.merge(_load_fold_result(fold_file), on="feat")

# Average across folds, selecting fold columns by the exact prefix assigned above.
sen_cols = [col for col in folddf.columns if col.startswith("SEN_fold")]
spec_cols = [col for col in folddf.columns if col.startswith("SPEC_fold")]
folddf["avg_SEN"] = folddf[sen_cols].mean(axis=1)
folddf["avg_SPEC"] = folddf[spec_cols].mean(axis=1)

folddf = folddf[["feat", "avg_SEN", "avg_SPEC"]]

In [9]:
# Feature combinations whose avg_SPEC is at least 0.95, ordered by avg_SPEC and then
# avg_SEN, both descending.
high_spec_mask = folddf["avg_SPEC"].ge(0.95)
folddf.loc[high_spec_mask].sort_values(by=["avg_SPEC", "avg_SEN"], ascending=False)

Unnamed: 0,feat,avg_SEN,avg_SPEC
486,"OT_FLEN,OT_NUCLEOSOME",0.204195,0.973216
487,"OT_NUCLEOSOME,NMF_EM",0.215163,0.965054
488,"OT_NUCLEOSOME,NMF_NUCLEOSOME",0.219208,0.963559
3,"OT_EM,NMF_EM",0.236552,0.960735
498,"OT_FLEN,NMF_NUCLEOSOME",0.199133,0.960424
466,"OT_FLEN,NMF_EM",0.202124,0.960293
489,"NMF_EM,NMF_NUCLEOSOME",0.18896,0.959696
473,"OT_FLEN,OT_NUCLEOSOME,NMF_NUCLEOSOME",0.235448,0.957681
438,"OT_FLEN,OT_NUCLEOSOME,NMF_EM",0.239193,0.956998
428,"OT_EM,OT_NUCLEOSOME",0.281558,0.956986


In [10]:
# Every feature combination, ordered by avg_SPEC and then avg_SEN, both descending.
ranking_keys = ["avg_SPEC", "avg_SEN"]
folddf.sort_values(by=ranking_keys, ascending=False)

Unnamed: 0,feat,avg_SEN,avg_SPEC
486,"OT_FLEN,OT_NUCLEOSOME",0.204195,0.973216
487,"OT_NUCLEOSOME,NMF_EM",0.215163,0.965054
488,"OT_NUCLEOSOME,NMF_NUCLEOSOME",0.219208,0.963559
3,"OT_EM,NMF_EM",0.236552,0.960735
498,"OT_FLEN,NMF_NUCLEOSOME",0.199133,0.960424
...,...,...,...
1567,"EM_score,FLEN_score,NUCLEOSOME_score,EM_shanno...",0.617573,0.745919
1584,"EM_score,FLEN_score,NUCLEOSOME_score,EM_shanno...",0.620416,0.745367
1587,"EM_score,FLEN_score,NUCLEOSOME_score,EM_shanno...",0.620416,0.745367
1571,"EM_score,FLEN_score,NUCLEOSOME_score,EM_shanno...",0.618621,0.745367


In [3]:
# Combinations whose `feat` string mentions both NMF_FLEN and ichorCNA,
# ordered by avg_SPEC descending.
has_nmf_flen = folddf["feat"].str.contains("NMF_FLEN")
has_ichorcna = folddf["feat"].str.contains("ichorCNA")
folddf[has_nmf_flen & has_ichorcna].sort_values("avg_SPEC", ascending=False)

Unnamed: 0,feat,avg_SEN,avg_SPEC
750,"ichorCNA,NMF_FLEN",0.538038,0.827040
753,"ichorCNA,NMF_FLEN,NMF_NUCLEOSOME",0.540663,0.825473
751,"FLEN_score,ichorCNA,NMF_FLEN",0.538831,0.824355
755,"FLEN_score,ichorCNA,NMF_FLEN,NMF_NUCLEOSOME",0.541456,0.822788
752,"OT_FLEN,ichorCNA,NMF_FLEN",0.543767,0.818986
...,...,...,...
1581,"EM_score,FLEN_score,NUCLEOSOME_score,EM_shanno...",0.619368,0.745919
1583,"EM_score,FLEN_score,NUCLEOSOME_score,EM_shanno...",0.618621,0.745367
1584,"EM_score,FLEN_score,NUCLEOSOME_score,EM_shanno...",0.620416,0.745367
1587,"EM_score,FLEN_score,NUCLEOSOME_score,EM_shanno...",0.620416,0.745367


In [None]:
# Export the summary table to Excel with a colour gradient on the two average columns.
# NOTE(review): the file name is relative, so the workbook lands in the current working
# directory -- confirm whether `path_to_save_output` was the intended destination.
styled_folddf = folddf.style.background_gradient(subset=["avg_SPEC", "avg_SEN"])
styled_folddf.to_excel("resdf_folds.color.xlsx", index=False)