In [1]:
import pandas as pd
import numpy as np
import pathlib
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import RocCurveDisplay
from sklearn.decomposition import NMF
import warnings
warnings.filterwarnings("ignore")
from sklearn.mixture import GaussianMixture
import os
import ot
import pickle
import argparse
import Levenshtein
import itertools
from helper_functions import *

##### input args
PROJECT = "gs-mrd"
release_version = "09122024"
merge_version = "20240914"

##### configurations/paths
# NOTE(review): machine-specific absolute path; the notebook only runs where
# this mount point exists. Consider making it configurable.
path_to_main_src = "/media/hieunguyen/HNSD01/src/gs-mrd"
path_to_merge_samples = f"{path_to_main_src}/all_samples/{merge_version}"

path_to_save_output = os.path.join(path_to_main_src, "output")

# One CSV per fold, located one directory level below "02_output".
# Path.glob yields entries in filesystem-dependent order, so the result is
# sorted to make "the first file" (and everything derived from the iteration
# order downstream) reproducible across machines and runs.
all_fold_res = sorted(pathlib.Path(path_to_save_output, "02_output").glob("*/*.csv"))

In [2]:

# Suffixed NMF feature names and the un-suffixed name each one collapses to.
# Applied in this exact order to every "feature_combinations" string.
_NMF_SUFFIXED_TO_BASE = (
    ("NMF_FLEN_1", "NMF_FLEN"),
    ("NMF_FLEN_2", "NMF_FLEN"),
    ("NMF_NUCLEOSOME_1", "NMF_NUCLEOSOME"),
    ("NMF_NUCLEOSOME_2", "NMF_NUCLEOSOME"),
    ("NMF_EM_1", "NMF_EM"),
    ("NMF_EM_2", "NMF_EM"),
)


def _normalize_nmf_names(feature_combination):
    """Strip the `_1` / `_2` suffix from NMF feature names in one combination string."""
    for suffixed, base in _NMF_SUFFIXED_TO_BASE:
        feature_combination = feature_combination.replace(suffixed, base)
    return feature_combination


def _load_fold_result(csv_path):
    """Read one per-fold CSV and return it with columns `feat`, `SEN_fold<id>`, `SPEC_fold<id>`.

    The fold id is whatever remains of the file name after removing
    "resdf_fold" and ".csv". Columns are renamed positionally, so the file is
    expected to hold exactly three data columns after the index column, with
    "feature_combinations" first.
    # NOTE(review): presumably the other two columns are sensitivity and
    # specificity, in that order -- confirm against the script that writes them.
    """
    fold_id = csv_path.name.replace("resdf_fold", "").replace(".csv", "")
    fold_df = pd.read_csv(csv_path, index_col=[0])
    fold_df["feature_combinations"] = fold_df["feature_combinations"].apply(_normalize_nmf_names)
    fold_df.columns = ["feat", f"SEN_fold{fold_id}", f"SPEC_fold{fold_id}"]
    return fold_df


if not all_fold_res:
    # Fail with an actionable message instead of a bare IndexError.
    raise FileNotFoundError("No per-fold result CSV files found: `all_fold_res` is empty.")

fold_tables = [_load_fold_result(csv_path) for csv_path in all_fold_res]

# Inner-join all folds on the normalised feature-combination name; a
# combination missing from any fold is dropped from the result.
# NOTE(review): if normalisation makes two rows of the same fold share a
# `feat` value, this join multiplies rows -- verify `feat` is unique per fold.
folddf = fold_tables[0]
for fold_table in fold_tables[1:]:
    folddf = folddf.merge(fold_table, on="feat")

# Row-wise mean over the per-fold columns.
sen_columns = [col for col in folddf.columns if col.startswith("SEN_fold")]
spec_columns = [col for col in folddf.columns if col.startswith("SPEC_fold")]
folddf["avg_SEN"] = folddf[sen_columns].mean(axis=1)
folddf["avg_SPEC"] = folddf[spec_columns].mean(axis=1)

folddf = folddf[["feat", "avg_SEN", "avg_SPEC"]]

In [3]:
# Rows with avg_SPEC >= 0.95, ordered by avg_SPEC then avg_SEN, both descending.
folddf.loc[folddf["avg_SPEC"].ge(0.95)].sort_values(by=["avg_SPEC", "avg_SEN"], ascending=False)

Unnamed: 0,feat,avg_SEN,avg_SPEC
2,NMF_NUCLEOSOME,0.118838,0.983279
53,OT_FLEN,0.157376,0.980644
43,OT_NUCLEOSOME,0.164089,0.976104
41,"OT_FLEN,OT_NUCLEOSOME",0.193115,0.975348
49,OT_EM,0.180985,0.974361
6,NMF_EM,0.09383,0.971571
47,"OT_FLEN,NMF_NUCLEOSOME",0.188579,0.968511
34,"OT_NUCLEOSOME,NMF_NUCLEOSOME",0.202997,0.965333
37,"OT_FLEN,OT_NUCLEOSOME,NMF_NUCLEOSOME",0.221409,0.964577
22,"OT_EM,NMF_NUCLEOSOME",0.228994,0.958298


In [4]:
# Every feature combination, ordered by avg_SPEC then avg_SEN, both descending.
folddf.sort_values(by=["avg_SPEC", "avg_SEN"], ascending=[False, False])

Unnamed: 0,feat,avg_SEN,avg_SPEC
2,NMF_NUCLEOSOME,0.118838,0.983279
53,OT_FLEN,0.157376,0.980644
43,OT_NUCLEOSOME,0.164089,0.976104
41,"OT_FLEN,OT_NUCLEOSOME",0.193115,0.975348
49,OT_EM,0.180985,0.974361
...,...,...,...
1706,"EM_score,FLEN_score,NUCLEOSOME_score,EM_shanno...",0.612381,0.763297
1723,"EM_score,FLEN_score,NUCLEOSOME_score,EM_shanno...",0.614480,0.761069
1726,"EM_score,FLEN_score,NUCLEOSOME_score,EM_shanno...",0.614480,0.761069
1710,"EM_score,FLEN_score,NUCLEOSOME_score,EM_shanno...",0.612923,0.761069


In [5]:
# Combinations whose name contains both "NMF_FLEN" and "ichorCNA",
# ordered by avg_SPEC descending.
folddf.loc[
    lambda table: table["feat"].str.contains("NMF_FLEN") & table["feat"].str.contains("ichorCNA")
].sort_values(by="avg_SPEC", ascending=False)

Unnamed: 0,feat,avg_SEN,avg_SPEC
239,"ichorCNA,NMF_FLEN",0.542293,0.835769
246,"ichorCNA,NMF_FLEN,NMF_NUCLEOSOME",0.543375,0.832837
693,"OT_NUCLEOSOME,ichorCNA,NMF_FLEN",0.558040,0.827123
703,"OT_FLEN,OT_NUCLEOSOME,ichorCNA,NMF_FLEN",0.558040,0.827123
748,"OT_FLEN,ichorCNA,NMF_FLEN",0.547551,0.827123
...,...,...,...
1720,"EM_score,FLEN_score,NUCLEOSOME_score,EM_shanno...",0.613938,0.763297
1722,"EM_score,FLEN_score,NUCLEOSOME_score,EM_shanno...",0.612923,0.761069
1723,"EM_score,FLEN_score,NUCLEOSOME_score,EM_shanno...",0.614480,0.761069
1710,"EM_score,FLEN_score,NUCLEOSOME_score,EM_shanno...",0.612923,0.761069


In [6]:
# Export the ranked table with a colour gradient on both average columns.
# The file name is relative, so it is written to the current working directory.
# NOTE(review): presumably this was meant to go under `path_to_save_output` -- confirm.
(
    folddf.sort_values(by=["avg_SPEC", "avg_SEN"], ascending=False)
    .style.background_gradient(subset=["avg_SPEC", "avg_SEN"])
    .to_excel("resdf_folds.color.xlsx", index=False)
)