In [None]:
import pandas as pd
import numpy as np
import pathlib
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import RocCurveDisplay
from sklearn.decomposition import NMF
import warnings
warnings.filterwarnings("ignore")
from sklearn.mixture import GaussianMixture
import os
import ot
import pickle
import argparse
import Levenshtein
import itertools

from predict import *

# Directory holding the metadata tables and the per-feature CSV files.
inputdir = "/media/hieunguyen/HNHD01/raw_data/MRD_GW_v1_20250318/Metadata_Genome-wide_Version1_07.03.25"

# NOTE(review): the Excel sheet is loaded into `metadata_not_use` and is not referenced
# again in the visible cells; `meta_full.csv` is the table the analysis relies on.
metadata_not_use = pd.read_excel(os.path.join(inputdir, "Metadata_Genome-wide Version1_07.03.25.xlsx"))
metadata = pd.read_csv(os.path.join(inputdir, "meta_full.csv"))

# Unique sample IDs of each data split, selected through the "Set" column.
all_train_samples, all_test_samples, all_validate_samples = (
    metadata.loc[metadata["Set"] == split_name, "SampleID"].unique()
    for split_name in ("train", "test", "validate")
)

##### generate sample lists for each class: all samples, then train / test / validate only
def _samples_per_class(split = None):
    """Map every value of metadata["Cancer"] to the list of its SampleIDs.

    When `split` is given, only rows whose "Set" column equals `split` are kept.
    Classes without any sample in that split still get an (empty) entry.
    """
    subset = metadata if split is None else metadata[metadata["Set"] == split]
    return {
        label: subset.loc[subset["Cancer"] == label, "SampleID"].to_list()
        for label in metadata.Cancer.unique()
    }

samplelist = _samples_per_class()
train_samplelist = _samples_per_class("train")
test_samplelist = _samples_per_class("test")
validate_samplelist = _samples_per_class("validate")

##### read features in
feature_names = ["NUCLEOSOME", "FLEN", "EM", "IchorCNA"]

# One table per feature, read from "<feature>.csv" inside inputdir.
featuredf = {
    name: pd.read_csv(os.path.join(inputdir, f"{name}.csv"))
    for name in feature_names
}

# Rename the IchorCNA header before splitting, so that every per-split subset
# below inherits the ["SampleID", "ichorCNA"] column names.
featuredf["IchorCNA"].columns = ["SampleID", "ichorCNA"]

def _restrict_to(sample_ids):
    """Return the feature tables limited to rows whose SampleID is in `sample_ids`."""
    return {
        name: table[table["SampleID"].isin(sample_ids)]
        for name, table in featuredf.items()
    }

train_featuredf = _restrict_to(all_train_samples)
test_featuredf = _restrict_to(all_test_samples)
validate_featuredf = _restrict_to(all_validate_samples)

# Release directory of the commercial model, with its model files and output folder.
cmcdir = "/media/hieunguyen/HNSD01/src/gs-mrd/v0.2/commercial_models/release_v0.2_20240610"
path_to_model_files = os.path.join(cmcdir, "model_files")
path_to_save_output = os.path.join(cmcdir, "validate_output")

# Passed unchanged to predict(); its meaning is defined in predict.py.
nmf_flen_cancer_signal = 2

# Per-feature cutoffs shipped with the model files (first sheet column is used as index).
cmc_cutoffdf = pd.read_excel(os.path.join(path_to_model_files, "cutoffdf.xlsx"), index_col = [0])

##### The function "predict" comes from predict.py. It applies to this commercial version only.
predict_arguments = dict(
    input_featuredf = validate_featuredf,
    path_to_save_output = path_to_save_output,
    path_to_model_files = path_to_model_files,
    nmf_flen_cancer_signal = nmf_flen_cancer_signal,
    output_filename = "MRD_20250320_validate_output",
)
# Align the ichorCNA column name with the spelling used in the feature list ("IchorCNA").
cmc_validate_outputdf = predict(**predict_arguments).rename(columns = {"ichorCNA": "IchorCNA"})

# Attach the sample metadata to the model output (inner join on SampleID).
cmc_validate_outputdf = cmc_validate_outputdf.merge(metadata, on = "SampleID")

# Drop samples with an unknown label ("?") while "Label" is still a string.
# Once the column is binarised below, "?" would be indistinguishable from a
# negative (0) and any later `Label != "?"` filter could no longer remove it.
# .copy() gives an independent frame so the column assignments below are safe.
cmc_validate_outputdf = cmc_validate_outputdf[cmc_validate_outputdf["Label"] != "?"].copy()

# True_label keeps the original string label, except that rows whose "Cancer"
# value is "Healthy" are reported as "Healthy". Column access is by name rather
# than by position, which pandas has deprecated for label-indexed rows.
cmc_validate_outputdf["True_label"] = cmc_validate_outputdf["Label"].where(
    cmc_validate_outputdf["Cancer"] != "Healthy", "Healthy"
)

# Binary ground truth: 1 for "Pos", 0 for every other remaining label.
cmc_validate_outputdf["Label"] = (cmc_validate_outputdf["Label"] == "Pos").astype(int)

# Score columns evaluated against the commercial cutoffs, in reporting order.
all_features = [
    "IchorCNA",
    "FLEN",
    "EM",
    "NUCLEOSOME",
    "OT_FLEN",
    "OT_NUCLEOSOME",
    "NMF_FLEN",
    "NMF_NUCLEOSOME",
]

In [None]:
from itertools import combinations

cutoffdf = cmc_cutoffdf.copy()

# Turn every feature score into a 0/1 call: 1 when the score is strictly above
# the cutoff listed for that feature, 0 otherwise.
for feat in all_features:
    cutoff_value = float(cutoffdf.loc[cutoffdf["feature"] == feat, "cutoff"].values[0])
    cmc_validate_outputdf[f"prediction_{feat}"] = (cmc_validate_outputdf[feat] > cutoff_value).astype(int)

# NOTE(review): "Label" was already converted to 0/1 in the previous cell, so this
# comparison against "?" only removes rows if string labels are still present.
rows_to_keep = cmc_validate_outputdf["Label"] != "?"
cmc_validate_outputdf = cmc_validate_outputdf[rows_to_keep]

# Per-feature sensitivity (called positive among Label == 1) and
# specificity (called negative among Label == 0).
positive_samples = cmc_validate_outputdf[cmc_validate_outputdf["Label"] == 1]
negative_samples = cmc_validate_outputdf[cmc_validate_outputdf["Label"] == 0]

validate_resdf = pd.DataFrame({"feature": all_features})
validate_resdf["SEN"] = [
    positive_samples[positive_samples[f"prediction_{name}"] == 1].shape[0] / positive_samples.shape[0]
    for name in all_features
]
validate_resdf["SPEC"] = [
    negative_samples[negative_samples[f"prediction_{name}"] == 0].shape[0] / negative_samples.shape[0]
    for name in all_features
]

# Every non-empty subset of the features, ordered by subset size.
feature_combinations = [
    combo
    for size in range(1, len(all_features) + 1)
    for combo in combinations(all_features, size)
]

# One row per subset, stored as a comma-separated string of feature names.
validate_combinedf = pd.DataFrame(
    {"feature_combinations": [",".join(combo) for combo in feature_combinations]}
)
def get_Sen_Spec_for_combi(combi, inputdf):
    """Sensitivity and specificity of an OR-combination of feature calls.

    Parameters
    ----------
    combi : str
        Comma-separated feature names, e.g. "IchorCNA,NMF_FLEN". For every name
        a column "prediction_<name>" must exist in `inputdf`.
    inputdf : pandas.DataFrame
        Table with a binary "Label" column (1 = positive, 0 = negative) and the
        0/1 "prediction_<name>" columns. It is not modified.

    Returns
    -------
    tuple of (float, float)
        (sensitivity, specificity). A sample is called positive when at least
        one of the selected features calls it positive. If `inputdf` contains
        no positive (resp. negative) samples, the sensitivity (resp.
        specificity) is undefined and returned as NaN instead of raising
        ZeroDivisionError.
    """
    prediction_cols = [f"prediction_{feat}" for feat in combi.split(",")]

    # Work on boolean masks instead of adding columns to a slice of inputdf;
    # this avoids chained assignment and leaves the caller's frame untouched.
    called_positive = inputdf[prediction_cols].sum(axis = 1) != 0
    is_positive = inputdf["Label"] == 1
    is_negative = inputdf["Label"] == 0

    n_positive = int(is_positive.sum())
    n_negative = int(is_negative.sum())
    true_positive = int((called_positive & is_positive).sum())
    true_negative = int((~called_positive & is_negative).sum())

    sen = true_positive / n_positive if n_positive else float("nan")
    spec = true_negative / n_negative if n_negative else float("nan")
    return (sen, spec)

# Evaluate every feature combination on the validation calls; each result is a (SEN, SPEC) pair.
sen_spec_pairs = [
    get_Sen_Spec_for_combi(combo, cmc_validate_outputdf)
    for combo in validate_combinedf["feature_combinations"]
]
validate_combinedf[["SEN_validate", "SPEC_validate"]] = pd.DataFrame(
    sen_spec_pairs, index = validate_combinedf.index
)

# Write the table next to the commercial model release.
validate_combinedf.to_excel(os.path.join(cmcdir, "prediction_by_cmcModel_validate.xlsx"), index = False)

In [None]:
# Show the row for the IchorCNA + NMF_FLEN combination.
validate_combinedf.loc[validate_combinedf["feature_combinations"].eq("IchorCNA,NMF_FLEN")]