In [1]:
import pandas as pd
import numpy as np
import pathlib
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import RocCurveDisplay
from sklearn.decomposition import NMF
import warnings
warnings.filterwarnings("ignore")
from sklearn.mixture import GaussianMixture
import os
import ot
import pickle
import argparse
import Levenshtein
import itertools

from predict import *

# Folder that holds the metadata tables and the per-sample feature CSV files.
inputdir = "/media/hieunguyen/HNHD01/raw_data/MRD_GW_v1_20250318/Metadata_Genome-wide_Version1_07.03.25"

# The Excel sheet is loaded but is not referenced again in the cells shown here;
# "meta_full.csv" is the table every later step works from.
metadata_not_use = pd.read_excel(os.path.join(inputdir, "Metadata_Genome-wide Version1_07.03.25.xlsx"))
metadata = pd.read_csv(os.path.join(inputdir, "meta_full.csv"))

##### unique sample IDs of each split, as recorded in the "Set" column
all_train_samples = metadata.loc[metadata["Set"] == "train", "SampleID"].unique()
all_test_samples = metadata.loc[metadata["Set"] == "test", "SampleID"].unique()
all_validate_samples = metadata.loc[metadata["Set"] == "validate", "SampleID"].unique()

##### sample IDs per "Cancer" class: once over all samples, once per split
samplelist = {}
train_samplelist = {}
test_samplelist = {}
validate_samplelist = {}

for label in metadata.Cancer.unique():
    in_class = metadata["Cancer"] == label

    # all samples of this class, regardless of split
    samplelist[label] = metadata.loc[in_class, "SampleID"].to_list()

    # same class, restricted to one split at a time
    for split, split_samplelist in (
        ("train", train_samplelist),
        ("test", test_samplelist),
        ("validate", validate_samplelist),
    ):
        in_split = metadata["Set"] == split
        split_samplelist[label] = metadata.loc[in_class & in_split, "SampleID"].to_list()

##### read every feature table, then slice each one per split
feature_names = ["NUCLEOSOME", "FLEN", "EM", "IchorCNA"]

featuredf = {}
train_featuredf = {}
test_featuredf = {}
validate_featuredf = {}

# one CSV per feature type, named after the feature
for f in feature_names:
    featuredf[f] = pd.read_csv(os.path.join(inputdir, f"{f}.csv"))

# The IchorCNA table gets fixed column names before it is filtered on "SampleID".
featuredf["IchorCNA"].columns = ["SampleID", "ichorCNA"]

for f in feature_names:
    sample_ids = featuredf[f]["SampleID"]
    train_featuredf[f] = featuredf[f][sample_ids.isin(all_train_samples)]
    test_featuredf[f] = featuredf[f][sample_ids.isin(all_test_samples)]
    validate_featuredf[f] = featuredf[f][sample_ids.isin(all_validate_samples)]

# Re-apply the same column names on each split's IchorCNA slice.
for split_featuredf in (train_featuredf, test_featuredf, validate_featuredf):
    split_featuredf["IchorCNA"].columns = ["SampleID", "ichorCNA"]

##### location of the commercial model release and of everything this notebook writes
cmcdir = "/media/hieunguyen/HNSD01/src/gs-mrd/v0.2/commercial_models/release_v0.2_20240610"

# Forwarded unchanged to predict(); its meaning is defined in predict.py.
nmf_flen_cancer_signal = 2

path_to_model_files = os.path.join(cmcdir, "model_files")
path_to_save_output = os.path.join(cmcdir, "validate_output")

# Cutoff table shipped with the model files; the first sheet column becomes the index.
cmc_cutoffdf = pd.read_excel(
    os.path.join(path_to_model_files, "cutoffdf.xlsx"),
    index_col = [0],
)

# Score columns that are thresholded against cmc_cutoffdf in the summary cells.
all_features = [
    'ichorCNA',
    'FLEN',
    'EM',
    'NUCLEOSOME',
    'OT_FLEN',
    'OT_NUCLEOSOME',
    'NMF_FLEN',
    'NMF_NUCLEOSOME',
]


# Predictions with the current commercial models

## Prediction on "train" set

In [2]:
##### function "predict" is from the file predict.py. This function is applied to this commercial version only.
cmc_train_outputdf = predict(
    input_featuredf = train_featuredf,
    path_to_save_output = path_to_save_output,
    path_to_model_files = path_to_model_files,
    nmf_flen_cancer_signal = nmf_flen_cancer_signal,
    output_filename = "MRD_20250320_train_output",
)

# Attach the metadata columns to the predicted samples (inner join on the shared "SampleID" key).
cmc_train_outputdf = cmc_train_outputdf.merge(metadata, on = "SampleID")

# True_label keeps the raw "Label" for non-healthy samples and is "Healthy" otherwise.
# Columns are addressed by name: indexing a label-indexed row with x[0] / x[1] relies on
# the deprecated positional fallback of Series.__getitem__.
cmc_train_outputdf["True_label"] = cmc_train_outputdf["Label"].where(
    cmc_train_outputdf["Cancer"] != "Healthy", "Healthy"
)

# Binary ground truth: 1 for "Pos", 0 for every other value (including missing or
# placeholder labels). The raw value stays available in True_label for non-healthy samples.
cmc_train_outputdf["Label"] = (cmc_train_outputdf["Label"] == "Pos").astype(int)

## Prediction on "test" set

In [3]:
##### function "predict" is from the file predict.py. This function is applied to this commercial version only.
cmc_test_outputdf = predict(
    input_featuredf = test_featuredf,
    path_to_save_output = path_to_save_output,
    path_to_model_files = path_to_model_files,
    nmf_flen_cancer_signal = nmf_flen_cancer_signal,
    output_filename = "MRD_20250320_test_output",
)

# Attach the metadata columns to the predicted samples (inner join on the shared "SampleID" key).
cmc_test_outputdf = cmc_test_outputdf.merge(metadata, on = "SampleID")

# True_label keeps the raw "Label" for non-healthy samples and is "Healthy" otherwise.
# Columns are addressed by name: indexing a label-indexed row with x[0] / x[1] relies on
# the deprecated positional fallback of Series.__getitem__.
cmc_test_outputdf["True_label"] = cmc_test_outputdf["Label"].where(
    cmc_test_outputdf["Cancer"] != "Healthy", "Healthy"
)

# Binary ground truth: 1 for "Pos", 0 for every other value (including missing or
# placeholder labels). The raw value stays available in True_label for non-healthy samples.
cmc_test_outputdf["Label"] = (cmc_test_outputdf["Label"] == "Pos").astype(int)

## Prediction on "validate" set

In [4]:
##### function "predict" is from the file predict.py. This function is applied to this commercial version only.
cmc_validate_outputdf = predict(
    input_featuredf = validate_featuredf,
    path_to_save_output = path_to_save_output,
    path_to_model_files = path_to_model_files,
    nmf_flen_cancer_signal = nmf_flen_cancer_signal,
    output_filename = "MRD_20250320_validate_output",
)

# Attach the metadata columns to the predicted samples (inner join on the shared "SampleID" key).
cmc_validate_outputdf = cmc_validate_outputdf.merge(metadata, on = "SampleID")

# True_label keeps the raw "Label" for non-healthy samples and is "Healthy" otherwise.
# Columns are addressed by name: indexing a label-indexed row with x[0] / x[1] relies on
# the deprecated positional fallback of Series.__getitem__.
cmc_validate_outputdf["True_label"] = cmc_validate_outputdf["Label"].where(
    cmc_validate_outputdf["Cancer"] != "Healthy", "Healthy"
)

# Binary ground truth: 1 for "Pos", 0 for every other value (including missing or
# placeholder labels). The raw value stays available in True_label for non-healthy samples.
cmc_validate_outputdf["Label"] = (cmc_validate_outputdf["Label"] == "Pos").astype(int)

# Summary prediction results

In [5]:
from itertools import combinations

# Independent copy of the commercial cutoff table: changes made through `cutoffdf`
# do not touch `cmc_cutoffdf` as loaded from disk.
cutoffdf = cmc_cutoffdf.copy()

def get_Sen_Spec_for_combi(combi, inputdf):
    """Sensitivity and specificity of an "any feature positive" rule.

    A sample is called positive when at least one of the selected features has a
    non-zero value in its ``prediction_<feature>`` column.

    Parameters
    ----------
    combi : str
        Comma-separated feature names, e.g. ``"ichorCNA,NMF_FLEN"``.
    inputdf : pandas.DataFrame
        Must contain a ``Label`` column (1 = positive, 0 = negative) and one
        ``prediction_<feature>`` column for every feature named in ``combi``.
        The frame is only read, never modified.

    Returns
    -------
    tuple of (float, float)
        ``(sensitivity, specificity)``. A metric is ``nan`` when the class it is
        defined on is absent from ``inputdf`` (no positives / no negatives).
    """
    input_feats = combi.split(",")
    prediction_cols = [f"prediction_{i}" for i in input_feats]

    # Combined call: positive as soon as the row-wise sum of the selected calls is non-zero.
    # Working on boolean masks avoids adding temporary columns to a slice of inputdf.
    called_pos = inputdf[prediction_cols].sum(axis = 1) != 0

    is_pos = inputdf["Label"] == 1
    is_neg = inputdf["Label"] == 0
    n_pos = int(is_pos.sum())
    n_neg = int(is_neg.sum())

    # Guard the denominators so a split lacking one class yields NaN instead of ZeroDivisionError.
    sen = int((called_pos & is_pos).sum()) / n_pos if n_pos else float("nan")
    spec = int((~called_pos & is_neg).sum()) / n_neg if n_neg else float("nan")
    return (sen, spec)


## Summary on train set

In [6]:
# Binary call per feature: 1 when the score is strictly above that feature's cutoff, else 0.
for feat in all_features:
    c = float(cutoffdf.loc[cutoffdf["feature"] == feat, "cutoff"].values[0])
    cmc_train_outputdf[f"prediction_{feat}"] = (cmc_train_outputdf[feat] > c).astype(int)

# Drop samples whose status is unknown ("?"). "Label" was already binarized to 0/1 in the
# prediction cell, so comparing it with "?" can never match; "True_label" still carries
# the raw label for non-healthy samples, so the filter is applied there.
# .copy() keeps later column assignments from targeting a slice of the previous frame.
cmc_train_outputdf = cmc_train_outputdf[cmc_train_outputdf["True_label"] != "?"].copy()

# Class masks and class sizes are the same for every feature, so compute them once.
train_is_pos = cmc_train_outputdf["Label"] == 1
train_is_neg = cmc_train_outputdf["Label"] == 0
train_n_pos = int(train_is_pos.sum())
train_n_neg = int(train_is_neg.sum())

##### per-feature sensitivity / specificity (NaN when the class is absent from this split)
train_resdf = pd.DataFrame(data = all_features, columns = ["feature"])
train_resdf["SEN"] = [
    int((train_is_pos & (cmc_train_outputdf[f"prediction_{name}"] == 1)).sum()) / train_n_pos
    if train_n_pos else float("nan")
    for name in all_features
]
train_resdf["SPEC"] = [
    int((train_is_neg & (cmc_train_outputdf[f"prediction_{name}"] == 0)).sum()) / train_n_neg
    if train_n_neg else float("nan")
    for name in all_features
]

##### every non-empty subset of the features, smallest subsets first
feature_combinations = [
    combo
    for size in range(1, len(all_features) + 1)
    for combo in combinations(all_features, size)
]

train_combinedf = pd.DataFrame(
    data = [",".join(combo) for combo in feature_combinations],
    columns = ["feature_combinations"],
)

# One (sensitivity, specificity) pair per combination, spread over two columns.
train_sen_spec = [
    get_Sen_Spec_for_combi(combo, cmc_train_outputdf)
    for combo in train_combinedf["feature_combinations"]
]
train_combinedf[["SEN_train", "SPEC_train"]] = pd.DataFrame(train_sen_spec, index = train_combinedf.index)
train_combinedf.to_excel(os.path.join(cmcdir,  "prediction_by_cmcModel_train.xlsx"), index  = False)

## Summary on test set

In [7]:
# Binary call per feature: 1 when the score is strictly above that feature's cutoff, else 0.
for feat in all_features:
    c = float(cutoffdf.loc[cutoffdf["feature"] == feat, "cutoff"].values[0])
    cmc_test_outputdf[f"prediction_{feat}"] = (cmc_test_outputdf[feat] > c).astype(int)

# Drop samples whose status is unknown ("?"). "Label" was already binarized to 0/1 in the
# prediction cell, so comparing it with "?" can never match; "True_label" still carries
# the raw label for non-healthy samples, so the filter is applied there.
# .copy() keeps later column assignments from targeting a slice of the previous frame.
cmc_test_outputdf = cmc_test_outputdf[cmc_test_outputdf["True_label"] != "?"].copy()

# Class masks and class sizes are the same for every feature, so compute them once.
test_is_pos = cmc_test_outputdf["Label"] == 1
test_is_neg = cmc_test_outputdf["Label"] == 0
test_n_pos = int(test_is_pos.sum())
test_n_neg = int(test_is_neg.sum())

##### per-feature sensitivity / specificity (NaN when the class is absent from this split)
test_resdf = pd.DataFrame(data = all_features, columns = ["feature"])
test_resdf["SEN"] = [
    int((test_is_pos & (cmc_test_outputdf[f"prediction_{name}"] == 1)).sum()) / test_n_pos
    if test_n_pos else float("nan")
    for name in all_features
]
test_resdf["SPEC"] = [
    int((test_is_neg & (cmc_test_outputdf[f"prediction_{name}"] == 0)).sum()) / test_n_neg
    if test_n_neg else float("nan")
    for name in all_features
]

##### every non-empty subset of the features, smallest subsets first
feature_combinations = [
    combo
    for size in range(1, len(all_features) + 1)
    for combo in combinations(all_features, size)
]

test_combinedf = pd.DataFrame(
    data = [",".join(combo) for combo in feature_combinations],
    columns = ["feature_combinations"],
)

# One (sensitivity, specificity) pair per combination, spread over two columns.
test_sen_spec = [
    get_Sen_Spec_for_combi(combo, cmc_test_outputdf)
    for combo in test_combinedf["feature_combinations"]
]
test_combinedf[["SEN_test", "SPEC_test"]] = pd.DataFrame(test_sen_spec, index = test_combinedf.index)
test_combinedf.to_excel(os.path.join(cmcdir,  "prediction_by_cmcModel_test.xlsx"), index  = False)

## Summary on validate set

In [8]:
# Binary call per feature: 1 when the score is strictly above that feature's cutoff, else 0.
for feat in all_features:
    c = float(cutoffdf.loc[cutoffdf["feature"] == feat, "cutoff"].values[0])
    cmc_validate_outputdf[f"prediction_{feat}"] = (cmc_validate_outputdf[feat] > c).astype(int)

# Drop samples whose status is unknown ("?"). "Label" was already binarized to 0/1 in the
# prediction cell, so comparing it with "?" can never match; "True_label" still carries
# the raw label for non-healthy samples, so the filter is applied there.
# .copy() keeps later column assignments from targeting a slice of the previous frame.
cmc_validate_outputdf = cmc_validate_outputdf[cmc_validate_outputdf["True_label"] != "?"].copy()

# Class masks and class sizes are the same for every feature, so compute them once.
validate_is_pos = cmc_validate_outputdf["Label"] == 1
validate_is_neg = cmc_validate_outputdf["Label"] == 0
validate_n_pos = int(validate_is_pos.sum())
validate_n_neg = int(validate_is_neg.sum())

##### per-feature sensitivity / specificity (NaN when the class is absent from this split)
validate_resdf = pd.DataFrame(data = all_features, columns = ["feature"])
validate_resdf["SEN"] = [
    int((validate_is_pos & (cmc_validate_outputdf[f"prediction_{name}"] == 1)).sum()) / validate_n_pos
    if validate_n_pos else float("nan")
    for name in all_features
]
validate_resdf["SPEC"] = [
    int((validate_is_neg & (cmc_validate_outputdf[f"prediction_{name}"] == 0)).sum()) / validate_n_neg
    if validate_n_neg else float("nan")
    for name in all_features
]

##### every non-empty subset of the features, smallest subsets first
feature_combinations = [
    combo
    for size in range(1, len(all_features) + 1)
    for combo in combinations(all_features, size)
]

validate_combinedf = pd.DataFrame(
    data = [",".join(combo) for combo in feature_combinations],
    columns = ["feature_combinations"],
)

# One (sensitivity, specificity) pair per combination, spread over two columns.
validate_sen_spec = [
    get_Sen_Spec_for_combi(combo, cmc_validate_outputdf)
    for combo in validate_combinedf["feature_combinations"]
]
validate_combinedf[["SEN_validate", "SPEC_validate"]] = pd.DataFrame(validate_sen_spec, index = validate_combinedf.index)
validate_combinedf.to_excel(os.path.join(cmcdir,  "prediction_by_cmcModel_validate.xlsx"), index  = False)

In [9]:
##### one row per feature combination, with the train / test / validate metrics side by side
cmc_summarydf = (
    train_combinedf
    .merge(test_combinedf, on = "feature_combinations")
    .merge(validate_combinedf, on = "feature_combinations")
)

# number of features in the combination = number of comma-separated names
cmc_summarydf["num_combi"] = cmc_summarydf["feature_combinations"].apply(
    lambda combi: len(combi.split(","))
)

# Display only (the sorted frame is not stored): highest train specificity first, ties by train sensitivity.
cmc_summarydf.sort_values(by = ["SPEC_train", "SEN_train"], ascending = False)

Unnamed: 0,feature_combinations,SEN_train,SPEC_train,SEN_test,SPEC_test,SEN_validate,SPEC_validate,num_combi
24,"EM,NMF_FLEN",0.3050,1.0000,0.29,1.000000,0.291667,0.992481,2
6,NMF_FLEN,0.2650,1.0000,0.25,1.000000,0.253788,0.992481,1
2,EM,0.1650,1.0000,0.13,1.000000,0.117424,1.000000,1
45,"ichorCNA,EM,NMF_FLEN",0.4575,0.9875,0.43,0.966667,0.458333,0.973684,3
13,"ichorCNA,NMF_FLEN",0.4375,0.9875,0.43,0.966667,0.431818,0.973684,2
...,...,...,...,...,...,...,...,...
234,"ichorCNA,EM,NUCLEOSOME,OT_FLEN,OT_NUCLEOSOME,N...",0.7200,0.2500,0.73,0.250000,0.719697,0.248120,6
251,"ichorCNA,FLEN,NUCLEOSOME,OT_FLEN,OT_NUCLEOSOME...",0.7200,0.2500,0.75,0.250000,0.719697,0.248120,7
238,"ichorCNA,NUCLEOSOME,OT_FLEN,OT_NUCLEOSOME,NMF_...",0.7175,0.2500,0.75,0.250000,0.719697,0.248120,6
229,"ichorCNA,FLEN,NUCLEOSOME,OT_FLEN,OT_NUCLEOSOME...",0.6950,0.2500,0.72,0.250000,0.700758,0.248120,6


In [10]:
# Show the summary row of the "ichorCNA,NMF_FLEN" combination.
cmc_summarydf.loc[cmc_summarydf["feature_combinations"] == "ichorCNA,NMF_FLEN"]

Unnamed: 0,feature_combinations,SEN_train,SPEC_train,SEN_test,SPEC_test,SEN_validate,SPEC_validate,num_combi
13,"ichorCNA,NMF_FLEN",0.4375,0.9875,0.43,0.966667,0.431818,0.973684,2


In [11]:
# Write the full cross-split summary next to the model release, without the row index.
cmc_summarydf.to_excel(
    os.path.join(cmcdir, "summary_prediction_by_cmcModel.xlsx"),
    index = False,
)

In [12]:
##### one workbook per combination size, ranked by train specificity and then train sensitivity (both descending)
for num_combi in cmc_summarydf["num_combi"].unique():
    same_size = cmc_summarydf.loc[cmc_summarydf["num_combi"] == num_combi]
    ranked = same_size.sort_values(by = ["SPEC_train", "SEN_train"], ascending = False)
    ranked.to_excel(f"{cmcdir}/summary_cmcModels_{num_combi}combi.xlsx", index = False)

In [14]:
##### export the per-sample output table of every split (row index omitted)
output_exports = {
    f"{cmcdir}/cmcModels_train_output.xlsx": cmc_train_outputdf,
    f"{cmcdir}/cmcModels_test_output.xlsx": cmc_test_outputdf,
    f"{cmcdir}/cmcModels_validate_output.xlsx": cmc_validate_outputdf,
}
for excel_path, split_outputdf in output_exports.items():
    split_outputdf.to_excel(excel_path, index = False)