In [None]:
import pandas as pd
import numpy as np
import pathlib
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import RocCurveDisplay
from sklearn.decomposition import NMF
import warnings
warnings.filterwarnings("ignore")
from sklearn.mixture import GaussianMixture
import os
import ot
import pickle
import argparse
import Levenshtein
import itertools
from sklearn.metrics import roc_curve
from helper_functions import *

##### input locations
# NOTE(review): absolute, machine-specific paths; adjust them when running elsewhere.
path_to_main_input = "/media/hieunguyen/HNHD01/raw_data/MRD_GW_v1_20250318"
inputdir = os.path.join(path_to_main_input, "Metadata_Genome-wide_Version1_07.03.25")

# Name of the batch to score (sub-folder of "makePrediction_data").
# test_batch_name = "cmc_sample_20250326"
test_batch_name = "Metadata_MRDGW_Track_progress_20250226"

PROJECT = "gs-mrd"
release_version = "20250326"

# Specificity level; it selects the SPEC_<value> output folder used below.
expected_spec = 0.90
# expected_spec = 0.95
# expected_spec = 0.98
# expected_spec = 0.99
# expected_spec = 1

##### configurations/paths
path_to_main_src = "/media/hieunguyen/HNSD01/src/gs-mrd/v0.2"
path_to_save_output = f"{path_to_main_src}/output/{release_version}/SPEC_{expected_spec}"
path_to_save_feature_order = f"{path_to_main_src}/output/{release_version}/feature_order"

# Create the output tree with os.makedirs instead of shelling out to `mkdir -p`:
# no shell interpolation of the paths, works on any OS, and a failure raises
# an exception instead of being silently ignored.
for output_subdir in (
    path_to_save_output,
    path_to_save_feature_order,
    f"{path_to_save_output}/models",
    f"{path_to_save_output}/results",
    f"{path_to_save_output}/new_prediction/{test_batch_name}",
):
    os.makedirs(output_subdir, exist_ok=True)

# Folder that holds the feature CSV files and metadata of the batch to score.
path_to_input_test_data = os.path.join(path_to_main_input, "makePrediction_data", test_batch_name)

##### read features in
# One table per feature type, keyed by the feature name (= CSV file stem).
test_featuredf = {}
for f in ["NUCLEOSOME", "FLEN", "EM", "IchorCNA"]:
    feature_csv = os.path.join(path_to_input_test_data, f"{f}.csv")
    test_featuredf[f] = pd.read_csv(feature_csv)

# The IchorCNA table gets fixed column names; the value column is later
# referenced as "ichorCNA".
test_featuredf["IchorCNA"].columns = ["SampleID", "ichorCNA"]

##### distance matrix based on edit distance of End motif 4bp
nucleotides = ['A', 'C', 'G', 'T']
# Every 4-mer over the alphabet above, in itertools.product order.
motifs = [''.join(p) for p in itertools.product(nucleotides, repeat=4)]

# Pairwise Levenshtein distance between all motifs, built in a single pass as
# an integer table (instead of filling an object-dtype frame cell by cell).
distance_matrix = pd.DataFrame(
    [[Levenshtein.distance(motif1, motif2) for motif2 in motifs] for motif1 in motifs],
    index=motifs,
    columns=motifs,
    dtype=int,
)

# Float cost matrix, divided by 10% of the largest distance so that the
# maximum entry becomes 10. A numeric dtype (not object) keeps it usable by
# numerical routines.
M_EM = distance_matrix.to_numpy(dtype=float)
M_EM /= M_EM.max() * 0.1


# Cutoff table saved for the selected specificity level; later cells read its
# "feature" and "cutoff" columns.
cutoff_file = f"{path_to_save_output}/models/cutoff_SPEC{expected_spec}.xlsx"
cutoffdf = pd.read_excel(cutoff_file)

##### absolute difference between a sample and the reference
# prepare references
# Each reference CSV is read and its two columns are renamed to
# (<feature axis>, "Healthy").
models_dir = f"{path_to_save_output}/models"

em_ref = pd.read_csv(f"{models_dir}/Healthy_reference_EM.csv")
em_ref.columns = ["motif", "Healthy"]

flen_ref = pd.read_csv(f"{models_dir}/Healthy_reference_FLEN.csv")
flen_ref.columns = ["FLEN", "Healthy"]

nuc_ref = pd.read_csv(f"{models_dir}/Healthy_reference_NUCLEOSOME.csv")
nuc_ref.columns = ["Nucleosome", "Healthy"]

# Independent copies keyed by feature name, as used by the scoring cells.
ref = {
    "EM": em_ref.copy(),
    "FLEN": flen_ref.copy(),
    "NUCLEOSOME": nuc_ref.copy(),
}


In [None]:
# Sample sheet of the batch being scored.
metadata_file = os.path.join(path_to_input_test_data, "metadata.xlsx")
metadata = pd.read_excel(metadata_file)

# Remove a pre-existing "ichorCNA" column, if there is one; the value computed
# from the feature table is merged in under the same name later.
metadata = metadata.drop(columns=["ichorCNA"], errors="ignore")

In [None]:
# Quick look at the last feature table that was loaded. The key is spelled out
# instead of relying on the loop variable `f` left over from the loading cell,
# and only the first rows are shown.
test_featuredf["IchorCNA"].head()

In [None]:
# Keep, in every feature table, only the rows whose SampleID is listed in the
# metadata sheet.
metadata_sample_ids = metadata["SampleID"]
for f in ["NUCLEOSOME", "FLEN", "EM", "IchorCNA"]:
    feature_table = test_featuredf[f]
    in_metadata = feature_table["SampleID"].isin(metadata_sample_ids)
    test_featuredf[f] = feature_table[in_metadata]

In [None]:
# Collects one table per derived feature, keyed by the feature name.
test_features = dict()

# Sample list taken from the FLEN table; reused by the OT-distance step below.
all_samples = test_featuredf["FLEN"].SampleID.unique()

# add score EM, FLEN, NUCLEOSOME to test_features
# Score = sum over all feature bins of |sample value - healthy reference value|.
for f in ["EM", "FLEN", "NUCLEOSOME"]:
    # rows = feature bins, columns = samples
    sample_profiles = test_featuredf[f].set_index("SampleID").T
    # The reference is matched to the rows by position, so it must list the
    # bins in the same order as the feature table's columns.
    healthy_profile = ref[f]["Healthy"].to_numpy()
    # Vectorised over every sample column of *this* feature table, rather than
    # looping over the FLEN sample list: a sample absent from this table no
    # longer raises KeyError, and a sample present only here is still
    # reference-corrected before summing.
    abs_deviation = sample_profiles.sub(healthy_profile, axis=0).abs()
    input_scoredf = abs_deviation.sum().reset_index()
    input_scoredf.columns = ["SampleID", f"{f}_score"]
    input_scoredf = input_scoredf.merge(metadata, on="SampleID")
    test_features[f"{f}_score"] = input_scoredf

# End-motif table reshaped so that every sample is a column (rows = motifs).
f = "EM"
inputdf = test_featuredf[f].set_index("SampleID").T

# One row per sample; the entropy column is filled in below.
em_shannondf = pd.DataFrame({"SampleID": inputdf.columns})
def calculate_em_shannon(x, inputdf):
    """Return the Shannon-entropy score of one column of ``inputdf``.

    Parameters
    ----------
    x
        Column label in ``inputdf`` (a SampleID in this notebook).
    inputdf : pandas.DataFrame
        Table with one column per sample; the values of column ``x`` are
        treated as the frequencies ``p``.

    Returns
    -------
    float
        ``-sum(p * log2(p)) / 256`` taken over the non-zero entries of the
        column.
    """
    freqs = np.asarray(inputdf[x].values, dtype=float)
    # 0 * log2(0) evaluates to NaN in floating point and would turn the whole
    # score into NaN. By the usual entropy convention such terms contribute
    # nothing, so exact zeros are left out of the sum.
    nonzero = freqs[freqs != 0]
    shannon = -np.sum(nonzero * np.log2(nonzero)) / 256
    return(shannon)
# Entropy score of every sample column of the end-motif table.
def _em_shannon_of(sample_id):
    return calculate_em_shannon(sample_id, inputdf)

em_shannondf["EM_shannon"] = em_shannondf["SampleID"].map(_em_shannon_of)
# Attach the sample annotations and register the feature.
em_shannondf = pd.merge(em_shannondf, metadata, on="SampleID")
test_features["EM_shannon"] = em_shannondf
##### OT distance
# For each feature type, score every sample with
# calculate_ot_distance_to_healthy_nuc (from helper_functions) against the
# saved healthy barycenter.
for f in ["EM", "FLEN", "NUCLEOSOME"]:
    barycenter = pd.read_csv(f"{path_to_save_output}/models/Healthy_OT_{f}_baryl2.csv")
    bary_l2 = barycenter.baryl2.to_numpy()
    # Loop-invariant inputs, computed once per feature instead of once per
    # sample inside the lambda.
    # NOTE(review): assumes the helper does not modify the frame it receives.
    sample_profiles = test_featuredf[f].set_index("SampleID").T
    n_bins = test_featuredf[f].shape[1] - 1  # all columns except SampleID
    ot_scoredf = pd.DataFrame(data = all_samples, columns = ["SampleID"])
    ot_scoredf[f"OT_{f}"] = ot_scoredf["SampleID"].apply(
        lambda x: calculate_ot_distance_to_healthy_nuc(x, bary_l2, sample_profiles, n = n_bins)
    )
    ot_scoredf = ot_scoredf.merge(metadata, right_on = "SampleID", left_on = "SampleID")
    test_features[f"OT_{f}"] = ot_scoredf

# The IchorCNA table (columns SampleID, ichorCNA) is used as a feature as-is.
test_features["ichorCNA"] = test_featuredf["IchorCNA"]

# Project every sample onto the saved two-component NMF model of each feature
# type and keep the scaled weight of the component named in the cutoff table.
for input_feature in ["EM", "FLEN", "NUCLEOSOME"]:
    filename = os.path.join(path_to_save_output, "models", f'NMF_{input_feature}.sav')
    # NOTE: pickle can execute arbitrary code while loading; only load model
    # files produced by this project. The context manager closes the handle.
    with open(filename, 'rb') as model_file:
        model = pickle.load(model_file)
    X = test_featuredf[input_feature].set_index("SampleID")
    W = model.transform(X.to_numpy())
    H = model.components_  # component matrix; not used further in this cell

    # The component index is the third "_"-separated token of the cutoff-table
    # entry "NMF_<feature>_<k>". Match on that exact prefix: a plain substring
    # match on the feature name can also hit entries such as "<feature>_score"
    # or "OT_<feature>", which have no third token.
    nmf_prefix = f"NMF_{input_feature}_"
    nmf_feature_names = cutoffdf.loc[
        cutoffdf["feature"].str.startswith(nmf_prefix, na=False), "feature"
    ].values
    if len(nmf_feature_names) == 0:
        raise ValueError(f"No '{nmf_prefix}*' entry found in the cutoff table")
    nmf_signal_cancer = nmf_feature_names[0].split("_")[2]

    nmfdf = pd.DataFrame(data = W, columns = ["V1", "V2"])
    nmfdf["SampleID"] = list(X.index)
    # Each weight as a fraction of the two weights' sum (vectorised; avoids
    # positional x[0]/x[1] look-ups on a labelled row).
    component_total = nmfdf["V1"] + nmfdf["V2"]
    nmfdf["V1_scale"] = nmfdf["V1"] / component_total
    nmfdf["V2_scale"] = nmfdf["V2"] / component_total
    nmfdf = nmfdf.merge(metadata, right_on = "SampleID", left_on = "SampleID")

    tmpdf = nmfdf[["SampleID", f"V{nmf_signal_cancer}_scale"]].copy()
    tmpdf.columns = ["SampleID", f"NMF_{input_feature}_{nmf_signal_cancer}"]
    test_features[f"NMF_{input_feature}_{nmf_signal_cancer}"] = tmpdf.copy()
# Start from the metadata sample list and attach one column per feature.
test_outputdf = pd.DataFrame({"SampleID": metadata["SampleID"].to_list()})
for feat, feat_table in test_features.items():
    feat_column = feat_table[["SampleID", feat]]
    test_outputdf = test_outputdf.merge(feat_column, on="SampleID")

# Finally add the sample annotations next to the feature values.
test_outputdf = test_outputdf.merge(metadata, on="SampleID")


In [None]:
# Feature combinations saved in the models folder of this release.
feature_combinations_file = f"{path_to_save_output}/models/feature_combinations.pkl"
with open(feature_combinations_file, "rb") as f:
    feature_combinations = pickle.load(f)

def get_Sen_Spec_for_combi(combi, inputdf):
    """Sensitivity and specificity of an OR-combination of per-feature calls.

    Parameters
    ----------
    combi : str
        Comma-separated feature names, e.g. ``"A,B"``. For every name a
        column ``prediction_<name>`` must exist in ``inputdf``.
    inputdf : pandas.DataFrame
        Table with a ``Label`` column (1 = positive, 0 = negative) and the
        ``prediction_<name>`` columns. It is not modified.

    Returns
    -------
    tuple
        ``(sen, spec)``. A sample is called positive when the sum of the
        selected prediction columns is non-zero. ``sen`` is NaN when there
        are no samples with ``Label == 1`` and ``spec`` is NaN when there are
        none with ``Label == 0`` (instead of raising ZeroDivisionError).
    """
    pred_cols = [f"prediction_{feat}" for feat in combi.split(",")]
    called_positive = inputdf[pred_cols].sum(axis = 1) != 0
    is_pos = inputdf["Label"] == 1
    is_neg = inputdf["Label"] == 0

    n_pos = int(is_pos.sum())
    n_neg = int(is_neg.sum())
    true_pos = int((called_positive & is_pos).sum())
    true_neg = int((~called_positive & is_neg).sum())

    # Guard the ratios: a batch may contain only one of the two label classes.
    sen = true_pos / n_pos if n_pos else float("nan")
    spec = true_neg / n_neg if n_neg else float("nan")
    return(sen, spec)

# Binary call per feature: 1 when the value is strictly above the feature's
# cutoff from the cutoff table, otherwise 0.
all_features = test_features.keys()
for feat in all_features:
    feat_cutoff = float(cutoffdf.loc[cutoffdf["feature"] == feat, "cutoff"].values[0])
    test_outputdf[f"prediction_{feat}"] = test_outputdf[feat].gt(feat_cutoff).astype(int)

# One row per stored feature combination, written as a comma-separated string.
combination_labels = []
for combi_idx in range(len(feature_combinations)):
    combination_labels.append(",".join(feature_combinations[combi_idx]))
test_combinedf = pd.DataFrame(data = combination_labels, columns = ["feature_combinations"])

# Save feature values, calls and annotations for this batch.
output_file = f"{path_to_save_output}/new_prediction/{test_batch_name}/test_outputdf.xlsx"
test_outputdf.to_excel(output_file, index = False)

In [None]:
# Show the final table: feature values, per-feature prediction_* calls and metadata columns.
test_outputdf