In [28]:
import pandas as pd
import numpy as np
import pathlib
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import RocCurveDisplay
from sklearn.decomposition import NMF
import warnings
warnings.filterwarnings("ignore")
from sklearn.mixture import GaussianMixture
import os
import ot
import pickle
import argparse
import Levenshtein
import itertools
from helper_functions import *

##### input args
# Project tag plus the two version stamps that select which on-disk
# folders are read below.
PROJECT = "gs-mrd"
release_version = "06062024"
merge_version = "20240914"

##### configurations/paths
# NOTE(review): the root is an absolute, machine-specific mount point;
# edit it here when running on another machine.
path_to_main_src = "/media/hieunguyen/HNSD01/src/gs-mrd"
# Merged per-sample tables live under all_samples/<merge_version>.
path_to_merge_samples = os.path.join(path_to_main_src, "all_samples", merge_version)
# Model files live under model_files/<release_version>.
path_to_model_files = os.path.join(path_to_main_src, "model_files", release_version)

##### metadata
# Column order used later to select the EM feature columns.
motif_order = pd.read_csv("motif_order.csv").motif_order.to_list()

# Per-run sample sheet. SampleID has the form "<prefix>-<id>"; SampleID2 is
# the part after the first hyphen and is the key shared with general_metadata.
metadata = pd.read_csv(os.path.join(path_to_merge_samples, "EM_batch_metadata.csv"))
metadata["SampleID2"] = metadata["SampleID"].apply(lambda x: x.split("-")[1])
# Unfiltered snapshot, taken before the rows are restricted below.
data_metadata = metadata.copy()

general_metadata = pd.read_excel("All Samples GW_MRD_010924.modified.xlsx", index_col = [0])

# Keep only samples that appear in general_metadata. The explicit .copy()
# makes the result an independent frame, so the column assignments below are
# not chained assignments on a slice of the original (the resulting
# SettingWithCopyWarning would be hidden by warnings.filterwarnings above).
metadata = metadata[metadata["SampleID2"].isin(general_metadata["SampleID"].to_list())].copy()

# One row per SampleID, keeping the first occurrence -- the same value the
# previous per-row `.unique()[0]` lookup returned -- so the labels can be
# attached with a single vectorised map instead of scanning general_metadata
# once per sample.
label_lookup = general_metadata.drop_duplicates(subset="SampleID").set_index("SampleID")
metadata["Cancer"] = metadata["SampleID2"].map(label_lookup["Cancer"])
metadata["True label"] = metadata["SampleID2"].map(label_lookup["True label"])

# Load one feature table per feature type from the merged-samples folder,
# keyed by feature name.
featuredf = {}
for input_feature in ("EM", "FLEN", "NUCLEOSOME", "IchorCNA"):
    tmpdf = pd.read_csv(f"{path_to_merge_samples}/{input_feature}_features.csv")
    # EM is restricted to SampleID plus the motif columns, in motif_order
    # order; every other table is stored with all of its columns.
    featuredf[input_feature] = (
        tmpdf[["SampleID"] + motif_order] if input_feature == "EM" else tmpdf
    ).copy()
# Last expression of the cell: display the labelled metadata table.
metadata

Unnamed: 0,SampleID,RUN,Group_RUN,SampleID2,Cancer,True label
0,7-ZMC057,Research-mrdgw-all-batch2,06062024,ZMC057,CRC,+
1,8-ZMG093,Research-mrdgw-all-batch2,06062024,ZMG093,Gastric,+
2,6-ZMC071A,Research-mrdgw-all-batch2,06062024,ZMC071A,CRC,+
3,6-MDGAAA16,Research-mrdgw-all-batch2,06062024,MDGAAA16,CRC,+
4,8-ZMC046B,Research-mrdgw-all-batch2,06062024,ZMC046B,CRC,+
...,...,...,...,...,...,...
1446,3-ZMB009B,batch_270824,from_ECD_WGS,ZMB009B,Breast,-
1447,6-UHAA48,batch_270824,from_ECD_WGS,UHAA48,Lung,-
1448,5-ZMG138C,batch_270824,from_ECD_WGS,ZMG138C,Gastric,-
1449,4-ZMG096C,batch_270824,from_ECD_WGS,ZMG096C,Gastric,-


In [37]:
# Show every metadata row recorded for sample ZMB009B.
metadata.loc[metadata["SampleID2"].eq("ZMB009B")]

Unnamed: 0,SampleID,RUN,Group_RUN,SampleID2,Cancer,True label
967,8-ZMB009B,R5435,cmc_MRD,ZMB009B,Breast,-
1085,8-ZMB009B,batch_150724_160724,from_ECD_WGS,ZMB009B,Breast,-
1446,3-ZMB009B,batch_270824,from_ECD_WGS,ZMB009B,Breast,-


In [39]:
# Show the general_metadata entry for the same sample, for comparison.
general_metadata.loc[general_metadata["SampleID"].eq("ZMB009B")]

Unnamed: 0,SampleID,Cancer,True label,ichorCNA,FLEN,EM,NUCLEOSOME,OT_FLEN,OT_NUCLEOSOME,NMF_FLEN,...,ichorCNA.1,FLEN.1,EM.1,NUCLEOSOME.1,OT_FLEN.1,OT_NUCLEOSOME.1,NMF_FLEN.1,NMF_NUCLEOSOME.1,Run,Kit
1019,ZMB009B,Breast,-,0.05323,0.184422,0.029302,0.017348,0.312606,0.017366,0.344792,...,+,-,-,-,+,-,-,-,R5435,No UID


In [36]:
# Rows whose SampleID2 already appeared in an earlier row (the first
# occurrence of each SampleID2 is not listed).
metadata.loc[metadata.duplicated(subset="SampleID2")]

Unnamed: 0,SampleID,RUN,Group_RUN,SampleID2,Cancer,True label
240,10-HMAAAA28,Research-mrdgw-all-batch5,06062024,HMAAAA28,Lung,+
242,9-HMAAAA19,Research-mrdgw-all-batch5,06062024,HMAAAA19,Lung,+
243,8-HMAAAA09,Research-mrdgw-all-batch5,06062024,HMAAAA09,Lung,+
255,1-HMAAAA05,Research-mrdgw-all-060624,06062024,HMAAAA05,Lung,+
259,5-YCAB67,Research-mrdgw-all-060624,06062024,YCAB67,Lung,+
...,...,...,...,...,...,...
1446,3-ZMB009B,batch_270824,from_ECD_WGS,ZMB009B,Breast,-
1447,6-UHAA48,batch_270824,from_ECD_WGS,UHAA48,Lung,-
1448,5-ZMG138C,batch_270824,from_ECD_WGS,ZMG138C,Gastric,-
1449,4-ZMG096C,batch_270824,from_ECD_WGS,ZMG096C,Gastric,-
