In [None]:
import pandas as pd
import numpy as np
import pathlib
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import RocCurveDisplay
from sklearn.decomposition import NMF
import warnings
warnings.filterwarnings("ignore")
from sklearn.mixture import GaussianMixture
import os
import ot
import pickle
import argparse
import Levenshtein
import itertools
from helper_functions import *

##### input data location (directory holding the metadata and per-feature CSV files)
inputdir = "/media/hieunguyen/HNHD01/raw_data/MRD_GW_v1_20250318/Metadata_Genome-wide_Version1_07.03.25"

PROJECT = "gs-mrd"
release_version = "20250319"

##### configurations/paths
path_to_main_src = "/media/hieunguyen/HNSD01/src/gs-mrd/v0.2"
# Model files are grouped per release; features live in a sub-directory of the release folder.
path_to_model_files = f"{path_to_main_src}/model_files/{release_version}"
path_to_save_features = f"{path_to_main_src}/model_files/{release_version}/features"

# Create the output directories (including missing parents) without shelling out:
# os.makedirs is portable, is not affected by spaces or shell metacharacters in
# the path, and raises an OSError if the directory cannot be created instead of
# failing silently like an ignored `mkdir -p` exit status.
os.makedirs(path_to_save_features, exist_ok=True)
os.makedirs(path_to_model_files, exist_ok=True)

##### read metadata in
# The Excel sheet is loaded under a "not_use" name and is not referenced again
# in the visible cells; the CSV is the metadata table used below.
metadata_xlsx_path = os.path.join(inputdir, "Metadata_Genome-wide Version1_07.03.25.xlsx")
metadata_csv_path = os.path.join(inputdir, "meta_full.csv")
metadata_not_use = pd.read_excel(metadata_xlsx_path)
metadata = pd.read_csv(metadata_csv_path)

##### read features in
# One CSV per feature type, stored as <feature name>.csv inside inputdir.
features = {}
for f in ("NUCLEOSOME", "FLEN", "EM", "IchorCNA"):
    feature_csv_path = os.path.join(inputdir, f"{f}.csv")
    features[f] = pd.read_csv(feature_csv_path)

# Give the IchorCNA table explicit column names (the table must have exactly two columns).
features["IchorCNA"].columns = ["SampleID", "ichorCNA"]

##### generate sample lists for each class
# Every dict maps a class label (a value of the "Cancer" column) to a list of SampleIDs:
#   samplelist          -> all samples of the class
#   train_samplelist    -> samples whose "Set" is "train"
#   test_samplelist     -> samples whose "Set" is "test"
#   validate_samplelist -> samples whose "Set" is "validate"
samplelist = {}
train_samplelist = {}
test_samplelist = {}
validate_samplelist = {}

for label in metadata["Cancer"].unique():
    # Boolean mask selecting the rows that belong to the current class.
    in_class = metadata["Cancer"] == label

    samplelist[label] = metadata.loc[in_class, "SampleID"].tolist()
    train_samplelist[label] = metadata.loc[in_class & (metadata["Set"] == "train"), "SampleID"].tolist()
    test_samplelist[label] = metadata.loc[in_class & (metadata["Set"] == "test"), "SampleID"].tolist()
    validate_samplelist[label] = metadata.loc[in_class & (metadata["Set"] == "validate"), "SampleID"].tolist()

##### distance matrix based on edit distance of End motif 4bp
nucleotides = ['A', 'C', 'G', 'T']
# All 4**4 = 256 possible 4-mers over the nucleotide alphabet.
motifs = [''.join(p) for p in itertools.product(nucleotides, repeat=4)]

# Pairwise Levenshtein (edit) distance between every pair of 4-mer motifs.
# The values are collected into a numeric array in one pass rather than being
# written cell by cell into an empty (object-dtype) DataFrame, which is slow and
# leaves the data stored as generic Python objects.
pairwise_distance = np.array(
    [[Levenshtein.distance(motif1, motif2) for motif2 in motifs] for motif1 in motifs],
    dtype=np.int64,
)
distance_matrix = pd.DataFrame(pairwise_distance, index=motifs, columns=motifs)

# Cost matrix as an independent float64 array, so the in-place division below is
# valid and downstream numeric code receives a proper floating-point matrix.
M_EM = distance_matrix.to_numpy(dtype=float, copy=True)
# Rescale the costs: this divides by (max * 0.1), so the largest cost becomes 10.
# NOTE(review): if the intent was to scale into [0, 0.1], the expression should be
# `M_EM / M_EM.max() * 0.1` instead -- confirm before changing.
M_EM /= M_EM.max() * 0.1

# Empty container for the final features; nothing is added to it in this cell.
final_features = {}

In [23]:
# Show the class labels collected above (the unique values of the metadata "Cancer" column).
samplelist.keys()

dict_keys(['Healthy', 'Lung', 'Breast', 'HCC', 'Ovarian', 'CRC', 'Gastric'])

In [25]:
# Display the metadata table that was read from meta_full.csv.
metadata

Unnamed: 0,SampleID,Label,Set,Cancer,Run,Kit,Team run,System,Group (Early/Met)
0,ZK0AAAI82NB,Neg,train,Healthy,R5484,No UID,ECD,MGI,
1,ZK0AAAH81NB,Neg,train,Healthy,R5451,No UID,ECD,MGI,
2,ZK0CAAA56NB,Neg,train,Healthy,R5170,No UID,ECD,MGI,
3,ZK0AAAA16NB,Neg,train,Healthy,R5063,No UID,ECD,MGI,
4,ZK0CAAA72NB,Neg,train,Healthy,R5184,No UID,ECD,MGI,
...,...,...,...,...,...,...,...,...,...
1325,ZMC039NB,Pos,validate,CRC,R5169,No UID,ECD,MGI,Early
1326,ZYCAB05NB,Pos,validate,Ovarian,R5167,No UID,ECD,MGI,Early
1327,ZMB159NB,Pos,validate,Breast,R5170,No UID,ECD,MGI,Early
1328,ZMB059NB,Pos,validate,Breast,R5083,No UID,ECD,MGI,Early
