In [1]:
import pandas as pd
import numpy as np
import pathlib
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import RocCurveDisplay
from sklearn.decomposition import NMF
import warnings
warnings.filterwarnings("ignore")
from sklearn.mixture import GaussianMixture
import os
import ot
import pickle
import argparse
import Levenshtein
import itertools
import umap
from sklearn.decomposition import PCA
from helper_functions import *
##### input args

##### Run configuration: project name, model release, data snapshot and output location.
PROJECT = "gs-mrd"
release_version = "10062024_modified_092024"
data_version = "20240924"
# NOTE(review): absolute, machine-specific path; consider making it configurable.
outdir = "/media/hieunguyen/HNSD_mini/outdir"
path_to_main_output = os.path.join(outdir, PROJECT, "data_analysis")
path_to_03_output = os.path.join(path_to_main_output, "03_output")
# Create the output directory in-process rather than through a shell command:
# no quoting problems with unusual paths, and a failure raises instead of being ignored.
os.makedirs(path_to_03_output, exist_ok=True)

##### Load every per-feature table of this data snapshot, keyed by file name without ".csv".
# Sorted so that the loading order does not depend on the file system.
files = sorted(pathlib.Path(os.path.join("all_samples", data_version)).glob("*.csv"))
featuredf = dict()
for file in files:
    feature_name = file.name.split(".csv")[0]
    featuredf[feature_name] = pd.read_csv(file)

metadata = pd.read_csv(f"./metadata/{data_version}/metadata.csv")

##### check if the SampleID and the metadata match
# Every row of the EM table must be a distinct sample that is present in the metadata.
# Together the two checks below pass in exactly the same situations as comparing
# the number of shared unique IDs with the number of EM rows, but they report
# which condition failed.
em_sample_ids = featuredf["EM"]["SampleID"]
assert em_sample_ids.is_unique, "Duplicated SampleID values in the EM feature table"
samples_without_metadata = em_sample_ids[~em_sample_ids.isin(metadata["SampleID"])]
assert samples_without_metadata.empty, (
    f"{len(samples_without_metadata)} EM sample(s) missing from metadata, "
    f"e.g. {samples_without_metadata.head().to_list()}"
)

# Visualizing the data: UMAP and PCA projections of each feature type

In [None]:
# Fixed seed so that the UMAP layout (and the saved figures) is the same on every run.
UMAP_RANDOM_STATE = 42


def _save_embedding_scatter(plotdf, x, y, hue, figname):
    """Draw a 2-D scatter plot coloured by `hue` and save it as SVG in `path_to_03_output`.

    Parameters
    ----------
    plotdf : pandas.DataFrame
        Table holding the two embedding columns and the `hue` column.
    x, y : str
        Names of the columns plotted on the horizontal / vertical axis.
    hue : str
        Name of the column used to colour the points.
    figname : str
        File name of the SVG written to `path_to_03_output`.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(data = plotdf, x = x, y = y, hue = hue, ax = ax)
    # Put the legend outside the axes so it does not cover the points.
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    fig.tight_layout()
    fig.savefig(os.path.join(path_to_03_output, figname), format='svg')
    # Close the figure once it is on disk; otherwise every figure created in the
    # loop stays open. Remove this line to also display the figures inline.
    plt.close(fig)


umapdf = dict()
pcadf = dict()

for f in ["EM", "NUCLEOSOME", "FLEN"]:
    maindf = featuredf[f].copy()
    # All columns after the first one are used as features
    # (assumes SampleID is the first column -- TODO confirm).
    feature_matrix = maindf.iloc[:, 1:].to_numpy()

    umap_reducer = umap.UMAP(random_state=UMAP_RANDOM_STATE)
    pca_reducer = PCA(n_components=2)

    umap_res = umap_reducer.fit_transform(feature_matrix)
    pca_res = pca_reducer.fit_transform(feature_matrix)

    umapdf[f] = pd.DataFrame(umap_res, columns=["UMAP1", "UMAP2"])
    umapdf[f]["SampleID"] = maindf.SampleID.values

    pcadf[f]  = pd.DataFrame(pca_res, columns=["PCA1", "PCA2"])
    pcadf[f]["SampleID"] = maindf.SampleID.values

    # Attach the metadata columns (inner join: samples without metadata are dropped).
    umapdf[f] = umapdf[f].merge(metadata, on = "SampleID")
    pcadf[f] = pcadf[f].merge(metadata, on = "SampleID")

    # One figure per (embedding, grouping column) pair.
    for method, embeddingdf in (("UMAP", umapdf[f]), ("PCA", pcadf[f])):
        for hue in ("true_label", "sheet"):
            figname = f"{method}_grouped_by_{hue}.{f}.svg"
            _save_embedding_scatter(embeddingdf, f"{method}1", f"{method}2", hue, figname)


In [None]:
path_to_main_src = "/media/hieunguyen/HNSD01/src/gs-mrd"
path_to_model_files = f"{path_to_main_src}/model_files/{release_version}"
motif_order = pd.read_csv("motif_order.csv")["motif_order"].to_list()

##### load cut-off values for all features
cutoffdf = pd.read_csv(os.path.join(path_to_model_files, "cutoff.csv"))


def _read_healthy_reference(feature, key_column):
    """Read `Healthy_reference_<feature>.csv` from the model directory.

    The columns of the file are renamed to `[key_column, "Healthy"]`,
    so the file is expected to have exactly two columns.
    """
    table = pd.read_csv(f"{path_to_model_files}/Healthy_reference_{feature}.csv")
    table.columns = [key_column, "Healthy"]
    return table


##### load reference values for all features (healthy references)
em_ref = _read_healthy_reference("EM", "motif")
flen_ref = _read_healthy_reference("FLEN", "FLEN")
nuc_ref = _read_healthy_reference("NUCLEOSOME", "Nucleosome")

# `ref` holds independent copies, keyed by the same names as `featuredf`.
ref = {
    "EM": em_ref.copy(),
    "FLEN": flen_ref.copy(),
    "NUCLEOSOME": nuc_ref.copy(),
}


In [None]:
all_samples = featuredf["EM"].SampleID.to_list()
final_features = dict()

##### Deviation score: per sample, the sum over all features of |value - healthy reference|.
for f in ["EM", "FLEN", "NUCLEOSOME"]:
    sample_by_feature = featuredf[f].set_index("SampleID")
    # The reference is matched to the feature columns by position, not by name
    # (assumes the reference rows are in the same order as the columns -- TODO confirm).
    healthy_profile = ref[f]["Healthy"].to_numpy()
    if healthy_profile.shape[0] != sample_by_feature.shape[1]:
        raise ValueError(
            f"{f}: healthy reference has {healthy_profile.shape[0]} values "
            f"but the feature table has {sample_by_feature.shape[1]} feature columns"
        )
    # Vectorised over every row of this feature table. The score no longer depends
    # on the sample being listed in the EM table, so no sample is summed on raw values.
    input_scoredf = (
        sample_by_feature.sub(healthy_profile, axis = "columns")
        .abs()
        .sum(axis = "columns")
        .rename(f"{f}_score")
        .reset_index()
    )
    input_scoredf = input_scoredf.merge(metadata, on = "SampleID")
    final_features[f"{f}_score"] = input_scoredf

##### OT distance
for f in ["EM", "FLEN", "NUCLEOSOME"]:
    barycenter = pd.read_csv(f"{path_to_model_files}/Healthy_OT_{f}_baryl2.csv")
    bary_l2 = barycenter.baryl2.to_numpy()
    # Build the (feature x sample) table once per feature type instead of once per sample.
    # NOTE(review): assumes calculate_ot_distance_to_healthy_nuc only reads this table -- confirm.
    feature_by_sample = featuredf[f].set_index("SampleID").T
    n_features = featuredf[f].shape[1] - 1
    ot_scoredf = pd.DataFrame(data = all_samples, columns = ["SampleID"])
    ot_scoredf[f"OT_{f}"] = ot_scoredf["SampleID"].apply(
        lambda x: calculate_ot_distance_to_healthy_nuc(x,
                                                       bary_l2,
                                                       feature_by_sample,
                                                       n = n_features))
    ot_scoredf = ot_scoredf.merge(metadata, on = "SampleID")
    final_features[f"OT_{f}"] = ot_scoredf

# The IchorCNA table is used as is; it must provide the columns "SampleID" and "ichorCNA".
final_features["ichorCNA"] = featuredf["IchorCNA"]

##### Combine all scores into one table (one row per sample), then attach the metadata.
# Inner joins: a sample missing from any score table is dropped from the output.
outputdf = pd.DataFrame(data = metadata["SampleID"].to_list(), columns = ["SampleID"])
for feat, featdf in final_features.items():
    outputdf = outputdf.merge(featdf[["SampleID", feat]], on = "SampleID")

outputdf = outputdf.merge(metadata, on = "SampleID")