In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pathlib
from tqdm import tqdm
from feature_class import *
from helper_functions import *
import ot
import argparse

##### input args
# reference_version: version tag of the OT reference files that are read from
#   feature_drift_reference/OT/<reference_version>/ further down in this cell.
# data_version: name of the run whose feature and metadata folders are loaded.
reference_version = "20240925"
data_version = "RUN_metadata_test"

# Root directory that holds one sub-directory per data version. It can be
# overridden with the WGS_FEATURES_DIR environment variable so the notebook runs
# on other machines; when the variable is unset, the original hard-coded location
# is used and the resulting paths are exactly the same as before.
wgs_features_root = os.environ.get("WGS_FEATURES_DIR", "/Users/hieunguyen/data/WGS_features")
path_to_feature_dir = f"{wgs_features_root}/{data_version}/feature"
path_to_metadata = f"{wgs_features_root}/{data_version}/metadata/metadata.xlsx"

# Feature container for this run.
# NOTE(review): WGS_GW_features is presumably provided by the `feature_class`
# star-import; confirm and consider importing it explicitly.
obj = WGS_GW_features(
    path_to_feature_dir=path_to_feature_dir,
    path_to_metadata=path_to_metadata,
)

# Take a private copy of the metadata matched by the feature object, then collect
# the unique sample IDs whose "Label" is "Control".
batch_metadata = obj.match_metadata.copy()
control_samples = batch_metadata.loc[batch_metadata["Label"] == "Control", "SampleID"].unique()

##### Feature drift is measured on healthy control samples only: each feature
##### matrix is restricted to the control-sample columns as soon as it is built,
##### and its per-row median across those samples is computed right next to it.
flendf = obj.generate_flen_matrix()[control_samples]
median_flendf = flendf.median(axis="columns")

emdf = obj.generate_em_matrix()[control_samples]
median_emdf = emdf.median(axis="columns")

nucdf = obj.generate_nuc_matrix()[control_samples]
median_nucdf = nucdf.median(axis="columns")

##### keep this path default, the feature_drift_reference always goes with the repo
# OT references: one barycenter CSV per feature, versioned by `reference_version`.
ot_reference_dir = f"feature_drift_reference/OT/{reference_version}"
flen_barycenter = pd.read_csv(f"{ot_reference_dir}/flen_barycenter.csv")
em_barycenter = pd.read_csv(f"{ot_reference_dir}/em_barycenter.csv")
nuc_barycenter = pd.read_csv(f"{ot_reference_dir}/nuc_barycenter.csv")

# APE references: reference medians for the same three features.
# `index=False` is a `DataFrame.to_csv` option; `pd.read_csv` has no `index`
# parameter and raises TypeError when given one, so the keyword is dropped here.
# NOTE(review): if these CSVs were written *with* an index column, pass
# `index_col=0` instead -- confirm against the files in the repo.
# NOTE(review): unlike the OT references above, these paths are not versioned by
# `reference_version` -- confirm that this is intentional.
median_ref_flendf = pd.read_csv("feature_drift_reference/APE/median_flendf.csv")
median_ref_emdf = pd.read_csv("feature_drift_reference/APE/median_emdf.csv")
median_ref_nucdf = pd.read_csv("feature_drift_reference/APE/median_nucdf.csv")

##### OT dist
def _ot_dist_to_ref(featuredf, barycenter, **ot_kwargs):
    """Build a per-sample table of distances to a reference barycenter.

    Parameters
    ----------
    featuredf : pandas.DataFrame
        Feature matrix with one column per sample; the column labels become the
        "SampleID" column of the result.
    barycenter : pandas.Series
        Reference barycenter values. Converted to a NumPy array once, instead of
        once per sample.
    **ot_kwargs
        Extra keyword arguments forwarded unchanged to
        `calculate_ot_distance_to_ref` (e.g. `n`). When none are given, that
        function's own defaults apply.

    Returns
    -------
    pandas.DataFrame
        Two columns: "SampleID" and "dist_to_ref".
    """
    # Hoisted out of the per-sample lambda: the reference does not change per sample.
    reference = barycenter.to_numpy()
    distdf = pd.DataFrame(data=featuredf.columns, columns=["SampleID"])
    distdf["dist_to_ref"] = distdf["SampleID"].apply(
        lambda sample_id: calculate_ot_distance_to_ref(sample_id, reference, featuredf, **ot_kwargs)
    )
    return distdf


# One distance table per feature. `n` is passed only where the original call
# passed it, so the flen call keeps relying on the helper's default.
# NOTE(review): `n` presumably is the number of bins of the feature vector
# (256 for em, 601 for nuc) -- confirm in helper_functions.
flen_distdf = _ot_dist_to_ref(flendf, flen_barycenter["flen_barycenter"])
em_distdf = _ot_dist_to_ref(emdf, em_barycenter["em_barycenter"], n=256)
nuc_distdf = _ot_dist_to_ref(nucdf, nuc_barycenter["nuc_barycenter"], n=601)




100%|██████████| 32/32 [00:00<00:00, 811.36it/s]
100%|██████████| 32/32 [00:00<00:00, 903.09it/s]
100%|██████████| 32/32 [00:00<00:00, 1120.72it/s]


In [15]:
# Display the per-sample `dist_to_ref` table built from the `nuc` feature matrix.
nuc_distdf

Unnamed: 0,SampleID,dist_to_ref
0,ZK0AAAD26NB,0.098772
1,ZK0AAAD20NB,0.098776
2,ZK0AAAD18NB,0.098432
3,ZK0DAAA33NB,0.098733
4,ZK0AAAD36NB,0.099248
5,ZK0AAAB72NB,0.098616
6,ZK0AAAD19NB,0.098755
7,ZK0AAAD39NB,0.09863
8,ZK0AAAB75NB,0.098609
9,ZK0AAAD31NB,0.098493
