In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pathlib
from tqdm import tqdm
from feature_class import *
from helper_functions import *
import ot
import argparse

##### input args
# Version tag of the reference statistics read from feature_drift_reference/.
reference_version = "20240925"
# Name of the run folder (under the WGS feature root) to be checked for drift.
data_version = "RUN_metadata_test"

# Root folder that holds one sub-folder per data_version. The default is the
# original author-local location; export WGS_FEATURES_ROOT to run this notebook
# on another machine without editing the code.
wgs_features_root = os.environ.get(
    "WGS_FEATURES_ROOT", "/Users/hieunguyen/data/WGS_features"
).rstrip("/")
path_to_feature_dir = f"{wgs_features_root}/{data_version}/feature"
path_to_metadata = f"{wgs_features_root}/{data_version}/metadata/metadata.xlsx"

# Feature container for this run; it exposes match_metadata and the
# generate_*_matrix() builders used further down in this cell.
obj = WGS_GW_features(path_to_feature_dir=path_to_feature_dir,
                      path_to_metadata=path_to_metadata)

# Work on a copy of the matched metadata and pick out the control samples.
batch_metadata = obj.match_metadata.copy()
is_control = batch_metadata["Label"] == "Control"
control_samples = batch_metadata.loc[is_control, "SampleID"].unique()

##### Feature drift is measured on healthy control samples only, so each
##### feature matrix is restricted to the control SampleID columns.
flendf = obj.generate_flen_matrix()[control_samples]
emdf = obj.generate_em_matrix()[control_samples]
nucdf = obj.generate_nuc_matrix()[control_samples]

# Row-wise median (one value per feature) across the selected control samples.
median_flendf, median_emdf, median_nucdf = (
    feature_matrix.median(axis=1) for feature_matrix in (flendf, emdf, nucdf)
)

##### keep this path default, the feature_drift_reference always goes with the repo
ot_ref_dir = f"feature_drift_reference/OT/{reference_version}"
ape_ref_dir = f"feature_drift_reference/APE/{reference_version}"

# Reference barycenters used for the optimal-transport (OT) distances.
flen_barycenter = pd.read_csv(f"{ot_ref_dir}/flen_barycenter.csv")
em_barycenter = pd.read_csv(f"{ot_ref_dir}/em_barycenter.csv")
nuc_barycenter = pd.read_csv(f"{ot_ref_dir}/nuc_barycenter.csv")

# Reference per-feature medians used for the absolute percentage error (APE).
median_ref_flendf = pd.read_csv(f"{ape_ref_dir}/median_flendf.csv")
median_ref_emdf = pd.read_csv(f"{ape_ref_dir}/median_emdf.csv")
median_ref_nucdf = pd.read_csv(f"{ape_ref_dir}/median_nucdf.csv")

##### OT dist
def _ot_dist_table(featdf, barycenter, **ot_kwargs):
    """Return one row per sample with its OT distance to a reference barycenter.

    Parameters
    ----------
    featdf : pandas.DataFrame
        Feature matrix whose column labels are the sample IDs.
    barycenter : numpy.ndarray
        Reference barycenter, forwarded unchanged to
        ``calculate_ot_distance_to_ref`` for every sample.
    **ot_kwargs
        Extra keyword arguments forwarded to ``calculate_ot_distance_to_ref``
        (e.g. ``n=256``). When none are given the helper's own defaults apply.

    Returns
    -------
    pandas.DataFrame
        Columns ``SampleID`` and ``dist_to_ref`` with a default RangeIndex.
    """
    # Build the frame from a dict so the column is always called "SampleID",
    # whatever name the columns Index of featdf carries.
    distdf = pd.DataFrame({"SampleID": featdf.columns.to_numpy()})
    distdf["dist_to_ref"] = distdf["SampleID"].apply(
        lambda sample_id: calculate_ot_distance_to_ref(
            sample_id, barycenter, featdf, **ot_kwargs
        )
    )
    return distdf


# The barycenter arrays are extracted once per feature type instead of once per
# sample. flen relies on the default `n` of calculate_ot_distance_to_ref.
flen_distdf = _ot_dist_table(flendf, flen_barycenter["flen_barycenter"].to_numpy())
em_distdf = _ot_dist_table(emdf, em_barycenter["em_barycenter"].to_numpy(), n=256)
nuc_distdf = _ot_dist_table(nucdf, nuc_barycenter["nuc_barycenter"].to_numpy(), n=601)

##### APE
def _ape_table(feat, observed, reference):
    """Build a per-feature absolute percentage error (APE) table.

    APE is computed element-wise and positionally as
    ``abs(observed - reference) / reference``; the three inputs must therefore
    have the same length and the same feature order.

    Parameters
    ----------
    feat : array-like
        Feature labels, one per row of the output.
    observed : array-like
        Median feature values of the current batch.
    reference : array-like
        Median feature values of the reference. Zeros are not filtered here;
        a zero reference yields inf/NaN in the ``ape`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``feat`` and ``ape`` with a default RangeIndex.
    """
    observed = np.asarray(observed)
    reference = np.asarray(reference)
    # Keep division by a zero reference silent, as it is for pandas arithmetic.
    with np.errstate(divide="ignore", invalid="ignore"):
        ape = np.abs(observed - reference) / reference
    # Plain arrays give a fresh RangeIndex directly. The previous
    # `.reset_index().drop("index", axis=1)` raised "cannot insert feat,
    # already exists" whenever the index of the median Series was named "feat".
    return pd.DataFrame({"feat": np.asarray(feat), "ape": ape})


# flen has too many median 0 features, remove them before calculating APE
# to_frame(name=...) names the column regardless of the Series' own name, and
# rename_axis guarantees that reset_index() produces a "feat" column.
ape_flendf = median_flendf.to_frame(name="median_flen").rename_axis("feat")
ape_flendf["ref_median_flen"] = median_ref_flendf["0"].to_numpy()
nonzero_flen = (ape_flendf["median_flen"] != 0) & (ape_flendf["ref_median_flen"] != 0)
ape_flendf = ape_flendf[nonzero_flen].reset_index()

ape_flen = _ape_table(ape_flendf["feat"],
                      ape_flendf["median_flen"],
                      ape_flendf["ref_median_flen"])
ape_em = _ape_table(median_emdf.index, median_emdf, median_ref_emdf["0"])
ape_nuc = _ape_table(median_nucdf.index, median_nucdf, median_ref_nucdf["0"])


100%|██████████| 32/32 [00:00<00:00, -554.90it/s]
100%|██████████| 32/32 [00:00<00:00, 896.48it/s]
100%|██████████| 32/32 [00:00<00:00, 1088.55it/s]


In [40]:
# Display the per-feature APE table (columns `feat`, `ape`) for the nuc features.
ape_nuc

Unnamed: 0_level_0,feat,ape
feat,Unnamed: 1_level_1,Unnamed: 2_level_1
-300,-300,0.016532
-299,-299,0.014802
-298,-298,0.014345
-297,-297,0.025594
-296,-296,0.006190
...,...,...
296,296,0.016214
297,297,0.024375
298,298,0.024265
299,299,0.003756
