In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pathlib
from tqdm import tqdm
from helper_functions import *
# Hook tqdm into pandas so that `.progress_apply` is available on Series /
# DataFrame objects later in the notebook.
tqdm.pandas()

# ----- run identifiers -----
PROJECT = "WGS_feature_dist_distance"
output_version = "20240906"
data_version = "20240807"

# ----- input / output roots -----
maindir = "/Users/hieunguyen/data/WGS_features"
outdir = "/Users/hieunguyen/data/outdir"

# Output layout: <outdir>/<PROJECT>/<output_version>/I01_output
path_to_main_output = os.path.join(outdir, PROJECT, output_version)
path_to_01_output = os.path.join(path_to_main_output, "I01_output")

# Create the folder (and any missing parents); no error if it already exists.
os.makedirs(path_to_01_output, exist_ok=True)

##### Load the sample metadata and the feature matrix for the chosen feature.
path_to_metadata = "/Users/hieunguyen/data/metadata/metadata_nonBS.csv"
metadata = pd.read_csv(path_to_metadata)

feature_name = "EM_FLEN"
# The feature table is read from the I01 output folder; its first column is
# used as the index and its column labels are matched against SampleID below.
maindf = pd.read_csv(os.path.join(path_to_01_output, f"{feature_name}.csv"), index_col=[0])

##### Filter metadata: keep only samples that have a column in the feature
##### matrix, and remove runs with suspected feature drift.
runs_with_suspected_drift = ["R5434", "R5451", "R5601"]
has_feature_column = metadata["SampleID"].isin(maindf.columns)
# `~mask` negates the boolean mask element-wise (instead of `mask == False`).
from_drifted_run = metadata["Run"].isin(runs_with_suspected_drift)
metadata = metadata[has_feature_column & ~from_drifted_run]

# Runs whose samples make up the training set; every other remaining run is
# treated as testing data.
training_run = [
    "R5044", "R5063", "R5083", "R5097", "R5119", "R5151", "R5167",
    "R5168", "R5169", "R5170", "R5184", "R5201", "R5219", "R5253",
]

# Row-wise boolean masks over the (already filtered) metadata table.
in_training_run = metadata["Run"].isin(training_run)
is_control = metadata["Label"] == "Control"

# Training reference: control samples from the training runs.
training_control_samples = metadata.loc[in_training_run & is_control, "SampleID"].unique()
# Testing set: every sample (any label) from a non-training run.
testing_samples = metadata.loc[~in_training_run, "SampleID"].unique()

# Column subsets of the feature matrix; copied so that later edits do not
# touch maindf.
controldf_train = maindf[training_control_samples].copy()
testdf = maindf[testing_samples].copy()

##### Barycenter of the training control samples; it is the reference that
##### testing samples are compared against further down.
# NOTE(review): calculate_barycenter is not defined in this file -- presumably
# it comes from the `helper_functions` star import; confirm.
training_control_barycenter = calculate_barycenter(controldf_train.to_numpy())

##### Ground transportation cost matrix over an m x n integer grid.
# NOTE(review): the grid size is hard-coded (originally marked "Example
# dimensions"); presumably m * n should equal the number of rows of maindf --
# confirm against the feature definition.
m, n = 171, 256
x, y = np.arange(m), np.arange(n)

# Grid coordinates; with numpy's default "xy" indexing both outputs have
# shape (n, m).
xx, yy = np.meshgrid(x, y)

# One (x, y) row per grid point, i.e. m * n rows and 2 columns.
coordinates = np.c_[xx.ravel(), yy.ravel()]

# Pairwise cost between all grid points.
# NOTE(review): `ot` is not imported explicitly in this file (presumably it
# arrives through the star import). The result is presumably a dense
# (m * n) x (m * n) matrix -- about 1.9e9 entries for 171 x 256 -- so check
# the memory footprint and the default metric of ot.dist before running.
M = ot.dist(coordinates, coordinates)



In [11]:
##### Calculate the OT distance from each testing sample to the barycenter of
##### the training control samples.
# Annotation columns only, with exact duplicate rows removed first: a repeated
# metadata row would otherwise repeat the sample in resdf and recompute its
# (expensive) OT distance.
sample_annotations = metadata[["SampleID", "Label", "Run"]].drop_duplicates()

resdf = pd.DataFrame(data=testing_samples, columns=["SampleID"])
resdf = resdf.merge(sample_annotations, on="SampleID")

# NOTE(review): calculate_ot_distance_to_ref is not defined in this file --
# presumably it comes from the `helper_functions` star import. It receives the
# sample ID, the reference barycenter, the testing feature table, the number
# of rows of maindf as `n`, and the ground cost matrix `M`.
# The lambda argument is named `sample_id` so it does not shadow the
# module-level `x` grid array.
resdf["dist_to_ref"] = resdf["SampleID"].progress_apply(
    lambda sample_id: calculate_ot_distance_to_ref(
        sample_id, training_control_barycenter, testdf, n=maindf.shape[0], M=M
    )
)