# Introduction


Use this notebook as a `changelog` for defining the `REFERENCE` feature set used when measuring feature **drift**. 

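The first version is `20240925`. In this version, we use the features of the `training` data as the **reference** (the code below further restricts the reference set to training samples labelled `Control`). 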

## Version 20240925, optimal transport distance.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pathlib
from tqdm import tqdm
from feature_class import *
from helper_functions import *

# Version tags: `data_version` selects which feature/metadata snapshot is read,
# `reference_version` names the reference set written by this cell.
data_version = "20240924"
reference_version = "20240925"

# Inputs: per-sample feature directory and the metadata spreadsheet.
path_to_feature_dir = f"/Users/hieunguyen/data/WGS_features/{data_version}/feature"
path_to_metadata = f"/Users/hieunguyen/data/WGS_features/{data_version}/metadata/metadata_nonBS_200924.xlsx"

# Output directory for this reference version (relative to the working directory).
path_to_save_output = f"feature_drift_reference/OT/{reference_version}"
# Create the directory and any missing parents directly instead of shelling
# out to `mkdir -p`: no shell is spawned, it works on every platform, and a
# failure raises OSError rather than being silently ignored.
os.makedirs(path_to_save_output, exist_ok=True)

# `status.csv` is the last file written below, so its presence means this
# reference version was already fully generated and the work can be skipped.
if not os.path.exists(f"{path_to_save_output}/status.csv"):
    obj = WGS_GW_features(
        path_to_feature_dir=path_to_feature_dir,
        path_to_metadata=path_to_metadata,
    )

    metadata = obj.metadata.copy()

    # Keep only healthy control samples: rows flagged 1 in
    # "RUN_metadata train" whose "Label" is "Control".
    is_train = metadata["RUN_metadata train"] == 1
    is_control = metadata["Label"] == "Control"
    ref_metadata = metadata[is_train & is_control]

    # Record which samples make up this reference version.
    reference_table = ref_metadata[["SampleID", "Run", "RUN_metadata train"]]
    reference_table.to_csv(f"{path_to_save_output}/reference_samples.csv", index=False)
    ref_samples = ref_metadata["SampleID"].unique()

    print(f"There are {ref_metadata.shape[0]} reference samples in this version")

    # Build the three feature matrices, then select the reference-sample
    # columns from each (the matrices are indexed by sample ID on the column axis).
    flendf = obj.generate_flen_matrix()
    emdf = obj.generate_em_matrix()
    nucdf = obj.generate_nuc_matrix()

    ref_flendf = flendf[ref_samples].copy()
    ref_emdf = emdf[ref_samples].copy()
    ref_nucdf = nucdf[ref_samples].copy()

    # One barycenter per feature type; `n` is passed as the row count of the
    # corresponding reference matrix.
    # NOTE(review): `calculate_barycenter` is not defined in this cell
    # (presumably from `helper_functions`) — confirm what `n` is expected to be.
    flen_barycenter = calculate_barycenter(A=ref_flendf.to_numpy(), n=ref_flendf.shape[0])
    em_barycenter = calculate_barycenter(A=ref_emdf.to_numpy(), n=ref_emdf.shape[0])
    nuc_barycenter = calculate_barycenter(A=ref_nucdf.to_numpy(), n=ref_nucdf.shape[0])

    # Write each barycenter as a single-column CSV named after that column.
    for feature_name, barycenter in (
        ("flen", flen_barycenter),
        ("em", em_barycenter),
        ("nuc", nuc_barycenter),
    ):
        column = f"{feature_name}_barycenter"
        barycenter_table = pd.DataFrame(data=barycenter, columns=[column])
        barycenter_table.to_csv(f"{path_to_save_output}/{column}.csv", index=False)

    # Completion marker, written last so a partial run is never mistaken for a finished one.
    status_message = f"finished_generated_data_ref_version_{reference_version}"
    status_table = pd.DataFrame(data=[status_message], columns=["status"])
    status_table.to_csv(f"{path_to_save_output}/status.csv", index=False)
else:
    print(f"Reference version {reference_version} has been generated")
    exit()