In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pathlib
from tqdm import tqdm

# ----- Input / output locations -------------------------------------------
# NOTE(review): these are absolute, machine-specific paths; point them at your
# own data and output roots before running this notebook elsewhere.
maindir = "/Users/hieunguyen/data/WGS_features"
outdir = "/Users/hieunguyen/data/outdir"
data_version = "20240822"
PROJECT = "WGS_feature_dist_distance"
output_version = "20240822"

# Everything this notebook writes goes to <outdir>/<PROJECT>/<output_version>/01_output.
path_to_main_output = os.path.join(outdir, PROJECT, output_version)
path_to_01_output = os.path.join(path_to_main_output, "01_output")

# Create the output directory (and any missing parents) directly from Python
# instead of shelling out to `mkdir -p`: no shell is spawned, so paths that
# contain spaces or shell metacharacters are handled correctly.
os.makedirs(path_to_01_output, exist_ok=True)

path_to_feature = os.path.join(maindir, data_version, "feature")
path_to_metadata = os.path.join(maindir, data_version, "metadata")

# Sample sheet; the cells below read its "SampleID" and "Label" columns.
# NOTE(review): the file name is dated 20240823 while data_version is 20240822
# -- confirm that this is the intended metadata file.
metadata = pd.read_excel(os.path.join(path_to_metadata, "metadata_WGS_20240823.xlsx"))

#####-----------------------------------------------------------------#####
##### Pre process the fragment length distribution feature
#####-----------------------------------------------------------------#####
# Builds one table with a row per "feat" value (1..301) and a column per
# sample, then writes it in full and split by metadata "Label".
feature_name = "flen"
all_flen_features = list(pathlib.Path(path_to_feature).glob("*/*_GWfeature_{}.csv".format(feature_name)))

maindf = pd.DataFrame(data = range(1, 302), columns = ["feat"])
for file in tqdm(all_flen_features):
    # The first CSV column becomes the index and is then restored as "feat".
    tmpdf = pd.read_csv(file, index_col = [0])[["freq"]].reset_index()
    # Sample id = second "-"-separated token of the file name part before the first "_".
    sampleid = file.name.split("_")[0].split("-")[1]
    tmpdf.columns = ["feat", sampleid]
    # Inner join: a "feat" value missing from any single file is dropped for all samples.
    maindf = maindf.merge(tmpdf, on = "feat")
maindf = maindf.set_index("feat")
maindf.to_csv(os.path.join(path_to_01_output, "{}.csv".format(feature_name)))

# Keep only samples that are listed in the metadata sheet.
known_samples = set(metadata["SampleID"])
maindf = maindf[[col for col in maindf.columns if col in known_samples]]

for input_class in metadata["Label"].unique():
    # Resolve the class membership once per label rather than once per column.
    class_samples = set(metadata.loc[metadata["Label"] == input_class, "SampleID"])
    selecteddf = maindf[[col for col in maindf.columns if col in class_samples]]
    selecteddf.to_csv(os.path.join(path_to_01_output, "{}.{}.csv".format(input_class, feature_name)))

# NOTE: `metadata` is narrowed here to samples that have a flen column; the
# cells further down use this filtered version.
metadata = metadata[metadata["SampleID"].isin(maindf.columns)]
metadata.to_csv(os.path.join(path_to_01_output, "metadata.filtered.{}.csv".format(feature_name)))



100%|██████████| 1471/1471 [00:05<00:00, 279.84it/s]


In [2]:
#####-----------------------------------------------------------------#####
##### Pre process the fragment length distribution feature
##### (151-row window of the raw counts, re-normalised per sample)
#####-----------------------------------------------------------------#####
feature_name = "flen"
all_flen_features = list(pathlib.Path(path_to_feature).glob("*/*_GWfeature_{}.csv".format(feature_name)))

# Positional row window taken from every file, and the number of rows it yields.
# NOTE(review): presumably rows 50..200 correspond to a fragment-length range
# of interest -- confirm against the feature file layout.
window_start, window_stop = 50, 201
n_bins = window_stop - window_start  # 151

maindf = pd.DataFrame(data = range(1, n_bins + 1), columns = ["feat"])
for file in tqdm(all_flen_features):
    sampleid = file.name.split("_")[0].split("-")[1]
    # Explicit positional slice plus copy, so the column assignments below
    # operate on an independent frame and not on a view of the parsed CSV.
    tmpdf = pd.read_csv(file, index_col = [0])[["count"]].iloc[window_start:window_stop].copy()
    # Raises ValueError if the file yields fewer than n_bins rows in the window.
    tmpdf["feat"] = range(1, n_bins + 1)
    tmpdf.columns = [sampleid, "feat"]
    # Normalise the windowed counts to sum to 1; the total is computed once
    # instead of once per element.
    tmpdf[sampleid] = tmpdf[sampleid] / tmpdf[sampleid].sum()
    maindf = maindf.merge(tmpdf, on = "feat")
maindf = maindf.set_index("feat")

# Written with the same "_151" suffix as the per-class files below, so the
# 301-row "flen.csv" produced by the previous cell is no longer overwritten.
maindf.to_csv(os.path.join(path_to_01_output, "{}_151.csv".format(feature_name)))

# Keep only samples that are listed in the metadata sheet.
known_samples = set(metadata["SampleID"])
maindf = maindf[[col for col in maindf.columns if col in known_samples]]

for input_class in metadata["Label"].unique():
    # Resolve the class membership once per label rather than once per column.
    class_samples = set(metadata.loc[metadata["Label"] == input_class, "SampleID"])
    selecteddf = maindf[[col for col in maindf.columns if col in class_samples]]
    selecteddf.to_csv(os.path.join(path_to_01_output, "{}.{}_151.csv".format(input_class, feature_name)))


100%|██████████| 1471/1471 [00:08<00:00, 183.86it/s]


In [3]:
#####-----------------------------------------------------------------#####
##### Pre process nucleosome footprint features
#####-----------------------------------------------------------------#####
# Builds one table with a row per "feat" value (-300..300) and a column per
# sample, then writes it in full and split by metadata "Label".
feature_name = "Nucleosome"
all_nuc_features = list(pathlib.Path(path_to_feature).glob("*/*_GWfeature_{}.csv".format(feature_name)))

maindf = pd.DataFrame(data = range(-300, 301), columns = ["feat"])
for file in tqdm(all_nuc_features):
    sampleid = file.name.split("_")[0].split("-")[1]
    # Transpose so the non-"Sample" columns become rows. The assignments below
    # only succeed when the file holds exactly one row and 601 such columns.
    tmpdf = pd.read_csv(file).set_index("Sample").T
    tmpdf["feat"] = range(-300, 301)
    tmpdf.columns = [sampleid, "feat"]
    # Inner join on the position key.
    maindf = maindf.merge(tmpdf, on = "feat")

maindf = maindf.set_index("feat")
maindf.to_csv(os.path.join(path_to_01_output, "{}.csv".format(feature_name)))

# Keep only samples that are listed in the metadata sheet.
known_samples = set(metadata["SampleID"])
maindf = maindf[[col for col in maindf.columns if col in known_samples]]

for input_class in metadata["Label"].unique():
    # Resolve the class membership once per label rather than once per column.
    class_samples = set(metadata.loc[metadata["Label"] == input_class, "SampleID"])
    selecteddf = maindf[[col for col in maindf.columns if col in class_samples]]
    selecteddf.to_csv(os.path.join(path_to_01_output, "{}.{}.csv".format(input_class, feature_name)))


100%|██████████| 1471/1471 [00:15<00:00, 96.75it/s]


In [4]:
#####-----------------------------------------------------------------#####
##### Pre process end motif
#####-----------------------------------------------------------------#####
# Builds one table with a row per 4-letter motif and a column per sample,
# then writes it in full and split by metadata "Label".
feature_name = "EM"

# All 4-letter motifs over A/T/G/C, first letter varying slowest.
bases = ["A", "T", "G", "C"]
all_motifs = [i + j + k + l for i in bases for j in bases for k in bases for l in bases]
maindf = pd.DataFrame(data = all_motifs, columns = ["feat"])

all_em_features = list(pathlib.Path(path_to_feature).glob("*/*_GWfeature_{}.csv".format(feature_name)))
for file in tqdm(all_em_features):
    sampleid = file.name.split("_")[0].split("-")[1]
    tmpdf = pd.read_csv(file)[["motif", "freq"]]
    tmpdf.columns = ["feat", sampleid]
    # Inner join: a motif missing from any single file is dropped for all samples.
    maindf = maindf.merge(tmpdf, on = "feat")
maindf = maindf.set_index("feat")
maindf.to_csv(os.path.join(path_to_01_output, "{}.csv".format(feature_name)))

# Keep only samples that are listed in the metadata sheet.
known_samples = set(metadata["SampleID"])
maindf = maindf[[col for col in maindf.columns if col in known_samples]]

for input_class in metadata["Label"].unique():
    # Resolve the class membership once per label rather than once per column.
    class_samples = set(metadata.loc[metadata["Label"] == input_class, "SampleID"])
    selecteddf = maindf[[col for col in maindf.columns if col in class_samples]]
    selecteddf.to_csv(os.path.join(path_to_01_output, "{}.{}.csv".format(input_class, feature_name)))
    


100%|██████████| 1471/1471 [00:04<00:00, 301.91it/s]
