In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import warnings 
import pathlib 
import os 
import ot
import openpyxl
from openpyxl.styles import PatternFill
from tqdm import tqdm
warnings.filterwarnings("ignore")

# --- Configuration: project name and input/output locations -----------------
project = "ecd_wgs_enriched_features"
inputdir = f"/Volumes/HNSD01/storage/{project}"
outputdir = f"/Volumes/HNSD01/outdir/{project}"

path_to_00_output = os.path.join(outputdir, "00_output")
path_to_01_output = os.path.join(outputdir, "01_output")
path_to_02_output = os.path.join(outputdir, "02_output")
# Create the output directory in-process instead of shelling out to `mkdir -p`:
# no shell quoting issues with the interpolated path, and failures raise.
os.makedirs(path_to_02_output, exist_ok=True)

# --- Collect the per-run average OT distance tables -------------------------
# Sorted so the concatenation order (and therefore tie order after sorting)
# does not depend on the filesystem's directory listing order.
input_files = sorted(pathlib.Path(path_to_01_output).glob("*/avg_OT_dist.xlsx"))
if not input_files:
    raise FileNotFoundError(
        f"No '*/avg_OT_dist.xlsx' files found under {path_to_01_output}"
    )

# Accumulate the frames and concatenate once; concatenating inside the loop
# copies the growing frame on every iteration.
per_run_frames = []
for file in tqdm(input_files):
    tmpdf = pd.read_excel(file)
    # The parent directory name encodes "<panel token 1>_<panel token 2>_<strategy>".
    run_name = file.parent.name
    filenames = run_name.split("_")
    panel = "_".join(filenames[0:2])
    # Strip only the leading "<panel>_" prefix; an unbounded replace would also
    # delete later occurrences of the same text inside the strategy name.
    strategy = run_name.replace(f"{panel}_", "", 1)
    tmpdf["panel"] = panel
    tmpdf["strategy"] = strategy
    per_run_frames.append(tmpdf)
df = pd.concat(per_run_frames, axis = 0)

# Reference table computed on the full data set, tagged as panel/strategy "full".
full_avg_ot_dist = pd.read_excel(os.path.join(path_to_00_output, "avg_OT_dist.xlsx"))
full_avg_ot_dist["panel"] = "full"
full_avg_ot_dist["strategy"] = "full"

# Fills are loop-invariant, so they are created once up front.
highlight_fill = PatternFill(start_color="FFFF00", end_color="FFFF00", fill_type="solid")
highlight_fill2 = PatternFill(start_color="FFCCCC", end_color="FFCCCC", fill_type="solid")
lung_tags = ("Lung", "LUNG", "lung")

# --- One summary workbook (plain + highlighted) per feature type ------------
summary_avg_dist_ot = dict()
for feature_type in df["feature"].unique().tolist():
    tmpdf = pd.concat(
        [
            df[df["feature"] == feature_type],
            full_avg_ot_dist[full_avg_ot_dist["feature"] == feature_type],
        ],
        axis = 0,
    )
    summary_avg_dist_ot[feature_type] = tmpdf.sort_values(by = "avg_dist_full", ascending = False).copy()

    plain_path = os.path.join(path_to_02_output, f"summary_avg_OT_dist_{feature_type}.xlsx")
    highlight_path = os.path.join(path_to_02_output, f"summary_avg_OT_dist_{feature_type}.highlight.xlsx")
    summary_avg_dist_ot[feature_type].to_excel(plain_path, index = False)

    # Re-open the plain workbook to colour rows, then save it under a new name.
    wb = openpyxl.load_workbook(plain_path)
    ws = wb.active

    for row in ws.iter_rows(min_row=2, max_row=ws.max_row):
        # NOTE(review): index 3 is assumed to be the "panel" column, which
        # depends on the column layout of avg_OT_dist.xlsx - confirm.
        panel_value = row[3].value
        if not isinstance(panel_value, str):
            # Empty or non-text cell: nothing to match, and `in` would raise.
            continue
        if panel_value == "full":
            for cell in row:
                cell.fill = highlight_fill
        # Checked independently of the "full" test, so a lung match wins.
        if any(tag in panel_value for tag in lung_tags):
            for cell in row:
                cell.fill = highlight_fill2
    wb.save(highlight_path)


100%|██████████| 125/125 [00:00<00:00, 405.04it/s]


In [2]:
# Fragment-length entropy on the full data set: mean(Cancer) - mean(Control).
full_flen_entropydf = pd.read_excel(os.path.join(path_to_00_output, "fragment_length_entropy.xlsx"))
flen_mean_entropy_by_label = full_flen_entropydf.groupby("Label")["entropy"].mean()
full_diff_entropy = flen_mean_entropy_by_label.Cancer - flen_mean_entropy_by_label.Control
full_diff_entropy

-0.024174491109321905

In [3]:
# Summarise the Cancer-vs-Control difference in mean fragment-length entropy
# for every per-run result, add the full-data reference row, and export a
# plain and a highlighted workbook.
input_entropy_files = [item for item in pathlib.Path(path_to_01_output).glob("*/fragment_length_entropy.xlsx")]

# Collect one record per run and build the frame once, rather than
# concatenating a growing DataFrame on every iteration.
entropy_records = []
for file in tqdm(input_entropy_files):
    tmp_entropydf = pd.read_excel(file)
    # Group once and reuse the per-label means.
    mean_entropy_by_label = tmp_entropydf.groupby("Label")["entropy"].mean()
    diff_entropy = mean_entropy_by_label.Cancer - mean_entropy_by_label.Control
    # The parent directory name encodes "<panel token 1>_<panel token 2>_<strategy>".
    run_name = file.parent.name
    filenames = run_name.split("_")
    panel = "_".join(filenames[0:2])
    # Strip only the leading "<panel>_" prefix, not later repeats of it.
    strategy = run_name.replace(f"{panel}_", "", 1)
    entropy_records.append({
        "panel": panel,
        "strategy": strategy,
        "diff_entropy_flen": diff_entropy,
        "abs_diff_entropy_flen": abs(diff_entropy),
    })

# Reference row from the full data set; `full_diff_entropy` must come from the
# fragment-length cell (the name is reused for end-motif entropy elsewhere).
entropy_records.append({
    "panel": "full",
    "strategy": "full",
    "diff_entropy_flen": full_diff_entropy,
    "abs_diff_entropy_flen": abs(full_diff_entropy),
})
diff_entropydf = pd.DataFrame(
    entropy_records,
    columns=["panel", "strategy", "diff_entropy_flen", "abs_diff_entropy_flen"],
)
diff_entropydf = diff_entropydf.sort_values(by = "abs_diff_entropy_flen", ascending = False)

plain_path = os.path.join(path_to_02_output, "summary_diff_entropy_flen.xlsx")
diff_entropydf.to_excel(plain_path, index = False)
wb = openpyxl.load_workbook(plain_path)
ws = wb.active

# Both fills are defined here so the cell does not depend on variables left
# behind by another cell's loop body.
highlight_fill = PatternFill(start_color="FFFF00", end_color="FFFF00", fill_type="solid")
highlight_fill2 = PatternFill(start_color="FFCCCC", end_color="FFCCCC", fill_type="solid")
lung_tags = ("Lung", "LUNG", "lung")

for row in ws.iter_rows(min_row=2, max_row=ws.max_row):
    # Column order is fixed by the DataFrame above: panel, strategy, ...
    panel_value = row[0].value
    strategy_value = row[1].value
    if panel_value == "full":
        for cell in row:
            cell.fill = highlight_fill
    # Guard against empty/non-text cells, for which `in` would raise TypeError.
    if isinstance(strategy_value, str) and any(tag in strategy_value for tag in lung_tags):
        for cell in row:
            cell.fill = highlight_fill2
wb.save(os.path.join(path_to_02_output, "summary_diff_entropy_flen_highlight.xlsx"))

100%|██████████| 125/125 [00:00<00:00, 301.27it/s]


In [4]:
# End-motif entropy on the full data set: mean(Cancer) - mean(Control).
full_em_entropydf = pd.read_excel(os.path.join(path_to_00_output, "end_motif_entropy.xlsx"))
em_mean_entropy_by_label = full_em_entropydf.groupby("Label")["entropy"].mean()
full_diff_entropy = em_mean_entropy_by_label.Cancer - em_mean_entropy_by_label.Control
full_diff_entropy

0.0046004873420981696

In [5]:
# Summarise the Cancer-vs-Control difference in mean end-motif entropy for
# every per-run result, add the full-data reference row, and export a plain
# and a highlighted workbook.
input_entropy_files = [item for item in pathlib.Path(path_to_01_output).glob("*/end_motif_entropy.xlsx")]

# Collect one record per run and build the frame once, rather than
# concatenating a growing DataFrame on every iteration.
entropy_records = []
for file in tqdm(input_entropy_files):
    tmp_entropydf = pd.read_excel(file)
    # Group once and reuse the per-label means.
    mean_entropy_by_label = tmp_entropydf.groupby("Label")["entropy"].mean()
    diff_entropy = mean_entropy_by_label.Cancer - mean_entropy_by_label.Control
    # The parent directory name encodes "<panel token 1>_<panel token 2>_<strategy>".
    run_name = file.parent.name
    filenames = run_name.split("_")
    panel = "_".join(filenames[0:2])
    # Strip only the leading "<panel>_" prefix, not later repeats of it.
    strategy = run_name.replace(f"{panel}_", "", 1)
    entropy_records.append({
        "panel": panel,
        "strategy": strategy,
        "diff_entropy_em": diff_entropy,
        "abs_diff_entropy_em": abs(diff_entropy),
    })

# Reference row from the full data set; `full_diff_entropy` must come from the
# end-motif cell (the name is reused for fragment-length entropy elsewhere).
entropy_records.append({
    "panel": "full",
    "strategy": "full",
    "diff_entropy_em": full_diff_entropy,
    "abs_diff_entropy_em": abs(full_diff_entropy),
})
diff_entropydf = pd.DataFrame(
    entropy_records,
    columns=["panel", "strategy", "diff_entropy_em", "abs_diff_entropy_em"],
)
diff_entropydf = diff_entropydf.sort_values(by = "abs_diff_entropy_em", ascending = False)

plain_path = os.path.join(path_to_02_output, "summary_diff_entropy_EM.xlsx")
diff_entropydf.to_excel(plain_path, index = False)

wb = openpyxl.load_workbook(plain_path)
ws = wb.active

# Both fills are defined here so the cell does not depend on variables left
# behind by another cell's loop body.
highlight_fill = PatternFill(start_color="FFFF00", end_color="FFFF00", fill_type="solid")
highlight_fill2 = PatternFill(start_color="FFCCCC", end_color="FFCCCC", fill_type="solid")
lung_tags = ("Lung", "LUNG", "lung")

for row in ws.iter_rows(min_row=2, max_row=ws.max_row):
    # Column order is fixed by the DataFrame above: panel, strategy, ...
    panel_value = row[0].value
    strategy_value = row[1].value
    if panel_value == "full":
        for cell in row:
            cell.fill = highlight_fill
    # Guard against empty/non-text cells, for which `in` would raise TypeError.
    if isinstance(strategy_value, str) and any(tag in strategy_value for tag in lung_tags):
        for cell in row:
            cell.fill = highlight_fill2
wb.save(os.path.join(path_to_02_output, "summary_diff_entropy_EM_highlight.xlsx"))

100%|██████████| 125/125 [00:00<00:00, 317.09it/s]
