# GS atlas results for CLOD samples

In [1]:
import pandas as pd
import numpy as np
import pathlib
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import pysam
import os
import argparse
import sys
from helper_functions import *
# Root directory that holds the outputs of the numbered pipeline steps.
outputdir = "./outputdir_02102023"

# These two settings are only used here to build the name of the
# run-specific sub-directory, e.g. "top500_Tissue_and_WBC".
topK = 500
atlas_sample_types = "Tissue,WBC"
run_subdir = "top{}_{}".format(topK, atlas_sample_types.replace(",", "_and_"))

# Step 03 has no run-specific sub-directory; every other step does.
path_to_03_output = os.path.join(outputdir, "03_output_noFDR")
path_to_04_output = os.path.join(outputdir, "04_output_noFDR", run_subdir)
path_to_05_output = os.path.join(outputdir, "05_output_noFDR", run_subdir)
path_to_06_output = os.path.join(outputdir, "06_output_noFDR", run_subdir)
path_to_07_output = os.path.join(outputdir, "07_output_noFDR", run_subdir)
path_to_08_output = os.path.join(outputdir, "08_output_noFDR", run_subdir)
path_to_10_output = os.path.join(outputdir, "10_output_noFDR", run_subdir)

# Step 13 is the output location of this notebook. os.makedirs avoids going
# through a shell (safe for paths containing spaces or shell metacharacters)
# and raises if the directory cannot be created instead of failing silently.
path_to_13_output = os.path.join(outputdir, "13_output_noFDR", run_subdir)
os.makedirs(path_to_13_output, exist_ok=True)

# The deconvolution results to analyse are the "*.deconvo.csv" files of step 10.
path_to_res = path_to_10_output
all_clod_res = list(pathlib.Path(path_to_res).glob("*.deconvo.csv"))

clod_metadata = pd.read_excel(os.path.join(outputdir, "cLOD_samples", "cLOD_metadata.xlsx"))

clod_bamfile = pd.read_csv(os.path.join(outputdir, "cLOD_samples", "BAM", "cLOD.csv"))
# reduced_filename: last path component of the BAM path, cut at ".deduplicated".
clod_bamfile["reduced_filename"] = clod_bamfile["bam_file"].apply(
    lambda x: x.split(".deduplicated")[0].split("/")[-1]
)

# Attach the BAM information to the metadata (metadata "Sample" == BAM table "SampleID").
clod_metadata = clod_metadata.merge(clod_bamfile, right_on="SampleID", left_on="Sample")

In [2]:
def _find_deconvo_result(reduced_filename, candidates):
    """Return the single result file that belongs to *reduced_filename*.

    A candidate matches when its file name, cut at ".deduplicated" and with
    "Sample_" removed, equals *reduced_filename*.

    Returns the matching path when exactly one candidate matches, otherwise
    the string "error" (no match or an ambiguous match).
    """
    matches = [
        item
        for item in candidates
        if item.name.split(".deduplicated")[0].replace("Sample_", "") == reduced_filename
    ]
    # Return the exactly matched file itself. A substring search could pick a
    # different file, e.g. "4-ABC" is contained in "34-ABC.deduplicated...".
    return matches[0] if len(matches) == 1 else "error"


clod_metadata["deconvo_results"] = clod_metadata["reduced_filename"].apply(
    lambda x: _find_deconvo_result(x, all_clod_res)
)

In [4]:
# Labels of the "TOO" column, in the row order of the combined table.
too_labels = ["Liver", "Breast", "Gastric", "Lung", "CRC", "WBC"]

# Start from a one-column frame of labels and inner-join the content of every
# result file onto it via the shared "TOO" column.
df = pd.DataFrame({"TOO": too_labels})
for file in all_clod_res:
    tmpdf = pd.read_csv(file, index_col=[0])
    df = pd.merge(df, tmpdf, left_on="TOO", right_on="TOO")

# Transpose so that every merged column becomes one row; reset_index() moves
# the former column labels into the first column, which is renamed "Sample".
df = df.set_index("TOO")
df = df.T
df = df.reset_index()
df.columns = ["Sample"] + too_labels

# Same key as in the metadata table: the name cut at ".deduplicated".
df["reduced_filename"] = df["Sample"].apply(
    lambda sample_name: sample_name.split(".deduplicated")[0]
)
res = clod_metadata.merge(df, left_on="reduced_filename", right_on="reduced_filename")

In [6]:
# Directory that receives the exported tables (going by its name, the input of the R plots).
path_to_data_rplot = "/datassd/hieunguyen/ECD/tumor_atlas_official/data_for_Rplot"

# Write the merged metadata + deconvolution table as CSV.
gs_atlas_csv = os.path.join(path_to_data_rplot, "Figure3_CLOD_samples_GS_atlas.csv")
res.to_csv(gs_atlas_csv)

In [None]:
# Draw one stacked bar chart per (spike-in rate, LABEL) pair and save it as a
# PNG file in path_to_13_output.
too_columns = ["Liver", "Breast", "Gastric", "Lung", "CRC", "WBC"]
for spike_in_rate in res["spike-in"].unique():
    for label in res.LABEL.unique():
        subset = res[(res["spike-in"] == spike_in_rate) & (res["LABEL"] == label)]
        if subset.empty:
            # The two loops walk the full cartesian product, so a pair may
            # have no samples at all; there is nothing to draw in that case.
            continue
        plot = subset[["Sample_x"] + too_columns].set_index("Sample_x").plot(kind="bar", stacked=True)
        # Address the axes/figure objects directly instead of relying on
        # pyplot's implicit "current figure".
        plot.set_title("Sample: {}, spike-in rate {}%".format(label, spike_in_rate))
        fig = plot.get_figure()
        fig.tight_layout()
        fig.savefig(os.path.join(path_to_13_output, "Sample_{}_spike_in_rate_{}.png".format(label, spike_in_rate)), dpi=300)
        # Free the figure once it is on disk; otherwise every iteration keeps
        # another open figure in memory. NOTE: a closed figure is no longer
        # rendered inline by the notebook, only written to the PNG file.
        plt.close(fig)

# Loyfer results on CLOD samples

In [14]:
# Preview the first rows of `res` (cLOD metadata merged with the per-sample deconvolution table).
res.head()

Unnamed: 0,Sample_x,Actual tumor_fraction_ichorCNA,Simulated TF,spike-in,LABEL,LABCODE,RUNTM,RUNGW,Moss,SampleID,bam_file,reduced_filename,deconvo_results,Sample_y,Liver,Breast,Gastric,Lung,CRC,WBC
0,LODCRC6,0.02443,0.003226,1.0,Colorectal cancer,CRC8,R3596,R3606,,LODCRC6,/mnt/GS_NAS05/ECDGW_BIMASK_repo/R3606/04_bisma...,34-LODCRC6CT460W_M570-M770,outputdir_02102023/10_output_noFDR/top500_Tiss...,34-LODCRC6CT460W_M570-M770.deduplicated.sorted...,0.036978,0.0,0.060015,0.050239,0.0,0.852767
1,BREAST10R1,0.02187,0.02187,100.0,Breast cancer,BREAST10,R3725,R3724,,BREAST10R1,/mnt/GS_NAS05/ECDGW_BIMASK_repo/R3724/04_bisma...,110-BREAST10R1CT504W_S6569-S6769,outputdir_02102023/10_output_noFDR/top500_Tiss...,110-BREAST10R1CT504W_S6569-S6769.deduplicated....,0.280297,0.0,0.060659,0.001476,0.0,0.657567
2,LODCRC3PR1,0.07539,0.056745,15.0,Colorectal cancer,CRC8,R3549,R3549,x,LODCRC3PR1,/mnt/GS_NAS05/ECDGW_BIMASK_repo/R3549/04_bisma...,63-LODCRC3PR1CT446W_M538-M738,outputdir_02102023/10_output_noFDR/top500_Tiss...,63-LODCRC3PR1CT446W_M538-M738.deduplicated.sor...,0.066974,0.0,0.0,0.0,0.12087,0.812156
3,LODBREAST7R2,0.0229,0.000115,0.5,Breast cancer,BREAST7,R3596,R3606,,LODBREAST7R2,/mnt/GS_NAS05/ECDGW_BIMASK_repo/R3606/04_bisma...,48-LODBREAST7R2CT460W_M588-M788,outputdir_02102023/10_output_noFDR/top500_Tiss...,48-LODBREAST7R2CT460W_M588-M788.deduplicated.s...,0.124639,0.0,0.083581,0.015252,0.0,0.776527
4,LODBREAST6,0.02238,0.000696,1.0,Breast cancer,BREAST7,R3596,R3606,,LODBREAST6,/mnt/GS_NAS05/ECDGW_BIMASK_repo/R3606/04_bisma...,41-LODBREAST6CT460W_M579-M779,outputdir_02102023/10_output_noFDR/top500_Tiss...,41-LODBREAST6CT460W_M579-M779.deduplicated.sor...,0.088567,0.0,0.032319,0.0,0.0,0.879114


In [26]:
# Load the table keyed by "CellType" and transpose it so that every row is one
# of the original columns (moved into the "index" column by reset_index())
# and every column is one cell type.
loyferdf = pd.read_csv("/datassd/hieunguyen/ECD/tumor_atlas_official/outputdir_02102023/cLOD_samples/custom_cell.csv")
loyferdf = loyferdf.set_index("CellType").T.reset_index()

# Collapse the fine-grained cell types into the six classes used for the
# GS-atlas table.
loyferdf["Breast"] = loyferdf["Breast-Basal-Ep"] + loyferdf["Breast-Luminal-Ep"]
loyferdf["Lung"] = loyferdf["Lung-Ep-Alveo"] + loyferdf["Lung-Ep-Bron"]
wbc_cell_types = ["Blood-B", "Blood-Granul", "Blood-Mono+Macro", "Blood-NK", "Blood-T", "Megakaryocytes"]
# Vectorised row sum; skipna=False keeps a missing component visible as NaN,
# exactly like the "+" additions above, instead of treating it as zero.
loyferdf["WBC"] = loyferdf[wbc_cell_types].sum(axis=1, skipna=False)
loyferdf["Liver"] = loyferdf["Liver-Hep"]
loyferdf["CRC"] = loyferdf["Colon-Ep"]
loyferdf["Gastric"] = loyferdf["Gastric-Ep"]

# Work on an explicit copy of the selected columns so that the column
# assignment below modifies an independent frame (no SettingWithCopyWarning).
loyferdf = loyferdf[["index", "Liver", "Breast", "Gastric", "Lung", "CRC", "WBC"]].copy()
loyferdf.columns = ["Sample"] + list(loyferdf.columns)[1:]
# Drop the literal ".sorted" from the sample names (plain text, not a regex).
loyferdf["Sample"] = loyferdf["Sample"].str.replace(".sorted", "", regex=False)

In [28]:
# Join the cLOD metadata with the collapsed Loyfer table on "Sample" and
# export the result next to the GS-atlas table.
loyfer_on_clod = clod_metadata.merge(loyferdf, left_on="Sample", right_on="Sample")
loyfer_atlas_csv = os.path.join(path_to_data_rplot, "Figure3_CLOD_Loyfer_atlas.csv")
loyfer_on_clod.to_csv(loyfer_atlas_csv)