In [None]:
import pandas as pd
import numpy as np
import pathlib 
import os
import matplotlib.pyplot as plt
import seaborn as sns
import random
from tqdm import tqdm
import warnings
import pandas as pd
import argparse

from dataset_paths import *

# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings("ignore")

# --- Run configuration -------------------------------------------------------
# Results are read from / written to <outdir>/<PROJECT>/<output_version>/<dataset_name>/.
PROJECT = "combine_ctcandi_ichorcna"
# NOTE(review): absolute, machine-specific path -- consider making it configurable.
outdir = "/media/hieunguyen/HNSD_mini/outdir"
output_version = "20241229"
# Selects ./metadata/<dataset_name>.csv and the dataset-specific handling below.
dataset_name = "LOD"
# Suffix of the step folders that are used: 07_output_<mode> and 08_output_<mode>.
mode = "all"

# Build (or reload from cache) one per-sample feature table for every cancer class.
#
# For each class the 07_output folder is scanned for per-sample
# "*.raw.read_classification.csv" and "*candi_reads.csv" files. Per sample we record
# the number of classified reads, the number of reads flagged "in", the mean of the
# `candi_reads` column and the two ratios derived from them, then attach the
# `ichorCNA` value from the metadata table. The result is cached as
# 08_output/<class>/feature.csv and reloaded from there on later runs.

# The metadata table does not depend on the cancer class, so load it once.
metadata = pd.read_csv(f"./metadata/{dataset_name}.csv")
known_sample_ids = None
if dataset_name == "LOD":
    # Keep the first row of every sample. `.copy()` makes the frame independent so the
    # column assignment below does not write into a slice of the raw table.
    metadata = metadata[~metadata["Sample"].duplicated()].copy()
    metadata["SampleID"] = metadata["Sample"].values
    metadata = metadata.rename(columns={"Actual tumor_fraction_ichorCNA": "ichorCNA"})
    known_sample_ids = set(metadata["SampleID"].unique())
elif dataset_name == "VALIDATION":
    metadata = metadata[["SampleID", "ichorCNA"]]
    known_sample_ids = set(metadata["SampleID"].unique())

for input_cancer_class in ["CRC", "Liver", "Lung", "Breast", "pan_cancer"]:
    path_to_main_output = os.path.join(outdir, PROJECT, output_version, dataset_name)
    path_to_07_output = os.path.join(path_to_main_output, f"07_output_{mode}", input_cancer_class)
    path_to_08_output = os.path.join(path_to_main_output, f"08_output_{mode}", input_cancer_class)
    # Pure-Python replacement for `mkdir -p`: no shell involved, safe with any path.
    os.makedirs(path_to_08_output, exist_ok=True)

    # Sorted so that the processing order (and the row order of feature.csv) is reproducible.
    all_read_classification_files = sorted(pathlib.Path(path_to_07_output).glob("*.raw.read_classification.csv"))
    all_candi_read_files = sorted(pathlib.Path(path_to_07_output).glob("*candi_reads.csv"))

    if dataset_name == "LOD":
        # Candidate-read files whose sample is present in the metadata (file lists are left untouched).
        merge_samples = [item for item in all_candi_read_files
                         if item.name.replace(".candi_reads.csv", "") in known_sample_ids]
    elif dataset_name == "VALIDATION":
        all_candi_read_files = [item for item in all_candi_read_files
                                if item.name.replace(".candi_reads.csv", "") in known_sample_ids]
        all_read_classification_files = [item for item in all_read_classification_files
                                         if item.name.replace(".raw.read_classification.csv", "") in known_sample_ids]
    print(f"Number of candi read files: {len(all_candi_read_files)}")
    print(f"Number of classification files: {len(all_read_classification_files)}")

    feature_file = os.path.join(path_to_08_output, "feature.csv")
    if not os.path.isfile(feature_file):
        # Pair the two file kinds by sample ID instead of by list position: the two globs
        # are independent, so positions are not guaranteed to refer to the same sample.
        classification_by_sample = {
            item.name.replace(".raw.read_classification.csv", ""): item
            for item in all_read_classification_files
        }
        feature_columns = ["SampleID", "raw_count", "in_read_count",
                           "mean_candi_reads", "ratio_raw", "ratio_in_read"]
        feature_records = []
        for candi_file in tqdm(all_candi_read_files):
            sampleid = candi_file.name.replace(".candi_reads.csv", "")
            classification_file = classification_by_sample.get(sampleid)
            if classification_file is None:
                # Reported with print because warnings are silenced globally in this notebook.
                print(f"Skipping {sampleid}: no matching read classification file")
                continue
            tmp_readdf = pd.read_csv(classification_file, index_col = [0])
            tmp_candidf = pd.read_csv(candi_file, index_col = [0])
            raw_count = tmp_readdf.shape[0]
            in_read_count = int((tmp_readdf["read_overlap_rate"] == "in").sum())
            # np.float64 keeps numpy division semantics (inf/nan rather than
            # ZeroDivisionError) when a sample has no reads at all.
            mean_candi_reads = np.float64(tmp_candidf["candi_reads"].mean())
            feature_records.append({"SampleID": sampleid,
                                    "raw_count": raw_count,
                                    "in_read_count": in_read_count,
                                    "mean_candi_reads": mean_candi_reads,
                                    "ratio_raw": mean_candi_reads / raw_count,
                                    "ratio_in_read": mean_candi_reads / in_read_count})

        # Build the frame once; explicit columns keep the merge valid even with zero samples.
        fulldf = pd.DataFrame(feature_records, columns = feature_columns)
        # Inner join: samples without a metadata row are dropped.
        fulldf = fulldf.merge(metadata[["SampleID", "ichorCNA"]], on = "SampleID")
        fulldf.to_csv(feature_file)
    else:
        fulldf = pd.read_csv(feature_file, index_col = [0])


Number of candi read files: 427
Number of classification files: 427


  3%|▎         | 13/427 [00:01<01:02,  6.68it/s]

In [20]:
# Display the metadata table as loaded and adjusted for `dataset_name` by the cell above.
metadata

Unnamed: 0,Sample,Actual tumor_fraction_ichorCNA,Simulated TF,spike-in,LABEL,LABCODE,Detected,Decision,RUNTM,RUNGW,SampleID
0,LODCRC6,0.02443,0.003226,1,Colorectal cancer,CRC8,No,Healthy,R3596,R3606,LODCRC6
1,BREAST10R1,0.02187,0.02187,100,Breast cancer,BREAST10,No,Healthy,R3725,R3724,BREAST10R1
2,LODCRC3PR1,0.07539,0.056745,15,Colorectal cancer,CRC8,No,Healthy,R3549,R3549,LODCRC3PR1
3,LODBREAST7R2,0.02290,0.0001145,0.5,Breast cancer,BREAST7,No,Healthy,R3596,R3606,LODBREAST7R2
4,LODBREAST6,0.02238,0.0006958,1,Breast cancer,BREAST7,No,Healthy,R3596,R3606,LODBREAST6
...,...,...,...,...,...,...,...,...,...,...,...
422,CONTROL44R9,0.03381,Healthy-control,HC,Colorectal cancer,,No,,R3715,R3715,CONTROL44R9
423,LODCONTROL2,0.02951,Healthy-control,HC,Colorectal cancer,,No,,R3683,R3683,LODCONTROL2
424,LODCONTROL3,0.02966,Healthy-control,HC,Colorectal cancer,,No,,R3703,R3703,LODCONTROL3
425,LODCONTROL4R1,0.04060,Healthy-control,HC,Colorectal cancer,,No,,R3715,R3715,LODCONTROL4R1
