In [1]:
import pandas as pd
import numpy as np
import pathlib 
import os
import matplotlib.pyplot as plt
import seaborn as sns
import random
from tqdm import tqdm
import warnings
import pandas as pd
import argparse

from dataset_paths import *

warnings.filterwarnings('ignore')

output_version = "20241229"
outdir = "/media/hieunguyen/HNSD_mini/outdir"

mode = "all"
PROJECT = "combine_ctcandi_ichorcna"

for dataset_name in ["SPIKE_IN_20250122"]:

    for input_cancer_class in ["CRC", "Liver", "Lung", "Breast", "pan_cancer"]:
        metadata = pd.read_csv(f"./metadata/{dataset_name}.csv")
        path_to_main_output = os.path.join(outdir, PROJECT, output_version, dataset_name)
        path_to_07_output = os.path.join(path_to_main_output, f"07_output_{mode}", input_cancer_class)
        path_to_08_output = os.path.join(path_to_main_output, f"08_output_{mode}", input_cancer_class)
        os.system(f"mkdir -p {path_to_08_output}")
        all_read_classification_files = [item for item in pathlib.Path(path_to_07_output).glob("*.raw.read_classification.csv")]
        all_candi_read_files = [item for item in pathlib.Path(path_to_07_output).glob("*candi_reads.csv")]

        if dataset_name == "LOD":
            convert_ratio = {
            '50' : 0.5, 
            '100': 1, 
            '0.5': 0.005, 
            '25': 0.25, 
            '15': 0.15, 
            '5': 0.05, 
            '1': 0.01, 
            'HC': 0
            }
            metadata = metadata[metadata["Sample"].duplicated() == False]
            metadata["SampleID"] = metadata["Sample"].values
            metadata.columns = ["ichorCNA" if item == "Actual tumor_fraction_ichorCNA" else item for item in metadata.columns]
            metadata["spike_in_ratio"] = metadata["spike-in"].apply(lambda x: convert_ratio[x])
            metadata['Label'] = metadata["LABEL"].apply(lambda x: "CRC" if x == "Colorectal cancer" else x.split(" ")[0])
            metadata["spike_in_label"] = metadata["Simulated TF"].apply(lambda x: "Control" if x == "Healthy-control" else input_cancer_class)
            metadata["Label"] = metadata[["Label", "spike_in_label"]].apply(lambda x: x[0] if x[1] != "Control" else "Control", axis = 1)
            metadata = metadata[["SampleID", "ichorCNA", "spike_in_ratio", "Label"]]
        elif dataset_name == "VALIDATION":
            metadata = metadata[["SampleID", "ichorCNA"]]
            all_candi_read_files = [item for item in all_candi_read_files if item.name.replace(".candi_reads.csv", "") in metadata["SampleID"].unique()]
            all_read_classification_files = [item for item in all_read_classification_files if item.name.replace(".raw.read_classification.csv", "") in metadata["SampleID"].unique()]
        elif dataset_name in ["SPIKE_IN", "SPIKE_IN_20250122"]:
            metadata = metadata[["SampleID", "Spike_in_label", "Spike_in_ratio", "ichorCNA"]]
            metadata["Spike_in_ratio"] = metadata["Spike_in_ratio"].apply(lambda x: x/100)
            metadata.columns = ["SampleID", "Label", "spike_in_ratio", "ichorCNA"]
            metadata = metadata[["SampleID", "ichorCNA", "spike_in_ratio", "Label"]]
        elif dataset_name in ["REPORT4", "CONTROL"]:
            metadata = metadata[["SampleID", "ichorCNA", "Label"]]
            metadata = metadata[metadata["ichorCNA"].isna() == False]

        print(f"Number of candi read files: {len(all_candi_read_files)}")
        print(f"Number of classification files: {len(all_read_classification_files)}")

        if os.path.isfile(os.path.join(path_to_08_output, "feature.csv")) == False:
            fulldf = pd.DataFrame()
            for i in tqdm(range(len(all_candi_read_files))):
                tmp_readdf = pd.read_csv(all_read_classification_files[i], index_col = [0])
                tmp_candidf = pd.read_csv(all_candi_read_files[i], index_col = [0])
                sampleid = all_candi_read_files[i].name.replace(".candi_reads.csv", "")
                raw_count = tmp_readdf.shape[0]
                in_read_count = tmp_readdf[tmp_readdf["read_overlap_rate"] == "in"].shape[0]
                mean_candi_reads = tmp_candidf.candi_reads.mean()
                ratio_raw = mean_candi_reads/raw_count
                ratio_in_read = mean_candi_reads/in_read_count
                tmpdf = pd.DataFrame({"SampleID": sampleid, 
                                    "raw_count": raw_count, 
                                    "in_read_count": in_read_count, 
                                    "mean_candi_reads": mean_candi_reads,
                                    "ratio_raw": ratio_raw,
                                    "ratio_in_read": ratio_in_read}, index = [0])

                fulldf = pd.concat([fulldf, tmpdf], axis = 0)
            fulldf = fulldf.merge(metadata, right_on = "SampleID", left_on = "SampleID")
            fulldf.to_csv(os.path.join(path_to_08_output, "feature.csv"))
        else:
            print(f"Data exists!")
            fulldf = pd.read_csv(os.path.join(path_to_08_output, "feature.csv"), index_col = [0])


KeyError: "['Spike_in_label', 'Spike_in_ratio'] not in index"

In [2]:
metadata

Unnamed: 0.1,Unnamed: 0,SampleID,Label,spike_in_ratio,ichorCNA
0,0,K0805_spike_in__001_from_ZMH050S_label_Liver_r...,Liver,1.0,0.02463
1,1,K0805_spike_in__0005_from_ZMH051S_label_Liver_...,Liver,0.5,0.02400
2,2,K0805_spike_in__001_from_pool_Liver_label_Live...,Liver,1.0,0.02414
3,3,K0805_spike_in__01_from_pool_Liver_label_Liver...,Liver,10.0,0.04376
4,4,K0805_spike_in__001_from_ZMH050S_label_Liver_r...,Liver,1.0,0.02447
...,...,...,...,...,...
8276,8276,K4AG50_spike_in__08_from_pool_Lung_label_Lung_...,Lung,80.0,0.16130
8277,8277,KAAH78_spike_in__08_from_pool_Lung_label_Lung_...,Lung,80.0,0.16580
8278,8278,KZAC21_spike_in__08_from_pool_Lung_label_Lung_...,Lung,80.0,0.17010
8279,8279,K0AO34_spike_in__08_from_pool_Lung_label_Lung_...,Lung,80.0,0.17110
