In [1]:
import pandas as pd
import numpy as np
import pathlib 
import os
import matplotlib.pyplot as plt
import seaborn as sns
import random
from tqdm import tqdm
import warnings
import pandas as pd
from sklearn.metrics import roc_curve, auc
import pickle
def check_read_inside_region(start, seq, region):
    """Tell whether a read lies completely within a genomic region.

    Parameters
    ----------
    start : int
        Start coordinate of the read.
    seq : str
        Read sequence; only its length is used, to derive the read end
        as ``start + len(seq)``.
    region : str
        Region string of the form ``"<name>:<start>-<end>"``.

    Returns
    -------
    str
        ``"in"`` when ``region_start <= start`` and the read end does not
        exceed ``region_end``; ``"overlap"`` in every other case (this
        includes reads that do not touch the region at all).
    """
    read_end = start + len(seq)
    # Everything after the first ":" holds the "<start>-<end>" coordinates.
    bounds = region.split(":")[1].split("-")
    region_start, region_end = int(bounds[0]), int(bounds[1])
    fully_contained = region_start <= start and read_end <= region_end
    return "in" if fully_contained else "overlap"

# Suppress all Python warnings for the rest of the kernel session.
warnings.filterwarnings('ignore')

# --- Run configuration --------------------------------------------------------
# NOTE(review): data_version, thres_hypo, thres_hyper and mode are not referenced
# in the visible part of this notebook; they are kept for any later cells.
data_version = "TMD_cov"
output_version = "20240910"

outdir = "/media/hieunguyen/GSHD_HN01/outdir"
PROJECT = "TMD450_TCGA_data_analysis"
thres_hypo = 0.3
thres_hyper = 0.6

mode = "all"

# --- Input / output locations -------------------------------------------------
# PANCANCER07_output is read by this notebook; PANCANCER08_output is where it writes.
path_to_main_output = os.path.join(outdir, PROJECT, output_version)
path_to_07_output = os.path.join(path_to_main_output, "PANCANCER07_output")
path_to_08_output = os.path.join(path_to_main_output, "PANCANCER08_output")

# Create the output directory (and any missing parents) without going through a
# shell: this is safe for paths containing spaces or shell metacharacters, works
# on every platform, and raises immediately if the directory cannot be created
# instead of failing silently.
os.makedirs(path_to_08_output, exist_ok=True)

# Per-sample files produced by the PANCANCER07 step, discovered by file suffix.
all_files = list(pathlib.Path(path_to_07_output).glob("*.candi_reads.csv"))
all_read_files = list(pathlib.Path(path_to_07_output).glob("*.read_classification.csv"))

##### Read-count table (countdf): raw count and in-read count number per sample.
# The combined table is cached as all_count.csv in the 08 output directory so the
# per-sample files are parsed only once; delete that file to force a rebuild.
path_to_all_count = os.path.join(path_to_08_output, "all_count.csv")
if not os.path.isfile(path_to_all_count):
    all_count_files = list(pathlib.Path(path_to_07_output).glob("*.read_count.csv"))
    count_frames = []
    for file in tqdm(all_count_files):
        tmpdf = pd.read_csv(file, index_col = [0])
        # Strip the classification-file suffix from SampleID (literal match, not a regex).
        tmpdf["SampleID"] = tmpdf["SampleID"].str.replace(".read_classification.csv", "", regex = False)
        count_frames.append(tmpdf)
    # Concatenate once at the end: growing a DataFrame inside the loop re-copies
    # all accumulated rows on every iteration. With no input files an empty
    # frame is written, as before.
    countdf = pd.concat(count_frames, axis = 0) if count_frames else pd.DataFrame()
    countdf.to_csv(path_to_all_count)
else:
    countdf = pd.read_csv(path_to_all_count, index_col = [0])

# Average the `candi_reads` column of every per-sample file. The sample name is
# the part of the file name that precedes the first ".".
all_samples, all_mean_candi_reads = [], []
for file in all_files:
    tmpdf = pd.read_csv(file)
    all_samples.append(file.name.split(".")[0])
    all_mean_candi_reads.append(tmpdf["candi_reads"].mean())

# Join the per-sample means with the read-count table on SampleID (inner join,
# the merge default), then express the mean candidate-read number as a fraction
# of the raw read count and of the in-read count.
candidf = (
    pd.DataFrame({"SampleID": all_samples, "num_candi_reads": all_mean_candi_reads})
    .merge(countdf, on = "SampleID")
    .assign(
        ratio_raw = lambda df: df["num_candi_reads"] / df["raw_count"],
        ratio_in_reads = lambda df: df["num_candi_reads"] / df["in_read_count"],
    )
)

# Persist the table; the next cell reloads it from this workbook.
path_to_candi_xlsx = os.path.join(path_to_08_output, "candi_reads_all_{}_vs_control.xlsx".format("PAN_CANCER"))
candidf.to_excel(path_to_candi_xlsx, index = False)


In [2]:
def _load_pickled_model(path):
    """Unpickle one fitted model from `path`, closing the file afterwards.

    NOTE(security): pickle.load can execute arbitrary code while loading;
    only point this at model files produced by your own pipeline.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Reload the per-sample candidate-read table from the 08 output directory.
candidf = pd.read_excel(os.path.join(path_to_08_output, "candi_reads_all_{}_vs_control.xlsx".format("PAN_CANCER")))

# Spike-in ratio codes found in the sample IDs -> numeric fraction.
# The code is the decimal number written without its dot ('005' -> 0.05).
convert_ratio = {
    '0001': 0.001,
    '001': 0.01,
    '0005': 0.005,
    '005': 0.05,
    '01': 0.1,
    '05': 0.5,
    '08': 0.8
}

# Attach the ichorCNA value of each sample (inner join: samples that are absent
# from the metadata sheet are dropped).
ichorcnadf = pd.read_csv("validation_Truong_Vi/metadata_Spike_in_silico_GW_samples_from_tissue_highdepth_15112024.csv")
candidf = candidf.merge(ichorcnadf[["SampleID", "ichorCNA"]], on = "SampleID")

# Sample IDs look like "<id>_spike_in__<ratio>_from_<source>_label_<label>_rep..."
# (as in the displayed table). The double underscore yields an empty token when
# splitting on "_", so the ratio code is token 4 and the label is token 8.
candidf["Label"] = candidf["SampleID"].apply(lambda x: x.split("_")[8])
candidf["spike_in_ratio"] = candidf["SampleID"].apply(lambda x: convert_ratio[x.split("_")[4]])

# Drop samples without an ichorCNA value once, up front. The explicit .copy()
# makes the later `predicted_TF` assignment write to an independent frame
# rather than to a slice of the merged table.
candidf = candidf[candidf["ichorCNA"].notna()].copy()

input_cancer_class = "PAN_CANCER"

path_to_tf_output = os.path.join(outdir, PROJECT, output_version, "TF_output")
path_to_02_tf_output = os.path.join(path_to_tf_output, "02_output")

for selected_candi_features in ["ratio_raw", "ratio_in_reads"]:
    print(selected_candi_features)
    path_to_save_models = os.path.join(path_to_02_tf_output, "output", input_cancer_class, selected_candi_features, "models")

    # One pickled model per regression type, stored per candidate-read feature.
    lr_grid = _load_pickled_model(os.path.join(path_to_save_models, "LR.pkl"))
    ridge_grid = _load_pickled_model(os.path.join(path_to_save_models, "Ridge.pkl"))
    elasticnet_grid = _load_pickled_model(os.path.join(path_to_save_models, "ElasticNet.pkl"))
    all_models = {"LR": lr_grid, "Ridge": ridge_grid, "ElasticNet": elasticnet_grid}

    # The feature matrix depends only on the selected feature, so build it once
    # per feature instead of once per model. Column order is [ichorCNA, feature].
    input_features = ["ichorCNA", selected_candi_features]
    inputdf = candidf[input_features].copy()

    for model_name, model in all_models.items():
        # `predicted_TF` is overwritten on every iteration; each model/feature
        # pair is saved to its own workbook, and after the loops `candidf` keeps
        # the predictions of the last pair.
        candidf["predicted_TF"] = model.predict(inputdf.to_numpy())
        candidf.to_excel(os.path.join(path_to_08_output, f"Prediction_{selected_candi_features}_model_{model_name}_spike_in_data.xlsx"), index = False)

ratio_raw
ratio_in_reads


In [4]:
# Show the prediction table. `predicted_TF` is overwritten for every model/feature
# pair in the prediction loop, so the values shown come from the last pair run
# (ElasticNet on ratio_in_reads).
# NOTE(review): the execution count jumps from In [2] to In [4], so a cell that is
# no longer in the notebook may have run in between — confirm with Restart & Run All.
candidf

Unnamed: 0,SampleID,num_candi_reads,raw_count,in_read_count,ratio_raw,ratio_in_reads,ichorCNA,Label,spike_in_ratio,predicted_TF
0,KZAZ52_spike_in__005_from_ZNC05S_label_CRC_rep...,3.232628,15440,7262,0.000209,0.000445,0.04019,CRC,0.050,0.051048
1,KACE35_spike_in__005_from_ZNB06SR3_label_Breas...,4.220238,20069,9720,0.000210,0.000434,0.04465,Breast,0.050,0.052930
2,K2AA15_spike_in__0005_from_ZNC03S_label_CRC_re...,6.283626,29715,15085,0.000211,0.000417,0.03432,CRC,0.005,0.048571
3,K0AC37_spike_in__0001_from_ZNL08S_label_Lung_r...,4.672566,19961,10139,0.000234,0.000461,0.04623,Lung,0.001,0.053596
4,LBGS087_spike_in__01_from_ZNL15S_label_Lung_re...,15.511696,70788,36730,0.000219,0.000422,0.05653,Lung,0.100,0.057942
...,...,...,...,...,...,...,...,...,...,...
3961,K4AC16_spike_in__005_from_ZNC04S_label_CRC_rep...,7.691860,35529,17543,0.000216,0.000438,0.02653,CRC,0.050,0.045285
3962,K2071_spike_in__0001_from_ZNC03S_label_CRC_rep...,8.781977,40683,21186,0.000216,0.000415,0.02762,CRC,0.001,0.045744
3963,K4AC10_spike_in__001_from_ZNB03SR2_label_Breas...,15.970760,46916,23659,0.000340,0.000675,0.08085,Breast,0.010,0.068204
3964,KAAK49_spike_in__005_from_ZNL09S_label_Lung_re...,9.073746,37459,19667,0.000242,0.000461,0.01833,Lung,0.050,0.041825
