In [31]:
import pandas as pd
import numpy as np
import pathlib 
import os
import matplotlib.pyplot as plt
import seaborn as sns
import random
from tqdm import tqdm
import warnings
import pandas as pd
from sklearn.metrics import roc_curve, auc
import pickle
def check_read_inside_region(start, seq, region):
        read_end = start + len(seq)
        region_start = int(region.split(":")[1].split("-")[0])
        region_end = int(region.split(":")[1].split("-")[1])
        if start >= region_start and read_end <= region_end:
            return "in"
        else: 
            return "overlap"

warnings.filterwarnings('ignore')

data_version = "TMD_cov"
output_version = "20240910"

outdir = "/media/hieunguyen/GSHD_HN01/outdir"
PROJECT = "TMD450_TCGA_data_analysis"
thres_hypo = 0.3
thres_hyper = 0.6

mode = "all"

path_to_main_output = os.path.join(outdir, PROJECT, output_version)
path_to_07_output = os.path.join(path_to_main_output, "PANCANCER07_output")
path_to_08_output = os.path.join(path_to_main_output, "PANCANCER08_output")
os.system(f"mkdir -p {path_to_08_output}")

all_files = [item for item in pathlib.Path(path_to_07_output).glob("*.candi_reads.csv")]
all_read_files = [item for item in pathlib.Path(path_to_07_output).glob("*.read_classification.csv")]

##### read countdf: raw count and in-read count number. 
if os.path.isfile(os.path.join(path_to_07_output, "all_count.csv")) == False:
    all_samples = []
    raw_counts = []
    in_read_counts = []

    for file in tqdm(all_read_files):
        tmpdf = pd.read_csv(file)
        raw_count = tmpdf.shape[0]
        in_read_count = tmpdf[tmpdf["read_overlap_rate"] == "in"].shape[0]
        all_samples.append(file.name.replace(".read_classification.csv", ""))
        raw_counts.append(raw_count)
        in_read_counts.append(in_read_count)

    countdf = pd.DataFrame({"SampleID": all_samples, "raw_count": raw_counts, "in_read_count": in_read_counts})
    countdf.to_csv(os.path.join(path_to_07_output, "all_count.csv"))
else:
    print(f"Countdf existst, reading in from {os.path.join(path_to_07_output, 'all_count.csv')} ...")
    countdf = pd.read_csv(os.path.join(path_to_07_output, "all_count.csv"), index_col = [0])

all_samples = []
all_mean_candi_reads = []
for file in all_files:
    tmpdf = pd.read_csv(file)
    mean_candi_reads = tmpdf.candi_reads.mean()
    all_samples.append(file.name.split(".")[0])
    all_mean_candi_reads.append(mean_candi_reads)

candidf = pd.DataFrame({"SampleID": all_samples, "num_candi_reads": all_mean_candi_reads})
candidf = candidf.merge(countdf, right_on = "SampleID", left_on = "SampleID")
candidf["ratio_raw"] = candidf["num_candi_reads"] / candidf["raw_count"]
candidf["ratio_in_reads"] = candidf["num_candi_reads"] / candidf["in_read_count"]
candidf.to_excel(os.path.join(path_to_08_output, "candi_reads_all_{}_vs_control.xlsx".format("PAN_CANCER")), index = False)


Countdf existst, reading in from /media/hieunguyen/GSHD_HN01/outdir/TMD450_TCGA_data_analysis/20240910/PANCANCER07_output/all_count.csv ...


In [26]:
candidf = pd.read_excel(os.path.join(path_to_08_output, "candi_reads_all_{}_vs_control.xlsx".format("PAN_CANCER")))
convert_ratio = {
    '0001': 0.001, 
    '001': 0.01, 
    '0005': 0.005, 
    '005': 0.05, 
    '01': 0.1,
    '05': 0.5,
    '08': 0.8
}

ichorcnadf = pd.read_csv("validation_Truong_Vi/metadata_Spike_in_silico_GW_samples_from_tissue_highdepth_15112024.csv")
candidf = candidf.merge(ichorcnadf[["SampleID", "ichorCNA"]], right_on = "SampleID", left_on = "SampleID")
candidf["Label"] = candidf["SampleID"].apply(lambda x: x.split("_")[8])
candidf["spike_in_ratio"] = candidf["SampleID"].apply(lambda x: convert_ratio[x.split("_")[4]])

input_cancer_class = "PAN_CANCER"

path_to_tf_output = os.path.join(outdir, PROJECT, output_version, "TF_output")
path_to_02_tf_output = os.path.join(path_to_tf_output, "02_output")


# selected_candi_features = "ratio_raw"
for selected_candi_features in ["ratio_raw", "ratio_in_reads"]:
    all_models = dict()
    selected_candi_features = "ratio_raw"
    path_to_save_models = os.path.join(path_to_02_tf_output, "output", input_cancer_class, selected_candi_features, "models")

    ridge_grid = pickle.load(open(os.path.join(path_to_save_models, "Ridge.pkl"), "rb"))
    all_models["Ridge"] = ridge_grid

    elasticnet_grid = pickle.load(open(os.path.join(path_to_save_models, "ElasticNet.pkl"), "rb"))
    all_models["ElasticNet"] = elasticnet_grid
    for i in all_models.keys():
        input_features = ["ichorCNA", selected_candi_features]
        candidf = candidf[candidf["ichorCNA"].isna() == False] 
        inputdf = candidf[input_features].copy()
        candidf["predicted_TF"] = all_models[i].predict(inputdf.to_numpy())
        candidf.to_excel(os.path.join(path_to_08_output, f"Prediction_{selected_candi_features}_model_{i}_Validation_data.xlsx"), index = False)

In [27]:
candidf

Unnamed: 0.1,SampleID,num_candi_reads,Unnamed: 0,raw_count,in_read_count,ratio_raw,ratio_in_reads,ichorCNA,Label,spike_in_ratio,predicted_TF
0,KZAZ52_spike_in__005_from_ZNC05S_label_CRC_rep...,3.232628,1787,5267,5267,0.000614,0.000614,0.04019,CRC,0.050,0.051048
1,KACE35_spike_in__005_from_ZNB06SR3_label_Breas...,4.220238,4169,7273,7273,0.000580,0.000580,0.04465,Breast,0.050,0.052930
2,K2AA15_spike_in__0005_from_ZNC03S_label_CRC_re...,6.283626,2454,11638,11638,0.000540,0.000540,0.03432,CRC,0.005,0.048571
3,K0AC37_spike_in__0001_from_ZNL08S_label_Lung_r...,4.672566,371,7788,7788,0.000600,0.000600,0.04623,Lung,0.001,0.053596
4,LBGS087_spike_in__01_from_ZNL15S_label_Lung_re...,15.511696,2432,28805,28805,0.000539,0.000539,0.05653,Lung,0.100,0.057942
...,...,...,...,...,...,...,...,...,...,...,...
3961,K4AC16_spike_in__005_from_ZNC04S_label_CRC_rep...,7.691860,1734,13063,13063,0.000589,0.000589,0.02653,CRC,0.050,0.045285
3962,K2071_spike_in__0001_from_ZNC03S_label_CRC_rep...,8.781977,4845,16617,16617,0.000528,0.000528,0.02762,CRC,0.001,0.045744
3963,K4AC10_spike_in__001_from_ZNB03SR2_label_Breas...,15.970760,4235,18607,18607,0.000858,0.000858,0.08085,Breast,0.010,0.068204
3964,KAAK49_spike_in__005_from_ZNL09S_label_Lung_re...,9.073746,3923,15316,15316,0.000592,0.000592,0.01833,Lung,0.050,0.041825
