In [1]:
import pandas as pd
import numpy as np
import pathlib 
import os
import matplotlib.pyplot as plt
import seaborn as sns
import random
from tqdm import tqdm
import warnings
import pandas as pd
import argparse
warnings.filterwarnings('ignore')

##### run configuration: versions, analysis mode, target cancer class and I/O locations
data_version = "TMD_cov"
output_version = "20240910"
mode = "all"
input_cancer_class = "Liver"
# NOTE(review): machine-specific absolute paths; adjust them when running on another machine
path_to_read_data = "/media/hieunguyen/GSHD_HN01/raw_data/reads_from_450_regions_LOD_samples"
outdir = "/media/hieunguyen/HNSD_mini/outdir"
PROJECT = "TMD450_TCGA_data_analysis"
##### thresholds passed to assign_read_type: value < thres_hypo -> "hypo", value > thres_hyper -> "hyper"
thres_hypo = 0.3
thres_hyper = 0.6

##### suffix appended to the "03_output" / "09_output" folder names for each supported mode
MODE_TO_OUTPUT_SUFFIX = {
    "all": "",
    "hypo_only": "_all_hypo",
    "hyper_only": "_all_hyper",
}


def resolve_output_dirs(main_output, mode, cancer_class, thres_hypo, thres_hyper):
    """Build the 03_output and 09_output directories for one analysis mode.

    Parameters
    ----------
    main_output : str
        Root output directory of the current project/version.
    mode : str
        One of the keys of MODE_TO_OUTPUT_SUFFIX ("all", "hypo_only", "hyper_only").
    cancer_class : str
        Sub-folder name for the cancer class being analysed.
    thres_hypo, thres_hyper : float
        Thresholds; they are embedded in the name of the 09_output sub-folder.

    Returns
    -------
    tuple of (str, str)
        (path to the 03_output folder, path to the 09_output folder).

    Raises
    ------
    ValueError
        If `mode` is not a supported mode. Failing here gives a clear message
        instead of a NameError on the first later use of an undefined path.
    """
    if mode not in MODE_TO_OUTPUT_SUFFIX:
        raise ValueError(
            "Unknown mode {!r}; expected one of {}".format(mode, sorted(MODE_TO_OUTPUT_SUFFIX))
        )
    suffix = MODE_TO_OUTPUT_SUFFIX[mode]
    thres_dirname = "thres_hypo_{}_hyper_{}".format(thres_hypo, thres_hyper)
    dir_03 = os.path.join(main_output, "03_output" + suffix, cancer_class)
    dir_09 = os.path.join(main_output, "09_output" + suffix, cancer_class, thres_dirname)
    return dir_03, dir_09


path_to_main_output = os.path.join(outdir, PROJECT, output_version)
path_to_03_output, path_to_09_output = resolve_output_dirs(
    path_to_main_output, mode, input_cancer_class, thres_hypo, thres_hyper
)

##### folder holding the panel table that is read right after this configuration
path_to_save_panel = os.path.join(path_to_main_output, "panel")

##### load the panel table, keep rows flagged overlapTCGA == "yes", one row per cpg value
panel_table_path = os.path.join(path_to_save_panel, "TMD450_overlapping_TCGA.xlsx")
cpg450df = (
    pd.read_excel(panel_table_path)
    .loc[lambda panel: panel['overlapTCGA'] == "yes"]
    .drop_duplicates(subset=['cpg'])
)

def assign_read_type(x, thres_hypo, thres_hyper):
    """Label a value as "hypo", "hyper" or "none" using two thresholds.

    Returns "hypo" when x is strictly below thres_hypo, otherwise "hyper"
    when x is strictly above thres_hyper, otherwise "none". Values equal to
    either threshold therefore fall into "none". The hypo test is evaluated
    first, so it wins if both conditions could hold.
    """
    return "hypo" if x < thres_hypo else ("hyper" if x > thres_hyper else "none")
def check_read_inside_region(start, seq, region):
    """Tell whether a read lies entirely within a region.

    `region` is a string of the form "<name>:<start>-<end>". The read spans
    from `start` to `start + len(seq)`. Returns "in" when the read start is
    not before the region start and the read end is not past the region end;
    returns "overlap" in every other case.
    """
    read_end = start + len(seq)
    # parse the "<start>-<end>" part once instead of re-splitting per bound
    bounds = region.split(":")[1].split("-")
    region_start, region_end = int(bounds[0]), int(bounds[1])
    return "in" if (start >= region_start and read_end <= region_end) else "overlap"
        
##### collect the per-sample read tables (*.sorted.csv); sorted so that the file order,
##### and therefore all_read_files[0], does not depend on the filesystem's listing order.
##### The name check skips a file whose name is nothing but the ".sorted.csv" suffix.
all_read_files = sorted(
    item for item in pathlib.Path(path_to_read_data).glob("*.sorted.csv")
    if item.name.replace(".sorted.csv", "")
)

##### load the countDMPs table from the 03_output folder
testdf = pd.read_excel(os.path.join(path_to_03_output, "countDMPs.xlsx"))
##### make sure both count columns exist; a missing one is filled with 0
for direction in ("hyper", "hypo"):
    if direction not in testdf.columns:
        testdf[direction] = 0
##### label each row "hyper" when hyper > hypo, otherwise "hypo" (ties go to "hypo").
##### Columns are addressed by label: positional access such as x[0] on a
##### string-labelled row is deprecated in pandas 2.x.
testdf["hypo_or_hyper"] = np.where(testdf["hyper"] > testdf["hypo"], "hyper", "hypo")


In [2]:
##### load the read table of the first file in all_read_files
file = all_read_files[0]
tmpdf = pd.read_csv(file, index_col = [0])

##### flag every read as "in" (entirely inside its region) or "overlap".
##### Columns are zipped by label instead of using positional x[0]/x[1]/x[2] on a
##### label-indexed row, which is deprecated in pandas 2.x.
tmpdf["read_overlap_rate"] = [
    check_read_inside_region(read_start, read_seq, read_region)
    for read_start, read_seq, read_region in zip(tmpdf["start"], tmpdf["seq"], tmpdf["region"])
]
inside_mask = tmpdf["read_overlap_rate"] == "in"
raw_count = tmpdf.shape[0]
in_read_count = int(inside_mask.sum())

##### keep only reads that are completely inside the region
##### (.copy() so the column assignments below do not write into a slice of the full table)
tmpdf = tmpdf[inside_mask].copy()

##### assign read type: hyper or hypo reads based on the given thresholds
tmpdf["read_classification"] = tmpdf["alpha"].apply(lambda x: assign_read_type(x, thres_hypo, thres_hyper))

##### considers only regions that are tested with the TCGA data
##### (region names are rewritten from "chr:start-end" to "chr_start_end" before matching testdf.Var1)
tmpdf["region"] = tmpdf["region"].str.replace(":", "_", regex = False).str.replace("-", "_", regex = False)
tmpdf = tmpdf[tmpdf["region"].isin(testdf.Var1.unique())]

##### count hypo and hyper reads in each region
resdf = (
    tmpdf.groupby(["region", "read_classification"])
    .seq.count()
    .reset_index()
    .pivot_table(index = "region", columns = "read_classification", values = "seq")
    .reset_index()
    .fillna(0)
)

##### get the region type from TCGA test results
##### (one lookup table built once; the first row wins if a region appears more than once in testdf)
region_type_lookup = testdf.drop_duplicates(subset = ["Var1"]).set_index("Var1")["hypo_or_hyper"]
resdf["region_type"] = resdf["region"].map(region_type_lookup)

##### a class that no read fell into has no column after the pivot; add it with 0 counts
for read_class in ("hyper", "hypo"):
    if read_class not in resdf.columns:
        resdf[read_class] = 0

##### assign candi reads equal to number of hypo or hyper reads, depending on the region type
resdf["candi_reads"] = np.where(resdf["region_type"] == "hyper", resdf["hyper"], resdf["hypo"])


In [3]:
# display the per-region read counts, region type and candi_reads computed in the previous cell
resdf

read_classification,region,hyper,hypo,none,region_type,candi_reads
0,chr10_101140065_101140385,0.0,42.0,3.0,hyper,0.0
1,chr10_101284337_101284644,0.0,9.0,6.0,hyper,0.0
2,chr10_107164022_107164546,1.0,28.0,1.0,hyper,1.0
3,chr10_122163615_122163857,0.0,8.0,0.0,hyper,0.0
4,chr10_122164217_122164660,7.0,66.0,8.0,hyper,7.0
...,...,...,...,...,...,...
202,chr9_132587027_132587206,0.0,11.0,0.0,hyper,0.0
203,chr9_14346834_14347128,1.0,31.0,4.0,hyper,1.0
204,chr9_19788616_19788772,0.0,10.0,1.0,hyper,0.0
205,chr9_36986086_36986754,0.0,58.0,3.0,hyper,0.0
