In [None]:
import pandas as pd
import numpy as np
import pathlib 
import os
import matplotlib.pyplot as plt
import seaborn as sns
import random
from tqdm import tqdm
import warnings
import pandas as pd
warnings.filterwarnings('ignore')

# Run configuration: version tags, output root, and the thresholds used to
# label a read as "hypo" / "hyper" from its `alpha` value.
data_version = "TMD_cov"
output_version = "20240910"

outdir = "/media/hieunguyen/HNSD_mini/outdir"
PROJECT = "TMD450_TCGA_data_analysis"
thres_hypo = 0.3
thres_hyper = 0.6

# None of the paths below depend on the cancer class, so they are built once
# here rather than on every loop iteration.
path_to_main_output = os.path.join(outdir, PROJECT, output_version)
path_to_main_src = pathlib.Path("/media/hieunguyen/HNSD01/src/tmd_features")
path_to_read_data = "/media/hieunguyen/GSHD_HN01/raw_data/reads_from_450_regions"
path_to_save_panel = os.path.join(path_to_main_output, "panel")

# Panel table: keep only rows flagged overlapTCGA == "yes", one row per `cpg`.
# Loaded once; each class gets its own copy inside the loop.
panel_df = pd.read_excel(os.path.join(path_to_save_panel, "TMD450_overlapping_TCGA.xlsx"))
panel_df = panel_df[panel_df['overlapTCGA'] == "yes"]
panel_df = panel_df.drop_duplicates(subset=['cpg'])

# Full sample sheet, read once; the per-class subset is derived in the loop.
all_metadata = pd.read_excel("metadata_cfDNA_lowpdepth_TMD_bam_cov.xlsx")

# input_cancer_class = "Liver"
all_cancer_classes = ["Liver", "Gastric", "Lung", "Breast", "CRC"]
for input_cancer_class in all_cancer_classes:
    print("working on input cancer class {}".format(input_cancer_class))

    # Per-class output directory, keyed by the thresholds in use.
    path_to_07_output = os.path.join(
        path_to_main_output,
        input_cancer_class,
        "thres_hypo_{}_hyper_{}".format(thres_hypo, thres_hyper),
    )
    # os.makedirs instead of shelling out to `mkdir -p`: no shell quoting
    # issues, and a failure raises instead of being silently ignored.
    os.makedirs(path_to_07_output, exist_ok=True)

    # Fresh copy so that any per-class modification cannot leak into the
    # next iteration.
    cpg450df = panel_df.copy()

    # Training samples of the current cancer class plus the "Control" samples.
    metadata = all_metadata[all_metadata["Label"].isin([input_cancer_class, "Control"])]
    metadata = metadata[metadata["Set"] == "train"]

    def assign_read_type(x, thres_hypo, thres_hyper):
        """Label a value as "hypo", "hyper" or "none" using two thresholds.

        Args:
            x: Value to classify (called with a read's `alpha` in this script).
            thres_hypo: Values strictly below this are labelled "hypo".
            thres_hyper: Values strictly above this are labelled "hyper".

        Returns:
            "hypo" if ``x < thres_hypo``; otherwise "hyper" if
            ``x > thres_hyper``; otherwise "none". Values equal to either
            threshold (and NaN, for which both comparisons are False) yield
            "none".
        """
        # The hypo test takes precedence, exactly as in an if/elif chain.
        if x < thres_hypo:
            return "hypo"
        return "hyper" if x > thres_hyper else "none"
    def check_read_inside_region(start, seq, region):
        """Report whether a read lies entirely within a region.

        Args:
            start: Start position of the read.
            seq: Read sequence; its length is added to `start` to get the
                read end.
            region: Region string of the form "<name>:<start>-<end>".

        Returns:
            "in" when ``start >= region start`` and
            ``start + len(seq) <= region end``; "overlap" otherwise.
        """
        read_end = start + len(seq)

        # Take the part after the first ":" and split it once into its bounds.
        bounds = region.split(":")[1].split("-")
        region_start = int(bounds[0])
        region_end = int(bounds[1])

        fully_inside = start >= region_start and read_end <= region_end
        return "in" if fully_inside else "overlap"
            
    # Sample IDs retained in `metadata` for this cancer class. Built once as a
    # set so each file name is checked in O(1) instead of scanning the column.
    selected_sample_ids = set(metadata["SampleID"])

    # Read files whose name (minus the ".sorted.csv" suffix) is a selected sample.
    all_read_files = [
        item
        for item in pathlib.Path(path_to_read_data).glob("*.sorted.csv")
        if item.name.replace(".sorted.csv", "") in selected_sample_ids
    ]

    for file in tqdm(all_read_files):
        tmpdf = pd.read_csv(file, index_col=[0])

        # Flag every read as "in" (fully inside its region) or "overlap".
        # Columns are addressed by name and iterated with zip: positional
        # indexing (x[0], x[1], x[2]) on a label-indexed row is deprecated in
        # pandas, and a plain list also assigns cleanly when the file has no
        # rows (DataFrame.apply(axis=1) does not return a Series in that case).
        tmpdf["read_overlap_rate"] = [
            check_read_inside_region(start, seq, region)
            for start, seq, region in zip(tmpdf["start"], tmpdf["seq"], tmpdf["region"])
        ]

        # Keep only reads fully inside their region; copy so the column added
        # below is written to an independent frame, not a slice of the original.
        tmpdf = tmpdf[tmpdf["read_overlap_rate"] == "in"].copy()

        # Label each remaining read as hypo / hyper / none from its `alpha`.
        tmpdf["read_classification"] = tmpdf["alpha"].apply(
            assign_read_type, args=(thres_hypo, thres_hyper)
        )

        # Per-region count of reads in each class (regions x classes, missing
        # combinations filled with 0).
        # NOTE(review): not used in the visible part of the loop — confirm it
        # is consumed further down, otherwise it can be dropped.
        countdf = tmpdf.groupby(["region", "read_classification"]).size().unstack().fillna(0)

        # Save the filtered, classified reads next to the other per-class outputs.
        tmpdf.to_csv(os.path.join(path_to_07_output, file.name.replace(".sorted.csv", ".read_classification.csv")))
