In [5]:
import itertools
import os
import subprocess
import sys
import tempfile
from pathlib import Path

import pandas as pd
from biodatatools.utils.common import json_load
from mphelper import ProcessWrapPool

In [3]:
# Make the notebook's parent directory importable before importing `utils`
# (presumably where the project-local module lives — confirm). The membership
# check keeps sys.path free of duplicate entries when this cell is re-run.
parent_dir = str(Path.cwd().parent)
if parent_dir not in sys.path:
	sys.path.append(parent_dir)
import utils

In [7]:
# Base directories (absolute, machine-specific). Each one ends with "/" because
# the paths in this notebook are built by plain f-string concatenation.

# PROcap metainfo JSON files are read from here and processed matrices are written here.
PROJECT_DIR_d = "/home/yc2553/projects/HEA/databases/"
# Location of the merged PINTS peak files used as input for the count matrices.
PROJECT_DIR_d2 = "/home/kl945/EnhancerAtlas/"
# Output root; the normalized bigwig files are written under its rpm/ subdirectory.
PROJECT_DIR_o = "/fs/cbsuhy02/storage/yc2553/yc2553/projects/3.Human_atlas/output/"

# Read count matrix

## Get raw counts

In [19]:
# Peak types processed throughout the notebook
ps = ["divergent", "unidirectional"]
# Sample names: whatever iterating the loaded classifications JSON yields
# (its keys, assuming the file holds a JSON object — confirm)
samples = list(json_load(f"{PROJECT_DIR_d}PROcap/metainfo/classifications.json"))
# Per-sample BED paths; indexed as beds[sample][0] and beds[sample][1] when the binary matrix is built
beds = json_load(f"{PROJECT_DIR_d}PROcap/metainfo/sample_beds.json")
# Per-sample bigwig paths; indexed as bws[sample][0] and bws[sample][1], paired with the "pl"/"mn" output suffixes
bws = json_load(f"{PROJECT_DIR_d}PROcap/metainfo/sample_bws.json")

In [107]:
# Long-running step (takes a few hours): submit one raw count matrix job
# per peak type to a pool sized to the number of peak types.

pwpool = ProcessWrapPool(len(ps))

for p in ps:
	# The merged peak files label the unidirectional set "unidirectional-no-divergent"
	if p == "divergent":
		p2 = p
	else:
		p2 = "unidirectional-no-divergent"
	peak_file = f"{PROJECT_DIR_d2}PROcap/Peaks/PINTS/Merged/20250905_raw_merged-1bp_{p2}_peaks.bed.bgz"
	count_file = f"{PROJECT_DIR_d}PROcap/processed/raw_reads_{p}.txt"
	pwpool.run(
		utils.get_count_matrix,
		args=[peak_file, count_file, samples, bws],
	)

In [111]:
# Progress check: number of entries in the pool's finished-task list
# (one job per entry in `ps` was submitted, so 2 is expected when all are done)
len(pwpool.finished_tasks)

2

In [112]:
# Close the pool after the jobs have finished
# NOTE(review): whether close() waits for running tasks is defined by mphelper — confirm
pwpool.close()

## Data normalization

In [116]:
# Normalize the whole dataset
# Stack the raw count matrices of all peak types (divergent and unidirectional)
# into a single table and save it

dfs = [
	pd.read_table(f"{PROJECT_DIR_d}PROcap/processed/raw_reads_{p}.txt", index_col=0)
	for p in ps
]
concatenated_df = pd.concat(dfs)
outputfile = f"{PROJECT_DIR_d}PROcap/processed/raw_reads_all.txt"
concatenated_df.to_csv(outputfile, sep="\t")

In [29]:
# Generate a sample file: the column names of the combined count matrix,
# one per row, under a single column whose header is the empty string

results = [list(concatenated_df.columns)]
df = pd.DataFrame(results).T.set_axis([""], axis=1)
outputfile = f"{PROJECT_DIR_d}PROcap/processed/norm_samples.txt"
df.to_csv(outputfile, sep="\t", index=False)

In [118]:
# Do normalization for the whole dataset using DESeq2
# See details in "2.Count_normalization.ipynb"

# Binary matrix

In [121]:
def binary(inputfile, samples, outputfile):
	"""Write a 0/1 element-by-sample overlap matrix for one set of elements.

	Every interval in `inputfile` is identified by its first three BED fields
	joined with "_". For each sample, an element is scored 1 when it overlaps
	at least one interval in either of the sample's two BED files
	(`beds[sample][0]` and `beds[sample][1]`), and 0 otherwise.

	Parameters
	----------
	inputfile : str
		Path to the BED file of elements, opened with pybedtools.
	samples : iterable of str
		Sample names; each must be a key of the module-level `beds` mapping.
	outputfile : str
		Path of the tab-separated matrix to write (rows are elements,
		columns are samples).

	Notes
	-----
	Reads the module-level `beds` object rather than taking it as an argument.
	"""
	# Imported locally: the notebook's import cell does not provide pybedtools,
	# so without this the function raises NameError on a fresh kernel.
	import pybedtools

	def element_id(interval):
		# Key used for both the row labels and the overlap lookup
		return "_".join(interval.fields[:3])

	# Materialize once: `samples` is needed for the loop and again for the labels
	samples = list(samples)

	b1 = pybedtools.BedTool(inputfile)
	es = [element_id(i) for i in b1]
	results = []
	for s in samples:
		overlap_es = set()
		for n in range(2):
			b2 = pybedtools.BedTool(beds[s][n])
			# u=True: report each element of b1 once if it has any overlap in b2
			overlap = b1.intersect(b2, u=True)
			overlap_es.update(element_id(i) for i in overlap)
		results.append([1 if e in overlap_es else 0 for e in es])
	# Built sample-by-element, then transposed so elements become rows
	df_binary = pd.DataFrame(results, columns=es, index=samples).T
	df_binary.to_csv(outputfile, sep="\t")

In [122]:
# Submit one binary-matrix job per (peak type, ds entry) combination.
# NOTE(review): `ds` is not defined in any visible cell — presumably a
# two-element list such as distal/proximal; confirm and define it alongside `ps`.
pwpool = ProcessWrapPool(len(ps)*len(ds))

for p, d in itertools.product(ps, ds):
	merged_bed = f"{PROJECT_DIR_d}PROcap/processed/merged_{p}_{d}.bed.bgz"
	binary_txt = f"{PROJECT_DIR_d}PROcap/processed/binary_{p}_{d}.txt"
	pwpool.run(
		binary,
		args=[merged_bed, samples, binary_txt],
	)

In [124]:
# Progress check: number of entries in the pool's finished-task list
# (len(ps)*len(ds) jobs were submitted; the recorded output shows 4)
len(pwpool.finished_tasks)

4

In [125]:
# Close the pool after the binary-matrix jobs have finished
# NOTE(review): whether close() waits for running tasks is defined by mphelper — confirm
pwpool.close()

# RPM normalization

In [27]:
def bw_rpm(bw_ins, bw_outs, chroms):
	"""Restrict bigwig files to the given chromosomes, then normalize them in rpm mode.

	Each input is first run through `biodatatools filter_bigwig_by_chroms`
	into a temporary directory. All filtered files are then passed together to
	one `biodatatools normalize_bigwig -mode rpm` call that writes `bw_outs`.
	The temporary directory is removed when the function returns or raises.

	Parameters
	----------
	bw_ins : list of str
		Input bigwig paths.
	bw_outs : list of str
		Output bigwig paths, one per input, in the same order.
	chroms : list of str
		Chromosome names to keep.

	Raises
	------
	ValueError
		If `bw_ins` and `bw_outs` differ in length.
	subprocess.CalledProcessError
		If a biodatatools command exits with a non-zero status.
	"""
	# A length mismatch would otherwise silently drop inputs (zip truncates)
	# or hand the normalizer fewer inputs than outputs.
	if len(bw_ins) != len(bw_outs):
		raise ValueError(
			f"bw_ins and bw_outs must have the same length "
			f"(got {len(bw_ins)} and {len(bw_outs)})"
		)

	with tempfile.TemporaryDirectory() as tempdir:
		bws_temp = [os.path.join(tempdir, f"filtered_{i}.bw") for i in range(len(bw_outs))]

		# Filter BigWig files by chromosomes.
		# Argument lists (no shell) keep paths with spaces or shell metacharacters
		# intact; check=True turns a failed command into an exception.
		for bw_in, bw_temp in zip(bw_ins, bws_temp):
			subprocess.run(
				["biodatatools", "filter_bigwig_by_chroms",
				 "-i", bw_in,
				 "-o", bw_temp,
				 "-chroms", *chroms],
				check=True,
			)

		# Normalize BigWig files to RPM
		subprocess.run(
			["biodatatools", "normalize_bigwig",
			 "-ibws", *bws_temp,
			 "-obws", *bw_outs,
			 "-mode", "rpm"],
			check=True,
		)

In [20]:
# Suffixes used in the output bigwig names; position i pairs with the i-th
# bigwig of each sample (presumably plus/minus strand — confirm)
ks = ["pl", "mn"]
# chr1 through chr22 plus chrX
chroms = [*(f"chr{n}" for n in range(1, 23)), "chrX"]

In [34]:
# Submit one normalization job per sample; each job receives the sample's two
# input bigwigs and the two matching output paths under rpm/
pwpool = ProcessWrapPool(10)

for s in samples:
	bw_ins = [bws[s][n] for n in range(2)]
	bw_outs = [f"{PROJECT_DIR_o}rpm/{s}_{ks[n]}_rpm.bw" for n in range(2)]
	pwpool.run(bw_rpm, args=[bw_ins, bw_outs, chroms])

In [21]:
# Progress check: number of entries in the pool's finished-task list
# (one job per sample was submitted)
len(pwpool.finished_tasks)

In [38]:
# Close the pool after the normalization jobs have finished
# NOTE(review): whether close() waits for running tasks is defined by mphelper — confirm
pwpool.close()