In [1]:
from mphelper import ProcessWrapPool
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import itertools
import biodataplot.metaplot as bpm
from biodata.bigwig import BigWigIReader
from biodata.bed import BEDXReader
from genomictools import GenomicCollection
import pybedtools
from collections import defaultdict

In [2]:
import sys
from pathlib import Path
# Put the parent of the current working directory on the import path so the
# project-local `utils` module imported below can be found
# (presumably utils.py lives one level above this notebook — verify).
sys.path.append(str(Path.cwd().parent))
import utils

In [3]:
import matplotlib
# Register the Arial font files from ../font (relative to the working directory)
# with matplotlib and make Arial the default family.
font_dir = Path.cwd().parent / "font"
font_registry = matplotlib.font_manager.fontManager
for font in ("Arial.ttf", "Arial_Bold.ttf"):
    font_registry.addfont(font_dir / font)
matplotlib.rcParams["font.family"] = "Arial"

# Font sizes shared by all figures: big (titles / axis labels) and small (tick labels).
bfontsize, sfontsize = 12, 9

In [4]:
# NOTE(review): both roots are absolute, machine-specific paths; adjust them before running elsewhere.
# Input root: bigWig tracks are read from Analysis/ and BED files from bed_plot/ under this directory.
PROJECT_DIR_d = "/fs/cbsuhy02/storage/yc2553/yc2553/projects/TRE_directionality/PROcap/"
# Output root: labels/ is used for the sort-order JSON files and supp_figures/ for the saved figures.
PROJECT_DIR_o = "/fs/cbsuhy02/storage/yc2553/yc2553/projects/TRE_directionality/output/"

# Generate feature matrix

## Get individual matrix

In [5]:
# Prefix shared by all sample IDs and by the full-depth bigWig file names
cell = "C1"
# Element classes; each one has its own BED file ({sample}_{p}_{d}.bed) and its own figure
ds = ["distal", "proximal"]
# Element type; part of the BED file names
p = "unidirectional"
# Strand labels used to split the regions read from each BED file
ks = ["+", "-"]
# Track sets that are compared; they become the columns of the heatmap grid
ms = ["downsampled", "original"]

In [6]:
# One sample ID per downsampling level, e.g. "C1_30M"; the list order is the row order of the heatmap grid
# (presumably "M" stands for million reads — confirm).
samples = [f"{cell}_{n}M" for n in [30, 20, 10, 5, 1]]

In [7]:
# Only unidirectional elements called at each sequencing depth are considered.
# Goal: check whether they show transcription on the other side at deeper sequencing depths.
# "*R2_coverage_pl.bw" & "*R2_coverage_mn.bw": positive values

# The full-depth ("original") tracks do not depend on the sample, so name them once.
original_pl = f"{PROJECT_DIR_d}Analysis/{cell}_dedup_chr1-22-X_R2_coverage_pl.bw"
original_mn = f"{PROJECT_DIR_d}Analysis/{cell}_dedup_chr1-22-X_R2_coverage_mn.bw"

# bws[sample][(track set, strand track)] -> bigWig path
bws = {}
for s in samples:
	tracks = {}
	tracks[("downsampled", "PROcap_pl")] = f"{PROJECT_DIR_d}Analysis/{s}_R2_coverage_pl.bw"
	tracks[("downsampled", "PROcap_mn")] = f"{PROJECT_DIR_d}Analysis/{s}_R2_coverage_mn.bw"
	tracks[("original", "PROcap_pl")] = original_pl
	tracks[("original", "PROcap_mn")] = original_mn
	bws[s] = tracks

In [8]:
# df_features[(sample, class)][(track key, strand)] -> signed signal matrix,
# one row per region (indexed "chrom_start_end"), built with fixed_size=1001.
df_features = defaultdict(dict)
for s, d in itertools.product(samples, ds):
	bed = f"{PROJECT_DIR_d}bed_plot/{s}_{p}_{d}.bed"
	all_regions = BEDXReader.read_all(GenomicCollection, bed, strandfield="f7")
	for k in ks:
		# Regions annotated on strand k
		regions = [r for r in all_regions if r.stranded_genomic_pos.strand == k]
		index = [f"{r.chrom}_{r.chromStart}_{r.chromEnd}" for r in regions]
		for m, bw_path in bws[s].items():
			# The track matching the region's strand stays positive; the other track is negated.
			same_strand = (m[1], k) in (("PROcap_pl", "+"), ("PROcap_mn", "-"))
			sign = 1 if same_strand else -1
			profile = bpm.generate_signal_profile(regions, BigWigIReader(bw_path), fixed_size=1001, use_strand=True)
			df_features[(s,d)][(m,k)] = sign*pd.DataFrame(profile, index=index)

## Combine dataframes for plotting

In [9]:
# Track sets to combine and plot.
# NOTE(review): this repeats the identical `ms` definition from the settings cell; one of the two could be dropped.
ms = ["downsampled", "original"]

In [10]:
# df_heatmaps[class][(sample, track set)] -> one matrix per heatmap panel.
# For each strand the two signed track matrices are added, then the "+" rows
# are stacked on top of the "-" rows.
df_heatmaps = defaultdict(dict)
for (s, d), features in df_features.items():
	for m in ms:
		dfs = [
			features[((m, "PROcap_pl"), strand)] + features[((m, "PROcap_mn"), strand)]
			for strand in ("+", "-")
		]
		df_heatmaps[d][(s,m)] = pd.concat(dfs)

# Generate heatmaps

## Settings

### cmap

In [11]:
# Use the same data range as Fig.1d & SuppFig.1c.
# The ranges are asymmetric, so each colour is pinned to the normalised position
# of its anchor value (blue = minimum, white = 0, red = maximum).

colors = ["#2c7bb6", "#ffffff", "#d7191c"]
cmaps = {}
for d in ds:
	cvals = [-40, 0, 80] if d == "distal" else [-250, 0, 500]
	norm = plt.Normalize(min(cvals), max(cvals))
	tuples = [(norm(value), color) for value, color in zip(cvals, colors)]
	cmaps[d] = matplotlib.colors.LinearSegmentedColormap.from_list("", tuples)

### Sorting order

In [12]:
# One sort-order file per sample, computed from that sample's BED files for
# every element class; the heatmap panels read this file back as `sort_file`.
for s in samples:
	outputfile = f"{PROJECT_DIR_o}labels/{s}_distance_to_center.json"
	inputfiles = [f"{PROJECT_DIR_d}bed_plot/{s}_{p}_{d}.bed" for d in ds]
	utils.get_distance_to_center(inputfiles, outputfile)

## Generate heatmaps

In [13]:
def generate_heatmap(d, samples, ms, df_heatmaps, cbar_ticks, cmap, outputfile, test=False, xlabel="Distance (kb)"):
	"""
	Draw a grid of heatmaps (one row per sample, one column per track set) and save it.

	Besides its arguments, the function reads the notebook-level names
	PROJECT_DIR_d, PROJECT_DIR_o, p, bfontsize and sfontsize.

	Parameters
	----------
	d : str
		Key selecting ``df_heatmaps[d]``; also part of the BED file name used
		for the per-row element count.
	samples : sequence of str
		Sample IDs, one grid row each, top to bottom.
	ms : sequence of str
		Track-set names, one grid column each, left to right; the capitalized
		name is the column title.
	df_heatmaps : mapping
		``df_heatmaps[d][(s, m)]`` is the matrix drawn in the panel for
		sample ``s`` and track set ``m``.
	cbar_ticks : list
		Colorbar tick values; also forwarded to utils.generate_feature_heatmap.
	cmap
		Colormap forwarded to utils.generate_feature_heatmap.
	outputfile : str
		Path of the image file to write.
	test : bool
		Forwarded unchanged to utils.generate_feature_heatmap.
	xlabel : str
		x-axis label shown on the bottom row only.
	"""
	# squeeze=False keeps `axes` two-dimensional even with a single sample or a
	# single track set, so axes[row, col] is always valid.
	fig, axes = plt.subplots(len(samples), len(ms), figsize=(4.5, 5.5), squeeze=False)
	labelpad = 2
	y_align = -0.1
	wspace = 0.25
	hspace = 0.2

	# Identical for every panel, so defined once.
	xtick_list = [0, 500, 1000]
	xticklabel_list = ["-0.5", "0", "0.5"]
	last_row = len(samples) - 1
	last_col = len(ms) - 1

	# enumerate gives the true grid position even if a name occurs twice.
	for row, s in enumerate(samples):
		sort_file = f"{PROJECT_DIR_o}labels/{s}_distance_to_center.json"
		for col, m in enumerate(ms):
			ax = axes[row, col]

			# A single colorbar, attached to the top-right panel.
			cbar = row == 0 and col == last_col
			cbar_ax = fig.add_axes([0.95, 0.77, 0.03, 0.1]) if cbar else None
			cbar_kws = {"ticks": cbar_ticks}

			utils.generate_feature_heatmap(df_heatmaps[d][(s,m)], cbar_ticks, cmap, cbar, cbar_ax, cbar_kws, ax, sort_file, test)

			if cbar:
				cbar_ax.set_yticklabels(cbar_ticks)
				cbar_ax.tick_params(axis="y", labelsize=sfontsize, pad=labelpad)

			if row == 0:
				ax.set_title(m.capitalize(), fontsize=bfontsize, pad=labelpad+5, fontweight="bold")

			if col == 0:
				# Row label: last "_"-separated part of the sample ID plus the number of BED records.
				b = pybedtools.BedTool(f"{PROJECT_DIR_d}bed_plot/{s}_{p}_{d}.bed")
				ylabel = f"{s.split('_')[-1]}\n(n={len(b)})"
				ax.set_ylabel(ylabel, fontsize=bfontsize, fontweight="bold")
				ax.get_yaxis().set_label_coords(y_align, 0.5)
			else:
				ax.set_ylabel("")

			ax.set_xlim([0, 1000])
			ax.set_xticks(xtick_list)
			if row == last_row:
				ax.set_xticklabels(xticklabel_list)
				ax.set_xlabel(xlabel, fontsize=bfontsize, fontweight="bold")
			else:
				ax.set_xticklabels([])
				ax.set_xlabel("")

			ax.tick_params(labelsize=sfontsize, pad=labelpad)

	fig.subplots_adjust(wspace=wspace, hspace=hspace)
	# Save this figure explicitly rather than whatever pyplot considers current.
	fig.savefig(outputfile, bbox_inches="tight", dpi=300)

In [14]:
# One worker per element class; each task renders and saves one figure.
pwpool = ProcessWrapPool(len(ds))

# Forwarded to generate_heatmap; switch to True for a test run.
# test = True
test = False
for d in ds:
	is_distal = d == "distal"
	# Tick values cover the same range as the colormap built for this class.
	cbar_ticks = [-40, 0, 40, 80] if is_distal else [-250, 0, 250, 500]
	outputfile = f"{PROJECT_DIR_o}supp_figures/SuppFig3a.png" if is_distal else f"{PROJECT_DIR_o}supp_figures/SuppFig3c.png"
	pwpool.run(generate_heatmap, args=[d, samples, ms, df_heatmaps, cbar_ticks, cmaps[d], outputfile, test])

In [17]:
# Progress check: number of tasks the pool reports as finished; one task was submitted per entry in `ds`.
len(pwpool.finished_tasks)

2

In [18]:
# Release the pool once the figures are done.
# NOTE(review): confirm whether close() waits for tasks that are still running.
pwpool.close()