In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from mphelper import ProcessWrapPool
import numpy as np
import itertools
import biodataplot.metaplot as bpm
from biodata.bigwig import BigWigIReader
from biodata.bed import BEDXReader
from genomictools import GenomicCollection
import json

In [2]:
import sys
from pathlib import Path
sys.path.append(str(Path.cwd().parent))
import utils

In [3]:
import matplotlib
# Register the Arial faces shipped in ../font and make Arial the default family
font_dir = Path.cwd().parent / "font"
font_files = ("Arial.ttf", "Arial_Bold.ttf")
for font in font_files:
    matplotlib.font_manager.fontManager.addfont(font_dir / font)
matplotlib.rcParams["font.family"] = "Arial"

# Font sizes used by the plotting code below: bold labels/titles vs. ticks/legends
bfontsize, sfontsize = 12, 9

In [4]:
# All inputs and outputs live under one project root; only the last path component differs
PROJECT_ROOT = "/fs/cbsuhy02/storage/yc2553/yc2553/projects/TRE_directionality/"
PROJECT_DIR_d = PROJECT_ROOT + "PROcap/"      # PRO-cap data (bigWig, BED)
PROJECT_DIR_r = PROJECT_ROOT + "resources/"   # external resources (ENCODE tracks)
PROJECT_DIR_o = PROJECT_ROOT + "output/"      # figures and label files written by this notebook

# Generate feature matrix

## Get individual matrix

In [5]:
s = "C1"                                # sample identifier used in file names
ks = ["+", "-"]                         # strands
ps = ["divergent", "unidirectional"]    # element types
ds = ["distal", "proximal"]             # distance classes
cs = ["", "_control"]                   # suffixes: element set vs. its control set

# gs2: one label per (element type, distance class)
gs2 = ["_".join(pair) for pair in itertools.product(ps, ds)]
# gs: every gs2 label, each immediately followed by its "_control" counterpart
gs = [f"{label}{suffix}" for label in gs2 for suffix in cs]

In [6]:
# Signal tracks, keyed by the feature name used throughout the notebook.
# PRO-cap: for visualization, use full-length R2 reads, not just the 5' end of reads.
# Both "*R2_coverage_pl.bw" and "*R2_coverage_mn.bw" hold positive values.

bws = {
	f"PROcap_{strand}": f"{PROJECT_DIR_d}Analysis/{s}_dedup_chr1-22-X_R2_coverage_{strand}.bw"
	for strand in ("pl", "mn")
}
bws["DNase"] = f"{PROJECT_DIR_r}ENCODE/ENCFF414OGC.bigWig"
bws["H3K27ac"] = f"{PROJECT_DIR_r}ENCODE/ENCFF849TDM.bigWig"

In [7]:
# For better visualization and easier interpretation, we flipped the side with more reads to the right

# df_features maps (feature, group, strand) for stranded PRO-cap profiles, and
# (feature, group) for everything else, to a regions x positions DataFrame.
df_features = {}
for g in gs:
	bed = f"{PROJECT_DIR_d}bed_plot/{s}_{g}.bed"
	# Decide control vs. non-control from the group label, not from the full
	# path: a directory name containing "control" must not change this branch.
	is_control = "control" in g
	if is_control:
		all_regions = BEDXReader.read_all(GenomicCollection, bed)
	else:
		# Non-control BED files are read with a strand taken from field "f7"
		all_regions = BEDXReader.read_all(GenomicCollection, bed, strandfield="f7")
	# Only regions read with a strand can be oriented when profiling
	use_strand = not is_control

	for m in bws:
		if "PROcap" in m and not is_control:
			# PRO-cap on real elements: one profile per region strand
			for k in ks:
				regions = [r for r in all_regions if r.stranded_genomic_pos.strand == k]
				index = [f"{r.chrom}_{r.chromStart}_{r.chromEnd}" for r in regions]
				# Change the sign: maxTSS - positive values; minTSS - negative values
				# Note "*R2_coverage_mn.bw" contains positive values
				same_strand = (m == "PROcap_pl" and k == "+") or (m == "PROcap_mn" and k == "-")
				sign = 1 if same_strand else -1
				df_features[(m,g,k)] = sign*pd.DataFrame(bpm.generate_signal_profile(regions, BigWigIReader(bws[m]), fixed_size=1001, use_strand=use_strand), index=index)
		else:
			# Unstranded tracks, and every track on control regions
			index = [f"{r.chrom}_{r.chromStart}_{r.chromEnd}" for r in all_regions]
			df_features[(m,g)] = pd.DataFrame(bpm.generate_signal_profile(all_regions, BigWigIReader(bws[m]), fixed_size=1001, use_strand=use_strand), index=index)

In [8]:
# Long-format copy of every feature matrix: one row per (region, position),
# with the region id in "index", the column label in "Position" and the value in "Feature"
df_reformat = {}
for key, wide in df_features.items():
	with_ids = wide.reset_index()
	position_cols = list(with_ids.columns[1:])  # everything after the "index" column
	df_reformat[key] = with_ids.melt(
		id_vars="index",
		value_vars=position_cols,
		var_name="Position",
		value_name="Feature",
	)

## Combine dataframes for plotting

In [9]:
# Assays to plot; "PRO-cap" is assembled from the PROcap_pl / PROcap_mn tracks,
# the other names match keys of bws directly
ms = ["PRO-cap", "DNase", "H3K27ac"]

In [10]:
# Heatmap input: one matrix per (assay, group)
df_heatmaps = {}
for m, g in itertools.product(ms, gs2):
	if m != "PRO-cap":
		df_heatmaps[(m,g)] = df_features[(m,g)]
		continue
	# The two PRO-cap tracks were stored with opposite signs, so their sum is a
	# single signed profile per region; "+" regions come first, then "-" regions
	per_strand = [
		df_features[("PROcap_pl", g, strand)] + df_features[("PROcap_mn", g, strand)]
		for strand in ("+", "-")
	]
	df_heatmaps[(m,g)] = pd.concat(per_strand)

In [11]:
# Metaplot input: long-format table per (assay, group) with a "Label" column
# naming the curve each row belongs to
df_metaplots = {}
for m, g in itertools.product(ms, gs2):
	if m == "PRO-cap":
		# show maximum and minimum TSS separately
		sources = {
			"maxTSS": [("PROcap_pl", g, "+"), ("PROcap_mn", g, "-")],
			"minTSS": [("PROcap_pl", g, "-"), ("PROcap_mn", g, "+")],
		}
		parts = [
			pd.concat([df_reformat[key] for key in keys]).assign(Label=label)
			for label, keys in sources.items()
		]
	else:
		# elements vs. their matched control regions
		parts = [
			df_reformat[(m,g)].assign(Label="TRE"),
			df_reformat[(m,f"{g}_control")].assign(Label="Ctrl"),
		]
	df_metaplots[(m,g)] = pd.concat(parts).reset_index(drop=True)

# Metaplots & heatmaps

## Settings

### ylims

In [12]:
# For each (distance class, assay), report the largest and smallest per-position
# mean over all curves, to guide the choice of y-limits and colour scales
for d, m in itertools.product(ds, ms):
	labels = ["maxTSS", "minTSS"] if m == "PRO-cap" else ["TRE", "Ctrl"]
	maxs, mins = [], []
	for p in ps:
		df = df_metaplots[(m,f"{p}_{d}")]
		for label in labels:
			mean_profile = df.loc[df["Label"]==label].groupby("Position")["Feature"].mean()
			maxs.append(mean_profile.max())
			mins.append(mean_profile.min())
	print(d, m, max(maxs), min(mins))

distal PRO-cap 65.80727186311788 -15.354919201520913
distal DNase 3.3319399758897625 0.04579757371671772
distal H3K27ac 16.20998938280395 0.6118987135700138
proximal PRO-cap 473.4977841790383 -107.22368712608021
proximal DNase 5.4795589627846235 0.04331937279020853
proximal H3K27ac 21.494214860070723 0.5600905455763172


### cmap

In [13]:
# PRO-cap
# Blue-white-red colormap per distance class, with white pinned to 0 even
# though the negative and positive ranges differ in size

customized_cmaps = {}
colors = ["#2c7bb6", "#ffffff", "#d7191c"]
for d in ds:
	cvals = [-40, 0, 80] if d == "distal" else [-250, 0, 500]
	norm = plt.Normalize(min(cvals), max(cvals))
	# Position of each anchor value on the normalized colour axis, paired with its colour
	anchors = [(norm(value), color) for value, color in zip(cvals, colors)]
	customized_cmaps[d] = matplotlib.colors.LinearSegmentedColormap.from_list("", anchors)

In [14]:
# Per distance class: the custom colormap for the first assay (PRO-cap), viridis for the rest
cmaps = {
	d: [customized_cmaps[d]] + ["viridis"]*(len(ms)-1)
	for d in ds
}

### Sorting order

In [15]:
# Compute, for each element, the distance from its prominent TSS to the element center.
# Heatmap rows are later sorted by this distance.

bed_dir = f"{PROJECT_DIR_d}bed_plot/"
inputfiles = [f"{bed_dir}{s}_{g}.bed" for g in gs2]
outputfile = f"{PROJECT_DIR_o}labels/{s}_distance_to_center.json"
utils.get_distance_to_center(inputfiles, outputfile)

## Generate metaplots & heatmaps

In [16]:
def generate_metaplot_and_heatmap(d, ps, ms, df_metaplots, df_heatmaps, sort_file, ylims, yticks, cmaps, outputfile, test, figsize, xlabel="Distance (kb)"):
	"""Draw a metaplot above a heatmap for every (assay, element type) pair and save the figure.

	The grid has two rows per entry of ``ms`` (metaplot, then heatmap) and one
	column per entry of ``ps``, followed by a narrow extra column that holds the
	heatmap colorbars and is hidden on metaplot rows.

	Parameters
	----------
	d : str
		Distance class; selects the ``f"{p}_{d}"`` entries of the data dicts and,
		when equal to "proximal", a different y-label offset.
	ps : list of str
		Element types, one column each. Column 0 uses the divergent palette,
		any other column the unidirectional one.
	ms : list of str
		Assay names, one metaplot/heatmap row pair each. "PRO-cap" is drawn
		with the minTSS/maxTSS hue order, any other assay with Ctrl/TRE.
	df_metaplots, df_heatmaps : dict
		Data keyed by ``(m, f"{p}_{d}")``, handed to the ``utils`` plotting helpers.
	sort_file : str
		Forwarded to ``utils.generate_feature_heatmap``.
	ylims, yticks : list
		Per-assay y-limits and tick positions, indexed like ``ms``; the ticks
		are reused for the heatmap colorbar.
	cmaps : list
		Per-assay heatmap colormaps, indexed like ``ms``.
	outputfile : str
		Path the figure is saved to (300 dpi, tight bounding box).
	test
		Forwarded unchanged to both ``utils`` plotting helpers.
	figsize : tuple
		Figure size passed to ``plt.subplots``.
	xlabel : str
		x-axis label, shown on the bottom row only.
	"""
	n_cols = len(ps)
	# The colorbar column is the one after the last data column. Derive it from
	# ps instead of relying on an unrelated global list or a hard-coded index.
	cbar_col = n_cols
	last_data_col = n_cols - 1
	last_row = len(ms)*2 - 1

	width_ratios = [12]*n_cols+[1]
	height_ratios = [10,7]*len(ms)
	fig, axes = plt.subplots(len(ms)*2, n_cols+1, figsize=figsize, gridspec_kw={'width_ratios': width_ratios, 'height_ratios': height_ratios})
	labelpad = 2
	wspace = 0.25
	hspace = 0.2

	# x positions 0..1000 are labelled -0.5 .. 0.5
	xtick_list = [0,500,1000]
	xticklabel_list = ["-0.5", "0", "0.5"]

	for m_idx, m in enumerate(ms):
		# Metaplot rows have no colorbar: hide the trailing subplot
		axes[m_idx*2, cbar_col].set_visible(False)

		for col, p in enumerate(ps):
			for n in range(2):
				row = m_idx * 2 + n
				ax = axes[row, col]

				# Metaplot
				if n == 0:
					if m == "PRO-cap":
						hue_order = ["minTSS", "maxTSS"]
						palette = ["#2c7bb6", "#d7191c"]
					else:
						hue_order = ["Ctrl", "TRE"]
						# Divergent
						if col == 0:
							palette = ["#969696", "#313695"]
						# Unidirectional
						else:
							palette = ["#969696", "#de77ae"]

					if row == 0:
						ax.set_title(p.capitalize(), fontsize=bfontsize, fontweight="bold", pad=10)
						# Zero reference line on the top row only
						ax.axhline(y=0, ls="--", c="#969696")

					utils.generate_feature_metaplot(df_metaplots[(m,f"{p}_{d}")], palette, hue_order, ax, test)

					ax.set_ylim(ylims[m_idx])
					ax.set_yticks(yticks[m_idx])
					if col == 0:
						ax.set_ylabel(m, fontsize=bfontsize, fontweight="bold")
						# Horizontal offset of the y-label, tuned per distance class / assay
						if d == "proximal":
							if m == "PRO-cap":
								y_align = -0.3
							else:
								y_align = -0.2
						else:
							y_align = -0.25
						ax.get_yaxis().set_label_coords(y_align, 0.5)
					else:
						ax.set_ylabel("")
						ax.set_yticklabels([])

					# Show one legend for the first assay (second column) and the
					# legends of the second assay; suppress all others
					if (row == 0 and col == 1) or row == 2:
						ax.legend(loc="upper right", bbox_to_anchor=(1.02,1.02), frameon=False,
								  fontsize=sfontsize, handlelength=1.0, handletextpad=0.5, labelspacing=0.2)
					else:
						ax.legend([],[], frameon=False)

				# Heatmap
				else:
					# Only the last data column draws a colorbar, into the trailing column
					if col == last_data_col:
						cbar = True
						cbar_ax = axes[row, cbar_col]
					else:
						cbar = False
						cbar_ax = None
					cbar_kws = {"ticks": yticks[m_idx]}
					utils.generate_feature_heatmap(df_heatmaps[(m,f"{p}_{d}")], yticks[m_idx], cmaps[m_idx], cbar, cbar_ax, cbar_kws, ax, sort_file, test)

					if cbar:
						cbar_ax.set_yticklabels(yticks[m_idx])
						cbar_ax.tick_params(axis="y", labelsize=sfontsize, pad=labelpad)

				ax.set_xlim([0, 1000])
				ax.set_xticks(xtick_list)
				# Only the bottom row carries x tick labels and the axis label
				if row == last_row:
					ax.set_xticklabels(xticklabel_list)
					ax.set_xlabel(xlabel, fontsize=bfontsize, fontweight="bold")
				else:
					ax.set_xticklabels([])
					ax.set_xlabel("")

				ax.tick_params(labelsize=sfontsize, pad=labelpad)

	fig.subplots_adjust(wspace=wspace, hspace=hspace)
	# Save this figure explicitly rather than whichever figure is current
	fig.savefig(outputfile, bbox_inches="tight", dpi=300)

In [17]:
pwpool = ProcessWrapPool(2)

# Forwarded to the plotting helpers; set to True for a test run
test = False
sort_file = f"{PROJECT_DIR_o}labels/{s}_distance_to_center.json"

# Per distance class: (ylims, yticks, outputfile, figsize); ylims and yticks are ordered like ms
distal_settings = (
	[[-50, 90], [-0.5, 5], [-2, 20]],
	[[-40, 0, 40, 80], [0, 1.5, 3, 4.5], [0, 6, 12, 18]],
	f"{PROJECT_DIR_o}figures/Fig1d.png",
	(4,8),
)
proximal_settings = (
	[[-300, 550], [-0.5, 6.5], [-2, 26]],
	[[-250, 0, 250, 500], [0, 2, 4, 6], [0, 8, 16, 24]],
	f"{PROJECT_DIR_o}supp_figures/SuppFig1c.png",
	(4.5,9),
)

# One figure per distance class, each submitted as its own pool task
for d in ds:
	ylims, yticks, outputfile, figsize = distal_settings if d == "distal" else proximal_settings
	pwpool.run(generate_metaplot_and_heatmap, args=[d, ps, ms, df_metaplots, df_heatmaps, sort_file, ylims, yticks, cmaps[d], outputfile, test, figsize])

In [19]:
# Number of tasks the pool reports as finished; one task was submitted per entry of ds
len(pwpool.finished_tasks)

2

In [20]:
# Close the pool once the submitted figure tasks have been checked
pwpool.close()