In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from mphelper import ProcessWrapPool
import itertools
import biodataplot.metaplot as bpm
from biodata.bigwig import BigWigIReader
from biodata.bed import BEDXReader
from genomictools import GenomicCollection

In [2]:
import sys
from pathlib import Path
# Put the parent of the current working directory on the import path before
# importing `utils` (presumably that is where the project's utils module lives — confirm).
sys.path.append(str(Path.cwd().parent))
import utils

In [3]:
import matplotlib
# Import the submodule explicitly: a bare `import matplotlib` does not load
# `matplotlib.font_manager`; the attribute access below previously worked only
# because `matplotlib.pyplot` had already been imported in an earlier cell.
import matplotlib.font_manager

# Register the Arial font files shipped in ../font (relative to the working
# directory) and make Arial the default family for every figure.
font_dir = Path.cwd().parent / "font"
for font in ["Arial.ttf", "Arial_Bold.ttf"]:
    matplotlib.font_manager.fontManager.addfont(font_dir / font)
matplotlib.rcParams["font.family"] = "Arial"

# Shared font sizes read by generate_metaplot:
# bfontsize for titles / axis labels, sfontsize for legend and tick labels.
bfontsize = 12
sfontsize = 9

In [4]:
# All three project directories live under the same root on the shared file system.
PROJECT_ROOT = "/fs/cbsuhy02/storage/yc2553/yc2553/projects/TRE_directionality/"
# Location of the slurm/predict.sh submission script.
PROJECT_DIR_s = PROJECT_ROOT + "softwares/procapnet/"
# Location of the bed_plot/ input BED files.
PROJECT_DIR_d = PROJECT_ROOT + "PROcap/"
# Location of the predictions (procapnet/prediction/) and figures (supp_figures/).
PROJECT_DIR_o = PROJECT_ROOT + "output/"

# Generate feature matrix

In [5]:
# Scalars that appear in every sample / file name built below.
cell, p = "C1", "unidirectional"

# Region classes; the position of each class in this list is also passed as the
# last argument of the sbatch command.
ds = ["distal", "proximal"]

# Strand values that region strands are compared against when building profiles.
ks = ["+", "-"]

In [6]:
# Sample names follow "<cell>_<N>M" for each N listed here
# (presumably subsampled library sizes in millions — confirm).
samples = [f"{cell}_{size}M" for size in (30, 20, 10, 5, 1)]

## Get individual matrices

In [7]:
# Print one sbatch command per (sample, region class) pair; nothing is submitted here.
# The final argument is the position of the region class in `ds`
# (presumably used as the GPU id by predict.sh — confirm).

script = f"{PROJECT_DIR_s}slurm/predict.sh"
scale = "True"
model_type = "strand_merged_umap"
for s, d in itertools.product(samples, ds):
	inputfile = f"{PROJECT_DIR_d}bed_plot/{s}_{p}_{d}.bed"
	output_prefix = f"{PROJECT_DIR_o}procapnet/prediction/{s}_{p}_{d}."
	gpu_index = str(ds.index(d))
	arguments = [
		"sbatch " + script,
		cell,
		model_type,
		f"{PROJECT_DIR_o}procapnet/",
		inputfile,
		output_prefix,
		scale,
		gpu_index,
	]
	commands = " ".join(arguments)
	print(commands)

sbatch /fs/cbsuhy02/storage/yc2553/yc2553/projects/TRE_directionality/softwares/procapnet/slurm/predict.sh C1 strand_merged_umap /fs/cbsuhy02/storage/yc2553/yc2553/projects/TRE_directionality/output/procapnet/ /fs/cbsuhy02/storage/yc2553/yc2553/projects/TRE_directionality/PROcap/bed_plot/C1_30M_unidirectional_distal.bed /fs/cbsuhy02/storage/yc2553/yc2553/projects/TRE_directionality/output/procapnet/prediction/C1_30M_unidirectional_distal. True 0
sbatch /fs/cbsuhy02/storage/yc2553/yc2553/projects/TRE_directionality/softwares/procapnet/slurm/predict.sh C1 strand_merged_umap /fs/cbsuhy02/storage/yc2553/yc2553/projects/TRE_directionality/output/procapnet/ /fs/cbsuhy02/storage/yc2553/yc2553/projects/TRE_directionality/PROcap/bed_plot/C1_30M_unidirectional_proximal.bed /fs/cbsuhy02/storage/yc2553/yc2553/projects/TRE_directionality/output/procapnet/prediction/C1_30M_unidirectional_proximal. True 1
sbatch /fs/cbsuhy02/storage/yc2553/yc2553/projects/TRE_directionality/softwares/procapnet/slurm/

In [8]:
# "*pos.bigWig" & "*neg.bigWig": positive values

bws = {}
for s, d in itertools.product(samples, ds):
	# Both strand tracks share the prefix that the prediction step was given.
	prefix = f"{PROJECT_DIR_o}procapnet/prediction/{s}_{p}_{d}.pred_profiles"
	bws[(s, d)] = {
		"PROcap_pl": f"{prefix}.pos.bigWig",
		"PROcap_mn": f"{prefix}.neg.bigWig",
	}

In [9]:
df_features = {}
# (track, region strand) combinations whose profile keeps its sign;
# the two remaining combinations are multiplied by -1.
same_strand_pairs = {("PROcap_pl", "+"), ("PROcap_mn", "-")}
for s, d in bws:
	bed = f"{PROJECT_DIR_d}bed_plot/{s}_{p}_{d}.bed"
	# The strand of each region is read from BED field "f7".
	all_regions = BEDXReader.read_all(GenomicCollection, bed, strandfield="f7")
	use_strand = True

	for m, k in itertools.product(bws[(s,d)].keys(), ks):
		# Keep only the regions annotated on strand k; rows are labelled chrom_start_end.
		regions = [r for r in all_regions if r.stranded_genomic_pos.strand == k]
		index = [f"{r.chrom}_{r.chromStart}_{r.chromEnd}" for r in regions]
		sign = 1 if (m, k) in same_strand_pairs else -1
		profile = bpm.generate_signal_profile(regions, BigWigIReader(bws[(s,d)][m]), fixed_size=501, use_strand=use_strand)
		df_features[(s,d,m,k)] = sign*pd.DataFrame(profile, index=index)

In [10]:
# 10-bp bins: every per-base profile table is reduced with utils.bin_values,
# keeping the same (sample, class, track, strand) keys.

df_bins = {key: utils.bin_values(profile) for key, profile in df_features.items()}

In [11]:
# Wide -> long: one row per (region, bin) with columns "index", "Position", "Feature".
df_reformat = {}
for key, binned in df_bins.items():
	wide = binned.reset_index()
	# Everything after the first column (the former row index) is a bin column.
	bin_columns = list(wide.columns[1:])
	df_reformat[key] = pd.melt(
		wide,
		id_vars="index",
		value_vars=bin_columns,
		var_name="Position",
		value_name="Feature",
	)

## Combine dataframes for plotting

In [12]:
df_metaplots = {}
for s, d in bws:
	# maxTSS: plus-track signal of "+" regions together with minus-track signal of "-" regions.
	max_tss = pd.concat([
		df_reformat[(s, d, "PROcap_pl", "+")],
		df_reformat[(s, d, "PROcap_mn", "-")],
	]).assign(Label="maxTSS")
	# minTSS: the two opposite (track, strand) combinations.
	min_tss = pd.concat([
		df_reformat[(s, d, "PROcap_pl", "-")],
		df_reformat[(s, d, "PROcap_mn", "+")],
	]).assign(Label="minTSS")
	# Stack both label groups into one long table per (sample, class).
	df_metaplots[(s, d)] = pd.concat([max_tss, min_tss]).reset_index(drop=True)

# Generate metaplots

## Determine y-axis limits

In [13]:
# For each region class, report the largest and smallest per-position mean over
# all samples and both labels; these values guide the manual ylims/yticks choice.
for d in ds:
	maxs = []
	mins = []
	labels = ["maxTSS", "minTSS"]
	for s in samples:
		df = df_metaplots[(s,d)]
		for label in labels:
			# Mean profile across regions, computed once and reused for max and min.
			mean_profile = df[df["Label"]==label].groupby("Position")["Feature"].mean()
			maxs.append(mean_profile.max())
			mins.append(mean_profile.min())
	# Only d is printed as the identifier: the previously printed `m` was a stale
	# loop variable from an earlier cell and unrelated to these numbers.
	print(d, max(maxs), min(mins))

distal PROcap_mn 19.81123762367517 -6.256183109029338
proximal PROcap_mn 79.83225911149665 -20.000989814368978


In [13]:
# NOTE(review): this cell repeats the preceding cell (same In [13] label);
# one of the two can be removed.
# For each region class, report the largest and smallest per-position mean over
# all samples and both labels.
for d in ds:
	maxs = []
	mins = []
	labels = ["maxTSS", "minTSS"]
	for s in samples:
		df = df_metaplots[(s,d)]
		for label in labels:
			# Mean profile across regions, computed once and reused for max and min.
			mean_profile = df[df["Label"]==label].groupby("Position")["Feature"].mean()
			maxs.append(mean_profile.max())
			mins.append(mean_profile.min())
	# Only d is printed as the identifier: the previously printed `m` was a stale
	# loop variable from an earlier cell and unrelated to these numbers.
	print(d, max(maxs), min(mins))

distal PROcap_mn 19.81123762367517 -6.256183109029338
proximal PROcap_mn 79.83225911149665 -20.000989814368978


## Generate metaplots

In [14]:
def generate_metaplot(d, samples, ylims, yticks, outputfile, test, xlabel="Distance (kb)"):
	"""
	Draw a single column of metaplots (one row per sample) and save it to a file.

	Reads the module-level `df_metaplots`, `bfontsize` and `sfontsize`.

	Parameters
	----------
	d
		Second element of the `(sample, d)` keys used to index `df_metaplots`.
	samples
		Sample names; one subplot row is drawn per entry, in the given order.
	ylims
		Passed to `set_ylim` on every row.
	yticks
		Passed to `set_yticks` on every row.
	outputfile
		Destination handed to `savefig`.
	test
		Forwarded unchanged to `utils.generate_feature_metaplot`.
	xlabel
		x-axis label, shown on the bottom row only.
		NOTE(review): the tick labels are "-250"/"0"/"250" and the profiles in
		this notebook are built with fixed_size=501, so the default
		"Distance (kb)" may be meant to read bp — confirm.
	"""
	# squeeze=False keeps `axes` two-dimensional, so a single-sample call still
	# indexes correctly instead of receiving a bare Axes object.
	fig, axes = plt.subplots(len(samples), 1, figsize=(2, 5.5), squeeze=False)
	labelpad = 2
	hspace = 0.2

	hue_order = ["minTSS", "maxTSS"]
	palette = ["#2c7bb6", "#d7191c"]

	# x axis is in bin units: bins 0..48 with the centre at bin 24.
	xtick_list = [0, 24, 48]
	xticklabel_list = ["-250", "0", "250"]
	last_row = len(samples) - 1

	# enumerate gives each sample its own row even if a name were repeated.
	for row, s in enumerate(samples):
		ax = axes[row, 0]

		# Zero reference line, drawn before the profiles.
		ax.axhline(y=0, color="#969696", linestyle="--")
		utils.generate_feature_metaplot(df_metaplots[(s,d)], palette, hue_order, ax, test)

		if row == 0:
			# Only the top panel carries the legend and the column title.
			ax.legend(loc="upper left", title=None, fontsize=sfontsize, frameon=False,
					 handlelength=1.0, handletextpad=0.5, labelspacing=0.2)
			ax.set_title("Predicted", fontsize=bfontsize, pad=labelpad+5, fontweight="bold")
		else:
			ax.legend([],[], frameon=False)
			ax.set_title("")

		ax.set_xlim([0, 48])
		ax.set_xticks(xtick_list)
		if row == last_row:
			# Only the bottom panel shows x tick labels and the axis label.
			ax.set_xlabel(xlabel, fontsize=bfontsize, fontweight="bold")
			ax.set_xticklabels(xticklabel_list)
		else:
			ax.set_xlabel("")
			ax.set_xticklabels([])

		ax.set_ylabel("")
		ax.set_ylim(ylims)
		ax.set_yticks(yticks)

		ax.tick_params(labelsize=sfontsize, pad=labelpad)

	fig.subplots_adjust(hspace=hspace)
	# Save through the figure object so the result does not depend on which
	# figure pyplot currently considers active.
	fig.savefig(outputfile, bbox_inches="tight", dpi=300)

In [15]:
# One worker per region class.
pwpool = ProcessWrapPool(len(ds))

# `test` is forwarded through generate_metaplot to utils.generate_feature_metaplot;
# set it to True to enable that helper's test behaviour (semantics live in utils — confirm).
test = False

# (output file, y-limits, y-ticks) per panel; any class other than "distal"
# uses the second entry.
distal_panel = (f"{PROJECT_DIR_o}supp_figures/SuppFig3b.pdf", [-17, 32], [-15, 0, 15, 30])
other_panel = (f"{PROJECT_DIR_o}supp_figures/SuppFig3d.pdf", [-55, 100], [-45, 0, 45, 90])

for d in ds:
	outputfile, ylims, yticks = distal_panel if d == "distal" else other_panel
	pwpool.run(generate_metaplot, args=[d, samples, ylims, yticks, outputfile, test])

In [17]:
# Number of tasks the pool reports as finished; it should equal len(ds) once every
# figure has been written. NOTE(review): presumably nothing here blocks until the
# tasks complete, so re-run this cell if the count is still lower — confirm.
len(pwpool.finished_tasks)

2

In [18]:
# Release the worker pool now that the plotting tasks have been checked.
pwpool.close()