In [1]:
from mphelper import ProcessWrapPool
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter, defaultdict
import seaborn as sns
from biodata.delimited import DelimitedReader, DelimitedWriter
import pybedtools
import itertools
import json
import biodataplot.metaplot as bpm
from biodata.bigwig import BigWigIReader
from biodata.bed import BEDXReader
from genomictools import GenomicCollection

In [2]:
import sys
from pathlib import Path
# Put the parent of the current working directory on the import path before importing `utils`
# (presumably that is where the project's utils module lives - confirm).
sys.path.append(str(Path.cwd().parent))
import utils

In [3]:
import matplotlib
# Import the submodule explicitly: a bare `import matplotlib` does not guarantee that
# `matplotlib.font_manager` is available as an attribute; it only worked here because
# `matplotlib.pyplot` had already been imported by an earlier cell.
import matplotlib.font_manager

# Register the Arial font files stored in ../font so that "Arial" can be selected below.
font_dir = Path.cwd().parent / "font"
for font in ["Arial.ttf", "Arial_Bold.ttf"]:
    matplotlib.font_manager.fontManager.addfont(font_dir / font)
matplotlib.rcParams["font.family"] = "Arial"

# Font sizes shared by the plotting code: `bfontsize` for titles / axis labels,
# `sfontsize` for tick labels and legends.
bfontsize = 12
sfontsize = 9

In [4]:
# Project directories (each keeps its trailing slash so file names can be appended directly):
#   PROJECT_DIR_d - BED files and bigWig alignments are read from / written to here
#   PROJECT_DIR_r - resources (not referenced in the cells of this notebook)
#   PROJECT_DIR_o - JSON inputs and figure outputs
PROJECT_DIR_d, PROJECT_DIR_r, PROJECT_DIR_o = (
	f"/fs/cbsuhy02/storage/yc2553/yc2553/projects/TRE_directionality/{subdir}/"
	for subdir in ("PROcap", "resources", "output")
)

# Get elements centered on CTCF motifs

In [5]:
# Sample identifier used as a prefix in input/output file names
s = "HCT116"
# Strand symbols
ks = ["+", "-"]
# Element categories; every "<p>_<d>" combination names one BED file
ps = ["divergent", "unidirectional"]
ds = ["distal", "proximal"]
gs = [f"{p}_{d}" for p in ps for d in ds]
# TSS types and differential-expression labels used to split the metaplots
ts = ["maxTSS", "minTSS"]
labels = ["Up", "Down"]
motif = "CTCF"

In [6]:
# Load the CTCF motif positions; keys are later matched against "chrom_start_end" element names.
inputfile = f"{PROJECT_DIR_o}FIMO/{s}_CTCF_pos.json"
with Path(inputfile).open("r") as handle:
	motifs = json.load(handle)

In [7]:
# For each element group, keep only the elements that have an entry in `motifs` and write a
# window spanning 250 bp either side of the stored position (first item of the entry).
# `mapping` remembers which original element ("chrom_start_end") each window came from.
mapping = {}
for g in gs:
	inputfile = f"{PROJECT_DIR_d}bed_plot/{s}_{g}.bed"
	outputfile = f"{PROJECT_DIR_d}bed_plot/{s}_{g}_CTCF.bed"
	with DelimitedWriter(outputfile) as dw, DelimitedReader(inputfile) as dr:
		for cols in dr:
			chrom, start, end = cols[:3]
			e = "_".join([chrom, start, end])
			if e not in motifs:
				continue
			center = motifs[e][0]
			window = (chrom, center-250, center+250)
			# Columns after the first three are carried over unchanged.
			dw.write(list(window) + cols[3:])
			mapping[window] = e

# Generate feature matrix

## Get individual matrix

In [8]:
# bigWig tracks keyed "CTCF_<condition>_<strand>"; conditions U and T are plotted as 0h and 6h below.
# "*5pl_rpm.bw": positive values
# "*5mn_rpm.bw": negative values
bws = {
	f"CTCF_{cond}_{strand}": f"{PROJECT_DIR_d}Alignments/CTCF_{cond}_5{strand}_rpm.bw"
	for cond in ("U", "T")
	for strand in ("pl", "mn")
}

In [9]:
# Signal profile of every bigWig track over every motif-centred window, stored per
# (track, element group, element strand). Rows are indexed by the original element name.
df_features = {}
use_strand = True
for g in gs:
	bed = f"{PROJECT_DIR_d}bed_plot/{s}_{g}_CTCF.bed"
	all_regions = BEDXReader.read_all(GenomicCollection, bed, strandfield="f7")
	for k in ks:
		# Change the sign: maxTSS - positive values; minTSS - negative values
		sign = 1 if k == "+" else -1
		regions = [r for r in all_regions if r.stranded_genomic_pos.strand == k]
		index = [mapping[(r.chrom, r.chromStart, r.chromEnd)] for r in regions]
		for m, bw_path in bws.items():
			profile = bpm.generate_signal_profile(regions, BigWigIReader(bw_path), fixed_size=501, use_strand=use_strand)
			df_features[(m,g,k)] = sign*pd.DataFrame(profile, index=index)

In [10]:
# 10-bp bins (the binning itself is implemented in utils.bin_values)
df_bins = {key: utils.bin_values(frame) for key, frame in df_features.items()}

In [11]:
# Wide -> long: one row per (element, bin) with the element name in "index",
# the bin in "Position" and the signal in "Feature".
df_reformat = {}
for k, binned in df_bins.items():
	wide = binned.reset_index()
	df_reformat[k] = wide.melt(
		id_vars="index",
		value_vars=list(wide.columns[1:]),
		var_name="Position",
		value_name="Feature",
	)

## Combine dataframes for plotting

In [12]:
# Load the labels: for each TSS type, a dict mapping element name -> label (compared against `labels` below).
inputfile = f"{PROJECT_DIR_o}labels/{s}_CTCF_DE.json"
with Path(inputfile).open("r") as handle:
	DE_labels = json.load(handle)

In [13]:
# Assemble one long-format table per (d, t, label): for each element category p, take the
# U (labelled 0h) and T (labelled 6h) signal of the requested TSS, restricted to the
# elements carrying `label` in DE_labels[t].
df_metaplots = {}
conditions = [("U", "0h"), ("T", "6h")]
for d, t, label in itertools.product(ds, ts, labels):
	# Which element strand supplies the pl / mn track for this TSS type.
	pl_strand, mn_strand = ("-", "+") if t == "minTSS" else ("+", "-")
	dfs = []
	for p in ps:
		group = f"{p}_{d}"
		for cond, hour in conditions:
			combined = pd.concat([df_reformat[(f"CTCF_{cond}_pl", group, pl_strand)],
								  df_reformat[(f"CTCF_{cond}_mn", group, mn_strand)]])
			selected = [e for e, v in DE_labels[t].items() if v == label]
			subset = combined[combined["index"].isin(selected)].copy()
			subset["Label"] = f"{t} ({hour})"
			dfs.append(subset)
	df_metaplots[(d,t,label)] = pd.concat(dfs).reset_index(drop=True)

# Generate metaplots

## Settings

In [14]:
# Print, for each (d, t), the highest and lowest per-position mean across all
# label x time-point curves; these numbers guide the y-limits chosen for the figures.
for d, t in itertools.product(ds, ts):
	labels2 = [f"{t} (0h)", f"{t} (6h)"]
	maxs, mins = [], []
	for l1, l2 in itertools.product(labels, labels2):
		df = df_metaplots[(d,t,l1)]
		mean_profile = df.loc[df["Label"]==l2].groupby("Position")["Feature"].mean()
		maxs.append(mean_profile.max())
		mins.append(mean_profile.min())
	print(d, t, max(maxs), min(mins))

distal maxTSS 1.151615365731473 0.0002401128122350846
distal minTSS 0.0 -0.7152549104335216
proximal maxTSS 3.335671051068509 0.0029787190796173725
proximal minTSS 0.0 -1.2136396134667324


## Generate metaplots

In [15]:
def generate_metaplot(d, t, labels, df_metaplots, ylims, yticks, outputfile, xlabel="Distance (bp)"):
	"""Draw one row of PRO-cap metaplots (one panel per label) and save it to ``outputfile``.

	Parameters
	----------
	d, t : str
		Together with each label they select ``df_metaplots[(d, t, label)]``.
		``t`` also names the two hue levels (``"<t> (0h)"``, ``"<t> (6h)"``) and
		picks the palette and legend corner ("minTSS" vs. anything else).
	labels : list of str
		One panel is drawn per entry, left to right. A single label is supported.
	df_metaplots : dict
		Maps ``(d, t, label)`` to a long-format DataFrame with the columns
		"index", "Position", "Feature" and "Label".
	ylims, yticks : sequence of numbers
		y-axis limits and tick positions applied to every panel.
	outputfile : str
		Path the figure is written to (300 dpi, tight bounding box).
	xlabel : str
		x-axis label of every panel.

	Returns
	-------
	None
	"""
	n_panels = len(labels)
	# squeeze=False keeps `axes` two-dimensional, so indexing also works for one panel.
	fig, axes = plt.subplots(1, n_panels, figsize=(5.5,1.7), squeeze=False)
	labelpad = 2
	y_align = -0.2
	wspace = 0.3

	hue_order = [f"{t} (0h)", f"{t} (6h)"]
	palette = ["#a6cee3", "#08519c"] if t == "minTSS" else ["#fb9a99", "#a50f15"]

	# x axis is in bin units (0-50); only the outer and centre ticks are labelled in bp.
	xticklabel_list = ["-250", "", "", "", "", "0", "", "", "", "", "250"]
	xtick_list = [n*5 for n in range(11)]

	# enumerate instead of labels.index(label): correct for duplicate labels, no rescan.
	for col, label in enumerate(labels):
		ax = axes[0, col]
		df = df_metaplots[(d,t,label)]
		sns.lineplot(data=df, x="Position", y="Feature", hue="Label", hue_order=hue_order, palette=palette, ax=ax, errorbar="se")

		ax.spines[["right", "top"]].set_visible(False)
		ax.spines['left'].set_position(('outward', 10))
		ax.spines['bottom'].set_position(('outward', 10))

		# Number of distinct elements contributing to this panel.
		tot = len(set(df["index"]))
		ax.set_title(f"{label} (n={tot})", fontsize=bfontsize, pad=labelpad+5, fontweight="bold")
		if col == n_panels - 1:
			# Legend only on the last panel, placed in the corner away from the signal.
			loc = "lower right" if t == "minTSS" else "upper right"
			ax.legend(loc=loc, fontsize=sfontsize)
		else:
			ax.legend([],[], frameon=False)

		ax.set_ylim(ylims)
		ax.set_yticks(yticks)
		ax.set_yticklabels([str(y) for y in yticks])
		if col == 0:
			ax.set_ylabel("PRO-cap", fontsize=bfontsize, fontweight="bold")
			ax.yaxis.set_label_coords(y_align, 0.5)
		else:
			# Panels share the y scale; only the first one shows label and tick text.
			ax.set_ylabel("")
			ax.set_yticklabels([])

		ax.set_xlim([0, 50])
		ax.set_xticks(xtick_list)
		ax.set_xticklabels(xticklabel_list)
		ax.set_xlabel(xlabel, fontsize=bfontsize, fontweight="bold")

		ax.tick_params(labelsize=sfontsize, pad=labelpad)

		ax.axhline(y=0, color="#bdbdbd", ls="--", lw=1)
		# Centre marker extended beyond the axes (ymin=-1, clip_on=False). The notebook notes
		# that this renders as expected only with matplotlib v3.8.3 or lower.
		ax.axvline(x=25, ymin=-1, ymax=1, color="#bdbdbd", ls="--", lw=1, clip_on=False, zorder=1)

	fig.subplots_adjust(wspace=wspace)
	fig.savefig(outputfile, bbox_inches="tight", dpi=300)

In [16]:
# The other analyses use matplotlib v3.9.2.
# This notebook needs matplotlib v3.8.3 or lower;
# otherwise the line drawn by "ax.axvline(...)" is not displayed as expected.

In [17]:
# Pool sized to the number of figures (one per d x t combination).
pwpool = ProcessWrapPool(len(ds)*len(ts))

# (d == "distal", t == "minTSS") -> (ylims, yticks, output path relative to PROJECT_DIR_o)
panel_settings = {
	(True, True): ([-1, 0.1], [-1, 0], "figures/Fig4d.pdf"),
	(True, False): ([0, 1.6], [0, 1.5], "figures/Fig4f.pdf"),
	(False, True): ([-1.5, 0.1], [-1.5, 0], "supp_figures/SuppFig10b.pdf"),
	(False, False): ([0, 4], [0, 3.5], "supp_figures/SuppFig10d.pdf"),
}

for d, t in itertools.product(ds, ts):
	ylims, yticks, relpath = panel_settings[(d == "distal", t == "minTSS")]
	outputfile = f"{PROJECT_DIR_o}{relpath}"
	pwpool.run(generate_metaplot, args=[d, t, labels, df_metaplots, ylims, yticks, outputfile])

In [19]:
# Number of tasks the pool lists as finished; len(ds)*len(ts) = 4 jobs were submitted above,
# so 4 presumably means every figure job has completed (confirm against mphelper's semantics).
len(pwpool.finished_tasks)

4

In [20]:
# Close the pool once the figure jobs have been checked above.
pwpool.close()