In [1]:
from mphelper import ProcessWrapPool
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import itertools
import biodataplot.metaplot as bpm
from biodata.bigwig import BigWigIReader
from biodata.bed import BEDXReader
from genomictools import GenomicCollection
import json

In [2]:
import sys
from pathlib import Path
# Append the parent of the current working directory to the import path before
# importing `utils` (presumably the project-level helper module lives there — confirm).
sys.path.append(str(Path.cwd().parent))
import utils

In [3]:
import matplotlib
# Import the submodule explicitly: a bare `import matplotlib` does not by itself
# make `matplotlib.font_manager` available, so the attribute access below should
# not depend on some other module having imported it first.
import matplotlib.font_manager

# Register the Arial font files found in ../font with matplotlib and make
# Arial the default font family for every figure in this notebook.
font_dir = Path.cwd().parent / "font"
for font in ["Arial.ttf", "Arial_Bold.ttf"]:
    matplotlib.font_manager.fontManager.addfont(font_dir / font)
matplotlib.rcParams["font.family"] = "Arial"

# Shared font sizes: `bfontsize` is used for axis labels and titles,
# `sfontsize` for tick labels and the legend.
bfontsize = 12
sfontsize = 9

In [4]:
# The three project directories share one root; spell it once and derive the rest.
_PROJECT_ROOT = "/fs/cbsuhy02/storage/yc2553/yc2553/projects/TRE_directionality/"
# Holds the bed_plot/ BED files of the regions to profile.
PROJECT_DIR_d = _PROJECT_ROOT + "PROcap/"
# Holds the ENCODE bigWig tracks.
PROJECT_DIR_r = _PROJECT_ROOT + "resources/"
# Holds the label JSON files and receives the rendered figures.
PROJECT_DIR_o = _PROJECT_ROOT + "output/"

# Generate feature matrix

## Get individual matrix

In [5]:
# Sample prefix used in the BED and label file names.
s = "HCT116"
# Element classes: transcription pattern x distance class.
ps = ["divergent", "unidirectional"]
ds = ["distal", "proximal"]
# Every "<pattern>_<distance>" group name, pattern-major
# (the same order itertools.product(ps, ds) would give).
gs = [f"{p}_{d}" for p in ps for d in ds]
# Key selecting which entry of the differential-expression label file is used.
t = "minTSS"
# Label values compared against each other in the figures.
labels = ["Up", "Unchanged"]

In [6]:
# bigWig signal tracks keyed by mark name; the key doubles as the column title
# in the figure, and the insertion order fixes the column order.
bws = dict(
	CTCF=f"{PROJECT_DIR_r}ENCODE/ENCFF388PVO.bigWig",
	RAD21=f"{PROJECT_DIR_r}ENCODE/ENCFF776IXR.bigWig",
)
# Mark names in plotting order.
ms = [*bws]

In [7]:
# Signal matrices keyed by (mark, group): one row per region in the group's BED
# file, with the columns produced by generate_signal_profile for a fixed_size
# of 1001 (presumably one column per base of the window — confirm in biodataplot).
df_features = {}
use_strand = True
for g in gs:
	bed = f"{PROJECT_DIR_d}bed_plot/{s}_{g}.bed"
	all_regions = BEDXReader.read_all(GenomicCollection, bed, strandfield="f7")
	# Rows are identified by "chrom_start_end"; the label JSON files are matched
	# against these IDs in later cells.
	index = [f"{r.chrom}_{r.chromStart}_{r.chromEnd}" for r in all_regions]
	for m, bw_path in bws.items():
		signal = bpm.generate_signal_profile(
			all_regions,
			BigWigIReader(bw_path),
			fixed_size=1001,
			use_strand=use_strand,
		)
		df_features[(m,g)] = pd.DataFrame(signal, index=index)

In [8]:
# Long-format copy of every signal matrix: one row per (region, position) pair
# with columns "index" (region ID), "Position" and "Feature" (signal value).
df_reformat = {}
for k, wide in df_features.items():
	# reset_index() moves the region IDs into a leading "index" column, so
	# every column after the first is a position column.
	df = wide.reset_index()
	df_reformat[k] = df.melt(
		id_vars="index",
		value_vars=list(df.columns[1:]),
		var_name="Position",
		value_name="Feature",
	)

## Combine dataframes for plotting

In [9]:
# Mapping of region ID -> binding call; later cells keep only the regions whose
# value is "CTCF binding".
inputfile = f"{PROJECT_DIR_o}labels/{s}_CTCF_ChIP.json"
with open(inputfile) as handle:
	binding_labels = json.load(handle)

In [10]:
# Nested mapping: DE_labels[t] maps region ID -> label, and later cells compare
# those values against the entries of `labels`.
inputfile = f"{PROJECT_DIR_o}labels/{s}_CTCF_DE.json"
with open(inputfile) as handle:
	DE_labels = json.load(handle)

In [11]:
# The region-ID filters do not depend on the mark, distance class or pattern,
# so build them once here instead of re-scanning both label dicts inside the
# innermost loop. A region passes the filter for `label` when it is called
# "CTCF binding" AND carries that label in DE_labels[t].
bound_ids = {k for k, v in binding_labels.items() if v == "CTCF binding"}
ids_by_label = {
	label: bound_ids & {k for k, v in DE_labels[t].items() if v == label}
	for label in labels
}

# Wide signal matrices keyed by (mark, distance class, label); the rows of the
# individual pattern groups are stacked in the order given by `ps`.
df_heatmaps = {}
for m, d in itertools.product(ms, ds):
	for label in labels:
		dfs = []
		for p in ps:
			df = df_features[(m,f"{p}_{d}")]
			dfs.append(df[df.index.isin(ids_by_label[label])])
		df_heatmaps[(m,d,label)] = pd.concat(dfs)

In [12]:
# The region-ID filters are independent of the loop variables below, so build
# them once up front rather than re-scanning both label dicts per iteration.
# A region passes the filter for `label` when it is called "CTCF binding" AND
# carries that label in DE_labels[t].
bound_ids = {k for k, v in binding_labels.items() if v == "CTCF binding"}
ids_by_label = {
	label: bound_ids & {k for k, v in DE_labels[t].items() if v == label}
	for label in labels
}

# Long-format tables keyed by (mark, distance class). Rows from every label
# and pattern group are concatenated, with a "Label" column recording which
# label each row was selected for.
df_metaplots = {}
for m, d in itertools.product(ms, ds):
	dfs = []
	for label in labels:
		for p in ps:
			df = df_reformat[(m,f"{p}_{d}")]
			# .copy() so the added column does not write into a view of df_reformat.
			filtered_df = df[df["index"].isin(ids_by_label[label])].copy()
			filtered_df["Label"] = label
			dfs.append(filtered_df)
	df_metaplots[(m,d)] = pd.concat(dfs).reset_index(drop=True)

# Up vs. unchanged

## Settings

In [13]:
# For every (mark, distance class) pair, print the largest and smallest value
# of the per-position mean signal across the labels (presumably used to pick
# the y-limits and ticks for the figures — confirm).
for m, d in itertools.product(ms, ds):
	df = df_metaplots[(m,d)]
	maxs, mins = [], []
	for label in labels:
		position_means = df.loc[df["Label"] == label].groupby("Position")["Feature"].mean()
		maxs.append(position_means.max())
		mins.append(position_means.min())
	print(d, m, max(maxs), min(mins))

distal CTCF 73.97304311394691 1.5713288303779598
proximal CTCF 63.64774284579537 1.919783892011956
distal RAD21 6.401952110947325 0.5282060103490949
proximal RAD21 3.8493831370477425 0.5395014392437808


## Metaplots & heatmaps

In [14]:
def generate_metaplot_and_heatmap(d, ps, ms, df_metaplots, df_heatmaps, sort_file, ylims, yticks, cmap, outputfile, test, xlabel="Distance (kb)"):
	"""
	Draw a metaplot/heatmap panel for one distance class and save it.

	The figure has one column per mark in ``ms`` and five rows: a metaplot,
	one heatmap per label ("Up", then "Unchanged"), an empty spacer row, and a
	horizontal colorbar belonging to the last heatmap.

	Parameters
	----------
	d : key of the distance class; used to look up ``df_metaplots[(m, d)]``
		and ``df_heatmaps[(m, d, label)]``.
	ps : not used by this function; kept so existing callers keep working.
	ms : list of mark names, one figure column each. The axes grid is indexed
		in two dimensions, so at least two marks are required.
	df_metaplots : dict of long-format tables keyed by ``(m, d)``.
	df_heatmaps : dict of signal matrices keyed by ``(m, d, label)``.
	sort_file : forwarded to ``utils.generate_feature_heatmap``.
	ylims, yticks : per-mark y-limits / tick values, indexed by column
		position in ``ms``; ``yticks`` is also used for the colorbar ticks.
	cmap : forwarded to ``utils.generate_feature_heatmap``.
	outputfile : path the figure is written to (300 dpi, tight bounding box).
	test : forwarded unchanged to both ``utils`` plotting helpers.
	xlabel : x-axis label shown under the bottom heatmap row.

	Notes
	-----
	Relies on the notebook-level names ``plt``, ``utils``, ``bfontsize`` and
	``sfontsize``.
	"""
	height_ratios = [5,5,5,0.8,1]
	fig, axes = plt.subplots(len(height_ratios), len(ms), figsize=(3.5, 5), gridspec_kw={'height_ratios': height_ratios})
	spacer_row, cbar_row = 3, 4
	# The spacer row only separates the heatmaps from the colorbars.
	for col in range(len(ms)):
		axes[spacer_row, col].axis("off")
	labelpad = 2
	y_align = -0.25
	wspace = 0.3
	hspace = 0.3

	hue_order = ["Up", "Unchanged"]
	palette = ["#feb24c", "#d9d9d9"]

	# Columns are sized by fixed ticks at 0 / 500 / 1000 labelled in kb around the centre.
	xtick_list = [0,500,1000]
	xticklabel_list = ["-0.5", "0", "0.5"]
	last_row = 2

	for col, m in enumerate(ms):
		col_yticks = yticks[col]
		for row in range(3):
			ax = axes[row, col]
			# Metaplot
			if row == 0:
				utils.generate_feature_metaplot(df_metaplots[(m,d)], palette, hue_order, ax, test)

				ax.set_ylim(ylims[col])
				ax.set_yticks(col_yticks)
				if col == 0:
					ax.set_ylabel("ChIP-seq", fontsize=bfontsize, fontweight="bold")
					ax.get_yaxis().set_label_coords(y_align, 0.5)
					# One shared legend, anchored above the first column.
					ax.legend(loc="upper center", bbox_to_anchor=(1.1, 1.6), ncol=len(hue_order), fontsize=sfontsize)
				else:
					ax.set_ylabel("")
					ax.legend([],[], frameon=False)

				ax.set_title(m, fontsize=bfontsize, pad=5, fontweight="bold")

			# Heatmap
			else:
				# Use the local hue_order for both the data lookup and the row
				# title so the two can never disagree (the data lookup used to
				# read the notebook-level `labels` list instead).
				label = hue_order[row-1]
				df_heatmap = df_heatmaps[(m,d,label)]

				# Only the bottom heatmap draws a colorbar, into the last grid row.
				cbar = row == last_row
				cbar_ax = axes[cbar_row, col] if cbar else None
				cbar_kws = {"ticks": col_yticks, "orientation": "horizontal"}

				utils.generate_feature_heatmap(df_heatmap, col_yticks, cmap, cbar, cbar_ax, cbar_kws, ax, sort_file, test)

				if cbar:
					cbar_ax.set_xticklabels(col_yticks)
					cbar_ax.tick_params(axis="x", labelsize=sfontsize, pad=labelpad)

				if col == 0:
					ylabel = f"{label}\n(n={len(df_heatmap)})"
					ax.set_ylabel(ylabel, fontsize=bfontsize, fontweight="bold")
					ax.get_yaxis().set_label_coords(y_align/3, 0.5)
				else:
					ax.set_ylabel("")

			ax.set_xlim([0, 1000])
			ax.set_xticks(xtick_list)
			# Only the bottom row carries x tick labels and the axis title.
			if row == last_row:
				ax.set_xticklabels(xticklabel_list)
				ax.set_xlabel(xlabel, fontsize=bfontsize, fontweight="bold")
			else:
				ax.set_xticklabels([])
				ax.set_xlabel("")

			ax.tick_params(labelsize=sfontsize, pad=labelpad)

	fig.subplots_adjust(wspace=wspace, hspace=hspace)
	fig.savefig(outputfile, bbox_inches="tight", dpi=300)

In [15]:
# One worker per distance class; each figure is rendered in its own task.
pwpool = ProcessWrapPool(len(ds))

# Forwarded unchanged to the utils plotting helpers (set to True for a test
# run — exact effect is defined in utils).
test = False
cmap = "viridis"
sort_file = f"{PROJECT_DIR_o}labels/{s}_CTCF_sum.json"
for d in ds:
	is_distal = d == "distal"
	# Axis settings are indexed by column position in `ms`; the plotting
	# function only reads the first len(ms) entries of each list.
	ylims = [[-10, 100], [-1, 10], [-1, 10]] if is_distal else [[-10, 100], [-1, 7], [-2, 14]]
	yticks = [[0, 30, 60, 90], [0, 3, 6, 9], [0, 3, 6, 9]] if is_distal else [[0, 30, 60, 90], [0, 2, 4, 6], [0, 4, 8, 12]]
	# Same file name for both classes; only the destination folder differs.
	folder = "supp_figures/" if is_distal else "other_figures/"
	outputfile = f"{PROJECT_DIR_o}{folder}SuppFig7e.png"
	pwpool.run(
		generate_metaplot_and_heatmap,
		args=[d, ps, ms, df_metaplots, df_heatmaps, sort_file, ylims, yticks, cmap, outputfile, test],
	)

In [18]:
# Number of tasks the pool reports as finished; one task was submitted per
# entry in `ds`, so re-run this cell until the count reaches len(ds).
len(pwpool.finished_tasks)

2

In [19]:
# Release the process pool that was created for the figure tasks.
pwpool.close()