In [1]:
### Goals
# 1. Compare CTCF/RAD21 binding (metaplots and heatmaps) between elements that are upregulated and elements that are unchanged after CTCF degradation.

### Input
# 1. PROcap bw files in HCT116 before and after CTCF degradation

### Output
# 1. Metaplots of elements with upregulated and unchanged signals on the side with fewer reads

In [1]:
import os

# Root of the project tree. Set the TRE_DIRECTIONALITY_ROOT environment variable to run
# the notebook from another location; the default is the original location.
PROJECT_ROOT = os.environ.get("TRE_DIRECTIONALITY_ROOT", "/home/yc2553/projects/TRE_directionality/")

# os.path.join(..., "") keeps the trailing separator, because the cells below build
# file paths by plain string concatenation.
PROJECT_DIR_s = os.path.join(PROJECT_ROOT, "codes", "")      # appended to sys.path to import `utils`
PROJECT_DIR_r = os.path.join(PROJECT_ROOT, "Resources", "")  # external resources (ENCODE bigWig files)
PROJECT_DIR_d = os.path.join(PROJECT_ROOT, "PROcap", "")     # element bed files (bed_plot/)
PROJECT_DIR_o = os.path.join(PROJECT_ROOT, "output", "")     # labels/, feature_matrix/, other_figures/

In [2]:
from mphelper import ProcessWrapPool
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from biodata.delimited import DelimitedReader, DelimitedWriter

In [3]:
import sys
sys.path.append(PROJECT_DIR_s)

In [41]:
import importlib
import utils
importlib.reload(utils)

<module 'utils' from '/home/yc2553/projects/TRE_directionality/codes/utils.py'>

# 1. Generate feature matrix

## 1.1 Generate element files

In [5]:
s = "HCT116"  # sample name; prefixes every label, bed and feature-matrix file name used below
ks = ["pl", "mn"]  # presumably plus/minus strand keys — not referenced in the cells of this notebook; TODO confirm
ts = ["maximum", "minimum"]  # suffix of the CTCF_degron label files, one label table per value (NOTE(review): presumably the side with more / fewer reads — confirm)
ps = ["divergent", "unidirectional"]  # element types; their bed files are pooled when the element files are written
ds = ["proximal", "distal"]  # element groups; every output file is generated separately for each
labels = ["Up", "Unchanged", "Down"]  # values matched against the "Value" column of the CTCF_degron label tables

In [11]:
# Only include elements with CTCF binding

inputfile = PROJECT_DIR_o + "labels/" + "_".join([s, "CTCF_binary"]) + ".txt"
df_binding = pd.read_table(inputfile)
# Kept as a set: `included` is only used for membership tests (once per bed row when the
# element files are written), and a list would turn every one of those tests into a linear scan.
included = set(df_binding.loc[df_binding["Value"] == "CTCF binding", "Element"])

In [12]:
# DE labels: for every entry of `ts`, map each element to its value in the CTCF_degron label table

labels2 = {}
for t in ts:
	inputfile = f"{PROJECT_DIR_o}labels/{s}_CTCF_degron_{t}.txt"
	df_degron = pd.read_table(inputfile)
	labels2[t] = {
		element: value
		for element, value in zip(df_degron["Element"], df_degron["Value"])
	}

In [14]:
# Combine divergent and unidirectional elements into one bed file per
# (d, t, label) group, keeping only elements with CTCF binding.
# `prefix` collects the group names for the cells below.

prefix = []
for d in ds:
	for t in ts:
		for label in labels:
			group = "_".join([s, d, t, label])
			prefix.append(group)
			outputfile = PROJECT_DIR_o + "feature_matrix/" + group + ".bed"
			with DelimitedWriter(outputfile) as dw:
				for p in ps:
					inputfile = PROJECT_DIR_d + "bed_plot/" + "_".join([s, p, d]) + ".bed"
					with DelimitedReader(inputfile) as dr:
						for cols in dr:
							chrom, _, _, start2, end2 = cols[:5]
							# The label tables are looked up by the string form of (chrom, start2, end2)
							element = str((chrom, start2, end2))
							if element not in included:
								continue
							if labels2[t][element] == label:
								dw.write(cols)

## 1.2 Generate feature matrix for heatmap

In [15]:
# Among elements with CTCF binding, some showed upregulation after CTCF degradation while others did not.
# One possible reason is the strength of CTCF binding: strong binding may interfere with transcription more severely.

# Signal tracks (bigWig) keyed by factor name; the key is reused in output file names and as the plot title.
bws = {"CTCF": PROJECT_DIR_r + "ENCODE/ENCFF388PVO.bigWig",
	   "RAD21": PROJECT_DIR_r + "ENCODE/ENCFF776IXR.bigWig"
	   }

In [42]:
# One worker per (element group, signal track) pair
pwpool = ProcessWrapPool(len(prefix)*len(bws))

# Strand settings handed through to utils.get_feature_matrix_for_heatmap
strand_sensitive = False
strand = None

for g in prefix:
	bed_in = PROJECT_DIR_o + "feature_matrix/" + g + ".bed"
	for m, bw in bws.items():
		outputfile = PROJECT_DIR_o + "feature_matrix/" + g + "_" + m + ".txt"
		pwpool.run(
			utils.get_feature_matrix_for_heatmap,
			args=[bed_in, bw, outputfile, strand_sensitive, strand],
		)

In [43]:
# 12*2

len(pwpool.finished_tasks)

16

In [44]:
pwpool.close()

## 1.3 Reformat feature matrix for metaplot

In [30]:
# Map every per-group feature matrix file to its label, taken from the
# second-to-last "_"-separated field of the file name (<group>_<m>.txt)
labels3 = {}
for g in prefix:
	for m in bws:
		inputfile = PROJECT_DIR_o + "feature_matrix/" + g + "_" + m + ".txt"
		labels3[inputfile] = inputfile.rsplit("/", 1)[-1].split("_")[-2]

In [45]:
# Merge the per-label feature matrices of each (d, t, m) combination
# into one reformatted file for the metaplots

pwpool = ProcessWrapPool(len(ds)*len(bws))

matrix_dir = PROJECT_DIR_o + "feature_matrix/"
for d in ds:
	for t in ts:
		for m in bws:
			inputfiles = [
				matrix_dir + "_".join([s, d, t, label, m]) + ".txt"
				for label in labels
			]
			outputfile = matrix_dir + "_".join([s, d, t, m, "reformat"]) + ".txt"
			pwpool.run(
				utils.reformat_feature_matrix_for_metaplot,
				args=[inputfiles, outputfile, labels3],
			)

In [47]:
# 2*2*2

len(pwpool.finished_tasks)

8

In [48]:
pwpool.close()

# 2. Up vs. Unchanged

## 2.1 Settings

### 2.1.1 ylims

In [36]:
# Decide on the ylims
# TREs & controls
# Set same value ranges for metaplots and heatmaps

def get_ylims(ms):
	"""
	Print, for every (d, t, m) combination, the largest and the smallest
	per-column mean found across the feature matrices of all labels.

	ms: feature names (the last field of the feature matrix file names).
	Nothing is returned; the printed ranges are meant to guide the choice of ylims.
	"""
	matrix_dir = PROJECT_DIR_o + "feature_matrix/"
	for d in ds:
		for t in ts:
			for m in ms:
				maxs, mins = [], []
				for label in labels:
					df = pd.read_table(matrix_dir + "_".join([s, d, t, label, m]) + ".txt")
					# Mean of every column except the element identifier
					col_means = df.drop(columns=["Element"]).mean()
					maxs.append(max(col_means))
					mins.append(min(col_means))
				print(d, t, m, max(maxs), min(mins))

In [37]:
ms = list(bws.keys())

In [49]:
get_ylims(ms)

proximal maximum CTCF 65.5829699120279 1.4947985698069846
proximal maximum RAD21 3.91938620435316 0.49916898412893046
proximal minimum CTCF 65.66092846847948 1.581872006257375
proximal minimum RAD21 3.975506935757203 0.39017133265733717
distal maximum CTCF 67.83409863625738 0.9630413970281911
distal maximum RAD21 5.851195062933997 0.4322274452032045
distal minimum CTCF 74.57393088167262 1.26565819978714
distal minimum RAD21 6.353954400954672 0.3763418184085326


## 2.2 Metaplots & heatmaps

In [40]:
def generate_metaplot_and_heatmap(s, d, t, f_sort, ms, ylims, yticks, cbarticks, cmaps, test, outputfile, xlabel="Distance to center (kb)", fontsize=20, legend_fontsize=12, labelpad=7.5, labelsize=20, y_align= -0.2, wspace=0.3, hspace=0.3):
	"""
	Generate metaplots and heatmaps for a list of features and save them as one figure.

	One figure column is drawn per entry of `ms`. Row 0 holds the metaplot, the next rows
	hold one heatmap per group in `hue_order`, followed by a blank spacer row and a
	colorbar row.

	Parameters
	----------
	s, d, t : str
		Used to build the names of the feature matrix files that are plotted.
	f_sort : str
		Passed to utils.generate_feature_heatmap (presumably used to order the heatmap rows).
	ms : list of str
		Feature names; one figure column each, also used as column titles.
	ylims, yticks : list
		Metaplot y limits / y ticks, one entry per feature, in the order of `ms`.
	cbarticks : list
		Colorbar ticks, one entry per feature, in the order of `ms`.
	cmaps : list
		Colormaps, one entry per feature, in the order of `ms`.
	test
		Forwarded unchanged to the utils plotting helpers.
	outputfile : str
		Path the figure is saved to.
	xlabel, fontsize, labelpad, labelsize, y_align, wspace, hspace
		Text and layout settings.
	legend_fontsize
		NOTE(review): accepted but currently not applied — the legend is drawn with `fontsize`.
	"""

	height_ratios = [5,5,5,0.5,1]
	n_rows = len(height_ratios)
	n_plot_rows = n_rows - 2	# metaplot + heatmaps
	spacer_row = n_rows - 2
	cbar_row = n_rows - 1

	# squeeze=False keeps `axes` two-dimensional even when there is a single feature,
	# so every axis can be addressed as axes[row, col]
	fig, axes = plt.subplots(n_rows, len(ms), figsize=(5*len(ms), 12), gridspec_kw={'height_ratios': height_ratios}, squeeze=False)
	for col in range(len(ms)):
		axes[spacer_row, col].axis("off")

	hue_order = ["Up", "Unchanged"]
	palette = ["#fb8072", "#d9d9d9"]

	xtick_list = [0,500,1000]
	xticklabel_list = ["-0.5", "0", "0.5"]

	# enumerate (rather than ms.index) gives the right column even if a feature is listed twice
	for col, m in enumerate(ms):
		for row in range(n_plot_rows):
			ax = axes[row, col]
			is_last_row = row == n_plot_rows - 1

			# Metaplot
			if row == 0:
				inputfile = PROJECT_DIR_o + "feature_matrix/" + "_".join([s, d, t, m, "reformat"]) + ".txt"
				utils.generate_feature_metaplot(inputfile, palette, hue_order, ax, test)

				# Y axis
				ax.set_ylim(ylims[col])
				ax.set_yticks(yticks[col])
				ax.tick_params(labelsize=labelsize, pad=labelpad)
				if col == 0:
					ax.set_ylabel("ChIP-seq", fontsize=fontsize, fontweight="bold")
					ax.get_yaxis().set_label_coords(y_align, 0.5)
				else:
					ax.set_ylabel("")

				# Legend: shown once, on the first column
				if col == 0:
					ax.legend(loc='upper center', bbox_to_anchor=(1.1, 1.6), ncol=len(hue_order), fontsize=fontsize)
				else:
					ax.legend([],[], frameon=False)

				# Title
				ax.set_title(m, fontsize=25, pad=labelpad+5, fontweight="bold")

			# Heatmap
			else:
				# Heatmap rows follow hue_order so that they always show the same groups as the metaplot
				group = hue_order[row-1]

				# Only the last heatmap row draws a colorbar, into the dedicated bottom row
				cbar = is_last_row
				cbar_ax = axes[cbar_row, col] if is_last_row else None

				feature_files = [PROJECT_DIR_o + "feature_matrix/" + "_".join([s, d, t, group, m]) + ".txt"]
				cbar_kws = {"ticks": cbarticks[col], "orientation": "horizontal"}
				# Sort by the sum of CTCF
				utils.generate_feature_heatmap(feature_files, f_sort, cbarticks[col], cmaps[col], cbar, cbar_ax, cbar_kws, ax, test)

				if is_last_row:
					cbar_ax.set_xticklabels(cbarticks[col])
					cbar_ax.tick_params(axis="x", labelsize=labelsize, pad=labelpad)

				if col == 0:
					ax.set_ylabel(group.capitalize(), fontsize=fontsize, fontweight="bold")
					ax.get_yaxis().set_label_coords(y_align, 0.5)
				else:
					ax.set_ylabel("")

			# X axis: tick labels and axis label only on the bottom plot row
			ax.set_xlim([0, 1000])
			ax.set_xticks(xtick_list)
			if is_last_row:
				ax.set_xticklabels(xticklabel_list)
				ax.set_xlabel(xlabel, fontsize=fontsize, fontweight="bold")
				ax.tick_params(labelsize=labelsize, pad=labelpad)
			else:
				ax.set_xticklabels([])
				ax.set_xlabel("")

	fig.subplots_adjust(wspace=wspace, hspace=hspace)
	fig.savefig(outputfile, bbox_inches = 'tight', dpi=300)
	# Release the figure: worker processes are reused for several tasks and pyplot
	# would otherwise keep every figure alive
	plt.close(fig)

In [50]:
pwpool = ProcessWrapPool(len(ds))

# test = True
test = False
cmaps = ["viridis"]*2
sum_file = PROJECT_DIR_o + "labels/" + "_".join([s, "CTCF_sum"]) + ".txt"

# (ylims, yticks) per element group; each holds one entry per feature in `ms`
distal_axes = ([[-5, 95], [-1, 10]], [[0, 30, 60, 90], [0, 3, 6, 9]])
other_axes = ([[-5, 80], [-1,7]], [[0, 25, 50, 75], [0,2,4,6]])

for t in ts:
	for d in ds:
		ylims, yticks = distal_axes if d == "distal" else other_axes
		outputfile = PROJECT_DIR_o + "other_figures/" + "_".join([s, d, t, "DE_CTCF_RAD21_binding"]) + ".png"
		# yticks is passed twice on purpose: the second one fills the `cbarticks` parameter
		pwpool.run(
			generate_metaplot_and_heatmap,
			args=[s, d, t, sum_file, ms, ylims, yticks, yticks, cmaps, test, outputfile],
		)

In [53]:
len(pwpool.finished_tasks)

4

In [54]:
pwpool.close()