In [1]:
### Goals
# 1. Show ChIP-seq of CTCF & cohesin (RAD21) between divergent and unidirectional elements (HCT116, distal, no auxin)

### Input
# 1. ChIP-seq bw files in HCT116

### Output
# 1. Fig.2A
# Metaplots and heatmaps of CTCF & cohesin (RAD21) between divergent and unidirectional elements
# 2. SuppFig.3A
# Metaplots of of CTCF & cohesin (RAD21) among divergent elements with varying DIs

In [1]:
PROJECT_DIR_s = "/home/yc2553/projects/TRE_directionality/codes/"
PROJECT_DIR_r = "/home/yc2553/projects/TRE_directionality/Resources/"
PROJECT_DIR_d = "/home/yc2553/projects/TRE_directionality/PROcap/"
PROJECT_DIR_o = "/home/yc2553/projects/TRE_directionality/output/"

In [2]:
from mphelper import ProcessWrapPool
import pandas as pd
import matplotlib.pyplot as plt
from biodata.delimited import DelimitedReader
import numpy as np
import pyBigWig
import seaborn as sns
import subprocess

In [3]:
import sys
sys.path.append(PROJECT_DIR_s)

In [4]:
import importlib
import utils
importlib.reload(utils)

<module 'utils' from '/home/yc2553/projects/TRE_directionality/codes/utils.py'>

# 1. Generate feature matrix

## 1.1 Generate feature matrix for heatmap

In [5]:
s = "HCT116"
ks = ["pl", "mn"]
ps = ["divergent", "unidirectional"]
ds = ["proximal", "distal"]
gs = [p+"_"+d for p in ps for d in ds]
cs = ["", "_control"]

In [51]:
# CTCF: https://www.encodeproject.org/files/ENCFF388PVO/@@download/ENCFF388PVO.bigWig
# RAD21: https://www.encodeproject.org/files/ENCFF776IXR/@@download/ENCFF776IXR.bigWig

bws = {"CTCF": PROJECT_DIR_r + "ENCODE/ENCFF388PVO.bigWig",
		"RAD21": PROJECT_DIR_r + "ENCODE/ENCFF776IXR.bigWig"
		}

In [7]:
pwpool = ProcessWrapPool(20)

strand_sensitive = False
strand = None
for g in gs:
	for c in cs:
		bed_in = PROJECT_DIR_d + "bed_plot/" + "_".join([s, g+c]) + ".bed"
		for m in bws:
			outputfile = PROJECT_DIR_o + "feature_matrix/" + "_".join([s, g, m+c]) + ".txt"			
			pwpool.run(utils.get_feature_matrix_for_heatmap, args=[bed_in, bws[m], outputfile, strand_sensitive, strand])

In [10]:
# 4*2*2

len(pwpool.finished_tasks)

16

In [11]:
pwpool.close()

## 1.2 Reformat feature matrix for metaplot

In [9]:
labels = {}
for g in gs:
	for c in cs:
		for m in bws:
			inputfile = PROJECT_DIR_o + "feature_matrix/" + "_".join([s, g, m+c]) + ".txt"
			if "control" in inputfile:
				label = "Control"
			else:
				label = g.split("_")[0].capitalize()
			labels[inputfile] = label

In [12]:
# Combine divergent and unidirectional elements

pwpool = ProcessWrapPool(20)

for d in ds:
	for m in bws:
		inputfiles = [PROJECT_DIR_o + "feature_matrix/" + "_".join([s, p, d, m+c]) + ".txt" for p in ps for c in cs]
		outputfile = PROJECT_DIR_o + "feature_matrix/" + "_".join([s, d, m, "reformat"]) + ".txt"
		pwpool.run(utils.reformat_feature_matrix_for_metaplot, args=[inputfiles, outputfile, labels])

In [19]:
# 2*2

len(pwpool.finished_tasks)

4

In [20]:
pwpool.close()

# 2. Divergent vs. unidirectional

## 2.1 Settings

### 2.1.1 ylims

In [49]:
# Set same value ranges for metaplots and heatmaps
# Divergent & unidirectional & controls

def get_ylims(d, m):
	maxs = []
	mins = []
	inputfiles = [PROJECT_DIR_o + "feature_matrix/" + "_".join([s, p, d, m+c]) + ".txt" for p in ps for c in cs]
	for inputfile in inputfiles:
		df = pd.read_table(inputfile)
		df2 = df.drop(["Element"], axis=1)
		maxs.append(max(df2.mean()))
		mins.append(min(df2.mean()))
	print(d, m, max(maxs), min(mins))

In [52]:
ms = list(bws.keys())

In [53]:
for d in ds:
	for m in ms:
		get_ylims(d, m)

proximal CTCF 25.902577852076245 0.5514182884236124
proximal RAD21 2.753199483200548 0.43967587678585873
distal CTCF 29.72227435389055 0.6293929613368858
distal RAD21 4.16346475071798 0.47953974074663075


### 2.1.2  Sorting order

In [17]:
# Calculate the sum of CTCF ChIP-seq signals

dfs = []
for g in gs:
	inputfile = PROJECT_DIR_o + "feature_matrix/" + "_".join([s, g, "CTCF"]) + ".txt"
	df = pd.read_table(inputfile)
	df2 = df.drop(["Element"], axis=1)
	# Sum each row 
	df2["Value"] = df2.sum(axis=1)
	# Add back element
	df2["Element"] = df["Element"]
	df2["Label"] = g
	df2 = df2[["Element", "Value", "Label"]]
	dfs.append(df2)

In [18]:
sum_df = pd.concat(dfs, ignore_index=True)
outputfile = PROJECT_DIR_o + "labels/" + "_".join([s, "CTCF_sum"]) + ".txt"
sum_df.to_csv(outputfile, sep="\t", index=False)

## 2.2 Metaplots & heatmaps

In [55]:
def generate_metaplot_and_heatmap(s, d, ms, f_sort, ylims, yticks, cbarticks, cmaps, test, outputfile, xlabel="Distance to center (kb)", fontsize=20, legend_fontsize=12, labelpad=7.5, labelsize=20, y_align= -0.2, wspace=0.3, hspace=0.3):
	"""
	Generate metaplots and heatmaps for a list of features.  
	"""
	
	height_ratios = [5,5,5,0.5,1]
	fig, axes = plt.subplots(len(height_ratios), len(ms), figsize=(5*len(ms), 12), gridspec_kw={'height_ratios': height_ratios})
	# Don't show the plot on row 3
	for col in range(len(ms)):
		if len(ms) == 1:
			axes[3].axis("off")
		else:
			axes[3,col].axis("off")
		
	for m in ms:
		col = ms.index(m)
		for row in range(3):
			if len(ms) == 1:
				ax = axes[row]
			else:
				ax = axes[row, col]
			# Metaplot
			if row == 0:
				inputfile = PROJECT_DIR_o + "feature_matrix/" + "_".join([s, d, m, "reformat"]) + ".txt"
				hue_order = ["Control", "Divergent", "Unidirectional"]
				palette = ["#d9d9d9", "#313695", "#fccde5"]
				utils.generate_feature_metaplot(inputfile, palette, hue_order, ax, test)
				
				# Y axis
				ax.set_ylim(ylims[ms.index(m)])
				ax.set_yticks(yticks[ms.index(m)])
				ax.tick_params(labelsize=labelsize, pad=labelpad)
				if col == 0:
					ax.set_ylabel("ChIP-seq", fontsize=fontsize, fontweight="bold")
					ax.get_yaxis().set_label_coords(y_align, 0.5)
				else:
					ax.set_ylabel("")

				# Legend
				if col == 0:
					ax.legend(loc='upper center', bbox_to_anchor=(1.1, 1.6), ncol=len(hue_order), fontsize=fontsize)
				else:
					ax.legend([],[], frameon=False)
					
			# Heatmap
			else:
				if row == 2:
					cbar = True
					if len(ms) == 1:
						cbar_ax = axes[4]
					else:
						cbar_ax = axes[4, col]
				else:
					cbar = False
					cbar_ax = None				

				feature_files = [PROJECT_DIR_o + "feature_matrix/" + "_".join([s, ps[row-1], d, m]) + ".txt"]
				cbar_kws = {"ticks": cbarticks[ms.index(m)], "orientation": "horizontal"}
				# Sort by the sum of CTCF
				utils.generate_feature_heatmap(feature_files, f_sort, cbarticks[ms.index(m)], cmaps[ms.index(m)], cbar, cbar_ax, cbar_kws, ax, test)

				if row == 2:
					cbar_ax.set_xticklabels(cbarticks[ms.index(m)])
					cbar_ax.tick_params(axis="x", labelsize=labelsize, pad=labelpad)

				if col == 0:
					ax.set_ylabel(ps[row-1].capitalize(), fontsize=fontsize, fontweight="bold")
					ax.get_yaxis().set_label_coords(y_align, 0.5)
				else:
					ax.set_ylabel("")
				
			# Title
			if row == 0:
				ax.set_title(m, fontsize=25, pad=labelpad+5, fontweight="bold")
		
			# X axis
			xtick_list = [0,500,1000]
			xticklabel_list = ["-0.5", "0", "0.5"]
			ax.set_xlim([0, 1000])
			ax.set_xticks(xtick_list)
			if row == 2:
				ax.set_xticklabels(xticklabel_list)
				ax.set_xlabel(xlabel, fontsize=fontsize, fontweight="bold")
				ax.tick_params(labelsize=labelsize, pad=labelpad)
			else:
				ax.set_xticklabels([])
				ax.set_xlabel("")

	fig.subplots_adjust(wspace=wspace, hspace=hspace)
	plt.savefig(outputfile, bbox_inches = 'tight', dpi=300) 

In [36]:
pwpool = ProcessWrapPool(len(ds))

# test = True
test = False
cmaps = ["viridis"]*2
f_sort = PROJECT_DIR_o + "labels/" + "_".join([s, "CTCF_sum"]) + ".txt"
for d in ds:
	if d == "distal":
		ylims = [[-3, 33], [-0.5, 5.0]]
		yticks = [[0, 10, 20, 30], [0, 1.5, 3.0, 4.5]]
	else:
		ylims = [[-3, 33], [-0.5, 3.5]]
		yticks = [[0, 10, 20, 30], [0, 1, 2, 3]]
	outputfile = PROJECT_DIR_o + "other_figures/" + "_".join([s, "CTCF_RAD21", d]) + ".png"
	pwpool.run(generate_metaplot_and_heatmap, args=[s, d, ms, f_sort, ylims, yticks, yticks, cmaps, test, outputfile])

In [41]:
len(pwpool.finished_tasks)

2

In [42]:
pwpool.close()

# 3. DI vs. ChIP-seq

In [29]:
# Divide divergent elements into different DI categories, check how skewedness correlates with CTCF/cohesin signals

p = "divergent"

## 3.1 Settings

### 3.1.1 Get DI quantiles

In [26]:
def get_DI_matrix(peak_files, bws, outputfile):

	bw1 = pyBigWig.open(bws[0])
	bw2 = pyBigWig.open(bws[1])
	
	results = []
	for peak in peak_files:
		with DelimitedReader(peak) as dr:
			for cols in dr:
				chrom, start, end, pstart, pend = cols[:5]
				start = int(start)
				end = int(end)
				reads1 = 0
				reads2 = 0
				# pyBigWig: 0-based start, 0-based end; the end is not included
				if bw1.intervals(chrom, start, end+1) != None:
					for start2, end2, score in bw1.intervals(chrom, start, end+1):
						l = min(end+1, end2) - max(start, start2)
						reads1 += abs(score) * l
				if bw2.intervals(chrom, start, end+1) != None:
					for start2, end2, score in bw2.intervals(chrom, start, end+1):
						l = min(end+1, end2) - max(start, start2)
						reads2 += abs(score) * l
				DI = (reads1-reads2)/(reads1+reads2)
				results.append([(chrom, pstart, pend), DI])
	df = pd.DataFrame(results, columns=["Element", "DI"])
	df.to_csv(outputfile, sep="\t", index=False)

In [30]:
for d in ds:
	peak_files = [PROJECT_DIR_d + "bed_plot/" + "_".join([s, p, d]) + ".bed"]
	bws2 = [PROJECT_DIR_d + "Alignments/" + "_".join(["brm_CTCF_U1_and_CTCF_U2_erm", "5"+k]) + ".bw" for k in ks]
	outputfile = PROJECT_DIR_o + "labels/" + "_".join([s, d, "DI", "temp"]) + ".txt"
	get_DI_matrix(peak_files, bws2, outputfile)

In [31]:
# Divide divergent elements into four categories based on absolute DIs

for d in ds:
	inputfile = PROJECT_DIR_o + "labels/" + "_".join([s, d, "DI", "temp"]) + ".txt"
	df = pd.read_table(inputfile)
	df["DI"] = df["DI"].abs()
	quartile_values = df["DI"].quantile([0, 0.25, 0.5, 0.75, 1])
	df["Quantile"] = pd.cut(df["DI"], bins=quartile_values, labels=["Q1", "Q2", "Q3", "Q4"], right=False)
	outputfile = PROJECT_DIR_o + "labels/" + "_".join([s, d, "DI"]) + ".txt"
	df.to_csv(outputfile, sep="\t", index=False)
	subprocess.run("rm " + inputfile, shell=True)

In [56]:
DI_order = ["Q1", "Q2", "Q3", "Q4"]

In [39]:
for d in ds:
	fig, ax = plt.subplots(1,1,figsize=(3,4))
	inputfile = PROJECT_DIR_o + "labels/" + "_".join([s, d, "DI"]) + ".txt"
	df = pd.read_table(inputfile)
	g = sns.boxplot(data=df, x="Quantile", y="DI", 
					order=DI_order, 
					palette=["#efedf5", "#bcbddc", "#807dba", "#3f007d"])
	g.set_xlabel("")
	g.set_ylabel("|Directionality index|", fontsize=16, fontweight="bold")
	g.spines.right.set_visible(False)
	g.spines.top.set_visible(False)
	outputfile = PROJECT_DIR_o + "other_figures/" + "_".join([s, "DI", d]) + ".png"
	plt.savefig(outputfile, bbox_inches = 'tight', dpi=300) 
	plt.close()

### 3.1.2 ylims

In [47]:
def get_ylims2(d, m):
	f_feature = PROJECT_DIR_o + "feature_matrix/" + "_".join([s, p, d, m]) + ".txt"
	df = pd.read_table(f_feature)
	f_label = PROJECT_DIR_o + "labels/" + "_".join([s, d, "DI"]) + ".txt"
	df_label = pd.read_table(f_label)
	df = df.merge(df_label, on="Element")
	maxs = []	
	mins = []
	for g in DI_order:
		df2 = df[df["Quantile"]==g]
		df2 = df2.drop(["Element", "Quantile"], axis=1)
		maxs.append(max(df2.mean()))
		mins.append(min(df2.mean()))
	print(d, m, max(maxs), min(mins))

In [48]:
for d in ds:
	for m in ms:
		get_ylims2(d, m)

proximal CTCF 10.494617428719524 0.13475068784496097
proximal RAD21 1.1147574565316543 0.13475068784496097
distal CTCF 13.439372778819004 0.10384823847648618
distal RAD21 2.026319542766617 0.10384823847648618


## 3.2 Metaplot

In [57]:
def generate_metaplot(s, d, ms, f_label, outputfile, ylims, yticks, test, xlabel="Distance to center (kb)", fontsize=20, legend_fontsize=12, labelpad=7.5, labelsize=20, y_align= -0.2, wspace=0.2):
	fig, axes = plt.subplots(1,len(ms),figsize=(len(ms)*5,4))

	palette = ["#efedf5", "#bcbddc", "#807dba", "#3f007d"]
	df_label = pd.read_table(f_label)

	for m in ms:
		col = ms.index(m)
		ax = axes[col]
		f_feature = PROJECT_DIR_o + "feature_matrix/" + "_".join([s, d, m, "reformat"]) + ".txt"
		df = pd.read_table(f_feature)
		df = df.merge(df_label, on="Element")
		if test:
			frames = []
			for n in range(len(DI_order)):
				frames.append(df[df["Quantile"]==DI_order[n]].head(10))
			df = pd.concat(frames)
		
		sns.lineplot(data=df, x="Position", y="Feature", hue="Quantile", hue_order=DI_order, palette=palette, ax=ax)
		
		ax.set_title(m, fontsize=25, pad=labelpad+5, fontweight="bold")
		if col == len(ms)-1:
			ax.legend(loc="upper right", fontsize=12)
		else:
			ax.legend([],[], frameon=False)
		
		# Y axis
		ax.set_ylim(ylims[col])
		ax.set_yticks(yticks[col])
		ax.tick_params(labelsize=labelsize, pad=labelpad)
		if col == 0:
			ax.set_ylabel("ChIP-seq", fontsize=fontsize, fontweight="bold")
			ax.get_yaxis().set_label_coords(y_align, 0.5)
		else:
			ax.set_ylabel("")
		
		# X axis
		xtick_list = [0,500,1000]
		xticklabel_list = ["-0.5", "0", "0.5"]
		ax.set_xlim([0, 1000])
		ax.set_xticks(xtick_list)
		ax.set_xticklabels(xticklabel_list)
		ax.set_xlabel(xlabel, fontsize=fontsize, fontweight="bold")
		ax.tick_params(labelsize=labelsize, pad=labelpad)

	fig.subplots_adjust(wspace=wspace)
	plt.savefig(outputfile, bbox_inches = 'tight', dpi=300) 

In [58]:
pwpool = ProcessWrapPool(len(ds))

# test = True
test = False

for d in ds:
	outputfile = PROJECT_DIR_o + "other_figures/" + "_".join([s, "CTCF_DI", d]) + ".png"
	f_label = PROJECT_DIR_o + "labels/" + "_".join([s, d, "DI"]) + ".txt"
	if d == "distal":
		ylims = [[-2,17], [-0.5,3.5]]
		yticks = [[0,5,10,15], [0,1,2,3]]
	else:
		ylims = [[-1,13], [-0.2,1.4]]
		yticks = [[0,4,8,12], [0,0.4,0.8,1.2]]
	pwpool.run(generate_metaplot, args=[s, d, ms, f_label, outputfile, ylims, yticks, test])

In [62]:
len(pwpool.finished_tasks)

2

In [63]:
pwpool.close()