In [1]:
### Goals
# 1. Show metaplots for elements whose signal is upregulated after CTCF degradation.

### Input
# 1. PROcap bw files in HCT116 before and after CTCF degradation

### Output
# 1. Metaplots of elements with upregulated and unchanged signals on the side with less reads

In [1]:
# Every project directory hangs off one common root.
_PROJECT_ROOT = "/home/yc2553/projects/TRE_directionality/"

PROJECT_DIR_s = _PROJECT_ROOT + "codes/"      # helper code (added to sys.path, holds utils.py)
PROJECT_DIR_r = _PROJECT_ROOT + "Resources/"  # reference tracks (e.g. the phyloP bigWig)
PROJECT_DIR_d = _PROJECT_ROOT + "PROcap/"     # input bed files (bed_plot/)
PROJECT_DIR_o = _PROJECT_ROOT + "output/"     # feature matrices, labels and figures

In [2]:
from mphelper import ProcessWrapPool
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter, defaultdict
import seaborn as sns
from biodata.delimited import DelimitedReader, DelimitedWriter
from Bio import SeqIO
import pybedtools

In [3]:
import sys
sys.path.append(PROJECT_DIR_s)

In [4]:
import importlib
import utils
importlib.reload(utils)

<module 'utils' from '/home/yc2553/projects/TRE_directionality/codes/utils.py'>

# 1. Generate feature matrix

In [5]:
# Sample name used as the prefix of every input/output file
s = "HCT116"
# Strand keys (presumably plus/minus; not used in the cells visible here)
ks = ["pl", "mn"]
# Element classes, split by directionality and by distance
ps = ["divergent", "unidirectional"]
ds = ["proximal", "distal"]
# The two degron result sets whose labels are combined later
ts = ["maximum", "minimum"]
# Every directionality x distance combination, e.g. "divergent_proximal"
gs = [f"{p}_{d}" for p in ps for d in ds]

## 1.1 Generate feature matrix for heatmap

In [7]:
# Signal tracks to summarise, keyed by the short name that appears in output file names
bws = dict(phyloP=PROJECT_DIR_r + "UCSC/phyloP/hg38.phyloP100way.bw")

In [8]:
# Submit one feature-matrix task per (element group, signal track) pair.
# Strand handling is switched off for now; it may be enabled later to flip
# the elements based on CTCF motif orientation.
strand_sensitive, strand = False, None

pwpool = ProcessWrapPool(len(bws) * len(gs))

# NOTE: `g` and `m` keep their last values after this loop, and later cells
# read them when building file names.
for g in gs:
	bed_in = f"{PROJECT_DIR_d}bed_plot/{s}_{g}_CTCF_centered.bed"
	for m, bw in bws.items():
		outputfile = f"{PROJECT_DIR_o}feature_matrix/{s}_{g}_{m}_CTCF_centered.txt"
		pwpool.run(
			utils.get_feature_matrix_for_heatmap,
			args=[bed_in, bw, outputfile, strand_sensitive, strand],
		)

In [9]:
# Expected: 4 element groups x 1 signal track = 4 finished tasks

len(pwpool.finished_tasks)

4

In [10]:
# Close the pool once the finished-task count above matches the number of submitted tasks
pwpool.close()

In [26]:
# Peek at one of the matrices written above. The group and the track are
# picked explicitly (the last of each) instead of relying on loop variables
# left behind by the submission cell.
g_check, m_check = gs[-1], list(bws)[-1]
inputfile = PROJECT_DIR_o + "feature_matrix/" + "_".join([s, g_check, m_check, "CTCF_centered"]) + ".txt"
df = pd.read_table(inputfile)
df.head()

Unnamed: 0,Element,0,1,2,3,4,5,6,7,8,...,991,992,993,994,995,996,997,998,999,1000
0,"('chr1', '3910971', '3911060')",-0.065,-0.818,-0.333,-1.545,-0.659,0.416,-0.222,-0.127,-0.298,...,-1.133,-0.661,-0.791,0.188,-2.413,-1.958,0.035,-1.354,0.205,-0.187
1,"('chr1', '9005221', '9005289')",-0.962,0.53,-0.965,0.53,0.53,0.53,0.53,0.53,0.53,...,-2.549,0.382,4.032,-2.438,1.668,6.1,0.677,6.553,4.078,1.401
2,"('chr1', '13829033', '13829145')",0.026,0.341,-0.637,-0.454,-1.357,-0.085,0.23,-1.256,1.982,...,0.14,-0.545,-0.786,-1.807,-0.135,0.485,0.248,-0.251,-1.263,-0.176
3,"('chr1', '23637489', '23637559')",-0.093,1.227,-1.814,-1.209,1.483,-0.643,-0.827,1.483,-0.393,...,0.076,0.076,0.077,0.076,0.077,0.085,0.089,0.077,0.077,0.076
4,"('chr1', '24202073', '24202115')",0.392,-0.569,-0.657,-1.591,0.469,0.396,0.469,0.469,0.392,...,-0.257,-1.795,-0.936,-0.294,0.609,0.07,-0.316,-1.247,0.126,0.454


## 1.2 Reformat feature matrix for metaplot

In [None]:
# Assign each element a single CTCF-degron label by combining its
# "maximum" and "minimum" results:
# Up: Up+Unchanged or Up+Up
# Down: Down+Unchanged or Down+Down
# Unchanged: Unchanged+Unchanged
# ambiguous: Up+Down

In [20]:
# For every element, collect the set of labels it received across the
# degron result files listed in `ts`.
labels = defaultdict(set)

for t in ts:
	inputfile = f"{PROJECT_DIR_o}labels/{s}_CTCF_degron_{t}.txt"
	df = pd.read_table(inputfile)
	for element, value in zip(df["Element"], df["Value"]):
		# "('chr1', '100', '200')" -> ('chr1', '100', '200'):
		# strip the parentheses, split the three fields, strip each field's quotes
		chrom, start, end = (field[1:-1] for field in element[1:-1].split(", "))
		labels[(chrom, start, end)].add(value)

In [31]:
# Collapse each element's label set into one label. Keys are the string form
# of the (chrom, start, end) tuple, which is how elements are written in the
# "Element" column of the feature matrices.
labels2 = {}
for e, observed in labels.items():
	e2 = str(e)
	if len(observed) == 1:
		# Both result sets agree
		labels2[e2] = next(iter(observed))
	elif "Unchanged" in observed:
		# One result is "Unchanged": the other one decides
		labels2[e2] = next(l for l in observed if l != "Unchanged")
	else:
		# Conflicting changed labels
		labels2[e2] = "ambiguous"

In [39]:
def reformat_feature_matrix_for_metaplot(inputfiles, outputfile, labels=None):
	"""
	Convert wide feature matrices into one long table for metaplots.

	Each input file is read as a tab-separated table whose first column is
	"Element" and whose remaining columns are positions. The tables are
	melted to one row per (element, position) pair, a label is attached to
	every row, and the concatenation of all inputs is written to
	`outputfile` as a tab-separated file.

	Output columns: Element, Position, Feature, Label

	Parameters
	----------
	inputfiles : iterable of paths to wide feature matrices
	outputfile : path of the long-format table to write
	labels : optional mapping from the "Element" string to its label;
		defaults to the notebook-level `labels2`

	Raises
	------
	KeyError : if an element has no entry in `labels`
	"""
	if labels is None:
		labels = labels2

	frames = []
	for inputfile in inputfiles:
		df = pd.read_table(inputfile)
		# change the format: wide (one column per position) -> long
		df_reformat = pd.melt(df,
						  id_vars="Element",
						  value_vars=list(df.columns[1:]),
						  var_name="Position",
						  value_name="Feature"
						  )
		# Plain column iteration; iterrows() builds a Series per row, which is
		# needlessly slow on a melted table with more than a million rows
		df_reformat["Label"] = [labels[element] for element in df_reformat["Element"]]
		frames.append(df_reformat)

	df = pd.concat(frames)
	df.to_csv(outputfile, sep="\t", index=False)

In [40]:
# Combine divergent and unidirectional elements into one long-format table
# per (distance class, signal track) pair.

pwpool = ProcessWrapPool(len(ds) * len(bws))

for d in ds:
	# Iterate over the tracks explicitly so each output file only contains the
	# track named in its file name (rather than using an `m` left over from an
	# earlier cell while pooling the inputs of every track).
	for m in bws:
		inputfiles = [PROJECT_DIR_o + "feature_matrix/" + "_".join([s, p, d, m, "CTCF_centered"]) + ".txt" for p in ps]
		outputfile = PROJECT_DIR_o + "feature_matrix/" + "_".join([s, d, m, "reformat", "CTCF_centered"]) + ".txt"
		pwpool.run(reformat_feature_matrix_for_metaplot, args=[inputfiles, outputfile])

In [43]:
# Expect one finished task per submitted reformat job (2 with the current settings)
len(pwpool.finished_tasks)

2

In [44]:
# Close the pool used for the reformat jobs
pwpool.close()

In [45]:
# Check the output: number of rows per label in one reformatted table.
# The distance class and track are picked explicitly (the last of each)
# instead of relying on loop variables left over from earlier cells.

d_check, m_check = ds[-1], list(bws)[-1]
inputfile = PROJECT_DIR_o + "feature_matrix/" + "_".join([s, d_check, m_check, "reformat", "CTCF_centered"]) + ".txt"
df = pd.read_table(inputfile)
Counter(df["Label"])

Counter({'Unchanged': 1162161,
         'Down': 307307,
         'Up': 208208,
         'ambiguous': 16016})

In [104]:
# Sanity check: each element contributes 1001 rows (positions 0-1000),
# so the 208208 "Up" rows counted above correspond to 208 elements
208208/1001

208.0

# 3. Generate metaplots

## 3.1 Settings

In [55]:
def get_ylims(f_feature, labels=None):
	"""
	Return the extremes of the per-position mean signal across label groups.

	The table at `f_feature` (tab-separated, with "Label", "Position" and
	"Feature" columns) is restricted to the requested labels, "Feature" is
	averaged per (label, position), and the largest and smallest of those
	means are returned. Useful for choosing y-axis limits that fit every panel.

	Parameters
	----------
	f_feature : path of the long-format feature table
	labels : optional list of labels to include; defaults to the
		notebook-level `groups`. Labels with no rows are ignored.

	Returns
	-------
	(maximum, minimum) -- note the order: the maximum comes first

	Raises
	------
	ValueError : if none of the requested labels is present in the table
	"""
	if labels is None:
		labels = groups

	df = pd.read_table(f_feature)
	df = df[df["Label"].isin(list(labels))]
	if df.empty:
		raise ValueError("none of the labels " + str(list(labels)) + " found in " + str(f_feature))

	# One grouped aggregation replaces a separate filter + groupby per label
	means = df.groupby(["Label", "Position"])["Feature"].mean()
	return means.max(), means.min()

In [42]:
# Labels drawn as separate panels, in this order; rows labelled "ambiguous" are not plotted
groups = ["Up", "Unchanged", "Down"]

In [84]:
# Report the range of the per-position means for every reformatted table, to
# guide the choice of a shared y-axis. Tracks are iterated explicitly rather
# than through an `m` left over from an earlier cell.
for d in ds:
	for m in bws:
		f_feature = PROJECT_DIR_o + "feature_matrix/" + "_".join([s, d, m, "reformat", "CTCF_centered"]) + ".txt"
		print(d, get_ylims(f_feature))

proximal (1.2618243278797112, -0.33733870910251545)
distal (1.4451057701248478, -0.4178365387807179)


In [85]:
# f_feature still holds the last path built by the loop in the previous cell
df = pd.read_table(f_feature)
df.head()

Unnamed: 0,Element,Position,Feature,Label
0,"('chr1', '1122245', '1122353')",0,-0.027,Unchanged
1,"('chr1', '1344856', '1345140')",0,-1.035,Unchanged
2,"('chr1', '1944140', '1944236')",0,-0.135,Unchanged
3,"('chr1', '2044891', '2045039')",0,0.305,Up
4,"('chr1', '2140105', '2140406')",0,-0.455,Unchanged


## 3.2 Generate metaplots

In [107]:
def generate_metaplot(f_feature, outputfile, ylims, yticks, test, xlabel="Distance to CTCF motif (bp)", fontsize=20, labelpad=7.5, labelsize=20, y_align= -0.2, wspace=0.25):
	"""
	Draw one line plot per label in the notebook-level `groups` and save the
	row of panels to `outputfile`.

	Each panel is a seaborn line plot of "Feature" against "Position" for the
	rows carrying that label (seaborn aggregates the repeated positions).
	Only positions 450-550 are shown, labelled -50 / 0 / 50.

	Parameters
	----------
	f_feature : path of a tab-separated table with "Position", "Feature"
		and "Label" columns
	outputfile : path of the image to write (saved at 300 dpi)
	ylims, yticks : y-axis limits and tick positions shared by all panels
	test : if true, keep only the first 10 rows of each label (quick dry run)
	xlabel : x-axis label
	fontsize : font size of the panel titles and of the y-axis label
	labelpad, labelsize : padding and size of the tick labels
	y_align, wspace : accepted for backward compatibility but currently
		unused; the spacing between panels is fixed at 0.5
	"""
	df = pd.read_table(f_feature)
	if test:
		# Subsample per plotted label; the labels come from `groups`
		# (the undefined name `hue_order` used here before raised NameError)
		df = pd.concat([df[df["Label"]==label].head(10) for label in groups])

	# One panel per label; squeeze=False keeps `axes` 2-D even for a single label
	fig, axes = plt.subplots(1, len(groups), figsize=(15,4), squeeze=False)

	for col, label in enumerate(groups):
		ax = axes[0][col]
		df2 = df[df["Label"]==label]
		sns.lineplot(data=df2, x="Position", y="Feature", color="#80b1d3", ax=ax)

		# Rows per position = number of elements with this label. Derived from
		# the data instead of assuming exactly 1001 positions per element.
		n_positions = df2["Position"].nunique()
		n_elements = len(df2) // n_positions if n_positions else 0
		ax.set_title(label + " (n=" + str(n_elements) + ")", fontsize=fontsize, fontweight="bold")
		ax.spines.right.set_visible(False)
		ax.spines.top.set_visible(False)

		# Y axis: only the left-most panel carries the axis label
		ax.set_ylim(ylims)
		ax.set_yticks(yticks)
		if col == 0:
			ax.set_ylabel("phyloP", fontsize=fontsize, fontweight="bold")
		else:
			ax.set_ylabel("")

		# X axis: 101-position window around position 500
		ax.set_xlim([450, 550])
		ax.set_xticks([450, 500, 550])
		ax.set_xticklabels(["-50", "0", "50"])
		ax.set_xlabel(xlabel, fontsize=16, fontweight="bold")
		ax.tick_params(labelsize=labelsize, pad=labelpad)

	fig.subplots_adjust(wspace=0.5)
	fig.savefig(outputfile, bbox_inches = 'tight', dpi=300)
	# Release the figure so repeated calls do not accumulate open figures
	plt.close(fig)

In [109]:
# Set to True for a quick dry run on a handful of rows per label
test = False

# Same y-axis on every figure
ylims = [-1.5, 2.5]
yticks = [-1, 0, 1, 2]

pwpool = ProcessWrapPool(len(ds) * len(bws))

for d in ds:
	# Iterate over the tracks explicitly rather than through an `m` left over
	# from an earlier cell.
	for m in bws:
		f_feature = PROJECT_DIR_o + "feature_matrix/" + "_".join([s, d, m, "reformat", "CTCF_centered"]) + ".txt"
		outputfile = PROJECT_DIR_o + "other_figures/" + "_".join([s, d, m, "CTCF_centered"]) + ".png"
		pwpool.run(generate_metaplot, args=[f_feature, outputfile, ylims, yticks, test])

In [111]:
# Expect one finished task per submitted figure (2 with the current settings)
len(pwpool.finished_tasks)

2

In [112]:
# Close the pool used for the metaplot jobs
pwpool.close()