In [32]:
import ast
import glob
import json
import subprocess
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pybedtools
from biodata.delimited import DelimitedReader, DelimitedWriter
from mphelper import ProcessWrapPool
from scipy.stats import mannwhitneyu

In [2]:
# Code directory; appended to sys.path so the local `utils` module can be imported.
PROJECT_DIR_s = "/home/yc2553/projects/TRE_directionality/codes/"
# Reference resources read below (clades.json, phyloP and CDTS bigWig tracks).
PROJECT_DIR_r = "/home/yc2553/projects/TRE_directionality/Resources/"
# Data directory; the per-group BED files are read from / written to its "bed_plot/" folder.
PROJECT_DIR_d = "/home/yc2553/projects/TRE_directionality/PROcap/"
# Database directory; the multiz100way per-chromosome BED files are globbed from here.
PROJECT_DIR_d2 = "/fs/cbsuhy02/storage/yc2553/yc2553/databases/"
# Output directory; holds the label tables ("labels/"), intermediates ("temp/") and figures ("other_figures/").
PROJECT_DIR_o = "/home/yc2553/projects/TRE_directionality/output/"

In [3]:
import sys
sys.path.append(PROJECT_DIR_s)

In [4]:
import importlib
import utils
importlib.reload(utils)

<module 'utils' from '/home/yc2553/projects/TRE_directionality/codes/utils.py'>

# 1. Assign sequence age

In [7]:
# Sample identifier; used as the prefix of every input/output file name below.
s = "HCT116"
# Element classes handled throughout the notebook; each one is a suffix of the BED / matrix file names.
ds = ["distal", "proximal"]

In [8]:
# Label table "<sample>_CTCF_binary.txt" from the output "labels/" folder
inputfile = f"{PROJECT_DIR_o}labels/{s}_CTCF_binary.txt"
df_binding = pd.read_table(inputfile)
df_binding.head(2)

Unnamed: 0,Element,Label
0,"('chr1', '998929', '999173')",No CTCF binding
1,"('chr1', '1006383', '1006639')",No CTCF binding


In [9]:
# Label table "<sample>_CTCF_degron.txt" from the output "labels/" folder
inputfile = f"{PROJECT_DIR_o}labels/{s}_CTCF_degron.txt"
df_degron = pd.read_table(inputfile)
df_degron.head(2)

Unnamed: 0,Element,Label,Side,Group
0,"('chr1', '998929', '999173')",Unchanged,maximum,divergent_distal
1,"('chr1', '1006383', '1006639')",Unchanged,maximum,divergent_distal


In [10]:
# Combine the CTCF-binding and CTCF-degron label tables into one table keyed by "Element".
# Both inputs carry a "Label" column, so pandas suffixes them: "Label_x" comes from
# df_binding and "Label_y" from df_degron. The next cell uses "Label_y" (Up vs. Unchanged)
# to split the unidirectional elements.

df_label = df_binding.merge(df_degron, on="Element")
df_label.head(2)

Unnamed: 0,Element,Label_x,Label_y,Side,Group
0,"('chr1', '998929', '999173')",No CTCF binding,Unchanged,maximum,divergent_distal
1,"('chr1', '998929', '999173')",No CTCF binding,Unchanged,minimum,divergent_distal


In [27]:
# Collect, per degron label, the coordinates of the elements whose `t` side carries that label.
t = "minimum"
ls = ["Up", "Unchanged"]
es_included = defaultdict(set)
for l in ls:
	es = list(df_label.loc[(df_label["Label_y"]==l) & (df_label["Side"]==t), "Element"])
	for e in es:
		# "Element" holds the text form of a tuple, e.g. "('chr1', '998929', '999173')".
		# literal_eval parses it whatever the quoting/spacing is; str() keeps the
		# coordinates as text so they compare equal to the BED columns read later.
		chrom, start, end = (str(field) for field in ast.literal_eval(e))
		es_included[l].add((chrom, start, end))

In [40]:
# For each element class, split the unidirectional BED file into one file per degron label
p = "unidirectional"
for d in ds:
	inputfile = f"{PROJECT_DIR_d}bed_plot/{s}_{p}_{d}.bed"
	for l in ls:
		outputfile = f"{PROJECT_DIR_d}bed_plot/{s}_{p}_{l}_{d}.bed"
		wanted = es_included[l]
		with DelimitedWriter(outputfile) as dw, DelimitedReader(inputfile) as dr:
			for cols in dr:
				# columns 0, 3 and 4 are matched against the (chrom, start, end) keys
				element = (cols[0], cols[3], cols[4])
				if element in wanted:
					dw.write(cols)
		# report how many records ended up in the new file
		b = pybedtools.BedTool(outputfile)
		print(d, l, len(b))

distal Up 43
distal Unchanged 125
proximal Up 16
proximal Unchanged 48


In [34]:
# Element groups compared below; the two "unidirectional_*" names match the per-label BED files written in the previous cell.
ps = ["divergent", "unidirectional_Up", "unidirectional_Unchanged"]

In [41]:
#1 TRE/control files: one BED file per (group, element class) pair, groups in `ps` order

files1 = []
for p in ps:
	for d in ds:
		files1.append(PROJECT_DIR_d + "bed_plot/" + "_".join([s, p, d]) + ".bed")
len(files1)

6

In [36]:
#2 Syntenic blocks: per-chromosome BED files of the multiz100way alignment

# glob returns paths in arbitrary order; sort them so the bedtools command line is reproducible
files2 = sorted(glob.glob(PROJECT_DIR_d2 + "UCSC/multiz100way/hg38/chr*.bed"))
len(files2)

24

In [37]:
def intersect(fa, fb, flank=250):
	"""
	Widen every interval of a BED file and intersect it with one or more BED files.

	The first three columns of each record in `fa` are read as chrom, start, end.
	The interval is extended by `flank` bp on both sides, the remaining columns are
	kept, and the result is written to PROJECT_DIR_o + "temp/" under the same file
	name as `fa`. `bedtools intersect -sorted -wao` is then run with that file as -a
	and the file(s) in `fb` as -b; its output goes to a ".txt" file next to the
	temporary BED file.

	Parameters
	----------
	fa : str
		Path of the input BED file.
	fb : str
		One path, or several separated by whitespace, passed to `-b`.
	flank : int
		Number of bp added to each side of every interval (default 250).

	Raises
	------
	subprocess.CalledProcessError
		If bedtools exits with a non-zero status.
	"""
	temp = PROJECT_DIR_o + "temp/" + fa.split("/")[-1]
	with DelimitedWriter(temp) as dw:
		with DelimitedReader(fa) as dr:
			for cols in dr:
				chrom, start, end = cols[:3]
				dw.write([chrom, int(start)-flank, int(end)+flank]+cols[3:])
	outputfile = temp.split(".bed")[0] + ".txt"
	# Argument list instead of a shell string: paths are never re-interpreted by a shell.
	# fb.split() reproduces the word splitting the shell applied to the space-joined paths.
	cmd = ["bedtools", "intersect", "-a", temp, "-b", *fb.split(), "-sorted", "-wao"]
	with open(outputfile, "w") as out:
		# check=True: a failing bedtools run raises instead of leaving a truncated file unnoticed
		subprocess.run(cmd, stdout=out, check=True)

In [70]:
# Pool sized by the number of input files (presumably the argument is the worker count; confirm in mphelper)
pwpool = ProcessWrapPool(len(files1))

# intersect() pastes `fb` after "-b", so all syntenic-block files go in as one space-separated string
fb = " ".join(files2)
for fa in files1:
	pwpool.run(intersect, args=[fa, fb])

In [72]:
len(pwpool.finished_tasks)

6

In [73]:
pwpool.close()

In [74]:
# Check the output

# Name the group and element class explicitly rather than reusing whatever `p` and `d`
# were left over from the last loop that happened to run in the kernel.
inputfile = PROJECT_DIR_o + "temp/" + "_".join([s, ps[0], ds[0]]) + ".txt"
df = pd.read_table(inputfile, header=None)
df.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,chr1,998532,999532,998929,999173,998930,999134,+,1,chr1,998518,998545,Vertebrate,13
1,chr1,998532,999532,998929,999173,998930,999134,+,1,chr1,998545,998562,Vertebrate,17


# 2. Generate feature matrix

## 2.1 Age

In [47]:
# Load the clade definitions (JSON); they are passed unchanged to utils.age_dissection below.
inputfile = PROJECT_DIR_r + "UCSC/seq_age/clades.json"
with open(inputfile, "r") as f:
	clades = json.load(f)

In [80]:
# One task per (group, element class) pair
pwpool = ProcessWrapPool(len(ps)*len(ds))

# Feature name; becomes part of every output file name of this section
m = "Age"
for p in ps:
	for d in ds:
		# input: the bedtools output written by intersect(); output: the Age table read back in section 2.3
		inputfile = PROJECT_DIR_o + "temp/" + "_".join([s, p, d]) + ".txt"
		outputfile = PROJECT_DIR_o + "temp/" + "_".join([s, m, p, d]) + ".txt"
		pwpool.run(utils.age_dissection, args=[inputfile, outputfile, clades])

In [84]:
len(pwpool.finished_tasks)

6

In [85]:
pwpool.close()

In [86]:
# Reformat matrix

pwpool = ProcessWrapPool(len(ps)*len(ds))

# Set the feature name here instead of inheriting `m` from another cell: the 2.2 cell
# reuses `m` as a loop variable, so running cells out of order would silently
# reformat a different feature.
m = "Age"
for p in ps:
	for d in ds:
		inputfile = PROJECT_DIR_o + "temp/" + "_".join([s, m, p, d]) + ".txt"
		outputfile = PROJECT_DIR_o + "temp/" + "_".join([s, m, p, d, "reformat"]) + ".txt"
		pwpool.run(utils.reformat_feature_matrix_for_metaplot, args=[inputfile, outputfile])

In [88]:
len(pwpool.finished_tasks)

6

In [89]:
pwpool.close()

## 2.2 Evo

In [90]:
# bigWig track per feature; the keys double as feature names in the output file names
bws = {"phyloP": PROJECT_DIR_r + "UCSC/phyloP/hg38.phyloP100way.bw",
	   "CDTS": PROJECT_DIR_r + "Other/CDTS/coord_CDTS_percentile_N7794unrelated.bw"
	   }

In [104]:
# Queue one utils.get_feature_matrix task per (group, element class, track);
# each task is given a matrix path and a "reformat" path
pwpool = ProcessWrapPool(20)

for p in ps:
	for d in ds:
		bed_in = f"{PROJECT_DIR_d}bed_plot/{s}_{p}_{d}.bed"
		for m, bw in bws.items():
			prefix = f"{PROJECT_DIR_o}temp/{s}_{m}_{p}_{d}"
			output1 = prefix + ".txt"
			output2 = prefix + "_reformat.txt"
			pwpool.run(utils.get_feature_matrix, args=[bed_in, bw, output1, output2, False, None, np.nan])

In [106]:
len(pwpool.finished_tasks)

12

In [107]:
pwpool.close()

## 2.3 Combine dataframes for plotting

In [96]:
# Features to combine; generate_metaplot draws one panel per entry, in this order
ms = ["Age", "phyloP", "CDTS"]

In [97]:
# Per feature: read the table of every (group, element class) pair, tag it with its
# group name and stack everything into one DataFrame
dfs_heatmap = {}
for m in ms:
	dfs = []
	for p in ps:
		for d in ds:
			inputfile = f"{PROJECT_DIR_o}temp/{s}_{m}_{p}_{d}.txt"
			df = pd.read_table(inputfile).assign(Group=f"{p}_{d}")
			dfs.append(df)
	dfs_heatmap[m] = pd.concat(dfs, ignore_index=True)

In [98]:
dfs_heatmap[m].head(2)

Unnamed: 0,Element,Position,Feature,Group
0,"('chr1', '998929', '999173')",0,,divergent_distal
1,"('chr1', '1006383', '1006639')",0,2.11782,divergent_distal


In [134]:
set(dfs_heatmap[m]["Group"])

{'divergent_distal',
 'divergent_proximal',
 'unidirectional_Unchanged_distal',
 'unidirectional_Unchanged_proximal',
 'unidirectional_Up_distal',
 'unidirectional_Up_proximal'}

In [99]:
# Age
# For every row of each distal group, take the row-wise maximum over the columns
# that remain after dropping "Element" and "Group"
# The per-group lists are averaged in the next cell

m = "Age"
d = "distal"
values = {}
for p in ps:
	in_group = dfs_heatmap[m]["Group"] == p+"_"+d
	df = dfs_heatmap[m][in_group].drop(columns=["Element", "Group"])
	values[p] = df.max(axis=1).tolist()

In [100]:
# Mean of the collected values for each group
for p, collected in values.items():
	print(p, np.mean(collected))

divergent 278.7722688054442
unidirectional_Up 277.90697674418607
unidirectional_Unchanged 277.744


In [101]:
# CDTS
# For every row of each distal group, take the row-wise minimum (NaNs skipped) over the
# columns that remain after dropping "Element" and "Group"; rows that are entirely NaN are removed first
# The per-group lists are averaged in the next cell

m = "CDTS"
d = "distal"
values2 = {}
for p in ps:
	in_group = dfs_heatmap[m]["Group"] == p+"_"+d
	df = dfs_heatmap[m][in_group].drop(columns=["Element", "Group"]).dropna(how='all')
	values2[p] = df.min(axis=1, skipna=True).tolist()

In [102]:
# Mean of the collected values for each group
for p, collected in values2.items():
	print(p, np.mean(collected))

divergent 50.373779041305276
unidirectional_Up 84.71460209898538
unidirectional_Unchanged 48.538641946130646


In [126]:
# Per feature: read the "reformat" table of every (group, element class) pair, tag it
# with the element class ("Group") and the capitalised group name ("Label"), then stack
dfs_metaplot = {}
for m in ms:
	dfs = []
	for p in ps:
		for d in ds:
			inputfile = f"{PROJECT_DIR_o}temp/{s}_{m}_{p}_{d}_reformat.txt"
			df = pd.read_table(inputfile).assign(Group=d, Label=p.capitalize())
			dfs.append(df)
	dfs_metaplot[m] = pd.concat(dfs, ignore_index=True)

In [127]:
dfs_metaplot[m].head(2)

Unnamed: 0,Element,Position,Feature,Group,Label
0,"('chr1', '998929', '999173')",0,,distal,Divergent
1,"('chr1', '1006383', '1006639')",0,2.11782,distal,Divergent


# 3. Metaplots

## 3.1 ylims

In [132]:
# Decide on the ylims
# Set same value ranges for metaplots and heatmaps

def get_ylims(dfs_metaplot):
	"""
	Print the range of the mean profiles for every element class and feature.

	For each entry of the notebook-level lists `ds` and `ms`, the rows of
	dfs_metaplot[m] whose "Group" equals d are split by "Label"; "Feature" is
	averaged per "Position" for each label, and the largest and smallest of those
	means over all labels are printed as "d m max min". Nothing is returned.
	"""
	labels = ["Divergent", "Unidirectional_up", "Unidirectional_unchanged"]
	for d in ds:
		for m in ms:
			table = dfs_metaplot[m]
			in_class = table[table["Group"]==d]
			# one mean-by-position profile per label, computed once and reused for both extremes
			profiles = [in_class[in_class["Label"]==label].groupby("Position")["Feature"].mean() for label in labels]
			upper = max([profile.max() for profile in profiles])
			lower = min([profile.min() for profile in profiles])
			print(d, m, upper, lower)

In [133]:
get_ylims(dfs_metaplot)

distal Age 224.27619047619046 129.11111111111111
distal phyloP 0.9841395325438921 -0.581116265942191
distal CDTS -1.8544393801087633 -6.84142786519868
proximal Age 369.30434782608694 138.28571428571428
proximal phyloP 1.4006874836049974 -0.8381874999031425
proximal CDTS -3.0864387219254343 -13.68115492661794


## 3.2 Generate metaplots

In [135]:
def generate_metaplot(s, d, ms, ylims, yticks, test, outputfile, xlabel="Distance to center (kb)", fontsize=20, legend_fontsize=12, labelpad=7.5, labelsize=20, y_align= -0.2, hspace=0.2):
	"""
	Draw one metaplot panel per feature, stacked vertically, and save the figure.

	Data come from the notebook-level `dfs_metaplot` dict (feature name -> table with
	"Group" and "Label" columns); the curves themselves are drawn by
	utils.generate_feature_metaplot.

	Parameters
	----------
	s : sample name; not used inside this function (kept so existing calls still work)
	d : element class; only rows whose "Group" equals it are plotted
	ms : list of feature names, one panel each, top to bottom
	ylims, yticks : per-panel y-axis limits / tick positions, indexed like `ms`
	test : forwarded unchanged to utils.generate_feature_metaplot
	outputfile : path the figure is saved to
	xlabel : x-axis label of the bottom panel
	fontsize : font size of the axis labels
	legend_fontsize : font size of the legend (top panel only)
	labelpad, labelsize : padding and font size of the tick labels
	y_align : x position of the y-axis labels in axes coordinates
	hspace : vertical spacing between panels
	"""

	# squeeze=False keeps `axes` two-dimensional, so a single feature works as well
	fig, axes = plt.subplots(len(ms), 1, figsize=(6, 5*len(ms)), squeeze=False)
	hue_order = ["Divergent", "Unidirectional_up", "Unidirectional_unchanged"]
	palette = ["#313695", "#de77ae", "#969696"]

	# enumerate gives the right row even if a feature name were listed twice
	for row, m in enumerate(ms):
		ax = axes[row, 0]
		# exact match on the element class, the same filter get_ylims applies
		df = dfs_metaplot[m][dfs_metaplot[m]["Group"]==d]
		utils.generate_feature_metaplot(df, palette, hue_order, ax, test)

		# Y axis
		ax.set_ylim(ylims[row])
		ax.set_yticks(yticks[row])
		ax.set_ylabel(m, fontsize=fontsize, fontweight="bold")
		ax.tick_params(labelsize=labelsize, pad=labelpad)
		ax.get_yaxis().set_label_coords(y_align, 0.5)

		# Legend: shown once, above the top panel, sized by `legend_fontsize`
		if row == 0:
			ax.legend(loc="lower left", fontsize=legend_fontsize, bbox_to_anchor=[0, 1.02])
		else:
			ax.legend([],[], frameon=False)

		# X axis: ticks at 0/500/1000 are labelled -0.5/0/0.5 on the bottom panel only
		xtick_list = [0,500,1000]
		xticklabel_list = ["-0.5", "0", "0.5"]
		ax.set_xlim([0, 1000])
		ax.set_xticks(xtick_list)
		if row == len(ms)-1:
			ax.set_xticklabels(xticklabel_list)
			ax.set_xlabel(xlabel, fontsize=fontsize, fontweight="bold")
			ax.tick_params(labelsize=labelsize, pad=labelpad)
		else:
			ax.set_xticklabels([])
			ax.set_xlabel("")

	fig.subplots_adjust(hspace=hspace)
	fig.savefig(outputfile, bbox_inches = 'tight', dpi=300)

In [None]:
# Ranges printed by get_ylims(dfs_metaplot) (element class, feature, max, min),
# kept as comments so this cell no longer raises a SyntaxError when the notebook is run top to bottom:
# distal Age 224.27619047619046 129.11111111111111
# distal phyloP 0.9841395325438921 -0.581116265942191
# distal CDTS -1.8544393801087633 -6.84142786519868
# proximal Age 369.30434782608694 138.28571428571428
# proximal phyloP 1.4006874836049974 -0.8381874999031425
# proximal CDTS -3.0864387219254343 -13.68115492661794

In [136]:
pwpool = ProcessWrapPool(len(ds))

# Flag forwarded to generate_metaplot as its `test` argument
test = False
# Hand-picked (ylims, yticks) pairs; each inner list has one entry per feature in `ms`
proximal_axes = (
	[[100, 440], [-0.15, 1.05], [-17, 2]],
	[[120, 220, 320, 420], [0, 0.3, 0.6, 0.9], [-15, -10, -5, 0]],
)
other_axes = (
	[[110, 250], [-0.05, 0.35], [-10, 1]],
	[[120, 160, 200, 240], [0, 0.1, 0.2, 0.3], [-9, -6, -3, 0]],
)
for d in ds:
	outputfile = f"{PROJECT_DIR_o}other_figures/Fig4A_CTCF_{d}.png"
	ylims, yticks = proximal_axes if d == "proximal" else other_axes
	pwpool.run(generate_metaplot, args=[s, d, ms, ylims, yticks, test, outputfile])

In [139]:
len(pwpool.finished_tasks)

2

In [140]:
pwpool.close()