In [1]:
import glob
from mphelper import ProcessWrapPool
import subprocess
import pandas as pd
from biodata.delimited import DelimitedReader, DelimitedWriter
import json
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import mannwhitneyu

In [2]:
# Base directories used throughout this notebook (absolute, machine-specific paths).
# Code directory; appended to sys.path so that `utils` can be imported.
PROJECT_DIR_s = "/home/yc2553/projects/TRE_directionality/codes/"
# Resources: clade definitions (UCSC/seq_age) and the phyloP / CDTS bigWig tracks.
PROJECT_DIR_r = "/home/yc2553/projects/TRE_directionality/Resources/"
# PRO-cap data; the TRE/control BED files are read from its bed_plot/ subdirectory.
PROJECT_DIR_d = "/home/yc2553/projects/TRE_directionality/PROcap/"
# Shared databases; the per-chromosome multiz100way BED files are read from here.
PROJECT_DIR_d2 = "/fs/cbsuhy02/storage/yc2553/yc2553/databases/"
# Output root; intermediate files go to temp/ and figures to other_figures/.
PROJECT_DIR_o = "/home/yc2553/projects/TRE_directionality/output/"

In [3]:
import sys
# Make the project's code directory importable (the `utils` module is loaded from it).
sys.path.append(PROJECT_DIR_s)

In [4]:
import importlib
import utils
# Reload so that edits to utils.py are picked up without restarting the kernel.
importlib.reload(utils)

<module 'utils' from '/home/yc2553/projects/TRE_directionality/codes/utils.py'>

# 1. Assign sequence age

In [5]:
# Sample name used as the prefix of every input/output file.
s = "HCT116"
# Element classes and genomic contexts; their product gives the four groups.
ps = ["divergent", "unidirectional"]
ds = ["distal", "proximal"]
gs = []
for p in ps:
	for d in ds:
		gs.append("_".join([p, d]))
# File-name suffixes: "" for the elements themselves, "_control" for the controls.
cs = ["", "_control"]

In [6]:
# 1. TRE and control BED files: one file per group and per suffix in `cs`

bed_dir = PROJECT_DIR_d + "bed_plot/"
files1 = []
for g in gs:
	for c in cs:
		files1.append(bed_dir + s + "_" + g + c + ".bed")
len(files1)

8

In [7]:
# 2. Syntenic block BED files (every chr*.bed under the multiz100way hg38 directory)

pattern = PROJECT_DIR_d2 + "UCSC/multiz100way/hg38/chr*.bed"
files2 = glob.glob(pattern)
len(files2)

24

In [8]:
def intersect(fa, fb):
	"""
	Pad every interval of a BED file by 250 bp on each side and intersect the
	padded intervals with one or more BED files via
	``bedtools intersect -sorted -wao``.

	Parameters
	----------
	fa : str
		Path to the input BED file. The first three columns are read as
		chrom, start, end; any further columns are carried over unchanged.
	fb : str or list of str
		File(s) given to ``-b``. A string may hold several whitespace-separated
		paths (this is how the notebook passes them); a list/tuple of paths is
		accepted as well. Paths are passed to bedtools verbatim, i.e. without
		shell expansion.

	Side effects
	------------
	Writes the padded BED file to ``PROJECT_DIR_o + "temp/" + <basename of fa>``
	and the bedtools output next to it, with the part of the file name from
	".bed" onwards replaced by ".txt".

	Raises
	------
	subprocess.CalledProcessError
		If bedtools exits with a non-zero status (previously ignored silently).
	"""
	import os

	temp_dir = PROJECT_DIR_o + "temp/"
	# Workers may start before anybody created the directory.
	os.makedirs(temp_dir, exist_ok=True)

	name = fa.split("/")[-1]
	temp = temp_dir + name
	with DelimitedWriter(temp) as dw:
		with DelimitedReader(fa) as dr:
			for cols in dr:
				chrom, start, end = cols[:3]
				# NOTE(review): a start < 250 yields a negative coordinate, which
				# bedtools is expected to reject; check=True below surfaces that.
				dw.write([chrom, int(start)-250, int(end)+250]+cols[3:])

	# Derive the output name from the file name only, so a ".bed" occurring in
	# a directory name cannot truncate the path.
	outputfile = temp_dir + name.split(".bed")[0] + ".txt"

	b_files = fb.split() if isinstance(fb, str) else list(fb)
	# Argument list instead of a shell string: no quoting issues, no injection.
	cmd = ["bedtools", "intersect", "-a", temp, "-b"] + b_files + ["-sorted", "-wao"]
	with open(outputfile, "w") as out:
		subprocess.run(cmd, stdout=out, check=True)

In [9]:
# Pool sized to the number of input BED files (presumably one worker per file — ProcessWrapPool is external).
pwpool = ProcessWrapPool(len(files1))

# All syntenic-block files are handed to intersect() as one space-separated string.
fb = " ".join(files2)
for fa in files1:
	pwpool.run(intersect, args=[fa, fb])

In [12]:
# Manual progress check: one task was submitted per file in files1, so this should equal len(files1) before continuing.
len(pwpool.finished_tasks)

8

In [13]:
# NOTE(review): confirm that close() waits for pending tasks; otherwise "Run All" may reach the next cell before the outputs exist.
pwpool.close()

In [14]:
# Check the output

# The file has no header. Each row is the padded record from the -a file followed by
# the overlapping record from the -b files and, last, the overlap length in bp that
# `bedtools intersect -wao` appends.
inputfile = PROJECT_DIR_o + "temp/HCT116_divergent_distal.txt"
df = pd.read_table(inputfile, header=None)
df.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,chr1,998532,999532,998929,999173,998930,999134,+,1,chr1,998518,998545,Vertebrate,13
1,chr1,998532,999532,998929,999173,998930,999134,+,1,chr1,998545,998562,Vertebrate,17


# 2. Generate feature matrix

## 2.1 Age

In [15]:
# Clade definitions, passed on to utils.age_dissection below.
inputfile = PROJECT_DIR_r + "UCSC/seq_age/clades.json"
with open(inputfile) as f:
	clades = json.load(f)

In [16]:
# One task per group and per TRE/control file: turn the bedtools output into an "Age" matrix.
m = "Age"
pwpool = ProcessWrapPool(len(gs)*len(cs))

temp_dir = PROJECT_DIR_o + "temp/"
for g in gs:
	for c in cs:
		inputfile = temp_dir + s + "_" + g + c + ".txt"
		outputfile = temp_dir + s + "_" + m + "_" + g + c + ".txt"
		pwpool.run(utils.age_dissection, args=[inputfile, outputfile, clades])

In [18]:
# Expected when done: 4 groups * 2 (TRE, control) = 8 tasks

len(pwpool.finished_tasks)

8

In [19]:
# NOTE(review): confirm that close() waits for pending tasks before the matrices are read below.
pwpool.close()

In [20]:
# Reformat the Age matrices into the long format used for metaplots

# Set the feature explicitly instead of relying on `m` from an earlier cell:
# the Evo section loops `for m in bws`, so on a re-run `m` would no longer be "Age".
m = "Age"

pwpool = ProcessWrapPool(len(gs)*len(cs))

for g in gs:
	for c in cs:
		inputfile = PROJECT_DIR_o + "temp/" + "_".join([s, m, g+c]) + ".txt"
		outputfile = PROJECT_DIR_o + "temp/" + "_".join([s, m, g+c, "reformat"]) + ".txt"
		pwpool.run(utils.reformat_feature_matrix_for_metaplot, args=[inputfile, outputfile])

In [22]:
# Expected when done: 4 groups * 2 (TRE, control) = 8 tasks

len(pwpool.finished_tasks)

8

In [23]:
# NOTE(review): confirm that close() waits for pending tasks before the reformatted files are read.
pwpool.close()

## 2.2 Evo

In [24]:
# bigWig track per feature; the keys double as feature names in the output file names.
bws = {"phyloP": PROJECT_DIR_r + "UCSC/phyloP/hg38.phyloP100way.bw",
	   "CDTS": PROJECT_DIR_r + "Other/CDTS/coord_CDTS_percentile_N7794unrelated.bw"
	   }

In [25]:
# One task per (group, TRE/control, bigWig track): writes the raw matrix and its "reformat" version.
pwpool = ProcessWrapPool(20)

for g in gs:
	for c in cs:
		bed_in = PROJECT_DIR_d + "bed_plot/" + s + "_" + g + c + ".bed"
		for m, bw in bws.items():
			prefix = PROJECT_DIR_o + "temp/" + "_".join([s, m, g+c])
			output1 = prefix + ".txt"
			output2 = prefix + "_reformat.txt"
			# The trailing positional arguments (False, None, np.nan) are options of
			# utils.get_feature_matrix; see that function for their meaning.
			pwpool.run(utils.get_feature_matrix, args=[bed_in, bw, output1, output2, False, None, np.nan])

In [27]:
# Expected when done: 4 groups * 2 (TRE, control) * 2 tracks = 16 tasks

len(pwpool.finished_tasks)

16

In [28]:
# NOTE(review): confirm that close() waits for pending tasks before the matrices are combined below.
pwpool.close()

## 2.3 Combine dataframes for plotting

In [29]:
# Features to combine and plot. The order matters: the metaplot rows and the per-row ylims/yticks follow it.
ms = ["Age", "phyloP", "CDTS"]

In [30]:
# Per feature: stack the (non-control) matrices of all groups, tagging each row with its group.
dfs_heatmap = {}
for m in ms:
	dfs = []
	for g in gs:
		inputfile = PROJECT_DIR_o + "temp/" + s + "_" + m + "_" + g + ".txt"
		df = pd.read_table(inputfile).assign(Group=g)
		dfs.append(df)
	dfs_heatmap[m] = pd.concat(dfs, ignore_index=True)

In [31]:
# Age
# Collect, for the distal elements of each class, the maximum score of every element.
# (The group averages / U test are not computed in this cell; `values` only holds the inputs.)

m = "Age"
d = "distal"
values = {}
age_matrix = dfs_heatmap[m]
for p in ps:
	in_group = age_matrix["Group"] == p + "_" + d
	df = age_matrix[in_group].drop(columns=["Element", "Group"])
	values[p] = df.max(axis=1).tolist()

In [32]:
# CDTS
# Collect, for the distal elements of each class, the minimum score of every element.
# Elements without any score (all-NaN rows) are dropped first.
# (The group averages / U test are not computed in this cell; `values2` only holds the inputs.)

m = "CDTS"
d = "distal"
values2 = {}
cdts_matrix = dfs_heatmap[m]
for p in ps:
	in_group = cdts_matrix["Group"] == p + "_" + d
	df = cdts_matrix[in_group].drop(columns=["Element", "Group"]).dropna(how='all')
	values2[p] = df.min(axis=1, skipna=True).tolist()

In [33]:
# Per feature: stack the reformatted matrices of all groups and their controls.
# "Group" is the group name; "Label" is "Control" for control files, otherwise the
# capitalized element class (the part of the group name before the first "_").
dfs_metaplot = {}
for m in ms:
	dfs = []
	for g in gs:
		tre_label = g.split("_")[0].capitalize()
		for c in cs:
			inputfile = PROJECT_DIR_o + "temp/" + "_".join([s, m, g+c]) + "_reformat.txt"
			df = pd.read_table(inputfile)
			df["Group"] = g
			df["Label"] = "Control" if c else tre_label
			dfs.append(df)
	dfs_metaplot[m] = pd.concat(dfs, ignore_index=True)

# 3. Metaplots

## 3.1 ylims

In [34]:
# Helper for choosing y-axis limits by eye:
# prints the range the metaplot curves will cover, per region and feature.

def get_ylims(dfs_metaplot):
	"""
	Print, for every region in the global `ds` and every feature in the global `ms`,
	the largest and smallest per-position mean of "Feature" across the three labels
	(Control, Divergent, Unidirectional).

	dfs_metaplot maps a feature name to a DataFrame with the columns
	"Group", "Label", "Position" and "Feature". Rows are selected for a region when
	their "Group" contains the region name. Nothing is returned.
	"""
	labels = ["Control", "Divergent", "Unidirectional"]
	for d in ds:
		for m in ms:
			feature_df = dfs_metaplot[m]
			df = feature_df[feature_df["Group"].str.contains(d)]
			maxs, mins = [], []
			for label in labels:
				# Mean curve of this label, i.e. what the metaplot draws.
				profile = df[df["Label"]==label].groupby("Position")["Feature"].mean()
				maxs.append(profile.max())
				mins.append(profile.min())
			print(d, m, max(maxs), min(mins))

In [35]:
# Printed columns: region, feature, max and min of the per-position means.
get_ylims(dfs_metaplot)

distal Age 206.96864111498257 137.5219627169488
distal phyloP 0.32816757999995033 -0.05000684866554116
distal CDTS 0.020497408504523997 -3.38795051553085
proximal Age 365.3274090530408 140.58414666314957
proximal phyloP 0.6911693570655971 -0.07571428560939268
proximal CDTS 0.06640121192594338 -12.066685077229643


## 3.2 Generate metaplots

In [36]:
def generate_metaplot(s, d, ms, ylims, yticks, test, outputfile, xlabel="Distance to center (kb)", fontsize=20, legend_fontsize=12, labelpad=7.5, labelsize=20, y_align= -0.2, hspace=0.2):
	"""
	Draw one metaplot per feature, stacked vertically, and save the figure.

	Data are read from the module-level `dfs_metaplot` dict; for each feature the
	rows whose "Group" contains `d` are plotted with
	utils.generate_feature_metaplot.

	Parameters
	----------
	s : str
		Sample name. Not used inside the function; kept so existing calls keep working.
	d : str
		Region to plot (e.g. "distal" or "proximal"); matched as a substring of "Group".
	ms : list of str
		Features to plot, one row each (keys of `dfs_metaplot`).
	ylims, yticks : list
		y-axis limits and tick positions, one entry per feature, in the order of `ms`.
	test
		Passed through unchanged to utils.generate_feature_metaplot.
	outputfile : str
		Path of the image to write (300 dpi, tight bounding box).
	xlabel, fontsize, labelpad, labelsize, y_align, hspace
		Layout options for axis labels, tick labels and row spacing.
	legend_fontsize : int
		Font size of the legend shown above the first row. Earlier versions
		accepted this argument but drew the legend with `fontsize` instead.
	"""

	# squeeze=False keeps `axes` two-dimensional, so a single feature works too.
	fig, axes = plt.subplots(len(ms), 1, figsize=(6, 5*len(ms)), squeeze=False)
	hue_order = ["Control", "Divergent", "Unidirectional"]
	palette = ["#969696", "#313695", "#de77ae"]

	# x positions 0..1000 are relabelled as distance to the center in kb.
	xtick_list = [0,500,1000]
	xticklabel_list = ["-0.5", "0", "0.5"]
	last_row = len(ms)-1

	try:
		# enumerate (not ms.index) so repeated feature names still get their own row.
		for row, m in enumerate(ms):
			ax = axes[row, 0]
			df = dfs_metaplot[m][dfs_metaplot[m]["Group"].str.contains(d)]
			utils.generate_feature_metaplot(df, palette, hue_order, ax, test)

			# Y axis
			ax.set_ylim(ylims[row])
			ax.set_yticks(yticks[row])
			ax.set_ylabel(m, fontsize=fontsize, fontweight="bold")
			ax.tick_params(labelsize=labelsize, pad=labelpad)
			ax.get_yaxis().set_label_coords(y_align, 0.5)

			# Legend: only above the first row
			if row == 0:
				ax.legend(loc="lower left", fontsize=legend_fontsize, bbox_to_anchor=[0, 1.02])
			else:
				ax.legend([],[], frameon=False)

			# X axis: tick labels and axis label only on the bottom row
			ax.set_xlim([0, 1000])
			ax.set_xticks(xtick_list)
			if row == last_row:
				ax.set_xticklabels(xticklabel_list)
				ax.set_xlabel(xlabel, fontsize=fontsize, fontweight="bold")
			else:
				ax.set_xticklabels([])
				ax.set_xlabel("")

		fig.subplots_adjust(hspace=hspace)
		fig.savefig(outputfile, bbox_inches = 'tight', dpi=300)
	finally:
		# Release the figure; otherwise every call keeps one alive in pyplot's registry.
		plt.close(fig)

In [None]:
# Reference: output of get_ylims(dfs_metaplot), used to pick the ylims/yticks below
# (region, feature, max, min of the per-position means)
# proximal Age 365.3274090530408 140.58414666314957
# proximal phyloP 0.6911693570655971 -0.07571428560939268
# proximal CDTS 0.06640121192594338 -12.066685077229643
#
# distal Age 206.96864111498257 137.5219627169488
# distal phyloP 0.32816757999995033 -0.05000684866554116
# distal CDTS 0.020497408504523997 -3.38795051553085


In [35]:
# Draw the metaplots for both regions in parallel.
pwpool = ProcessWrapPool(len(ds))

# Passed through to utils.generate_feature_metaplot.
test = False
for d in ds:
	# The region is part of the file name: with one shared name the two parallel
	# tasks wrote to the same file and one panel overwrote the other.
	# NOTE(review): anything that expected "Fig4A_HCT116.png" must use the new names.
	outputfile = PROJECT_DIR_o + "other_figures/Fig4A_" + s + "_" + d + ".png"
	# One [low, high] pair / tick list per feature, in the order of `ms`
	# (Age, phyloP, CDTS); chosen from the ranges printed by get_ylims.
	if d == "proximal":
		ylims = [[100, 440], [-0.1, 1], [-17, 2]]
		yticks = [[120, 220, 320, 420], [0, 0.3, 0.6, 0.9], [-15, -10, -5, 0]]
	else:
		ylims = [[110, 250], [-0.05, 0.35], [-5, 0.5]]
		yticks = [[120, 160, 200, 240], [0, 0.1, 0.2, 0.3], [-4.5, -3, -1.5, 0]]
	pwpool.run(generate_metaplot, args=[s, d, ms, ylims, yticks, test, outputfile])

In [38]:
# Expected when done: one task per region in `ds` (2).
len(pwpool.finished_tasks)

2

In [39]:
# NOTE(review): confirm that close() waits for pending tasks, so the figures are fully written.
pwpool.close()