In [1]:
import glob
from mphelper import ProcessWrapPool
import subprocess
import pandas as pd
import json
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import mannwhitneyu
import itertools
import biodataplot.metaplot as bpm
from biodata.bigwig import BigWigIReader
from biodata.bed import BEDXReader
from genomictools import GenomicCollection
from collections import defaultdict
from biodata.delimited import DelimitedReader, DelimitedWriter
import tempfile

In [2]:
import sys
from pathlib import Path
sys.path.append(str(Path.cwd().parent))
import utils

In [3]:
import matplotlib
font_dir = Path.cwd().parent / "font"
for font in ["Arial.ttf", "Arial_Bold.ttf"]:
    matplotlib.font_manager.fontManager.addfont(font_dir / font)
matplotlib.rcParams["font.family"] = "Arial"
bfontsize = 12
sfontsize = 9

In [4]:
PROJECT_DIR_d = "/fs/cbsuhy02/storage/yc2553/yc2553/projects/TRE_directionality/PROcap/"
PROJECT_DIR_d2 = "/fs/cbsuhy02/storage/yc2553/yc2553/databases/"
PROJECT_DIR_r = "/fs/cbsuhy02/storage/yc2553/yc2553/projects/TRE_directionality/resources/"
PROJECT_DIR_o = "/fs/cbsuhy02/storage/yc2553/yc2553/projects/TRE_directionality/output/"

# Assign sequence age

Syntenic blocks that overlapped at least 6 bp of an enhancer sequence were considered when assigning the enhancer’s age and architecture. We considered enhancers mapping to one syntenic block or several syntenic blocks of the same age as “simple” enhancer architectures, while enhancers overlapping adjacent syntenic blocks of different ages have “complex” enhancer architectures. Given complex enhancers are composed of multiple sequence ages, we assigned complex enhancer age according to the oldest age. Sequences without an assigned age were excluded from this analysis.

In [5]:
s = "C1"
ps = ["divergent", "unidirectional"]
ds = ["distal", "proximal"]
cs = ["", "_control"]
gs = [f"{p}_{d}{c}" for p, d, c in itertools.product(ps, ds, cs)]

In [6]:
def intersect(fa, fb, outputfile, extension=250):
	with tempfile.NamedTemporaryFile(delete=False, suffix='.bed') as temp_file:
		temp = temp_file.name
		with DelimitedWriter(temp) as dw, DelimitedReader(fa) as dr:
			for cols in dr:
				chrom, start, end = cols[:3]
				dw.write([chrom, int(start) - extension, int(end) + extension]+cols[3:])
	commands = ["bedtools intersect",
				 "-a", temp,
				 "-b", fb,
				 "-wao",
				 ">", outputfile
				]
	subprocess.run(" ".join(commands), shell=True)

In [7]:
pwpool = ProcessWrapPool(len(gs))

fb = " ".join(glob.glob(f"{PROJECT_DIR_d2}UCSC/multiz100way/hg38/chr*.bed"))
for g in gs:
	fa = f"{PROJECT_DIR_d}bed_plot/{s}_{g}.bed"
	outputfile = f"{PROJECT_DIR_o}temp/{s}_{g}_age.bed"
	pwpool.run(intersect, args=[fa, fb, outputfile])

In [9]:
len(pwpool.finished_tasks)

8

In [10]:
pwpool.close()

In [16]:
# Check the output

inputfile = f"{PROJECT_DIR_o}temp/{s}_{ps[0]}_{ds[0]}_age.bed"
df = pd.read_table(inputfile, header=None, low_memory=False)
df.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,chr1,1020787,1021788,1021175,1021375,1021237,1021340,-,1,chr1,1020772,1020792,Vertebrate,5
1,chr1,1020787,1021788,1021175,1021375,1021237,1021340,-,1,chr1,1020792,1020806,Vertebrate,14


In [15]:
# Check the output

inputfile = f"{PROJECT_DIR_o}temp/{s}_{ps[0]}_{ds[0]}_control_age.bed"
df = pd.read_table(inputfile, header=None, low_memory=False)
df.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,chr1,1078522,1079523,1,chr1,1078509,1078534,Vertebrate,12
1,chr1,1078522,1079523,1,chr1,1078534,1078566,Vertebrate,32


# Generate feature matrix

## Age

In [17]:
def age_dissection(inputfile, clades):	
	age_per_block = defaultdict(dict)
	reverse = {}

	with DelimitedReader(inputfile) as dr:
		for cols in dr:
			chrom, start, end = cols[:3]
			if "control" not in inputfile:
				sign = cols[7]
				if (chrom, start, end) not in reverse:
					reverse[(chrom, start, end)] = (sign == "-")
			start2, end2, clade, overlap = cols[-4:]
			# Only include regions with overlapping length >= 6bp
			if int(overlap) >= 6:
				block_start = max(int(start), int(start2))
				block_end = min(int(end), int(end2))
				age_per_block[(chrom, start, end)][(block_start, block_end)] = clades[clade]

	results = []
	index = []
	for r in age_per_block:
		chrom, start, end = r
		index.append("_".join([chrom, start, end]))
		bp_features = []
		for i in range(int(start), int(end)):
			feature = np.nan
			for block in age_per_block[r]:
				b_s, b_e = block
				if b_s <= i <= b_e:
					feature = age_per_block[r][block]
					break
			bp_features.append(feature)

		if reverse.get(r, False):
			bp_features.reverse()
		results.append(bp_features)

	df = pd.DataFrame(results, index=index)
	return df

In [18]:
inputfile = f"{PROJECT_DIR_r}other/age/clades.json"
with open(inputfile, "r") as f:
	clades = json.load(f)

In [19]:
m = "Age"
df_features = {}
for g in gs:
	inputfile = f"{PROJECT_DIR_o}temp/{s}_{g}_age.bed"
	df_features[(m,g)] = age_dissection(inputfile, clades)

## Evo

In [20]:
bws = {"phyloP": f"{PROJECT_DIR_r}other/phylop/hg38.phyloP100way.bw",
	   "CDTS": f"{PROJECT_DIR_r}other/CDTS/coord_CDTS_percentile_N7794unrelated.bw"
	   }

In [21]:
for g in gs:
	bed = f"{PROJECT_DIR_d}bed_plot/{s}_{g}.bed"
	if "control" in bed:
		all_regions = BEDXReader.read_all(GenomicCollection, bed)
		use_strand = False
	else:
		all_regions = BEDXReader.read_all(GenomicCollection, bed, strandfield="f7")
		use_strand = True

	for m in bws:
		index = [f"{r.chrom}_{r.chromStart}_{r.chromEnd}" for r in all_regions]
		df_features[(m,g)] = pd.DataFrame(bpm.generate_signal_profile(all_regions, BigWigIReader(bws[m]), fixed_size=1001, use_strand=use_strand), index=index)

## Combine dataframes for plotting

In [22]:
ms = ["Age", "phyloP", "CDTS"]

In [None]:
df_reformat = {}
for k in df_features:
	df = df_features[k].reset_index()
	df_reformat[k] = pd.melt(df,
						  id_vars="index",
						  value_vars=list(df.columns[1:]),
						  var_name="Position",
						  value_name="Feature"
						  )

In [24]:
df_metaplots = {}
for m, d in itertools.product(ms, ds):
	dfs = []
	for p in ps:
		df1 = df_reformat[(m, f"{p}_{d}")].copy()
		df1["Label"] = p.capitalize()
		df2 = df_reformat[(m, f"{p}_{d}_control")].copy()
		df2["Label"] = "Ctrl"
		dfs.extend([df1, df2])
	df_metaplots[(m,d)] = pd.concat(dfs).reset_index(drop=True)	

In [25]:
# Age
# Get the maximum score for each element
# Get the average values for two groups; U test 

m = "Age"
d = "distal"
values = {}
for p in ps:
	values[p] = df_features[(m, f"{p}_{d}")].max(axis=1).tolist()

In [26]:
np.mean(values[ps[0]]), np.mean(values[ps[1]])

(284.84933460076047, 260.19398369412426)

In [27]:
mannwhitneyu(values[ps[0]], values[ps[1]])

MannwhitneyuResult(statistic=16088218.0, pvalue=2.7167264422552394e-11)

In [28]:
# CDTS
# Get the minimum score for each element
# Get the average values for two groups; U test 

m = "CDTS"
d = "distal"
values2 = {}
for p in ps:
	values2[p] = df_features[(m, f"{p}_{d}")].max(axis=1).tolist()

In [29]:
np.mean(values2[ps[0]]), np.mean(values2[ps[1]])

(2.7830905568421724, 3.1799182012229923)

In [30]:
mannwhitneyu(values2[ps[0]], values2[ps[1]])

MannwhitneyuResult(statistic=14209920.5, pvalue=1.1343029373378863e-05)

# Metaplots

## ylims

In [31]:
labels = ["Ctrl", "Divergent", "Unidirectional"]
for m, d in itertools.product(ms, ds):
	maxs = []
	mins = []
	df = df_metaplots[(m,d)]
	for label in labels:
		maxs.append(df[df["Label"]==label].groupby("Position")["Feature"].mean().max())
		mins.append(df[df["Label"]==label].groupby("Position")["Feature"].mean().min())
	print(d, m, max(maxs), min(mins))

distal Age 207.2251955034213 140.37741907489368
proximal Age 361.9407433380084 142.61951466127402
distal phyloP 0.26302839721352844 0.003073939531987875
proximal phyloP 0.4253888037693158 0.017202454665443644
distal CDTS 0.08612798655072214 -3.1320544262092422
proximal CDTS 0.00895907469490209 -8.876712887207963


## Generate metaplots

In [32]:
def generate_metaplot(d, ms, ylims, yticks, outputfile, test, xlabel="Distance (kb)"):
	fig, axes = plt.subplots(len(ms), 1, figsize=(3, 7.5))
	labelpad = 2
	y_align = -0.15
	hspace = 0.2
	
	hue_order = ["Ctrl", "Divergent", "Unidirectional"]
	palette = ["#d9d9d9", "#313695", "#de77ae"]
	
	for m in ms:
		row = ms.index(m)
		ax = axes[row]
		utils.generate_feature_metaplot(df_metaplots[(m,d)], palette, hue_order, ax, test)
					
		ax.set_ylim(ylims[row])
		ax.set_yticks(yticks[row])
		ax.set_ylabel(m, fontsize=bfontsize, fontweight="bold")
		ax.get_yaxis().set_label_coords(y_align, 0.5)

		if row == 0:
			ax.legend(loc="lower left", fontsize=sfontsize, bbox_to_anchor=[0, 1.02])
		else:
			ax.legend([],[], frameon=False)

		xtick_list = [0,500,1000]
		xticklabel_list = ["-0.5", "0", "0.5"]
		ax.set_xlim([0, 1000])
		ax.set_xticks(xtick_list)
		if row == len(ms)-1:
			ax.set_xlabel(xlabel, fontsize=bfontsize, fontweight="bold")
			ax.set_xticklabels(xticklabel_list)
			
		else:
			ax.set_xticklabels([])
			ax.set_xlabel("")

		ax.tick_params(labelsize=sfontsize, pad=labelpad)

	fig.subplots_adjust(hspace=hspace)
	plt.savefig(outputfile, bbox_inches="tight", dpi=300) 

In [33]:
pwpool = ProcessWrapPool(len(ds))

# test = True
test = False
for d in ds:
	if d == "distal":
		ylims = [[110, 250], [-0.05, 0.35], [-5, 0.5]]
		yticks = [[120, 160, 200, 240], [0, 0.1, 0.2, 0.3], [-4.5, -3, -1.5, 0]]
		outputfile = f"{PROJECT_DIR_o}figures/Fig3a.pdf"
	else:
		ylims = [[100, 440], [-0.05, 0.65], [-17, 2]]
		yticks = [[120, 220, 320, 420], [0, 0.2, 0.4, 0.6], [-15, -10, -5, 0]]
	folder = "figures/" if d == "distal" else "other_figures/"
	outputfile = f"{PROJECT_DIR_o}{folder}Fig3a.pdf"
	pwpool.run(generate_metaplot, args=[d, ms, ylims, yticks, outputfile, test])

In [35]:
len(pwpool.finished_tasks)

2

In [36]:
pwpool.close()