In [1]:
import glob
from mphelper import ProcessWrapPool
import subprocess
import pandas as pd
from biodatatools.utils.common import json_load
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import mannwhitneyu
import itertools
import biodataplot.metaplot as bpm
from biodata.bigwig import BigWigIReader
from biodata.bed import BEDXReader
from genomictools import GenomicCollection
from collections import defaultdict
from biodata.delimited import DelimitedReader, DelimitedWriter
import tempfile
import sys
from pathlib import Path

In [2]:
matplotlib.font_manager.fontManager.addfont(f"{Path.cwd().parent}/font/Arial.ttf")
matplotlib.rcParams["font.family"] = "Arial"
bfontsize = 12
sfontsize = 9

In [3]:
sys.path.append(f"{Path.cwd().parent}")
import utils

In [4]:
PROJECT_DIR_d = "/fs/cbsuhy02/storage/yc2553/yc2553/projects/TRE_directionality/PROcap/"
PROJECT_DIR_d2 = "/fs/cbsuhy02/storage/yc2553/yc2553/databases/"
PROJECT_DIR_r = "/fs/cbsuhy02/storage/yc2553/yc2553/projects/TRE_directionality/resources/"
PROJECT_DIR_o = "/fs/cbsuhy02/storage/yc2553/yc2553/projects/TRE_directionality/output/"

# Assign sequence age

In [None]:
# To confirm if the pattern is consistent, here we tried two centering approaches:
# 1. Anchor on the element center
# 2. Anchor on the prominent TSSs

In [5]:
s = "C1"
ps = ["divergent", "unidirectional"]
ds = ["distal", "proximal"]
cs = ["", "_control"]
TSSs = ["divergent_maxTSS", "divergent_minTSS", "unidirectional_TSS"]
# Anchored on element center
gs = [f"{p}_{d}{c}" for p, d, c in itertools.product(ps, ds, cs)]
# Anchored on prominent TSSs
gs2 = [f"{tss}_{d}" for d in ds for tss in TSSs]

In [32]:
def intersect(fa, fb, outputfile):
	with tempfile.NamedTemporaryFile(delete=False, suffix='.bed') as tmp:
		with DelimitedWriter(tmp.name) as dw, DelimitedReader(fa) as dr:
			for cols in dr:
				chrom, start, end = cols[:3]
				dw.write([chrom, int(start) - 250, int(end) + 250]+cols[3:])
		fa = tmp.name
	cmd = f"bedtools intersect -a {fa} -b {fb} -wao > {outputfile}"
	subprocess.run(cmd, shell=True)

## Element center

In [7]:
fb = " ".join(glob.glob(f"{PROJECT_DIR_d2}UCSC/multiz100way/hg38/chr*.bed"))

In [14]:
pwpool = ProcessWrapPool(len(gs))

for g in gs:
	fa = f"{PROJECT_DIR_d}bed_plot/{s}_{g}.bed"
	outputfile = f"{PROJECT_DIR_o}temp/{s}_{g}_age.bed"
	pwpool.run(intersect, args=[fa, fb, outputfile])

In [21]:
len(pwpool.finished_tasks)

8

In [17]:
pwpool.close()

In [23]:
# Check the output

inputfile = f"{PROJECT_DIR_o}temp/{s}_{ps[0]}_{ds[0]}_age.bed"
pd.read_table(inputfile, header=None, low_memory=False).head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,chr1,1020787,1021788,1021175,1021375,1021237,1021340,-,1,chr1,1020772,1020792,Vertebrate,5
1,chr1,1020787,1021788,1021175,1021375,1021237,1021340,-,1,chr1,1020792,1020806,Vertebrate,14


In [24]:
# Check the output

inputfile = f"{PROJECT_DIR_o}temp/{s}_{ps[0]}_{ds[0]}_control_age.bed"
pd.read_table(inputfile, header=None, low_memory=False).head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,chr1,1078522,1079523,1,chr1,1078509,1078534,Vertebrate,12
1,chr1,1078522,1079523,1,chr1,1078534,1078566,Vertebrate,32


## Prominent TSSs

In [17]:
# Divergent: anchor on maximum and minimum TSSs, separately
# Unidirectional: anchor on one prominent TSSs
# No random control regions here

extension = 500
for p, d in itertools.product(ps, ds):
	inputfile = f"{PROJECT_DIR_d}bed_plot/{s}_{p}_{d}.bed"
	if p == "unidirectional":
		outputfile = f"{PROJECT_DIR_o}temp/{s}_{p}_TSS_{d}.bed"
		with DelimitedWriter(outputfile) as dw, BEDXReader(inputfile, ["summit_position", "na", "strand"], x=5) as br:
			for r in br:
				dw.write([r.chrom, int(r.summit_position)-extension, int(r.summit_position)+extension+1, ".", ".", ".", ".", r.strand])
	else:
		coords = {
			"maxTSS": lambda r: r.fwdTSS if r.strand == "+" else r.revTSS,
			"minTSS": lambda r: r.revTSS if r.strand == "+" else r.fwdTSS
		}
		for tss in ["maxTSS", "minTSS"]:
			outputfile = f"{PROJECT_DIR_o}temp/{s}_{p}_{tss}_{d}.bed"
			with DelimitedWriter(outputfile) as dw, BEDXReader(inputfile, ["revTSS", "fwdTSS", "strand"], x=5) as br:
				for r in br:
					dw.write([r.chrom, int(coords[tss](r))-extension, int(coords[tss](r))+extension+1, ".", ".", ".", ".", r.strand])

In [18]:
pwpool = ProcessWrapPool(len(gs2))

for g in gs2:
	fa = f"{PROJECT_DIR_o}temp/{s}_{g}.bed"
	outputfile = f"{PROJECT_DIR_o}temp/{s}_{g}_age.bed"
	pwpool.run(intersect, args=[fa, fb, outputfile, None])

In [20]:
len(pwpool.finished_tasks)

6

In [21]:
pwpool.close()

In [22]:
# Check the output

inputfile = f"{PROJECT_DIR_o}temp/{s}_{ps[0]}_maxTSS_{ds[0]}_age.bed"
pd.read_table(inputfile, header=None, low_memory=False).head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,chr1,1020737,1021738,.,.,.,.,-,1,chr1,1020735,1020763,Vertebrate,26
1,chr1,1020737,1021738,.,.,.,.,-,1,chr1,1020763,1020772,Tetrapod,9


# Generate feature matrix

## Age

In [31]:
def age_dissection(inputfile, clades):	
	age_per_block = defaultdict(dict)
	reverse = {}

	with DelimitedReader(inputfile) as dr:
		for cols in dr:
			chrom, start, end = cols[:3]
			if "control" not in inputfile:
				sign = cols[7]
				if (chrom, start, end) not in reverse:
					reverse[(chrom, start, end)] = (sign == "-")
			start2, end2, clade, overlap = cols[-4:]
			# Only include regions with overlapping length >= 6bp
			if int(overlap) >= 6:
				block_start = max(int(start), int(start2))
				block_end = min(int(end), int(end2))
				age_per_block[(chrom, start, end)][(block_start, block_end)] = clades[clade]

	results = []
	index = []
	for r in age_per_block:
		chrom, start, end = r
		index.append("_".join([chrom, str(int(start)+250), str(int(end)-250)]))
		bp_features = []
		for i in range(int(start), int(end)):
			feature = np.nan
			for block in age_per_block[r]:
				b_s, b_e = block
				if b_s <= i <= b_e:
					feature = age_per_block[r][block]
					break
			bp_features.append(feature)

		if reverse.get(r, False):
			bp_features.reverse()
		results.append(bp_features)

	df = pd.DataFrame(results, index=index)
	return df

In [28]:
clades = json_load(f"{PROJECT_DIR_r}other/age/clades.json")

In [29]:
df_features = defaultdict(dict)

In [33]:
m = "Age"
for k, group_list in [("Element_center", gs), ("Prominent_TSSs", gs2)]:
    for g in group_list:
        inputfile = f"{PROJECT_DIR_o}temp/{s}_{g}_age.bed"
        df_features[k][(m, g)] = age_dissection(inputfile, clades)

## Evo

In [37]:
bws = {"phyloP": f"{PROJECT_DIR_r}other/phylop/hg38.phyloP100way.bw",
	   "CDTS": f"{PROJECT_DIR_r}other/CDTS/coord_CDTS_percentile_N7794unrelated.bw"
	   }

In [38]:
for k, group_list in [("Element_center", gs), ("Prominent_TSSs", gs2)]:
	for g in group_list:
		if k == "Element_center":
			bed = f"{PROJECT_DIR_d}bed_plot/{s}_{g}.bed"
		else:
			bed = f"{PROJECT_DIR_o}temp/{s}_{g}.bed"
		if "control" in bed:
			all_regions = BEDXReader.read_all(GenomicCollection, bed)
			use_strand = False
		else:
			all_regions = BEDXReader.read_all(GenomicCollection, bed, strandfield="f7")
			use_strand = True
	
		for m in bws:
			index = [f"{r.chrom}_{r.chromStart}_{r.chromEnd}" for r in all_regions]
			df_features[k][(m,g)] = pd.DataFrame(bpm.generate_signal_profile(all_regions, BigWigIReader(bws[m]), fixed_size=1001, use_strand=use_strand), index=index)

## Combine dataframes for plotting

In [39]:
ms = ["Age", "phyloP", "CDTS"]

In [40]:
df_reformat = defaultdict(dict)
for k in df_features:
	for k2 in df_features[k]:
		df = df_features[k][k2].reset_index()
		df_reformat[k][k2] = pd.melt(df,
							  id_vars="index",
							  value_vars=list(df.columns[1:]),
							  var_name="Position",
							  value_name="Feature"
							  )

In [41]:
df_metaplots = defaultdict(dict)
for k in df_reformat:
	for m, d in itertools.product(ms, ds):
		dfs = []
		if k == "Element_center":
			for p in ps:
				for suffix, label in [("", p.capitalize()), ("_control", "Ctrl")]:
					df = df_reformat[k][(m, f"{p}_{d}{suffix}")].copy()
					df["Label"] = label
					dfs.append(df)
		else:
			for t in TSSs:
				df = df_reformat[k][(m, f"{t}_{d}")].copy()
				t1, t2 = t.split("_")
				label = f"{t1.capitalize()} ({t2})" if t1 == "divergent" else t1.capitalize()
				df["Label"] = label
				dfs.append(df)
		df_metaplots[k][(m,d)] = pd.concat(dfs).reset_index(drop=True)	

In [32]:
# Age
# Get the maximum score for each element
# Get the average values for two groups; U test 

m = "Age"
d = "distal"
k = "Element_center"
values = {}
for p in ps:
	values[p] = df_features[k][(m, f"{p}_{d}")].max(axis=1).tolist()

In [33]:
np.mean(values[ps[0]]), np.mean(values[ps[1]]), mannwhitneyu(values[ps[0]], values[ps[1]]).pvalue

(284.84933460076047, 260.19398369412426, 2.7167264422552394e-11)

In [36]:
# CDTS
# Get the minimum score for each element
# Get the average values for two groups; U test 

m = "CDTS"
values2 = {}
for p in ps:
	values2[p] = df_features[k][(m, f"{p}_{d}")].min(axis=1).tolist()

In [37]:
np.mean(values2[ps[0]]), np.mean(values2[ps[1]]), mannwhitneyu(values2[ps[0]], values2[ps[1]]).pvalue

(-8.666749832485436, -7.484786294358014, 2.5833347036249925e-16)

# Metaplots

## Element center

In [60]:
def generate_metaplot(k, d, ms, ylims, yticks, outputfile, test, loc="upper"):
	fig, axes = plt.subplots(len(ms), 1, figsize=(3, 7.5))
	labelpad = 2
	y_align = -0.15
	hspace = 0.2

	if k == "Element_center":
		hue_order = ["Ctrl", "Divergent", "Unidirectional"]
		palette = ["#d9d9d9", "#313695", "#de77ae"]
	else:
		hue_order = ["Divergent (maxTSS)", "Divergent (minTSS)", "Unidirectional"]
		palette = ["#313695", "#3182bd", "#de77ae"]
	
	for m in ms:
		row = ms.index(m)
		ax = axes if len(ms) == 1 else axes[row]
		utils.generate_feature_metaplot(df_metaplots[k][(m,d)], palette, hue_order, ax, test)
					
		ax.set_ylim(ylims[row])
		ax.set_yticks(yticks[row])
		ax.set_ylabel(m, fontsize=bfontsize)
		ax.get_yaxis().set_label_coords(y_align, 0.5)

		if row == 0:
			if loc == "upper":
				ax.legend(loc="lower left", fontsize=sfontsize, bbox_to_anchor=[0, 1.02])
			elif loc == "right":
				ax.legend(loc="upper left", fontsize=sfontsize, bbox_to_anchor=[1.02, 1])
		else:
			ax.legend([],[], frameon=False)

		xtick_list=[0,500,1000]
		xticklabel_list=["-0.5", "0", "0.5"]
		ax.set_xticks(xtick_list)
		if row == len(ms)-1:
			ax.set_xlabel("Distance (kb)", fontsize=bfontsize)
			ax.set_xticklabels(xticklabel_list)
			
		else:
			ax.set_xticklabels([])
			ax.set_xlabel("")

		ax.tick_params(labelsize=sfontsize, pad=labelpad)

	fig.subplots_adjust(hspace=hspace)
	plt.savefig(outputfile, bbox_inches="tight", transparent=True) 

In [50]:
k = "Element_center"

In [40]:
labels = ["Divergent", "Unidirectional", "Ctrl"]
for m, d in itertools.product(ms, ds):
	maxs = []
	mins = []
	df = df_metaplots[k][(m,d)]
	for label in labels:
		maxs.append(df[df["Label"]==label].groupby("Position")["Feature"].mean().max())
		mins.append(df[df["Label"]==label].groupby("Position")["Feature"].mean().min())
	print(d, m, max(maxs), min(mins))

distal Age 207.2251955034213 140.37741907489368
proximal Age 361.9407433380084 142.61951466127402
distal phyloP 0.26302839721352844 0.003073939531987875
proximal phyloP 0.4253888037693158 0.017202454665443644
distal CDTS 0.08612798655072214 -3.1320544262092422
proximal CDTS 0.00895907469490209 -8.876712887207963


In [49]:
ylims = {"distal": [[120, 230], [-0.05, 0.35], [-4, 0.4]],
		 "proximal": [[100, 440], [-0.2, 1.1], [-17, 2]]
		}
yticks = {"distal": [[130, 160, 190, 220], [0, 0.1, 0.2, 0.3], [-3.6, -2.4, -1.2, 0]],
		  "proximal": [[120, 220, 320, 420], [0, 0.3, 0.6, 0.9], [-15, -10, -5, 0]]
		 }

In [51]:
pwpool = ProcessWrapPool(len(ds))

# test = True
test = False
for d in ds:
	folder = "figures/" if d == "distal" else "other_figures/"
	outputfile = f"{PROJECT_DIR_o}{folder}Fig3a.pdf"
	pwpool.run(generate_metaplot, args=[k, d, ms, ylims[d], yticks[d], outputfile, test])

In [53]:
len(pwpool.finished_tasks)

2

In [54]:
pwpool.close()

## Prominent TSSs

In [55]:
k = "Prominent_TSSs"

In [48]:
labels = ["Divergent (maxTSS)", "Divergent (minTSS)", "Unidirectional"]
for m, d in itertools.product(ms, ds):
	maxs = []
	mins = []
	df = df_metaplots[k][(m,d)]
	for label in labels:
		maxs.append(df[df["Label"]==label].groupby("Position")["Feature"].mean().max())
		mins.append(df[df["Label"]==label].groupby("Position")["Feature"].mean().min())
	print(d, m, max(maxs), min(mins))

distal Age 207.44250274960282 155.2327150084317
proximal Age 361.5252537034877 231.5768863419293
distal phyloP 0.26469569909494234 -0.03475625517190687
proximal phyloP 0.5152868090657748 -0.10769166893444096
distal CDTS -1.525549798675741 -3.142131024492463
proximal CDTS -4.09283802186516 -8.908121619679443


In [63]:
pwpool = ProcessWrapPool(len(ds))

# test = True
test = False
for d in ds:
	folder = "figures/" if d == "distal" else "other_figures/"
	outputfile = f"{PROJECT_DIR_o}{folder}SuppFig2.pdf"
	pwpool.run(generate_metaplot, args=[k, d, ms, ylims[d], yticks[d], outputfile, test, "right"])

In [65]:
len(pwpool.finished_tasks)

2

In [66]:
pwpool.close()