In [14]:
import pandas as pd
import json
import subprocess
from collections import defaultdict
from biodata.delimited import DelimitedReader, DelimitedWriter
from mphelper import ProcessWrapPool
import gzip
from Bio import AlignIO

In [2]:
PROJECT_DIR_r = "/fs/cbsuhy02/storage/yc2553/yc2553/projects/TRE_directionality/resources/"
PROJECT_DIR_r2 = "/fs/cbsuhy02/storage/yc2553/yc2553/databases/"

# ENCODE data

## K562

### Epigenomic features

In [3]:
# DNase-seq 
# ENCSR000EOT
# https://www.encodeproject.org/files/ENCFF414OGC/@@download/ENCFF414OGC.bigWig
# https://www.encodeproject.org/files/ENCFF185XRG/@@download/ENCFF185XRG.bed.gz
# https://www.encodeproject.org/files/ENCFF205FNC/@@download/ENCFF205FNC.bam

# H3K27ac ChIP-seq
# ENCSR000AKP
# https://www.encodeproject.org/files/ENCFF849TDM/@@download/ENCFF849TDM.bigWig
# rep1: https://www.encodeproject.org/files/ENCFF121RHF/@@download/ENCFF121RHF.bam
# rep2: https://www.encodeproject.org/files/ENCFF907MNY/@@download/ENCFF907MNY.bam

### Hi-C

In [None]:
# ENCSR479XDG
# rep1/2: https://www.encodeproject.org/files/ENCFF621AIY/@@download/ENCFF621AIY.hic

### RNA-seq

In [None]:
# Get processed TPM file from the older version of ABC model github
# https://raw.githubusercontent.com/broadinstitute/ABC-Enhancer-Gene-Prediction/refs/heads/master/example_chr22/input_data/Expression/K562.ENCFF934YBO.TPM.txt

### TF ChIP-seq data

In [22]:
# ENCODE TF ChIP-seq data actually also covers non-TF proteins

# When a set of experiments have been selected, click on the "Download" button to download a "files.txt" file that contains a list of URLs. The first URL in "files.txt" is "metadata.tsv", a file described below that contains all the experimental metadata for the files resulting from the search. The remaining URLs in files.txt are links that will download each ENCFF accessioned file.

# https://www.encodeproject.org/metadata/?control_type%21=%2A&status=released&perturbed=false&assay_title=TF+ChIP-seq&replicates.library.biosample.donor.organism.scientific_name=Homo+sapiens&biosample_ontology.term_name=K562&assembly=GRCh38&files.file_type=bed+narrowPeak&type=Experiment&files.analyses.status=released&files.preferred_default=true

inputfile = f"{PROJECT_DIR_r}ENCODE/metadata.tsv"
df = pd.read_table(inputfile)
df.head(2)

Unnamed: 0,File accession,File format,File type,File format type,Output type,File assembly,Experiment accession,Assay,Donor(s),Biosample term id,...,Genome annotation,Platform,Controlled by,File Status,s3_uri,File analysis title,File analysis status,Audit WARNING,Audit NOT_COMPLIANT,Audit ERROR
0,ENCFF983BOB,bigWig,bigWig,,signal p-value,GRCh38,ENCSR000ATW,TF ChIP-seq,/human-donors/ENCDO000AAD/,EFO:0002067,...,,,,released,s3://encode-public/2021/01/07/779fe20b-09af-4c...,ENCODE4 v1.6.1 GRCh38,released,"low read depth, inconsistent platforms, low re...","poor library complexity, severe bottlenecking",extremely low read depth
1,ENCFF522HZT,bed narrowPeak,bed,narrowPeak,IDR thresholded peaks,GRCh38,ENCSR000ATW,TF ChIP-seq,/human-donors/ENCDO000AAD/,EFO:0002067,...,,,,released,s3://encode-public/2021/01/07/42310e87-3e57-44...,ENCODE4 v1.6.1 GRCh38,released,"low read depth, inconsistent platforms, low re...","poor library complexity, severe bottlenecking",extremely low read depth


In [12]:
# Somehow, hg19 is found in meta file despite the selection of "GRCh38"
# Get rid of datasets with treatments & modifications

columns = [c for c in df.columns if ("treatment" in c or "genetic modifications" in c or c == "Library depleted in")]
filtered_df = df[(df["File assembly"]=="GRCh38") & (df[columns].isna().all(axis=1)) & (df["File type"]=="bed")]
len(filtered_df)

469

In [16]:
# Download all files (.bed.gz)
# https://www.encodeproject.org/help/batch-download/
# xargs -n 1 curl -O -L < download_ChIP.txt

outputfile = f"{PROJECT_DIR_r}download_ChIP.txt"
df_url = pd.DataFrame(list(filtered_df["File download URL"]))
df_url.to_csv(outputfile, sep="\t", index=False, header=None)

In [13]:
acc = list(filtered_df["File accession"])
target = [tf.split("-human")[0] for tf in list(filtered_df["Experiment target"])]
acc_target = dict(zip(acc, target))

In [14]:
# Some targets have multiple ChIP-seq experiments/files available
# For the follow-up analysis (see Fig.1_supps/4.1.Fig1e.SuppFig1d.ChIP_enrichment.ipynb), we treat each experiment independently

len(acc_target), len(set(acc_target.values()))

(469, 311)

In [15]:
outputfile = f"{PROJECT_DIR_r}acc_target.json"
with open(outputfile, "w") as f:
	json.dump(acc_target, f)

In [None]:
# Get bigwig files for examples (see Fig.1_supps/4-2.SuppFig1e.ChIP_examples.ipynb)

# PHF8 ChIP-seq
# https://www.encodeproject.org/files/ENCFF092HFK/@@download/ENCFF092HFK.bigWig

# SMAD5 ChIP-seq
# https://www.encodeproject.org/files/ENCFF094FAV/@@download/ENCFF094FAV.bigWig

## HCT116

In [None]:
# DNase-seq
# https://www.encodeproject.org/files/ENCFF240LRP/@@download/ENCFF240LRP.bed.gz

# CTCF ChIP-seq
# https://www.encodeproject.org/files/ENCFF388PVO/@@download/ENCFF388PVO.bigWig
# https://www.encodeproject.org/files/ENCFF832INR/@@download/ENCFF832INR.bed.gz

# RAD21 ChIP-seq
# https://www.encodeproject.org/files/ENCFF776IXR/@@download/ENCFF776IXR.bigWig

In [None]:
# CTCF ChIA-PET
# https://www.encodeproject.org/files/ENCFF431BUN/@@download/ENCFF431BUN.bedpe.gz
# Ruan lab ChIA-PET pipeline: https://www.encodeproject.org/pipelines/ENCPL169TBL/
# Produce loop (bedpe) file: First, ChIA-PIPE filters out inter-chromosomal contacts and self-ligation PETs with genomic span less than 8 kbp. Then, the 5’ end of each inter-ligation PET is extended by 500 bp along the reference genome, which makes chromatin fragments closer to the actual protein binding position in DNA. Finally, ChIA-PIPE merges overlapping extended inter-ligation PETs into a cluster with the number of overlapped inter-ligation PET as a PET count of the loop. Loops with PET count =1 are called singletons and they filter them out to retain only highly confident frequent loops.

# Hi-C
# https://www.encodeproject.org/files/ENCFF573OPJ/@@download/ENCFF573OPJ.hic

# Other data

## Mappability

In [None]:
# Paper: https://academic.oup.com/nar/article/46/20/e120/5086676

# k24
# http://hgdownload.soe.ucsc.edu/gbdb/hg38/hoffmanMappability/k24.Umap.MultiTrackMappability.bw
# k36
# http://hgdownload.soe.ucsc.edu/gbdb/hg38/hoffmanMappability/k36.Umap.MultiTrackMappability.bw
# k50
# http://hgdownload.soe.ucsc.edu/gbdb/hg38/hoffmanMappability/k50.Umap.MultiTrackMappability.bw

## phyloP

In [12]:
# http://hgdownload.cse.ucsc.edu/goldenPath/hg38/phyloP100way/hg38.phyloP100way.bw

## CTDS

In [None]:
# Paper: https://www.nature.com/articles/s41588-018-0062-7
# The files were downloaded through the links below a few years ago; Unfortunately, these links are no longer active...
# http://www.hli-opendata.com/noncoding/README_noncoding_HLIopendata_20171019.txt
# http://www.hli-opendata.com/noncoding/coord_CDTS_percentile_N7794unrelated.txt.gz

In [23]:
f1 = f"{PROJECT_DIR_r}other/CDTS/coord_CDTS_percentile_N7794unrelated.txt.gz"
! zcat $f1 | head -n 2 2>/dev/null

chromosome	start	end	CDTS	percentile
chr1	821620	821630	-2.76477	29

gzip: stdout: Broken pipe


In [24]:
# txt --> bedGraph

f2 = f"{PROJECT_DIR_r}other/CDTS/coord_CDTS_percentile_N7794unrelated.bg"
commands = f"zcat {f1} | awk -v OFS='\\t' 'NR>1 {{print $1,$2,$3,$4}}' > {f2}"
subprocess.run(commands, shell=True, check=True)

CompletedProcess(args="zcat /fs/cbsuhy02/storage/yc2553/yc2553/projects/TRE_directionality/resources/other/CDTS/coord_CDTS_percentile_N7794unrelated.txt.gz | awk -v OFS='\\t' 'NR>1 {print $1,$2,$3,$4}' > /fs/cbsuhy02/storage/yc2553/yc2553/projects/TRE_directionality/resources/other/CDTS/coord_CDTS_percentile_N7794unrelated.bg", returncode=0)

In [15]:
# bedGraph --> bw

chrom_sizes = f"{PROJECT_DIR_r}genomes/human/hg38.chrom.sizes.filtered"
f3 = f"{PROJECT_DIR_r}other/CDTS/coord_CDTS_percentile_N7794unrelated.bw"
! bedGraphToBigWig $f2 $chrom_sizes $f3

## Age

In [None]:
# Syntenic aging strategy: refer to https://academic.oup.com/mbe/article/38/9/3681/6273355

### Clade timing

In [19]:
# MYA: million years ago

clades = {"None": "NA",
		  "Homo_sapiens": 0,
		  "Primate": 72,
		  "Euarchontoglires": 90,
		  "Boreoeutheria": 96,
		  "Eutheria": 105,
		  "Theria": 159,
		  "Mammalia": 177,
		  "Amniote": 312,
		  "Tetrapod": 352,
		  "Vertebrate": 615}

In [26]:
outputfile = f"{PROJECT_DIR_r}other/age/clades.json"
with open(outputfile, "w") as f:
	json.dump(clades, f)

### Group species into 10 clades

Here age categories are marked in yellow and the name of subsets in multiz100way-README.txt are marked in red.

<img src="attachment:36eaa72f-6ccd-4cf8-85bb-9754c2eb1369.png" width="800"/>

In [64]:
# http://hgdownload.cse.ucsc.edu/goldenPath/hg38/multiz100way/README.txt

inputfile = f"{PROJECT_DIR_r2}UCSC/multiz100way/hg38/README.txt"
temp = defaultdict(set)
versions = {}
with DelimitedReader(inputfile) as dr:
	for cols in dr:
		if "==" in cols[0]:
			# e.g., Primate
			category = cols[0].split()[1]
		if len(cols[0]) > 100:
			# e.g., Human
			species = cols[0][:24].strip()
			# e.g., "hg38"
			version = cols[0][66:112].strip().split("/")[-1]
			versions[version] = species
			temp[category].add(species)

In [65]:
# Re-organize the categories

categories = defaultdict(set)

In [66]:
categories["Homo_sapiens"] = {"Human"}
for k in temp["Primate"]:
	if k != "Human":
		categories["Primate"].add(k)
categories["Euarchontoglires"] = temp["Euarchontoglires"]
categories["Boreoeutheria"] = temp["Laurasiatheria"]
categories["Eutheria"] = temp["Afrotheria"]

In [71]:
# Divide "Mammal" into three categories
# "Armadillo": Xenarthra - Eutheria
# "Opossum", "Tasmanian devil", "Wallaby": Metatheria - Theria
# "Platypus": Prototheria - Mammalia

categories["Eutheria"].add("Armadillo")
categories["Theria"] = {"Opossum", "Tasmanian devil", "Wallaby"}
categories["Mammalia"] = {"Platypus"}
categories["Amniote"] = temp["Aves"]

In [75]:
# Divide "Sarcopterygii" into tetrapod and non-tetrapod
# "American alligator": tetrapod
# "Chinese softshell turtle": tetrapod
# "Coelacanth": non-tetrapod
# "Green seaturtle": tetrapod
# "Lizard": tetrapod
# "Painted turtle": tetrapod
# "Spiny softshell turtle": tetrapod
# "X. tropicalis": tetrapod

for k in temp["Sarcopterygii"]:
	if k != "Coelacanth":
		categories["Tetrapod"].add(k)
categories["Vertebrate"] = temp["Fish"]
categories["Vertebrate"].add("Coelacanth")

In [80]:
outputfile = f"{PROJECT_DIR_r}other/age/MRCA_categories_100way.json"
categories = defaultdict(list, {key: list(value) for key, value in categories.items()})
with open(outputfile, "w") as f:
	json.dump(categories, f)

### Get syntenic blocks

In [None]:
# http://hgdownload.cse.ucsc.edu/goldenpath/hg38/multiz100way/maf/
# The maf/chr*.maf.gz files each contain all the alignments to that particular human chromosome, with additional annotations to indicate gap context, and genomic breaks for the sequence in the underlying genome assemblies.

In [89]:
# Each syntenic block was assigned an age based on the most recent common ancestor (MRCA) of the species present in the alignment block of the UCSC species tree model.

def get_syntenic_blocks(inputfile, outputfile, clades):
	with DelimitedWriter(outputfile) as dw:
		with gzip.open(inputfile, "rt") as f:
			for multiple_alignment in AlignIO.parse(f, "maf"):
				all_species = []
				for seqrec in multiple_alignment:
					version = seqrec.id.split(".")[0]
					category = categories[versions[version]]
					all_species.append(clades.index(category))
					if version == "hg38":
						# 0-based
						start = seqrec.annotations["start"]
						size = seqrec.annotations["size"]
						strand = seqrec.annotations["strand"]
						srcSize = seqrec.annotations["srcSize"]
						chrom = seqrec.id.split(".")[1]
						# "+" strand
						if strand == 1:
							end = start + size
						# "-" strand: http://genomewiki.ucsc.edu/index.php/Coordinate_Transforms
						else:
							start_temp = start
							end_temp = start_temp + size
							start = srcSize - end_temp
							end = srcSize - start_temp
				age = clades[max(all_species)]
				dw.write([chrom, start, end, age])

In [425]:
chroms = [f"chr{n}" for n in range(1, 23)]
chroms.extend(["chrX"])

In [90]:
pwpool = ProcessWrapPool(len(chroms))

for chrom in chroms:
	inputfile = f"{PROJECT_DIR_r2}/UCSC/multiz100way/hg38/{chrom}.maf.gz"
	outputfile = f"{PROJECT_DIR_r2}/UCSC/multiz100way/hg38/{chrom}.bed"
	pwpool.run(get_syntenic_blocks, args=[inputfile, outputfile, list(clades.keys())])

In [86]:
len(pwpool.finished_tasks)

23

In [87]:
pwpool.close()

## TE

In [15]:
# https://repeatmasker.org/genomes/hg38/RepeatMasker-rm405-db20140131/hg38.fa.out.gz
# hg38 - Dec 2013 - RepeatMasker open-4.0.5 - Repeat Library 20140131

In [406]:
# Convert the file into tab-delimited file

inputfile = f"{PROJECT_DIR_r}other/rmsk/hg38.fa.out.gz"
outputfile = f"{PROJECT_DIR_r}other/rmsk/hg38.fa.tbl.gz"
with DelimitedWriter(outputfile) as dw:
	dw.write(["SW score", "perc div.", "perc del.", "perc ins.", "genoName", "genoStart", "genoEnd", "genoLeft", 
			  "strand", "repName", "repClass", "repFamily", "repStart", "repEnd", "repLeft", "id"])
	with DelimitedReader(inputfile) as dr:
		for cols in dr:
			if "SW" not in cols[0] and "score" not in cols[0]:
				cols2 = cols[0].split(" ")
				cols2 = [c for c in cols2 if c != ""]
				# strand
				if cols2[8] == "C":
					cols2[8] = "-"
				if "/" in cols2[10]:
					repClass, repFamily = cols2[10].split("/")
				else:
					repClass = cols2[10]
					repFamily = cols2[10]
				cols_final = cols2[:10] + [repClass, repFamily] + cols2[12:]
				dw.write(cols_final)

In [25]:
inputfile = f"{PROJECT_DIR_r}other/rmsk/hg38.fa.tbl.gz"
df_all = pd.read_table(inputfile)
df_all.head(2)

Unnamed: 0,SW score,perc div.,perc del.,perc ins.,genoName,genoStart,genoEnd,genoLeft,strand,repName,repClass,repFamily,repStart,repEnd,repLeft,id
0,463,1.3,0.6,1.7,chr1,10001,10468,(248945954),+,(TAACCC)n,Simple_repeat,Simple_repeat,463,(0),1.0,
1,4005,11.3,21.5,1.3,chr1,10469,11447,(248944975),-,TAR1,Satellite,telo,1712,483,2.0,


In [429]:
# https://genome.ucsc.edu/cgi-bin/hgTrackUi?g=rmsk
# A "?" at the end of the "Family" or "Class" (for example, DNA?) signifies that the curator was unsure of the classification. At some point in the future, either the "?" will be removed or the classification will be changed.

Counter(df_all["repClass"])

Counter({'SINE': 1861482,
         'LINE': 1609790,
         'LTR': 763158,
         'Simple_repeat': 700455,
         'DNA': 536363,
         'Low_complexity': 103244,
         'LTR?': 8525,
         'Satellite': 7918,
         'Unknown': 6523,
         'Retroposon': 5827,
         'snRNA': 4537,
         'DNA?': 4383,
         'tRNA': 2066,
         'scRNA': 1931,
         'rRNA': 1861,
         'RC': 1822,
         'srpRNA': 1428,
         'RNA': 762,
         'RC?': 435,
         'SINE?': 6})

In [426]:
TEclass = ["SINE", "LINE", "LTR", "DNA"]

In [427]:
target_cols = ["genoName", "genoStart", "genoEnd", "strand", "repName", "repClass", "repFamily", "perc div."]
df_filtered = df_all[df_all["repClass"].isin(TEclass) & df_all["genoName"].isin(chroms)][target_cols]
df_filtered.columns = [i for i in range(len(target_cols))]
df_filtered.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7
2,chr1,11485,11676,-,L1MC5a,LINE,L1,21.2
3,chr1,11678,11780,-,MER5B,DNA,hAT-Charlie,29.4


In [428]:
len(df_all), len(df_filtered)

(5622516, 4565530)

In [414]:
outputfile = f"{PROJECT_DIR_r}other/rmsk/TE_filtered.bed.gz"
df_filtered.to_csv(outputfile, sep="\t", index=False, header=False)

## CRISPR

In [None]:
# A K562 CRISPR benchmarking dataset generated from combining results from different published experiments
# https://github.com/EngreitzLab/CRISPR_comparison/blob/main/resources/crispr_data/EPCrisprBenchmark_ensemble_data_GRCh38.tsv.gz

## LOEUF

In [None]:
# https://gnomad.broadinstitute.org/help/constraint#loeuf 
# https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/constraint/gnomad.v4.1.constraint_metrics.tsv

## Promoter classification

In [None]:
# https://www.nature.com/articles/s41586-022-04877-w
# https://static-content.springer.com/esm/art%3A10.1038%2Fs41586-022-04877-w/MediaObjects/41586_2022_4877_MOESM9_ESM.txt

## CTCF ChIP-seq (GSE179540)

In [None]:
# paper: https://academic.oup.com/nar/article/50/1/207/6471093
# https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE179540
# For inducing auxin-inducible degron system, 500μM 3-indole acetic acid was treated for 6 hours.
# CTCF Auxin- rep1: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM5420199
# CTCF Auxin+ rep1: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM5420200
# CPM normalized bigwig, hg19; Liftover to hg38

In [11]:
def bw_liftover(bw_in, bw_out, unlifted_file, chain_file, chrom_sizes):
	commands = ["bwtool lift",
			bw_in,
			chain_file,
			bw_out,
			f"-sizes={chrom_sizes}",
			f"-unlifted={unlifted_file}"]
	subprocess.run(" ".join(commands), shell=True)

In [25]:
ids = ["GSM5420199_I_Con", "GSM5420200_I_IAA"]
chrom_sizes = f"{PROJECT_DIR_r}genomes/human/hg38.chrom.sizes"
chain_file = f"{PROJECT_DIR_r}genomes/human/hg19ToHg38.over.chain.gz"

In [None]:
pwpool = ProcessWrapPool(len(ids))
for i in ids:
	bw_in = f"{PROJECT_DIR_r}other/GSE179540/{i}_CTCF.bw"
	bw_out = f"{PROJECT_DIR_r}other/GSE179540/{i}_CTCF_hg38.bw"
	unlifted_file = f"{PROJECT_DIR_r}other/GSE179540/{i}_CTCF_hg38_unmapped.bed"
	pwpool.run(bw_liftover, args=[bw_in, bw_out, unlifted_file, chain_file, chrom_sizes])

In [28]:
len(pwpool.finished_tasks)

2

In [29]:
pwpool.close()

## JASPAR

In [None]:
# JASPAR
# https://jaspar.elixir.no/download/data/2024/CORE/JASPAR2024_CORE_vertebrates_non-redundant_pfms_meme.txt

In [93]:
def get_motif_file(motif, inputfile, outputfile):
	with open(inputfile, "r") as f:
	    content = f.read()
	
	with open(outputfile, "w") as f:
		infos = content.split("MOTIF")
		# header
		f.write(infos[0])
		# motif matrix
		for k in infos:
			if motif + "\n" in k:
				f.write("MOTIF" + k)

In [96]:
motif = "CTCF"
inputfile = f"{PROJECT_DIR_r}other/JASPAR/JASPAR2024_CORE_vertebrates_non-redundant_pfms_meme.txt"
outputfile = f"{PROJECT_DIR_r}other/JASPAR/JASPAR2024_CORE_vertebrates_{motif}.txt"
get_motif_file(motif, inputfile, outputfile)

# Tools

In [None]:
# For tools not listed in ".yml" files are shown below

## SVR model (TATA & DPR)

In [None]:
# Paper: https://www.nature.com/articles/s41586-020-2689-7
# https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE139635

## ProCapNet

In [None]:
# https://github.com/kundajelab/nascent_RNA_models

## ABC model

In [None]:
# https://github.com/broadinstitute/ABC-Enhancer-Gene-Prediction/tree/master
# https://abc-enhancer-gene-prediction.readthedocs.io/en/latest/usage/methods.html#defining-candidate-elements

## CRISPR benchmark

In [None]:
# https://github.com/EngreitzLab/CRISPR_comparison

## FIMO

In [None]:
# https://meme-suite.org/meme/meme-software/

In [128]:
! fimo --version

5.5.4


## bwtool

In [None]:
# https://github.com/CRG-Barcelona/bwtool

In [129]:
! bwtool --version

bwtool 1.0
Copyright (C) 2015 Centre for Genomic Regulation (CRG) in Barcelona.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>.
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.

Written by Andy Pohl.


## statannotations

In [None]:
# To be compatible with seaborn 0.13.2; https://github.com/trevismd/statannotations/pull/155
# pip install git+https://github.com/getzze/statannotations.git@compat-seaborn-13 --upgrade