In [2]:
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from collections import defaultdict
from biodatatools.utils.common import json_dump
from biodata.delimited import DelimitedReader
from os.path import exists
import os
from bs4 import BeautifulSoup
import matplotlib.image as mpimg

In [4]:
# Root directories used throughout this notebook. Each one ends with "/" because
# the paths below are built by plain f-string concatenation.
# Output root for the ProCapNet TF-MoDISco results and the final motif JSON.
PROJECT_DIR_o = "/home/yc2553/projects/HEA/output/"
# Output root holding the ChromBPNet TF-MoDISco results.
PROJECT_DIR_o2 = "/fs/cbsuhy02/storage/yc2553/yc2553/projects/3.Human_atlas/output/"
# Resources root; the JASPAR motif database is read from here.
PROJECT_DIR_r = "/fs/cbsuhy02/storage/yc2553/yc2553/projects/TRE_directionality/resources/"

# Get motifs

In [7]:
# Settings that select which ProCapNet TF-MoDISco run is inspected.
model_type = "strand_merged_umap"
task = "counts"
s = "SC5"

# Report directory of each model, keyed by model name (trailing "/" included).
outdirs = {}
outdirs["procapnet"] = f"{PROJECT_DIR_o}procapnet/modisco_out/{s}/{model_type}/merged/{task}/"
outdirs["chrombpnet"] = f"{PROJECT_DIR_o2}chrombpnet/paper/tfmodisco/counts.GSE267154/"

In [None]:
# ATAC-seq ChromBPNet model (model ID: syn59449898; EXP ID: GSE267154)
# https://www.synapse.org/Synapse:syn59449898/wiki/628018
# GSE267154_H1ESC_ATAC_ChomBPNet: https://www.synapse.org/Synapse:syn63862944
# sequence_motifs.GSE267154.tar.gz: https://www.synapse.org/Synapse:syn63862603

In [11]:
# `modisco report` takes the results of the tfmodisco-lite run, together with a
# reference database of motifs to compare the extracted patterns to, and
# generates an HTML report. Each pattern extracted by tfmodisco-lite is compared
# against the motif database using TOMTOM to match it with prior knowledge.

motif_file = f"{PROJECT_DIR_r}other/JASPAR/JASPAR2024_CORE_vertebrates_non-redundant_pfms_meme.txt"
k = "chrombpnet"
inputfile = f"{PROJECT_DIR_o2}chrombpnet/paper/tfmodisco/tfmodisco.raw_output.counts.GSE267154.hd5"
# makedirs(exist_ok=True) creates missing parent directories as well and is a
# no-op when the directory already exists, so no separate exists() check is needed.
os.makedirs(outdirs[k], exist_ok=True)
commands = ["modisco report",
			"-i", inputfile,
			"-o", outdirs[k],
			"-s", outdirs[k],
			"-m", motif_file
			]
# This cell only prints the command line; it does not execute it.
print(" ".join(commands))

modisco report -i /fs/cbsuhy02/storage/yc2553/yc2553/projects/3.Human_atlas/output/chrombpnet/paper/tfmodisco/tfmodisco.raw_output.counts.GSE267154.hd5 -o /fs/cbsuhy02/storage/yc2553/yc2553/projects/3.Human_atlas/output/chrombpnet/paper/tfmodisco/counts.GSE267154/ -s /fs/cbsuhy02/storage/yc2553/yc2553/projects/3.Human_atlas/output/chrombpnet/paper/tfmodisco/counts.GSE267154/ -m /fs/cbsuhy02/storage/yc2553/yc2553/projects/TRE_directionality/resources/other/JASPAR/JASPAR2024_CORE_vertebrates_non-redundant_pfms_meme.txt


In [12]:
# Get the motif name: map every motif ID in the MEME-format database to its name.

names = {}
with DelimitedReader(motif_file) as dr:
	for cols in dr:
		# Motif records are introduced by a header line "MOTIF <id> <name>".
		# Match the keyword at the start of the line only.
		if not cols[0].startswith("MOTIF"):
			continue
		fields = cols[0].split()
		if len(fields) < 2:
			# Header without an identifier: nothing to record.
			continue
		# `motif_id` instead of `id`, so the builtin id() is not shadowed for
		# the rest of the kernel session.
		motif_id = fields[1]
		# Keep names made of several tokens intact; fall back to the ID when
		# the header carries no separate name.
		names[motif_id] = " ".join(fields[2:]) or motif_id

In [13]:
def get_html_table(html):
	"""Load the first <table> of an HTML file into a DataFrame of cell text.

	Parameters
	----------
	html : str
		Path to the HTML file (read as UTF-8).

	Returns
	-------
	pandas.DataFrame
		Column names are the stripped text of the <th> cells in the first
		table row; each following row contributes the stripped text of its
		<td> cells. Every value is a string.
	"""
	with open(html, 'r', encoding='utf-8') as handle:
		soup = BeautifulSoup(handle.read(), 'html.parser')
	rows = soup.find('table').find_all('tr')
	header_row, body_rows = rows[0], rows[1:]
	columns = [cell.text.strip() for cell in header_row.find_all('th')]
	records = [
		[cell.text.strip() for cell in tr.find_all('td')]
		for tr in body_rows
	]
	return pd.DataFrame(records, columns=columns)

In [14]:
# Parse the motif report table of every model. The report lives in the model's
# directory in `outdirs`, which is also where show_patterns() looks up the logo
# images, so deriving the path from `outdirs` keeps tables and images in sync.
dfs = {model: get_html_table(f"{report_dir}motifs.html")
	   for model, report_dir in outdirs.items()
	  }

In [28]:
def show_patterns(df, outdir, cutoff, max_matches=3):
	"""Plot every pattern's forward/reverse CWM logo together with its database matches.

	One figure is drawn per row of `df`: the forward logo, the reverse logo,
	and one panel for each match whose q-value is below `cutoff`.

	Parameters
	----------
	df : pandas.DataFrame
		Report table with the columns "pattern", "num_seqlets" and
		"match<n>" / "qval<n>" pairs, all stored as text.
	outdir : str
		Report directory (with trailing "/"). Pattern logos are read from
		"<outdir>trimmed_logos/" and match logos from "<outdir><motif id>.png".
	cutoff : float
		A match is shown only if its q-value is strictly below this value.
	max_matches : int, optional
		Highest number of "match<n>" columns to inspect (default 3, the
		previously hard-coded value). Columns absent from `df` are skipped.

	Notes
	-----
	Match titles are looked up in the module-level `names` dict (motif ID ->
	motif name), so that dict must be populated before calling this function.
	"""
	for _, row in df.iterrows():
		matches = [
			f"match{n}"
			for n in range(max_matches)
			if f"match{n}" in df.columns
			and row[f"match{n}"] != "NaN"
			and float(row[f"qval{n}"]) < cutoff
		]

		# (title, image path) of every panel, top to bottom.
		panels = [
			(f"{row['pattern']}.fwd  (n={row['num_seqlets']})",
			 f"{outdir}trimmed_logos/{row['pattern']}.cwm.fwd.png"),
			(f"{row['pattern']}.rev  (n={row['num_seqlets']})",
			 f"{outdir}trimmed_logos/{row['pattern']}.cwm.rev.png"),
		]
		for match in matches:
			ind = row[match]
			panels.append((f"{names[ind]} ({ind})", f"{outdir}{ind}.png"))

		# There are always at least two panels, so `axes` is always an array.
		nrows = len(panels)
		fig, axes = plt.subplots(nrows, 1, figsize=(3, nrows), dpi=300)
		for ax, (title, image) in zip(axes, panels):
			ax.imshow(mpimg.imread(image))
			ax.axis('off')
			ax.set_title(title, fontsize=8)
		fig.subplots_adjust(hspace=0.4)
		plt.show()
		# Release the figure explicitly so that looping over many patterns
		# does not accumulate open figures on backends that keep them alive.
		plt.close(fig)

## ProCapNet vs. ChromBPNet

In [4]:
# Show the ProCapNet patterns; only database matches with q-value < 0.1 are drawn.
k = "procapnet"
show_patterns(dfs[k], outdirs[k], 0.1)

In [31]:
# Motif label -> integer, entered by hand for each model.
# NOTE(review): the integers presumably identify the matching pattern in the
# figures shown above — confirm the numbering scheme before reuse.
motifs = {}
motifs["procapnet"] = {"SP": 0,
					   "ETS": 1,
					   "NFY": 2,
					   "YY1": 3,
					   "CREB": 4,
					   "SOX4": 6,
					   "POU5F1-SOX2": 7,
					   "NRF1": 8,
					   "AP1": 9,
					   "THAP11-ZNF143": 11,					   
					   "ZBTB33": 17,
					   "TEAD": 19,
					   "SRF": 20,
					   "TATA": 51
					   }

In [5]:
# Show the ChromBPNet patterns; only database matches with q-value < 0.1 are drawn.
k = "chrombpnet"
show_patterns(dfs[k], outdirs[k], 0.1)

In [32]:
# Motif label -> integer for the ChromBPNet model, entered by hand.
# NOTE(review): the integers presumably identify the matching pattern in the
# figures shown above — confirm the numbering scheme before reuse.
motifs["chrombpnet"] = {"CTCF": 0,
					  "POU5F1-SOX2": 1,
					  "ZIC": 2,
					  "SP": 3,
					  "TEAD": 5,
					  "SOX": 6,
					  "NFY": 7,
					  "RFX5": 8,
					  "SIX": 9,
					  "THAP11-ZNF143": 11,
					  "AP1": 12,
					  "NRF1": 13,
					  "RFX4": 14,
					  "REST": 15,
					  "ETS": 18,
					  "CREB": 20
					 }

In [35]:
# Pass the per-model motif dictionaries to json_dump with the target path
# ESC_motifs.json (presumably written as JSON there — json_dump is a project helper).
json_dump(f"{PROJECT_DIR_o}procapnet/modisco_out/ESC_motifs.json", motifs)