In [1]:
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from biodatatools.utils.common import json_load
from biodata.delimited import DelimitedReader
from bs4 import BeautifulSoup
import matplotlib.image as mpimg
import itertools
import glob
import os
import pickle

In [2]:
PROJECT_DIR_d = "/home/yc2553/projects/HEA/databases/"
PROJECT_DIR_o = "/home/yc2553/projects/HEA/output/procapnet/"
PROJECT_DIR_r = "/fs/cbsuhy02/storage/yc2553/yc2553/projects/TRE_directionality/resources/"

# Get motifs for each model

In [3]:
labels = json_load(f"{PROJECT_DIR_d}PROcap/metainfo/classifications.json")
groups = json_load(f"{PROJECT_DIR_d}PROcap/metainfo/samples.json")

In [4]:
# Keep these models, though we also lower the cutoffs to include EN55
# We may not necessarily explain why we include these models

# The curated motif tables further down are attached to samples by position
# (samples[0], samples[1], ...). glob.glob returns paths in arbitrary,
# filesystem-dependent order, so the order is pinned here to the one shown in
# the recorded summary output of this notebook; otherwise a re-run could
# silently attach a curated table to a different sample.
SAMPLE_ORDER = ["EN5", "BCT5", "EN55", "EN18", "EN6", "EN12", "EN3", "EN23",
				"GT1", "GT22", "EN4", "GT24", "GT17", "GT23", "CHTN15"]

folders = glob.glob(f"{PROJECT_DIR_o}deepshap_out/*")
# Folder names that are also listed as normal tissues
found = {name for name in (os.path.basename(folder) for folder in folders)
		 if name in groups["normal_tissues"]}

# A curated sample without a deepshap folder would break every later index.
missing = [name for name in SAMPLE_ORDER if name not in found]
if missing:
	raise FileNotFoundError(f"No deepshap output folder for curated sample(s): {missing}")

# Samples not in the curated list (if any) go last, in sorted order, so they
# can never shift the indices used by the curated tables.
samples = SAMPLE_ORDER + sorted(found.difference(SAMPLE_ORDER))
len(samples)

15

In [5]:
def get_html_table(html):
	"""Read the first <table> of an HTML file into a DataFrame.

	Parameters
	----------
	html : path of the HTML file (opened as UTF-8 text).

	Returns
	-------
	pandas.DataFrame whose column names are the stripped text of the <th>
	cells in the first <tr>, and whose rows hold the stripped text of the
	<td> cells of every following <tr>. All values are strings.
	"""
	with open(html, 'r', encoding='utf-8') as handle:
		soup = BeautifulSoup(handle.read(), 'html.parser')
	rows = soup.find('table').find_all('tr')
	# First row carries the header cells; the rest are data rows.
	columns = [th.text.strip() for th in rows[0].find_all('th')]
	records = [[td.text.strip() for td in tr.find_all('td')] for tr in rows[1:]]
	return pd.DataFrame(records, columns=columns)

In [6]:
# Get the motif name

motif_file = f"{PROJECT_DIR_r}other/JASPAR/JASPAR2024_CORE_vertebrates_non-redundant_pfms_meme.txt"
# names: motif ID -> motif name, taken from the "MOTIF <id> <name>" header lines
names = {}
with DelimitedReader(motif_file) as dr:
	for cols in dr:
		# Split on any whitespace; an empty record yields no fields.
		fields = cols[0].split() if cols else []
		# Only real header lines: the first token must be exactly "MOTIF"
		# (a substring test would also accept unrelated lines containing it).
		if len(fields) < 2 or fields[0] != "MOTIF":
			continue
		motif_id = fields[1]  # not `id`, which would shadow the builtin notebook-wide
		# The name after the ID may be absent; fall back to the ID itself.
		names[motif_id] = fields[2] if len(fields) > 2 else motif_id

In [13]:
# task = "profile"
# s = "EN55"
# inputfile = f"{PROJECT_DIR_o}modisco_out/{s}/{model_type}/merged/{task}_modisco_results.hd5"
# outdir = f"{PROJECT_DIR_o}modisco_out/{s}/{model_type}/merged/{task}/"
# if not exists(outdir):
# 	os.mkdir(outdir)
# commands = ["modisco report",
# 			"-i", inputfile,
# 			"-o", outdir,
# 			"-s", outdir,
# 			"-m", motif_file
# 			]
# print(" ".join(commands))

In [7]:
model_type = "strand_merged_umap"
tasks = ["counts", "profile"]

In [8]:
# Will get scripts from Alden
# Now just use the output files

# dfs:     (sample, task) -> motif table parsed from that report's motifs.html
# outdirs: (sample, task) -> report directory (string ending in "/")
dfs = {}
outdirs = {}
# The loop variables deliberately avoid the names `s` and `task`: this loop runs
# at notebook top level, so a loop variable called `task` would overwrite the
# global `task` that later cells use when storing into `motifs`.
for sample, report_task in itertools.product(samples, tasks):
	if sample == "EN55":
		# EN55 reports are read from modisco_out instead of modisco_report
		report_dir = f"{PROJECT_DIR_o}modisco_out/{sample}/{model_type}/merged/{report_task}/"
	else:
		report_dir = f"{PROJECT_DIR_o}modisco_report/{report_task}/{sample}/"
	outdirs[(sample, report_task)] = report_dir
	dfs[(sample, report_task)] = get_html_table(f"{report_dir}motifs.html")

In [9]:
def show_patterns(s, task, cutoff=0.1, min_seqlets=200):
	"""Display the logos of every well-supported pattern of one report.

	For each row of dfs[(s, task)] with at least `min_seqlets` seqlets, one
	figure is drawn with the forward and reverse CWM logos of the pattern,
	followed by the logo of each of the (up to three) listed matches whose
	q-value is below `cutoff`.

	Parameters
	----------
	s : sample name; used as key into `labels`, `dfs` and `outdirs`.
	task : task name; second component of the `dfs` / `outdirs` keys.
	cutoff : a match is shown only if its q-value is strictly below this.
	min_seqlets : minimum `num_seqlets` for a pattern to be shown
		(default 200, the previously hard-coded threshold).
	"""
	print(s, labels[s])
	outdir = outdirs[(s, task)]
	logo_dir = os.path.join(outdir, "trimmed_logos")
	for _, row in dfs[(s, task)].iterrows():
		# Table cells are strings, hence the explicit conversions.
		if int(row['num_seqlets']) < min_seqlets:
			continue
		matches = [f"match{n}" for n in range(3)
				   if (row["match"+str(n)] != "NaN" and float(row["qval"+str(n)]) < cutoff)]

		# (title, image path) for each panel: fwd logo, rev logo, then matches
		panels = [
			(f"{row['pattern']}.fwd  (n={row['num_seqlets']})",
			 os.path.join(logo_dir, f"{row['pattern']}.cwm.fwd.png")),
			(f"{row['pattern']}.rev  (n={row['num_seqlets']})",
			 os.path.join(logo_dir, f"{row['pattern']}.cwm.rev.png")),
		]
		for match in matches:
			mid = row[match]
			# Fall back to the ID when it has no entry in `names`.
			panels.append((f"{names.get(mid, mid)} ({mid})",
						   os.path.join(outdir, f"{mid}.png")))

		# There are always at least two panels, so `axes` is an array.
		fig, axes = plt.subplots(len(panels), 1, figsize=(3, len(panels)), dpi=300)
		for ax, (title, image) in zip(axes, panels):
			ax.imshow(mpimg.imread(image))
			ax.axis('off')
			ax.set_title(title, fontsize=8)
		fig.subplots_adjust(hspace=0.4)
		plt.show()
		# One figure is created per pattern; release it once displayed.
		plt.close(fig)

In [17]:
# Filtering criteria for keeping a pattern:
# 1. It can be reliably matched to a known motif (Tomtom q-value < 0.1, then checked manually to confirm that the match is real).
# 2. It is not a simple GC repeat.
# 3. It is supported by at least 200 seqlets from TF-MoDISco.

In [15]:
# Keep motifs of interest for follow-up analysis

motifs = {}

## Count task

In [10]:
task = "counts"

In [None]:
show_patterns(samples[0], task)

In [18]:
# Curated motifs kept for samples[0] ("EN5" in the recorded run), stored under the current task.
# Values are the numbers chosen from the show_patterns display (presumably pattern indices; confirm).
motifs[(samples[0], task)] = {
	"SP": 0,
	"ETS": 1,
	"AP1": 2,
	"NFY": 3,
	"CREB": 4,
	"NRF1": 5,
	"TATA": 6,
	"THAP11-ZNF143": 7,
	"HNF4": 10,
	"HNF1": 11,
	"ELK1-SREBF2": 12,
	"ZNF384": 13,
	"ZBTB33": 14,
	"USF1": 15,
}

In [None]:
show_patterns(samples[1], task)

In [20]:
# Curated motifs kept for samples[1] ("BCT5" in the recorded run), stored under the current task.
# Values are the numbers chosen from the show_patterns display (presumably pattern indices; confirm).
motifs[(samples[1], task)] = {
	"SP": 0,
	"ETS": 1,
	"CREB": 2,
	"NFY": 3,
	"NRF1": 5,
	"AP1": 6,
	"ELK1-SREBF2": 10,
	"TATA": 12,
	"THAP11-ZNF143": 13,
	"YY1": 14,
	"ZBTB33": 15,
}

In [None]:
show_patterns(samples[2], task)

In [22]:
# FIXME(review): "ETS" is listed twice below (values 3 and 7). A dict literal keeps only the
# last value given for a repeated key, so this entry ends up as "ETS": 7 and the 3 is silently
# dropped (the recorded summary output shows 13 entries for the third key, not the 14 written here).
# Decide whether the second entry needs its own label or one of the two should be removed.
motifs[(samples[2], task)] = {"SP": 0,
							  "CREB": 1,
							  "NFY": 2,
							  "ETS": 3,  # overwritten by the later "ETS": 7
							  "HNF4": 4,
							  "HNF1": 5,
							  "NRF1": 6,
							  "ETS": 7,  # duplicate key: this value wins
							  "THAP11-ZNF143": 8,
							  "CEBPA": 10,
							  "TATA": 11,
							  "USF1": 12,
							  "ELK1-SREBF2": 14,
							  "ZBTB33": 15
							   }

In [None]:
show_patterns(samples[3], task)

In [24]:
# Curated motifs kept for samples[3] ("EN18" in the recorded run), stored under the current task.
# Values are the numbers chosen from the show_patterns display (presumably pattern indices; confirm).
motifs[(samples[3], task)] = {
	"ETS": 0,
	"SP": 1,
	"CREB": 2,
	"NFY": 3,
	"AP1": 4,
	"NRF1": 5,
	"IRF": 7,
	"YY1": 8,
	"THAP11-ZNF143": 9,
	"ELK1-SREBF2": 10,
	"CEBPB": 12,
	"ZNF384": 15,
	"TATA": 16,
	"ZBTB33": 14,
}

In [None]:
show_patterns(samples[4], task)

In [26]:
# Curated motifs kept for samples[4] ("EN6" in the recorded run), stored under the current task.
# Values are the numbers chosen from the show_patterns display (presumably pattern indices; confirm).
motifs[(samples[4], task)] = {
	"SP": 0,
	"ETS": 1,
	"CREB": 2,
	"NFY": 3,
	"NRF1": 4,
	"THAP11-ZNF143": 5,
	"YY1": 6,
	"AP1": 7,
	"ZBTB33": 8,
	"USF1": 10,
	"ZNF384": 11,
	"ZNF143": 12,
	"TATA": 13,
	"THAP11": 14,
}

In [None]:
show_patterns(samples[5], task)

In [28]:
# Curated motifs kept for samples[5] ("EN12" in the recorded run), stored under the current task.
# Values are the numbers chosen from the show_patterns display (presumably pattern indices; confirm).
motifs[(samples[5], task)] = {
	"ETS": 0,
	"SP": 1,
	"CREB": 2,
	"NFY": 3,
	"NRF1": 4,
	"AP1": 5,
	"TATA": 6,
	"THAP11-ZNF143": 7,
	"ZBTB33": 9,
	"YY1": 10,
	"USF1": 11,
	"ZNF384": 13,
	"ZNF143": 14,
}

In [None]:
show_patterns(samples[6], task)

In [30]:
# Curated motifs kept for samples[6] ("EN3" in the recorded run), stored under the current task.
# Values are the numbers chosen from the show_patterns display (presumably pattern indices; confirm).
motifs[(samples[6], task)] = {
	"ETS": 0,
	"SP": 1,
	"CREB": 2,
	"NFY": 3,
	"AP1": 4,
	"NRF1": 5,
	"HNF1": 7,
	"THAP11-ZNF143": 8,
	"TATA": 10,
	"ZBTB33": 11,
	"USF1": 13,
	"ZNF384": 16,
}

In [1]:
show_patterns(samples[7], task)

In [32]:
# Curated motifs kept for samples[7] ("EN23" in the recorded run), stored under the current task.
# Values are the numbers chosen from the show_patterns display (presumably pattern indices; confirm).
motifs[(samples[7], task)] = {
	"SP": 0,
	"ETS": 1,
	"NFY": 2,
	"MEF2": 3,
	"CREB": 4,
	"NRF1": 5,
	"THAP11-ZNF143": 8,
	"YY1": 9,
	"CTCF": 10,
	"ZBTB33": 12,
	"TATA": 13,
}

In [2]:
show_patterns(samples[8], task)

In [34]:
# Curated motifs kept for samples[8] ("GT1" in the recorded run), stored under the current task.
# Values are the numbers chosen from the show_patterns display (presumably pattern indices; confirm).
motifs[(samples[8], task)] = {
	"SP": 0,
	"CREB": 1,
	"ETS": 2,
	"NFY": 3,
	"NRF1": 6,
	"AP1": 8,
	"USF1": 10,
	"THAP11-ZNF143": 11,
	"NR3C2": 14,
	"ZNF384": 16,
	"TATA": 18,
	"ZBTB33": 19,
}

In [None]:
show_patterns(samples[9], task)

In [36]:
# Curated motifs kept for samples[9] ("GT22" in the recorded run), stored under the current task.
# Values are the numbers chosen from the show_patterns display (presumably pattern indices; confirm).
motifs[(samples[9], task)] = {
	"SP": 0,
	"ETS": 1,
	"CREB": 2,
	"NFY": 3,
	"NRF1": 4,
	"AP1": 7,
	"NR3C2": 9,
	"THAP11-ZNF143": 11,
	"ZNF384": 13,
	"ZBTB33": 14,
	"SRF": 17,
	"MEF2": 18,
	"USF1": 19,
}

In [None]:
show_patterns(samples[10], task)

In [38]:
# Curated motifs kept for samples[10] ("EN4" in the recorded run), stored under the current task.
# Values are the numbers chosen from the show_patterns display (presumably pattern indices; confirm).
motifs[(samples[10], task)] = {
	"SP": 1,
	"ETS": 2,
	"CREB": 3,
	"NFY": 4,
	"NRF1": 5,
	"IRF": 6,
	"ELK1-SREBF2": 7,
	"THAP11-ZNF143": 8,
	"TATA": 9,
	"AP1": 10,
	"YY1": 11,
	"ZBTB33": 14,
	"ZNF384": 17,
}

In [None]:
show_patterns(samples[11], task)

In [40]:
# Curated motifs kept for samples[11] ("GT24" in the recorded run), stored under the current task.
# Values are the numbers chosen from the show_patterns display (presumably pattern indices; confirm).
motifs[(samples[11], task)] = {
	"SP": 0,
	"ETS": 1,
	"HNF4": 2,
	"NFY": 3,
	"CREB": 4,
	"AP1": 5,
	"NRF1": 6,
	"HNF1": 7,
	"THAP11-ZNF143": 8,
	"IRF": 9,
	"TATA": 11,
	"CEBPB": 12,
	"NR3C2": 13,
	"ELK1-SREBF2": 14,
	"ZNF384": 15,
	"ZBTB33": 17,
}

In [None]:
show_patterns(samples[12], task)

In [42]:
# Curated motifs kept for samples[12] ("GT17" in the recorded run), stored under the current task.
# Values are the numbers chosen from the show_patterns display (presumably pattern indices; confirm).
motifs[(samples[12], task)] = {
	"ETS": 0,
	"SP": 1,
	"CREB": 2,
	"NRF1": 3,
	"NFY": 4,
	"AP1": 5,
	"TATA": 6,
	"THAP11-ZNF143": 8,
	"ZBTB33": 9,
	"ELK1-SREBF2": 10,
	"CEBPA": 11,
}

In [None]:
show_patterns(samples[13], task)

In [44]:
# Curated motifs kept for samples[13] ("GT23" in the recorded run), stored under the current task.
# Values are the numbers chosen from the show_patterns display (presumably pattern indices; confirm).
motifs[(samples[13], task)] = {
	"SP": 0,
	"ETS": 1,
	"CREB": 2,
	"AP1": 3,
	"NFY": 4,
	"NRF1": 5,
	"THAP11-ZNF143": 7,
	"CEBPA": 8,
	"ELK1-SREBF2": 9,
	"CEBPB": 10,
	"USF1": 11,
	"ZBTB33": 12,
}

In [None]:
show_patterns(samples[14], task)

In [46]:
# Curated motifs kept for samples[14] ("CHTN15" in the recorded run), stored under the current task.
# Values are the numbers chosen from the show_patterns display (presumably pattern indices; confirm).
motifs[(samples[14], task)] = {
	"SP": 0,
	"ETS": 1,
	"NFY": 2,
	"CREB": 3,
	"NRF1": 4,
	"AP1": 5,
	"THAP11-ZNF143": 7,
	"SRF": 9,
	"HSF": 10,
	"ELK1-SREBF2": 11,
	"MEF2": 12,
	"ZBTB33": 13,
	"USF1": 14,
	"ZNF384": 15,
}

## Profile task

In [47]:
task = "profile"

In [None]:
show_patterns(samples[0], task)

In [49]:
# Curated motifs kept for samples[0] ("EN5" in the recorded run), stored under the current task.
# Values are the numbers chosen from the show_patterns display (presumably pattern indices; confirm).
motifs[(samples[0], task)] = {
	"Inr-CA": 0,
	"SP": 2,
	"Inr-TA": 4,
	"CREB": 6,
	"ETS": 8,
	"NFY": 9,
	"TATA": 10,
	"AP1": 12,
	"ZNF384": 17,
	"NRF1": 18,
	"HNF1": 19,
	"THAP11-ZNF143": 24,
	"IRF": 34,
	"ZBTB33": 36,
}

In [None]:
show_patterns(samples[1], task)

In [51]:
# Curated motifs kept for samples[1] ("BCT5" in the recorded run), stored under the current task.
# Values are the numbers chosen from the show_patterns display (presumably pattern indices; confirm).
motifs[(samples[1], task)] = {
	"Inr-CA": 0,
	"SP": 2,
	"Inr-TA": 7,
	"NFY": 8,
	"CREB": 9,
	"ETS": 10,
	"NRF1": 12,
	"AP1": 13,
	"ZNF384": 15,
	"TATA": 16,
	"THAP11-ZNF143": 18,
	"ZBTB33": 23,
}

In [None]:
show_patterns(samples[2], task)

In [53]:
# Curated motifs kept for samples[2] ("EN55" in the recorded run), stored under the current task.
# Values are the numbers chosen from the show_patterns display (presumably pattern indices; confirm).
motifs[(samples[2], task)] = {
	"Inr-CA": 0,
	"TATA": 2,
	"SP": 3,
	"NFY": 6,
	"CREB": 7,
	"ETS": 8,
	"HNF4": 9,
	"HNF1": 11,
	"USF1": 13,
	"NRF1": 14,
	"ZNF384": 16,
	"CEBPB": 17,
	"THAP11-ZNF143": 21,
	"ZBTB33": 30,
}

In [None]:
show_patterns(samples[3], task)

In [55]:
# Curated motifs kept for samples[3] ("EN18" in the recorded run), stored under the current task.
# Values are the numbers chosen from the show_patterns display (presumably pattern indices; confirm).
motifs[(samples[3], task)] = {
	"Inr-CA": 0,
	"SP": 2,
	"Inr-TA": 4,
	"ETS": 6,
	"NFY": 7,
	"CREB": 9,
	"TATA": 11,
	"NRF1": 12,
	"AP1": 14,
	"ZNF384": 16,
	"THAP11-ZNF143": 18,
	"ZBTB33": 23,
}

In [None]:
show_patterns(samples[4], task)

In [57]:
# Curated motifs kept for samples[4] ("EN6" in the recorded run), stored under the current task.
# Values are the numbers chosen from the show_patterns display (presumably pattern indices; confirm).
motifs[(samples[4], task)] = {
	"Inr-CA": 0,
	"SP": 1,
	"Inr-TA": 4,
	"NFY": 8,
	"CREB": 9,
	"ETS": 10,
	"NRF1": 12,
	"ZNF384": 13,
	"TATA": 15,
	"YY1": 16,
	"THAP11-ZNF143": 17,
	"AP1": 20,
	"ZBTB33": 22,
	"USF1": 26,
	"RFX": 40,
}

In [None]:
show_patterns(samples[5], task)

In [59]:
# Curated motifs kept for samples[5] ("EN12" in the recorded run), stored under the current task.
# Values are the numbers chosen from the show_patterns display (presumably pattern indices; confirm).
motifs[(samples[5], task)] = {
	"Inr-CA": 1,
	"SP": 2,
	"Inr-TA": 4,
	"NFY": 6,
	"ETS": 7,
	"CREB": 8,
	"NRF1": 10,
	"TATA": 11,
	"AP1": 12,
	"ZNF384": 16,
	"THAP11-ZNF143": 18,
	"ZBTB33": 22,
}

In [9]:
show_patterns(samples[6], task)

In [61]:
# Curated motifs kept for samples[6] ("EN3" in the recorded run), stored under the current task.
# Values are the numbers chosen from the show_patterns display (presumably pattern indices; confirm).
motifs[(samples[6], task)] = {
	"Inr-CA": 0,
	"SP": 3,
	"Inr-TA": 4,
	"NFY": 6,
	"HNF1": 7,
	"ETS": 8,
	"CREB": 12,
	"USF1": 13,
	"AP1": 14,
	"NRF1": 15,
	"TATA": 16,
	"THAP11-ZNF143": 17,
	"ZBTB33": 32,
}

In [8]:
show_patterns(samples[7], task)

In [63]:
# Curated motifs kept for samples[7] ("EN23" in the recorded run), stored under the current task.
# Values are the numbers chosen from the show_patterns display (presumably pattern indices; confirm).
motifs[(samples[7], task)] = {
	"Inr-CA": 0,
	"SP": 1,
	"NFY": 4,
	"ETS": 5,
	"Inr-TA": 6,
	"CREB": 8,
	"TATA": 10,
	"MEF2": 11,
	"NRF1": 13,
	"THAP11-ZNF143": 16,
	"YY1": 19,
	"AP1": 22,
	"CTCF": 24,
	"ZBTB33": 25,
}

In [1]:
show_patterns(samples[8], task)

In [65]:
# Curated motifs kept for samples[8] ("GT1" in the recorded run), stored under the current task.
# Values are the numbers chosen from the show_patterns display (presumably pattern indices; confirm).
motifs[(samples[8], task)] = {
	"Inr-CA": 0,
	"Inr-TA": 3,
	"SP": 4,
	"CREB": 7,
	"NFY": 8,
	"ZNF384": 9,
	"ETS": 10,
	"TATA": 11,
	"AP1": 12,
	"NRF1": 13,
	"THAP11-ZNF143": 17,
	"USF1": 21,
	"ZBTB33": 27,
	"NR3C2": 37,
}

In [2]:
show_patterns(samples[9], task)

In [67]:
# Curated motifs kept for samples[9] ("GT22" in the recorded run), stored under the current task.
# Values are the numbers chosen from the show_patterns display (presumably pattern indices; confirm).
motifs[(samples[9], task)] = {
	"Inr-CA": 0,
	"SP": 3,
	"Inr-TA": 4,
	"NFY": 7,
	"CREB": 8,
	"TATA": 10,
	"ETS": 12,
	"NRF1": 14,
	"THAP11-ZNF143": 18,
	"AP1": 20,
	"ZNF384": 24,
	"ZBTB33": 30,
	"NR3C2": 32,
}

In [3]:
show_patterns(samples[10], task)

In [69]:
# Curated motifs kept for samples[10] ("EN4" in the recorded run), stored under the current task.
# Values are the numbers chosen from the show_patterns display (presumably pattern indices; confirm).
motifs[(samples[10], task)] = {
	"Inr-CA": 0,
	"SP": 3,
	"Inr-TA": 4,
	"ETS": 5,
	"NFY": 8,
	"CREB": 9,
	"ZNF384": 11,
	"NRF1": 12,
	"TATA": 13,
	"THAP11-ZNF143": 15,
	"IRF": 17,
	"ZBTB33": 23,
	"AP1": 32,
}

In [4]:
show_patterns(samples[11], task)

In [71]:
# Curated motifs kept for samples[11] ("GT24" in the recorded run), stored under the current task.
# Values are the numbers chosen from the show_patterns display (presumably pattern indices; confirm).
motifs[(samples[11], task)] = {
	"Inr-CA": 0,
	"SP": 2,
	"Inr-TA": 3,
	"NFY": 8,
	"CREB": 9,
	"ETS": 10,
	"TATA": 11,
	"HNF4": 12,
	"ZNF384": 13,
	"AP1": 14,
	"NRF1": 15,
	"HNF1": 20,
	"THAP11-ZNF143": 21,
	"IRF": 25,
	"CEBPB": 28,
	"ZBTB33": 29,
}

In [5]:
show_patterns(samples[12], task)

In [73]:
# Curated motifs kept for samples[12] ("GT17" in the recorded run), stored under the current task.
# Values are the numbers chosen from the show_patterns display (presumably pattern indices; confirm).
motifs[(samples[12], task)] = {
	"Inr-CA": 0,
	"SP": 1,
	"Inr-TA": 4,
	"CREB": 5,
	"NFY": 7,
	"ETS": 10,
	"TATA": 11,
	"NRF1": 12,
	"AP1": 14,
	"ZNF384": 15,
	"THAP11-ZNF143": 16,
	"ATF4": 20,
	"CEBPB": 21,
	"ZBTB33": 23,
}

In [6]:
show_patterns(samples[13], task)

In [75]:
# Curated motifs kept for samples[13] ("GT23" in the recorded run), stored under the current task.
# Values are the numbers chosen from the show_patterns display (presumably pattern indices; confirm).
motifs[(samples[13], task)] = {
	"Inr-CA": 0,
	"SP": 2,
	"Inr-TA": 4,
	"CREB": 6,
	"NFY": 7,
	"AP1": 10,
	"ETS": 11,
	"ZNF384": 12,
	"NRF1": 13,
	"CEBPB": 15,
	"TATA": 17,
	"THAP11-ZNF143": 18,
	"ZBTB33": 22,
	"ELK1-SREBF2": 25,
}

In [7]:
show_patterns(samples[14], task)

In [77]:
# Curated motifs kept for samples[14] ("CHTN15" in the recorded run), stored under the current task.
# Values are the numbers chosen from the show_patterns display (presumably pattern indices; confirm).
motifs[(samples[14], task)] = {
	"Inr-CA": 0,
	"SP": 1,
	"Inr-TA": 3,
	"CREB": 6,
	"NFY": 7,
	"ETS": 8,
	"AP1": 9,
	"NRF1": 11,
	"ZNF384": 13,
	"THAP11-ZNF143": 17,
	"ZBTB33": 33,
	"TATA": 38,
	"CTCF": 39,
	"RFX": 44,
}

In [78]:
# Print every (sample, task) key with the number of motifs kept for it.
for k, kept in motifs.items():
	print(k, len(kept))

('EN5', 'counts') 14
('BCT5', 'counts') 11
('EN55', 'counts') 13
('EN18', 'counts') 14
('EN6', 'counts') 14
('EN12', 'counts') 13
('EN3', 'counts') 12
('EN23', 'counts') 11
('GT1', 'counts') 12
('GT22', 'counts') 13
('EN4', 'counts') 13
('GT24', 'counts') 16
('GT17', 'counts') 11
('GT23', 'counts') 12
('CHTN15', 'counts') 14
('EN5', 'profile') 14
('BCT5', 'profile') 12
('EN55', 'profile') 14
('EN18', 'profile') 12
('EN6', 'profile') 15
('EN12', 'profile') 12
('EN3', 'profile') 13
('EN23', 'profile') 14
('GT1', 'profile') 14
('GT22', 'profile') 13
('EN4', 'profile') 13
('GT24', 'profile') 16
('GT17', 'profile') 14
('GT23', 'profile') 14
('CHTN15', 'profile') 14


In [79]:
# Persist the curated motif tables.
# NOTE(review): despite the ".json" extension this file holds a binary pickle
# (the keys are tuples), so it must be read back with pickle.load, not json.load.
outputfile = f"{PROJECT_DIR_o}modisco_out/all_motifs.json"
with open(outputfile, "wb") as f:
	f.write(pickle.dumps(motifs))