In [1]:
import MotifCompendium
import MotifCompendium.utils.analysis as utils_analysis
import MotifCompendium.utils.motif as utils_motif
from MotifCompendium.utils.similarity import set_default_options
from IPython.display import display, HTML, Image
from typing import List, Union
import pandas as pd
import numpy as np
import os
import h5py

import matplotlib
matplotlib.use('pdf')
from matplotlib import pyplot as plt
import logomaker

In [2]:
# Number GPUs by PCI bus ID and expose only device "2" to this process.
# NOTE(review): these variables are set after the MotifCompendium imports have run;
# confirm nothing initialises CUDA at import time, otherwise they have no effect.
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
# Defaults for MotifCompendium similarity computations; the exact meaning of each option
# is defined in MotifCompendium.utils.similarity (not visible here).
set_default_options(max_chunk=1000, max_cpus=32, use_gpu=True)

In [3]:
# Input files (absolute paths).
# HDF5 with the compendium patterns; passed to create_modisco_logos in a later cell.
mc_h5_file = '/oak/stanford/groups/akundaje/projects/neuro-variants/motif_compendium/all_data/leiden_96/neuro-variants.all_data.motif_compendium.avg.leiden_96.h5'
# HDF5 with the Selin reference patterns; passed to create_selin_logos in a later cell.
selin_h5_file = '/oak/stanford/groups/akundaje/soumyak/motifs/latest/selin/selin.motif_compendium.avg.h5'
# MEME-format motif databases; parsed with read_meme in a later cell.
hocomoco_meme_file = '/oak/stanford/groups/akundaje/soumyak/motifs/latest/hocomoco_v12/H12CORE_meme_format.meme'
vierstra_meme_file = '/oak/stanford/groups/akundaje/soumyak/motifs/latest/vierstra/all.dbs.meme'
# Tab-separated per-pattern metadata table; loaded with pd.read_table in a later cell.
metadata_file = '/oak/stanford/groups/akundaje/projects/neuro-variants/motif_compendium/all_data/leiden_96/neuro-variants.all_data.motif_compendium.avg.metadata.leiden_96.tsv'

# Root directory under which every logo sub-directory is created.
logo_dir = '/oak/stanford/groups/akundaje/projects/neuro-variants/motif_compendium/logos'

In [4]:
def _plot_weights(array, path, figsize=(10,3)):
	"""Draw a per-position weight matrix as a sequence logo and save it to ``path``.

	Nothing is drawn when ``path`` already exists, so re-running a cell only
	fills in logos that are still missing.

	Args:
		array: 2-D array-like with one row per position and four columns; the
			columns are labelled A, C, G, T in that order.
		path: Output image file, handed unchanged to ``Figure.savefig``.
		figsize: Figure size as ``(width, height)`` in inches.
	"""
	if os.path.isfile(path):
		return

	fig = plt.figure(figsize=figsize)
	try:
		ax = fig.add_subplot(111)

		df = pd.DataFrame(array, columns=['A', 'C', 'G', 'T'])
		df.index.name = 'pos'

		logo = logomaker.Logo(df, ax=ax)
		logo.style_spines(visible=False)

		# Span the y-axis over the per-position column sums; the lower bound is never above zero.
		position_sums = df.sum(axis=1)
		ax.set_ylim(min(position_sums.min(), 0), position_sums.max())

		fig.savefig(path)
	finally:
		# Close this exact figure even if drawing or saving failed, so figures do not
		# accumulate when the function is called in a loop over many patterns.
		plt.close(fig)

In [5]:
def create_modisco_logos(modisco_h5py: os.PathLike, modisco_logo_dir, trim_threshold, pattern_groups: List[str]):
	"""Write trimmed forward and reversed CWM logos for every pattern in a modisco results file.

	For each requested group present in the file, patterns are visited in order of
	the integer after the last ``_`` in their name. Each pattern's
	``contrib_scores`` matrix is trimmed to the positions whose summed absolute
	contribution reaches ``trim_threshold`` times the maximum, plus up to four
	flanking positions on each side, and plotted with ``_plot_weights``.

	Args:
		modisco_h5py: Path of the HDF5 results file.
		modisco_logo_dir: Existing directory that receives ``<pattern>.cwm.fwd.png``
			and ``<pattern>.cwm.rev.png``.
		trim_threshold: Fraction of the maximum per-position score a position must
			reach to count as part of the core.
		pattern_groups: Top-level group names to process; names missing from the
			file are skipped.

	Returns:
		List of pattern names in the order they were processed.

	Raises:
		ValueError: If no position of a pattern reaches the trimming threshold
			(for example ``trim_threshold`` > 1 or NaN scores).
	"""
	flank = 4  # positions of context kept on each side of the passing core

	def pattern_number(item):
		# item is a (pattern_name, pattern) pair; sort on the trailing integer of the name.
		return int(item[0].split("_")[-1])

	tags = []

	# The context manager closes the HDF5 file even when reading or plotting fails part-way.
	with h5py.File(modisco_h5py, 'r') as modisco_results:
		for name in pattern_groups:
			if name not in modisco_results.keys():
				continue

			metacluster = modisco_results[name]
			for pattern_name, pattern in sorted(metacluster.items(), key=pattern_number):
				tag = pattern_name
				tags.append(tag)

				cwm_fwd = np.array(pattern['contrib_scores'][:])
				# Flip both the position axis and the column axis; with columns ordered
				# A, C, G, T (as _plot_weights labels them) this is the reverse complement.
				cwm_rev = cwm_fwd[::-1, ::-1]

				for strand, cwm in (('fwd', cwm_fwd), ('rev', cwm_rev)):
					score = np.sum(np.abs(cwm), axis=1)
					pass_inds = np.where(score >= np.max(score) * trim_threshold)[0]
					if pass_inds.size == 0:
						raise ValueError(
							'no position of pattern {!r} reaches trim_threshold={!r}'.format(tag, trim_threshold)
						)

					start = max(np.min(pass_inds) - flank, 0)
					end = min(np.max(pass_inds) + flank + 1, len(score))

					_plot_weights(cwm[start:end], path='{}/{}.cwm.{}.png'.format(modisco_logo_dir, tag, strand))

	return tags

In [6]:
def create_selin_logos(modisco_h5py: os.PathLike, modisco_logo_dir, trim_threshold, pattern_groups: List[str]):
	"""Write trimmed forward and reversed CWM logos for every pattern in a Selin results file.

	Works like ``create_modisco_logos`` with two differences: patterns are visited
	in the order the HDF5 group yields them, and ``/`` and ``#`` in a pattern name
	are replaced by ``-`` to form the file-name tag.

	Args:
		modisco_h5py: Path of the HDF5 results file.
		modisco_logo_dir: Existing directory that receives ``<tag>.cwm.fwd.png`` and
			``<tag>.cwm.rev.png``.
		trim_threshold: Fraction of the maximum per-position score a position must
			reach to count as part of the core.
		pattern_groups: Top-level group names to process; names missing from the
			file are skipped.

	Returns:
		List of sanitised pattern tags in the order they were processed.

	Raises:
		ValueError: If no position of a pattern reaches the trimming threshold
			(for example ``trim_threshold`` > 1 or NaN scores).
	"""
	flank = 4  # positions of context kept on each side of the passing core
	tags = []

	# The context manager closes the HDF5 file even when reading or plotting fails part-way.
	with h5py.File(modisco_h5py, 'r') as modisco_results:
		for name in pattern_groups:
			if name not in modisco_results.keys():
				continue

			metacluster = modisco_results[name]
			for pattern_name, pattern in metacluster.items():
				# '/' and '#' are replaced so the tag can be used as a file name / URL component.
				tag = pattern_name.replace('/', '-').replace("#", "-")
				tags.append(tag)

				cwm_fwd = np.array(pattern['contrib_scores'][:])
				# Flip both the position axis and the column axis; with columns ordered
				# A, C, G, T (as _plot_weights labels them) this is the reverse complement.
				cwm_rev = cwm_fwd[::-1, ::-1]

				for strand, cwm in (('fwd', cwm_fwd), ('rev', cwm_rev)):
					score = np.sum(np.abs(cwm), axis=1)
					pass_inds = np.where(score >= np.max(score) * trim_threshold)[0]
					if pass_inds.size == 0:
						raise ValueError(
							'no position of pattern {!r} reaches trim_threshold={!r}'.format(tag, trim_threshold)
						)

					start = max(np.min(pass_inds) - flank, 0)
					end = min(np.max(pass_inds) + flank + 1, len(score))

					_plot_weights(cwm[start:end], path='{}/{}.cwm.{}.png'.format(modisco_logo_dir, tag, strand))

	return tags

In [7]:
def read_meme(filename):
	"""Parse a MEME-format motif file into a dict mapping motif name to matrix.

	A motif begins at a line starting with ``MOTIF`` (its second
	whitespace-separated token is the name). The next line starting with
	``letter`` supplies the width, and the following ``width`` non-blank lines
	are read as matrix rows of four floats.

	A motif is stored as soon as its last row has been read, so the final motif
	of a file that ends on a matrix row is kept, as is a motif whose ``MOTIF``
	line directly follows the previous matrix.

	Args:
		filename: Path of the MEME text file.

	Returns:
		dict mapping motif name to a ``numpy.ndarray`` of shape ``(width, 4)``.
		When a name occurs more than once, the matrix read last is kept.
	"""
	motifs = {}

	with open(filename, "r") as infile:
		motif, width, pwm, i = None, None, None, 0

		for line in infile:
			if motif is None:
				# Ignore everything until the next motif header.
				if line[:5] == 'MOTIF':
					motif = line.split()[1]
				continue

			if width is None:
				if line[:6] != 'letter':
					continue
				tokens = line.split()
				# Prefer the explicit 'w=' field; fall back to the fixed token position
				# for headers that do not contain a separate 'w=' token.
				width = int(tokens[tokens.index('w=') + 1]) if 'w=' in tokens else int(tokens[5])
				pwm = np.zeros((width, 4))
			else:
				values = line.split()
				if not values:
					# Blank line inside the matrix block: nothing to store.
					continue
				pwm[i] = list(map(float, values))
				i += 1

			# Store immediately after the last row instead of waiting for another line,
			# which may never come (end of file) or may be the next MOTIF header.
			if i == width:
				motifs[motif] = pwm
				motif, width, pwm, i = None, None, None, 0

	return motifs

In [8]:
def compute_per_position_ic(ppm, background, pseudocount):
    """Return one value per matrix row: sum over letters of log2(p') * p - log2(b) * b.

    ``p`` is an entry of ``ppm``, ``b`` the matching background frequency, and
    ``p' = (p + pseudocount) / (1 + pseudocount * len(background))`` the smoothed
    probability used inside the logarithm only.

    Args:
        ppm: 2-D array with one row per position and one column per letter.
        background: 1-D numpy array of background letter frequencies (it is
            indexed with ``[None, :]``, so a plain list will not work).
        pseudocount: Value added to every entry before taking the logarithm.

    Returns:
        1-D array with one entry per row of ``ppm``.
    """
    n_letters = len(background)
    ln2 = np.log(2)

    smoothed = (ppm + pseudocount) / (1 + pseudocount * n_letters)
    observed_term = (np.log(smoothed) / ln2) * ppm
    background_term = np.log(background) * background / ln2

    per_letter = observed_term - background_term[None, :]
    return np.sum(per_letter, axis=1)

In [9]:
def make_logo(match, logo_dir, motifs):
	"""Plot the motif named ``match`` to ``<logo_dir>/<match>.png``, each row scaled by its IC value.

	Each row of the motif matrix is multiplied by the value
	``compute_per_position_ic`` returns for it (uniform 0.25 background,
	pseudocount 0.001) before plotting. The literal name ``'NA'`` is a
	placeholder and produces no logo.

	Args:
		match: Motif name, used as key into ``motifs`` and as output file stem.
		logo_dir: Directory that receives the image.
		motifs: Mapping from motif name to its matrix (rows = positions).
	"""
	if match != 'NA':
		uniform_background = np.full(4, 0.25)
		matrix = motifs[match]
		position_ic = compute_per_position_ic(matrix, uniform_background, 0.001)

		scaled = matrix * position_ic[:, None]
		_plot_weights(scaled, path='{}/{}.png'.format(logo_dir, match))

In [10]:
def path_to_image_link(path):
    """Wrap an image URL in a spreadsheet ``=IMAGE(...)`` formula.

    The URL gets a ``#`` fragment followed by a ``RANDBETWEEN`` call appended
    inside the formula (presumably so the spreadsheet re-fetches the image
    instead of using a cached copy). The trailing ``4, 80, 240`` are passed
    through as further IMAGE arguments; in Google Sheets these positions are
    mode, height and width (confirm for the target spreadsheet).

    Plain ``+`` concatenation is used, so ``path`` may be a single string or a
    pandas Series of strings.
    """
    formula_head = '=IMAGE("'
    formula_tail = '#"&RANDBETWEEN(1111111,9999999), 4, 80, 240)'
    return formula_head + path + formula_tail

In [11]:
# Open the compendium HDF5 read-only and list its top-level groups.
# NOTE(review): this handle is never closed in the notebook; a later cell reads mc_h5.keys() again.
mc_h5 = h5py.File(mc_h5_file, 'r')

mc_h5.keys()

<KeysViewHDF5 ['neg_patterns', 'pos_patterns']>

In [12]:
# Open the Selin reference HDF5 read-only and list its top-level groups.
# NOTE(review): this handle is never closed in the notebook; a later cell reads selin_h5.keys() again.
selin_h5 = h5py.File(selin_h5_file, 'r')

selin_h5.keys()

<KeysViewHDF5 ['neg_patterns', 'pos_patterns']>

In [13]:
modisco_logo_dir = os.path.join(logo_dir, 'modisco/all_data/leiden_96')
# The target is three levels below logo_dir; makedirs creates the missing parents
# (os.mkdir would fail on them) and exist_ok makes re-running the cell safe.
os.makedirs(modisco_logo_dir, exist_ok=True)

pattern_groups = [group for group in mc_h5.keys()]
print(pattern_groups)

trim_threshold = 0.1

# Positive and negative pattern groups are written in separate calls; the tags returned
# by the last call are what the cell displays.
create_modisco_logos(mc_h5_file, modisco_logo_dir, trim_threshold, ['pos_patterns'])
create_modisco_logos(mc_h5_file, modisco_logo_dir, trim_threshold, ['neg_patterns'])

['neg_patterns', 'pos_patterns']


['neg_patterns.pattern_0',
 'neg_patterns.pattern_1',
 'neg_patterns.pattern_2',
 'neg_patterns.pattern_3',
 'neg_patterns.pattern_4',
 'neg_patterns.pattern_5',
 'neg_patterns.pattern_6',
 'neg_patterns.pattern_7',
 'neg_patterns.pattern_8',
 'neg_patterns.pattern_9',
 'neg_patterns.pattern_10',
 'neg_patterns.pattern_11',
 'neg_patterns.pattern_12',
 'neg_patterns.pattern_13',
 'neg_patterns.pattern_14',
 'neg_patterns.pattern_15',
 'neg_patterns.pattern_16',
 'neg_patterns.pattern_17',
 'neg_patterns.pattern_18',
 'neg_patterns.pattern_19',
 'neg_patterns.pattern_20',
 'neg_patterns.pattern_21',
 'neg_patterns.pattern_22',
 'neg_patterns.pattern_23',
 'neg_patterns.pattern_24',
 'neg_patterns.pattern_25',
 'neg_patterns.pattern_26',
 'neg_patterns.pattern_27',
 'neg_patterns.pattern_28',
 'neg_patterns.pattern_29',
 'neg_patterns.pattern_30',
 'neg_patterns.pattern_31',
 'neg_patterns.pattern_32',
 'neg_patterns.pattern_33',
 'neg_patterns.pattern_34',
 'neg_patterns.pattern_35',
 '

In [14]:
selin_logo_dir = os.path.join(logo_dir, 'selin')
# makedirs also creates logo_dir itself when it is missing, and exist_ok makes
# re-running the cell safe.
os.makedirs(selin_logo_dir, exist_ok=True)

pattern_groups = [group for group in selin_h5.keys()]
print(pattern_groups)

trim_threshold = 0.1

# Positive and negative pattern groups are written in separate calls; the tags returned
# by the last call are what the cell displays.
create_selin_logos(selin_h5_file, selin_logo_dir, trim_threshold, ['pos_patterns'])
create_selin_logos(selin_h5_file, selin_logo_dir, trim_threshold, ['neg_patterns'])

['neg_patterns', 'pos_patterns']


['BCL11A-Brepressive',
 'HIC',
 'NFYrepressive',
 'YY1-2repressive',
 'ZEB-SNAI',
 'ZEB-SNAI_ZEB-SNAI-1',
 'ZEB-SNAI_ZEB-SNAI-2',
 'ZEB-SNAI_ZEB-SNAI-3',
 'ZEB-SNAI_ZEB-SNAI-4',
 'ZEB-SNAI_ZEB-SNAI-5',
 'unresolved,repressive-1',
 'unresolved,repressive-2',
 'unresolved,repressive,YY1-2-like',
 'unresolved,repressive,ZEB-SNAI-like']

In [15]:
metadata = pd.read_table(metadata_file)

# Apply the same '/' -> '-' and '#' -> '-' substitution that create_selin_logos uses for
# its file-name tags, so selin_match values line up with the logo files on disk.
metadata['selin_match'] = (
    metadata['selin_match']
    .str.replace('/', '-')
    .str.replace("#", "-")
)

metadata

Unnamed: 0,index,name,num_patterns,num_seqlets,num_samples,num_datasets,datasets,posneg,hocomoco_similarity,hocomoco_match,vierstra_similarity,vierstra_match,selin_similarity,selin_match,annotation
0,0,pos_patterns.pattern_0,74,194827,53,3,"corces_2020,domcke_2020,trevino_2021",pos,0.914414,NRF1.H12CORE.0.PS.A,0.951238,NRF1_M09443_2.00,0.998523,NRF1,NRF1_0
1,1,pos_patterns.pattern_1,58,372948,54,3,"corces_2020,domcke_2020,trevino_2021",pos,0.969925,SP4.H12CORE.0.P.C,0.944662,SP4_M08296_2.00,0.998351,SP-KLF,SP-KLF_1
2,2,pos_patterns.pattern_2,55,137965,53,3,"corces_2020,domcke_2020,trevino_2021",pos,0.937175,GMEB2.H12CORE.2.SM.B,0.973756,CREB1_M04258_2.00,0.998884,BZIP:ATF-CREB-1,BZIP:ATF-CREB-1_2
3,3,pos_patterns.pattern_3,54,92698,54,3,"corces_2020,domcke_2020,trevino_2021",pos,0.981210,ZN143.H12CORE.0.P.B,0.971501,ETS1_M07950_2.00,0.999409,ZNF143-1,ZNF143-1_3
4,4,pos_patterns.pattern_4,54,317250,54,3,"corces_2020,domcke_2020,trevino_2021",pos,0.978794,NFYA.H12CORE.0.P.B,0.984840,NFYA_MA0060.1,0.999510,NFY,NFY_4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,994,pos_patterns.pattern_720,1,28,1,1,corces_2020,pos,0.912841,RFX1.H12CORE.1.PSM.A,0.922637,RFX2_M09362_2.00,0.962012,RFX-1,RFX-1_994
995,995,pos_patterns.pattern_721,1,27,1,1,corces_2020,pos,0.908444,CEBPD.H12CORE.0.P.B,0.907553,CEBPA_M08813_2.00,0.930452,BZIP:CEBP-1,Unknown_995
996,996,pos_patterns.pattern_722,1,26,1,1,corces_2020,pos,0.898491,MITF.H12CORE.0.P.B,0.854161,ZNF317_M08308_2.00,0.888819,BHLH:USF1-2-1,Unknown_996
997,997,pos_patterns.pattern_723,1,24,1,1,corces_2020,pos,0.901590,RUNX2.H12CORE.0.P.B,0.919790,RUNX3_M02751_2.00,0.937730,RUNX-1,Unknown_997


In [16]:
# Parse both reference motif databases into {motif name: matrix} dicts.
hocomoco_meme = read_meme(hocomoco_meme_file)
vierstra_meme = read_meme(vierstra_meme_file)

In [17]:
# Inspect the motif names parsed from the HOCOMOCO MEME file.
hocomoco_meme.keys()

dict_keys(['AHR.H12CORE.0.P.B', 'AHRR.H12CORE.0.P.C', 'ALX1.H12CORE.0.SM.B', 'ALX3.H12CORE.0.SM.B', 'ALX3.H12CORE.1.S.B', 'ALX4.H12CORE.0.S.B', 'ALX4.H12CORE.1.SM.B', 'ANDR.H12CORE.0.P.B', 'ANDR.H12CORE.1.S.C', 'ANDR.H12CORE.2.P.B', 'AP2A.H12CORE.0.PSM.A', 'AP2B.H12CORE.0.SM.B', 'AP2C.H12CORE.0.PSM.A', 'AP2E.H12CORE.0.SM.B', 'ARGFX.H12CORE.0.SM.B', 'ARI1A.H12CORE.0.P.C', 'ARNT.H12CORE.0.P.B', 'ARNT.H12CORE.1.P.B', 'ARNT2.H12CORE.0.P.B', 'ARNT2.H12CORE.1.P.B', 'ARNT2.H12CORE.2.S.C', 'ARX.H12CORE.0.S.B', 'ARX.H12CORE.1.SM.B', 'ASCL1.H12CORE.0.PSM.A', 'ASCL2.H12CORE.0.PSM.A', 'ATF1.H12CORE.0.P.B', 'ATF1.H12CORE.1.P.B', 'ATF2.H12CORE.0.PSM.A', 'ATF2.H12CORE.1.P.B', 'ATF3.H12CORE.0.P.B', 'ATF3.H12CORE.1.P.B', 'ATF3.H12CORE.2.P.B', 'ATF3.H12CORE.3.SM.B', 'ATF4.H12CORE.0.P.B', 'ATF4.H12CORE.1.S.B', 'ATF4.H12CORE.2.SM.B', 'ATF6A.H12CORE.0.SM.B', 'ATF6B.H12CORE.0.SM.B', 'ATF7.H12CORE.0.PSM.A', 'ATOH1.H12CORE.0.P.B', 'ATOH1.H12CORE.1.SM.B', 'ATOH8.H12CORE.0.P.C', 'BACH1.H12CORE.0.P.B', 'BACH2.H1

In [18]:
# Inspect the motif names parsed from the Vierstra MEME file.
vierstra_meme.keys()

dict_keys(['BANP_Grand2021', 'M00217_2.00', 'M00218_2.00', 'M00219_2.00', 'M00220_2.00', 'M00221_2.00', 'M00222_2.00', 'M00223_2.00', 'M00224_2.00', 'M00225_2.00', 'M00226_2.00', 'M00227_2.00', 'M00228_2.00', 'M00229_2.00', 'M00230_2.00', 'M00231_2.00', 'M00232_2.00', 'M00233_2.00', 'M00234_2.00', 'M00235_2.00', 'M00236_2.00', 'M00237_2.00', 'M00238_2.00', 'M00239_2.00', 'M00240_2.00', 'M00241_2.00', 'M00242_2.00', 'M00243_2.00', 'M00244_2.00', 'M00245_2.00', 'M00246_2.00', 'M00247_2.00', 'M00248_2.00', 'M00249_2.00', 'M00250_2.00', 'M00251_2.00', 'M00252_2.00', 'M00253_2.00', 'M00254_2.00', 'M00255_2.00', 'M00256_2.00', 'M00257_2.00', 'M00258_2.00', 'M00259_2.00', 'M00260_2.00', 'M00261_2.00', 'M00262_2.00', 'M00263_2.00', 'M00264_2.00', 'M00265_2.00', 'M00266_2.00', 'M00267_2.00', 'M00268_2.00', 'M00269_2.00', 'M00270_2.00', 'M00271_2.00', 'M00272_2.00', 'M00273_2.00', 'M00274_2.00', 'M00275_2.00', 'M00276_2.00', 'M00277_2.00', 'M00278_2.00', 'M00279_2.00', 'M00280_2.00', 'M00281_2.0

In [19]:
# Per-motif annotation table for the Vierstra database; its tf_name and motif_id columns
# are combined in later cells to form the '<tf_name>_<motif_id>' match labels.
# NOTE(review): this path is hard-coded here rather than defined with the other input paths.
vierstra_metadata = pd.read_table('/oak/stanford/groups/akundaje/soumyak/motifs/latest/vierstra/metadata.tsv')

vierstra_metadata

Unnamed: 0,motif_id,cluster,source_id,tf_name,family_name,motif_type,PMID
0,BANP_Grand2021,AC0375,BANP,BANP,Unknown,ChIP-seq,34234345
1,M00217_2.00,AC0237,ZNF200_H322Y,ZNF200,C2H2 ZF,PBM,27013732
2,M00218_2.00,AC0623,ZNF200_REF,ZNF200,C2H2 ZF,PBM,27013732
3,M00219_2.00,AC0623,ZNF200_S265Y,ZNF200,C2H2 ZF,PBM,27013732
4,M00220_2.00,AC0606,SNAI2_D119E,SNAI2,C2H2 ZF,PBM,27013732
...,...,...,...,...,...,...,...
5188,MA1984.1,AC0490,MA1984.1,ZNF667,['C2H2 zinc finger factors'],Unknown,27852650
5189,MA1985.1,AC0406,MA1985.1,ZNF669,['C2H2 zinc finger factors'],Unknown,27852650
5190,MA1986.1,AC0124,MA1986.1,ZNF692,['C2H2 zinc finger factors'],Unknown,25690854
5191,MA1987.1,AC0178,MA1987.1,ZNF701,['C2H2 zinc finger factors'],Unknown,22955616


In [20]:
# Match labels in the compendium metadata have the form '<tf_name>_<motif_id>'; build that
# key for every Vierstra row in one vectorised step and keep the rows whose key was matched.
# isin() replaces a per-row linear scan of the match column; rows with a missing tf_name or
# motif_id yield a missing key and are left out.
vierstra_match_keys = vierstra_metadata['tf_name'] + '_' + vierstra_metadata['motif_id']
vierstra_subset = vierstra_metadata.loc[vierstra_match_keys.isin(metadata['vierstra_match'].dropna())]

vierstra_subset

Unnamed: 0,motif_id,cluster,source_id,tf_name,family_name,motif_type,PMID
0,BANP_Grand2021,AC0375,BANP,BANP,Unknown,ChIP-seq,34234345
4,M00220_2.00,AC0606,SNAI2_D119E,SNAI2,C2H2 ZF,PBM,27013732
5,M00221_2.00,AC0606,SNAI2_REF,SNAI2,C2H2 ZF,PBM,27013732
6,M00222_2.00,AC0606,SNAI2_T234I,SNAI2,C2H2 ZF,PBM,27013732
7,M00223_2.00,AC0111,KLF1_E325K,KLF1,C2H2 ZF,PBM,27013732
...,...,...,...,...,...,...,...
5122,MA1721.1,AC0460,MA1721.1,ZNF93,['C2H2 zinc finger factors'],Unknown,25274305
5132,MA1928.1,AC0242,MA1928.1,BNC2,['C2H2 zinc finger factors'],Unknown,30487138
5137,MA1933.1,AC0626,MA1933.1,ELK1::SREBF2,"['Tryptophan cluster factors', 'Basic helix-lo...",Unknown,23050235
5139,MA1935.1,AC0626,MA1935.1,ERF::FOXI1,"['Tryptophan cluster factors', 'Fork head/wing...",Unknown,31913281


In [21]:
# Re-key the matched Vierstra matrices by '<tf_name>_<motif_id>', the label form used in
# the compendium metadata; the MEME dict itself is keyed by motif_id alone.
vierstra_meme_subset = {
    tf_name + '_' + motif_id: vierstra_meme[motif_id]
    for tf_name, motif_id in zip(vierstra_subset['tf_name'], vierstra_subset['motif_id'])
}

In [22]:
# Keep only the HOCOMOCO motifs that appear as a match in the compendium metadata.
# The matched names are collected into a set once, so each membership test is O(1)
# instead of a scan over the whole column.
hocomoco_matched_names = set(metadata['hocomoco_match'].dropna())
hocomoco_meme_subset = {name: pwm for name, pwm in hocomoco_meme.items() if name in hocomoco_matched_names}

In [23]:
# Inspect the '<tf_name>_<motif_id>' labels of the matched Vierstra motifs.
vierstra_meme_subset.keys()

dict_keys(['BANP_BANP_Grand2021', 'SNAI2_M00220_2.00', 'SNAI2_M00221_2.00', 'SNAI2_M00222_2.00', 'KLF1_M00223_2.00', 'KLF1_M00224_2.00', 'EGR2_M00233_2.00', 'GFI1_M00237_2.00', 'ZNF655_M00250_2.00', 'ZNF655_M00251_2.00', 'FOXC1_M00255_2.00', 'VSX2_M00285_2.00', 'VENTX_M00304_2.00', 'VENTX_M00305_2.00', 'PITX2_M00312_2.00', 'HOXB7_M00332_2.00', 'NR1H4_M00367_2.00', 'PHF1_M00967_2.00', 'BCL11A_M00983_2.00', 'BCL11A_M00984_2.00', 'BCL11A_M00985_2.00', 'BCL11B_M00987_2.00', 'BCL11B_M00988_2.00', 'SOX4_M01026_2.00', 'ZSCAN29_M01165_2.00', 'PLAGL2_M01171_2.00', 'VEZF1_M01172_2.00', 'TIGD1_M01204_2.00', 'LCORL_M01304_2.00', 'MAX_M01497_2.00', 'CUX2_M01499_2.00', 'CUX1_M01501_2.00', 'DLX3_M01503_2.00', 'NKX2-5_M01507_2.00', 'KDM2B_M01911_2.00', 'KMT2A_M01912_2.00', 'TET1_M01914_2.00', 'E2F2_M01961_2.00', 'EMX1_M02088_2.00', 'NR2F6_M02396_2.00', 'RARG_M02397_2.00', 'GMEB1_M02460_2.00', 'SOX9_M02716_2.00', 'SRY_M02717_2.00', 'GATA1_M02738_2.00', 'MEF2C_M02745_2.00', 'RUNX3_M02751_2.00', 'TFAP2C_

In [24]:
# Inspect the names of the matched HOCOMOCO motifs.
hocomoco_meme_subset.keys()

dict_keys(['ANDR.H12CORE.0.P.B', 'AP2A.H12CORE.0.PSM.A', 'ARNT2.H12CORE.0.P.B', 'ATF1.H12CORE.1.P.B', 'ATF2.H12CORE.0.PSM.A', 'ATF3.H12CORE.0.P.B', 'ATF3.H12CORE.2.P.B', 'ATF3.H12CORE.3.SM.B', 'ATF6B.H12CORE.0.SM.B', 'ATOH8.H12CORE.0.P.C', 'BARH2.H12CORE.1.S.B', 'BATF.H12CORE.0.P.B', 'BATF3.H12CORE.0.P.B', 'BHE40.H12CORE.0.PSM.A', 'BHE41.H12CORE.0.PSM.A', 'CDX2.H12CORE.0.PS.A', 'CEBPB.H12CORE.1.SM.B', 'CEBPD.H12CORE.0.P.B', 'CEBPE.H12CORE.0.P.B', 'CENPB.H12CORE.0.S.B', 'COE2.H12CORE.0.P.B', 'COT1.H12CORE.0.PSM.A', 'COT1.H12CORE.1.PSM.A', 'COT2.H12CORE.1.P.B', 'CPEB1.H12CORE.0.S.B', 'CTCF.H12CORE.0.P.B', 'CTCFL.H12CORE.0.P.B', 'CUX1.H12CORE.0.PM.A', 'DBP.H12CORE.0.SM.B', 'DDIT3.H12CORE.0.P.B', 'DLX2.H12CORE.1.S.B', 'DLX3.H12CORE.1.S.B', 'E2F1.H12CORE.1.S.B', 'E2F3.H12CORE.1.SM.B', 'E2F4.H12CORE.0.P.B', 'E2F8.H12CORE.1.SM.B', 'EGR2.H12CORE.0.PSM.A', 'EGR3.H12CORE.0.PSM.A', 'ELK3.H12CORE.0.PSM.A', 'ELK4.H12CORE.0.PSM.A', 'ESR2.H12CORE.0.P.B', 'ETS2.H12CORE.0.S.C', 'ETS2.H12CORE.1.P.B', 'E

In [25]:
vierstra_logo_dir = os.path.join(logo_dir, 'vierstra')
# makedirs also creates logo_dir itself when it is missing, and exist_ok makes
# re-running the cell safe.
os.makedirs(vierstra_logo_dir, exist_ok=True)

# One logo per matched Vierstra motif; make_logo skips files that already exist.
for motif in vierstra_meme_subset:
    make_logo(motif, vierstra_logo_dir, vierstra_meme_subset)

In [26]:
hocomoco_logo_dir = os.path.join(logo_dir, 'hocomoco')
# makedirs also creates logo_dir itself when it is missing, and exist_ok makes
# re-running the cell safe.
os.makedirs(hocomoco_logo_dir, exist_ok=True)

# One logo per matched HOCOMOCO motif; make_logo skips files that already exist.
for motif in hocomoco_meme_subset:
    make_logo(motif, hocomoco_logo_dir, hocomoco_meme_subset)

In [27]:
# List the metadata columns available before the logo-link columns are added.
metadata.columns.tolist()

['index',
 'name',
 'num_patterns',
 'num_seqlets',
 'num_samples',
 'num_datasets',
 'datasets',
 'posneg',
 'hocomoco_similarity',
 'hocomoco_match',
 'vierstra_similarity',
 'vierstra_match',
 'selin_similarity',
 'selin_match',
 'annotation']

In [28]:
logo_link_base = 'https://mitra.stanford.edu/kundaje/oak/projects/neuro-variants/motif_compendium/logos'

# One entry per logo column:
# (new column, URL sub-directory, metadata column holding the file stem, file suffix).
logo_specs = [
    ('modisco_fwd', '/modisco/all_data/leiden_96/', 'name', '.cwm.fwd.png'),
    ('modisco_rev', '/modisco/all_data/leiden_96/', 'name', '.cwm.rev.png'),
    ('selin_fwd', '/selin/', 'selin_match', '.cwm.fwd.png'),
    ('selin_rev', '/selin/', 'selin_match', '.cwm.rev.png'),
    ('vierstra_logo', '/vierstra/', 'vierstra_match', '.png'),
    ('hocomoco_logo', '/hocomoco/', 'hocomoco_match', '.png'),
]
for logo_column, url_dir, stem_column, file_suffix in logo_specs:
    metadata[logo_column] = path_to_image_link(logo_link_base + url_dir + metadata[stem_column] + file_suffix)

# NOTE: this cell consumes the 'name' and 'annotation' columns, so it can only be run
# once per freshly loaded metadata table.
metadata = metadata.rename(columns={'name': 'modisco_pattern',
                                    'annotation': 'auto_annotation'})
# Empty column for curators to fill in by hand in the spreadsheet.
metadata['manual_annotation'] = ''

column_order = [
    'modisco_pattern', 'modisco_fwd', 'modisco_rev', 'manual_annotation', 'auto_annotation',
    'selin_similarity', 'selin_match', 'selin_fwd', 'selin_rev',
    'vierstra_similarity', 'vierstra_match', 'vierstra_logo',
    'hocomoco_similarity', 'hocomoco_match', 'hocomoco_logo',
    'num_patterns', 'num_seqlets', 'num_samples', 'num_datasets',
    'datasets', 'posneg', 'index',
]
metadata = metadata[column_order]

metadata

Unnamed: 0,modisco_pattern,modisco_fwd,modisco_rev,manual_annotation,auto_annotation,selin_similarity,selin_match,selin_fwd,selin_rev,vierstra_similarity,...,hocomoco_similarity,hocomoco_match,hocomoco_logo,num_patterns,num_seqlets,num_samples,num_datasets,datasets,posneg,index
0,pos_patterns.pattern_0,"=IMAGE(""https://mitra.stanford.edu/kundaje/oak...","=IMAGE(""https://mitra.stanford.edu/kundaje/oak...",,NRF1_0,0.998523,NRF1,"=IMAGE(""https://mitra.stanford.edu/kundaje/oak...","=IMAGE(""https://mitra.stanford.edu/kundaje/oak...",0.951238,...,0.914414,NRF1.H12CORE.0.PS.A,"=IMAGE(""https://mitra.stanford.edu/kundaje/oak...",74,194827,53,3,"corces_2020,domcke_2020,trevino_2021",pos,0
1,pos_patterns.pattern_1,"=IMAGE(""https://mitra.stanford.edu/kundaje/oak...","=IMAGE(""https://mitra.stanford.edu/kundaje/oak...",,SP-KLF_1,0.998351,SP-KLF,"=IMAGE(""https://mitra.stanford.edu/kundaje/oak...","=IMAGE(""https://mitra.stanford.edu/kundaje/oak...",0.944662,...,0.969925,SP4.H12CORE.0.P.C,"=IMAGE(""https://mitra.stanford.edu/kundaje/oak...",58,372948,54,3,"corces_2020,domcke_2020,trevino_2021",pos,1
2,pos_patterns.pattern_2,"=IMAGE(""https://mitra.stanford.edu/kundaje/oak...","=IMAGE(""https://mitra.stanford.edu/kundaje/oak...",,BZIP:ATF-CREB-1_2,0.998884,BZIP:ATF-CREB-1,"=IMAGE(""https://mitra.stanford.edu/kundaje/oak...","=IMAGE(""https://mitra.stanford.edu/kundaje/oak...",0.973756,...,0.937175,GMEB2.H12CORE.2.SM.B,"=IMAGE(""https://mitra.stanford.edu/kundaje/oak...",55,137965,53,3,"corces_2020,domcke_2020,trevino_2021",pos,2
3,pos_patterns.pattern_3,"=IMAGE(""https://mitra.stanford.edu/kundaje/oak...","=IMAGE(""https://mitra.stanford.edu/kundaje/oak...",,ZNF143-1_3,0.999409,ZNF143-1,"=IMAGE(""https://mitra.stanford.edu/kundaje/oak...","=IMAGE(""https://mitra.stanford.edu/kundaje/oak...",0.971501,...,0.981210,ZN143.H12CORE.0.P.B,"=IMAGE(""https://mitra.stanford.edu/kundaje/oak...",54,92698,54,3,"corces_2020,domcke_2020,trevino_2021",pos,3
4,pos_patterns.pattern_4,"=IMAGE(""https://mitra.stanford.edu/kundaje/oak...","=IMAGE(""https://mitra.stanford.edu/kundaje/oak...",,NFY_4,0.999510,NFY,"=IMAGE(""https://mitra.stanford.edu/kundaje/oak...","=IMAGE(""https://mitra.stanford.edu/kundaje/oak...",0.984840,...,0.978794,NFYA.H12CORE.0.P.B,"=IMAGE(""https://mitra.stanford.edu/kundaje/oak...",54,317250,54,3,"corces_2020,domcke_2020,trevino_2021",pos,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,pos_patterns.pattern_720,"=IMAGE(""https://mitra.stanford.edu/kundaje/oak...","=IMAGE(""https://mitra.stanford.edu/kundaje/oak...",,RFX-1_994,0.962012,RFX-1,"=IMAGE(""https://mitra.stanford.edu/kundaje/oak...","=IMAGE(""https://mitra.stanford.edu/kundaje/oak...",0.922637,...,0.912841,RFX1.H12CORE.1.PSM.A,"=IMAGE(""https://mitra.stanford.edu/kundaje/oak...",1,28,1,1,corces_2020,pos,994
995,pos_patterns.pattern_721,"=IMAGE(""https://mitra.stanford.edu/kundaje/oak...","=IMAGE(""https://mitra.stanford.edu/kundaje/oak...",,Unknown_995,0.930452,BZIP:CEBP-1,"=IMAGE(""https://mitra.stanford.edu/kundaje/oak...","=IMAGE(""https://mitra.stanford.edu/kundaje/oak...",0.907553,...,0.908444,CEBPD.H12CORE.0.P.B,"=IMAGE(""https://mitra.stanford.edu/kundaje/oak...",1,27,1,1,corces_2020,pos,995
996,pos_patterns.pattern_722,"=IMAGE(""https://mitra.stanford.edu/kundaje/oak...","=IMAGE(""https://mitra.stanford.edu/kundaje/oak...",,Unknown_996,0.888819,BHLH:USF1-2-1,"=IMAGE(""https://mitra.stanford.edu/kundaje/oak...","=IMAGE(""https://mitra.stanford.edu/kundaje/oak...",0.854161,...,0.898491,MITF.H12CORE.0.P.B,"=IMAGE(""https://mitra.stanford.edu/kundaje/oak...",1,26,1,1,corces_2020,pos,996
997,pos_patterns.pattern_723,"=IMAGE(""https://mitra.stanford.edu/kundaje/oak...","=IMAGE(""https://mitra.stanford.edu/kundaje/oak...",,Unknown_997,0.937730,RUNX-1,"=IMAGE(""https://mitra.stanford.edu/kundaje/oak...","=IMAGE(""https://mitra.stanford.edu/kundaje/oak...",0.919790,...,0.901590,RUNX2.H12CORE.0.P.B,"=IMAGE(""https://mitra.stanford.edu/kundaje/oak...",1,24,1,1,corces_2020,pos,997


In [29]:
# Write the table with logo formulas next to the input metadata as a tab-separated file,
# without the DataFrame index.
logos_tsv_path = '/oak/stanford/groups/akundaje/projects/neuro-variants/motif_compendium/all_data/leiden_96/neuro-variants.all_data.motif_compendium.avg.metadata.leiden_96.logos.tsv'
metadata.to_csv(logos_tsv_path, sep='\t', index=False)