In [1]:
import MotifCompendium
import MotifCompendium.utils.analysis as utils_analysis
import MotifCompendium.utils.motif as utils_motif
from MotifCompendium.utils.similarity import set_default_options
from IPython.display import display, HTML, Image
import pandas as pd
import numpy as np
import os
import h5py
import matplotlib.pyplot as plt
import seaborn as sns
from multiprocessing import Pool
import scipy.cluster.hierarchy as sch
from scipy.cluster.hierarchy import linkage, leaves_list, dendrogram

In [2]:
# Directory containing the data
data_dir = '/oak/stanford/groups/akundaje/projects/neuro-variants/finemo_variants/rare/K562_bias'
table_dir = '/oak/stanford/groups/akundaje/projects/neuro-variants/tables'
rare_variants_file = '/oak/stanford/groups/akundaje/projects/neuro-variants/variant_lists/shap_variants.rare.model_inputs.hg38.tsv'
variant_motif_matrices_dir = '/oak/stanford/groups/akundaje/projects/neuro-variants/variant_motif_matrices/rare/K562_bias'
plot_dir = '/oak/stanford/groups/akundaje/projects/neuro-variants/plots/motifs'
motif_file_name = "hits_unique.tsv"

fold = 'mean'
score_type = 'counts'
alpha = 'alpha_0.8'
leiden = 'leiden_96'

# Define motifs to exclude
EXCLUDED_MOTIFS = ['ZBTB33', 'POU_4', 'SP-KLF_2', 'BCL11', 'BHLH::NFI_5',
                   'NHLH', 'BHLH_3', 'NFI-half_1', 'NFI-half_3', 'HAND',
                   'E2F_2', 'SOX::SOX_13', 'SOX::SOX_12', 'DLX']

In [3]:
mc = MotifCompendium.load('/oak/stanford/groups/akundaje/projects/neuro-variants/motif_compendium/all_data/'
                          + leiden + '/neuro-variants.all_data.motif_compendium.' + leiden + '.mc')

mc['original_pattern'] = mc['name'].str.split('-').str[1].str.split('.').str[0] + '_patterns.' + mc['name'].str.split('.').str[-1]
mc_metadata = mc.metadata.copy()
mc_metadata = mc_metadata[['model', 'original_pattern', leiden, 'annotation']]

mc_metadata

Unnamed: 0,model,original_pattern,leiden_96,annotation
0,trevino_2021.c20,pos_patterns.pattern_0,10,NFI-1_10
1,trevino_2021.c20,pos_patterns.pattern_1,5,CTCF-1_5
2,trevino_2021.c20,pos_patterns.pattern_10,2,BZIP:ATF-CREB-1_2
3,trevino_2021.c20,pos_patterns.pattern_11,7,ETS:ELF-ETV-1_7
4,trevino_2021.c20,pos_patterns.pattern_12,3,ZNF143-1_3
...,...,...,...,...
2483,corces_2020.Cluster24,pos_patterns.pattern_6,99,IRF-STAT_IRF-STAT-1_99
2484,corces_2020.Cluster24,pos_patterns.pattern_7,4,NFY_4
2485,corces_2020.Cluster24,pos_patterns.pattern_8,19,BZIP:ATF-CREB-1_19
2486,corces_2020.Cluster24,pos_patterns.pattern_9,10,NFI-1_10


In [4]:
mc_avg = MotifCompendium.load('/oak/stanford/groups/akundaje/projects/neuro-variants/motif_compendium/all_data/'
                                + leiden + '/neuro-variants.all_data.motif_compendium.avg.' + leiden + '.mc')

mc_avg.metadata

Unnamed: 0,index,name,num_patterns,num_seqlets,num_samples,num_datasets,datasets,posneg,hocomoco_similarity,hocomoco_match,vierstra_similarity,vierstra_match,selin_similarity,selin_match,annotation
0,0,pos_patterns.pattern_0,74,194827,53,3,"corces_2020,domcke_2020,trevino_2021",pos,0.914414,NRF1.H12CORE.0.PS.A,0.951238,NRF1_M09443_2.00,0.998523,NRF1,NRF1_0
1,1,pos_patterns.pattern_1,58,372948,54,3,"corces_2020,domcke_2020,trevino_2021",pos,0.969925,SP4.H12CORE.0.P.C,0.944662,SP4_M08296_2.00,0.998351,SP/KLF,SP-KLF_1
2,2,pos_patterns.pattern_2,55,137965,53,3,"corces_2020,domcke_2020,trevino_2021",pos,0.937175,GMEB2.H12CORE.2.SM.B,0.973756,CREB1_M04258_2.00,0.998884,BZIP:ATF/CREB#1,BZIP:ATF-CREB-1_2
3,3,pos_patterns.pattern_3,54,92698,54,3,"corces_2020,domcke_2020,trevino_2021",pos,0.981210,ZN143.H12CORE.0.P.B,0.971501,ETS1_M07950_2.00,0.999409,ZNF143#1,ZNF143-1_3
4,4,pos_patterns.pattern_4,54,317250,54,3,"corces_2020,domcke_2020,trevino_2021",pos,0.978794,NFYA.H12CORE.0.P.B,0.984840,NFYA_MA0060.1,0.999510,NFY,NFY_4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,994,pos_patterns.pattern_720,1,28,1,1,corces_2020,pos,0.912841,RFX1.H12CORE.1.PSM.A,0.922637,RFX2_M09362_2.00,0.962012,RFX#1,RFX-1_994
995,995,pos_patterns.pattern_721,1,27,1,1,corces_2020,pos,0.908444,CEBPD.H12CORE.0.P.B,0.907553,CEBPA_M08813_2.00,0.930452,BZIP:CEBP#1,Unknown_995
996,996,pos_patterns.pattern_722,1,26,1,1,corces_2020,pos,0.898491,MITF.H12CORE.0.P.B,0.854161,ZNF317_M08308_2.00,0.888819,BHLH:USF1/2#1,Unknown_996
997,997,pos_patterns.pattern_723,1,24,1,1,corces_2020,pos,0.901590,RUNX2.H12CORE.0.P.B,0.919790,RUNX3_M02751_2.00,0.937730,RUNX#1,Unknown_997


In [5]:
# Define the entropy metrics we want to calculate
entropy_list = [
    "motif_entropy",
    "posbase_entropy_ratio",
    "copair_entropy_ratio",
    "dinuc_entropy_ratio"
]

# Calculate entropy metrics
utils_analysis.calculate_entropy(
    mc_avg,
    entropy_list=entropy_list
)

mc_avg.metadata.rename(columns={'index': leiden,
                                'name': 'clustered_pattern'},
                                inplace=True)

mc_avg.metadata

Unnamed: 0,leiden_96,clustered_pattern,num_patterns,num_seqlets,num_samples,num_datasets,datasets,posneg,hocomoco_similarity,hocomoco_match,vierstra_similarity,vierstra_match,selin_similarity,selin_match,annotation,posbase_entropy_ratio,motif_entropy,dinuc_entropy_ratio,copair_entropy_ratio
0,0,pos_patterns.pattern_0,74,194827,53,3,"corces_2020,domcke_2020,trevino_2021",pos,0.914414,NRF1.H12CORE.0.PS.A,0.951238,NRF1_M09443_2.00,0.998523,NRF1,NRF1_0,1.543807,0.516517,2.845492,1.488471
1,1,pos_patterns.pattern_1,58,372948,54,3,"corces_2020,domcke_2020,trevino_2021",pos,0.969925,SP4.H12CORE.0.P.C,0.944662,SP4_M08296_2.00,0.998351,SP/KLF,SP-KLF_1,2.061026,0.510798,1.740413,1.639470
2,2,pos_patterns.pattern_2,55,137965,53,3,"corces_2020,domcke_2020,trevino_2021",pos,0.937175,GMEB2.H12CORE.2.SM.B,0.973756,CREB1_M04258_2.00,0.998884,BZIP:ATF/CREB#1,BZIP:ATF-CREB-1_2,1.127238,0.506504,1.292231,1.554268
3,3,pos_patterns.pattern_3,54,92698,54,3,"corces_2020,domcke_2020,trevino_2021",pos,0.981210,ZN143.H12CORE.0.P.B,0.971501,ETS1_M07950_2.00,0.999409,ZNF143#1,ZNF143-1_3,1.445198,0.567049,1.474322,1.301145
4,4,pos_patterns.pattern_4,54,317250,54,3,"corces_2020,domcke_2020,trevino_2021",pos,0.978794,NFYA.H12CORE.0.P.B,0.984840,NFYA_MA0060.1,0.999510,NFY,NFY_4,1.144037,0.487531,1.526165,1.653990
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,994,pos_patterns.pattern_720,1,28,1,1,corces_2020,pos,0.912841,RFX1.H12CORE.1.PSM.A,0.922637,RFX2_M09362_2.00,0.962012,RFX#1,RFX-1_994,1.284123,0.572708,1.447394,1.632554
995,995,pos_patterns.pattern_721,1,27,1,1,corces_2020,pos,0.908444,CEBPD.H12CORE.0.P.B,0.907553,CEBPA_M08813_2.00,0.930452,BZIP:CEBP#1,Unknown_995,1.223956,0.540519,1.552044,1.222717
996,996,pos_patterns.pattern_722,1,26,1,1,corces_2020,pos,0.898491,MITF.H12CORE.0.P.B,0.854161,ZNF317_M08308_2.00,0.888819,BHLH:USF1/2#1,Unknown_996,1.137544,0.560141,1.442505,1.232628
997,997,pos_patterns.pattern_723,1,24,1,1,corces_2020,pos,0.901590,RUNX2.H12CORE.0.P.B,0.919790,RUNX3_M02751_2.00,0.937730,RUNX#1,Unknown_997,1.522029,0.577830,1.518736,1.494380


In [6]:
mc_avg_metadata = mc_avg.metadata.copy()

mc_avg_metadata

Unnamed: 0,leiden_96,clustered_pattern,num_patterns,num_seqlets,num_samples,num_datasets,datasets,posneg,hocomoco_similarity,hocomoco_match,vierstra_similarity,vierstra_match,selin_similarity,selin_match,annotation,posbase_entropy_ratio,motif_entropy,dinuc_entropy_ratio,copair_entropy_ratio
0,0,pos_patterns.pattern_0,74,194827,53,3,"corces_2020,domcke_2020,trevino_2021",pos,0.914414,NRF1.H12CORE.0.PS.A,0.951238,NRF1_M09443_2.00,0.998523,NRF1,NRF1_0,1.543807,0.516517,2.845492,1.488471
1,1,pos_patterns.pattern_1,58,372948,54,3,"corces_2020,domcke_2020,trevino_2021",pos,0.969925,SP4.H12CORE.0.P.C,0.944662,SP4_M08296_2.00,0.998351,SP/KLF,SP-KLF_1,2.061026,0.510798,1.740413,1.639470
2,2,pos_patterns.pattern_2,55,137965,53,3,"corces_2020,domcke_2020,trevino_2021",pos,0.937175,GMEB2.H12CORE.2.SM.B,0.973756,CREB1_M04258_2.00,0.998884,BZIP:ATF/CREB#1,BZIP:ATF-CREB-1_2,1.127238,0.506504,1.292231,1.554268
3,3,pos_patterns.pattern_3,54,92698,54,3,"corces_2020,domcke_2020,trevino_2021",pos,0.981210,ZN143.H12CORE.0.P.B,0.971501,ETS1_M07950_2.00,0.999409,ZNF143#1,ZNF143-1_3,1.445198,0.567049,1.474322,1.301145
4,4,pos_patterns.pattern_4,54,317250,54,3,"corces_2020,domcke_2020,trevino_2021",pos,0.978794,NFYA.H12CORE.0.P.B,0.984840,NFYA_MA0060.1,0.999510,NFY,NFY_4,1.144037,0.487531,1.526165,1.653990
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,994,pos_patterns.pattern_720,1,28,1,1,corces_2020,pos,0.912841,RFX1.H12CORE.1.PSM.A,0.922637,RFX2_M09362_2.00,0.962012,RFX#1,RFX-1_994,1.284123,0.572708,1.447394,1.632554
995,995,pos_patterns.pattern_721,1,27,1,1,corces_2020,pos,0.908444,CEBPD.H12CORE.0.P.B,0.907553,CEBPA_M08813_2.00,0.930452,BZIP:CEBP#1,Unknown_995,1.223956,0.540519,1.552044,1.222717
996,996,pos_patterns.pattern_722,1,26,1,1,corces_2020,pos,0.898491,MITF.H12CORE.0.P.B,0.854161,ZNF317_M08308_2.00,0.888819,BHLH:USF1/2#1,Unknown_996,1.137544,0.560141,1.442505,1.232628
997,997,pos_patterns.pattern_723,1,24,1,1,corces_2020,pos,0.901590,RUNX2.H12CORE.0.P.B,0.919790,RUNX3_M02751_2.00,0.937730,RUNX#1,Unknown_997,1.522029,0.577830,1.518736,1.494380


In [7]:
mc_avg_metadata.columns

Index(['leiden_96', 'clustered_pattern', 'num_patterns', 'num_seqlets',
       'num_samples', 'num_datasets', 'datasets', 'posneg',
       'hocomoco_similarity', 'hocomoco_match', 'vierstra_similarity',
       'vierstra_match', 'selin_similarity', 'selin_match', 'annotation',
       'posbase_entropy_ratio', 'motif_entropy', 'dinuc_entropy_ratio',
       'copair_entropy_ratio'],
      dtype='object')

In [8]:
manual_annotations = pd.read_table('/oak/stanford/groups/akundaje/projects/neuro-variants/motif_compendium/all_data/'
                                    + leiden + '/neuro-variants.all_data.motif_compendium.avg.metadata.annotated_only.manual.including_repeats.'
                                    + leiden + '.tsv',
                                    names=['clustered_pattern', 'manual_annotation'])

manual_annotations

Unnamed: 0,clustered_pattern,manual_annotation
0,pos_patterns.pattern_18,AP1
1,pos_patterns.pattern_276,AP1::NFAT
2,pos_patterns.pattern_17,ARNT-USF
3,pos_patterns.pattern_149,AT-repeat_1
4,neg_patterns.pattern_246,AT-repeat_2
...,...,...
185,neg_patterns.pattern_11,ZBTB7
186,neg_patterns.pattern_0,ZEB-SNAI
187,neg_patterns.pattern_3,ZEB-SNAI::ZEB-SNAI
188,pos_patterns.pattern_3,ZNF143_1


In [9]:
mc_avg_manual_annotations = mc_avg_metadata.merge(manual_annotations, on=['clustered_pattern'], how='left')

mc_avg_manual_annotations

Unnamed: 0,leiden_96,clustered_pattern,num_patterns,num_seqlets,num_samples,num_datasets,datasets,posneg,hocomoco_similarity,hocomoco_match,vierstra_similarity,vierstra_match,selin_similarity,selin_match,annotation,posbase_entropy_ratio,motif_entropy,dinuc_entropy_ratio,copair_entropy_ratio,manual_annotation
0,0,pos_patterns.pattern_0,74,194827,53,3,"corces_2020,domcke_2020,trevino_2021",pos,0.914414,NRF1.H12CORE.0.PS.A,0.951238,NRF1_M09443_2.00,0.998523,NRF1,NRF1_0,1.543807,0.516517,2.845492,1.488471,NRF1
1,1,pos_patterns.pattern_1,58,372948,54,3,"corces_2020,domcke_2020,trevino_2021",pos,0.969925,SP4.H12CORE.0.P.C,0.944662,SP4_M08296_2.00,0.998351,SP/KLF,SP-KLF_1,2.061026,0.510798,1.740413,1.639470,SP-KLF_1
2,2,pos_patterns.pattern_2,55,137965,53,3,"corces_2020,domcke_2020,trevino_2021",pos,0.937175,GMEB2.H12CORE.2.SM.B,0.973756,CREB1_M04258_2.00,0.998884,BZIP:ATF/CREB#1,BZIP:ATF-CREB-1_2,1.127238,0.506504,1.292231,1.554268,ATF_1
3,3,pos_patterns.pattern_3,54,92698,54,3,"corces_2020,domcke_2020,trevino_2021",pos,0.981210,ZN143.H12CORE.0.P.B,0.971501,ETS1_M07950_2.00,0.999409,ZNF143#1,ZNF143-1_3,1.445198,0.567049,1.474322,1.301145,ZNF143_1
4,4,pos_patterns.pattern_4,54,317250,54,3,"corces_2020,domcke_2020,trevino_2021",pos,0.978794,NFYA.H12CORE.0.P.B,0.984840,NFYA_MA0060.1,0.999510,NFY,NFY_4,1.144037,0.487531,1.526165,1.653990,NFY
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,994,pos_patterns.pattern_720,1,28,1,1,corces_2020,pos,0.912841,RFX1.H12CORE.1.PSM.A,0.922637,RFX2_M09362_2.00,0.962012,RFX#1,RFX-1_994,1.284123,0.572708,1.447394,1.632554,
995,995,pos_patterns.pattern_721,1,27,1,1,corces_2020,pos,0.908444,CEBPD.H12CORE.0.P.B,0.907553,CEBPA_M08813_2.00,0.930452,BZIP:CEBP#1,Unknown_995,1.223956,0.540519,1.552044,1.222717,
996,996,pos_patterns.pattern_722,1,26,1,1,corces_2020,pos,0.898491,MITF.H12CORE.0.P.B,0.854161,ZNF317_M08308_2.00,0.888819,BHLH:USF1/2#1,Unknown_996,1.137544,0.560141,1.442505,1.232628,
997,997,pos_patterns.pattern_723,1,24,1,1,corces_2020,pos,0.901590,RUNX2.H12CORE.0.P.B,0.919790,RUNX3_M02751_2.00,0.937730,RUNX#1,Unknown_997,1.522029,0.577830,1.518736,1.494380,


In [10]:
mc_avg_manual_annotations.columns

Index(['leiden_96', 'clustered_pattern', 'num_patterns', 'num_seqlets',
       'num_samples', 'num_datasets', 'datasets', 'posneg',
       'hocomoco_similarity', 'hocomoco_match', 'vierstra_similarity',
       'vierstra_match', 'selin_similarity', 'selin_match', 'annotation',
       'posbase_entropy_ratio', 'motif_entropy', 'dinuc_entropy_ratio',
       'copair_entropy_ratio', 'manual_annotation'],
      dtype='object')

In [11]:
mc_manual_annotations = mc_metadata.merge(mc_avg_manual_annotations, on=[leiden, 'annotation'], how='left')

mc_manual_annotations

Unnamed: 0,model,original_pattern,leiden_96,annotation,clustered_pattern,num_patterns,num_seqlets,num_samples,num_datasets,datasets,...,hocomoco_match,vierstra_similarity,vierstra_match,selin_similarity,selin_match,posbase_entropy_ratio,motif_entropy,dinuc_entropy_ratio,copair_entropy_ratio,manual_annotation
0,trevino_2021.c20,pos_patterns.pattern_0,10,NFI-1_10,pos_patterns.pattern_10,47,537160,47,3,"corces_2020,domcke_2020,trevino_2021",...,NFIX.H12CORE.0.SM.B,0.964340,NFIC_M09636_2.00,0.999624,NFI#1,1.253310,0.466839,1.401777,1.142712,NFI_1
1,trevino_2021.c20,pos_patterns.pattern_1,5,CTCF-1_5,pos_patterns.pattern_5,54,947567,54,3,"corces_2020,domcke_2020,trevino_2021",...,CTCF.H12CORE.0.P.B,0.983484,CTCFL_M09507_2.00,0.999548,CTCF#1,1.413051,0.530499,1.424199,1.240325,CTCF_1
2,trevino_2021.c20,pos_patterns.pattern_10,2,BZIP:ATF-CREB-1_2,pos_patterns.pattern_2,55,137965,53,3,"corces_2020,domcke_2020,trevino_2021",...,GMEB2.H12CORE.2.SM.B,0.973756,CREB1_M04258_2.00,0.998884,BZIP:ATF/CREB#1,1.127238,0.506504,1.292231,1.554268,ATF_1
3,trevino_2021.c20,pos_patterns.pattern_11,7,ETS:ELF-ETV-1_7,pos_patterns.pattern_7,53,163362,52,3,"corces_2020,domcke_2020,trevino_2021",...,ELK4.H12CORE.0.PSM.A,0.956144,ELK3_M04730_2.00,0.999409,ETS:ELF/ETV#1,1.272438,0.488163,1.458882,1.557328,ETS_2
4,trevino_2021.c20,pos_patterns.pattern_12,3,ZNF143-1_3,pos_patterns.pattern_3,54,92698,54,3,"corces_2020,domcke_2020,trevino_2021",...,ZN143.H12CORE.0.P.B,0.971501,ETS1_M07950_2.00,0.999409,ZNF143#1,1.445198,0.567049,1.474322,1.301145,ZNF143_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2483,corces_2020.Cluster24,pos_patterns.pattern_6,99,IRF-STAT_IRF-STAT-1_99,pos_patterns.pattern_83,2,6758,2,2,"corces_2020,trevino_2021",...,IRF3.H12CORE.0.PS.A,0.934069,STAT1_M08230_2.00,0.989269,IRF/STAT_IRF/STAT#1,1.423037,0.509113,1.896891,1.408563,IRF_2
2484,corces_2020.Cluster24,pos_patterns.pattern_7,4,NFY_4,pos_patterns.pattern_4,54,317250,54,3,"corces_2020,domcke_2020,trevino_2021",...,NFYA.H12CORE.0.P.B,0.984840,NFYA_MA0060.1,0.999510,NFY,1.144037,0.487531,1.526165,1.653990,NFY
2485,corces_2020.Cluster24,pos_patterns.pattern_8,19,BZIP:ATF-CREB-1_19,pos_patterns.pattern_17,29,32724,28,3,"corces_2020,domcke_2020,trevino_2021",...,BHE41.H12CORE.0.PSM.A,0.984382,BHLHE41_M02783_2.00,0.997725,BZIP:ATF/CREB#1,1.158009,0.504458,1.425264,1.851094,ARNT-USF
2486,corces_2020.Cluster24,pos_patterns.pattern_9,10,NFI-1_10,pos_patterns.pattern_10,47,537160,47,3,"corces_2020,domcke_2020,trevino_2021",...,NFIX.H12CORE.0.SM.B,0.964340,NFIC_M09636_2.00,0.999624,NFI#1,1.253310,0.466839,1.401777,1.142712,NFI_1


In [12]:
# Filter out motifs that have nan manual annotations
mc_manual_annotations_filtered = mc_manual_annotations.loc[~mc_manual_annotations['manual_annotation'].isna()].copy()

mc_manual_annotations_filtered = mc_manual_annotations_filtered.loc[(~mc_manual_annotations_filtered['manual_annotation'].str.startswith('Unknown')) &
                                                                    (~mc_manual_annotations_filtered['manual_annotation'].str.contains('repeat'))]

mc_manual_annotations_filtered

Unnamed: 0,model,original_pattern,leiden_96,annotation,clustered_pattern,num_patterns,num_seqlets,num_samples,num_datasets,datasets,...,hocomoco_match,vierstra_similarity,vierstra_match,selin_similarity,selin_match,posbase_entropy_ratio,motif_entropy,dinuc_entropy_ratio,copair_entropy_ratio,manual_annotation
0,trevino_2021.c20,pos_patterns.pattern_0,10,NFI-1_10,pos_patterns.pattern_10,47,537160,47,3,"corces_2020,domcke_2020,trevino_2021",...,NFIX.H12CORE.0.SM.B,0.964340,NFIC_M09636_2.00,0.999624,NFI#1,1.253310,0.466839,1.401777,1.142712,NFI_1
1,trevino_2021.c20,pos_patterns.pattern_1,5,CTCF-1_5,pos_patterns.pattern_5,54,947567,54,3,"corces_2020,domcke_2020,trevino_2021",...,CTCF.H12CORE.0.P.B,0.983484,CTCFL_M09507_2.00,0.999548,CTCF#1,1.413051,0.530499,1.424199,1.240325,CTCF_1
2,trevino_2021.c20,pos_patterns.pattern_10,2,BZIP:ATF-CREB-1_2,pos_patterns.pattern_2,55,137965,53,3,"corces_2020,domcke_2020,trevino_2021",...,GMEB2.H12CORE.2.SM.B,0.973756,CREB1_M04258_2.00,0.998884,BZIP:ATF/CREB#1,1.127238,0.506504,1.292231,1.554268,ATF_1
3,trevino_2021.c20,pos_patterns.pattern_11,7,ETS:ELF-ETV-1_7,pos_patterns.pattern_7,53,163362,52,3,"corces_2020,domcke_2020,trevino_2021",...,ELK4.H12CORE.0.PSM.A,0.956144,ELK3_M04730_2.00,0.999409,ETS:ELF/ETV#1,1.272438,0.488163,1.458882,1.557328,ETS_2
4,trevino_2021.c20,pos_patterns.pattern_12,3,ZNF143-1_3,pos_patterns.pattern_3,54,92698,54,3,"corces_2020,domcke_2020,trevino_2021",...,ZN143.H12CORE.0.P.B,0.971501,ETS1_M07950_2.00,0.999409,ZNF143#1,1.445198,0.567049,1.474322,1.301145,ZNF143_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2483,corces_2020.Cluster24,pos_patterns.pattern_6,99,IRF-STAT_IRF-STAT-1_99,pos_patterns.pattern_83,2,6758,2,2,"corces_2020,trevino_2021",...,IRF3.H12CORE.0.PS.A,0.934069,STAT1_M08230_2.00,0.989269,IRF/STAT_IRF/STAT#1,1.423037,0.509113,1.896891,1.408563,IRF_2
2484,corces_2020.Cluster24,pos_patterns.pattern_7,4,NFY_4,pos_patterns.pattern_4,54,317250,54,3,"corces_2020,domcke_2020,trevino_2021",...,NFYA.H12CORE.0.P.B,0.984840,NFYA_MA0060.1,0.999510,NFY,1.144037,0.487531,1.526165,1.653990,NFY
2485,corces_2020.Cluster24,pos_patterns.pattern_8,19,BZIP:ATF-CREB-1_19,pos_patterns.pattern_17,29,32724,28,3,"corces_2020,domcke_2020,trevino_2021",...,BHE41.H12CORE.0.PSM.A,0.984382,BHLHE41_M02783_2.00,0.997725,BZIP:ATF/CREB#1,1.158009,0.504458,1.425264,1.851094,ARNT-USF
2486,corces_2020.Cluster24,pos_patterns.pattern_9,10,NFI-1_10,pos_patterns.pattern_10,47,537160,47,3,"corces_2020,domcke_2020,trevino_2021",...,NFIX.H12CORE.0.SM.B,0.964340,NFIC_M09636_2.00,0.999624,NFI#1,1.253310,0.466839,1.401777,1.142712,NFI_1


In [13]:
mc_manual_annotations_filtered['manual_annotation'].nunique()

155

In [14]:
# Filter out motifs that have nan manual annotations
mc_avg_manual_annotations_filtered = mc_avg_manual_annotations.loc[~mc_avg_manual_annotations['manual_annotation'].isna()].copy()

mc_avg_manual_annotations_filtered = mc_avg_manual_annotations_filtered.loc[(~mc_avg_manual_annotations_filtered['manual_annotation'].str.startswith('Unknown')) &
                                                                            (~mc_avg_manual_annotations_filtered['manual_annotation'].str.contains('repeat'))]

mc_avg_manual_annotations_filtered

Unnamed: 0,leiden_96,clustered_pattern,num_patterns,num_seqlets,num_samples,num_datasets,datasets,posneg,hocomoco_similarity,hocomoco_match,vierstra_similarity,vierstra_match,selin_similarity,selin_match,annotation,posbase_entropy_ratio,motif_entropy,dinuc_entropy_ratio,copair_entropy_ratio,manual_annotation
0,0,pos_patterns.pattern_0,74,194827,53,3,"corces_2020,domcke_2020,trevino_2021",pos,0.914414,NRF1.H12CORE.0.PS.A,0.951238,NRF1_M09443_2.00,0.998523,NRF1,NRF1_0,1.543807,0.516517,2.845492,1.488471,NRF1
1,1,pos_patterns.pattern_1,58,372948,54,3,"corces_2020,domcke_2020,trevino_2021",pos,0.969925,SP4.H12CORE.0.P.C,0.944662,SP4_M08296_2.00,0.998351,SP/KLF,SP-KLF_1,2.061026,0.510798,1.740413,1.639470,SP-KLF_1
2,2,pos_patterns.pattern_2,55,137965,53,3,"corces_2020,domcke_2020,trevino_2021",pos,0.937175,GMEB2.H12CORE.2.SM.B,0.973756,CREB1_M04258_2.00,0.998884,BZIP:ATF/CREB#1,BZIP:ATF-CREB-1_2,1.127238,0.506504,1.292231,1.554268,ATF_1
3,3,pos_patterns.pattern_3,54,92698,54,3,"corces_2020,domcke_2020,trevino_2021",pos,0.981210,ZN143.H12CORE.0.P.B,0.971501,ETS1_M07950_2.00,0.999409,ZNF143#1,ZNF143-1_3,1.445198,0.567049,1.474322,1.301145,ZNF143_1
4,4,pos_patterns.pattern_4,54,317250,54,3,"corces_2020,domcke_2020,trevino_2021",pos,0.978794,NFYA.H12CORE.0.P.B,0.984840,NFYA_MA0060.1,0.999510,NFY,NFY_4,1.144037,0.487531,1.526165,1.653990,NFY
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
942,942,pos_patterns.pattern_680,1,1230,1,1,corces_2020,pos,0.975248,PO2F3.H12CORE.0.PS.A,0.883825,VSX2_M05039_2.00,0.957176,HD#1,PO2F3.H12CORE.0.PS.A_942,1.069282,0.486243,1.177785,1.375409,LHX_2
943,943,pos_patterns.pattern_681,1,1165,1,1,corces_2020,pos,0.834702,HME2.H12CORE.0.SM.B,0.854191,LMX1B_M05154_2.00,0.867289,HD#1,Unknown_943,1.114464,0.502105,1.168189,1.296962,POU4F_1
968,968,pos_patterns.pattern_694,1,3789,1,1,corces_2020,pos,0.969082,RUNX2.H12CORE.0.P.B,0.982315,RUNX3_M09369_2.00,0.994932,RUNX#1,RUNX-1_968,1.562398,0.501254,1.741838,1.853814,RUNX
969,969,pos_patterns.pattern_695,1,3293,1,1,corces_2020,pos,0.893427,ETV6.H12CORE.1.P.B,0.948562,ETV4_M04798_2.00,0.991183,ETS:ELF/SPIB#1,ETS:ELF-SPIB-1_969,1.728998,0.499603,2.168055,1.546192,ETV_1


In [15]:
mc_avg_manual_annotations_filtered['manual_annotation'].nunique()

155

In [16]:
original_pattern_to_cluster = { (row['model'], row['original_pattern']): row['manual_annotation'] for _, row in mc_manual_annotations_filtered.iterrows() }

original_pattern_to_cluster

{('trevino_2021.c20', 'pos_patterns.pattern_0'): 'NFI_1',
 ('trevino_2021.c20', 'pos_patterns.pattern_1'): 'CTCF_1',
 ('trevino_2021.c20', 'pos_patterns.pattern_10'): 'ATF_1',
 ('trevino_2021.c20', 'pos_patterns.pattern_11'): 'ETS_2',
 ('trevino_2021.c20', 'pos_patterns.pattern_12'): 'ZNF143_1',
 ('trevino_2021.c20', 'pos_patterns.pattern_13'): 'POU_1',
 ('trevino_2021.c20', 'pos_patterns.pattern_14'): 'POU2F_1',
 ('trevino_2021.c20', 'pos_patterns.pattern_15'): 'YY1',
 ('trevino_2021.c20', 'pos_patterns.pattern_16'): 'POU4F_2',
 ('trevino_2021.c20', 'pos_patterns.pattern_17'): 'SP-KLF_2',
 ('trevino_2021.c20', 'pos_patterns.pattern_18'): 'ZBTB33',
 ('trevino_2021.c20', 'pos_patterns.pattern_19'): 'LHX::LHX_4',
 ('trevino_2021.c20', 'pos_patterns.pattern_2'): 'NFI-half_1',
 ('trevino_2021.c20', 'pos_patterns.pattern_20'): 'ETS::ATF_1',
 ('trevino_2021.c20', 'pos_patterns.pattern_21'): 'ZBTB33',
 ('trevino_2021.c20', 'pos_patterns.pattern_22'): 'RFX_2',
 ('trevino_2021.c20', 'pos_patter

In [17]:
merged_pattern_to_cluster = {row['clustered_pattern']: row['manual_annotation'] for _, row in mc_avg_manual_annotations_filtered.iterrows()}

merged_pattern_to_cluster

{'pos_patterns.pattern_0': 'NRF1',
 'pos_patterns.pattern_1': 'SP-KLF_1',
 'pos_patterns.pattern_2': 'ATF_1',
 'pos_patterns.pattern_3': 'ZNF143_1',
 'pos_patterns.pattern_4': 'NFY',
 'pos_patterns.pattern_5': 'CTCF_1',
 'pos_patterns.pattern_6': 'ZBTB33',
 'pos_patterns.pattern_7': 'ETS_2',
 'pos_patterns.pattern_8': 'RFX_1',
 'pos_patterns.pattern_9': 'ETS::ATF_1',
 'pos_patterns.pattern_10': 'NFI_1',
 'neg_patterns.pattern_0': 'ZEB-SNAI',
 'pos_patterns.pattern_11': 'YY1',
 'pos_patterns.pattern_12': 'RFX_2',
 'pos_patterns.pattern_13': 'ZNF143_2',
 'pos_patterns.pattern_14': 'SOX::SOX_1',
 'pos_patterns.pattern_15': 'NFI-half_1',
 'neg_patterns.pattern_1': 'BCL11',
 'pos_patterns.pattern_17': 'ARNT-USF',
 'pos_patterns.pattern_18': 'AP1',
 'pos_patterns.pattern_19': 'ATF_2',
 'pos_patterns.pattern_20': 'POU_1',
 'pos_patterns.pattern_21': 'LHX_1',
 'pos_patterns.pattern_22': 'BHLH_1',
 'pos_patterns.pattern_23': 'SP-KLF_2',
 'pos_patterns.pattern_24': 'POU2F_1',
 'pos_patterns.patt

In [18]:
def process_variant_motifs(file_info, original_pattern_to_cluster, merged_pattern_to_cluster):
    """Process original and clustered motif hits for variant alleles and construct a variant-by-motif table."""
    original_file_path, clustered_file_path, sample = file_info

    # Load original motif annotations
    original_df = pd.read_table(original_file_path)
    original_df = original_df[['peak_id', 'chr', 'variant_loc', 'motif_name', 'allele']]

    # Map motifs to clusters using (sample, motif_name) as the key
    original_df['motif_cluster'] = original_df.apply(lambda row: original_pattern_to_cluster.get((sample, row['motif_name'])), axis=1)
    
    # Remove motifs not found in the mapping
    original_df = original_df.dropna(subset=['motif_cluster'])
    original_df.drop(columns=['motif_name'], inplace=True)
    original_df.rename(columns={'motif_cluster': 'motif_name'}, inplace=True)

    # Unique motif clusters detected in the original file
    detected_clusters = set(original_df['motif_name'])

    ## ---- ##

    # Load clustered motif annotations
    clustered_df = pd.read_table(clustered_file_path)
    clustered_df = clustered_df[['peak_id', 'chr', 'variant_loc', 'motif_name', 'allele']]

    # Map motifs to clusters, using (sample, motif_name) as the key
    clustered_df['motif_cluster'] = clustered_df.apply(lambda row: merged_pattern_to_cluster.get(row['motif_name']), axis=1)

    # Remove motifs not found in the mapping
    clustered_df = clustered_df.dropna(subset=['motif_cluster'])
    clustered_df.drop(columns=['motif_name'], inplace=True)
    clustered_df.rename(columns={'motif_cluster': 'motif_name'}, inplace=True)

    # Remove motif clusters that were already found in the original file
    clustered_df = clustered_df[~clustered_df['motif_name'].isin(detected_clusters)]

    # Merge original and filtered clustered motif hits
    merged_df = pd.concat([original_df, clustered_df], ignore_index=True)

    merged_df = merged_df[~merged_df['motif_name'].isin(EXCLUDED_MOTIFS)]

    # Group and combine motif names per allele
    grouped = (
        merged_df.groupby(["chr", "variant_loc", "allele"])["motif_name"]
        .apply(lambda x: ", ".join(x))
        .reset_index()
    )

    # Pivot so each allele is its own column
    collapsed = grouped.pivot(index=["chr", "variant_loc"], columns="allele", values="motif_name").reset_index()

    # Rename columns for clarity
    collapsed = collapsed.rename(
        columns={
            "allele1": "motifs_allele1",
            "allele2": "motifs_allele2"
        }
    )

    # Combine all allele motifs
    collapsed['all_alleles'] = collapsed[['motifs_allele1', 'motifs_allele2']].apply(
        lambda row: ",".join([v for v in row if pd.notna(v)]),
        axis=1
    )

    # Update column names to include sample name
    collapsed.rename(
        columns={
            'chr': 'chr_hg38', 'variant_loc': 'pos_hg38',
            'motifs_allele1': f"{sample}.ref_allele_motifs",
            'motifs_allele2': f"{sample}.alt_allele_motifs",
            'all_alleles': f"{sample}.all_motifs"
        },
        inplace=True
    )

    return collapsed

In [19]:
file_infos = []

for dataset in os.listdir(data_dir):
    for sample in os.listdir(os.path.join(data_dir, dataset, 'all_data_all_patterns', leiden)):
        print(sample)
        file_infos.append((
            os.path.join(data_dir, dataset, 'original_modisco', sample, fold, score_type, alpha, 'variant_hit_calls.tsv'),  # Original file
            os.path.join(data_dir, dataset, 'all_data_all_patterns', leiden, sample, fold, score_type, alpha, 'variant_hit_calls.tsv'),  # Clustered file
            sample
        ))

trevino_2021.c20
trevino_2021.c19
trevino_2021.c9
trevino_2021.c16
trevino_2021.c4
trevino_2021.c1
trevino_2021.c10
trevino_2021.c11
trevino_2021.c8
trevino_2021.c14
trevino_2021.c12
trevino_2021.c13
trevino_2021.c21
trevino_2021.c18
trevino_2021.c17
trevino_2021.c5
trevino_2021.c3
trevino_2021.c0
trevino_2021.c7
trevino_2021.c15
trevino_2021.c6
trevino_2021.c2
domcke_2020.fetal_brain.SKOR2_NPSR1_positive_cells
domcke_2020.fetal_brain.Limbic_system_neurons
domcke_2020.fetal_brain.Astrocytes
domcke_2020.fetal_brain.Inhibitory_neurons
domcke_2020.fetal_brain.Excitatory_neurons
domcke_2020.fetal_brain.Vascular_endothelial_cells
domcke_2020.fetal_brain.Astrocytes_Oligodendrocytes
domcke_2020.fetal_brain.Cerebrum_Unknown_3
corces_2020.Cluster9
corces_2020.Cluster4
corces_2020.Cluster10
corces_2020.Cluster20
corces_2020.Cluster22
corces_2020.Cluster8
corces_2020.Cluster7
corces_2020.Cluster19
corces_2020.Cluster15
corces_2020.Cluster13
corces_2020.Cluster5
corces_2020.Cluster18
corces_2020.C

In [20]:
file_infos

[('/oak/stanford/groups/akundaje/projects/neuro-variants/finemo_variants/rare/K562_bias/trevino_2021/original_modisco/trevino_2021.c20/mean/counts/alpha_0.8/variant_hit_calls.tsv',
  '/oak/stanford/groups/akundaje/projects/neuro-variants/finemo_variants/rare/K562_bias/trevino_2021/all_data_all_patterns/leiden_96/trevino_2021.c20/mean/counts/alpha_0.8/variant_hit_calls.tsv',
  'trevino_2021.c20'),
 ('/oak/stanford/groups/akundaje/projects/neuro-variants/finemo_variants/rare/K562_bias/trevino_2021/original_modisco/trevino_2021.c19/mean/counts/alpha_0.8/variant_hit_calls.tsv',
  '/oak/stanford/groups/akundaje/projects/neuro-variants/finemo_variants/rare/K562_bias/trevino_2021/all_data_all_patterns/leiden_96/trevino_2021.c19/mean/counts/alpha_0.8/variant_hit_calls.tsv',
  'trevino_2021.c19'),
 ('/oak/stanford/groups/akundaje/projects/neuro-variants/finemo_variants/rare/K562_bias/trevino_2021/original_modisco/trevino_2021.c9/mean/counts/alpha_0.8/variant_hit_calls.tsv',
  '/oak/stanford/gro

In [21]:
# Process the files in parallel
with Pool(40) as pool:
    results = pool.starmap(process_variant_motifs, [(info, original_pattern_to_cluster, merged_pattern_to_cluster) for info in file_infos])

In [22]:
rare_variants = pd.read_table(rare_variants_file, names=['chr_hg38', 'pos_hg38', 'allele1', 'allele2', 'variant_id'])

rare_variants

Unnamed: 0,chr_hg38,pos_hg38,allele1,allele2,variant_id
0,chr1,101816427,A,T,chr1:101816427:A:T
1,chr1,10646593,A,G,chr1:10646593:A:G
2,chr1,108559923,G,C,chr1:108559923:G:C
3,chr1,108609660,G,A,chr1:108609660:G:A
4,chr1,108746573,G,A,chr1:108746573:G:A
...,...,...,...,...,...
1612,chr22,44201792,C,T,chr22:44201792:C:T
1613,chr22,45831660,T,A,chr22:45831660:T:A
1614,chr22,45831848,G,C,chr22:45831848:G:C
1615,chr22,49827389,A,C,chr22:49827389:A:C


In [23]:
# Combine results into a single DataFrame
variant_motifs = rare_variants.copy()
variant_motifs.rename(columns={'allele1': 'ref_allele', 'allele2': 'alt_allele'}, inplace=True)

for collapsed in results:
    variant_motifs = variant_motifs.merge(collapsed, on=['chr_hg38', 'pos_hg38'], how='left')

In [24]:
variant_motifs

Unnamed: 0,chr_hg38,pos_hg38,ref_allele,alt_allele,variant_id,trevino_2021.c20.ref_allele_motifs,trevino_2021.c20.alt_allele_motifs,trevino_2021.c20.all_motifs,trevino_2021.c19.ref_allele_motifs,trevino_2021.c19.alt_allele_motifs,...,corces_2020.Cluster23.all_motifs,corces_2020.Cluster17.ref_allele_motifs,corces_2020.Cluster17.alt_allele_motifs,corces_2020.Cluster17.all_motifs,corces_2020.Cluster11.ref_allele_motifs,corces_2020.Cluster11.alt_allele_motifs,corces_2020.Cluster11.all_motifs,corces_2020.Cluster24.ref_allele_motifs,corces_2020.Cluster24.alt_allele_motifs,corces_2020.Cluster24.all_motifs
0,chr1,101816427,A,T,chr1:101816427:A:T,,,,,,...,,,,,,,,,,
1,chr1,10646593,A,G,chr1:10646593:A:G,ZNF143_1,ZNF143_1,"ZNF143_1,ZNF143_1",ZNF143_1,ZNF143_1,...,"ZNF143_1,ZNF143_1",ZNF143_1,,ZNF143_1,ZNF143_1,,ZNF143_1,ZNF143_1,ZNF143_2,"ZNF143_1,ZNF143_2"
2,chr1,108559923,G,C,chr1:108559923:G:C,ATF_1,,ATF_1,ATF_1,SP-KLF_1,...,ZEB-SNAI,,,,ATF_1,,ATF_1,ATF_1,SP-KLF_1,"ATF_1,SP-KLF_1"
3,chr1,108609660,G,A,chr1:108609660:G:A,,,,ETS_1,ETS_1,...,,,,,,,,,,
4,chr1,108746573,G,A,chr1:108746573:G:A,"SP-KLF_1, ETS_2",,"SP-KLF_1, ETS_2",ETS_2,,...,SP-KLF_1,SP-KLF_1,,SP-KLF_1,"ETS_2, SP-KLF_1",,"ETS_2, SP-KLF_1",ETS_2,,ETS_2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1612,chr22,44201792,C,T,chr22:44201792:C:T,,,,,,...,ZEB-SNAI,,,,,ZEB-SNAI,ZEB-SNAI,,ZEB-SNAI,ZEB-SNAI
1613,chr22,45831660,T,A,chr22:45831660:T:A,,,,,,...,,,,,,"TEF, TEF","TEF, TEF",,,
1614,chr22,45831848,G,C,chr22:45831848:G:C,,,,,,...,,,,,,,,,,
1615,chr22,49827389,A,C,chr22:49827389:A:C,YY1,,YY1,YY1,,...,,,,,YY1,,YY1,YY1,,YY1


In [25]:
variant_motifs.to_csv(table_dir + '/rare_variants.motif_annotations.all_data_all_patterns.' + leiden + '.' + alpha + '.tsv',
                      sep='\t', index=False)

In [26]:
excitatory_neuron_clusters = ['corces_2020.Cluster1', 'corces_2020.Cluster3', 'corces_2020.Cluster4',
                                'trevino_2021.c0', 'trevino_2021.c1', 'trevino_2021.c2',
                                'trevino_2021.c5', 'trevino_2021.c6', 'trevino_2021.c7',
                                'trevino_2021.c13', 'trevino_2021.c14', 'trevino_2021.c18',
                                'domcke_2020.fetal_brain.Excitatory_neurons']

all_variants = variant_motifs['variant_id'].unique()

# Dictionary to store the final binary matrices for each cell type
binary_matrices = {}

for cluster in excitatory_neuron_clusters:
    col_name = f"{cluster}.all_motifs"
    
    # If the column doesn't exist in df, skip or handle accordingly
    if col_name not in variant_motifs.columns:
        continue
    
    # 1) Subset to variant_id and the cluster's motif column
    sub_df = variant_motifs[['variant_id', col_name]].copy()
    
    # 2) Split on commas and explode
    sub_df[col_name] = sub_df[col_name].fillna('').str.split(',')
    sub_df = sub_df.explode(col_name)
    sub_df[col_name] = sub_df[col_name].str.strip()
    
    # 3) Remove any empty entries (which can appear if the cell was NaN)
    sub_df = sub_df[sub_df[col_name] != '']
    
    # 4) Pivot to create a binary matrix
    #    Assign an indicator column "ind=1" for presence
    binary_matrix = (
        sub_df
        .assign(ind=1)
        .pivot_table(index='variant_id', 
                     columns=col_name, 
                     values='ind', 
                     fill_value=0)
    )
    
    binary_matrix = binary_matrix.reindex(all_variants, fill_value=0)
    binary_matrix.index.name = None
    binary_matrix.columns.name = None

    # Store in the dictionary
    binary_matrices[cluster] = binary_matrix

# Find union of columns across all binary matrices
all_motifs = set()
for mat in binary_matrices.values():
    all_motifs |= set(mat.columns)

all_motifs = sorted(all_motifs)

# Reindex each DataFrame so they share the same motif columns
for cluster, mat in binary_matrices.items():
    binary_matrices[cluster] = mat.reindex(columns=all_motifs, fill_value=0)
    binary_matrices[cluster]

for dataset in ['corces_2020', 'trevino_2021', 'domcke_2020']: #, 'ameen_2022', 'encode_2024']:
    os.makedirs(variant_motif_matrices_dir + '/all_data_all_patterns/' + leiden + '/' + alpha + '/' + dataset, exist_ok=True)
    for cluster in [i for i in binary_matrices if i.startswith(dataset)]:
        binary_matrices[cluster].to_csv(variant_motif_matrices_dir + '/all_data_all_patterns/' + leiden + '/' + alpha + '/' + dataset + '/' + cluster + '.tsv',
                                        sep='\t')

In [27]:
binary_matrices['corces_2020.Cluster1']['NDF-ATOH_1'].sum()

64.0

In [28]:
binary_matrices['trevino_2021.c0']['NDF-ATOH_1'].sum()

158.0

In [29]:
binary_matrices['domcke_2020.fetal_brain.Excitatory_neurons']['NDF-ATOH_1'].sum()

160.0