# Feature enrichments

In [None]:
# Input locations, given as relative paths.
path_to_tes = "dat/TEs"
path_to_rt_results = "dat/repdomains"
path_to_sc_dat = "kind_et_al/2015"
hg19_to_hg38 = "liftOver/hg19ToHg38.over.chain.gz"
path_to_lad_calls = "LADs_KDDs/hmm_calls_3states"
chromhmm_path = "chromHMM/hg38"

# Cell types whose pre-computed enrichment results are collected.
cts = [
    "Liver",
    "CardiacMyocytes",
    "EndoProgenitor",
    "D5Midbrain",
    "H9ESC",
    "Epicardium",
    "MidHindgut",
    "ParaxMesoderm",
    "EarlySomite",
    "DefEctoderm",
    "BorderEctoderm",
    "D4Artery",
    "APS",
]

def _load_enrichment_inputs(enr_results_path, enr_feature, ct, lad_label):
    """Read the permutation table and observed counts for one cell type / LAD type."""
    prefix = f'{enr_results_path}/{enr_feature}_{ct}_{lad_label}'
    # Exact zeros anywhere in the table are replaced by 1e-5 before any statistic
    # is computed.
    null_dat = pd.read_csv(f'{prefix}_enrichment_results.tsv',
                           sep='\t').replace(0.0, 0.00001)
    with open(f'{prefix}_actual_intersections.json', 'r') as f:
        observed = json.load(f)
    # One pass over the table instead of one query per class; sort=False keeps
    # the classes in order of first appearance.
    null_by_class = {
        te_class: grp['n_intersections']
        for te_class, grp in null_dat.groupby('TE_class', sort=False)
    }
    return null_by_class, observed


def _zscore_and_pvalue(null_counts, observed):
    """Return (z-score, one-sided empirical p-value) of `observed` against `null_counts`."""
    n_perm = len(null_counts)
    if n_perm == 0:
        # No permutations for this class: nothing to compare against.
        return np.nan, np.nan
    mu = np.float64(null_counts.mean())
    sd = np.float64(null_counts.std(ddof=0))  # population SD
    # A zero SD yields +/-inf or NaN; those rows are filtered out by the caller.
    with np.errstate(divide='ignore', invalid='ignore'):
        z = (observed - mu) / sd
    # Test the tail on the side of the observed deviation.
    if z > 0.00001:
        n_extreme = (null_counts > observed).sum()
    else:
        n_extreme = (null_counts < observed).sum()
    return z, n_extreme / n_perm


def get_enrichment_df(enr_results_path, enr_feature, celltypes=None):
    """
    Collects enrichment results (pre-computed) into a dataframe for plotting.

    For every cell type and for both LAD types the function reads
    ``{enr_results_path}/{enr_feature}_{ct}_{T1LADs|T2LADs}_enrichment_results.tsv``
    (columns ``TE_class`` and ``n_intersections``) and the matching
    ``..._actual_intersections.json`` (mapping of class -> observed count).

    Parameters
    ----------
    enr_results_path : str
        Directory holding the result files.
    enr_feature : str
        Feature prefix used in the file names.
    celltypes : iterable of str, optional
        Cell types to collect. Defaults to the module-level ``cts`` list,
        looked up at call time.

    Returns
    -------
    pandas.DataFrame
        One row per (cell type, class, LAD type) with columns ``p_value``,
        ``zscore``, ``LAD_type``, ``TE_class``, ``celltype`` and ``-log10p``.
        Rows whose z-score is NaN or infinite are dropped.

    Raises
    ------
    KeyError
        If a class present in the T1 table is missing from a JSON file.
    """
    if celltypes is None:
        celltypes = cts

    records = []
    no_permutations = pd.Series(dtype=float)

    for ct in celltypes:
        null_t1, observed_t1 = _load_enrichment_inputs(
            enr_results_path, enr_feature, ct, 'T1LADs')
        null_t2, observed_t2 = _load_enrichment_inputs(
            enr_results_path, enr_feature, ct, 'T2LADs')

        # The column is called TE_class for historical reasons; the same
        # layout is used for every feature assessed for enrichment.
        for te_class in null_t1:
            for lad_type, null_by_class, observed in (
                    ('T1-LAD', null_t1, observed_t1),
                    ('T2-LAD', null_t2, observed_t2)):
                z, p = _zscore_and_pvalue(
                    null_by_class.get(te_class, no_permutations),
                    observed[te_class])
                records.append((p, z, lad_type, te_class, ct))

    zpdf = pd.DataFrame(
        records,
        columns=['p_value', 'zscore', 'LAD_type', 'TE_class', 'celltype'])
    zpdf[['p_value', 'zscore']] = zpdf[['p_value', 'zscore']].astype(float)

    zpdf = zpdf.dropna()
    # Comparing the float column with the strings "inf" / "-inf" never removed
    # anything; test for finiteness numerically instead.
    zpdf = zpdf[np.isfinite(zpdf['zscore'])].copy()

    with np.errstate(divide='ignore'):
        zpdf['-log10p'] = -np.log10(zpdf['p_value'])
    # NOTE(review): p == 0 gives an infinite -log10p, which is mapped to 1.0
    # (the same value as p == 0.1). Kept as in the existing analysis - confirm
    # this is the intended marker size for the most extreme cases.
    zpdf['-log10p'] = zpdf['-log10p'].replace(np.inf, 1.0)
    return zpdf

In [None]:
# Collect the enrichment tables: TE families and replication timing domains.
zpdf_tes = get_enrichment_df(path_to_tes, 'TEs_by_fam')
zpdf_rt = get_enrichment_df(path_to_rt_results, 'RTs')

# Keep TE families whose enrichment pattern is the same in every cell type:
# everything enriched, everything depleted, or T1-LADs and T2-LADs
# consistently pointing in opposite directions.
te_cs = zpdf_tes['TE_class'].unique()

for_include = []
for te in te_cs:
    te_rows = zpdf_tes[zpdf_tes['TE_class'] == te]
    z_all = te_rows['zscore']
    z_t1 = te_rows.loc[te_rows['LAD_type'] == 'T1-LAD', 'zscore']
    z_t2 = te_rows.loc[te_rows['LAD_type'] == 'T2-LAD', 'zscore']

    same_direction = (z_all > 0).all() or (z_all < 0).all()
    t1_up_t2_down = (z_t1 > 0).all() and (z_t2 < 0).all()
    t1_down_t2_up = (z_t1 < 0).all() and (z_t2 > 0).all()

    if same_direction or t1_up_t2_down or t1_down_t2_up:
        for_include.append(te)

# Drop uncertain families (name ending in '?') and explicitly excluded ones.
exclude = ['Unknown']
for_include = [fam for fam in for_include
               if not fam.endswith('?') and fam not in exclude]

selected_tes = zpdf_tes[zpdf_tes['TE_class'].isin(for_include)].copy()
zpdf_tog = pd.concat([selected_tes, zpdf_rt], sort=False)

In [None]:
# Bubble plot of the collected z-scores: one row per feature class, colour by
# cell type, marker style by LAD type, marker size by -log10(p).
fig, ax = plt.subplots(figsize=(6, 4))
g = sns.scatterplot(x='zscore', y='TE_class', data=zpdf_tog, size='-log10p',
                    hue='celltype', style='LAD_type', legend='brief',
                    palette='colorblind', sizes=(20, 200))
plt.xticks(rotation=90)
plt.legend(bbox_to_anchor=(1.04, 1), loc="upper left")
# The first argument of grid() was renamed from `b` to `visible`, so the old
# keyword spelling fails on current Matplotlib. Passing it positionally works
# on every version and keeps the same behaviour (None toggles the grid).
plt.grid(None)
plt.ylabel('')
plt.xlabel('Z-score')

# CTCF at boundaries

In [None]:
# add CTCF

def load_ctcf_dat(path, rtype):
    """
    Read a wide CTCF signal table and reshape it to long format.

    The first line of the file is skipped and the second is consumed as a
    header. The first two columns are renamed to 'CTCF' and `rtype`, and the
    remaining columns are numbered 0..n-1 and treated as bins. Missing values
    and '--' entries are replaced by 0.0.

    Returns a dataframe with one row per (input row, bin) and the columns
    'CTCF_sig' (float), 'CTCF', `rtype` and 'bin'. For the two label columns
    only the text before the first underscore is kept.
    """
    wide = pd.read_table(path, skiprows=1, header=0, low_memory=False)
    wide = wide.replace(np.nan, 0.0).replace('--', 0.0)

    n_bins = wide.shape[1] - 2
    wide.columns = ['CTCF', rtype] + list(range(n_bins))

    per_row = []
    for _, record in wide.iterrows():
        # Everything after the two label columns is the per-bin signal.
        profile = record.iloc[2:].to_frame(name='CTCF_sig')
        profile['CTCF'] = record['CTCF'].split('_')[0]
        profile[rtype] = record[rtype].split('_')[0]
        profile['bin'] = profile.index
        per_row.append(profile)

    long_format = pd.concat(per_row, sort=False)
    long_format['CTCF_sig'] = long_format['CTCF_sig'].astype(float)
    return long_format

max_bin = 200


def _load_boundary_profile(transition, ct, rtype, mirror=False):
    """Load one CTCF table, keep bins below max_bin and optionally reverse the bin order."""
    bin_limit = max_bin
    profile = load_ctcf_dat(f'ctcf_reanalysis/{transition}_{ct}_for_plot.tsv',
                            rtype).query('bin < @bin_limit')
    if mirror:
        # Reverse the bin labels so this profile runs in the opposite direction.
        profile['bin'] = profile['bin'].tolist()[::-1]
    return profile


def _collect_boundary_profiles(first, second, rtype, cat):
    """Build the per-cell-type list of merged profiles for one boundary type.

    `first` and `second` are (transition, mirror) pairs, concatenated in that order.
    """
    collected = []
    for ct in ['escs', 'parax_mesoderm', 'cms']:
        pair = [_load_boundary_profile(transition, ct, rtype, mirror=mirror)
                for transition, mirror in (first, second)]
        merged = pd.concat(pair, sort=False).sort_values(by='bin', ascending=False)
        merged['cat'] = cat
        merged['ct'] = ct
        collected.append(merged)
    return collected


# T1 to non
non_to_t1_all = _collect_boundary_profiles(
    ('t1_to_non', True), ('non_to_t1', False), 'T1_to_non', 'T1_to_non')
non_to_t1_all_cts = pd.concat(non_to_t1_all, sort=False)

# T2 to non
non_to_t2_all = _collect_boundary_profiles(
    ('t2_to_non', True), ('non_to_t2', False), 'T2_to_non', 'T2_to_non')
non_to_t2_all_cts = pd.concat(non_to_t2_all, sort=False)

# T2 to T1 (here the unmirrored profile comes first in the concatenation)
t2_to_t1_all = _collect_boundary_profiles(
    ('t2_to_t1', False), ('t1_to_t2', True), 'T2_to_T1', 'T2_to_t1')
t2_to_t1_all_cts = pd.concat(t2_to_t1_all, sort=False)

# Display name -> tag used in the ctcf_reanalysis file names.
ctr = {
    'Cardiac myocytes':'cms',
    'Paraxial mesoderm':'parax_mesoderm',
    'ESCs':'escs'
}

# characterize number of types of regions per cell type

# Each boundary category is the union of two BED files, one per direction.
_boundary_kinds = [
    ('nonLAD to T1-LAD', ('t1_to_non', 'non_to_t1')),
    ('nonLAD to T2-LAD', ('t2_to_non', 'non_to_t2')),
    ('T2-LAD to T1-LAD', ('t2_to_t1', 't1_to_t2')),
]

cats = []
# Deliberately NOT named `cts`: that module-level list is read by
# get_enrichment_df at call time, and rebinding it here would break any
# later (re-)run of the enrichment collection.
boundary_cts = []
n_bs = []

for ct in ['ESCs','Paraxial mesoderm','Cardiac myocytes']:
    ctr_ = ctr[ct]
    for label, transitions in _boundary_kinds:
        n_boundaries = 0
        for transition in transitions:
            intervals = BedTool(f'ctcf_reanalysis/{transition}_{ctr_}.bed').to_dataframe()
            n_boundaries += len(intervals)
        cats.append(label)
        boundary_cts.append(ct)
        n_bs.append(n_boundaries)

n_doms_df = pd.DataFrame({
    'category':cats,
    'cell_type':boundary_cts,
    'n_boundaries':n_bs
})


def _plot_ctcf_boundary(ax, profiles, left_label, right_label):
    """Draw mean CTCF signal per bin (SD band) on `ax`, with labelled arrows on both sides."""
    display_names = {'escs':'ESCs',
                     'cms':'Cardiac myocytes',
                     'parax_mesoderm':'Paraxial mesoderm'}
    lines = sns.lineplot(x='bin', y='CTCF_sig', data=profiles.replace(display_names),
                         hue='ct', ci='sd', alpha=0.5, estimator='mean',
                         hue_order=['ESCs','Paraxial mesoderm','Cardiac myocytes'],
                         ax=ax)

    lines.set(ylabel='CTCF binding [rpkm]', xlabel='genomic position')
    ax.legend(frameon=False, bbox_to_anchor=(1., -0.2))
    ax.set_xticks([0, 100, 200])
    ax.set_xticklabels(['-50kb','boundary','+50kb'])
    # Dashed line at bin 100, which the tick labels call the boundary.
    ax.axvline(100, linestyle='--', color='grey')

    current_axes = plt.gca()
    for side in ('top', 'right'):
        current_axes.spines[side].set_visible(False)

    # One arrow plus caption per side: (arrow tip x, arrow tail x, text x, caption).
    arrow_specs = ((0.1, 0.4, 0.3, left_label),
                   (0.9, 0.6, 0.7, right_label))
    for tip_x, tail_x, text_x, caption in arrow_specs:
        ax.annotate('',
                    xy=(tip_x, .98), xycoords='axes fraction',
                    xytext=(tail_x, .98), textcoords='axes fraction',
                    arrowprops=dict(facecolor='grey', shrink=0.05, alpha=0.5),
                    horizontalalignment='right', verticalalignment='top')
        ax.text(text_x, 1.1, caption, fontsize=12, transform=ax.transAxes,
                horizontalalignment='center',
                verticalalignment='center')
    return lines


# CTCF nonLAD --> T2-LAD
axC = plt.subplot(gs[1, 1])
c = _plot_ctcf_boundary(axC, non_to_t2_all_cts, 'nonLAD', 'T2-LAD')

# CTCF T2-LAD --> T1-LAD
axD = plt.subplot(gs[1, 2])
d = _plot_ctcf_boundary(axD, t2_to_t1_all_cts, 'T2-LAD', 'T1-LAD')

# Single cell

In [None]:
# Kind et al. 2015 single-cell table (tab-separated).
dat = pd.read_csv(f'{path_to_sc_dat}/GSE68260_Clone.5-5.1N.OE_LP150415.txt',
                  sep='\t')

# Percentage of cells with a positive value at each locus, used as the share of
# cells with LB1 association. The columns from the fifth up to (not including)
# the last are counted.
# NOTE(review): 172.0 is presumably the number of cells - confirm it matches
# the number of columns selected here.
per_cell_columns = dat.columns[4:-1]
n_cells_positive = (dat[per_cell_columns] > 0.0).sum(axis=1)
dat['perc_cells_LB1'] = 100.0 * (n_cells_positive / 172.0)

# Lift the per-locus percentages over from hg19 to hg38 and sort.
sc_intervals = dat[['seqnames','start','end','perc_cells_LB1']]
bed_hg38 = BedTool.from_dataframe(sc_intervals).liftover(hg19_to_hg38).sort()

def assign_categories(dat, cat_names, cols=('score0', 'score1')):
    """
    Assign categories to specific HMM states
    based on the mean ChIP-seq coverage
    in bins assigned to that state.

    Parameters
    ----------
    dat : pandas.DataFrame
        Binned data with HMM predicted states in the column 'hmm_pred'
        (generated by lad_utils.hmm_predict()) and the score columns
        listed in `cols`. A 'category' column is added in place.
    cat_names : sequence of str
        Category names, ordered by ascending sequencing coverage: the first
        name goes to the state with the lowest mean score, and so on.
    cols : sequence of str, optional
        Score columns to average; defaults to the two replicate columns.

    Returns
    -------
    pandas.DataFrame
        `dat` itself, with the added 'category' column.

    Raises
    ------
    ValueError
        If the number of distinct states differs from the number of
        category names.
    """
    n_states = len(dat['hmm_pred'].unique())
    n_cats = len(cat_names)
    if n_states != n_cats:
        # Returning None here only postponed the failure to the caller's next
        # line; fail at the source with a clear message instead.
        raise ValueError(f'{n_states} states in data, but {n_cats} categories provided. '
                         f'Number of categories must match the number '
                         f'of HMM states.')

    # Mean over the per-column means of the score columns, per state.
    state_means = dat.groupby('hmm_pred')[list(cols)].mean().mean(axis=1)

    # Rank the states themselves rather than looking states up by their mean:
    # a mean -> state dictionary silently loses a state when two means tie.
    ranked_states = state_means.sort_values(kind='stable').index

    state_to_name = dict(zip(ranked_states, cat_names))
    dat['category'] = dat['hmm_pred'].map(state_to_name)
    return dat

n_states = 3

# Paraxial mesoderm is the closest cell type we have
lad_call_columns = ['chrom','start','stop','score0','score1','hmm_pred']
lb1_dat = pd.read_table(f'{path_to_lad_calls}/ParaxMesoderm.tsv',
                        header=0, names=lad_call_columns)
lb1_dat = assign_categories(lb1_dat.copy(), cat_names=['nonLAD','T2-LAD','T1-LAD'])
# The row index doubles as a bin identifier for the round trip through BED.
lb1_dat['id'] = lb1_dat.index
lb1_dat_bed = BedTool.from_dataframe(lb1_dat[['chrom','start','stop','id']]).sort()

# Intersect LB1 bins with the single-cell data. Of the resulting columns,
# 'name' carries the bin id and 'thickEnd' is used as the percentage of cells.
lb1_sc_overlaps = lb1_dat_bed.intersect(bed_hg38, wa=True, wb=True).to_dataframe()
lb1_scdat = lb1_sc_overlaps[['name','thickEnd']].rename(
    columns={'name': 'id', 'thickEnd': 'perc_cells_LB1'})

# Merge back to the LB1 data; unmatched bins get 0.0 through fillna.
lb1_w_sc = lb1_dat.merge(lb1_scdat, on='id', how='left').fillna(0.0)

sns.violinplot(x='perc_cells_LB1', y='category',
               data=lb1_w_sc,
               order=['nonLAD','T2-LAD','T1-LAD'],
               scale='count', linewidth=1, inner='box')

# chromHMM results

In [None]:
cts = ['H9ESC','CardiacMyocytes', 'Liver', 'ParaxMesoderm']


def _read_chromhmm(epigenome_id):
    """Read the first four columns of one dense chromHMM BED file (first line skipped)."""
    return pd.read_table(f'{chromhmm_path}/{epigenome_id}_25_imputed12marks_hg38lift_dense.bed',
                         skiprows=1, header=None, usecols=range(4),
                         names=['chrom','start','stop','state'])


esc_chromhmm = _read_chromhmm('E008')
cm_chromhmm = _read_chromhmm('E095')
liver_chromhmm = _read_chromhmm('E066')
mesoderm_chromhmm = _read_chromhmm('E013')

# chromHMM segmentation used for each cell type.
chromhmms = {
    'H9ESC': esc_chromhmm,
    'CardiacMyocytes': cm_chromhmm,
    'Liver': liver_chromhmm,
    'ParaxMesoderm': mesoderm_chromhmm,
}

# LAD calls per cell type, with HMM states mapped to category names.
ct_lb1 = {}
for ct in cts:
    lb1_dat = pd.read_table(f'{path_to_lad_calls}/{ct}.tsv',
                            header=0,
                            names=['chrom','start','stop','score0','score1','hmm_pred'])
    lb1_dat = assign_categories(lb1_dat.copy(), cat_names=['nonLAD','T2-LAD','T1-LAD'])
    lb1_dat['id'] = lb1_dat.index
    ct_lb1[ct] = lb1_dat

# Parallel result lists, one entry per (cell type, LAD category, chromHMM state).
cts_for_plot = []
cats_for_plot = []
odds_ratios = []
pvals = []
chromhmm_cat = []

for ct in cts:
    chromhmm = chromhmms[ct]
    for lad_cat in ['T1-LAD','T2-LAD']:
        lad_rows = ct_lb1[ct].query('category == @lad_cat')[['chrom','start','stop','id']].copy()
        lad_bed = BedTool.from_dataframe(lad_rows).sort().set_chromsizes('hg38')

        all_cats = {}
        depleted = []
        enriched = []

        for cat in chromhmm['state'].unique():
            state_rows = chromhmm.query('state == @cat').copy()
            boi = BedTool.from_dataframe(state_rows).sort().set_chromsizes('hg38')
            out = str(boi.fisher(lad_bed))

            # The second-to-last line of the output holds the tab-separated
            # statistics: field 0 is used as the depletion p-value, field 1 as
            # the enrichment p-value and field 3 as the odds ratio.
            stats_fields = out.split('\n')[-2].split('\t')
            p_depleted = float(stats_fields[0])
            p_enriched = float(stats_fields[1])

            if p_depleted < 0.01:
                depleted.append(cat)
            elif p_enriched < 0.01:
                enriched.append(cat)

            pvals.append(p_enriched)
            cts_for_plot.append(ct)
            cats_for_plot.append(lad_cat)
            odds_ratios.append(float(stats_fields[3]))
            chromhmm_cat.append(cat)
            
# chromHMM state mnemonics -> readable labels. The last four keys repeat
# earlier entries with a trailing apostrophe in the mnemonic.
chromhmm_replace = {
    '1_TssA':'Active TSS',
    '2_PromU':'Promoter Upstream TSS',
    '3_PromD1':'Promoter Downstream TSS 1',
    '4_PromD2':'Promoter Downstream TSS 2',
    '5_Tx5':"Transcribed - 5' preferential",
    '6_Tx':'Strong transcription',
    '7_Tx3':"Transcribed - 3' preferential",
    '8_TxWk':'Weak transcription',
    '9_TxReg':'Transcribed & regulatory (Prom/Enh)',
    '10_TxEnh5':"Transcribed 5' preferential and Enh",
    '11_TxEnh3':"Transcribed 3' preferential and Enh",
    '12_TxEnhW':'Transcribed and Weak Enhancer',
    '13_EnhA1':'Active Enhancer 1',
    '14_EnhA2':'Active Enhancer 2',
    '15_EnhAF':'Active Enhancer Flank',
    '16_EnhW1':'Weak Enhancer 1',
    '17_EnhW2':'Weak Enhancer 2',
    '18_EnhAc':'Primary H3k27ac possible Enhancer',
    '19_DNase':'Primary DNase',
    '20_ZNF/Rpts':'ZNF genes & repeats',
    '21_Het':'Heterochromatin',
    '22_PromP':'Poised Promoter',
    '23_PromBiv':'Bivalent Promoter',
    '24_ReprPC':'Repressed Polycomb',
    '25_Quies':'Quiescent/Low',
    "10_TxEnh5'":"Transcribed 5' preferential and Enh",
    "11_TxEnh3'":"Transcribed 3' preferential and Enh",
    "5_Tx5'":"Transcribed - 5' preferential",
    "7_Tx3'":"Transcribed - 3' preferential"
}

# Long table of the Fisher test results; NaN entries are replaced by 1.0.
chromhmm_df_plot = pd.DataFrame({
    'celltype':cts_for_plot,
    'LAD_cat':cats_for_plot,
    'odds_ratio':odds_ratios,
    'pval':pvals,
    'chromHMM':chromhmm_cat
}).replace(np.nan, 1.0)

chromhmm_df_plot = chromhmm_df_plot.replace(chromhmm_replace)
chromhmm_df_plot['celltype & LAD category'] = chromhmm_df_plot['celltype'] + '_' + chromhmm_df_plot['LAD_cat']

# Heatmap matrix of odds ratios. pivot() is called with keywords because
# pandas 2.x no longer accepts index/columns/values positionally.
hm_chmm = chromhmm_df_plot[['celltype & LAD category','chromHMM','odds_ratio']].copy().pivot(
    index='celltype & LAD category', columns='chromHMM', values='odds_ratio')


# Matching matrix of annotations: '*' where the p-value is below 0.01.
chromhmm_df_plot['pval'] = chromhmm_df_plot['pval'].astype(float)
chromhmm_df_plot['pval_sig'] = chromhmm_df_plot['pval'] < 0.01
chromhmm_df_plot['pval_formatted'] = chromhmm_df_plot['pval_sig'].replace({True:'*', False:''})

hm_chmm2 = chromhmm_df_plot[['celltype & LAD category','chromHMM','pval_formatted']].copy().pivot(
    index='celltype & LAD category', columns='chromHMM', values='pval_formatted')

from matplotlib import cm

cb_pal4 = np.array(sns.color_palette(palette='colorblind')[:4])

cb_pal = np.array(['indigo','mediumpurple'])

# Row labels look like '<celltype>_<LAD category>'; split them once.
row_label_parts = hm_chmm.index.str.split('_')
row_celltypes = row_label_parts.str[0]
row_lad_cats = row_label_parts.str[1]

# Colour strip for the LAD category ...
lut = dict(zip(row_lad_cats.unique(), cb_pal))
row_colors = row_lad_cats.map(lut)

# ... and a second strip for the cell type.
lut = {
    'CardiacMyocytes':'tab:green',
    'H9ESC':'tab:blue',
    'Liver':'tab:cyan',
    'ParaxMesoderm':'tab:orange'
}
row_colors2 = row_celltypes.map(lut)

# Rows are clustered (cosine distance), columns keep their order; cells are
# annotated with the significance markers from hm_chmm2.
cmap = sns.clustermap(
    hm_chmm,
    cmap='viridis',
    col_cluster=False,
    row_cluster=True,
    row_colors=[row_colors2, row_colors],
    vmin=0.0,
    metric='cosine',
    figsize=(10,6),
    dendrogram_ratio=0.1,
    cbar_pos=(0.01, .05, .03, .2),
    cbar_kws={'label':'odds ratio'},
    colors_ratio=0.03,
    annot_kws={'rotation': 90},
    annot=hm_chmm2,
    center=3,
    fmt='s',
)



# H3K9me2 in LADs

In [None]:
# Combined per-bin table; 'EarlySomite' is renamed to match the display names
# used in `order` below.
density_dat = pd.read_csv('tables_out/all_dat_tog.tsv', sep='\t').replace('EarlySomite','Early somite')

# Average the two replicate columns for each mark.
density_dat['score_lb1'] = density_dat[['score0_lb1','score1_lb1']].mean(axis=1)
density_dat['score_k9'] = density_dat[['score0_k9','score1_k9']].mean(axis=1)

f, axB = plt.subplots(figsize=(6,8))

# NOTE(review): the plot reads 'score_h3k9me2', not the 'score_k9' column
# computed above - confirm that column exists in the table and is the one meant.
# `kind='violin'` was dropped: it is not a violinplot argument, and recent
# seaborn forwards unknown keywords to the drawing call, where it fails.
b = sns.violinplot(x='score_h3k9me2', y='cell_type', data=density_dat,
           hue='category_lb1',linewidth=0.5, inner=None, ax=axB,
                   hue_order=['T1-LAD','T2-LAD','nonLAD'],
          order=['ESCs','Mid-hindgut','Liver','Anterior primitive streak',
                'Paraxial mesoderm','Early somite','Artery progenitors',
                'Endothelial progenitors','Cardiac myocytes','Epicardium',
                'Definitive ectoderm','Border ectoderm','Midbrain progenitors'],
                  palette = mypal)
# NOTE(review): the legend is titled 'KDD category' although the hue is
# 'category_lb1' with LAD labels - confirm the title.
plt.legend(frameon=False, title='KDD category', bbox_to_anchor=(1, 1.15),
          ncol=1)
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
axB.set(ylabel='cell type', xlabel='H3K9me2 binding [log2(read counts)]')
plt.tight_layout()