# SFig. 3: HMM validation

In [None]:
def AIC(L, n_states, n_params=2):
    """Return the Akaike information criterion for an HMM.

    Args:
        L: Log-likelihood of the data under the fitted model.
        n_states: Number of hidden states in the model.
        n_params: Number of emission parameters per state. Defaults to 2,
            i.e. the mean and standard deviation of a normal distribution.

    Returns:
        ``2*k - 2*L`` where ``k = n_states**2 + n_params*n_states - 1``.
    """
    # k = emission parameters (n_params per state) plus n_states**2 - 1
    # further parameters (presumably the free transition/start
    # probabilities -- TODO confirm against the model definition).
    n_free = n_states**2 + n_params*n_states - 1
    return 2*n_free - 2*L

def BIC(L, n_states, n_obs, n_params=2):
    """Return the Bayesian information criterion for an HMM.

    Args:
        L: Log-likelihood of the data under the fitted model.
        n_states: Number of hidden states in the model.
        n_obs: Number of observations the likelihood was computed on.
        n_params: Number of emission parameters per state. Defaults to 2,
            i.e. the mean and standard deviation of a normal distribution.

    Returns:
        ``k*log(n_obs) - 2*L`` where
        ``k = n_params*n_states + n_states**2 - 1``.
    """
    # Same parameter count as used for the AIC: n_params emission parameters
    # per state plus n_states**2 - 1 further parameters (presumably the free
    # transition/start probabilities -- TODO confirm).
    n_free = n_params*n_states + n_states**2 - 1
    return n_free*np.log(n_obs) - 2*L

# Score the 2- to 5-state HMMs of every LB1 cell type with AIC and BIC, then
# plot how the AIC changes each time one more state is added.
bics = []
n_states_list = []
aics = []
cts = []

for ix, row in org_df.query('chip == "LB1"').iterrows():
    ct = row['cell_type']
    for n_states in [2, 3, 4, 5]:
        with open(f'/pollard/home/kathleen/projects/LADs/LADs_KDDs/hmms_no_umap_filter_{n_states}states/{ct}.json') as json_file:
            dat_model = HiddenMarkovModel.from_json(json.load(json_file))

        # header=0 together with names= replaces the file's own header row
        # with these column names.
        dat = pd.read_table(f'/pollard/home/kathleen/projects/LADs/LADs_KDDs/hmm_calls_no_umap_filter_{n_states}states/{ct}.tsv', names=['chrom','start','stop','score0','score1','hmm_pred'],
                           header=0)

        # Evaluate the log-likelihood once and reuse it for both criteria
        # instead of scoring the whole table twice.
        log_lik = dat_model.log_probability(dat[['score0','score1']].to_numpy())

        bics.append(BIC(log_lik, n_states, len(dat)))
        aics.append(AIC(log_lik, n_states))
        n_states_list.append(n_states)
        cts.append(ct)

# A cell type that occurs in several LB1 rows of org_df produces identical
# rows here; keep a single copy of each.
tog = pd.DataFrame({
    'ct':cts,
    'aic':aics,
    'bic':bics,
    'n_states':n_states_list
}).drop_duplicates()

aic_diffs = []
bic_diffs = []
states = []
cts = []

# Criterion of the smaller model minus criterion of the model with one more
# state; a positive value means the extra state lowered the criterion.
for state in [2,3,4]:
    s_plus_one = state + 1
    for ct in tog['ct'].unique():
        fewer = tog.query('ct == @ct and n_states == @state')
        more = tog.query('ct == @ct and n_states == @s_plus_one')
        cts.append(ct)
        states.append(f'{state} > {s_plus_one}')
        aic_diffs.append(fewer['aic'].tolist()[0]-more['aic'].tolist()[0])
        bic_diffs.append(fewer['bic'].tolist()[0]-more['bic'].tolist()[0])

df = pd.DataFrame({
    'AIC_diff':aic_diffs,
    'BIC_diff':bic_diffs,
    'states':states,
    'ct':cts
})

# One group of bars per cell type (alphabetical), one bar per state transition.
sns.barplot(x='ct', y='AIC_diff', hue='states', data=df,
           order=sorted(df['ct'].drop_duplicates()))
plt.xticks(rotation=45, ha='right')
plt.title('LAD AIC differences: Models trained per cell type and per replicate')
plt.legend(title='# of states', ncol=1, frameon=False)
plt.ylabel('AIC difference')
plt.xlabel('cell type')

# SFig. 5: KDD HMM validation

In [None]:
# Score the 2- to 5-state HMMs of every H3K9me2 cell type with AIC and BIC,
# then plot how the AIC changes each time one more state is added.
bics = []
n_states_list = []
aics = []
cts = []

# NOTE(review): the model and call paths below are identical to the ones used
# for the LB1 rows, so the same files are scored for both marks -- confirm
# whether the H3K9me2 models live in a separate directory.
for ix, row in org_df.query('chip == "H3K9me2"').iterrows():
    ct = row['cell_type']
    for n_states in [2, 3, 4, 5]:
        with open(f'/pollard/home/kathleen/projects/LADs/LADs_KDDs/hmms_no_umap_filter_{n_states}states/{ct}.json') as json_file:
            dat_model = HiddenMarkovModel.from_json(json.load(json_file))

        # header=0 together with names= replaces the file's own header row
        # with these column names.
        dat = pd.read_table(f'/pollard/home/kathleen/projects/LADs/LADs_KDDs/hmm_calls_no_umap_filter_{n_states}states/{ct}.tsv', names=['chrom','start','stop','score0','score1','hmm_pred'],
                           header=0)

        # Evaluate the log-likelihood once and reuse it for both criteria
        # instead of scoring the whole table twice.
        log_lik = dat_model.log_probability(dat[['score0','score1']].to_numpy())

        bics.append(BIC(log_lik, n_states, len(dat)))
        aics.append(AIC(log_lik, n_states))
        n_states_list.append(n_states)
        cts.append(ct)

# A cell type that occurs in several H3K9me2 rows of org_df produces
# identical rows here; keep a single copy of each.
tog_kdd = pd.DataFrame({
    'ct':cts,
    'aic':aics,
    'bic':bics,
    'n_states':n_states_list
}).drop_duplicates()

aic_diffs = []
bic_diffs = []
states = []
cts = []

# Criterion of the smaller model minus criterion of the model with one more
# state. The differences are taken from tog_kdd (the H3K9me2 table built
# above), not from the LB1 table `tog`.
for state in [2,3,4]:
    s_plus_one = state + 1
    for ct in tog_kdd['ct'].unique():
        fewer = tog_kdd.query('ct == @ct and n_states == @state')
        more = tog_kdd.query('ct == @ct and n_states == @s_plus_one')
        cts.append(ct)
        states.append(f'{state} > {s_plus_one}')
        aic_diffs.append(fewer['aic'].tolist()[0]-more['aic'].tolist()[0])
        bic_diffs.append(fewer['bic'].tolist()[0]-more['bic'].tolist()[0])

kdd_df = pd.DataFrame({
    'AIC_diff':aic_diffs,
    'BIC_diff':bic_diffs,
    'states':states,
    'ct':cts
})

# Display names for the cell-type identifiers (not applied in this cell).
ct_replace = {
    'CardiacMyocytes':'Cardiac myocytes',
    'EndoProgenitor':'Endothelial progenitors',
    'D5Midbrain':'Midbrain progenitors',
    'H9ESC':'ESCs',
    'MidHindgut':'Mid-hindgut',
    'ParaxMesoderm':'Paraxial mesoderm',
    'Early Somite':'Early somite',
    'DefEctoderm':'Definitive ectoderm',
    'BorderEctoderm':'Border ectoderm',
    'D4Artery':'Artery progenitors',
    'APS':'Anterior primitive streak',
    'EarlySomite':'Early somite'
}

# One group of bars per cell type (alphabetical), one bar per state
# transition; the bar order comes from the table that is being plotted.
sns.barplot(x='ct', y='AIC_diff', hue='states', data=kdd_df,
           order=sorted(kdd_df['ct'].drop_duplicates()))
plt.xticks(rotation=45, ha='right')
plt.title('KDD AIC differences: Models trained per cell type and per replicate')
plt.legend(title='# of states', ncol=1, frameon=False)
plt.ylabel('AIC difference')
plt.xlabel('cell type')

In [None]:
# H3K9me2 in KDDs

# NOTE(review): `axC` is not created in this cell; it has to exist already
# (presumably a panel of a figure built elsewhere -- confirm).
dat = pd.read_csv('tables_out/all_dat_tog.tsv', sep='\t')

# Collapse the two score columns of each mark into their row-wise mean.
dat['score_lb1'] = dat[['score0_lb1','score1_lb1']].mean(axis=1)
dat['score_k9'] = dat[['score0_k9','score1_k9']].mean(axis=1)

# `kind` is a catplot argument, not a violinplot one, so it is not passed here.
c = sns.violinplot(y='score_k9', x='cell_type', data=dat,
                   hue='category_k9', hue_order=['nonKDD','T2-KDD','T1-KDD'],
                   order=sorted(dat['cell_type'].unique()),
                   linewidth=0.5, inner=None, ax=axC)

# Decorate the axes the violins were drawn on rather than whichever axes
# happens to be current.
plt.setp(axC.get_xticklabels(), rotation=45, ha='right')
axC.set_title('H3K9me2 binding, KDDs')
axC.set_ylabel('H3K9me2 binding [log2(read counts)]')
axC.set_xlabel('cell type')
axC.spines['top'].set_visible(False)
axC.spines['right'].set_visible(False)
# The hue levels are KDD categories, so the legend is titled accordingly.
axC.legend(ncol=3, title='KDD category', frameon=False, bbox_to_anchor=(0.98, 1.05))
axC.set(ylim=(-3, 3.8))

# SFig. 6: KDD/LAD overlap

In [None]:
# Cell types whose 3-state LAD and KDD calls are compared.
cts = [
    'Liver', 'CardiacMyocytes', 'EndoProgenitor', 'D5Midbrain',
    'H9ESC', 'Epicardium', 'MidHindgut', 'ParaxMesoderm',
    'EarlySomite', 'DefEctoderm', 'BorderEctoderm', 'D4Artery', 'APS',
]

# Sorted interval sets, keyed by cell type.
t1lads, t1kdds, t2lads, t2kdds = {}, {}, {}, {}

for ct in cts:
    t1lads[ct] = BedTool(f'{path_to_beds}/BED_files_no_umap_filter_3states/{ct}_T1-LADs.bed').sort()
    t1kdds[ct] = BedTool(f'{path_to_beds}/KDD_BED_files_no_umap_filter_3states/{ct}_T1-KDDs.bed').sort()
    t2lads[ct] = BedTool(f'{path_to_beds}/BED_files_no_umap_filter_3states/{ct}_T2-LADs.bed').sort()
    t2kdds[ct] = BedTool(f'{path_to_beds}/KDD_BED_files_no_umap_filter_3states/{ct}_T2-KDDs.bed').sort()

f, ax = plt.subplots(figsize=(4.5,3))

lad_types = []
percs = []

# For every cell type: percentage of LAD intervals that overlap at least one
# KDD interval of the matching type (u=True reports each LAD at most once).
for ct in cts:
    for label, lads, kdds in (('T1-LAD/KDD', t1lads, t1kdds),
                              ('T2-LAD/KDD', t2lads, t2kdds)):
        lad_bed = lads[ct]
        overlapping = lad_bed.intersect(kdds[ct], u=True)
        percs.append(100.0*len(overlapping)/len(lad_bed))
        lad_types.append(label)

sim_plot = pd.DataFrame({
    'percentage':percs,
    'type':lad_types
})

# One bar per LAD/KDD type, summarised across cell types (ci='sd').
type_colors = dict(zip(sim_plot['type'].unique(), ['darkgrey','dimgrey']))
sns.barplot(x='type', y='percentage', data=sim_plot, ci='sd',
            palette=type_colors)
plt.ylim(0, 100)
plt.ylabel('LADs overlapping KDDs (%)')
plt.xlabel('Type')
for side in ('top', 'right'):
    plt.gca().spines[side].set_visible(False)
plt.tight_layout()