In [None]:
import pandas as pd
import numpy as np
import scipy as sp
from scipy.stats import norm
import anndata as adata
import scanpy as sc
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from plotnine import *

In [None]:
# Helper for loading one tab-separated results table, so the cells below stay short.
def read_file(disease, filename, inp_subfolder,
              base_dir='/data/srlab/agupta/data/h2/h2_output/07_2022_traits/'):
    """Read one tab-separated results file and return it transposed.

    Parameters
    ----------
    disease : str
        Not used for locating the file; kept so existing callers keep working.
    filename : str
        Name of the file inside ``inp_subfolder``.
    inp_subfolder : str
        Sub-directory of ``base_dir`` that holds the file.
    base_dir : str, optional
        Root output directory. Defaults to the path that was previously
        hard-coded in this function.

    Returns
    -------
    pandas.DataFrame or None
        The table with its first column used as index and then transposed
        (so the original index values become the columns), or ``None`` if the
        file could not be read. A message naming the path and the error is
        printed in that case.
    """
    import os  # local import: `os` is only imported in a later cell of this notebook

    path = os.path.join(base_dir, inp_subfolder, filename)
    try:
        return pd.read_csv(path, sep='\t', index_col=0).T
    except Exception as err:
        # Best-effort read: report which file failed and why instead of hiding it.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit through.
        print('wasnt able to read in file', path, '-', repr(err))
        return None

In [None]:
# Collect one identifier per PASS trait file: the file name up to its first "."
import os
pass_fnames = [
    sumstats_name.split(".")[0]
    for _root, _dirs, names_in_dir in os.walk("/data/srlab/agupta/data/all_PASS_traits")
    for sumstats_name in names_in_dir
]
len(pass_fnames)

In [None]:
# Load the per-trait heritability table (index = trait name).
# The table also has a 'Trait Name' column, which is not used here.
h2g = pd.read_csv('/data/srlab/agupta/data/ref_files/traits_test_h2g_and_names_vJuly2022-forh2gmapping.csv',index_col=0)

diseases = list(h2g.index)

# One h2g value and one file stem (Trait_Identifier with everything up to "PASS_" removed) per trait
h2_vals = [h2g.loc[trait, 'h2g'] for trait in diseases]
fnames = [h2g.loc[trait, 'Trait_Identifier'].split("PASS_")[-1] for trait in diseases]

# Lookups keyed by trait name: 1) h2 value, 2) file stem
disease_h2_mapping = dict(zip(diseases, h2_vals))
disease_fname_mapping = dict(zip(diseases, fnames))

# Sanity check: how many identifiers in the h2g table have a matching sumstats file,
# printed next to the number of rows in the table (the two should agree).
n_matched = len(set(pass_fnames) & set(h2g['Trait_Identifier']))
print(n_matched, len(h2g))

h2g

In [None]:
def calc_tau_star(input_df, disease, continuous, cell_type, annot,
                  annot_sd=None, h2_mapping=None, num_total_SNPs=5_961_159):
    """Compute the standardized effect size tau* and its standard error.

    tau* = tau * sd(annotation) / (h2 / M), where M is ``num_total_SNPs``.
    For a binary annotation covering a proportion ``p`` of SNPs,
    sd(annotation) = sqrt(p * (1 - p)).

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must have rows 'Coefficient' and 'Coefficient_std_error', plus
        'Prop._SNPs' when ``continuous`` is false. One column per annotation.
    disease : hashable
        Key used to look up the trait heritability in ``h2_mapping``.
    continuous : bool
        False for binary annotations (sd derived from 'Prop._SNPs').
        True for continuous annotations, in which case ``annot_sd`` is required.
    cell_type, annot
        Unused; kept so existing callers keep working.
    annot_sd : float or array-like, optional
        Standard deviation of the annotation; required when ``continuous`` is true.
    h2_mapping : mapping, optional
        trait -> h2. Defaults to the notebook-level ``disease_h2_mapping``.
    num_total_SNPs : int, optional
        Number of common SNPs (M).

    Returns
    -------
    (tau_star, tau_star_SE)
        Same shape as the 'Coefficient' row of ``input_df``.

    Raises
    ------
    ValueError
        If ``continuous`` is true and ``annot_sd`` was not supplied. Previously
        this path failed with an UnboundLocalError because ``p`` was never set.
    KeyError
        If ``disease`` is not in the heritability mapping.
    """
    tau = input_df.loc['Coefficient', :]
    tau_se = input_df.loc['Coefficient_std_error', :]

    if continuous:
        if annot_sd is None:
            raise ValueError(
                "continuous annotations need an explicit annot_sd; it cannot be "
                "derived from 'Prop._SNPs'"
            )
        tau_star_num = annot_sd
    else:
        p = input_df.loc['Prop._SNPs', :]
        tau_star_num = np.sqrt(p * (1 - p))

    # trait-specific h2 comes from the lookup table (notebook-level by default)
    if h2_mapping is None:
        h2_mapping = disease_h2_mapping
    disease_h2 = h2_mapping[disease]

    # average per-SNP heritability
    tau_star_denom = disease_h2 / num_total_SNPs

    # the SE is scaled by the same factor as the coefficient
    tau_star = tau * tau_star_num / tau_star_denom
    tau_star_SE = tau_se * tau_star_num / tau_star_denom

    return (tau_star, tau_star_SE)

In [None]:
def make_heatmap(inp_df, metric, x, y, max_pval, figx, figy, enrich_col, enrich_lim):
    """Print a dot-plot "heatmap" built with plotnine.

    Each (x, y) cell is drawn as a point whose fill/outline color encodes the
    chosen metric and whose size encodes the matching -log10 p-value column.

    Parameters
    ----------
    inp_df : pandas.DataFrame
        Long-format table containing the columns named by ``x`` and ``y``, plus
        'Enrichment' and '-log10(enrich_p)' (metric='Enrichment') or
        'effect_size' and '-log10(tau*_p)' (metric='effect_size').
    metric : {'Enrichment', 'effect_size'}
        Which value/p-value pair to draw.
    x, y : str
        Column names mapped to the x and y axes.
    max_pval : float
        Upper limit of the size scale (lower limit is 0).
    figx, figy : float
        Figure size passed to the plotnine theme.
    enrich_col : str
        High-end color of the gradient for metric='Enrichment'. It is ignored
        for metric='effect_size', which always uses 'darkred'.
    enrich_lim : float
        Upper limit of the color scale (lower limit is 0).

    Raises
    ------
    ValueError
        If ``metric`` is not one of the two supported values. Previously this
        surfaced as a NameError on an unbound local.
    """
    if metric == 'Enrichment':
        val = 'Enrichment'
        pval = '-log10(enrich_p)'
        name = 'Enrichment'
        dot_color = enrich_col
    elif metric == 'effect_size':
        val = 'effect_size'
        pval = '-log10(tau*_p)'
        name = 'tau*'
        dot_color = 'darkred'
    else:
        raise ValueError(
            "metric must be 'Enrichment' or 'effect_size', got %r" % (metric,)
        )

    # fill and outline share one diverging gradient, so define it once
    gradient_kwargs = dict(low="blue",
                           mid="white",
                           midpoint=0,
                           high=dot_color,
                           name=name,
                           limits=(0, enrich_lim))

    heatmap = (ggplot(inp_df, aes(x=x, y=y))
               + geom_point(aes(fill=val, color=val, size=pval))
               + scale_fill_gradient2(**gradient_kwargs)
               + scale_color_gradient2(**gradient_kwargs)
               + scale_size(range=(0, 10), name='-log10(pval)', limits=(0, max_pval))
               + xlab(" ")
               + ylab(" ")
               + theme_bw()
               + theme(figure_size=(figx, figy))
               + theme(axis_text_x=element_text(angle=90, size=9))
               + theme(axis_text_y=element_text(size=10)))
    print(heatmap)

In [None]:
# Autoimmune / inflammatory traits; sorted() keeps them alphabetical for plotting
AI_diseases = sorted([
    'Adult_Onset_Asthma',
    'Celiac_Disease',
    'Child_Onset_Asthma',
    "Crohn's_Disease",
    'Eczema',
    'Hypothyroidism',
    'IBD',
    'Primary_Biliary_Cirrhosis',
    'Respiratory_and_Ear-nose-throat_Diseases',
    'Rheumatoid_Arthritis',
    'Systemic_Lupus_Erythematosus',
    'Ulcerative_Colitis',
])

# Blood cell count traits
blood_traits = [
    'Basophil_count',
    'Eosinophil_count',
    'Lymphocyte_count',
    'Neutrophil_count',
    'White_blood_cell_count',
]

# Traits outside the autoimmune set
non_AI_traits = ['BMI', 'Height']

In [None]:
## MAKE GIANT, META DATAFRAME ##
# One column per (trait, cell type, annotation); rows are the metrics from the
# .results files plus the derived tau* rows added below.

cell_types = ['T','B','M','F','E']
annots = ['DI:COMBINED']

inp_subfolder = "broad_ODI_fig3" # 'subtypes_fig4'
traits_plot_all = AI_diseases # blood_traits, non_AI_traits
date='072622'

# annotations whose .results file holds two conditional rows (dynamic + invariant)
combined_annots = ('COMBINED', 'DI:COMBINED')

# Collect per-file frames and concatenate once at the end
# (concatenating inside the loop re-copies the growing frame every iteration).
annot_frames = []

for disease in traits_plot_all:
    for cell_type in cell_types:
        for annot in annots:
            try:
                # every annotation type follows the same file naming pattern
                filename = date+"."+disease+"."+cell_type+"_"+annot+".results"
                this_df = read_file(disease_fname_mapping[disease], filename, inp_subfolder)
                if this_df is None:
                    # read_file already reported the problem; skip this combination
                    print("something went wrong:", disease, cell_type, annot, "(results file could not be read)")
                    continue

                # shorten signed annotation names: "<X>_POS..." -> "<X>+", "<X>_NEG..." -> "<X>-"
                annot_concat = annot
                if 'POS' in annot:
                    annot_concat = str(annot).split("_")[0]+"+"
                elif 'NEG' in annot:
                    annot_concat = str(annot).split("_")[0]+"-"

                # .copy() so the row assignments below write to an independent frame
                if annot in combined_annots:
                    # DYN vs INV: first two annotation columns of the conditional model
                    this_df = this_df[['L2_0','L2_1']].copy()
                    this_df.columns = [disease+"_"+cell_type+"_peaks_conditional_"+i for i in ['DYNAMIC','INVARIANT']]
                else:
                    # independent model: only the first annotation column
                    this_df = this_df[['L2_0']].copy()
                    this_df.columns = [disease+"_"+cell_type+"_"+annot_concat]

                #calculate tau*
                tau_star, tau_star_SE = calc_tau_star(this_df, disease, False, cell_type, annot_concat) #True

                this_df.loc['effect_size',:] = tau_star
                this_df.loc['effect_size_SE',:] = tau_star_SE
                # One-sided p-value for tau* > 0: P(N(tau*, SE) <= 0) = sf(tau*/SE).
                # The previous norm.pdf call returned a density, not a probability.
                # Multiply by 2 (using |z|) if a two-sided test is wanted.
                z_scores = (tau_star / tau_star_SE).astype(float)
                this_df.loc['effect_size_pval',:] = norm.sf(z_scores)

                annot_frames.append(this_df)

            except Exception as err:
                # best-effort: keep going, but say what actually failed
                print("something went wrong:", disease, cell_type, annot, "-", repr(err))

if not annot_frames:
    raise RuntimeError("no .results file could be loaded; check inp_subfolder, date and the trait list")

overall_df = pd.concat(annot_frames, axis=1)

### CALCULATE KEY METRICS FOR EACH ANNOT ###
overall_df.loc['annot_size (%)',:] = overall_df.loc['Prop._SNPs',:]*100
overall_df.loc['-log10(enrich_p)',:] = -np.log10(overall_df.loc['Enrichment_p',:])
overall_df.loc['-log10(tau*_p)',:] = -np.log10(overall_df.loc['effect_size_pval',:])
overall_df.loc['prop_h2',:] = overall_df.loc['Prop._h2',:]*100
overall_df.loc['prop_h2_SE',:] = overall_df.loc['Prop._h2_std_error',:]*100

overall_df

In [None]:
# Sanity check: the total number of columns in overall_df above should equal (# traits) x (# annots) x (# cell types), e.g. 28 x 13 x 4 = 1456

**MAKE TRAITS X ANNOTATIONS DFs that include:**
- h2 enrichment
- h2 enrich pval
- tau*
- tau* pval

#create conditional and non-conditional versions of each

In [None]:
# Build the long-format plotting table: one row per overall_df column,
# plus the annotation / trait / cell-type labels parsed from the column name.
df_to_use = overall_df

metric_cols = ['Prop._h2','Enrichment','Enrichment_std_error','Enrichment_p','-log10(enrich_p)',
               'effect_size','effect_size_SE','effect_size_pval','-log10(tau*_p)']

# .copy() so the label columns are added to an independent frame
# (avoids pandas' chained-assignment warning)
plot_df = df_to_use.T[metric_cols].copy()

# Column names end in either
#   conditional: <trait>_<cell type>_peaks_conditional_<ANNOT>  -> 4 trailing tokens
#   independent: <trait>_<cell type>_<annot>                    -> 2 trailing tokens
# The cell type is the first trailing token and the annotation is the last.
n_suffix_tokens = 4 if 'conditional' in df_to_use.columns[0] else 2

name_tokens = [i.split("_") for i in plot_df.index]
plot_df['annot'] = [t[-n_suffix_tokens]+"_"+t[-1] for t in name_tokens]
plot_df['trait'] = ["_".join(t[:-n_suffix_tokens]) for t in name_tokens]
plot_df['cell_type'] = [t[-n_suffix_tokens] for t in name_tokens]

plt.hist(plot_df['-log10(enrich_p)'],bins=50,label='enrichment',alpha=0.7,color='darkblue');
plt.hist(plot_df['-log10(tau*_p)'],bins=50,alpha=0.9,label='tau*',color='teal');
sns.despine()
plt.legend()
plt.title("pvalue distribs\nenrichment and tau*")
plt.show()

plot_df

In [None]:
### for plotting, set non-significant entries to 0 (both the -log10 p-value and its metric) so they aren't colored
pdc = plot_df.copy()

# Bonferroni threshold across the number of AI traits, on the -log10 scale
sig_threshold = -np.log10(0.05/len(AI_diseases))

# Compute both masks from the untouched plot_df so the result does not depend on statement order
enrich_not_sig = plot_df['-log10(enrich_p)'] < sig_threshold
tau_not_sig = plot_df['-log10(tau*_p)'] < sig_threshold

pdc['-log10(enrich_p)'] = np.where(enrich_not_sig, 0, plot_df['-log10(enrich_p)'])
pdc['Enrichment'] = np.where(enrich_not_sig, 0, plot_df['Enrichment'])
# fixed: this column used to be filled with the enrichment p-values (copy-paste slip)
pdc['-log10(tau*_p)'] = np.where(tau_not_sig, 0, plot_df['-log10(tau*_p)'])
pdc['effect_size'] = np.where(tau_not_sig, 0, plot_df['effect_size'])

In [None]:
#plotting metrics for ALL ANNOTS COMBINED

# NOTE: df_to_plot is the same object as plot_df, so the ordered columns below are added to plot_df too.
# Swap in `pdc` to plot only the significant entries.
df_to_plot = plot_df#pdc#plot_df
max_pval = 10
figx, figy = 3,6

# traits in reverse alphabetical order
df_to_plot['OG_traits_ordered'] = pd.Categorical(df_to_plot['trait'],
                                                 categories=sorted(set(df_to_plot['trait']), reverse=True),
                                                 ordered=True)
df_to_plot['annot_ordered'] = pd.Categorical(df_to_plot['annot'], categories=['B_DYNAMIC','B_INVARIANT',
                                                             'T_DYNAMIC','T_INVARIANT',
                                                             'M_DYNAMIC','M_INVARIANT',
                                                             'F_DYNAMIC','F_INVARIANT',
                                                             'E_DYNAMIC','E_INVARIANT'], ordered=True)
# fixed: a missing comma after 'F_OPEN' used to merge the last two entries into 'F_OPENE_OPEN'
df_to_plot['annot_open_ordered'] = pd.Categorical(df_to_plot['annot'], categories=['B_OPEN',
                                                             'T_OPEN',
                                                             'M_OPEN',
                                                             'F_OPEN',
                                                             'E_OPEN'], ordered=True)

#for within cell-type annots
make_heatmap(df_to_plot, 'Enrichment', 'annot_ordered', 'OG_traits_ordered', max_pval, figx, figy, '#729CAE', 40)
make_heatmap(df_to_plot, 'effect_size', 'annot_ordered', 'OG_traits_ordered', max_pval, figx, figy, '#729CAE', 3)

In [None]:
# Bar plot for annotation sizes: ten hard-coded values, colored in consecutive pairs.
# NOTE(review): presumably the bars follow the same order as the annot_ordered categories — confirm.
annot_sizes = [0.48, 1.48, 1.9, 1.1, 2.7, 1.3, 2.8, 1.4, 0.68, 1.6]
pair_palette = ['#F8766D', '#F892EB', '#619CFF', '#00BFC4', '#00BA38']
bar_colors = [shade for shade in pair_palette for _ in range(2)]

plt.figure(figsize=(3,1))
plt.bar(np.arange(len(annot_sizes)), annot_sizes, color=bar_colors)
sns.despine()

In [None]:
###### subset by cell type
ctbar='T'

# `df_for_bar` was never defined in this notebook, so this cell failed on a fresh kernel.
# overall_df has the layout this cell relies on (metric rows, one column per
# <trait>_<cell type>_peaks_conditional_<ANNOT>), so use it as the source table.
df_for_bar = overall_df

# keep only the columns of the chosen cell type (the old per-trait loop just repeated this)
ct = '_'+ctbar+'_' # '_M_'
ct_cols = [i for i in df_for_bar.columns if ct in str(i)]
ct_df = df_for_bar[ct_cols]

# DYNAMIC then INVARIANT for each trait, traits in AI_diseases order
ordered_ct_cols = []
for trait in AI_diseases:
    for at in ['DYNAMIC','INVARIANT']:
        ordered_ct_cols.append(trait+"_"+ctbar+"_peaks_conditional_"+at)

tau_star_vals = ct_df.loc['effect_size',ordered_ct_cols]
tau_star_ci = ct_df.loc['effect_size_SE',ordered_ct_cols]*1.96  # 1.96 * SE

#tau*
plt.figure(figsize=(12,3))
# the two colors alternate: DYNAMIC bars vs INVARIANT bars
plt.bar(ordered_ct_cols,tau_star_vals,
        color=['mediumslateblue','gainsboro'],width=1)
plt.tight_layout()
sns.despine()
plt.ylabel('$h^2$ effect size (tau*)',fontsize=16)
# label every bar with its trait name only
plt.xticks(np.arange(0,len(ordered_ct_cols),1),[i.split("_"+ctbar+"_peaks_conditional_")[0] for i in ordered_ct_cols],rotation=90)

plt.errorbar(ordered_ct_cols, tau_star_vals, yerr=tau_star_ci,
             fmt='none', color='black', elinewidth=1, capthick=1, errorevery=1, alpha=1, ms=4) #, capsize = 3
plt.xlim([-.5,len(ordered_ct_cols)]);

In [None]:
# tau* per annotation column for the selected cell type, smallest first
tau_star_table = pd.DataFrame(ct_df.loc['effect_size', ordered_ct_cols])
tau_star_table.sort_values(by='effect_size')

In [None]:
# Scatter of dynamic vs. invariant tau* (same data as the bar plot above, one point per trait)

ctcolor='#F892EB'
ordered_ct_cols_DYN = [col_name for col_name in ordered_ct_cols if "DYNAMIC" in col_name]
ordered_ct_cols_INV = [col_name for col_name in ordered_ct_cols if 'INVARIANT' in col_name]

# x = invariant-peak tau*, y = dynamic-peak tau*; error bars are +/- 1 SE
inv_tau = ct_df.loc['effect_size',ordered_ct_cols_INV]
dyn_tau = ct_df.loc['effect_size',ordered_ct_cols_DYN]
inv_tau_se = ct_df.loc['effect_size_SE',ordered_ct_cols_INV]
dyn_tau_se = ct_df.loc['effect_size_SE',ordered_ct_cols_DYN]

plt.figure(figsize=(3,6))
# horizontal error bars first, then vertical ones over the same points
plt.errorbar(inv_tau, dyn_tau, xerr=inv_tau_se, fmt="o", color=ctcolor)
plt.errorbar(inv_tau, dyn_tau, yerr=dyn_tau_se, fmt="o", color=ctcolor)

# dashed identity line (y = x) for reference
identity_ends = [-1,2.3]
plt.plot(identity_ends,identity_ends,color='black',linestyle='--')
# plt.xlim(-1,2)
# plt.ylim(-1,6)
# NOTE(review): the x axis shows the INVARIANT columns but is labelled "open peaks" — confirm the intended label
plt.xlabel('open peaks tau*',fontsize=15)
plt.ylabel('dynamic peaks tau*',fontsize=15)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
sns.despine()

# cell-type colors, in the order F M T E B: ['#00BFC4','#619CFF','#F892EB','#00BA38','#F8766D']