This notebook builds the plots used in Figs. 1b-c. The large histogram and the strip plot are output as one concatenated plot. The histogram inset is output separately and was added manually to yield the complete Fig. 1b.

In [None]:
import pandas as pd
import altair as alt
import scipy.stats as stats

In [None]:
# Path (relative to the working directory) of the Excel workbook with the SGE results.
# main() passes it to read_scores(), which reads the 'scores' and 'thresholds' sheets.
file = './Data/final_tables/supplementary_file_1_BARD1_SGE_final_table.xlsx' #SGE datafile

In [None]:
def read_scores(file):
    """Load the variant scores and the score thresholds from an Excel workbook.

    Args:
        file: path to a workbook containing a 'scores' sheet and a
            'thresholds' sheet.

    Returns:
        A ``(df, thresholds)`` tuple. ``df`` is the 'scores' sheet without the
        rows whose 'variant_qc_flag' is 'WARN', with the 'consequence' column
        renamed to 'Consequence' and 3 bp deletions labelled '3bp Deletion'.
        ``thresholds`` is ``[min, max]`` taken from the first row of the
        'thresholds' sheet.
    """
    raw_scores = pd.read_excel(file, sheet_name='scores')
    threshold_df = pd.read_excel(file, sheet_name='thresholds')

    # Drop variants flagged by QC.
    flagged = raw_scores['variant_qc_flag'].isin(['WARN'])
    kept = raw_scores.loc[~flagged]

    # The sheet currently uses a lower-case header; the rest of the notebook
    # expects 'Consequence'. The rename can go once the header changes back.
    df = kept.rename(columns={'consequence': 'Consequence'})

    # 3 bp deletions get their own category regardless of annotated consequence.
    is_3bp_deletion = df['var_type'] == '3bp_del'
    df.loc[is_3bp_deletion, 'Consequence'] = '3bp Deletion'

    lower = threshold_df['min'][0]
    upper = threshold_df['max'][0]
    thresholds = [lower, upper]

    return df, thresholds

In [None]:
def prep_data(df):
    """Replace raw consequence annotations with the display labels used in the plots.

    The rules are applied in order, each one against the column as left by the
    previous rule, so the first matching rule decides a row's final label.
    Rows whose 'Consequence' is missing are left untouched (``na=False``)
    instead of making the boolean mask invalid and raising a ValueError.

    Args:
        df: DataFrame with a string 'Consequence' column. It is modified in place.

    Returns:
        The same DataFrame object, with 'Consequence' relabelled.
    """
    # (match kind, raw pattern, display label) -- order matters.
    relabel_rules = [
        ('contains', 'missense', 'Missense'),
        ('equals', 'synonymous_variant', 'Synonymous'),
        ('equals', 'intron_variant', 'Intron'),
        ('equals', 'stop_gained', 'Stop Gained'),
        ('equals', 'stop_lost', 'Stop Lost'),
        ('contains', 'site', 'Canonical Splice'),
        ('contains', 'ing_var', 'Splice Region'),
        ('contains', 'UTR', 'UTR Variant'),
        ('equals', 'start_lost', 'Start Lost'),
    ]

    for match_kind, pattern, label in relabel_rules:
        current = df['Consequence']
        if match_kind == 'contains':
            # na=False: a missing consequence never matches a substring rule.
            mask = current.str.contains(pattern, na=False)
        else:
            mask = current == pattern
        df.loc[mask, 'Consequence'] = label

    return df


In [None]:
def make_histogram(df, thresholds):
    """Build the stacked score histogram and its zoomed inset.

    Args:
        df: DataFrame with a numeric 'score' column and a 'Consequence' column
            holding the display labels listed in ``variant_types`` below.
        thresholds: two-item sequence of score cut-offs. ``thresholds[0]`` is
            drawn as a black dashed vertical rule, ``thresholds[1]`` as a gray one.

    Returns:
        ``(histogram, inset)``: the main histogram layered with both threshold
        rules, and the stand-alone inset chart.

    Side effects:
        Disables Altair's max-rows check globally and displays the inset.
    """
    alt.data_transformers.disable_max_rows() #gets rid of max data length problem

    def threshold_rule(value, color):
        # Dashed vertical rule at a single score value.
        return alt.Chart(pd.DataFrame({'Fitness Score': [value]})).mark_rule(
            color=color, strokeDash=[8, 8], strokeWidth=2
        ).encode(x='Fitness Score:Q')

    nf_line = threshold_rule(thresholds[0], 'black')
    func_line = threshold_rule(thresholds[1], '#888888')

    title_text = f'Distribution of BARD1 SGE Scores (n = {len(df)})'
    bins = 50 #number of bins

    # Sort order passed to the inset's color encoding.
    # NOTE(review): 'Splice' and 'UTR' do not match any label in variant_types
    # ('Canonical Splice', 'Splice Region', 'UTR Variant'), and the two lists
    # order the categories differently -- confirm the intended order.
    inset_color_sort = ["Intron", "Missense", "Synonymous", "Stop Gained", "Splice", "Start Lost", 'Stop Lost', 'UTR', '3bp Deletion']

    # Clicking a legend entry highlights that consequence category.
    selection = alt.selection_point(fields=['Consequence'], bind='legend')

    # NOTE(review): the last color differs from the '#FF7F0E' used for the same
    # position in strip_plot -- confirm which orange is intended.
    palette = [
        '#006616', # dark green,
        '#81B4C7', # dusty blue
        '#ffcd3a', # yellow
        '#6AA84F', # med green
        '#93C47D', # light green
        '#888888', # med gray
        '#000000', # black
        '#1170AA', # darker blue
        '#CFCFCF', # light gray
        '#FF9A00', # orange
    ]

    # Color-scale domain; palette[i] is the color of variant_types[i].
    variant_types = [
        'Synonymous',
        'Missense',
        'Stop Gained',
        'Intron',
        'UTR Variant',
        'Stop Lost',
        'Start Lost',
        'Canonical Splice',
        'Splice Region',
        '3bp Deletion',
    ]

    # Main histogram with interactive legend
    histogram = alt.Chart(df).mark_bar().encode(
        alt.X('score',
              axis=alt.Axis(title='',
                            labelFontSize=16,
                            titleFontSize=20,
                            values=[-0.8, -0.6, -0.4, -0.2, 0, 0.2]),
              bin=alt.Bin(maxbins=bins)),
        alt.Y('count()',
              axis=alt.Axis(title='Number of Variants',
                            labelFontSize=16,
                            titleFontSize=20,
                            values=[0, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500])),
        color=alt.Color('Consequence:N',
                        scale=alt.Scale(range=palette,
                                        domain=variant_types),
                        legend=alt.Legend(titleFontSize=16,
                                          labelFontSize=14,
                                          orient='right',
                                          offset=-80)),
        opacity=alt.condition(selection, alt.value(1), alt.value(0.2))  # Highlight selected categories
    ).add_params(
        selection
    ).properties(
        width=800,
        height=200,
        title=alt.TitleParams(text=title_text, fontSize=22)
    ).interactive()

    histogram = alt.layer(histogram, nf_line, func_line).resolve_scale(
        y='shared'
    )

    # Inset: same bars, restricted to the low-score range with a 0-200 count axis.
    inset = alt.Chart(df).mark_bar().encode(
        alt.X('score',
              axis=alt.Axis(title='',
                            labelFontSize=22,
                            titleFontSize=20,
                            values=[-0.8, -0.7, -0.6, -0.5, -0.4, -0.3, -0.2, -0.1]),
              bin=alt.Bin(maxbins=bins),
              scale=alt.Scale(domain=[-0.8, -0.06159])),
        alt.Y('count()',
              axis=alt.Axis(title='',
                            labelFontSize=22,
                            titleFontSize=20,
                            values=[0, 50, 100, 150, 200]),
              scale=alt.Scale(domain=[0, 200])),
        color=alt.Color('Consequence:N',
                        scale=alt.Scale(range=palette,
                                        domain=variant_types),
                        sort=inset_color_sort,
                        legend=None),
        opacity=alt.condition(selection, alt.value(1), alt.value(0.2))  # Highlight selected categories
    ).add_params(
        selection
    ).configure_axis(
        grid=False
    ).configure_view(
        stroke=None
    ).properties(
        width=800,
        height=200,
        title='HISTOGRAM INSET'
    ).interactive()

    inset.display()

    return histogram, inset

In [None]:
def strip_plot(df, thresholds):
    """Build the per-consequence strip (tick) plot of scores with threshold rules.

    Args:
        df: DataFrame with a numeric 'score' column and a 'Consequence' column
            of display labels. It is not modified.
        thresholds: two-item sequence of score cut-offs. ``thresholds[0]`` is
            drawn as a black dashed vertical rule, ``thresholds[1]`` as a gray one.

    Returns:
        A layered Altair chart: the tick marks plus both threshold rules.
    """
    # NOTE(review): the last color differs from the '#FF9A00' used for the same
    # position in make_histogram -- confirm which orange is intended.
    palette = [
        '#006616', # dark green,
        '#81B4C7', # dusty blue
        '#ffcd3a', # yellow
        '#6AA84F', # med green
        '#93C47D', # light green
        '#888888', # med gray
        '#000000', # black
        '#1170AA', # darker blue
        '#CFCFCF', # light gray
        '#FF7F0E', # cat 10 orange
    ]

    # Row order on the y axis and color-scale domain; palette[i] colors variant_types[i].
    variant_types = [
        'Synonymous',
        'Missense',
        'Stop Gained',
        'Intron',
        'UTR Variant',
        'Stop Lost',
        'Start Lost',
        'Canonical Splice',
        'Splice Region',
        '3bp Deletion',
    ]

    # Convert to an ordered categorical on a copy. Assigning into the caller's
    # frame would silently change the data other charts already reference, and
    # labels outside variant_types become NaN in the converted column.
    plot_df = df.assign(
        Consequence=pd.Categorical(df['Consequence'],
                                   categories=variant_types,
                                   ordered=True)
    )

    def threshold_rule(value, color):
        # Dashed vertical rule at a single score value.
        return alt.Chart(pd.DataFrame({'Fitness Score': [value]})).mark_rule(
            color=color, strokeDash=[8, 8], strokeWidth=2
        ).encode(x='Fitness Score:Q')

    nf_line = threshold_rule(thresholds[0], 'black')
    func_line = threshold_rule(thresholds[1], '#888888')

    plot = alt.Chart(plot_df).mark_tick(opacity=1).encode(
        x=alt.X('score:Q',
                axis=alt.Axis(title='Fitness Score',
                              values=[-0.8, -0.6, -0.4, -0.2, 0, 0.2],
                              titleFontSize=20,
                              labelFontSize=16)),
        y=alt.Y('Consequence:N',
                sort=variant_types,
                axis=alt.Axis(title='',
                              labelFontSize=16)),
        color=alt.Color('Consequence:N',
                        legend=None,
                        scale=alt.Scale(range=palette,
                                        domain=variant_types))
    ).properties(
        width=800,
        height=200
    ).interactive()

    plot = alt.layer(plot, nf_line, func_line).resolve_scale(
        y='shared'
    )

    return plot

In [None]:
def group_testing(df):
    """Compare missense score distributions with reference variant classes.

    Runs two two-sample Kolmogorov-Smirnov tests and prints each p-value:

    * functionally abnormal missense vs. stop-gained / canonical-splice variants
    * functionally normal missense vs. synonymous variants

    The mean synonymous score is printed first.

    Args:
        df: DataFrame with 'Consequence' (display labels),
            'functional_consequence' and 'score' columns.

    Returns:
        None; results are printed.
    """
    is_missense = df['Consequence'].isin(['Missense'])
    lof_missense = df.loc[is_missense & df['functional_consequence'].isin(['functionally_abnormal'])]
    norm_missense = df.loc[is_missense & df['functional_consequence'].isin(['functionally_normal'])]

    stop_splice = df.loc[df['Consequence'].isin(['Stop Gained', 'Canonical Splice'])]
    syn = df.loc[df['Consequence'].isin(['Synonymous'])]

    print(syn['score'].mean())

    # The synonymous reference must come from `syn`; taking it from
    # `stop_splice` would make the second test a repeat comparison against
    # stop/splice variants.
    pairs = [
        ('LoF Missense vs. Stop/Splice', lof_missense['score'].tolist(), stop_splice['score'].tolist()),
        ('Normal Missense vs. Syn.', norm_missense['score'].tolist(), syn['score'].tolist()),
    ]

    for name, missense_scores, reference_scores in pairs:
        _, p_val = stats.ks_2samp(missense_scores, reference_scores)
        print(name, ':', str(p_val))

In [None]:
def main(save = False, out_dir = '/Users/ivan/Desktop/BARD1_draft_figs'):
    """Build, display and optionally save the histogram / strip-plot figure.

    Reads the workbook named by the module-level ``file``, relabels the
    consequences, builds the histogram (with inset) and the strip plot, prints
    the group-comparison statistics, and displays the histogram stacked above
    the strip plot with a shared x scale.

    Args:
        save: when True, write the inset and the combined figure as SVG files.
        out_dir: directory the SVG files are written to. The default keeps the
            location used originally.
    """
    data, thresholds = read_scores(file)
    to_graph = prep_data(data)
    histogram, inset = make_histogram(to_graph, thresholds)
    stripplot = strip_plot(to_graph, thresholds)
    group_testing(to_graph)

    # Vertical concatenation: histogram on top, strip plot underneath.
    combined = histogram & stripplot
    final = combined.configure_axis(
        grid = False
    ).configure_view(
        stroke = None
    ).resolve_scale(
        x = 'shared'
    )

    final.display()

    if save:
        inset.save(f'{out_dir}/fig_1c_inset.svg')
        final.save(f'{out_dir}/fig_1c_d_base.svg')

In [None]:
# Build and display the figure; pass save = True to also write the SVG files.
main(save = False)