In [None]:
import pandas as pd
import altair as alt

In [None]:
# Input workbooks (paths are relative to the notebook's working directory)
bard1_data = '../Data/BARD1_SGE_final_table.xlsx'
brca1_data = '../Data/BRCA1_SGE_data.xlsx'

# BARD1 thresholds live in the 'thresholds' sheet; the row labelled 0 is used
threshold_df = pd.read_excel(bard1_data, sheet_name = 'thresholds')

# Per-gene pair of score cutoffs, stored as [min, max]
cutoffs = {
    'BARD1': [threshold_df.loc[0, bound] for bound in ('min', 'max')],
    'BRCA1': [-1.328, -0.748],  # BRCA1 thresholds from Findlay et al. 2018
}

In [None]:
def get_zn_residues(bard1, brca1): #Pulls out data for variants at Zn2+ binding residues
    """Load the BARD1 and BRCA1 score tables and keep variants at Zn2+ binding residues.

    Parameters
    ----------
    bard1 : path (or other input accepted by ``pd.read_excel``) of the workbook
        whose 'scores' sheet has the columns 'consequence',
        'amino_acid_change', 'functional_consequence' and 'score'.
    brca1 : path (or other input accepted by ``pd.read_excel``) of the workbook
        whose 'findlay_2018' sheet has the columns 'hgvs_pro', 'Consequence'
        and 'snv_score_minmax'.

    Returns
    -------
    dict
        ``{'BARD1': DataFrame, 'BRCA1': DataFrame}``. Both frames carry
        'amino_acid' (residue + position, e.g. 'C50'), 'amino_acid_change',
        'Consequence' (relabelled) and 'score'; the BARD1 frame additionally
        carries 'functional_consequence'.

    Rows with a missing consequence, amino acid change or HGVS string are
    tolerated: they simply never match a Zn2+ binding residue and are dropped
    by the residue filter instead of raising.
    """

    #Amino acid three letter code to one letter code map for BRCA1 data.
    #Codes that are not in this map (e.g. 'Ter') are mapped to NaN.
    aa_3to1 = {
        'Ala': 'A', 'Arg': 'R', 'Asn': 'N', 'Asp': 'D', 'Cys': 'C',
        'Gln': 'Q', 'Glu': 'E', 'Gly': 'G', 'His': 'H', 'Ile': 'I',
        'Leu': 'L', 'Lys': 'K', 'Met': 'M', 'Phe': 'F', 'Pro': 'P',
        'Ser': 'S', 'Thr': 'T', 'Trp': 'W', 'Tyr': 'Y', 'Val': 'V'
    }

    #Zn2+ binding residues from Klevit lab structure paper
    bard1_zn_residues = ['C50', 'C53', 'C66', 'H68', 'C71', 'C74', 'C83', 'C86']
    brca1_zn_residues = ['C24', 'C27', 'C44', 'C47', 'C39', 'H41', 'C61', 'C64']

    #Relabelling rules as (match kind, pattern, new label).
    #They are applied strictly in this order, each one seeing the labels
    #written by the rules before it.
    relabel_rules = [
        ('contains', 'missense', 'Missense'),
        ('equals', 'synonymous_variant', 'Synonymous'),
        ('equals', 'intron_variant', 'Intron'),
        ('equals', 'stop_gained', 'Stop Gained'),
        ('equals', 'stop_lost', 'Stop Lost'),
        ('contains', 'site', 'Canonical Splice'),
        ('contains', 'ing_var', 'Splice Region'),
        ('contains', 'UTR', 'UTR Variant'),
        ('equals', 'start_lost', 'Start Lost'),
    ]

    #Reads data and renames columns for consistency between the two genes
    bard1_df = pd.read_excel(bard1, sheet_name = 'scores')
    bard1_df = bard1_df.rename(columns = {'consequence': 'Consequence'})
    brca1_df = pd.read_excel(brca1, sheet_name = 'findlay_2018')
    brca1_df = brca1_df.rename(columns = {'snv_score_minmax': 'score'})

    #Re-labels functional consequences for each consequence type
    for df in (bard1_df, brca1_df):
        for kind, pattern, label in relabel_rules:
            if kind == 'contains':
                #na = False keeps missing consequences out of the mask instead of raising
                mask = df['Consequence'].str.contains(pattern, na = False)
            else:
                mask = df['Consequence'] == pattern
            df.loc[mask, 'Consequence'] = label

    #Gets BARD1 amino acids that coordinate Zn2+
    #Dropping the last character of e.g. 'C50R' leaves the residue 'C50';
    #the .str accessor leaves missing values as NaN rather than failing
    bard1_df['amino_acid'] = bard1_df['amino_acid_change'].str[:-1]
    bard1_df = bard1_df.loc[bard1_df['amino_acid'].isin(bard1_zn_residues)]
    bard1_df = bard1_df[['amino_acid', 'amino_acid_change', 'Consequence', 'functional_consequence', 'score']]

    #Gets BRCA1 amino acids that coordinate Zn2+
    #The HGVS protein notation is parsed once: text after ':' and after the
    #first '.', e.g. '<accession>:p.Cys24Arg' -> 'Cys24Arg'
    protein_change = brca1_df['hgvs_pro'].str.split(':').str[1].str.split('.').str[1]
    original_aa = protein_change.str[:3].map(aa_3to1) #First three letters: original amino acid
    position = protein_change.str[3:-3] #Middle: amino acid position
    substituted_aa = protein_change.str[-3:].map(aa_3to1) #Last three letters: substitution

    brca1_df['amino_acid'] = original_aa + position #Full amino acid with position
    brca1_df['amino_acid_change'] = brca1_df['amino_acid'] + substituted_aa
    brca1_df = brca1_df.loc[brca1_df['amino_acid'].isin(brca1_zn_residues)]
    brca1_df = brca1_df[['score', 'Consequence', 'amino_acid', 'amino_acid_change']]

    to_return = {'BARD1': bard1_df,
                 'BRCA1': brca1_df
                }

    return to_return

In [None]:
def quick_missense_stats(data_dfs): #Quick numbers on number of Zn2+ binding missense variants that are functionally abnormal
    """Print the count of BARD1 missense variants and how many are functionally abnormal.

    Parameters
    ----------
    data_dfs : dict
        Must contain a 'BARD1' DataFrame with 'Consequence' and
        'functional_consequence' columns. Only the 'BARD1' entry is read.

    Returns
    -------
    None. The two counts are written to stdout.
    """

    bard1_vars = data_dfs['BARD1']

    #Keep only rows labelled 'Missense'
    missense = bard1_vars[bard1_vars['Consequence'] == 'Missense']
    n_missense = len(missense)

    #Rows whose functional call is exactly 'functionally_abnormal'
    is_abnormal = missense['functional_consequence'] == 'functionally_abnormal'
    n_abnormal = is_abnormal.sum()

    print('Total Missense Vars: ', str(n_missense), '\n',
          'Abnormal Missense Vars: ', str(n_abnormal))

In [None]:
def strip_plot(data_dfs, cutoffs): #Builds side-by-side strip plots to compare/contrast distribution of Zn2+ binding variants for BRCA1 and BARD1
    """Draw BARD1 and BRCA1 strip plots of variant scores per Zn2+ binding residue, side by side.

    Parameters
    ----------
    data_dfs : dict
        Maps 'BARD1' and 'BRCA1' to DataFrames with 'score', 'amino_acid',
        'Consequence' and 'amino_acid_change' columns.
    cutoffs : dict
        Maps 'BARD1' and 'BRCA1' to a two-element sequence. Element 0 is drawn
        as a red vertical rule and element 1 as a blue vertical rule.

    Returns
    -------
    The concatenated Altair chart. It is also displayed as a side effect.
    """

    keys = ['BARD1', 'BRCA1'] #Gene names
    plots = [] #List to hold plots

    # Get the category10 colors
    category10_colors = [
        '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
        '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf'
    ]

    # Map each category to a specific color.
    # The keys must match the 'Consequence' labels actually present in the data
    # ('Canonical Splice', 'Splice Region', 'UTR Variant'); a label missing from
    # this explicit scale domain would not be assigned one of these colors.
    color_mapping = {
        "Intron": category10_colors[0],
        "Synonymous": category10_colors[2],
        "Stop Gained": category10_colors[3],
        "Missense": category10_colors[1],
        "Canonical Splice": category10_colors[4],
        "Splice Region": category10_colors[8],
        "Start Lost": category10_colors[5],
        "Stop Lost": category10_colors[6],
        "UTR Variant": category10_colors[7]
    }

    sort_order = ["Intron", "Missense", "Synonymous", "Stop Gained", "Canonical Splice",
                  "Splice Region", "Start Lost", 'Stop Lost', 'UTR Variant']

    #Iterates through each gene and builds the strip plot
    for key in keys:
        df = data_dfs[key]

        #Vertical rules marking the two score cutoffs for this gene
        nf_line = alt.Chart(pd.DataFrame({'x': [cutoffs[key][0]]})).mark_rule(color = 'red').encode(
            x = 'x')

        func_lin = alt.Chart(pd.DataFrame({'x': [cutoffs[key][1]]})).mark_rule(color = 'blue').encode(
            x = 'x')

        plot = alt.Chart(df).mark_tick(opacity = 1).encode(
            x = alt.X('score:Q',
                      axis = alt.Axis(title = '',
                                      titleFontSize = 20,
                                      labelFontSize = 24)
                     ),
            y = alt.Y('amino_acid:N',
                      sort = 'ascending',
                      axis = alt.Axis(title = '',
                                      labelFontSize = 24)
                     ),
            color = alt.Color('Consequence:N',
                              sort = sort_order,
                              scale = alt.Scale(domain = list(color_mapping.keys()),
                                                range = list(color_mapping.values())
                                               )
                             ),
            tooltip = [alt.Tooltip('Consequence', title = 'Consequence: '),
                       alt.Tooltip('amino_acid_change', title = 'Amino Acid Change: ')]
            ).properties(
                width = 800,
                height = 400,
                title = 'Variants at ' + key + ' Zn-Binding Residues'
            )

        plots.append(plot + nf_line + func_lin)

    final_plot = plots[0] | plots[1] #Concatenates plots

    final_plot = final_plot.configure_tick(
        thickness = 2
    )
    final_plot.display()

    return final_plot


In [None]:
def bard1_plot_only(df, cutoffs): #Builds clean strip plot showing distribution of BARD1 Zn2+ binding variants only
    """Draw, display and return a strip plot of BARD1 variant scores per Zn2+ binding residue.

    Parameters
    ----------
    df : DataFrame
        Needs 'score', 'amino_acid', 'Consequence' and 'amino_acid_change' columns.
    cutoffs : dict
        ``cutoffs['BARD1'][0]`` is drawn as a red vertical rule and
        ``cutoffs['BARD1'][1]`` as a blue vertical rule.

    Returns
    -------
    The layered Altair chart (grid lines and view border switched off).
    """

    def vertical_rule(x_value, rule_color):
        #Single vertical rule at the given x position
        rule_data = pd.DataFrame({'x': [x_value]})
        return alt.Chart(rule_data).mark_rule(color = rule_color).encode(x = 'x')

    nf_line = vertical_rule(cutoffs['BARD1'][0], 'red')
    func_lin = vertical_rule(cutoffs['BARD1'][1], 'blue')

    #Consequence categories shown in the legend, paired by position with their colors
    variant_types = ['Synonymous', 'Missense', 'Stop Gained', 'Splice Region']
    palette = [
        '#006616', # dark green
        '#81B4C7', # dusty blue
        '#ffcd3a', # yellow
        '#CFCFCF', # light gray
    ]

    #Residues are listed in the fixed order used for the y axis
    residue_order = ['C50', 'C53', 'C66', 'H68', 'C71', 'C74','C83', 'C86']

    score_axis = alt.X(
        'score:Q',
        axis = alt.Axis(title = 'SGE Score',
                        values = [-0.6, -0.4,-0.2,  0,  0.2],
                        titleFontSize = 20,
                        labelFontSize = 22),
    )
    residue_axis = alt.Y(
        'amino_acid:N',
        sort = residue_order,
        axis = alt.Axis(title = 'Residue',
                        titleFontSize = 20,
                        labelFontSize = 22),
    )
    consequence_color = alt.Color(
        'Consequence:N',
        sort = variant_types,
        scale = alt.Scale(domain = variant_types, range = palette),
        legend = alt.Legend(titleFontSize = 18, labelFontSize = 16),
    )
    hover_fields = [
        alt.Tooltip('Consequence', title = 'Consequence: '),
        alt.Tooltip('amino_acid_change', title = 'Amino Acid Change: '),
    ]
    chart_title = alt.TitleParams(text = 'Variants at BARD1 Zn-Binding Residues',
                                  fontSize = 22)

    ticks = alt.Chart(df).mark_tick(opacity = 1, thickness = 2)
    ticks = ticks.encode(x = score_axis,
                         y = residue_axis,
                         color = consequence_color,
                         tooltip = hover_fields)
    ticks = ticks.properties(width = 400, height = 300, title = chart_title)

    #Layer the cutoff rules over the ticks, then strip the grid and the view border
    plot = ticks + nf_line + func_lin
    plot = plot.configure_axis(grid = False)
    plot = plot.configure_view(stroke = None)

    plot.display()

    return plot

In [None]:
def main():
    """Run the whole analysis: load the Zn2+ binding residue tables, print the
    BARD1 missense counts, then draw the two-gene strip plot and the BARD1-only plot.
    """
    zn_tables = get_zn_residues(bard1_data, brca1_data)

    quick_missense_stats(zn_tables)
    strip_plot(zn_tables, cutoffs)

    #Kept in a variable so the figure can be saved with the line below
    bard1_plot = bard1_plot_only(zn_tables['BARD1'], cutoffs)

    #bard1_plot.save('/Users/ivan/Desktop/BARD1_draft_figs/fig_5c_ZnStrip.png', ppi = 500)

In [None]:
#Runs the analysis defined in main()
main()