In [None]:
import pandas as pd
import altair as alt

In [None]:
# Excel workbooks holding the per-variant scores; both paths are relative to
# the directory the notebook is run from.
bard1_data = '../Data/20250508_BARD1scores_update_FILTERED.xlsx'
brca1_data = '../Data/20240830_BRCA1_SGE_AllScores.xlsx'

# The BARD1 cutoffs are built as `value * factor + offset`.
# NOTE(review): this looks like a linear rescaling of two thresholds onto the
# plotted score scale -- the provenance of these numbers should be confirmed.
_BARD1_FACTOR = 0.028675
_BARD1_OFFSET = 0.009242
_BARD1_UNSCALED_CUTOFFS = (-3.438968, -3.018904)

# Per-gene pair of score cutoffs. strip_plot draws element 0 as a red rule and
# element 1 as a blue rule.
cutoffs = {
    'BARD1': [value * _BARD1_FACTOR + _BARD1_OFFSET for value in _BARD1_UNSCALED_CUTOFFS],
    'BRCA1': [-1.328, -0.748],
}

In [None]:
# Three-letter -> one-letter amino acid codes. In addition to the 20 standard
# residues, the HGVS tokens for a stop codon ('Ter' / '*') and for "no change"
# ('=') are included so nonsense and synonymous variants still get a label.
_AA_3TO1 = {
    'Ala': 'A', 'Arg': 'R', 'Asn': 'N', 'Asp': 'D', 'Cys': 'C',
    'Gln': 'Q', 'Glu': 'E', 'Gly': 'G', 'His': 'H', 'Ile': 'I',
    'Leu': 'L', 'Lys': 'K', 'Met': 'M', 'Phe': 'F', 'Pro': 'P',
    'Ser': 'S', 'Thr': 'T', 'Trp': 'W', 'Tyr': 'Y', 'Val': 'V',
    'Ter': '*', '*': '*', '=': '=',
}

# Zn2+ binding residues from Klevit lab structure paper
_BARD1_ZN_RESIDUES = ['C50', 'C53', 'C66', 'H68', 'C71', 'C74', 'C83', 'C86']
_BRCA1_ZN_RESIDUES = ['C24', 'C27', 'C44', 'C47', 'C39', 'H41', 'C61', 'C64']

# Ordered (match kind, pattern, display label) rules. They are applied one
# after another against the *current* column contents, so a row relabelled by
# an earlier rule is only touched again if its new label matches a later rule.
_CONSEQUENCE_RULES = [
    ('contains', 'missense', 'Missense'),
    ('equals', 'synonymous_variant', 'Synonymous'),
    ('equals', 'intron_variant', 'Intron'),
    ('equals', 'stop_gained', 'Stop Gained'),
    ('equals', 'stop_lost', 'Stop Lost'),
    ('contains', 'site', 'Canonical Splice'),
    ('contains', 'ing_var', 'Splice Region'),
    ('contains', 'UTR', 'UTR Variant'),
    ('equals', 'start_lost', 'Start Lost'),
]

# Single-residue protein change such as 'NP_x:p.Cys24Arg', 'p.Cys27Ter' or
# 'p.His41='. Anchored at the end so longer descriptions (e.g. frameshifts)
# do not match and are left as missing values.
_HGVS_PRO_PATTERN = r'p\.\(?(?P<ref>[A-Za-z]{3})(?P<pos>\d+)(?P<alt>[A-Za-z]{3}|=|\*)\)?$'


def _relabel_consequences(df):
    """Rewrite df['Consequence'] in place using the ordered _CONSEQUENCE_RULES."""
    for kind, pattern, label in _CONSEQUENCE_RULES:
        current = df['Consequence']
        if kind == 'contains':
            # na=False: rows without a consequence are simply left unlabelled
            # instead of making the boolean mask unusable.
            mask = current.str.contains(pattern, regex=False, na=False)
        else:
            mask = current == pattern
        df.loc[mask, 'Consequence'] = label


def _parse_hgvs_pro(hgvs_pro):
    """Return (residue, change) Series, e.g. ('C24', 'C24R'), parsed from HGVS protein strings.

    Entries that are missing or do not describe a single-residue change come
    back as missing values in both Series.
    """
    parts = hgvs_pro.str.extract(_HGVS_PRO_PATTERN)
    residue = parts['ref'].map(_AA_3TO1) + parts['pos']
    change = residue + parts['alt'].map(_AA_3TO1)
    return residue, change


def get_zn_residues(bard1, brca1):
    """Load the BARD1 and BRCA1 score tables and keep only variants at Zn2+-binding residues.

    Parameters
    ----------
    bard1 : path passed to ``pd.read_excel``. The sheet must provide the
        columns 'simplified_consequence', 'amino_acid_change' and 'score'.
        'amino_acid_change' is assumed to be a residue label followed by one
        character (e.g. 'C50R'); the last character is stripped to get the
        residue -- TODO confirm against the workbook.
    brca1 : path passed to ``pd.read_excel``. The sheet must provide the
        columns 'Consequence', 'hgvs_pro' and 'snv_score_minmax'.

    Returns
    -------
    dict
        ``{'BARD1': DataFrame, 'BRCA1': DataFrame}``. Each frame carries the
        columns 'amino_acid' (e.g. 'C50'), 'amino_acid_change' (e.g. 'C50R'),
        'Consequence' (display label) and 'score', restricted to the
        Zn2+-binding residues of that protein. Row indices of the source
        tables are preserved.

    Notes
    -----
    Rows whose protein annotation is missing or cannot be parsed are dropped
    by the residue filter rather than raising. For BRCA1, stop codons are
    labelled with '*' and synonymous changes with '=' (e.g. 'C27*', 'H41=').
    """
    # Renaming columns for consistency between the two tables
    bard1_df = pd.read_excel(bard1).rename(columns={'simplified_consequence': 'Consequence'})
    brca1_df = pd.read_excel(brca1).rename(columns={'snv_score_minmax': 'score'})

    for df in (bard1_df, brca1_df):
        _relabel_consequences(df)

    # BARD1: residue label = amino_acid_change minus its final character.
    # The .str accessor keeps missing values missing instead of raising.
    bard1_df['amino_acid'] = bard1_df['amino_acid_change'].str[:-1]
    bard1_df = bard1_df.loc[
        bard1_df['amino_acid'].isin(_BARD1_ZN_RESIDUES),
        ['amino_acid', 'amino_acid_change', 'Consequence', 'score'],
    ]

    # BRCA1: residue and substitution are derived from the HGVS protein notation.
    residue, change = _parse_hgvs_pro(brca1_df['hgvs_pro'])
    brca1_df['amino_acid'] = residue
    brca1_df['amino_acid_change'] = change
    brca1_df = brca1_df.loc[
        brca1_df['amino_acid'].isin(_BRCA1_ZN_RESIDUES),
        ['score', 'Consequence', 'amino_acid', 'amino_acid_change'],
    ]

    return {'BARD1': bard1_df, 'BRCA1': brca1_df}

In [None]:
def strip_plot(data_dfs, cutoffs):
    """Draw side-by-side strip plots of variant scores at the Zn-binding residues.

    Parameters
    ----------
    data_dfs : dict
        Must contain the keys 'BARD1' and 'BRCA1', each mapping to a DataFrame
        with the columns 'score', 'amino_acid', 'Consequence' and
        'amino_acid_change' (the structure returned by get_zn_residues).
    cutoffs : dict
        Must contain the keys 'BARD1' and 'BRCA1', each mapping to a sequence
        whose element 0 is drawn as a red vertical rule and element 1 as a
        blue vertical rule.
        NOTE(review): presumably the non-functional / functional score
        thresholds -- confirm.

    Returns
    -------
    None
        The concatenated chart is rendered with ``.display()``.
    """
    genes = ['BARD1', 'BRCA1']

    # Get the category10 colors
    category10_colors = [
        '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
        '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf'
    ]

    # Map each category to a specific color. The keys are the display labels
    # that get_zn_residues writes into 'Consequence' ('Canonical Splice',
    # 'Splice Region', 'UTR Variant', ...), so every label it can produce has
    # an entry in the explicit color domain.
    color_mapping = {
        "Intron": category10_colors[0],
        "Synonymous": category10_colors[2],
        "Stop Gained": category10_colors[3],
        "Missense": category10_colors[1],
        "Canonical Splice": category10_colors[4],
        "Splice Region": category10_colors[8],
        "Start Lost": category10_colors[5],
        "Stop Lost": category10_colors[6],
        "UTR Variant": category10_colors[7]
    }
    sort_order = ["Intron", "Missense", "Synonymous", "Stop Gained",
                  "Canonical Splice", "Splice Region", "Start Lost",
                  "Stop Lost", "UTR Variant"]

    def cutoff_rule(value, color):
        """Single vertical rule at x = value."""
        return alt.Chart(pd.DataFrame({'x': [value]})).mark_rule(color = color).encode(
            x = 'x')

    plots = []
    for gene in genes:
        df = data_dfs[gene]
        nf_line = cutoff_rule(cutoffs[gene][0], 'red')
        func_line = cutoff_rule(cutoffs[gene][1], 'blue')

        # One tick per variant: score on x, one row per residue on y.
        ticks = alt.Chart(df).mark_tick(opacity = 1).encode(
            x = alt.X('score:Q',
                      axis = alt.Axis(title = '',
                                      titleFontSize = 20,
                                      labelFontSize = 24)
                     ),
            y = alt.Y('amino_acid:N',
                      sort = 'ascending',
                      axis = alt.Axis(title = '',
                                      labelFontSize = 24)
                     ),
            color = alt.Color('Consequence:N',
                              sort = sort_order,
                              scale = alt.Scale(domain = list(color_mapping.keys()),
                                                range = list(color_mapping.values()))
                             ),
            tooltip = [alt.Tooltip('Consequence', title = 'Consequence: '),
                       alt.Tooltip('amino_acid_change', title = 'Amino Acid Change: ')]
        ).properties(
            width = 800,
            height = 400,
            title = 'Variants at ' + gene + ' Zn-Binding Residues'
        )

        # Layer the two cutoff rules on top of the ticks.
        plots.append(ticks + nf_line + func_line)

    # Panels side by side, in the order given by `genes`.
    final_plot = alt.hconcat(*plots).configure_tick(
        thickness = 2
    )
    final_plot.display()

In [None]:
def main():
    """Load the Zn-binding-residue variant tables, print them, and draw the strip plots.

    Reads the module-level ``bard1_data`` / ``brca1_data`` paths and the
    ``cutoffs`` dict; returns nothing.
    """
    zn_variants = get_zn_residues(bard1_data, brca1_data)
    print(zn_variants)
    strip_plot(zn_variants, cutoffs)

In [None]:
# Run the pipeline: load both score tables, print the filtered frames, render the plots.
main()