In [None]:
import pandas as pd
import altair as alt
from scipy import stats
import numpy as np
from natsort import natsorted

In [None]:
#Input workbooks, given relative to the notebook's working directory
brca1_data = '../Data/BRCA1_SGE_data.xlsx'
bard1_data = '../Data/BARD1_SGE_final_table.xlsx'

#Estimated thresholds for data from Findlay et al. 2018:
#scores >= brca1_int_cutoff are called functionally normal and
#scores <= brca1_path_cutoff functionally abnormal (see read_data)
brca1_int_cutoff, brca1_path_cutoff = -0.748, -1.328

#Domain to analyse; read_data handles 'RING', 'RING_loop' and 'BRCT'
domain = 'BRCT'

In [None]:
def convert_to_one(hgvs_str):
    """Rewrite a three-letter protein substitution in one-letter notation.

    The text after the first '.' in ``hgvs_str`` is read as
    ``<Ref3><position><Alt3>``; for example ``'p.Ser1655Phe'`` becomes
    ``'p.S1655F'``. A residue code that is not one of the 20 standard
    three-letter codes raises ``KeyError``.
    """
    #Three-letter codes paired position-by-position with their one-letter codes
    three_letter = ('Ala', 'Arg', 'Asn', 'Asp', 'Cys',
                    'Gln', 'Glu', 'Gly', 'His', 'Ile',
                    'Leu', 'Lys', 'Met', 'Phe', 'Pro',
                    'Ser', 'Thr', 'Trp', 'Tyr', 'Val')
    one_letter = 'ARNDCQEGHILKMFPSTWYV'
    code_map = dict(zip(three_letter, one_letter))

    body = hgvs_str.split('.')[1]
    ref_code, position, alt_code = body[:3], body[3:-3], body[-3:]

    return f'p.{code_map[ref_code]}{position}{code_map[alt_code]}'

In [None]:
def read_data(bard1, brca1, brca1_int, brca1_path_max, domain):
    """Load BARD1 and BRCA1 SGE scores and keep missense variants in one domain.

    Parameters
    ----------
    bard1 : path of the BARD1 workbook; its 'scores' sheet is read.
    brca1 : path of the BRCA1 workbook; its 'findlay_2018' and 'dace_2025'
        sheets are read.
    brca1_int : 2018 variants with 'snv_score_minmax' >= this value are
        labelled 'functionally_normal'.
    brca1_path_max : 2018 variants with 'snv_score_minmax' <= this value are
        labelled 'functionally_abnormal'.
    domain : one of 'RING', 'RING_loop' or 'BRCT'.

    Returns
    -------
    (bard1_df, brca1_df)
        bard1_df holds the non-WARN missense rows of the BARD1 sheet whose
        residue number lies in the domain, with an integer 'AApos' column.
        brca1_df is the 2018 missense rows stacked on top of the 2025 missense
        rows for the domain, with the score column named 'score'. 2018 rows at
        positions that also occur in the 2025 data are dropped in favour of
        the 2025 rows.

    Raises
    ------
    ValueError
        If ``domain`` is not a supported name. The check runs before any file
        is opened.
    """
    #(BRCA1 positions matched against 'pos'/'CDSpos', BARD1 residue numbers) per domain
    domain_regions = {
        'RING': (list(range(1, 302)), list(range(26, 123))),
        #RING domains without the large helices (BRCA1 residues 23 - 79, BARD1 residues 48 - 96)
        'RING_loop': (list(range(67, 238)), list(range(48, 97))),
        'BRCT': (list(range(4936, 5566)), list(range(568, 778))),
    }
    if domain not in domain_regions:
        raise ValueError(
            f"Unknown domain {domain!r}; expected one of {sorted(domain_regions)}"
        )
    brca1_region, bard1_region = domain_regions[domain]

    bard1_df = pd.read_excel(bard1, sheet_name = 'scores')
    bard1_df = bard1_df.loc[~bard1_df['variant_qc_flag'].isin(['WARN'])] #Filters out WARN variants in same codon as fixed edit
    brca1_df = pd.read_excel(brca1, sheet_name = 'findlay_2018')

    #.copy() after each filter so the column assignments below act on an
    #independent frame instead of a slice (avoids SettingWithCopyWarning)
    brca1_df = brca1_df.loc[brca1_df['pos'].isin(brca1_region)].copy() #2018 variants inside the domain
    brca1_df['functional_consequence'] = 'indeterminate' #Default class between the two thresholds

    #Puts variants into functional classes
    brca1_df.loc[brca1_df['snv_score_minmax'] <= brca1_path_max, 'functional_consequence'] = 'functionally_abnormal'
    brca1_df.loc[brca1_df['snv_score_minmax'] >= brca1_int, 'functional_consequence'] = 'functionally_normal'
    brca1_df = brca1_df.loc[brca1_df['Consequence'].isin(['missense_variant'])].copy() #Filters for missense variants

    #'hgvs_pro' is '<prefix>:p.<Ref3><pos><Alt3>'; drop the prefix, pull out the residue number
    protein_change = brca1_df['hgvs_pro'].transform(lambda x: x.split(':')[1])
    brca1_df['AApos'] = protein_change.transform(lambda x: int(x.split('.')[1][3:-3]))
    brca1_df['hgvs_pro'] = protein_change.apply(convert_to_one)

    new_brca1 = pd.read_excel(brca1, sheet_name = 'dace_2025') #Reads Dace 2025 BRCA1 data
    new_brca1 = new_brca1.loc[new_brca1['Consequence'].isin(['Missense'])].copy() #Filters for missense variants
    new_brca1['CDSpos'] = new_brca1['CDSpos'].astype(int)
    new_brca1 = new_brca1.loc[new_brca1['CDSpos'].isin(brca1_region)].copy() #Filters for variants in specified domain only

    new_brca1['target'] = 'BRCA1_X2' #Placeholder column for downstream concatenation
    new_brca1['functional_consequence'] = 'indeterminate' #Sets functional consequence column
    new_brca1.loc[new_brca1['function_class'] == 'LoF', 'functional_consequence'] = 'functionally_abnormal' #LoF variants are set as functionally abnormal
    new_brca1.loc[new_brca1['function_class'] == 'Neutral', 'functional_consequence'] = 'functionally_normal' #Neutral variants set to functionally normal
    new_brca1_pos = list(set(new_brca1['CDSpos'].tolist())) #Gets positions in new data set
    new_brca1 = new_brca1.rename(columns = {'CDSpos': 'pos', 'final_function_score': 'snv_score_minmax', 'protPos': 'AApos', 'pHGVS': 'hgvs_pro'}) #Renames columns for concatenation

    new_brca1['AApos'] = new_brca1['AApos'].astype(int)
    brca1_df = brca1_df.loc[~(brca1_df['pos'].isin(new_brca1_pos))] #Variants in 2018 dataset found in 2025 dataset are removed
    shared_columns = ['target', 'pos', 'Consequence', 'snv_score_minmax', 'functional_consequence', 'AApos', 'hgvs_pro']
    brca1_df = pd.concat([brca1_df, new_brca1[shared_columns]]) #2018 and 2025 BRCA1 datasets are concatenated
    brca1_df = brca1_df.rename(columns = {'snv_score_minmax': 'score'})

    #Processing BARD1 data
    bard1_df = bard1_df.loc[bard1_df['consequence'].isin(['missense_variant'])].copy() #Missense variants only
    #'amino_acid_change' looks like 'G576V'; the characters between the two letters are the residue number
    bard1_df['AApos'] = bard1_df['amino_acid_change'].transform(lambda x: x[1:-1]).astype(int)
    #Copy so callers that add or modify columns do not write into a slice
    bard1_df = bard1_df.loc[bard1_df['AApos'].isin(bard1_region)].copy() #Gets variants in region of interest only

    return bard1_df, brca1_df

In [None]:
def missense_sensitivity(bard1, brca1):
    """Tabulate missense sensitivity between BRCA1 and BARD1.

    For each frame, counts the 'functionally_abnormal' and
    'functionally_normal' entries of its 'functional_consequence' column and
    divides each count by the total number of rows (so rows in any other
    class still count towards the denominator). Runs Fisher's exact test on
    the 2x2 table of counts (rows: BRCA1, BARD1; columns: abnormal, normal),
    prints the p-value and odds ratio, displays the summary table and
    returns it.

    The returned DataFrame has columns 'Gene', 'Percent of Variants'
    (stored as fractions) and 'Type'.
    """
    def _tally(frame):
        #One value_counts pass gives both class counts for this gene
        class_counts = frame['functional_consequence'].value_counts()
        n_abnormal = class_counts.get('functionally_abnormal',0)
        n_normal = class_counts.get('functionally_normal',0)
        n_total = len(frame)
        return n_abnormal, n_normal, n_abnormal / n_total, n_normal / n_total

    bard1_sensitive, bard1_normal, bard1_sensitivity, bard1_normal_percent = _tally(bard1)
    brca1_sensitive, brca1_normal, brca1_sensitivity, brca1_normal_percent = _tally(brca1)

    #Contingency table for Fisher exact testing
    counts = np.array([
        [brca1_sensitive, brca1_normal],
        [bard1_sensitive, bard1_normal],
    ])
    odds_ratio, p_value = stats.fisher_exact(counts)

    print(f"Fisher's exact test p-value: {p_value}")
    print(f"Odds ratio: {odds_ratio}")

    #Long-format table passed on for plotting: one row per gene and class
    genes = ['BRCA1', 'BARD1','BRCA1', 'BARD1']
    fractions = [brca1_sensitivity,bard1_sensitivity, brca1_normal_percent, bard1_normal_percent]
    labels = ['Abnormal Missense', 'Abnormal Missense', 'Normal Missense', 'Normal Missense']
    to_plot = pd.DataFrame({
        'Gene': genes,
        'Percent of Variants': fractions,
        'Type': labels,
    })

    display(to_plot)
    return to_plot
    

In [None]:
def missense_resi(bard1_df, brca1_df):
    """Print how many residues carry at least one functionally abnormal variant.

    Each frame needs 'functional_consequence' and 'AApos' columns. Variants
    are collapsed to one row per 'AApos' by taking the most severe class
    (order: functionally_abnormal < indeterminate < functionally_normal), so
    a residue counts as sensitive when any of its variants is abnormal. The
    count of sensitive residues and its fraction of all residues present are
    printed for both genes. Nothing is returned.

    The input frames are not modified; the categorical conversion is done on
    copies so other cells that reuse the same frames see them unchanged.
    """
    #Lowest category = most severe, so a per-residue 'min' picks the worst class
    severity_order = ['functionally_abnormal', 'indeterminate', 'functionally_normal']

    def _worst_class_per_residue(df):
        #Collapse variants to one row per residue, keeping the most severe class
        per_variant = df[['functional_consequence', 'AApos']].copy()
        per_variant['functional_consequence'] = pd.Categorical(
            per_variant['functional_consequence'],
            categories = severity_order,
            ordered = True,
        )
        return per_variant.groupby('AApos').agg({
            'functional_consequence': 'min'
        }).reset_index()

    bard1_resi = _worst_class_per_residue(bard1_df)
    brca1_resi = _worst_class_per_residue(brca1_df)

    bard1_sensitive_count = bard1_resi['functional_consequence'].value_counts().get('functionally_abnormal', 0)
    brca1_sensitive_count = brca1_resi['functional_consequence'].value_counts().get('functionally_abnormal',0)

    brca1_perc = brca1_sensitive_count / len(brca1_resi)
    bard1_perc = bard1_sensitive_count / len(bard1_resi)

    print('BARD1: ', str(bard1_sensitive_count), '\n',
            'BRCA1: ', str(brca1_sensitive_count), '\n',
          'BARD1 %: ', str(bard1_perc), '\n',
          'BRCA1 %: ', str(brca1_perc)
         )

In [None]:
def bar_plot(df): #Simple bar plot visualization
    """Draw a faceted bar chart of variant percentages per gene.

    ``df`` needs the columns 'Gene', 'Percent of Variants' (given as
    fractions; they are multiplied by 100 for plotting) and 'Type' (one
    facet column per value). The chart is displayed; nothing is returned.

    The scaling is applied to a copy, so ``df`` is left untouched and
    re-running the cell does not multiply the values by 100 a second time.
    """
    #Fractions -> percentages on a copy of the caller's frame
    plot_df = df.assign(**{'Percent of Variants': df['Percent of Variants'] * 100})

    plot = alt.Chart(plot_df).mark_bar(color = 'white',
                                 stroke = 'black').encode(
        x = alt.X('Gene',
                  axis = alt.Axis(title = '',
                                 labelFontSize = 16,
                                 titleFontSize = 18,
                                  labelAngle = 0
                                 )
                 ),
        y = alt.Y('Percent of Variants',
                  axis = alt.Axis(title = '% LoF Missense Vars.',
                                  labelFontSize = 16,
                                  titleFontSize = 18
                                 )
                 ),
        column = alt.Column('Type', 
                            title = '',
                            header = alt.Header(
                                titleFontSize = 18
                            )
                           )
    ).properties(
        width = 125,
        height = 330
    ).configure_axis(
        grid = False
    ).configure_view(
        stroke = None
    )

    
    #plot.save('/Users/ivan/Desktop/BARD1_draft_figs/supp_figs/suppfig_BRCTcomparison_barplot.png', ppi = 500)
    plot.display()

In [None]:
def heatmap(df, gene):
    """Draw, display and return a residue-by-property heatmap of SGE scores.

    Parameters
    ----------
    df : frame with the columns 'aa_plot' (x axis, residue label),
        'type_change' (y axis) and 'score' (cell colour).
    gene : 'BRCA1' or 'BARD1'; selects the colour-scale limits, the order of
        residues on the x axis, and the chart title.

    Returns
    -------
    The altair chart, after it has been displayed.

    Raises
    ------
    ValueError
        If ``gene`` is not 'BRCA1' or 'BARD1'.
    """
    #gene -> (colour-scale lower limit, upper limit, x-axis residue order)
    gene_settings = {
        'BRCA1': (-1.5, 0, ['S1655', 'G1656', 'T1700', 'K1702']),
        'BARD1': (-0.2, 0, ['S575', 'G576', 'T617', 'K619']),
    }
    if gene not in gene_settings:
        raise ValueError(
            f"Unknown gene {gene!r}; expected one of {sorted(gene_settings)}"
        )
    #Named score_min/score_max so the builtins min() and max() are not shadowed
    score_min, score_max, sort_order = gene_settings[gene]

    color_domain = [score_min, score_max]
    
    plot = alt.Chart(df).mark_rect().encode(
        x = alt.X('aa_plot:O',
                  title = '',
                  sort = sort_order,
                  axis = alt.Axis(
                      labelAngle = 0,
                      ticks = False,
                      labelFontSize = 16
                  )
                 ),
        y = alt.Y('type_change:N',
                  title = '',
                  axis = alt.Axis(ticks = False,
                                  labelFontSize = 16
                                 )
                 ),
        color = alt.Color('score:Q',
                         scale = alt.Scale(
                             scheme = 'bluepurple',
                             domain = color_domain,
                             clamp = True, #Scores outside the domain take the end colours
                             reverse = True
                         ),
                          legend = alt.Legend(
                              title = 'SGE Score',
                              titleFontSize = 18,
                              labelFontSize = 14
                          )
                         )
    ).properties(
        width = 250,
        height = 300,
        title = alt.TitleParams(text = gene,
                                fontSize = 20
                               )
    ).configure_view(
        stroke = None
    )

    plot.display()
    return plot

In [None]:
def phosphosite_heatmap(bard1_df, brca1_df):
    """Plot median SGE scores at four listed residues of each gene, grouped by
    the physicochemical class of the substituted amino acid.

    Parameters
    ----------
    bard1_df : frame with 'AApos', 'amino_acid_change' (e.g. 'G576V') and
        'score' columns.
    brca1_df : frame with 'AApos', 'hgvs_pro' (e.g. 'p.S1655F') and 'score'
        columns.

    Returns
    -------
    (brca1_map, bard1_map) : the two charts returned by ``heatmap``.

    Neither input frame is modified. Hard-coded, re-annotated rows for BARD1
    G576 and T617 are appended to the BARD1 data before plotting.
    """
    bard1_phosphosite = [575, 576, 617, 619]
    brca1_phosphosite = [1655, 1656, 1700, 1702]

    #Filtering yields new frames; the BRCA1 one is copied explicitly because
    #columns are added to it below (avoids SettingWithCopyWarning)
    bard1_sites = bard1_df.loc[bard1_df['AApos'].isin(bard1_phosphosite)]
    brca1_sites = brca1_df.loc[brca1_df['AApos'].isin(brca1_phosphosite)].copy()

    #WARN reannotation for G576 and T617
    #NOTE(review): the three T617T rows are synonymous; they are pooled into the
    #T617 'Polar' cell together with T617S. Confirm this is intended.
    bard1_warn_reannotated = pd.DataFrame({'pos_id': ['214745805:A','214745805:G', '214745806:A', '214745806:G', '214745806:T' ,'214745805:T', '214745119:A', '214745119:C', '214745119:T', '214745121:A', '214745121:C', '214745121:G'],
                                           'score': [-0.109977, -0.0175886, -0.0298891, -0.00470975, -0.0413407, 0.0208659, -0.00814722, -0.00228625, 0.00051614, -0.00129351, 0.018338, -0.0955636],
                                           'amino_acid_change': ['G576V', 'G576A', 'G576C', 'G576R', 'G576S', 'G576D', 'T617T', 'T617T', 'T617T', 'T617S', 'T617A', 'T617P'],
                                           'functional_consequence': ['functionally_abnormal', 'functionally_normal', 'functionally_normal', 'functionally_normal', 'indeterminate', 'functionally_normal', 'functionally_normal', 'functionally_normal', 'functionally_normal', 'functionally_normal', 'functionally_normal', 'functionally_abnormal'],
                                           'AApos': [576, 576, 576, 576, 576, 576, 617, 617, 617, 617, 617, 617]
                                          })

    #drop=True: the old row labels carry no meaning and should not become an 'index' column
    bard1_sites = pd.concat([bard1_sites, bard1_warn_reannotated]).reset_index(drop=True)

    #Original (oAA) and substituted (nAA) residues as one-letter codes
    bard1_sites['nAA'] = bard1_sites['amino_acid_change'].transform(lambda x: x[-1])
    bard1_sites['oAA'] = bard1_sites['amino_acid_change'].transform(lambda x: x[0])
    brca1_sites['nAA'] = brca1_sites['hgvs_pro'].transform(lambda x: x[-1])
    brca1_sites['oAA'] = brca1_sites['hgvs_pro'].transform(lambda x: x.split('.')[1][0])

    #x-axis label, e.g. 'S575'
    bard1_sites['aa_plot'] = bard1_sites['oAA'] + bard1_sites['AApos'].astype(str)
    brca1_sites['aa_plot'] = brca1_sites['oAA'] + brca1_sites['AApos'].astype(str)

    aa_properties = {
        # Positive charge (basic)
        'R': 'Positive',  # Arginine
        'K': 'Positive',  # Lysine
        'H': 'Positive',  # Histidine (can be positive at physiological pH)
        
        # Negative charge (acidic)
        'D': 'Negative',  # Aspartic acid
        'E': 'Negative',  # Glutamic acid
        
        # Polar uncharged
        'S': 'Polar',     # Serine
        'T': 'Polar',     # Threonine
        'N': 'Polar',     # Asparagine
        'Q': 'Polar',     # Glutamine
        'Y': 'Polar',     # Tyrosine
        'C': 'Polar',     # Cysteine (can form disulfide bonds)
        
        # Hydrophobic (nonpolar)
        'A': 'Hydrophobic',  # Alanine
        'V': 'Hydrophobic',  # Valine
        'I': 'Hydrophobic',  # Isoleucine
        'L': 'Hydrophobic',  # Leucine
        'M': 'Hydrophobic',  # Methionine
        'F': 'Hydrophobic',  # Phenylalanine
        'W': 'Hydrophobic',  # Tryptophan
        
        # Special cases
        'G': 'Special',   # Glycine (smallest, flexible)
        'P': 'Special',   # Proline (helix breaker, rigid)
    }

    #Class of the substituted residue; substitutions not in the table map to NaN
    #and are dropped by the groupby below
    bard1_sites['type_change'] = bard1_sites['nAA'].map(aa_properties)
    brca1_sites['type_change'] = brca1_sites['nAA'].map(aa_properties)

    def _median_by_class(sites):
        #One heatmap cell per (residue, property class): the median score
        return sites.groupby(['aa_plot', 'type_change']).agg({
            'score': 'median'}
        ).reset_index()

    brca1_map = heatmap(_median_by_class(brca1_sites), 'BRCA1')
    bard1_map = heatmap(_median_by_class(bard1_sites), 'BARD1')

    return brca1_map, bard1_map

In [None]:
def main():
    """Load the data for the configured domain and draw the phosphosite heatmaps.

    Reads the workbook paths, BRCA1 thresholds and ``domain`` from the
    notebook-level configuration. The heatmaps are only produced when
    ``domain`` is 'BRCT'; for any other domain the function stops after
    loading the data.
    """
    bard1, brca1 = read_data(
        bard1_data,
        brca1_data,
        brca1_int_cutoff,
        brca1_path_cutoff,
        domain,
    )
    #to_plot = missense_sensitivity(bard1,brca1)
    #missense_resi(bard1,brca1)
    #bar_plot(to_plot)

    if domain != 'BRCT':
        return

    brca1_map, bard1_map = phosphosite_heatmap(bard1, brca1)

    #brca1_map.save('/Users/ivan/Desktop/BARD1_draft_figs/fig_5_brca1_phophomap.png', ppi = 400)
    #bard1_map.save('/Users/ivan/Desktop/BARD1_draft_figs/fig_5_bard1_phophomap.png', ppi = 400)

In [None]:
#Runs the analysis with the configuration defined at the top of the notebook
main()