In [None]:
import pandas as pd
import altair as alt
from scipy import stats
import numpy as np

In [None]:
#Input Excel workbooks (relative paths)
bard1_data = '../Data/BARD1_SGE_final_table.xlsx'
brca1_data = '../Data/BRCA1_SGE_data.xlsx'

#Estimated thresholds for data from Findlay et al. 2018
#read_data() labels scores <= brca1_path_cutoff as 'functionally_abnormal' and
#scores >= brca1_int_cutoff as 'functionally_normal'; scores in between stay 'indeterminate'
brca1_int_cutoff = -0.748
brca1_path_cutoff = -1.328

#Domain to analyze; read_data() recognizes 'RING', 'RING_loop' and 'BRCT'
domain = 'RING_loop'

In [None]:
def read_data(bard1, brca1, brca1_int, brca1_path_max, domain):
    """Load BARD1 and BRCA1 SGE missense variants restricted to one domain.

    Parameters
    ----------
    bard1 : path of the BARD1 workbook; the 'scores' sheet is read.
    brca1 : path of the BRCA1 workbook; the 'findlay_2018' and 'dace_2025'
        sheets are read.
    brca1_int : 2018 BRCA1 scores >= this value are labelled
        'functionally_normal'.
    brca1_path_max : 2018 BRCA1 scores <= this value are labelled
        'functionally_abnormal'. Scores between the two cutoffs stay
        'indeterminate'.
    domain : one of 'RING', 'RING_loop' or 'BRCT'.

    Returns
    -------
    (bard1_df, brca1_df) : tuple of DataFrames
        Missense variants in the requested domain. Both frames carry an
        integer 'AApos' column. brca1_df is the 2018 data with every position
        that also appears in the 2025 data removed, followed by the 2025 rows.

    Raises
    ------
    ValueError
        If ``domain`` is not a recognized domain name. The check runs before
        any file is opened.
    """
    #(BRCA1 positions, BARD1 residue numbers) for each supported domain
    domain_regions = {
        'RING': (range(1, 302), range(26, 123)),
        'RING_loop': (range(67, 238), range(48, 97)), #RING without the large helices: BRCA1 residues 23 - 79, BARD1 residues 48 - 96
        'BRCT': (range(4936, 5566), range(568, 778)),
    }
    if domain not in domain_regions:
        #Previously an unknown domain only failed later with an unbound-variable error
        raise ValueError(
            f"Unknown domain {domain!r}; expected one of {sorted(domain_regions)}"
        )
    brca1_region, bard1_region = (list(region) for region in domain_regions[domain])

    bard1_df = pd.read_excel(bard1, sheet_name = 'scores')
    bard1_df = bard1_df.loc[~bard1_df['variant_qc_flag'].isin(['WARN'])] #Filters out WARN variants in same codon as fixed edit
    brca1_df = pd.read_excel(brca1, sheet_name = 'findlay_2018')

    #Findlay 2018 data in the domain. 'pos' is compared against the same region
    #as the 2025 'CDSpos' column, so it is presumably a CDS coordinate - TODO confirm.
    #.copy() makes the filtered frame independent so the column assignments
    #below never write into a view of the parent frame.
    brca1_df = brca1_df.loc[brca1_df['pos'].isin(brca1_region)].copy()
    brca1_df['functional_consequence'] = 'indeterminate' #Default class

    #Puts variants into functional classes
    brca1_df.loc[brca1_df['snv_score_minmax'] <= brca1_path_max, 'functional_consequence'] = 'functionally_abnormal'
    brca1_df.loc[brca1_df['snv_score_minmax'] >= brca1_int, 'functional_consequence'] = 'functionally_normal'
    brca1_df = brca1_df.loc[brca1_df['Consequence'].isin(['missense_variant'])].copy() #Filters for missense variants
    #Residue number = text after ':' and 'p.', minus the two 3-letter amino acid codes
    brca1_df['AApos'] = brca1_df['hgvs_pro'].transform(lambda x: int(x.split(':')[1].split('.')[1][3:-3]))

    new_brca1 = pd.read_excel(brca1, sheet_name = 'dace_2025') #Reads Dace 2025 BRCA1 data
    new_brca1 = new_brca1.loc[new_brca1['Consequence'].isin(['Missense'])].copy() #Filters for missense variants
    new_brca1['CDSpos'] = new_brca1['CDSpos'].astype(int)
    new_brca1 = new_brca1.loc[new_brca1['CDSpos'].isin(brca1_region)].copy() #Filters for variants in specified domain only

    new_brca1['target'] = 'BRCA1_X2' #Placeholder column for downstream concatenation
    new_brca1['functional_consequence'] = 'indeterminate' #Default class
    new_brca1.loc[new_brca1['function_class'] == 'LoF', 'functional_consequence'] = 'functionally_abnormal' #LoF variants are set as functionally abnormal
    new_brca1.loc[new_brca1['function_class'] == 'Neutral', 'functional_consequence'] = 'functionally_normal' #Neutral variants are set as functionally normal
    new_brca1_pos = list(set(new_brca1['CDSpos'].tolist())) #Positions covered by the 2025 data
    new_brca1 = new_brca1.rename(columns = {'CDSpos': 'pos', 'final_function_score': 'snv_score_minmax', 'protPos': 'AApos'}) #Renames columns for concatenation

    new_brca1['AApos'] = new_brca1['AApos'].astype(int)
    brca1_df = brca1_df.loc[~(brca1_df['pos'].isin(new_brca1_pos))] #2018 variants at positions present in the 2025 data are removed
    brca1_df = pd.concat([brca1_df, new_brca1[['target','pos','Consequence','snv_score_minmax', 'functional_consequence', 'AApos']]]) #2018 and 2025 BRCA1 datasets are concatenated

    #Processing BARD1 data
    bard1_df = bard1_df.loc[bard1_df['consequence'].isin(['missense_variant'])].copy() #Missense variants only
    bard1_df['AApos'] = bard1_df['amino_acid_change'].transform(lambda x: x[1:-1]) #Drops first and last character to get the amino acid position
    bard1_df['AApos'] = bard1_df['AApos'].astype(int) #Sets data type as integer
    bard1_df = bard1_df.loc[bard1_df['AApos'].isin(bard1_region)] #Gets variants in region of interest only

    return bard1_df, brca1_df

In [None]:
def missense_sensitivity(bard1, brca1): #Tabulates missense sensitivity between BRCA1 and BARD1
    """Compare abnormal vs. normal missense calls between BRCA1 and BARD1.

    Both inputs need a 'functional_consequence' column. The function prints
    the p-value and odds ratio of a Fisher's exact test on the 2x2 table of
    abnormal/normal counts (BRCA1 row first), displays the summary table and
    returns it. The fractions use every row of each frame as denominator,
    so variants with any other label still count toward the total.
    """

    def _tally(variants):
        #Counts and fractions of abnormal / normal calls in one frame
        call_counts = variants['functional_consequence'].value_counts()
        n_abnormal = call_counts.get('functionally_abnormal',0)
        n_normal = call_counts.get('functionally_normal',0)
        n_total = len(variants)
        return n_abnormal, n_normal, n_abnormal / n_total, n_normal / n_total

    bard1_abn, bard1_norm, bard1_abn_frac, bard1_norm_frac = _tally(bard1)
    brca1_abn, brca1_norm, brca1_abn_frac, brca1_norm_frac = _tally(brca1)

    #2x2 table for Fisher's exact test: rows are genes, columns are abnormal / normal
    table = np.array([
        [brca1_abn, brca1_norm],
        [bard1_abn, bard1_norm],
    ])
    odds_ratio, p_value = stats.fisher_exact(table)

    print(f"Fisher's exact test p-value: {p_value}")
    print(f"Odds ratio: {odds_ratio}")

    #Long-format table handed on for plotting
    genes = ['BRCA1', 'BARD1','BRCA1', 'BARD1']
    fractions = [brca1_abn_frac, bard1_abn_frac, brca1_norm_frac, bard1_norm_frac]
    call_types = ['Abnormal Missense', 'Abnormal Missense', 'Normal Missense', 'Normal Missense']
    to_plot = pd.DataFrame({
        'Gene': genes,
        'Percent of Variants': fractions,
        'Type': call_types
    })

    display(to_plot)
    return to_plot
    

In [None]:
def missense_resi(bard1_df, brca1_df):
    """Print how many residues carry at least one functionally abnormal missense variant.

    Each frame needs 'functional_consequence' and 'AApos' columns. Every
    residue (unique 'AApos') is reduced to its most severe call, using the
    order functionally_abnormal < indeterminate < functionally_normal. The
    count and fraction of residues whose most severe call is
    'functionally_abnormal' are printed for BARD1 and BRCA1.

    The input frames are not modified: the ordered categorical is built on a
    local copy instead of being written back into the caller's frames.
    """
    severity_order = ['functionally_abnormal', 'indeterminate', 'functionally_normal']

    def _most_severe_call_per_residue(variants):
        #One row per AApos holding the minimum (= most severe) ordered call
        calls = variants[['functional_consequence', 'AApos']].copy()
        calls['functional_consequence'] = pd.Categorical(calls['functional_consequence'], categories = severity_order, ordered=True)
        return calls.groupby('AApos').agg({
            'functional_consequence': 'min'
        }).reset_index()

    #Disabled variants of this analysis that were previously kept as commented-out code:
    #  - relabel 'indeterminate' BRCA1 variants as 'functionally_abnormal' before grouping
    #  - restrict BRCA1 to AApos in [20, 26, 27, 28, 51, 52, 54, 55, 63, 64, 65, 66, 68, 104]

    bard1_resi = _most_severe_call_per_residue(bard1_df)
    brca1_resi = _most_severe_call_per_residue(brca1_df)

    bard1_sensitive_count = bard1_resi['functional_consequence'].value_counts().get('functionally_abnormal', 0)
    brca1_sensitive_count = brca1_resi['functional_consequence'].value_counts().get('functionally_abnormal',0)

    #Fraction of residues (not variants) with an abnormal call
    brca1_perc = brca1_sensitive_count / len(brca1_resi)
    bard1_perc = bard1_sensitive_count / len(bard1_resi)

    print('BARD1: ', str(bard1_sensitive_count), '\n',
            'BRCA1: ', str(brca1_sensitive_count), '\n',
          'BARD1 %: ', str(bard1_perc), '\n',
          'BRCA1 %: ', str(brca1_perc)
         )

In [None]:
def bar_plot(df): #Simple bar plot visualization
    """Draw a faceted bar chart of the per-gene variant fractions.

    ``df`` needs the columns 'Gene', 'Percent of Variants' (as a fraction;
    it is multiplied by 100 for plotting) and 'Type'. One panel is drawn per
    'Type' value, with one bar per 'Gene'. The chart is displayed and nothing
    is returned.

    The scaling is done on a copy, so ``df`` is left untouched and the
    function can be called repeatedly on the same table without the values
    growing by a factor of 100 each time.
    """

    plot_df = df.assign(**{'Percent of Variants': df['Percent of Variants'] * 100})

    plot = alt.Chart(plot_df).mark_bar(color = 'white',
                                 stroke = 'black').encode(
        x = alt.X('Gene',
                  axis = alt.Axis(title = '',
                                 labelFontSize = 16,
                                 titleFontSize = 18,
                                  labelAngle = 0
                                 )
                 ),
        y = alt.Y('Percent of Variants',
                  axis = alt.Axis(title = '% LoF Missense Vars.',
                                  labelFontSize = 16,
                                  titleFontSize = 18
                                 )
                 ),
        column = alt.Column('Type', 
                            title = '',
                            header = alt.Header(
                                titleFontSize = 18
                            )
                           )
    ).properties(
        width = 125,
        height = 330
    ).configure_axis(
        grid = False
    ).configure_view(
        stroke = None
    )

    #To export the figure, call plot.save(<output path>, ppi = 500) here
    plot.display()

In [None]:
def main():
    """Run the analysis: load the data, print both comparisons, draw the bar plot.

    Inputs come from the module-level settings (workbook paths, BRCA1 score
    cutoffs and the selected domain).
    """
    bard1_variants, brca1_variants = read_data(
        bard1_data,
        brca1_data,
        brca1_int_cutoff,
        brca1_path_cutoff,
        domain,
    )
    #Variant-level comparison; the returned table feeds the bar plot
    sensitivity_table = missense_sensitivity(bard1_variants, brca1_variants)
    #Residue-level comparison (printed only)
    missense_resi(bard1_variants, brca1_variants)
    bar_plot(sensitivity_table)

In [None]:
#Runs the full analysis with the settings defined in the configuration cell
main()