In [None]:
import pandas as pd
import altair as alt
import numpy as np
import statsmodels.api as sm
from scipy import stats
import re

In [None]:
# Input workbook locations consumed by main(). Relative paths resolve against the notebook's working directory.
# NOTE(review): `carriers` and `ambry` are absolute, machine-specific paths (/Users/ivan/Downloads);
# the notebook will not run elsewhere until they are moved under ../Data or made configurable.
sge = '../Data/20250825_BARD1snvscores_filtered.xlsx' # SGE variant scores, read by read_sge()
gnomad = '../Data/20240905_BARD1_gnomADv4.1.0_SNVs.xlsx' # gnomAD SNVs, the comparison set in the Ambry analysis
carriers = '/Users/ivan/Downloads/7_genes_for_Starita_02282025_hg38.xlsx' # CARRIERS case-control data (multi-gene; filtered to BARD1 on load)
ambry = '/Users/ivan/Downloads/Ambry_BARD1.xlsx' # Ambry variant set
bridges_all = '../Data/BRIDGES_data/20250815_BRIDGES_missense_all.xlsx' # BRIDGES, all samples, missense variants
bridges_population = '../Data/BRIDGES_data/20250815_BRIDGES_missense_population.xlsx' # BRIDGES, population samples, missense variants
bridges_all_ptv = '../Data/BRIDGES_data/20250815_BRIDGES_PTVs_all.xlsx' # BRIDGES, all samples, PTVs
bridges_pop_ptv = '../Data/BRIDGES_data/20250815_BRIDGES_PTVs_pop.xlsx' # BRIDGES, population samples, PTVs


In [None]:
def read_sge(sge):
    """Load the SGE score table and shorten its functional classes.

    Parameters
    ----------
    sge : str
        Path to the SGE Excel workbook. It must contain the columns
        'functional_consequence' and 'consequence'.

    Returns
    -------
    pandas.DataFrame
        The table without 'indeterminate' rows, with 'functional_consequence'
        renamed to 'Classification' and 'consequence' renamed to 'Consequence'.
        'functionally_normal' becomes 'F' and 'functionally_abnormal' becomes
        'NF'; any other value is left unchanged.
    """
    sge = pd.read_excel(sge)

    # Indeterminate variants are dropped here, so no relabel for them is needed below.
    sge = sge.loc[sge['functional_consequence'] != 'indeterminate']

    sge = sge.rename(columns={'functional_consequence': 'Classification',
                              'consequence': 'Consequence'})

    short_labels = {'functionally_normal': 'F', 'functionally_abnormal': 'NF'}
    sge['Classification'] = sge['Classification'].replace(short_labels)

    return sge

In [None]:
def read_ambry(ambry, gnomad):
    """Load the Ambry variant list and the gnomAD SNV table.

    Parameters
    ----------
    ambry : str
        Path to the Ambry workbook. Needs the columns 'Gene', 'c_variant',
        'p_variant' and 'Classification'.
    gnomad : str
        Path to the gnomAD workbook. Needs the columns 'Chromosome',
        'Position', 'Reference', 'Alternate', 'Allele Frequency' and
        'Protein Consequence'.

    Returns
    -------
    tuple of pandas.DataFrame
        (ambry, gnomad). The Ambry frame gains 'amino_acid_change' and loses
        rows where that value is missing (index reset). The gnomAD frame has
        'Position' as str and gains 'pos_id' ('<Position>:<Alternate>').
    """
    ambry = pd.read_excel(ambry)
    # .copy() so the column added below is written to an independent frame,
    # not to a slice of the raw table (avoids SettingWithCopyWarning).
    ambry = ambry[['Gene', 'c_variant', 'p_variant', 'Classification']].copy()
    # Drops the first two characters of p_variant (presumably the 'p.' prefix - confirm against the data).
    ambry['amino_acid_change'] = ambry['p_variant'].str[2:]
    ambry = ambry.dropna(subset=['amino_acid_change']).reset_index(drop=True)

    gnomad = pd.read_excel(gnomad)
    gnomad = gnomad[['Chromosome', 'Position', 'Reference', 'Alternate',
                     'Allele Frequency', 'Protein Consequence']].copy()
    gnomad['Position'] = gnomad['Position'].astype(str)
    gnomad['pos_id'] = gnomad['Position'] + ':' + gnomad['Alternate']

    return ambry, gnomad

In [None]:
def read_carriers_data(cc): #Reads all data
    """Load the CARRIERS case-control table and keep BARD1 single-nucleotide variants.

    Parameters
    ----------
    cc : str
        Path to the CARRIERS workbook.

    Returns
    -------
    tuple of pandas.DataFrame
        (cc, cc_er). `cc` holds every BARD1 SNV row with a 'pos_id' column
        ('<hg38_start>:<ALT>'). `cc_er` is the subset of `cc` whose
        'ER_status1' is 0 or 777.
    """
    cc = pd.read_excel(cc)
    cc = cc[cc['CAVA_GENE'] == 'BARD1']
    cc = cc[['Classification', '#CHROM', 'REF', 'ALT', 'CAVA_GENE', 'CAVA_CSN', 'CAVA_SO',
             'Sample_AAF', 'Sample_ID', 'CaseControl', 'ER_status1', 'hg38_start']]

    # An SNV has a one-base REF and a one-base ALT. Checking ALT alone would also
    # keep deletions written with a single-base ALT (e.g. REF='AT', ALT='A').
    is_snv = (cc['REF'].str.len() == 1) & (cc['ALT'].str.len() == 1)
    cc = cc[is_snv].copy()

    # NOTE(review): if hg38_start is read as float (e.g. the column has blanks), str() yields
    # '123.0' and the pos_id will not match the other tables - confirm it loads as integer.
    cc['hg38_start'] = cc['hg38_start'].astype(str)
    cc['pos_id'] = cc['hg38_start'] + ':' + cc['ALT']

    # NOTE(review): 0 is presumably ER-negative and 777 the code carried by controls -
    # confirm against the CARRIERS data dictionary.
    cc_er = cc[cc['ER_status1'].isin([0, 777])]

    return cc, cc_er

In [None]:
def read_bridges(all, population, all_ptv, pop_ptv):
    """Load the four BRIDGES workbooks and combine missense with PTV counts.

    Each workbook is read from its 'BARD1' sheet, restricted to rows where both
    'ref' and 'alt' are a single base, and reduced to 'Cases', 'Controls' and
    'pos_id' ('<hg38_pos>:<alt>').

    Parameters
    ----------
    all : str
        Path to the all-samples missense workbook. (The name shadows the builtin
        but is kept so existing callers are unaffected.)
    population : str
        Path to the population-samples missense workbook.
    all_ptv : str
        Path to the all-samples PTV workbook.
    pop_ptv : str
        Path to the population-samples PTV workbook.

    Returns
    -------
    tuple of pandas.DataFrame
        (bridges_all, bridges_pop): missense rows followed by PTV rows for the
        all-samples and population-samples sets respectively.
    """
    def _load_snv_counts(path):
        # One workbook -> per-variant case/control counts keyed by pos_id.
        raw = pd.read_excel(path, sheet_name='BARD1')
        is_snv = (raw['ref'].str.len() == 1) & (raw['alt'].str.len() == 1)
        # .copy() so pos_id is added to an independent frame rather than a slice.
        snvs = raw.loc[is_snv, ['Cases', 'Controls', 'alt', 'hg38_pos']].copy()
        snvs['pos_id'] = snvs['hg38_pos'].astype(str) + ':' + snvs['alt']
        return snvs[['Cases', 'Controls', 'pos_id']]

    all_missense = _load_snv_counts(all)
    pop_missense = _load_snv_counts(population)
    all_ptvs = _load_snv_counts(all_ptv)
    pop_ptvs = _load_snv_counts(pop_ptv)

    bridges_all = pd.concat([all_missense, all_ptvs])
    bridges_pop = pd.concat([pop_missense, pop_ptvs])

    return bridges_all, bridges_pop

In [None]:
def classify_vars(sge):
    """Split the SGE table into the variant subsets analysed downstream.

    Parameters
    ----------
    sge : pandas.DataFrame
        SGE table with 'amino_acid_change' and 'Consequence' columns. It is not
        modified; the work is done on a copy.

    Returns
    -------
    tuple
        (sge_dict, sge_keys). `sge_dict` maps, in this order, 'miss' (missense
        variants), 'all' (every variant), 'ring', 'ard', 'brct' (missense
        variants inside each domain) and 'structured' (missense variants inside
        any of the three domains) to DataFrames that carry an extra 'AApos'
        column. `sge_keys` is the list of those keys.
    """
    #under development - trying to figure out which parameters are best
    sge = sge.copy()  # keep the caller's frame free of the helper column
    # First run of digits in the amino-acid change, i.e. the residue number; NaN when none is found.
    sge['AApos'] = sge['amino_acid_change'].str.extract(r'([0-9]+)', expand=False).astype(float)

    sge_miss = sge[sge['Consequence'] == 'missense_variant']

    # Inclusive residue bounds of each structured domain.
    domain_bounds = {'ring': (26, 122), 'ard': (425, 545), 'brct': (568, 777)}
    in_domain = {name: sge_miss['AApos'].between(first, last)
                 for name, (first, last) in domain_bounds.items()}

    # Structured = inside any domain. Building it by inclusion keeps rows with an
    # unparseable (NaN) position out of this subset.
    in_any_domain = in_domain['ring'] | in_domain['ard'] | in_domain['brct']

    sge_dict = {'miss': sge_miss,
                'all': sge,
                'ring': sge_miss.loc[in_domain['ring']],
                'ard': sge_miss.loc[in_domain['ard']],
                'brct': sge_miss.loc[in_domain['brct']],
                'structured': sge_miss.loc[in_any_domain]}

    sge_keys = list(sge_dict.keys())

    return sge_dict, sge_keys

In [None]:
def analyze_ambry(sge, gnomad, ambry):
    """Odds ratio of non-functional vs functional SGE calls, Ambry against gnomAD.

    SGE variants are matched to gnomAD on 'pos_id' and to Ambry on
    'amino_acid_change'. The 2x2 table has rows (Ambry, gnomAD) and columns
    (NF, F).

    Parameters
    ----------
    sge : pandas.DataFrame
        SGE variants with 'pos_id', 'amino_acid_change' and 'Classification'
        ('F' / 'NF').
    gnomad : pandas.DataFrame
        gnomAD variants with 'pos_id'.
    ambry : pandas.DataFrame
        Ambry variants with 'amino_acid_change' and its own 'Classification'
        column (the SGE one therefore becomes 'Classification_x' after merging).

    Returns
    -------
    tuple
        (odds_ratio, [ci_lower, ci_upper]) from statsmodels' Table2x2.
    """
    def _nf_f_counts(labels):
        # A class that never occurs counts as 0 instead of raising KeyError.
        counts = labels.value_counts()
        return counts.get('NF', 0), counts.get('F', 0)

    sge_gnomad = pd.merge(sge, gnomad, on='pos_id', how='inner')
    gnomad_nf, gnomad_func = _nf_f_counts(sge_gnomad['Classification'])

    sge_ambry = pd.merge(sge, ambry, on='amino_acid_change', how='inner')
    ambry_nf, ambry_func = _nf_f_counts(sge_ambry['Classification_x'])

    contingency_table = np.array([[ambry_nf, ambry_func],
                                  [gnomad_nf, gnomad_func]])

    # With its default shift_zeros=True, Table2x2 applies a 0.5 adjustment when a
    # cell is zero, so empty cells still give a finite estimate.
    table = sm.stats.Table2x2(contingency_table)
    ci_lower, ci_upper = table.oddsratio_confint()

    return (table.oddsratio, [ci_lower, ci_upper])

In [None]:
def analyze_carriers(cc, sge):
    """Odds ratio of non-functional vs functional SGE calls across CARRIERS groups.

    Carrier rows are matched to SGE variants on 'pos_id' and cross-tabulated as
    CaseControl group (rows) by SGE class (columns, ordered NF then F).

    Parameters
    ----------
    cc : pandas.DataFrame
        Carrier rows with 'pos_id', 'CaseControl' and its own 'Classification'
        column (the SGE one therefore becomes 'Classification_y' after merging).
    sge : pandas.DataFrame
        SGE variants with 'pos_id' and 'Classification' ('F' / 'NF').

    Returns
    -------
    tuple
        (odds_ratio, [ci_lower, ci_upper]) from statsmodels' Table2x2.

    Notes
    -----
    Rows follow the sorted 'CaseControl' labels found in `cc`.
    NOTE(review): the direction of the odds ratio depends on that order - confirm
    the case label sorts before the control label.
    """
    merged = pd.merge(cc, sge, on='pos_id', how='inner')  # carriers of SGE-scored variants
    merged = merged.dropna(subset=['Classification_y'])  # drop rows without an SGE class

    contingency_tab = pd.crosstab(merged['CaseControl'], merged['Classification_y'])

    # Force the full layout: every group present in `cc` as a row and NF/F as
    # columns, with 0 where a combination never occurs. Reversing the columns
    # alone breaks as soon as one class or one group is missing after the merge.
    groups = np.sort(cc['CaseControl'].dropna().unique())
    contingency_tab = contingency_tab.reindex(index=groups, columns=['NF', 'F'], fill_value=0)
    table_array = contingency_tab.values

    # With its default shift_zeros=True, Table2x2 applies a 0.5 adjustment when a
    # cell is zero, so empty cells still give a finite estimate.
    table = sm.stats.Table2x2(table_array)
    ci_lower, ci_upper = table.oddsratio_confint()

    return (table.oddsratio, [ci_lower, ci_upper])

In [None]:
def analyze_bridges(sge, df):
    """Odds ratio of non-functional vs functional SGE calls, BRIDGES cases against controls.

    BRIDGES per-variant counts are matched to SGE variants on 'pos_id' and summed
    per SGE class. The 2x2 table has rows (Cases, Controls) and columns (NF, F).

    Parameters
    ----------
    sge : pandas.DataFrame
        SGE variants with 'pos_id' and 'Classification' ('F' / 'NF').
    df : pandas.DataFrame
        BRIDGES counts with 'pos_id', 'Cases' and 'Controls'.

    Returns
    -------
    tuple
        (odds_ratio, [ci_lower, ci_upper]) from statsmodels' Table2x2.
    """
    merged = pd.merge(df, sge, on='pos_id', how='inner')

    # Total carriers per SGE class. reindex guarantees both classes are present
    # (0 when a class has no matching variant) instead of raising KeyError later.
    summary = (merged.groupby('Classification')[['Cases', 'Controls']]
               .sum()
               .reindex(['NF', 'F'], fill_value=0))

    # Rows of `summary` are NF/F and columns Cases/Controls; transpose to get
    # [[cases NF, cases F], [controls NF, controls F]].
    table_array = summary[['Cases', 'Controls']].to_numpy().T

    # With its default shift_zeros=True, Table2x2 applies a 0.5 adjustment when a
    # cell is zero, so empty cells still give a finite estimate.
    table = sm.stats.Table2x2(table_array)
    ci_lower, ci_upper = table.oddsratio_confint()

    return (table.oddsratio, [ci_lower, ci_upper])

In [None]:
def process_data_dict(dict, key_dict):
    """Turn the per-analysis results into one tidy DataFrame for plotting.

    Parameters
    ----------
    dict : dict
        Maps an analysis key (e.g. 'cc_ring') to (odds_ratio, [ci_lower, ci_upper]).
        (The name shadows the builtin but is kept so existing callers are unaffected.)
    key_dict : dict
        Maps a variant-subset name ('miss', 'all', 'ring', 'ard', 'brct',
        'structured') to its list of analysis keys, ordered Ambry,
        CARRIERS (All), CARRIERS (ER-), BRIDGES (All), BRIDGES (Pop.).

    Returns
    -------
    pandas.DataFrame
        One row per entry of `dict`, in its iteration order, with columns
        'analysis', 'OR', 'lwr_ci', 'uppr_ci', 'dataset', 'incl_vars' and
        'full_type' ('<dataset> <incl_vars>').

    Raises
    ------
    KeyError
        If a key of `dict` is not listed anywhere in `key_dict`.
    """
    dataset_labels = ['Ambry', 'CARRIERS (All)', 'CARRIERS (ER-)', 'BRIDGES (All)', 'BRIDGES (Pop.)']
    subset_labels = {'miss': 'All Mis.', 'all': 'All Vars.', 'ring': 'RING Mis.',
                     'ard': 'ARD Mis.', 'brct': 'BRCT Mis.', 'structured': 'Structured Mis.'}

    # Resolve labels per analysis key rather than by row position, so the labels
    # stay correct when analyses are reordered or only some of them are run.
    labels_by_key = {}
    for subset, output_keys in key_dict.items():
        subset_label = subset_labels.get(subset, subset)  # unknown subsets keep their raw name
        for dataset_label, output_key in zip(dataset_labels, output_keys):
            labels_by_key[output_key] = (dataset_label, subset_label)

    records = []
    for key, (odds, ci_list) in dict.items():
        if key not in labels_by_key:
            raise KeyError(f'analysis key {key!r} is not listed in key_dict')
        dataset_label, subset_label = labels_by_key[key]
        records.append({'analysis': key, 'OR': odds,
                        'lwr_ci': ci_list[0], 'uppr_ci': ci_list[1],
                        'dataset': dataset_label, 'incl_vars': subset_label})

    # Explicit columns keep the frame well-formed even when there are no records.
    df = pd.DataFrame(records, columns=['analysis', 'OR', 'lwr_ci', 'uppr_ci', 'dataset', 'incl_vars'])

    df['full_type'] = df['dataset'] + ' ' + df['incl_vars']

    return df

In [None]:
def cc_fig(df):
    """Draw the odds-ratio forest plot and return the Altair chart.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per analysis with the columns 'full_type' (row label), 'OR',
        'lwr_ci' and 'uppr_ci'.

    Returns
    -------
    altair.LayerChart
        Points (odds ratios), error bars (confidence intervals) and a red
        reference rule at an odds ratio of 1. The chart is also displayed.
    """
    base = alt.Chart(df)

    # Row order on the y axis: variant subsets from top to bottom, and within
    # each subset the datasets in the order listed here.
    dataset_order = ['Ambry', 'BRIDGES (All)', 'BRIDGES (Pop.)', 'CARRIERS (All)', 'CARRIERS (ER-)']
    subset_order = ['All Vars.', 'All Mis.', 'RING Mis.', 'ARD Mis.', 'BRCT Mis.', 'Structured Mis.']
    sort_order = [f'{dataset} {subset}' for subset in subset_order for dataset in dataset_order]

    points = base.mark_point(
        filled = True,
        size = 50,
        color = 'black'
        ).encode(
        y = alt.Y('full_type:O',
                  scale = alt.Scale(domain = sort_order),
                  axis = alt.Axis(title = '',
                                  labelFontSize = 16,
                                  labelLimit = 1000
                                 )
                 ),
        x = alt.X('OR',
                  axis = alt.Axis(
                      title = 'Odds Ratio',
                      labelFontSize = 16,
                      titleFontSize = 18,
                      values = list(range(0, 9))
                      ),
                  scale = alt.Scale(domain = [0, 8])
                 ),
        tooltip = ['OR']
        )

    # The x channel is built with alt.X (not alt.Y), and y uses the same ordinal
    # type as the points layer so both layers agree on the shared axis.
    ci_bars = base.mark_errorbar().encode(
        y = 'full_type:O',
        x = alt.X('lwr_ci:Q', axis = alt.Axis(title = '')),
        x2 = 'uppr_ci:Q'
        )

    # Reference line at an odds ratio of 1.
    line = alt.Chart(pd.DataFrame({'Odds Ratio': [1]})).mark_rule(color = 'red').encode(
        x = 'Odds Ratio')

    plot = (points + ci_bars + line).configure_view(
        stroke = None
        ).configure_axis(
        grid = False
        ).interactive()

    plot.display()

    return plot
    

In [None]:
def main():
    """Run the whole analysis: load inputs, compute odds ratios per subset, plot.

    Reads the workbooks named by the module-level path variables, splits the SGE
    table into variant subsets, runs the Ambry, CARRIERS (all and ER-filtered)
    and BRIDGES (all and population) analyses on every subset, and draws the
    combined odds-ratio figure. Returns nothing.
    """
    scores = read_sge(sge)
    ambry_variants, gnomad_variants = read_ambry(ambry, gnomad)

    carriers_everyone, carriers_er_only = read_carriers_data(carriers)
    bridges_everyone, bridges_pop_only = read_bridges(
        bridges_all, bridges_population, bridges_all_ptv, bridges_pop_ptv)

    subsets, subset_names = classify_vars(scores)

    # Result keys per subset, ordered: Ambry, CARRIERS (all), CARRIERS (ER),
    # BRIDGES (all), BRIDGES (population).
    data_keys = {
        name: [f'ambry_{name}', f'cc_{name}', f'cc_{name}_er',
               f'bridges_all_{name}', f'bridges_pop_{name}']
        for name in ('miss', 'all', 'ring', 'ard', 'brct', 'structured')
    }

    data_dict = {}
    for name in subset_names:
        subset = subsets[name]
        key_ambry, key_cc, key_cc_er, key_bridges_all, key_bridges_pop = data_keys[name]

        data_dict[key_ambry] = analyze_ambry(subset, gnomad_variants, ambry_variants)
        data_dict[key_cc] = analyze_carriers(carriers_everyone, subset)
        data_dict[key_cc_er] = analyze_carriers(carriers_er_only, subset)
        data_dict[key_bridges_all] = analyze_bridges(subset, bridges_everyone)
        data_dict[key_bridges_pop] = analyze_bridges(subset, bridges_pop_only)

    final_df = process_data_dict(data_dict, data_keys)

    final_plot = cc_fig(final_df)
    #final_plot.save('/Users/ivan/Desktop/BARD1_draft_figs/fig_4d_ORplot.png', ppi = 500)


In [None]:
# Runs the full pipeline; main() reads the input workbooks and displays the odds-ratio figure.
main()