This notebook builds Extended Data Figs. 5b (predictor score vs. SGE) and 5c (odds ratio analysis of predictor scores). In addition to S5A and S5C, figures that were not published are also generated.

In [None]:
import pandas as pd
import altair as alt
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.contingency_tables import mcnemar
from scipy import stats

In [None]:
# Set to True to write the figures to disk at the end of main(); False only displays them.
save_figs = False

# SGE data file
sge_file = '../Data/final_tables/supplementary_file_1_BARD1_SGE_final_table.xlsx'

# External case-control data
carriers = '../Data/extra_data/case_control_data/CARRIERS_data/20250303_CARRIERS_data.xlsx' # CARRIERS data
bridges = '../Data/extra_data/case_control_data/BRIDGES_data/20250815_BRIDGES_missense_population.xlsx' # BRIDGES population missense variants

# Predictor score cutoffs providing moderate evidence toward benign / pathogenic
# variant classifications, stored as [benign cutoff, pathogenic cutoff].
# Pulled from Bergquist et al. 2025.
cutoffs = dict(
    MutPred2 = [0.197, 0.829],
    REVEL = [0.183, 0.773],
    AlphaMissense = [0.099, 0.906],
)

# Total individuals sequenced for the CARRIERS and BRIDGES studies.
# '*_all' counts every individual sequenced by the study; '*_pop' counts the
# subset found in patients without considering family history of breast cancer.
carriers_totals = dict(
    cases_all = 39553,     # Total number of cases sequenced
    controls_all = 35867,  # Total number of controls sequenced
    cases_pop = 32247,     # Number of cases used for population-based estimates
    controls_pop = 32544,  # Number of controls used for population-based estimates
    er_cases = 3805,       # Number of estrogen receptor (ER) negative cases used in population-based estimates
)

# Same annotations as above, but the numbers are pulled from the BRIDGES study.
bridges_totals = dict(
    cases_all = 60466,
    controls_all = 53461,
    cases_pop = 48826,
    controls_pop = 50703,
)

# Remove Altair's row limit on chart datasets so the full variant tables can be plotted
alt.data_transformers.disable_max_rows()

In [None]:
def get_thresholds(file): #Gets SGE score thresholds and scores
    """Load the SGE score table and the SGE classification thresholds.

    Parameters
    ----------
    file : path to the Excel workbook; its 'scores' and 'thresholds' sheets are read.

    Returns
    -------
    (sge_df, thresholds)
        sge_df: rows of the 'scores' sheet with var_type 'snv' and without a
        'WARN' variant_qc_flag, with predictor columns renamed for display.
        thresholds: [min, max] taken from the first row of the 'thresholds' sheet.
    """
    #Display names used for the predictor columns (nicer axis labels)
    display_names = {'am_score': 'AlphaMissense',
                     'revel_score': 'REVEL',
                     'cadd_score': 'CADD'
                    }

    all_scores = pd.read_excel(file, sheet_name = 'scores')
    threshold_df = pd.read_excel(file, sheet_name = 'thresholds')

    is_snv = all_scores['var_type'].isin(['snv'])
    has_warning = all_scores['variant_qc_flag'].isin(['WARN'])
    sge_df = all_scores.loc[is_snv & ~has_warning].rename(columns = display_names)

    lower_bound = threshold_df['min'][0]
    upper_bound = threshold_df['max'][0]

    return sge_df, [lower_bound, upper_bound]

In [None]:
def process_vep(df): #Processes and generates a summary dataframe for predictor scores
    """Summarise SGE and predictor scores per CDS position.

    Rows are kept only if they have an amino acid change other than '---',
    a MutPred2 score, and max_SpliceAI <= 0.2. The remaining rows are grouped
    by 'CDS_position'; each group reports the first residue label and the
    median of 'score', 'AlphaMissense', 'CADD', 'REVEL' and 'MutPred2'.

    The input frame is not modified.

    Parameters
    ----------
    df : DataFrame with columns 'amino_acid_change', 'CDS_position', 'score',
         'AlphaMissense', 'CADD', 'REVEL', 'MutPred2' and 'max_SpliceAI'.

    Returns
    -------
    DataFrame with one row per CDS position and columns 'CDS_position',
    'amino_acid', 'score', 'AlphaMissense', 'CADD', 'REVEL', 'MutPred2'.
    """
    keep = (
        ~df['amino_acid_change'].isin(['---'])
        & df['MutPred2'].notna()
        & (df['max_SpliceAI'] <= 0.2)
    )

    #Work on a copy so the caller's dataframe does not gain an 'amino_acid' column
    filtered = df.loc[keep].copy()

    #Residue label = amino acid change minus its last character.
    #Derived after filtering, and non-string values are passed through, so a
    #missing amino_acid_change cannot raise here.
    filtered['amino_acid'] = filtered['amino_acid_change'].map(
        lambda change: change[:-1] if isinstance(change, str) else change
    )

    summary_df = filtered.groupby('CDS_position').agg({
        'amino_acid': 'first',
        'score': 'median',
        'AlphaMissense': 'median',
        'CADD': 'median',
        'REVEL': 'median',
        'MutPred2': 'median'
    }).reset_index()

    return summary_df

In [None]:
def vep_v_sge(df, thresholds): #Creates scatter plots of a predictor score vs. SGE
    """Build a 2x2 grid of scatter plots: SGE functional score (x) vs. predictor score (y).

    One panel is drawn for each of AlphaMissense, REVEL, CADD and MutPred2.
    Every panel carries a red vertical rule at thresholds[0] and a blue one at
    thresholds[1]. For AlphaMissense, REVEL and MutPred2 only variants with
    max_SpliceAI <= 0.2 are plotted, and a translucent red rectangle spans
    x from thresholds[1] to 0.1 and y from the predictor's upper cutoff to 1.

    Parameters
    ----------
    df : SGE dataframe containing 'score', 'consequence', 'pos_id',
         'amino_acid_change', 'max_SpliceAI' and the predictor columns.
    thresholds : two SGE score thresholds, [lower, upper].

    Returns
    -------
    The composed Altair chart (also displayed as a side effect).
    """
    #Consequence -> colour. Dict order sets the order of the colour scale domain.
    consequence_colors = {
        'synonymous_variant': '#006616', # dark green
        'missense_variant': '#81B4C7', # dusty blue
        'stop_gained': '#ffcd3a', # yellow
        'intron_variant': '#6AA84F', # med green
        'UTR_variant': '#93C47D', # light green
        'stop_lost': '#888888', # med gray
        'start_lost': '#000000', # black
        'splice_site_variant': '#1170AA', # darker blue
        'splicing_variant': '#CFCFCF', # light gray
    }
    font_sizes = {'titleFontSize': 18, 'labelFontSize': 16}
    cutoff_predictors = ('AlphaMissense', 'REVEL', 'MutPred2')

    def threshold_rule(position, rule_color): #Vertical line at an SGE score threshold
        rule_data = pd.DataFrame({'score': [position]})
        return alt.Chart(rule_data).mark_rule(color = rule_color).encode(x = 'score')

    panels = []
    for vep in ('AlphaMissense', 'REVEL', 'CADD', 'MutPred2'):
        has_cutoffs = vep in cutoff_predictors

        if has_cutoffs:
            plot_df = df.loc[df['max_SpliceAI'] <= 0.2]
            path_thresh = cutoffs[vep][1]
        else:
            plot_df = df.copy()

        x_encoding = alt.X('score',
                           axis = alt.Axis(title = 'Functional Score', **font_sizes)
                          )
        y_encoding = alt.Y(f'{vep}:Q',
                           title = vep,
                           axis = alt.Axis(**font_sizes)
                          )
        color_encoding = alt.Color('consequence',
                                   scale = alt.Scale(
                                       range = list(consequence_colors.values()),
                                       domain = list(consequence_colors.keys())
                                   ),
                                   legend = alt.Legend(
                                       title = 'Consequence',
                                       symbolOpacity = 1,
                                       **font_sizes
                                   )
                                  )
        hover = [alt.Tooltip('pos_id', title = 'Position ID: '),
                 alt.Tooltip('amino_acid_change', title = 'AA Sub: ')
                ]

        points = alt.Chart(plot_df).mark_circle(opacity = 1).encode(
            x = x_encoding,
            y = y_encoding,
            color = color_encoding,
            tooltip = hover
        ).interactive()

        panel = points + threshold_rule(thresholds[0], 'red') + threshold_rule(thresholds[1], 'blue')

        if has_cutoffs:
            #Shaded box drawn underneath the points
            box = pd.DataFrame({
                'x_start': [thresholds[1]],
                'x_end': [0.1],
                vep: [path_thresh],
                'y_end': [1],
                'rect_color': ['red']
            })

            box_layer = alt.Chart(box).mark_rect(opacity=0.2).encode(
                x='x_start:Q',
                x2='x_end:Q',
                y=alt.Y(f"{vep}:Q"),
                y2='y_end:Q',
                color=alt.Color('rect_color:N', scale = None)  # Use literal colors
            )

            panel = alt.layer(box_layer, panel).resolve_scale(y = 'shared')

        panels.append(panel)

    upper_row = panels[0] | panels[1]
    lower_row = panels[2] | panels[3]

    full_fig = (upper_row & lower_row).configure_axis(
        grid = False
    ).configure_view(
        stroke = None
    )

    full_fig.display()
    return full_fig

In [None]:
def quant_overpathogenicity(df):
    """Print how often each missense predictor over-calls pathogenicity relative to SGE.

    For AlphaMissense, REVEL and MutPred2 this prints the number, and the
    percentage of all variants scored by that predictor, of variants whose
    predictor score is at or above the predictor's upper cutoff while SGE
    classifies them as 'functionally_normal'.

    Parameters
    ----------
    df : SGE dataframe with 'functional_consequence' and the predictor columns.
         Predictor values that are missing or equal to '-' are excluded.

    Returns
    -------
    None; results are printed.
    """
    for vep in ('AlphaMissense', 'REVEL', 'MutPred2'):
        moderate_plus_path = cutoffs[vep][1] #Upper (pathogenic) cutoff

        vep_df = df.dropna(subset = [vep])
        vep_df = vep_df.loc[~vep_df[vep].isin(['-'])]
        vep_scores = vep_df[vep].astype(float)

        #VEP says abnormal, SGE says normal
        c = ((vep_scores >= moderate_plus_path) &
             (vep_df['functional_consequence'] == 'functionally_normal')).sum()

        n_scored = len(vep_df)
        #With no scored variants the percentage is undefined; report nan without a 0/0 warning
        pct = (c/n_scored) * 100 if n_scored else float('nan')

        print(vep)
        print(f' VEP #Vars. with moderate+ path but normal in SGE: {c}')
        print(f' VEP %Vars. with moderate+ path but normal in SGE: {pct}')

In [None]:
def scores_across_gene(df): #Builds plot of predictor score across BARD1
    """Stack five scatter tracks of score vs. CDS position.

    Tracks, top to bottom: SGE (the 'score' column), AlphaMissense, REVEL,
    CADD and MutPred2. Points are coloured by consequence. The AlphaMissense,
    REVEL and MutPred2 tracks only include variants with max_SpliceAI <= 0.2.

    Parameters
    ----------
    df : SGE dataframe with 'score', 'CDS_position', 'consequence',
         'amino_acid_change', 'max_SpliceAI' and the predictor columns.

    Returns
    -------
    The vertically concatenated Altair chart (also displayed as a side effect).
    """
    #Consequence -> colour. Dict order sets the order of the colour scale domain.
    consequence_colors = {
        'synonymous_variant': '#006616', # dark green
        'missense_variant': '#81B4C7', # dusty blue
        'stop_gained': '#ffcd3a', # yellow
        'intron_variant': '#6AA84F', # med green
        'UTR_variant': '#93C47D', # light green
        'stop_lost': '#888888', # med gray
        'start_lost': '#000000', # black
        'splice_site_variant': '#1170AA', # darker blue
        'splicing_variant': '#CFCFCF', # light gray
    }
    splice_filtered = ('AlphaMissense', 'REVEL', 'MutPred2')

    df = df.rename(columns = {'score': 'SGE'})

    tracks = []
    for track_name in ('SGE', 'AlphaMissense', 'REVEL', 'CADD', 'MutPred2'):
        if track_name in splice_filtered:
            track_df = df.loc[df['max_SpliceAI'] <= 0.2]
        else:
            track_df = df.copy()

        position_axis = alt.X('CDS_position:Q',
                              title = 'CDS Position'
                             )
        consequence_color = alt.Color('consequence:N',
                                      scale = alt.Scale(
                                          range = list(consequence_colors.values()),
                                          domain = list(consequence_colors.keys())
                                      )
                                     )

        track = alt.Chart(track_df).mark_circle().encode(
            x = position_axis,
            y = f'{track_name}:Q',
            color = consequence_color,
            tooltip = [alt.Tooltip('amino_acid_change', title = 'AA Change: ')]
        ).properties(
            width = 1750,
            height = 200
        )

        tracks.append(track)

    #Stack the tracks top to bottom in the order they were built
    stacked = tracks[0]
    for track in tracks[1:]:
        stacked = stacked & track

    full_fig = stacked.configure_axis(
        grid = False
    ).configure_view(
        stroke = None
    )

    full_fig.display()

    return full_fig

In [None]:
def concordance(df): #Builds plot that displays points where predictor classifications (based on Bergquist thresholds) disagree with SGE
    """Plot, per predictor, the variants whose predictor class disagrees with SGE.

    Variants are kept if they have an amino acid change other than '---', a
    determinate SGE 'functional_consequence', a MutPred2 score and
    max_SpliceAI <= 0.2. For AlphaMissense, MutPred2 and REVEL each variant is
    classed with the predictor's cutoffs (<= lower: 'functionally_normal',
    >= upper: 'functionally_abnormal', otherwise 'indeterminate') and compared
    with SGE:

    - 'pred_concordant'    : same class as SGE
    - 'pred_overpath'      : predictor abnormal, SGE normal
    - 'pred_overbenign'    : predictor normal, SGE abnormal
    - 'pred_indeterminate' : predictor indeterminate

    Each predictor's chart shows that predictor's own discordant variants
    ('pred_overpath' / 'pred_overbenign'). Previously the dataframe was
    overwritten with the discordant subset inside the loop, so the second and
    third charts only showed variants that were also discordant for the
    predictors plotted before them.

    Parameters
    ----------
    df : SGE dataframe with 'amino_acid_change', 'functional_consequence',
         'max_SpliceAI' and the predictor columns. Not modified.

    Returns
    -------
    DataFrame of the variants that are discordant for all three predictors,
    with '<predictor>_consequence' and '<predictor>_test' columns added
    (same rows and columns as this function has always returned).
    """
    keep = (
        ~df['amino_acid_change'].isin(['---'])
        & ~df['functional_consequence'].isin(['indeterminate'])
        & df['MutPred2'].notna()
        & (df['max_SpliceAI'] <= 0.2)
    )

    #Copy so the new columns are written to an independent frame, not a slice of the caller's
    annotated = df.loc[keep].copy()

    #Residue label = change minus last character; position = label minus first character
    annotated['amino_acid'] = annotated['amino_acid_change'].transform(lambda x: x[:-1])
    annotated['amino_acid_position'] = annotated['amino_acid'].transform(lambda x: int(x[1::]))

    predictors = ['AlphaMissense', 'MutPred2', 'REVEL']

    plots = []
    discordant_masks = []
    for predictor in predictors:
        benign_thresh, path_thresh = cutoffs[predictor][0], cutoffs[predictor][1]

        consequence_column = f'{predictor}_consequence'
        test_column = f'{predictor}_test'

        annotated[consequence_column] = 'indeterminate'
        annotated.loc[annotated[predictor] <= benign_thresh, consequence_column] = 'functionally_normal'
        annotated.loc[annotated[predictor] >= path_thresh, consequence_column] = 'functionally_abnormal'

        predicted = annotated[consequence_column]
        observed = annotated['functional_consequence']

        annotated[test_column] = 'pred_indeterminate'
        annotated.loc[observed == predicted, test_column] = 'pred_concordant'
        annotated.loc[(predicted == 'functionally_abnormal') & (observed == 'functionally_normal'), test_column] = 'pred_overpath'
        annotated.loc[(predicted == 'functionally_normal') & (observed == 'functionally_abnormal'), test_column] = 'pred_overbenign'

        #Select this predictor's discordant variants WITHOUT shrinking the frame
        #that the next predictor is evaluated on
        is_discordant = ~annotated[test_column].isin(['pred_concordant', 'pred_indeterminate'])
        discordant_masks.append(is_discordant)

        chart = alt.Chart(annotated.loc[is_discordant]).mark_circle().encode(
            x = 'amino_acid_position:Q',
            y = predictor,
            color = test_column
        ).properties(
            width = 1750,
            height = 200
        )

        plots.append(chart)

    final_plot = (plots[0] & plots[1] & plots[2]).resolve_scale(
        x = 'shared'
    )

    final_plot.display()

    #Return value kept as before: variants every predictor gets wrong
    discordant_everywhere = discordant_masks[0] & discordant_masks[1] & discordant_masks[2]

    return annotated.loc[discordant_everywhere]

In [None]:
def min_heatmap(df, threshold): #Creates mini-heatmap across BARD1 of missense predictors and SGE
    """Draw a per-position heatmap of median predictor scores and median SGE score.

    Rows are AlphaMissense, REVEL, MutPred2 and SGE; the x axis is CDS
    position. Predictor rows use a 0-1 colour scale; the SGE row uses an
    independent, reversed scale clamped to [-0.2, 0].

    Parameters
    ----------
    df : per-position summary with 'CDS_position', 'amino_acid', 'score',
         'AlphaMissense', 'REVEL', 'MutPred2' and 'CADD' (CADD is dropped).
    threshold : unused; kept so existing callers do not break.

    Returns
    -------
    The layered Altair chart (also displayed as a side effect).
    """
    df = df.drop(columns = ['CADD'])

    id_columns = ['CDS_position', 'amino_acid']

    #Long format: one row per (position, score set)
    vep_df = pd.melt(df, id_vars = id_columns, value_vars = ['AlphaMissense', 'REVEL', 'MutPred2'])
    sge_df = pd.melt(df, id_vars = id_columns, value_vars = ['score'])
    sge_df['variable'] = 'SGE'

    def position_axis(): #Shared x encoding: one bin per CDS position, ticks every 100
        return alt.X('CDS_position:Q',
                     title = 'CDS Position',
                     bin = alt.Bin(maxbins = 2335, minstep = 1),
                     axis = alt.Axis(labelFontSize = 16,
                                     values = list(range(0, 2335, 100))
                                    )
                    )

    def row_axis(): #Shared y encoding: one row per score set
        return alt.Y('variable:O',
                     axis = alt.Axis(labelFontSize = 16),
                     title = ''
                    )

    def hover(): #Shared tooltips
        return [alt.Tooltip('CDS_position', title = 'CDS Pos: '),
                alt.Tooltip('amino_acid', title = 'Amino Acid: '),
                #Values are per-position medians, so label them as such (was 'Min. Score')
                alt.Tooltip('value', title = 'Median Score')
               ]

    vep_map = alt.Chart(vep_df).mark_rect().encode(
        x = position_axis(),
        y = row_axis(),
        color = alt.Color('value:Q',
                          scale = alt.Scale(
                              domain = [0,1],
                              reverse = False,
                              scheme = 'bluepurple'
                          ),
                          legend = alt.Legend(
                              title = 'Median VEP Score',
                              titleFontSize = 18,
                              labelFontSize = 16,
                              gradientLength = 100
                          )
                         ),
        tooltip = hover()
    ).properties(
        width = 1750,
        height = 300
    )

    sge_map = alt.Chart(sge_df).mark_rect().encode(
        x = position_axis(),
        y = row_axis(),
        color = alt.Color('value:Q',
                          scale = alt.Scale(
                              domain = [-0.2, 0],
                              clamp = True,
                              reverse = True,
                              scheme = 'bluepurple'
                          ),
                          legend = alt.Legend(
                              title = 'Median Functional Score',
                              titleFontSize = 18,
                              labelFontSize = 16,
                              gradientLength = 100
                          )
                         ),
        tooltip = hover()
    ).properties(
        width = 1750,
        height = 300
    )

    #Independent colour scales so predictor and SGE rows each keep their own legend
    final_map = (alt.layer(vep_map, sge_map)).resolve_scale(
        color = 'independent'
    ).properties(
        title = 'Median Predictor Score vs. Median Missense Functional Score'
    )

    final_map.display()

    return final_map

In [None]:
def odds_testing(row): #Custom function to do odds ratio testing
    """Compute odds ratio statistics for one row of case/control counts.

    Two 2x2 tables are evaluated, one for functionally abnormal ('nf') variant
    counts and one for functionally normal ('f') variant counts. Each table is
    [[case count, case_total], [control count, control_total]].

    NOTE(review): the second column holds the total individuals sequenced, not
    total minus carriers. Confirm this is the intended table layout.

    Parameters
    ----------
    row : mapping with 'case_nf', 'control_nf', 'case_f', 'control_f',
          'case_total' and 'control_total'.

    Returns
    -------
    Tuple (nf_or, nf_lwr_ci, nf_upper_ci, nf_p_val, f_or, f_lwr_ci, f_upper_ci, f_p_val):
    odds ratio and confidence bounds from statsmodels Table2x2, p-value from
    Fisher's exact test.
    """
    #Total individuals sequenced
    n_cases = row['case_total']
    n_controls = row['control_total']

    def table_stats(case_count, control_count): #Stats for a single 2x2 table
        counts = np.array([[case_count, n_cases],
                           [control_count, n_controls]])

        _, p_value = stats.fisher_exact(counts) #Only the p-value of Fisher's exact test is used
        table = sm.stats.Table2x2(counts) #Provides the odds ratio and its confidence interval
        lower_ci, upper_ci = table.oddsratio_confint()

        return table.oddsratio, lower_ci, upper_ci, p_value

    nf_stats = table_stats(row['case_nf'], row['control_nf']) #LoF variants
    f_stats = table_stats(row['case_f'], row['control_f']) #Functionally normal variants

    return (*nf_stats, *f_stats)

In [None]:
def case_control(df): #Wrapper function for case-control analysis
    """Run the CARRIERS + BRIDGES case-control analysis for one classified score set.

    The population-based BARD1 single-nucleotide variants of both studies are
    matched to `df` on 'pos_id'. Carriers of 'NF' and 'F' variants are counted
    in cases and controls, the two cohorts are summed, and odds ratios are
    computed with odds_testing.

    A class ('NF' or 'F') or a group (cases or controls) with no matched
    variants is counted as 0. Previously only a missing 'F' class was handled
    and anything else raised KeyError.

    Parameters
    ----------
    df : DataFrame with a 'pos_id' column ('<hg38 position>:<alt allele>') and
         a 'Classification' column holding 'NF', 'F' or 'Indeterminate'.

    Returns
    -------
    DataFrame with one row per (region, dataset) and columns 'region',
    'dataset', 'case_nf', 'control_nf', 'case_f', 'control_f', 'case_total',
    'control_total', 'cohort', 'full_data_id', 'nf_or', 'nf_lwr_ci',
    'nf_upper_ci', 'nf_p', 'f_or', 'f_lwr_ci', 'f_upper_ci', 'f_p' and
    'significant' ('TRUE' / 'FALSE' strings).
    """
    #Score sets keyed by analysis region; only the missense set is analysed
    sge_data = {'mis': df}

    def read_carriers_data(path): #Reads CARRIERS data
        raw = pd.read_excel(path) #Reads case-control data
        bard1 = raw[raw['CAVA_GENE'].isin(['BARD1'])] #Filters only for BARD1
        pop = bard1[bard1['CARRIERS_PROJECT'].isin(['population-based'])] #Population-based variants only

        pop = pop[['Classification', '#CHROM', 'REF', 'ALT', 'CAVA_GENE', 'CAVA_CSN', 'CAVA_SO', 'Sample_AAF', 'Sample_ID', 'CaseControl','ER_status1', 'hg38_start']].copy() #Keeps necessary columns
        pop = pop[pop['ALT'].str.len() == 1].copy() #Single-base alternate alleles only
        pop['hg38_start'] = pop['hg38_start'].astype(str) #Sets hg38 coordinates as str data type
        pop['pos_id'] = pop['hg38_start'] + ':' + pop['ALT'] #Creates position ID for merging with SGE data

        return {'pop': pop}

    def read_bridges(path): #Read BRIDGES Data
        pop = pd.read_excel(path, sheet_name = 'BARD1') #Pulls BARD1 sheet
        pop = pop[['Cases', 'Controls', 'chr', 'ref', 'alt', 'hg38_pos']]
        pop = pop.loc[(pop['ref'].str.len() == 1) & (pop['alt'].str.len() == 1)].copy() #Single-base substitutions only
        pop['pos_id'] = pop['hg38_pos'].astype(str) + ':' + pop['alt'] #Creates position ID for merging with SGE data

        return {'pop': pop[['Cases', 'Controls', 'pos_id']]}

    def label_cohort(counts_df, cohort): #Adds cohort identifiers to a counts dataframe
        counts_df['cohort'] = cohort #Sets cohort identifier
        counts_df['full_data_id'] = counts_df['cohort'] + '_' + counts_df['dataset'] #Links cohort and specific dataset used
        return counts_df

    def count_carriers(carriers_data, sge_data): #Gets counts of functionally abnormal and functionally normal variants in cases and controls
        records = []
        for region, sge_df in sge_data.items(): #Iterates through each SGE dataset
            for dataset, carriers_df in carriers_data.items(): #Iterates through each CARRIERS dataset
                #Both frames carry a 'Classification' column, so the score set's one is suffixed '_y'
                merged = pd.merge(carriers_df, sge_df, on = 'pos_id', how = 'inner')
                merged = merged.dropna(subset = ['Classification_y']) #Drops any rows without a classification

                #One row per carrier record: tabulate group x class
                counts = pd.crosstab(merged['CaseControl'], merged['Classification_y'])
                #Guarantee every needed cell exists; absent groups/classes count as 0
                counts = counts.reindex(index = ['Case', 'Control'], columns = ['NF', 'F'], fill_value = 0)

                records.append({'region': region,
                                'dataset': dataset,
                                'case_nf': counts.loc['Case', 'NF'],
                                'control_nf': counts.loc['Control', 'NF'],
                                'case_f': counts.loc['Case', 'F'],
                                'control_f': counts.loc['Control', 'F'],
                                'case_total': carriers_totals[f'cases_{dataset}'], #Total individuals sequenced
                                'control_total': carriers_totals[f'controls_{dataset}']
                               })

        return label_cohort(pd.DataFrame(records), 'carriers')

    def count_bridges(bridges_data, sge_data): #Analogous to count_carriers but for BRIDGES datasets
        records = []
        for region, sge_df in sge_data.items():
            for dataset, bridges_df in bridges_data.items():
                merged = pd.merge(bridges_df, sge_df, on = 'pos_id', how = 'inner')

                #BRIDGES stores per-variant carrier counts, so sum them per class
                counts = merged.groupby('Classification')[['Cases', 'Controls']].sum()
                #Guarantee every needed cell exists; absent classes count as 0
                counts = counts.reindex(index = ['NF', 'F'], columns = ['Cases', 'Controls'], fill_value = 0)

                records.append({'region': region,
                                'dataset': dataset,
                                'case_nf': counts.loc['NF', 'Cases'],
                                'control_nf': counts.loc['NF', 'Controls'],
                                'case_f': counts.loc['F', 'Cases'],
                                'control_f': counts.loc['F', 'Controls'],
                                'case_total': bridges_totals[f'cases_{dataset}'],
                                'control_total': bridges_totals[f'controls_{dataset}']
                               })

        return label_cohort(pd.DataFrame(records), 'bridges')

    carriers_counted = count_carriers(read_carriers_data(carriers), sge_data)
    bridges_counted = count_bridges(read_bridges(bridges), sge_data)

    counted = pd.concat([carriers_counted, bridges_counted]).reset_index(drop = True) #Counted data frames concatenated

    #Combines CARRIERS and BRIDGES datasets by summing counts and totals
    final_df = counted.groupby(['region', 'dataset']).agg({
        'case_nf': 'sum',
        'control_nf': 'sum',
        'case_f': 'sum',
        'control_f': 'sum',
        'case_total': 'sum',
        'control_total': 'sum',
        'cohort': lambda x: '+'.join(x),
        'full_data_id': lambda x: '+'.join(x)
    }).reset_index()

    final_df[['nf_or', 'nf_lwr_ci', 'nf_upper_ci', 'nf_p', 'f_or', 'f_lwr_ci', 'f_upper_ci', 'f_p']] = final_df.apply(odds_testing, axis = 1, result_type = 'expand') #Odds ratios calculated

    #Significant when the LoF odds ratio has p < 0.05 and a lower CI above 1,
    #while the lower CI for functionally normal variants is below 1
    final_df['significant'] = 'FALSE'
    is_significant = (final_df['nf_p'] < 0.05) & (final_df['nf_lwr_ci'] > 1) & (final_df['f_lwr_ci'] < 1)
    final_df.loc[is_significant, 'significant'] = 'TRUE'

    return final_df

In [None]:
def or_plot(df): #Builds odds ratio plot
    """Draw a forest-style plot of LoF odds ratios, one row per score set.

    Each row shows the odds ratio ('nf_or') as a black point with an error bar
    from 'nf_lwr_ci' to 'nf_upper_ci'. A red vertical rule marks an odds ratio of 1.

    Parameters
    ----------
    df : DataFrame with 'score_set', 'nf_or', 'nf_lwr_ci' and 'nf_upper_ci'.

    Returns
    -------
    The layered Altair chart (also displayed as a side effect).
    """
    base = alt.Chart(df)

    #Point estimate
    or_x = alt.X('nf_or:Q',
                 title = 'Odds Ratio'
                )
    or_y = alt.Y('score_set:N',
                 title = ''
                )
    point_layer = base.mark_point(filled = True, size = 50, color = 'black').encode(x = or_x, y = or_y)

    #Confidence interval
    ci_x = alt.X('nf_lwr_ci:Q',
                 title = ''
                )
    interval_layer = base.mark_errorbar().encode(y = 'score_set', x = ci_x, x2 = 'nf_upper_ci:Q')

    #Reference line at OR = 1
    reference = pd.DataFrame({'nf_or': [1]})
    reference_layer = alt.Chart(reference).mark_rule(color = 'red').encode(x = 'nf_or')

    layered = alt.layer(point_layer, interval_layer, reference_layer)
    plot = layered.configure_axis(grid = False).configure_view(stroke = None)

    plot.display()

    return plot

In [None]:
def main(out_dir = '/Users/ivan/Desktop/BARD1_draft_figs/supp_figs'):
    """Build every figure in this notebook and run the odds ratio analysis.

    Steps: load SGE scores and thresholds, draw the predictor-vs-SGE scatter
    plots, print the over-pathogenicity summary, draw the across-gene tracks,
    the concordance plot and the heatmap, then classify variants under each
    score set (MutPred2, REVEL, AlphaMissense, SGE) and plot the resulting
    case-control odds ratios.

    Parameters
    ----------
    out_dir : directory the SVG figures are written to when the module-level
              `save_figs` flag is True. Defaults to the path that was
              previously hard-coded.

    Side effects
    ------------
    Adds an 'SGE' entry (the SGE thresholds) to the module-level `cutoffs` dict.
    """
    sge_df, thresholds = get_thresholds(sge_file)

    agg_df = process_vep(sge_df)

    scatter_plot = vep_v_sge(sge_df, thresholds)
    quant_overpathogenicity(sge_df)
    across_gene = scores_across_gene(sge_df)

    concordance(sge_df)
    heatmap = min_heatmap(agg_df, thresholds)

    #Registers SGE as a score set so it is analysed alongside the predictors
    cutoffs['SGE'] = thresholds

    #REVEL and MutPred2 have a '<name>_train' column; only rows where it is 0.0 are kept
    predictors_with_training_flag = ('REVEL', 'MutPred2')

    odds_dfs = []
    print('Starting Odds Analysis...')
    for key in list(cutoffs.keys()):
        print(f' Processing {key} ...')
        if key == 'SGE':
            #Built from a separate frame so sge_df stays untouched and the
            #result does not depend on where 'SGE' falls in the iteration order
            missense = sge_df.rename(columns = {'score': 'SGE'})
            missense = missense.loc[missense['consequence'].isin(['missense_variant'])]

            df = missense[['pos_id', key, 'functional_consequence']].copy()

            #SGE classes come straight from the functional consequence calls
            df['Classification'] = 'Indeterminate'
            df.loc[df['functional_consequence'] == 'functionally_abnormal', 'Classification'] = 'NF'
            df.loc[df['functional_consequence'] == 'functionally_normal', 'Classification'] = 'F'

            df = df[['pos_id', key, 'Classification']]
        else:
            has_training_flag = key in predictors_with_training_flag
            columns = ['pos_id', key]
            if has_training_flag:
                training_col = f'{key}_train'
                columns.append(training_col)

            df = sge_df[columns].copy()
            df = df.dropna(subset = [key])
            df = df.loc[~df[key].isin(['-'])] #'-' marks a missing predictor score
            if has_training_flag:
                df = df.loc[df[training_col].isin([0.0])]

            df = df.copy() #Independent frame before adding/overwriting columns
            df[key] = df[key].astype(float)

            #Predictor classes come from the [lower, upper] cutoffs
            lwr_thresh, upper_thresh = cutoffs[key]
            df['Classification'] = 'Indeterminate'
            df.loc[df[key] <= lwr_thresh, 'Classification'] = 'F'
            df.loc[df[key] >= upper_thresh, 'Classification'] = 'NF'

        odds_df = case_control(df)
        odds_df['score_set'] = key
        odds_dfs.append(odds_df)

    final_df = pd.concat(odds_dfs).reset_index(drop = True)
    print(final_df)
    odds_plot = or_plot(final_df)

    if save_figs:
        heatmap.save(f'{out_dir}/suppfig_vep_heatmap.svg')
        scatter_plot.save(f'{out_dir}/suppfig_VEPvsSGE.svg')
        across_gene.save(f'{out_dir}/suppfig_VEPs_AcrossBARD1.svg')
        odds_plot.save(f'{out_dir}/suppfig_VEPs_ORs.svg')
    

In [None]:
# Run the full analysis: builds and displays all figures and prints the odds ratio table
main()