In [None]:
import pandas as pd
import altair as alt
import numpy as np
import statsmodels.api as sm
from scipy import stats

In [None]:
# Input workbook locations. main() passes these to the read_* loader functions.
# NOTE(review): `carriers` and `ambry` are absolute paths into one user's
# Downloads folder, so the notebook only runs as-is on that machine.
sge = '../Data/20250813_BARD1scores_final_FILTERED.xlsx'
gnomad = '../Data/20240905_BARD1_gnomADv4.1.0_SNVs.xlsx' #gnomAD SNVs, loaded together with the Ambry table by read_ambry
carriers = '/Users/ivan/Downloads/7_genes_for_Starita_02282025_hg38.xlsx'
ambry = '/Users/ivan/Downloads/Ambry_BARD1.xlsx'
bridges_all = '../Data/BRIDGES_data/20250815_BRIDGES_missense_all.xlsx'
bridges_population = '../Data/BRIDGES_data/20250815_BRIDGES_missense_population.xlsx'

# Three-letter -> one-letter amino-acid codes ('Ter' maps to '*').
# NOTE(review): no function in this section of the notebook references aa_dict.
aa_dict = {
    'Ala': 'A',
    'Cys': 'C',
    'Asp': 'D',
    'Glu': 'E',
    'Phe': 'F',
    'Gly': 'G',
    'His': 'H',
    'Ile': 'I',
    'Lys': 'K',
    'Leu': 'L',
    'Met': 'M',
    'Asn': 'N',
    'Pro': 'P',
    'Gln': 'Q',
    'Arg': 'R',
    'Ser': 'S',
    'Thr': 'T',
    'Val': 'V',
    'Trp': 'W',
    'Tyr': 'Y',
    'Ter': '*'   # Termination codon
}

In [None]:
def read_sge(sge):
    """Load the SGE score workbook and recode its functional classes.

    Parameters
    ----------
    sge : str or path-like
        Location of the Excel workbook holding the SGE scores. It must contain
        the columns 'functional_consequence' and 'consequence'.

    Returns
    -------
    pandas.DataFrame
        The workbook rows whose 'functional_consequence' is not
        'indeterminate', with 'functional_consequence' renamed to
        'Classification' and 'consequence' renamed to 'Consequence'.
        'functionally_normal' is recoded to 'F' and 'functionally_abnormal'
        to 'NF'; any other value is left unchanged.
    """
    scores = pd.read_excel(sge) #Reads SGE data

    # Indeterminate calls are excluded here, so no 'indeterminate' label can
    # reach the recoding step below (the former 'I' relabel was unreachable).
    scores = scores.loc[scores['functional_consequence'] != 'indeterminate']

    scores = scores.rename(columns = {'functional_consequence': 'Classification',
                                      'consequence': 'Consequence'})

    short_labels = {'functionally_normal': 'F',
                    'functionally_abnormal': 'NF'}
    scores['Classification'] = scores['Classification'].replace(short_labels)

    return scores

In [None]:
def read_ambry(ambry, gnomad):
    """Load the Ambry variant table and the gnomAD SNV table.

    Parameters
    ----------
    ambry : str or path-like
        Excel workbook with at least the columns 'Gene', 'c_variant',
        'p_variant' and 'Classification'.
    gnomad : str or path-like
        Excel workbook with at least the columns 'Chromosome', 'Position',
        'Reference', 'Alternate', 'Allele Frequency' and 'Protein Consequence'.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(ambry_df, gnomad_df)``. ``ambry_df`` keeps the four columns above
        plus 'amino_acid_change' (``p_variant`` without its first two
        characters); rows with no 'amino_acid_change' are dropped and the index
        is reset. ``gnomad_df`` keeps the six columns above, with 'Position'
        cast to str, plus 'pos_id' built as ``'<Position>:<Alternate>'``.
    """
    ambry_cols = ['Gene', 'c_variant', 'p_variant', 'Classification']
    gnomad_cols = ['Chromosome', 'Position', 'Reference', 'Alternate',
                   'Allele Frequency', 'Protein Consequence']

    # Copy right after the column selection so the assignments below write to
    # an independent frame instead of a slice (avoids SettingWithCopyWarning).
    ambry_df = pd.read_excel(ambry)[ambry_cols].copy()
    # Drops the first two characters of p_variant -- presumably a 'p.' prefix; confirm against the workbook
    ambry_df['amino_acid_change'] = ambry_df['p_variant'].str[2:]
    ambry_df = ambry_df.dropna(subset = ['amino_acid_change']).reset_index(drop = True)

    gnomad_df = pd.read_excel(gnomad)[gnomad_cols].copy()
    gnomad_df['Position'] = gnomad_df['Position'].astype(str)
    gnomad_df['pos_id'] = gnomad_df['Position'] + ':' + gnomad_df['Alternate']

    return ambry_df, gnomad_df

In [None]:
def read_carriers_data(cc):
    """Load the case-control workbook and keep the BARD1 rows used downstream.

    Parameters
    ----------
    cc : str or path-like
        Excel workbook containing the case-control variant calls.

    Returns
    -------
    pandas.DataFrame
        Rows whose 'CAVA_GENE' is 'BARD1' and whose 'ER_status1' is 0 or 777,
        restricted to the twelve columns listed in ``kept_columns``.
    """
    kept_columns = ['Classification', '#CHROM', 'REF', 'ALT', 'CAVA_GENE',
                    'CAVA_CSN', 'CAVA_SO', 'Sample_AAF', 'Sample_ID',
                    'CaseControl', 'ER_status1', 'hg38_start']

    workbook = pd.read_excel(cc)

    # Gene filter and column selection in one step
    bard1_rows = workbook.loc[workbook['CAVA_GENE'] == 'BARD1', kept_columns].copy()

    # Original note: "ER negative cases only". 777 presumably marks records
    # without an ER status (e.g. controls) -- TODO confirm against the data dictionary.
    er_selected = bard1_rows['ER_status1'].isin([0, 777])
    return bard1_rows[er_selected]

In [None]:
def read_bridges(all, population):
    """Load the two BRIDGES workbooks and reduce each to per-variant counts.

    Parameters
    ----------
    all : str or path-like
        Workbook for the full BRIDGES set (sheet 'BARD1' is read).
    population : str or path-like
        Workbook for the population-based BRIDGES subset (sheet 'BARD1' is read).

    Returns
    -------
    list of pandas.DataFrame
        Two frames, in the order ``[all, population]``, each with the columns
        'Cases', 'Controls' and 'pos_id' (``'<hg38_pos>:<alt>'``).
    """
    source_columns = ['Cases', 'Controls', 'chr', 'ref', 'alt', 'hg38_pos']

    def _to_count_table(sheet):
        # Select the expected columns, then build the position key
        trimmed = sheet[source_columns].rename(columns = {'hg38_pos': 'pos'})
        trimmed['pos_id'] = trimmed['pos'].astype(str) + ':' + trimmed['alt']
        return trimmed[['Cases', 'Controls', 'pos_id']]

    # Both workbooks are read before either one is processed
    sheets = [pd.read_excel(path, sheet_name = 'BARD1') for path in (all, population)]

    return [_to_count_table(sheet) for sheet in sheets]

In [None]:
def classify_vars(sge, analysis_type=None):
    """Restrict SGE missense variants to a protein region and split by class.

    Parameters
    ----------
    sge : pandas.DataFrame
        SGE table with the columns 'Consequence', 'amino_acid_change' and
        'Classification'. 'amino_acid_change' is parsed by dropping its first
        and last character and reading the rest as an integer residue number.
    analysis_type : str, optional
        One of 'RING', 'ARD', 'BRCT', 'Structured' or 'All'. When omitted the
        user is prompted for it, as before. Passing it explicitly lets the
        notebook run unattended.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(selected, functional, nonfunctional)``: the missense rows in the
        chosen region (with an added integer 'AApos' column), then the subsets
        whose 'Classification' is 'F' and 'NF' respectively.

    Raises
    ------
    ValueError
        If the region is not one of the five recognised names. Previously an
        unrecognised answer (e.g. a typo or wrong case) silently analysed all
        variants.
    """
    #under development - trying to figure out which parameters are best
    # Residue spans are half-open [start, stop), exactly as the range() calls they replace
    domain_spans = {
        'RING': [(26, 123)],
        'ARD': [(425, 546)],
        'BRCT': [(565, 778)],
    }
    disordered_spans = [(123, 425), (546, 565)]
    valid_regions = ['RING', 'ARD', 'BRCT', 'Structured', 'All']

    def _residues(spans):
        return [residue for start, stop in spans for residue in range(start, stop)]

    missense = sge[sge['Consequence'].isin(['missense_variant'])].copy()
    missense['AApos'] = missense['amino_acid_change'].str[1:-1].astype(int)

    if analysis_type is None:
        analysis_type = input('Analysis Region (RING/ARD/BRCT/Structured/All): ')
    analysis_type = analysis_type.strip()

    if analysis_type not in valid_regions:
        raise ValueError(
            f"Unknown analysis region {analysis_type!r}; expected one of {valid_regions}"
        )

    if analysis_type in domain_spans:
        in_region = missense['AApos'].isin(_residues(domain_spans[analysis_type]))
        missense = missense.loc[in_region]
    elif analysis_type == 'Structured':
        # Everything outside the two disordered spans
        in_idr = missense['AApos'].isin(_residues(disordered_spans))
        missense = missense.loc[~in_idr]
    # 'All' keeps every missense variant

    sge_nf = missense[missense['Classification'].isin(['NF'])]
    sge_func = missense[missense['Classification'].isin(['F'])]

    return missense, sge_func, sge_nf

In [None]:
def process_carriers(cc): #Adds Position ID column to case-control data and splits into cases and controls
    """Key single-base ALT calls by position and split them by case status.

    Parameters
    ----------
    cc : pandas.DataFrame
        Case-control table with the columns 'ALT', 'hg38_start' and
        'CaseControl'. It is not modified (the earlier version wrote a
        placeholder 'pos_id' column into it before copying).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(snvs, controls, cases)``. ``snvs`` holds the rows whose 'ALT' is a
        single character, with 'hg38_start' cast to str and a new 'pos_id'
        column (``'<hg38_start>:<ALT>'``). ``controls`` and ``cases`` are the
        rows of ``snvs`` whose 'CaseControl' is 'Control' and 'Case'.
    """
    # Work on a copy so the caller's frame is left untouched
    snvs = cc[cc['ALT'].str.len() == 1].copy()
    snvs['hg38_start'] = snvs['hg38_start'].astype(str) #Sets hg38 coordinates as str data type
    snvs['pos_id'] = snvs['hg38_start'] + ':' + snvs['ALT'] #Creates position ID

    cc_control = snvs[snvs['CaseControl'].isin(['Control'])] #Creates control df
    cc_case = snvs[snvs['CaseControl'].isin(['Case'])] #Creates case df

    return snvs, cc_control, cc_case

In [None]:
def analyze_ambry(sge, gnomad, ambry):
    """Compare SGE class counts between Ambry and gnomAD variants.

    SGE variants are matched to gnomAD on 'pos_id' and to Ambry on
    'amino_acid_change'. The NF/F counts of each match form a 2x2 table
    (rows: Ambry, gnomAD; columns: NF, F) that is tested with Fisher's exact
    test and summarised with statsmodels' ``Table2x2``.

    Parameters
    ----------
    sge : pandas.DataFrame
        SGE table with 'pos_id', 'amino_acid_change' and 'Classification'.
    gnomad : pandas.DataFrame
        gnomAD table with a 'pos_id' column.
    ambry : pandas.DataFrame
        Ambry table with 'amino_acid_change'. Its own 'Classification' column
        becomes 'Classification_y' after the merge; the SGE class is read from
        'Classification_x'.

    Returns
    -------
    pandas.DataFrame
        One row with the columns 'data' ('Ambry & gnomAD'), 'OR', 'LCB' and
        'UCB', in the same layout ``analyze_carriers`` returns. Earlier
        versions returned None, so callers that ignore the result are
        unaffected.

    Notes
    -----
    NOTE(review): an inner merge repeats rows when a key occurs more than once
    on either side, so counts are per matched row -- confirm this is intended.
    """
    def _nf_f_counts(labels):
        # A class that is absent counts as 0 instead of raising KeyError
        counts = labels.value_counts().reindex(['NF', 'F'], fill_value = 0)
        return counts['NF'], counts['F']

    sge_gnomad = pd.merge(sge, gnomad, on = 'pos_id', how = 'inner')
    gnomad_nf, gnomad_func = _nf_f_counts(sge_gnomad['Classification'])

    sge_ambry = pd.merge(sge, ambry, on = 'amino_acid_change', how = 'inner')
    ambry_nf, ambry_func = _nf_f_counts(sge_ambry['Classification_x'])

    contingency_table = np.array([[ambry_nf, ambry_func],
                                  [gnomad_nf, gnomad_func]])

    df = pd.DataFrame(contingency_table, columns = ['NF', 'F'], index = ['Ambry', 'gnomAD'])
    display(df)

    # Only the p-value is taken from Fisher's exact test; the odds ratio and CI come from Table2x2
    _, p_value = stats.fisher_exact(contingency_table)
    table = sm.stats.Table2x2(contingency_table) #Generates confidence intervals
    lower, upper = table.oddsratio_confint()

    result = table.summary() #Summary table with Odds and Risk ratios
    display(result) #Displays summary stats

    #Prints out summary stats
    print('Analysis for Ambry: ')
    print(f"Odds Ratio: {table.oddsratio:.2f}")
    print(f"95% CI: ({lower:.2f}, {upper:.2f})")
    print(f"P-value (Fisher's exact): {p_value:.4f}")

    return pd.DataFrame({'data': ['Ambry & gnomAD'], 'OR': [table.oddsratio],
                         'LCB': [lower], 'UCB': [upper]})

In [None]:
def analyze_carriers(cc, sge):
    """Test whether SGE non-functional variants are enriched among CARRIERS cases.

    Parameters
    ----------
    cc : pandas.DataFrame
        Case-control table with 'pos_id', 'CaseControl' ('Case' / 'Control')
        and its own 'Classification' column.
    sge : pandas.DataFrame
        SGE table with 'pos_id' and 'Classification' ('NF' / 'F'). After the
        merge the SGE class is read from 'Classification_y'.

    Returns
    -------
    pandas.DataFrame
        One row with the columns 'data' ('CARRIERS'), 'OR', 'LCB' and 'UCB'
        (odds ratio and its confidence bounds from ``Table2x2``).
    """
    merged = pd.merge(cc, sge, on = 'pos_id', how = 'inner') #Merges case-control and SGE data
    merged = merged.dropna(subset = ['Classification_y']) #Drops any rows without a classification

    contingency_tab = pd.crosstab(merged['CaseControl'], merged['Classification_y']) #Creates contingency table
    # Fix the orientation explicitly (rows Case/Control, columns NF/F) rather than
    # relying on reversed alphabetical order; a missing cell is counted as 0.
    contingency_tab = contingency_tab.reindex(index = ['Case', 'Control'],
                                              columns = ['NF', 'F'],
                                              fill_value = 0)
    table_array = contingency_tab.values #Gets values from contingency table

    print(table_array)
    # Only the p-value is taken from Fisher's exact test; the odds ratio and CI come from Table2x2
    _, p_value = stats.fisher_exact(table_array)
    table = sm.stats.Table2x2(table_array) #Generates confidence intervals
    lower, upper = table.oddsratio_confint()

    result = table.summary() #Summary table with Odds and Risk ratios
    display(contingency_tab) #Displays contingency table
    display(result) #Displays summary stats

    #Prints out summary stats
    print('Analysis for CARRIERS Data Set: ')
    print(f"Odds Ratio: {table.oddsratio:.2f}")
    print(f"95% CI: ({lower:.2f}, {upper:.2f})")
    print(f"P-value (Fisher's exact): {p_value:.4f}")

    df = pd.DataFrame({'data': ['CARRIERS'], 'OR': [table.oddsratio],
                       'LCB': [lower], 'UCB': [upper]})

    return df

In [None]:
def analyze_bridges(sge, dfs):
    """Run the NF-vs-F case/control comparison on each BRIDGES table.

    For every frame in ``dfs`` the per-variant 'Cases' and 'Controls' counts
    are summed within each SGE class and arranged as a 2x2 table
    (rows: Cases, Controls; columns: NF, F), which is tested with Fisher's
    exact test and summarised with statsmodels' ``Table2x2``.

    Parameters
    ----------
    sge : pandas.DataFrame
        SGE table with 'pos_id' and 'Classification' ('NF' / 'F').
    dfs : list of pandas.DataFrame
        BRIDGES tables with 'Cases', 'Controls' and 'pos_id'. The first two
        are labelled 'BRIDGES All' and 'BRIDGES Population'; any further table
        gets a numbered label instead of raising IndexError.

    Returns
    -------
    pandas.DataFrame
        One row per input table with the columns 'data', 'OR', 'LCB' and
        'UCB', in the same layout ``analyze_carriers`` returns. Earlier
        versions returned None, so callers that ignore the result are
        unaffected.
    """
    labels = ['BRIDGES All', 'BRIDGES Population']
    records = []

    for position, df in enumerate(dfs):
        label = labels[position] if position < len(labels) else f'BRIDGES {position + 1}'

        merged = pd.merge(df, sge, on = 'pos_id', how = 'inner')

        summary = merged.pivot_table(
            values = ['Cases', 'Controls'],
            index = 'Classification',
            aggfunc = 'sum'
        )

        print(summary.transpose())

        # Explicit orientation; a class or count column with no rows is counted as 0
        counts = summary.reindex(index = ['NF', 'F'],
                                 columns = ['Cases', 'Controls'],
                                 fill_value = 0)
        table_array = counts.to_numpy().T  # rows: Cases, Controls; columns: NF, F

        # Only the p-value is taken from Fisher's exact test; the odds ratio and CI come from Table2x2
        _, p_value = stats.fisher_exact(table_array)
        table = sm.stats.Table2x2(table_array) #Generates confidence intervals
        lower, upper = table.oddsratio_confint()

        result = table.summary() #Summary table with Odds and Risk ratios
        display(result) #Displays summary stats

        #Prints out summary stats
        print('Analysis for ', label, ' Data Set: ')
        print(f"Odds Ratio: {table.oddsratio:.2f}")
        print(f"95% CI: ({lower:.2f}, {upper:.2f})")
        print(f"P-value (Fisher's exact): {p_value:.4f}")

        records.append({'data': label, 'OR': table.oddsratio, 'LCB': lower, 'UCB': upper})

    return pd.DataFrame(records, columns = ['data', 'OR', 'LCB', 'UCB'])

In [None]:
def cc_fig(df):
    """Draw the odds-ratio point plot with confidence-interval bars.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows with the columns 'data', 'OR', 'LCB' and 'UCB' (the layout
        returned by ``analyze_carriers``). A hard-coded 'Ambry & gnomAD' row
        is appended before plotting.

    The chart is displayed; nothing is returned.
    """
    #Manual input for ambry data, will combine notebooks
    ambry_row = pd.DataFrame({
        'data': ['Ambry & gnomAD'],
        'OR': [1.378],
        'LCB': [1.133],
        'UCB': [1.677],
    })
    plot_data = pd.concat([df, ambry_row])

    # Data set -> colour, in legend order
    dataset_colors = {
        'Ambry & gnomAD': '#006616', # dark green
        'CARRIERS': '#81B4C7',       # dusty blue
    }
    color_scale = alt.Scale(domain = list(dataset_colors.keys()),
                            range = list(dataset_colors.values()))
    color_legend = alt.Legend(title = '', labelFontSize = 14, titleFontSize = 26)

    x_axis = alt.Axis(title = '', labels = False, labelFontSize = 16)
    or_axis = alt.Axis(title = 'Odds Ratio', labelFontSize = 16, titleFontSize = 18)

    chart = alt.Chart(plot_data)

    # Point estimate per data set
    or_points = chart.mark_point(filled = True, size = 50).encode(
        x = alt.X('data', axis = x_axis),
        y = alt.Y('OR', axis = or_axis),
        color = alt.Color('data', scale = color_scale, legend = color_legend),
    )

    # Error bar from the lower to the upper confidence bound
    interval_bars = chart.mark_errorbar().encode(
        x = 'data',
        y = alt.Y('LCB:Q', axis = alt.Axis(title = '')),
        y2 = 'UCB:Q',
        color = 'data',
    )

    layered = or_points + interval_bars
    sized = layered.properties(height = 600, width = 100)
    figure = sized.configure_view(stroke = None).configure_axis(grid = False)

    figure.display()

    #figure.save('/Users/ivan/Desktop/BARD1_draft_figs/fig_4d_ORplot.png', ppi = 500)

In [None]:
def main():
    """Run the whole notebook: load every data set, then run the three analyses.

    Reads the module-level path variables (sge, ambry, gnomad, carriers,
    bridges_all, bridges_population). The Ambry/gnomAD analysis receives the
    SGE table as loaded, while the CARRIERS and BRIDGES analyses receive the
    region-restricted missense table from ``classify_vars``.
    """
    # --- Load ---
    sge_scores = read_sge(sge)
    ambry_variants, gnomad_variants = read_ambry(ambry, gnomad)
    carriers_raw = read_carriers_data(carriers)
    bridges_tables = read_bridges(bridges_all, bridges_population)

    # --- Prepare --- (the F / NF and control / case splits are not used below)
    region_sge, _region_func, _region_nf = classify_vars(sge_scores)
    carriers_snvs, _carriers_controls, _carriers_cases = process_carriers(carriers_raw)

    # --- Analyse ---
    analyze_ambry(sge_scores, gnomad_variants, ambry_variants)
    carriers_summary = analyze_carriers(carriers_snvs, region_sge)
    analyze_bridges(region_sge, bridges_tables)
    #cc_fig(carriers_summary)

In [None]:
# Runs the full analysis. classify_vars prompts for the analysis region, so this cell waits for user input.
main()