In [None]:
import pandas as pd
import altair as alt

In [None]:
# Input workbooks; the paths are relative, so they resolve against the kernel's working directory.
gnom_path = '../Data/20240905_BARD1_gnomADv4.1.0_SNVs.xlsx' #gnomAD allele-frequency export (passed to read_gnomAD by main)
reg_path = '../Data/20240802_BARD1_Regeneron_MAF.xlsx' #Regeneron allele-frequency export (passed to read_regeneron by main)
scores = '../Data/20250825_BARD1snvscores_filtered.xlsx' #SGE score table (passed to add_scores by main)

In [None]:
def read_gnomAD(gnomAD_path):
    """Load gnomAD allele frequencies and add a ``pos_id`` merge key.

    Parameters
    ----------
    gnomAD_path : str or path-like
        Excel workbook readable by ``pd.read_excel``. It must contain the
        columns 'gnomAD ID' and 'Allele Frequency'.

    Returns
    -------
    pandas.DataFrame
        A new frame with columns 'gnomAD ID', 'Allele Frequency',
        'dataset' (always 'gnomAD') and 'pos_id' ('<position>:<alt>').

    Notes
    -----
    IDs are expected to be dash-delimited with the position as the second
    field and the alternate allele as the last field, e.g.
    '2-214728633-C-T' -> '214728633:T'. The fields are split on '-' instead
    of being cut at fixed character offsets, so the chromosome name and the
    position may have any number of characters. Rows with a missing ID get
    a missing 'pos_id' and therefore never match in a later inner merge.
    The workbook is assumed to hold SNVs only (single-base alt) -- TODO
    confirm if it is ever reused for indels.
    """
    unfiltered = pd.read_excel(gnomAD_path)

    # .copy() so the new columns are written to an independent frame,
    # not to a view of the freshly read table.
    filtered = unfiltered[['gnomAD ID', 'Allele Frequency']].copy()
    filtered['dataset'] = 'gnomAD'

    # Each ID becomes a list of fields; .str[i] then indexes into that list.
    id_fields = filtered['gnomAD ID'].str.split('-')
    filtered['pos_id'] = id_fields.str[1] + ':' + id_fields.str[-1]

    return filtered

In [None]:
def read_regeneron(reg_path):
    """Load Regeneron allele frequencies and add a ``pos_id`` merge key.

    Parameters
    ----------
    reg_path : str or path-like
        Excel workbook readable by ``pd.read_excel``. It must contain the
        columns 'Variant' and 'AAF'.

    Returns
    -------
    pandas.DataFrame
        A new frame with columns 'pos_id' ('<position>:<alt>'),
        'Allele Frequency' (renamed from 'AAF') and 'dataset'
        (always 'Regeneron').

    Notes
    -----
    Variant IDs are expected to be colon-delimited with the position as the
    second field and the alternate allele as the last field, e.g.
    '2:214728633:C:T' -> '214728633:T'. The fields are split on ':' instead
    of being cut at fixed character offsets, so the chromosome name and the
    position may have any number of characters. Rows with a missing ID get
    a missing 'pos_id' and therefore never match in a later inner merge.
    The workbook is assumed to hold SNVs only (single-base alt) -- TODO
    confirm if it is ever reused for indels.
    """
    raw = pd.read_excel(reg_path)

    # Keep only the two needed columns, renamed to match the gnomAD table
    # so both frames can be concatenated and merged on 'pos_id'.
    maf = raw[['Variant', 'AAF']].copy()
    maf = maf.rename(columns={'AAF': 'Allele Frequency', 'Variant': 'pos_id'})
    maf['dataset'] = 'Regeneron'

    # Each ID becomes a list of fields; .str[i] then indexes into that list.
    id_fields = maf['pos_id'].str.split(':')
    maf['pos_id'] = id_fields.str[1] + ':' + id_fields.str[-1]

    return maf

In [None]:
def add_scores(df, scores):
    """Attach SGE scores to allele-frequency rows and tidy consequence labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Allele-frequency table with a 'pos_id' column ('<position>:<alt>').
        It is not modified.
    scores : str or path-like
        Excel workbook readable by ``pd.read_excel``. It must contain the
        columns 'exon', 'pos', 'alt', 'consequence', 'score' and
        'amino_acid_change'.

    Returns
    -------
    pandas.DataFrame
        Inner merge on 'pos_id' of the score columns ('exon', 'pos_id',
        'Consequence', 'snv_score_minmax', 'amino_acid_change') with ``df``.
        Variants missing from either table are dropped, and 'Consequence'
        is rewritten to display labels.

    Notes
    -----
    The relabelling rules run in order, and each rule sees the result of
    the ones before it, so a value matching several rules keeps the label
    of the last rule that matches. Rows whose consequence is missing are
    left missing instead of raising during masking.
    """
    sge = pd.read_excel(scores)
    sge = sge.rename(columns={'consequence': 'Consequence', 'score': 'snv_score_minmax'})

    # Build the same '<position>:<alt>' key the allele-frequency tables carry.
    sge['pos'] = sge['pos'].astype(str)
    sge['pos_id'] = sge['pos'] + ':' + sge['alt']
    sge_scores = sge[['exon', 'pos_id', 'Consequence', 'snv_score_minmax', 'amino_acid_change']]

    merged = pd.merge(sge_scores, df, on='pos_id', how='inner')

    # (match mode, raw text, display label) -- order matters, see Notes.
    relabel_rules = (
        ('contains', 'missense', 'Missense'),
        ('equals', 'synonymous_variant', 'Synonymous'),
        ('equals', 'intron_variant', 'Intron'),
        ('equals', 'stop_gained', 'Stop Gained'),
        ('equals', 'stop_lost', 'Stop Lost'),
        ('contains', 'site', 'Canonical Splice'),
        ('contains', 'ing_var', 'Splice Region'),
        ('contains', 'UTR', 'UTR Variant'),
        ('equals', 'start_lost', 'Start Lost'),
    )
    for mode, raw_text, label in relabel_rules:
        current = merged['Consequence']
        if mode == 'contains':
            # na=False keeps missing consequences out of the mask; without it
            # the mask can contain NA, which .loc refuses to index with.
            # The patterns are plain text, so literal matching is sufficient.
            mask = current.str.contains(raw_text, regex=False, na=False)
        else:
            mask = current == raw_text
        merged.loc[mask, 'Consequence'] = label

    return merged

In [None]:
def graph(df):
    """Show an interactive scatter plot of allele frequency against SGE score.

    ``df`` must provide the columns 'snv_score_minmax', 'Allele Frequency',
    'Consequence', 'dataset', 'exon', 'pos_id' and 'amino_acid_change'.
    Points are coloured by consequence and shaped by source dataset; the
    y axis is logarithmic. The chart is displayed and nothing is returned.
    """
    # Consequence label -> point colour. Insertion order is the order in
    # which the labels are handed to the colour scale as its domain.
    consequence_colors = {
        'Synonymous': '#006616',        # dark green
        'Missense': '#81B4C7',          # dusty blue
        'Stop Gained': '#ffcd3a',       # yellow
        'Intron': '#6AA84F',            # med green
        'UTR Variant': '#93C47D',       # light green
        'Stop Lost': '#888888',         # med gray
        'Start Lost': '#000000',        # black
        'Canonical Splice': '#1170AA',  # darker blue
        'Splice Region': '#CFCFCF',     # light gray
    }

    score_axis = alt.X(
        'snv_score_minmax',
        axis=alt.Axis(title='SGE Score', labelFontSize=18, titleFontSize=20),
    )

    # Scientific-notation tick labels on a log scale.
    frequency_axis = alt.Y(
        'Allele Frequency',
        axis=alt.Axis(grid=False, labelFontSize=18, titleFontSize=20, format='e'),
        scale=alt.Scale(type='log'),
    )

    consequence_color = alt.Color(
        'Consequence',
        scale=alt.Scale(
            range=list(consequence_colors.values()),
            domain=list(consequence_colors),
        ),
        legend=alt.Legend(titleFontSize=16, labelFontSize=14, labelLimit=500),
    )

    dataset_shape = alt.Shape(
        'dataset',
        legend=alt.Legend(title='Dataset', titleFontSize=16, labelFontSize=14),
    )

    # (column, tooltip caption) pairs, in display order.
    hover_columns = (
        ('exon', 'Exon: '),
        ('pos_id', 'Pos. ID: '),
        ('Consequence', 'Consequence: '),
        ('amino_acid_change', 'AA Sub.: '),
        ('snv_score_minmax', 'SGE Score: '),
        ('Allele Frequency', 'MAF: '),
    )
    hover_fields = [alt.Tooltip(column, title=caption) for column, caption in hover_columns]

    # The title reports how many rows are plotted.
    chart_title = alt.TitleParams(
        text=f'Allele Frequency vs. SGE Score (n = {len(df)})',
        fontSize=22,
    )

    chart = (
        alt.Chart(df)
        .mark_point()
        .encode(
            x=score_axis,
            y=frequency_axis,
            color=consequence_color,
            shape=dataset_shape,
            tooltip=hover_fields,
        )
        .configure_axis(grid=False)
        .properties(width=600, height=400, title=chart_title)
        .interactive()
    )

    # To export a static image, call chart.save(<output path>, ppi=500) here.
    chart.show()

In [None]:
def main():
    """Run the pipeline: load both frequency tables, add SGE scores, plot.

    Reads the module-level paths ``gnom_path``, ``reg_path`` and ``scores``.
    """
    # gnomAD first, then Regeneron; both share the 'pos_id',
    # 'Allele Frequency' and 'dataset' columns, so they stack row-wise.
    frequency_tables = [read_gnomAD(gnom_path), read_regeneron(reg_path)]
    combined = pd.concat(frequency_tables)

    scored = add_scores(combined, scores)
    # To keep a copy of the merged table, call scored.to_excel(<output path>, index=False) here.
    graph(scored)


In [None]:
# Run the full pipeline: read both frequency tables, merge in SGE scores, show the plot.
main()