In [None]:
import pandas as pd
import altair as alt

In [None]:
# Input files; both are relative paths, so they resolve against the notebook's working directory.
gnom_path = '../Data/20240905_BARD1_gnomADv4.1.0_SNVs.xlsx' # Excel workbook that main() hands to read_gnomAD
scores = '../Data/20250423_BARD1_snvscores_IGVFupload.tsv' # Tab-separated SGE score table that main() hands to add_scores

In [None]:
def read_gnomAD(gnomAD_path): #Reads gnomAD file
    """Load the gnomAD workbook and keep only the columns used downstream.

    Parameters
    ----------
    gnomAD_path : str or path-like
        Location of the Excel file, read with ``pd.read_excel``.

    Returns
    -------
    pandas.DataFrame
        An independent frame holding the 'gnomAD ID' and 'Allele Frequency'
        columns.

    Raises
    ------
    KeyError
        If either expected column is missing from the workbook.
    """
    unfiltered = pd.read_excel(gnomAD_path) #Reads gnomAD file

    # .copy() detaches the selection from the full table, so callers can add
    # columns to the result without triggering SettingWithCopyWarning.
    filtered = unfiltered[['gnomAD ID', 'Allele Frequency']].copy() #Gets necessary columns

    return filtered

In [None]:
def rewrite(df): #Adds a pos_id column for merging
    """Return a copy of ``df`` with a 'pos_id' merge key derived from 'gnomAD ID'.

    The key has the form '<position>:<alt allele>'. IDs are parsed by splitting
    on '-' rather than by fixed character offsets, so the result does not depend
    on how many characters the chromosome or position occupy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'gnomAD ID' column of strings laid out as
        'chrom-pos-ref-alt' (assumed SNV records; multi-base alleles are not
        treated specially).

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra 'pos_id' column; the input is left unmodified.
    """
    fields = df['gnomAD ID'].str.split('-')

    # Field 1 is the position and field 3 the alternate allele.
    pos_id = fields.str[1] + ':' + fields.str[3]

    # assign() builds a new frame, which avoids mutating the caller's data
    # (and the SettingWithCopyWarning that in-place assignment on a slice raises).
    return df.assign(pos_id = pos_id)

In [None]:
def add_scores(df, scores): #Merges gnomAD data frame and SGE scores
    """Merge gnomAD rows with SGE scores and simplify the consequence labels.

    Parameters
    ----------
    df : pandas.DataFrame
        gnomAD data containing a 'pos_id' column ('<position>:<alt>').
    scores : str, path-like or file-like
        Tab-separated SGE score table. It must provide the columns 'exon',
        'pos', 'alt', 'consequence', 'score' and 'amino_acid_change'.

    Returns
    -------
    pandas.DataFrame
        Inner join on 'pos_id' with the columns 'exon', 'pos_id', 'Consequence',
        'snv_score_minmax', 'amino_acid_change', followed by the remaining
        columns of ``df``. Rows with a missing consequence keep their missing
        value.
    """
    sge = pd.read_csv(scores, sep = '\t') #Reads SGE scores
    sge = sge.rename(columns = {'consequence': 'Consequence', 'score': 'snv_score_minmax'}) #Harmonizes column names
    sge['pos_id'] = sge['pos'].astype(str) + ':' + sge['alt'] #Same key format as the gnomAD pos_id
    sge = sge[['exon', 'pos_id', 'Consequence', 'snv_score_minmax', 'amino_acid_change']] #Pulls necessary columns

    merged = pd.merge(sge, df, on = 'pos_id', how = 'inner') #Merges both data frames

    # Ordered relabelling rules: (match kind, pattern, display label).
    # Order matters: a combined annotation that contains 'missense' is labelled
    # Missense before the broader 'splic' substring rule gets a chance to match.
    relabel_rules = [
        ('contains', 'missense', 'Missense'),
        ('equals', 'synonymous_variant', 'Synonymous'),
        ('equals', 'intron_variant', 'Intron'),
        ('equals', 'stop_gained', 'Stop Gained'),
        ('equals', 'stop_lost', 'Stop Lost'),
        ('contains', 'splic', 'Splice'),
        ('contains', 'UTR', 'UTR Variant'),
        ('equals', 'start_lost', 'Start Lost'),
    ]

    for kind, pattern, label in relabel_rules:
        current = merged['Consequence']
        if kind == 'contains':
            # na=False: a missing consequence yields False instead of NaN;
            # a mask containing NaN cannot be used for boolean indexing.
            mask = current.str.contains(pattern, na = False)
        else:
            mask = current == pattern
        merged.loc[mask, 'Consequence'] = label

    return merged

In [None]:
def graph(df, out_path = '/Users/ivan/Desktop/BARD1_draft_figs/gnomAD.png'): #Generates scatter plot of gnomAD allele frequency vs. SGE score
    """Plot gnomAD allele frequency (log scale) against SGE score, coloured by consequence.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'snv_score_minmax', 'Allele Frequency', 'Consequence',
        'exon', 'pos_id' and 'amino_acid_change'. The row count is shown in the
        chart title.
    out_path : str or None, optional
        Where the chart is saved (at ppi = 500). Defaults to the location the
        notebook has always written to; pass ``None`` to skip saving.

    Returns
    -------
    None
        The chart is saved (unless ``out_path`` is None) and displayed.
    """
    # Legend order. The labels must match the ones add_scores writes into
    # 'Consequence', so the UTR entry is 'UTR Variant' rather than 'UTR'.
    legend_order = ["Intron", "Missense", "Synonymous", "Stop Gained", "Splice", "Start Lost", 'Stop Lost', 'UTR Variant']

    x_axis = alt.X(
        'snv_score_minmax',
        axis = alt.Axis(title = 'SGE Score', labelFontSize = 32, titleFontSize = 36),
    )
    y_axis = alt.Y(
        'Allele Frequency',
        axis = alt.Axis(grid = False, labelFontSize = 32, titleFontSize = 36),
        scale = alt.Scale(type = 'log'),
    )
    colour = alt.Color(
        'Consequence',
        sort = legend_order,
        scale = alt.Scale(scheme = 'category10'),
        legend = alt.Legend(titleFontSize = 24, labelFontSize = 24),
    )
    hover = [
        alt.Tooltip('exon', title = 'Exon: '),
        alt.Tooltip('pos_id', title = 'Pos. ID: '),
        alt.Tooltip('Consequence', title = 'Consequence: '),
        alt.Tooltip('amino_acid_change', title = 'AA Sub.: '),
        alt.Tooltip('snv_score_minmax', title = 'SGE Score: '),
    ]
    chart_title = alt.TitleParams(
        text = 'gnomAD Allele Frequency vs. SGE Scores' + ' (n = ' + str(len(df)) + ')',
        fontSize = 40,
    )

    #Builds scatter plot
    chart = alt.Chart(df).mark_point().encode(
        x = x_axis,
        y = y_axis,
        color = colour,
        tooltip = hover,
    ).configure_axis(
        grid = False
    ).properties(
        width = 1800,
        height = 674,
        title = chart_title,
    ).interactive()

    if out_path is not None:
        chart.save(out_path, ppi = 500)
    chart.show()

In [None]:
def main():
    """Run the notebook pipeline: load gnomAD, build merge keys, join SGE scores, plot."""
    gnomad_keyed = rewrite(read_gnomAD(gnom_path))  # gnomAD table with its pos_id column
    graph(add_scores(gnomad_keyed, scores))  # join with the SGE scores, then draw the scatter plot


In [None]:
# Run the pipeline defined in main() when this cell is executed.
main() 