In [None]:
import pandas as pd
import altair as alt
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Relative path to the BARD1 SGE results workbook (Excel) used as the notebook's input.
file = "../Data/BARD1_SGE_final_table.xlsx"

In [None]:
def read_data(file):
    """Load the SGE score table and stack it into one row per variant per dataset.

    Reads the 'scores' sheet of the workbook at ``file``, drops rows whose
    ``variant_qc_flag`` is 'WARN', then builds two subsets: rows with a
    ``gnomad_af`` value (labelled 'gnomAD') and rows with a ``regeneron_maf``
    value (labelled 'Regeneron'). The subsets are concatenated, so a variant
    with both frequencies appears twice, once per dataset.

    Parameters
    ----------
    file : str or path-like
        Excel workbook containing a 'scores' sheet with at least the columns
        ``variant_qc_flag``, ``gnomad_af``, ``regeneron_maf``, ``consequence``
        and ``score``.

    Returns
    -------
    pandas.DataFrame
        The stacked table with a fresh 0..n-1 index and these changes:
        ``gnomad_af`` / ``regeneron_maf`` merged into 'Allele Frequency',
        a 'dataset' label column, ``consequence`` renamed to 'Consequence'
        (with display-friendly labels), ``score`` renamed to
        'snv_score_minmax', and 'log_AF' holding log10 of the allele
        frequency (NaN where the frequency is missing or not positive).
    """
    scores = pd.read_excel(file, sheet_name = 'scores')
    scores = scores.loc[~scores['variant_qc_flag'].isin(['WARN'])]

    # (frequency column to keep, frequency column to discard, dataset label)
    sources = [
        ('gnomad_af', 'regeneron_maf', 'gnomAD'),
        ('regeneron_maf', 'gnomad_af', 'Regeneron'),
    ]

    # .assign() builds a new frame instead of writing a column into a frame
    # derived from a filtered slice, which is the pattern that triggers
    # pandas' SettingWithCopyWarning.
    per_dataset = [
        scores.dropna(subset = [keep])
              .drop(columns = [discard])
              .rename(columns = {keep: 'Allele Frequency'})
              .assign(dataset = label)
        for keep, discard, label in sources
    ]

    # ignore_index: both subsets carry the original row labels, so without it
    # a variant present in both datasets would leave duplicate index labels.
    df = pd.concat(per_dataset, ignore_index = True)

    df = df.rename(columns = {'consequence': 'Consequence', 'score': 'snv_score_minmax'}) #Renames columns to harmonize column names

    #Renames variant consequences to be cleaner.
    #Rules run top to bottom; each is (match mode, text to match, display label).
    consequence_rules = [
        ('contains', 'missense', 'Missense'),
        ('equals', 'synonymous_variant', 'Synonymous'),
        ('equals', 'intron_variant', 'Intron'),
        ('equals', 'stop_gained', 'Stop Gained'),
        ('equals', 'stop_lost', 'Stop Lost'),
        ('contains', 'site', 'Canonical Splice'),
        ('contains', 'ing_var', 'Splice Region'),
        ('contains', 'UTR', 'UTR Variant'),
        ('equals', 'start_lost', 'Start Lost'),
    ]
    for mode, text, label in consequence_rules:
        current = df['Consequence']
        if mode == 'contains':
            # na = False: a missing consequence must count as "no match";
            # otherwise the mask contains NaN and .loc refuses to index with it.
            matches = current.str.contains(text, regex = False, na = False)
        else:
            matches = current == text
        df.loc[matches, 'Consequence'] = label

    # log10 is only defined for positive frequencies; masking the rest to NaN
    # first avoids -inf values and numpy's divide-by-zero RuntimeWarning.
    allele_freq = df['Allele Frequency']
    df['log_AF'] = np.log10(allele_freq.where(allele_freq > 0))

    return df

In [None]:
def graph(df):
    """Draw the interactive scatter plot of allele frequency vs. SGE score.

    Each row of ``df`` becomes one point: x is ``snv_score_minmax``, y is
    'Allele Frequency' on a log scale, colour encodes 'Consequence' and
    marker shape encodes ``dataset``. Hovering a point shows its exon,
    position ID, consequence, amino-acid change, score and frequency.

    Parameters
    ----------
    df : table accepted by ``alt.Chart``
        Must provide the fields ``snv_score_minmax``, 'Allele Frequency',
        'Consequence', ``dataset``, ``exon``, ``pos_id`` and
        ``amino_acid_change``.

    Returns
    -------
    The configured chart, after it has been displayed, so the caller can
    save or further customise it.
    """
    # Consequence label -> point colour. One mapping (instead of two lists
    # matched by position) keeps each label tied to its colour.
    consequence_colors = {
        'Synonymous': '#006616',        # dark green
        'Missense': '#81B4C7',          # dusty blue
        'Stop Gained': '#ffcd3a',       # yellow
        'Intron': '#6AA84F',            # med green
        'UTR Variant': '#93C47D',       # light green
        'Stop Lost': '#888888',         # med gray
        'Start Lost': '#000000',        # black
        'Canonical Splice': '#1170AA',  # darker blue
        'Splice Region': '#CFCFCF',     # light gray
    }

    #Builds scatter plot
    chart = alt.Chart(df).mark_point().encode(
        x = alt.X('snv_score_minmax',
                  axis = alt.Axis(
                      title = 'SGE Score',
                      labelFontSize = 18,
                      titleFontSize = 20
                  )
                 ),
        y = alt.Y('Allele Frequency',
                  axis = alt.Axis(
                      grid = False,
                      labelFontSize = 18,
                      titleFontSize = 20,
                      format = 'e'
                  ),
                  scale = alt.Scale(type = 'log')
                 ),
        color = alt.Color('Consequence',
                          scale = alt.Scale(range = list(consequence_colors.values()),
                                            domain = list(consequence_colors)),
                          legend = alt.Legend(
                              titleFontSize = 16,
                              labelFontSize = 14,
                              labelLimit = 500
                          )
                         ),
        shape = alt.Shape('dataset',
                          legend = alt.Legend(
                              title = 'Dataset',
                              titleFontSize = 16,
                              labelFontSize = 14
                          )
                         ),
        tooltip = [alt.Tooltip('exon', title = 'Exon: '),
                   alt.Tooltip('pos_id', title = 'Pos. ID: '),
                   alt.Tooltip('Consequence', title = 'Consequence: '),
                   alt.Tooltip('amino_acid_change', title = 'AA Sub.: '),
                   alt.Tooltip('snv_score_minmax', title = 'SGE Score: '),
                   alt.Tooltip('Allele Frequency', title = 'MAF: ')
                  ]
    ).configure_axis(
        grid = False
    ).properties(
        width = 600,
        height = 400,
        title = alt.TitleParams( text = 'Allele Frequency vs. SGE Score' + ' (n = ' + str(len(df)) + ')', fontSize = 22)
    ).interactive()

    # display() renders through the notebook's active renderer, the same way
    # heatmap() does; show() can require the separate altair_viewer package.
    chart.display()

    return chart

In [None]:
def heatmap(df):
    """Render a binned heatmap of variant counts: SGE score vs. log10 allele frequency.

    Parameters
    ----------
    df : table accepted by ``alt.Chart``
        Must provide the ``snv_score_minmax`` and ``log_AF`` fields.

    Returns
    -------
    The configured chart, after it has been displayed.
    """
    # X channel: SGE score in at most 50 bins, with fixed tick positions.
    score_axis = alt.Axis(
        title = 'SGE Score',
        titleFontSize = 20,
        labelFontSize = 18,
        values = [-0.5, -0.4, -0.3, -0.2, -0.1, 0, 0.1]
    )
    score_channel = alt.X(
        'snv_score_minmax:Q',
        axis = score_axis,
        bin = alt.Bin(maxbins = 50)
    )

    # Y channel: log10 allele frequency in at most 20 bins, ticks at whole decades.
    frequency_axis = alt.Axis(
        title = 'log10(Allele Frequency)',
        titleFontSize = 20,
        labelFontSize = 18,
        values = [0, -1, -2, -3, -4, -5, -6]
    )
    frequency_channel = alt.Y(
        'log_AF:Q',
        bin = alt.Bin(maxbins = 20),
        axis = frequency_axis
    )

    # Cell colour: number of rows falling into each (score, frequency) bin.
    count_legend = alt.Legend(
        title = '# of Variants',
        titleFontSize = 16,
        labelFontSize = 14
    )
    count_channel = alt.Color(
        'count():Q',
        scale = alt.Scale(scheme = 'lighttealblue'),
        legend = count_legend
    )

    headline = alt.TitleParams(
        text = f'Allele Frequency vs. SGE Score (n = {len(df)})',
        fontSize = 22
    )

    binned = alt.Chart(df).mark_rect().encode(
        x = score_channel,
        y = frequency_channel,
        color = count_channel
    )
    sized = binned.properties(width = 500, height = 300, title = headline)
    plot = sized.configure_axis(grid = False).configure_view(stroke = None)

    plot.display()

    return plot

In [None]:
def density(df):
    """Draw a filled 2-D kernel density plot of SGE score vs. log10 allele frequency.

    Parameters
    ----------
    df : table accepted by ``sns.kdeplot``
        Must provide the ``snv_score_minmax`` and ``log_AF`` columns.

    Returns
    -------
    None
        The figure is shown via ``plt.show()``.
    """
    # Note: this changes seaborn's style globally, not just for this figure.
    sns.set_style("whitegrid")

    # Explicit figure/axes rather than pyplot's implicit "current axes", so
    # the plot and its labels are guaranteed to land on the same axes.
    fig, ax = plt.subplots(figsize=(6, 4))

    sns.kdeplot(
        data = df,
        x = 'snv_score_minmax',
        y = 'log_AF',
        fill = True,
        cmap = 'viridis',
        levels = 10,
        thresh = 0.05,
        bw_adjust = 0.5,
        ax = ax
    )

    # Human-readable labels so the figure stands alone (raw column names otherwise).
    ax.set(
        xlabel = 'SGE Score',
        ylabel = 'log10(Allele Frequency)',
        title = 'Allele Frequency vs. SGE Score'
    )

    plt.show()

In [None]:
def main():
    """Run the notebook end to end: load the score table, then draw all three figures.

    Reads the workbook named by the module-level ``file`` and passes the
    resulting table to ``graph``, ``heatmap`` and ``density`` in that order.
    """
    variants = read_data(file)

    graph(variants)
    score_af_heatmap = heatmap(variants)
    density(variants)

    # The heatmap chart is kept in a local so it can be exported when needed,
    # e.g. score_af_heatmap.save(<output path>, ppi = 500).

In [None]:
# Entry point: running this cell executes main(), which loads the data and draws every figure.
main() 