This notebook builds Extended Data Fig. 6: minor allele frequency (MAF) vs. fitness score.

In [None]:
import pandas as pd
import altair as alt
import numpy as np
from scipy.stats import mannwhitneyu

In [None]:
# Path (relative to the working directory) of the Excel workbook holding the SGE results;
# read_data() loads its 'scores' sheet.
file = './Data/final_tables/supplementary_file_1_BARD1_SGE_final_table.xlsx' #SGE datafile

In [None]:
def read_data(file):
    """Load the SGE score table and build a long-format allele-frequency table.

    The 'scores' sheet of ``file`` is read, rows whose ``variant_qc_flag`` is
    'WARN' are removed, and the remaining rows are stacked once per population
    dataset (gnomAD, Regeneron) for which an allele frequency is present.

    Parameters
    ----------
    file : str or path-like
        Excel workbook containing a 'scores' sheet with (at least) the columns
        ``variant_qc_flag``, ``gnomad_af``, ``regeneron_maf``, ``consequence``
        and ``score``.

    Returns
    -------
    merged : pandas.DataFrame
        One row per (variant, dataset) pair with a non-missing frequency.
        Adds ``dataset`` ('gnomAD' / 'Regeneron'), ``Allele Frequency`` and
        ``log_AF`` (log10 of the frequency); ``consequence`` and ``score`` are
        renamed to ``Consequence`` and ``snv_score_minmax``. The original row
        labels are kept, so a variant present in both datasets appears twice
        with the same index label.
    raw_data : pandas.DataFrame
        The QC-filtered table in its original wide layout (both frequency
        columns still present).
    """
    scores = pd.read_excel(file, sheet_name = 'scores')

    # Explicit copy: the filtered frame is returned to the caller and reused
    # below, so it must not be a view-like slice of the freshly read sheet.
    raw_data = scores.loc[~scores['variant_qc_flag'].isin(['WARN'])].copy()

    # (frequency column to keep, frequency column to discard, dataset label)
    sources = [
        ('gnomad_af', 'regeneron_maf', 'gnomAD'),
        ('regeneron_maf', 'gnomad_af', 'Regeneron'),
    ]
    per_dataset = []
    for af_column, other_column, label in sources:
        # Every step returns a new frame, so adding 'dataset' never writes
        # into a slice of raw_data (no SettingWithCopyWarning).
        subset = (
            raw_data.dropna(subset = [af_column])
            .drop(columns = [other_column])
            .rename(columns = {af_column: 'Allele Frequency'})
            .assign(dataset = label)
        )
        per_dataset.append(subset)

    merged = pd.concat(per_dataset)

    # Renames columns to harmonize column names
    merged = merged.rename(columns = {'consequence': 'Consequence', 'score': 'snv_score_minmax'})

    # Renames variant consequences to be cleaner. Rules are applied in order,
    # each one seeing the result of the previous ones.
    relabel_rules = [
        ('contains', 'missense', 'Missense'),
        ('equals', 'synonymous_variant', 'Synonymous'),
        ('equals', 'intron_variant', 'Intron'),
        ('equals', 'stop_gained', 'Stop Gained'),
        ('equals', 'stop_lost', 'Stop Lost'),
        ('contains', 'site', 'Canonical Splice'),
        ('contains', 'ing_var', 'Splice Region'),
        ('contains', 'UTR', 'UTR Variant'),
        ('equals', 'start_lost', 'Start Lost'),
    ]
    for how, pattern, label in relabel_rules:
        current = merged['Consequence']
        if how == 'contains':
            # na=False: a missing consequence is simply "no match" instead of
            # producing a NaN-containing mask that .loc refuses to use.
            hits = current.str.contains(pattern, na = False)
        else:
            hits = current == pattern
        # Positional boolean array, so duplicated index labels are harmless.
        merged.loc[hits.astype(bool).to_numpy(), 'Consequence'] = label

    # NOTE(review): a frequency of exactly 0 yields -inf here (with a NumPy
    # RuntimeWarning); confirm whether such rows can occur in the input.
    merged['log_AF'] = np.log10(merged['Allele Frequency'])

    return merged, raw_data

In [None]:
def heatmap(df):
    """Render a binned heatmap of log10 allele frequency against fitness score.

    Rows of ``df`` are binned on ``snv_score_minmax`` (x, up to 50 bins) and
    ``log_AF`` (y, up to 20 bins); each cell is coloured by the number of rows
    falling into it. The title reports the number of rows in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with ``snv_score_minmax`` and ``log_AF`` columns.

    Returns
    -------
    The configured Altair chart. The chart is also displayed as a side effect.
    """
    # Axis styling for the two binned dimensions.
    score_axis = alt.Axis(
        title = 'Fitness Score',
        titleFontSize = 20,
        labelFontSize = 18,
        values = [-0.4, -0.2, 0],
    )
    frequency_axis = alt.Axis(
        title = 'log10(Allele Frequency)',
        titleFontSize = 20,
        labelFontSize = 18,
        values = [0, -1, -2, -3, -4, -5, -6],
    )

    # Encoding channels.
    x_channel = alt.X('snv_score_minmax:Q', axis = score_axis, bin = alt.Bin(maxbins = 50))
    y_channel = alt.Y('log_AF:Q', bin = alt.Bin(maxbins = 20), axis = frequency_axis)
    count_legend = alt.Legend(title = '# of Variants', titleFontSize = 16, labelFontSize = 14)
    count_scale = alt.Scale(scheme = 'lighttealblue', domain = [1,400])
    color_channel = alt.Color('count():Q', scale = count_scale, legend = count_legend)

    # Title carries the row count of the plotted table.
    n_rows = len(df)
    chart_title = alt.TitleParams(
        text = 'Allele Frequency vs. Fitness Score' + ' (n = ' + str(n_rows) + ')',
        fontSize = 22,
    )

    rects = alt.Chart(df).mark_rect().encode(x = x_channel, y = y_channel, color = color_channel)
    sized = rects.properties(width = 300, height = 250, title = chart_title)
    plot = sized.configure_axis(grid = False).configure_view(stroke = None)

    # Show the chart explicitly: the return value is not auto-rendered when
    # this function is called from inside another function.
    plot.display()

    return plot

In [None]:
def whitneytesting(df):
    """Compare allele frequencies of abnormal vs. normal variants per dataset.

    For gnomAD (``gnomad_af``) and Regeneron (``regeneron_maf``) separately,
    rows with a missing frequency are dropped and the frequencies of
    'functionally_abnormal' variants are tested against those of
    'functionally_normal' variants with a one-sided Mann-Whitney U test
    (``alternative = 'less'``). The p-value and both medians are printed;
    nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with ``gnomad_af``, ``regeneron_maf`` and
        ``functional_consequence`` columns.
    """
    def frequencies_by_class(af_column):
        # Frequencies of abnormal and normal variants that have a value in af_column.
        observed = df.dropna(subset = [af_column]).copy()
        functional_class = observed['functional_consequence']
        abnormal = observed.loc[functional_class.isin(['functionally_abnormal']), af_column]
        normal_like = observed.loc[functional_class.isin(['functionally_normal']), af_column]
        return abnormal, normal_like

    # All groups are extracted up front, before any test is run or printed.
    gnomad_groups = frequencies_by_class('gnomad_af')
    regeneron_groups = frequencies_by_class('regeneron_maf')
    comparisons = [gnomad_groups + ('gnomAD',), regeneron_groups + ('Regeneron',)]

    for ab, normal, dataset in comparisons:
        _, p_value = mannwhitneyu(ab, normal, alternative = 'less')

        print(f' Testing MAF distribution for LoF vs. functionally normal variants in {dataset} data. p-value is: {p_value}. The median MAF for LoF vars. is:{ab.median()} and median for normal vars. is: {normal.median()}')

In [None]:
def main(save = True, out_path = '/Users/ivan/Desktop/BARD1_draft_figs/fig_4a_MAF.svg'):
    """Run the full analysis: load data, draw the heatmap, print the MAF tests.

    Parameters
    ----------
    save : bool, default True
        When true, the heatmap is written to ``out_path``.
    out_path : str or path-like
        Destination of the saved figure. The default is the original
        author-specific location and is kept only for backward compatibility;
        pass a path that exists on the current machine when saving.
    """
    # `file` is the module-level path to the SGE workbook.
    merged, testing_input = read_data(file)

    # Heatmap uses the long (per-dataset) table; the statistical test needs
    # the wide table that still has both frequency columns.
    binned_map = heatmap(merged)
    whitneytesting(testing_input)

    if save:
        binned_map.save(out_path)

In [None]:
# Run the analysis without writing the figure to disk.
main(save = False) 