This notebook builds Extended Data Fig. 5a (orthogonal assay histogram and strip plots). The output is manually annotated to indicate the orthogonal assay used and the variants called abnormal in orthogonal assays but normal in SGE.

In [None]:
import pandas as pd
import numpy as np
import altair as alt

In [None]:
# Relative path to the BARD1 SGE supplementary workbook. main() passes this one
# path to both read_orthogonal() and read_scores(), which read its
# 'orthogonal_data', 'scores' and 'thresholds' sheets.
sge = '../Data/final_tables/supplementary_file_1_BARD1_SGE_final_table.xlsx' #sge data tables

In [None]:
def read_scores(scores): #Reads SGE scores
    """Load the SGE score table and the score thresholds from one workbook.

    Parameters
    ----------
    scores : anything ``pd.read_excel`` accepts (here: a path to an .xlsx file)
        Workbook containing a 'scores' sheet and a 'thresholds' sheet.

    Returns
    -------
    df : pandas.DataFrame
        The 'scores' sheet with rows removed where ``amino_acid_change`` is
        '-' or ``variant_qc_flag`` is 'WARN', and with ``score`` renamed to
        ``snv_score_minmax`` and ``amino_acid_change`` renamed to ``AAsub``.
    thresholds : list
        ``[min, max]`` taken from the first row of the 'thresholds' sheet.

    Raises
    ------
    KeyError
        If an expected sheet or column is missing.
    IndexError
        If the 'thresholds' sheet has no rows.
    """
    #A list of sheet names makes read_excel open the workbook once and return a dict keyed by sheet name
    sheets = pd.read_excel(scores, sheet_name = ['scores', 'thresholds'])
    df = sheets['scores']
    threshold_df = sheets['thresholds']

    #.iloc[0] takes the first row by position, independent of the index labels
    thresholds = [threshold_df['min'].iloc[0], threshold_df['max'].iloc[0]]

    #Keeps only variants with an amino acid change recorded (not '-') that are not flagged 'WARN' by QC
    has_aa_change = df['amino_acid_change'] != '-'
    passes_qc = df['variant_qc_flag'] != 'WARN'
    df = df.loc[has_aa_change & passes_qc]

    #'AAsub' is the key that merge() joins on
    df = df.rename(columns = {'score' : 'snv_score_minmax', 'amino_acid_change': 'AAsub'})

    return df, thresholds

In [None]:
def read_orthogonal(file): #Reads orthogonal assay file
    """Load the 'orthogonal_data' sheet and add a placeholder result column.

    Parameters
    ----------
    file : anything ``pd.read_excel`` accepts (here: a path to an .xlsx file)

    Returns
    -------
    pandas.DataFrame
        The sheet contents with an 'Orthogonal Assay' column set to 'ok' on
        every row and a fresh 0..n-1 index.
    """
    assay_table = pd.read_excel(file, sheet_name = 'orthogonal_data')

    #Placeholder value; characterize_orthogonal() later overwrites it with the real call
    assay_table = assay_table.assign(**{'Orthogonal Assay': 'ok'})

    return assay_table.reset_index(drop = True)

In [None]:
def merge(scores,orthogonal): #Merges dataframes
    """Join SGE scores to orthogonal assay data on the amino acid substitution.

    Parameters
    ----------
    scores : pandas.DataFrame
        Must contain 'AAsub' and 'pos_id' columns.
    orthogonal : pandas.DataFrame
        Must contain an 'AAsub' column.

    Returns
    -------
    pandas.DataFrame
        Inner join on 'AAsub' (score columns first), keeping only the first
        row for each 'pos_id'.
    """
    joined = scores.merge(orthogonal, how = 'inner', on = 'AAsub')

    #The join can yield the same pos_id more than once; keep the first occurrence only
    return joined.drop_duplicates(subset = 'pos_id')

In [None]:
def _label_orthogonal_result(df):
    """Return one 'Abnormal'/'Intermediate'/'Normal' call per row of df."""
    #Every cell is compared as stripped text, so ' Abnormal ' still counts as 'Abnormal'.
    #The output column itself is left out so an earlier label cannot feed back into the result.
    cells = df.drop(columns = ['Orthogonal Assay'], errors = 'ignore').astype(str)
    cells = cells.apply(lambda col: col.str.strip())

    is_abnormal = cells.eq('Abnormal').any(axis = 1).to_numpy(dtype = bool)
    is_intermediate = cells.eq('Intermediate').any(axis = 1).to_numpy(dtype = bool)

    #np.select takes the first matching condition: Abnormal outranks Intermediate, anything else is Normal
    return np.select([is_abnormal, is_intermediate], ['Abnormal', 'Intermediate'], default = 'Normal')


def _assign_assay_type(df):
    """Return a Series naming the assay type of each row (NaN if no rule matches)."""
    adamovich_hdr = df['Adamovich2019_HDR'].notna()
    adamovich_western = df['Adamovich2019_western'].notna()
    lee_hdr = df['Lee2015_HDR'].notna()
    brca1_foci = df['Dai2021_brca1foci'].notna()
    bard1_foci = df['Dai2021_bard1foci'].notna()
    ub_binding = df['Dai2021_Ubbinding'].notna()
    ub_activity = df['Witus2021_Ubactivity'].notna()
    parpi = df['Becker2021_PARPiSensitivity'].notna()

    #Rules are applied top to bottom and a later rule overwrites an earlier one
    #NOTE(review): 'Western Only' overrides 'HDR' for a row that has a Lee2015 HDR result but no
    #Adamovich HDR result, so that row is later removed - confirm this is intended
    rules = [
        ('HDR', adamovich_hdr | lee_hdr),
        ('Western Only', ~adamovich_hdr & adamovich_western),
        ('Ub Binding', ub_binding & brca1_foci & bard1_foci), #Ub binding together with both foci assays
        ('Ub Binding', ub_binding & ~brca1_foci & ~bard1_foci), #Ub binding with neither foci assay
        ('Ubiquitylation Activity', ub_activity),
        ('PARPi Sensitivity', parpi),
    ]

    #Object dtype keeps unmatched rows as real missing values next to the string labels
    assay_type = pd.Series(np.nan, index = df.index, dtype = object)
    for label, mask in rules:
        assay_type.loc[mask.to_numpy()] = label

    return assay_type


def characterize_orthogonal(df): #Determines how variant performed in orthogonal assay(s)
    """Label each variant with its orthogonal assay result and assay type.

    Parameters
    ----------
    df : pandas.DataFrame
        Merged score/orthogonal table. Must contain 'pos_id' and the assay
        columns 'Adamovich2019_HDR', 'Adamovich2019_western', 'Lee2015_HDR',
        'Dai2021_brca1foci', 'Dai2021_bard1foci', 'Dai2021_Ubbinding',
        'Witus2021_Ubactivity' and 'Becker2021_PARPiSensitivity'.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) without rows lacking 'pos_id'
        and without 'Western Only' rows, where:

        * 'Orthogonal Assay' is 'Abnormal' if any cell of the row equals
          'Abnormal', else 'Intermediate' if any cell equals 'Intermediate',
          else 'Normal'. The column is written by name, so its position in
          the frame does not matter.
        * 'Assay Type' names the assay; rows matching no rule are left as NaN.

    Raises
    ------
    KeyError
        If 'pos_id' or one of the assay columns is missing.
    """
    #Explicit copy so the column assignments below never write into the caller's frame
    df = df.dropna(subset = ['pos_id']).copy()

    df['Orthogonal Assay'] = _label_orthogonal_result(df)
    df['Assay Type'] = _assign_assay_type(df)

    #Variants characterized only by western blot are excluded from the figure
    return df.loc[df['Assay Type'] != 'Western Only']
        

In [None]:
def make_histogram(df, thresholds): #Makes histogram that displays distribution of SGE scores for variants assayed by orthogonal assays
    """Build a histogram of SGE scores colored by orthogonal assay result.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'snv_score_minmax' and 'Orthogonal Assay' columns.
    thresholds : list
        Accepted so the signature matches make_dotplot(); the histogram does
        not draw the threshold lines, so the value is not used.

    Returns
    -------
    An interactive Altair bar chart (600 x 400) with the row count of df in
    its title. X-axis title, labels and ticks are turned off.
    """
    bin_count = 50 #maximum number of bins
    legend_order = ['Normal','Abnormal','Intermediate'] #order for the legend

    score_axis = alt.X('snv_score_minmax',
                       axis = alt.Axis(title = '', labels = False, ticks = False),
                       bin = alt.Bin(maxbins = bin_count),
                       scale = alt.Scale(domain = [-0.45, 0.1]))
    count_axis = alt.Y('count()',
                       axis = alt.Axis(title = 'Number of Variants', labelFontSize = 16, titleFontSize = 20))
    result_color = alt.Color('Orthogonal Assay:N',
                             scale = alt.Scale(scheme = 'category10'),
                             sort = legend_order,
                             legend = alt.Legend(titleFontSize = 16, labelFontSize = 14))

    histogram = alt.Chart(df).mark_bar().encode(
        score_axis,
        count_axis,
        color = result_color
    ).properties(
        width = 600,
        height = 400,
        title = alt.TitleParams(text = f' Functional Scores for Variants with Orthogonal Assay Data (n = {len(df)})', fontSize = 22)
    ).interactive()

    return histogram

In [None]:
def make_dotplot(df, thresholds): #Makes dotplot to show how variants in orthogonal assays scored in SGE
    """Build strip plots of SGE scores, one facet row per assay type.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'snv_score_minmax', 'Orthogonal Assay', 'Assay Type',
        'exon', 'AAsub' and 'functional_consequence' columns.
    thresholds : list
        Accepted so the signature matches make_histogram(); no threshold
        lines are drawn on the faceted chart, so the value is not used.

    Returns
    -------
    An interactive Altair chart faceted by 'Assay Type' in a single column.
    Each facet is 600 x 100 with one tick per row of df, placed by score on
    x and by orthogonal assay result on y, and each facet has its own y scale.
    """
    legend_order = ['Normal','Abnormal','Intermediate'] #order for the legend

    score_axis = alt.X('snv_score_minmax',
                       axis = alt.Axis(title = 'Functional Score',
                                       values = [-0.4, -0.3, -0.2, -0.1, 0, 0.1],
                                       labelFontSize = 16,
                                       titleFontSize = 20
                                      ),
                       scale = alt.Scale(domain = [-0.45, 0.1])
                      )
    result_axis = alt.Y('Orthogonal Assay:N',
                        axis = alt.Axis(title = '', labelFontSize = 16, titleFontSize = 20))
    result_color = alt.Color('Orthogonal Assay:N',
                             scale = alt.Scale(scheme = 'category10'),
                             sort = legend_order,
                             legend = alt.Legend(titleFontSize = 16, labelFontSize = 14))
    hover_fields = [alt.Tooltip('snv_score_minmax', title = 'Functional Score: '),
                    alt.Tooltip('exon', title = 'BARD1 Exon: '),
                    alt.Tooltip('AAsub', title = 'Amino Acid Sub.: '),
                    alt.Tooltip('functional_consequence', title = 'SGE Functional Consequence: '),
                    alt.Tooltip('Assay Type', title = 'Orthogonal Assay: ')
                   ]

    dotplot = alt.Chart(df).mark_tick(opacity = 1).encode(
        x = score_axis,
        y = result_axis,
        color = result_color,
        tooltip = hover_fields
    ).properties(
        width = 600,
        height = 100,
        title = alt.TitleParams(text = '')
    ).facet('Assay Type:N', columns = 1, spacing = 20).resolve_scale(
        y = 'independent'
    ).interactive()

    return dotplot

In [None]:
def main():
    """Run the whole pipeline and display the combined figure.

    Reads the orthogonal assay data and the SGE scores from the workbook at
    ``sge``, merges and characterizes them, then stacks the histogram above
    the strip plots and displays the result.
    """
    orthogonal_table = read_orthogonal(sge)
    score_table, cutoffs = read_scores(sge)
    annotated = characterize_orthogonal(merge(score_table, orthogonal_table))

    #Histogram on top, strip plots underneath
    panels = [make_histogram(annotated, cutoffs), make_dotplot(annotated, cutoffs)]
    figure = alt.vconcat(*panels)

    #Global styling: no grid lines, no view border, shared x scale
    figure = figure.configure_axis(grid = False)
    figure = figure.configure_view(stroke = None)
    figure = figure.resolve_scale(x = 'shared')
    figure = figure.interactive()

    figure.display()

    #To export, save the chart (e.g. figure.save('<name>.svg')) and/or write the
    #annotated table with annotated.to_excel('<name>.xlsx', index = False)


In [None]:
# Runs the pipeline and displays the figure when this cell is executed.
main()