In [None]:
import pandas as pd
import altair as alt
from scipy import stats
import numpy as np

In [None]:
# Input files, given relative to this notebook; main() passes both to read_scores().
scores = '../Data/20250825_BARD1snvscores_filtered.xlsx' # SNV score table (Excel workbook)
sge_thresholds = '../Data/20250813_BARD1_thresholds.tsv' # SGE thresholds file (tab-separated)
# Lift Altair's row limit so the whole score table can be passed to the charts below.
alt.data_transformers.disable_max_rows()

In [None]:
def read_scores(file, threshold_file):
    """Load the SNV score table and derive the two SGE score thresholds.

    Parameters
    ----------
    file : path to the Excel workbook of scores (read with ``pd.read_excel``).
    threshold_file : path to the tab-separated thresholds file. Row 0 must
        provide the columns ``lthresh``, ``uthresh``, ``std_neut`` and
        ``#mu_neut``.

    Returns
    -------
    (DataFrame, list)
        The score table restricted to the columns ``pos``, ``Consequence``,
        ``score`` and ``functional_consequence``, and ``[lower, upper]`` where
        each bound is ``thresh * std_neut + #mu_neut``.
    """
    wanted_columns = ['pos', 'Consequence', 'score', 'functional_consequence']

    # The workbook may carry a lowercase 'consequence' header; normalise it before
    # selecting columns (the rename can go once the source column is renamed).
    score_table = pd.read_excel(file).rename(columns={'consequence': 'Consequence'})[wanted_columns]

    cutoff_table = pd.read_csv(threshold_file, sep='\t')

    def first_value(column):
        # Value stored under index label 0 of the given thresholds column.
        return cutoff_table[column][0]

    bounds = [
        first_value(column) * first_value('std_neut') + first_value('#mu_neut')
        for column in ('lthresh', 'uthresh')
    ]

    return score_table, bounds

In [None]:
def prep_data(df):
    """Label each variant with its BARD1 region and tidy the consequence names.

    Parameters
    ----------
    df : DataFrame with at least a ``pos`` column (position compared against the
        coordinate intervals below) and a string ``Consequence`` column.

    Returns
    -------
    DataFrame
        A new frame; the input is not modified. Only rows whose ``pos`` falls in
        one of the intervals are kept, the index is reset to 0..n-1, a ``Domain``
        column ('RING', 'BRCT', 'ARD' or 'IDRs') is added and ``Consequence``
        values are replaced by human-readable labels.
    """
    # Inclusive (start, end) coordinate intervals for each BARD1 region.
    # Listed in the order they are applied: if intervals ever overlapped, the
    # later region would win.
    domain_intervals = {
        'RING': [(214809412, 214809494), (214797061, 214797117), (214792297, 214792445), (214781508, 214781509)],  # Residues 26 to 122
        'BRCT': [(214745722, 214745830), (214745067, 214745159), (214730411, 214730508), (214728685, 214729008)],  # Residues 568 to 777
        'ARD': [(214780560, 214780601), (214769232, 214769312), (214767482, 214767654), (214752486, 214752555)],  # Residues 425 to 545
        'IDRs': [(214809495, 214809569), (214780602, 214781507), (214745831, 214745854)],  # Residues 1 to 25, 123 to 424 and 546 to 567
    }

    # Work on a copy so the caller's frame does not silently gain a 'Domain' column.
    df = df.copy()
    df['Domain'] = None

    for label, intervals in domain_intervals.items():
        positions = [pos for start, end in intervals for pos in range(start, end + 1)]
        df.loc[df['pos'].isin(positions), 'Domain'] = label

    # Keep only the variants that landed in one of the regions above.
    df = df.loc[df['Domain'].isin(list(domain_intervals))].reset_index(drop=True)

    # To export a single region, filter here, e.g.
    # df.loc[df['Domain'].isin(['RING'])].to_excel(<path>)

    # Rename variant effects to be more human readable. The order matters:
    # anything mentioning 'missense' is labelled first, and the final 'splic'
    # rule only catches values that none of the earlier rules renamed.
    # na=False treats a missing Consequence as "no match" instead of raising.
    df.loc[df['Consequence'].str.contains('missense', na=False), 'Consequence'] = 'Missense'
    exact_labels = {
        'synonymous_variant': 'Synonymous',
        'intron_variant': 'Intron',
        'stop_gained': 'Stop Gained',
        'stop_lost': 'Stop Lost',
    }
    for raw_label, readable in exact_labels.items():
        df.loc[df['Consequence'] == raw_label, 'Consequence'] = readable
    df.loc[df['Consequence'].str.contains('splic', na=False), 'Consequence'] = 'Splice'

    return df

In [None]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float('nan')


def percent_missense(df):
    """Print per-domain statistics on functionally abnormal missense variants.

    Only rows with ``Consequence == 'Missense'`` are considered. For each of the
    RING, ARD, BRCT and IDRs domains the share of variants whose
    ``functional_consequence`` is 'functionally_abnormal' is printed, followed by
    IDR counts, the share of all abnormal missense variants found in the three
    structured domains (RING + ARD + BRCT), and a Fisher's exact test on the
    ARD-vs-BRCT abnormal/normal contingency table.

    A percentage whose denominator is zero (a domain with no missense variants,
    or no abnormal variants at all) is reported as ``nan`` rather than raising.

    Parameters
    ----------
    df : DataFrame with ``Consequence``, ``Domain`` and
        ``functional_consequence`` columns.

    Returns
    -------
    None. Results are written to stdout.
    """
    abnormal = 'functionally_abnormal'
    normal = 'functionally_normal'

    missense = df.loc[df['Consequence'].isin(['Missense'])]

    # Per-domain variant totals and functional-class tallies.
    n_vars = {}
    calls = {}
    for domain in ('RING', 'ARD', 'BRCT', 'IDRs'):
        subset = missense.loc[missense['Domain'].isin([domain])]
        n_vars[domain] = len(subset)
        calls[domain] = subset['functional_consequence'].value_counts()

    def n_abnormal(domain):
        return calls[domain].get(abnormal, 0)

    total_sensitive = missense['functional_consequence'].value_counts().get(abnormal, 0)
    # "Domains" here means the structured ones only; IDRs are deliberately excluded.
    domain_sensitive = n_abnormal('RING') + n_abnormal('ARD') + n_abnormal('BRCT')

    report = [
        ('RING % Sensitive Missense: ', 100 * _safe_ratio(n_abnormal('RING'), n_vars['RING'])),
        ('ARD % Sensitive Missense: ', 100 * _safe_ratio(n_abnormal('ARD'), n_vars['ARD'])),
        ('BRCT % Sensitive Missense: ', 100 * _safe_ratio(n_abnormal('BRCT'), n_vars['BRCT'])),
        ('IDR % Sensitive Missense: ', 100 * _safe_ratio(n_abnormal('IDRs'), n_vars['IDRs'])),
        ('IDR Missense Vars: ', n_vars['IDRs']),
        ('IDR Abnormal: ', n_abnormal('IDRs')),
        ('IDR Indeterminate: ', calls['IDRs'].get('indeterminate', 0)),
        ('Total LOF Missense: ', total_sensitive),
        ('LOF in Domains: ', domain_sensitive),
        ('%LOF in Domains: ', _safe_ratio(domain_sensitive, total_sensitive) * 100),
    ]

    # Emit everything through a single print call, separating entries with '\n'.
    pieces = []
    for label, value in report:
        pieces += [label, str(value), '\n']
    print(*pieces[:-1])

    # 2x2 table: columns are ARD / BRCT, rows are abnormal / normal.
    contingency_table = np.array([[n_abnormal('ARD'), n_abnormal('BRCT')],
                                  [calls['ARD'].get(normal, 0), calls['BRCT'].get(normal, 0)]])

    print(contingency_table)
    odds_ratio, p_value = stats.fisher_exact(contingency_table)

    print(f"Fisher's exact test p-value: {p_value}")
    print(f"Odds ratio: {odds_ratio}")

In [None]:
def histogram(df):
    """Draw a stacked histogram of SGE scores, then the same chart faceted by domain.

    Bars are coloured by ``Consequence``. Both charts are rendered with
    ``.display()``, matching the other plotting functions in this notebook.

    Parameters
    ----------
    df : DataFrame with ``score``, ``Consequence`` and ``Domain`` columns.

    Returns
    -------
    None. The charts are displayed as a side effect.
    """
    bins = 50 #maximum number of bins

    # Legend / colour order. Labels must match the names assigned in prep_data
    # ('Stop Gained', not 'Stop') or the entry is ignored by the sort.
    consequence_order = ["Synonymous", "Missense", "Stop Gained", "Splice"]

    #Builds histogram for all variants passed in
    base = alt.Chart(df).mark_bar().encode(
            alt.X('score', axis = alt.Axis(title = 'SGE Score', labelFontSize = 16, titleFontSize = 20), bin = alt.Bin(maxbins = bins)),
            alt.Y('count()', axis = alt.Axis(title = 'Number of Variants', labelFontSize = 16, titleFontSize = 20)),
            color = alt.Color('Consequence:N', scale = alt.Scale(scheme = 'category10'), sort = consequence_order, legend = alt.Legend(titleFontSize = 16, labelFontSize = 14))
    ).properties(
        width = 800,
        height = 400,
        # Title text is left empty; only the font size is configured.
        title = alt.TitleParams(text = '', fontSize = 22)
    )

    base.display()

    #facets histogram by domain (wraps after 3 panels)
    faceted = base.facet(
        alt.Facet('Domain:N',
                  sort = ['RING', 'IDRs', 'ARD', 'BRCT'],
                  title = 'Distribution of BARD1 SGE Scores by Functional Domain'
    ),columns = 3).configure_header(
        titleFontSize = 24,
        labelFontSize = 20
    )

    faceted.display()


In [None]:
def interact_histogram(df):
    """Draw an interactive SGE-score histogram with a dropdown to pick the domain.

    The dropdown options are the distinct values of ``df['Domain']``; the chart
    is filtered to the selected domain and starts on 'RING'.

    Parameters
    ----------
    df : DataFrame with ``score``, ``Consequence`` and ``Domain`` columns.

    Returns
    -------
    None. The chart is displayed as a side effect.
    """
    #Builds domain selection dropdown menu
    domain_picker = alt.selection_point(
        fields=['Domain'],  # Column to filter by
        bind=alt.binding_select(options=df['Domain'].unique().tolist(), name='Select Domain: '), # Dropdown menu
        value= 'RING'  # Initial value for the selection
    )

    bins = 50 #maximum number of bins

    # Legend / colour order. Labels must match the names assigned in prep_data
    # ('Stop Gained', not 'Stop') or the entry is ignored by the sort.
    consequence_order = ['Synonymous', 'Missense', "Stop Gained", "Splice"]

    #Builds the histogram
    chart = alt.Chart(df).mark_bar().encode(
        alt.X('score', axis = alt.Axis(title = 'SGE Score', labelFontSize = 16, titleFontSize = 20), bin = alt.Bin(maxbins = bins)),
        alt.Y('count()', axis = alt.Axis(title = 'Number of Variants', labelFontSize = 16, titleFontSize = 20)),
        color = alt.Color('Consequence:N', scale = alt.Scale(scheme = 'category10'), sort = consequence_order, legend = alt.Legend(titleFontSize = 16, labelFontSize = 14))
        ).add_params(
            domain_picker
        ).transform_filter(domain_picker
        ).properties(
            width = 800,
            height = 400,
            # Title text is left empty; only the font size is configured.
            title = alt.TitleParams(text = '', fontSize = 22)
        ).interactive()

    #chart.save('interactive_histogram.html')
    chart.display()

In [None]:
def violinplot(df):
    """Draw one violin (mirrored density of ``score``) per domain, side by side.

    Densities are estimated per ``Domain`` over the score range [-0.4, 0.15].
    The explicit column order lists RING, ARD and BRCT only; any other domain
    present in ``df`` (e.g. 'IDRs') is not named in that order.

    Parameters
    ----------
    df : DataFrame with ``score`` and ``Domain`` columns.

    Returns
    -------
    None. The chart is displayed as a side effect.
    """
    domain_order = ['RING', 'ARD', 'BRCT'] #order for violin plot

    #Builds violin plot
    violin = (
    alt.Chart(df)
    .transform_density(
        'score',
        as_=['score', 'density'],
        extent=[-0.4, 0.15],
        groupby=['Domain']
    )
    .mark_area(orient='horizontal')
    .encode(
        y=alt.Y('score:Q',
                axis=alt.Axis(
                    title='SGE Score',
                    titleFontSize=16,
                    labelFontSize=14
                )),
        color=alt.Color('Domain:N', sort=domain_order, legend = None),
        # stack='center' mirrors the density around the column's midline.
        x=alt.X('density:Q',
                stack='center',
                impute=None,
                title=None,
                axis=alt.Axis(
                    labels=False,
                    values=[0],
                    grid=False,
                    ticks=True
                )),
        column=alt.Column('Domain:N',
                          sort=domain_order,
                          header=alt.Header(
                              titleOrient='bottom',
                              labelOrient='bottom',
                              labelPadding=0,
                              titleFontSize=16,
                              labelFontSize=20
                          ))
    )
    .properties(
        height=700,
        width=400
    )
    .configure_view(stroke=None)
    .configure_facet(spacing=0)
    .configure_axis(grid=False)
    .configure_legend(
        titleFontSize=16,
        labelFontSize=14
    )
    .interactive()
    )

    #violin.save('/Users/ivan/Desktop/BARD1_draft_figs/fig_2d.png', ppi = 500)
    # display() matches the other plotting functions in this notebook.
    violin.display()

In [None]:
def strip_plot(df, thresholds):
    """Draw a strip (tick) plot of missense SGE scores, one row per domain.

    Only rows with ``Consequence == 'Missense'`` are plotted. Two vertical
    rules mark the thresholds: red at ``thresholds[0]`` and blue at
    ``thresholds[1]``.

    Parameters
    ----------
    df : DataFrame with ``score``, ``Consequence`` and ``Domain`` columns.
    thresholds : sequence whose first two items are the lower and upper score
        thresholds (as returned by ``read_scores``).

    Returns
    -------
    None. The chart is displayed as a side effect.
    """
    missense = df.loc[df['Consequence'].isin(['Missense'])]

    lower_rule = alt.Chart(pd.DataFrame({'x': [thresholds[0]]})).mark_rule(color = 'red').encode(
        x = 'x')

    upper_rule = alt.Chart(pd.DataFrame({'x': [thresholds[1]]})).mark_rule(color = 'blue').encode(
        x = 'x')

    # Row order on the y axis. The label must be spelled 'IDRs' (as in the data
    # and the colour domain below) for the sort entry to take effect.
    row_order = ['RING', 'ARD', 'BRCT', 'IDRs']

    # Colour per domain; key order defines the colour scale's domain order.
    domain_colors = {
        'RING': '#B9DBF4',
        'IDRs': '#b1b1b1',
        'ARD': '#C8DBC8',
        'BRCT': '#F6BF93',
    }

    plot = alt.Chart(missense).mark_tick(opacity = 1).encode(
        x = alt.X('score:Q',
                  axis = alt.Axis(title = 'SGE Score (Missense Vars. Only)',
                                  titleFontSize = 24,
                                  labelFontSize = 20)
                 ),
        y = alt.Y('Domain:N',
                  sort = row_order,
                  axis = alt.Axis(title = '',
                                  labelFontSize = 24)
                 ),
        color = alt.Color('Domain:N',
                legend=None,
                sort = row_order,
                scale = alt.Scale(range = list(domain_colors.values()),
                                  domain = list(domain_colors.keys()))
                         )
        ).properties(
            width = 800,
            height = 400
        )

    plot = plot + lower_rule + upper_rule

    plot = plot.configure_axis(
        grid = False
    ).configure_view(
        stroke = None
    )

    #plot.save('/Users/ivan/Desktop/BARD1_draft_figs/fig_2c_structure_faceted_stripplot.png', ppi = 500)
    plot.display()

In [None]:
def cumulator_plot(df,thresholds):
    """Plot, per domain, the cumulative percentage of missense SNVs by SGE score.

    Only rows with ``Consequence == 'Missense'`` are used. Variants are sorted
    by score and, within each ``Domain``, every variant gets
    ``rank / domain size * 100`` as its cumulative percentage. Dashed rules and
    faint shaded rectangles are layered on at the two thresholds.

    Parameters
    ----------
    df : DataFrame with ``score``, ``Consequence`` and ``Domain`` columns.
    thresholds : sequence whose first two items are the lower and upper score
        thresholds.

    Returns
    -------
    None. The layered chart is displayed as a side effect.
    """
    lower_cut, upper_cut = thresholds[0], thresholds[1]
    side = 300  # chart width and height in pixels; also the right edge of the upper shaded band

    # Colour per domain; key order defines the colour scale's domain order.
    domain_colors = {
        'RING': '#B9DBF4',
        'IDRs': '#b1b1b1',
        'ARD': '#C8DBC8',
        'BRCT': '#F6BF93',
    }

    by_score = df.loc[df['Consequence'].isin(['Missense'])].sort_values(by="score")
    per_domain = by_score.groupby('Domain')['score']
    # Rank within the domain (1-based) divided by the domain's size, as a percentage.
    ranked = by_score.assign(
        cumulative_percentage=(per_domain.cumcount() + 1) / per_domain.transform('count') * 100.0
    )

    # Cumulative curves, one line per domain.
    curves = alt.Chart(ranked, height=side, width=side).mark_line(size=5).encode(
        x=alt.X('score:Q', title='SGE score').axis(labelFlush=False),
        y=alt.Y('cumulative_percentage:Q', title='Cumulative percentage of missense SNVs'),
        color=alt.Color(
            'Domain:N',
            title='Domain',
            sort=['RING', 'ARD', 'BRCT', 'IDRs'],
            scale=alt.Scale(range=list(domain_colors.values()), domain=list(domain_colors.keys())),
            legend=alt.Legend(symbolStrokeWidth=6, symbolSize=250),
        ),
    )

    # Dashed vertical rules at each threshold.
    dash = [8,8]
    lower_rule = alt.Chart().mark_rule(color='darkred', strokeDash=dash, strokeWidth=1).encode(
        x=alt.datum(lower_cut)
    )
    upper_rule = alt.Chart().mark_rule(color='darkgreen', strokeDash=dash, strokeWidth=1).encode(
        x=alt.datum(upper_cut)
    )

    # Faint shaded rectangles anchored at each threshold.
    faint = alt.value(0.05)
    lower_band = alt.Chart().mark_rect(color='red').encode(
        x=alt.datum(lower_cut),
        opacity=faint,
    )
    upper_band = alt.Chart().mark_rect(color='green').encode(
        x2=alt.value(side),
        x=alt.datum(upper_cut),
        opacity=faint,
    )

    layered = curves + lower_rule + upper_rule + lower_band + upper_band
    layered.display()

In [None]:
def main():
    """Run the notebook's analysis end to end.

    Loads the scores and thresholds, annotates variants with their domain,
    prints the missense statistics and renders the strip and cumulative plots.
    """
    score_table, cutoffs = read_scores(scores, sge_thresholds)
    annotated = prep_data(score_table)

    percent_missense(annotated)
    strip_plot(annotated, cutoffs)
    cumulator_plot(annotated, cutoffs)

    # Optional figures, currently switched off:
    #histogram(annotated)
    #violinplot(annotated)
    #interact_histogram(annotated)


In [None]:
# Run the analysis: loads the data, prints the missense statistics and renders the plots.
main()