In [None]:
import pandas as pd
import altair as alt
from scipy import stats
import numpy as np

In [None]:
# Lift Altair's row limit so the full score table can be embedded in a chart.
alt.data_transformers.disable_max_rows()

# Excel workbook holding the filtered BARD1 SNV scores (path relative to this notebook).
scores = '../Data/20250825_BARD1snvscores_filtered.xlsx'

In [None]:
def read_scores(file, target_value = 0.950): #reads scores from excel file
    """Load the score table and derive the two GMM classification thresholds.

    Parameters
    ----------
    file : str or path-like
        Excel workbook passed to ``pd.read_excel``. It is expected to contain
        the columns ``consequence``, ``score``, ``gmm_density_normal`` and
        ``gmm_density_abnormal``.
    target_value : float, default 0.950
        Density that defines a threshold. For each density column, the row
        whose density is closest to this value supplies the threshold score.

    Returns
    -------
    df : pandas.DataFrame
        The table with ``consequence`` renamed to ``Consequence``.
    thresholds : list
        ``[abnormal_threshold, normal_threshold]`` -- the ``score`` of the row
        closest to ``target_value`` in ``gmm_density_abnormal`` and in
        ``gmm_density_normal`` respectively.
    """
    df = pd.read_excel(file).rename(columns = {'consequence': 'Consequence'})

    def _score_nearest_target(density_column):
        # Row whose density is nearest to the target; idxmin keeps the first
        # row on ties and skips missing densities.
        distance = (df[density_column] - target_value).abs()
        return df.loc[distance.idxmin(), 'score']

    score_n = _score_nearest_target('gmm_density_normal')
    score_a = _score_nearest_target('gmm_density_abnormal')

    # Order matters to callers: abnormal threshold first, normal threshold second.
    thresholds = [score_a, score_n]

    return df, thresholds

In [None]:
def prep_data(df): #Adds annotation for which domain (or not) a particular variant falls into
    """Annotate variants with their BARD1 domain and keep missense variants only.

    Parameters
    ----------
    df : pandas.DataFrame
        Score table with a ``pos`` column (genomic coordinate) and a
        ``Consequence`` column. The frame is not modified; all work is done
        on a copy.

    Returns
    -------
    pandas.DataFrame
        Rows whose ``pos`` falls inside one of the listed domain intervals and
        whose ``Consequence`` is missense, with a new ``Domain`` column
        (``'RING'``, ``'BRCT'``, ``'ARD'`` or ``'IDRs'``) and ``Consequence``
        set to ``'Missense'``. Rows with a missing ``Consequence`` are dropped.
    """
    # Inclusive (start, end) genomic intervals for each domain label.
    domain_intervals = {
        'RING': [(214809412,214809494),(214797061, 214797117), (214792297,214792445), (214781508, 214781509)], #Residues 26 to 122
        'BRCT': [(214745722,214745830),(214745067,214745159),(214730411,214730508),(214728685,214729008)], #Residues 568 to 777
        'ARD': [(214780560,214780601),(214769232,214769312),(214767482,214767654),(214752486,214752555)], #Residues 425 to 545
        'IDRs': [(214809495,214809569), (214780602,214781507),(214745831,214745854)], #Residues 1 to 25, 123 to 424 and 546 to 567
    }

    # Work on a copy so the caller's frame does not silently gain a column.
    annotated = df.copy()
    annotated['Domain'] = None #creates empty column for domain

    #assigns domains for each datapoint
    for label, intervals in domain_intervals.items():
        positions = [pos for start, end in intervals for pos in range(start, end + 1)]
        annotated.loc[annotated['pos'].isin(positions), 'Domain'] = label

    #gets data just from domains and cleans up indices
    annotated = annotated.loc[annotated['Domain'].isin(list(domain_intervals))]
    annotated = annotated.reset_index(drop = True)

    #Renames variant consequences to be more human readable.
    #The missense match must run before the splice match so that a combined
    #missense/splice annotation is labelled 'Missense'. na=False keeps rows
    #with a missing consequence from breaking the boolean mask.
    is_missense = annotated['Consequence'].str.contains('missense', na = False)
    annotated.loc[is_missense, 'Consequence'] = 'Missense'

    exact_labels = {
        'synonymous_variant': 'Synonymous',
        'intron_variant': 'Intron',
        'stop_gained': 'Stop Gained',
        'stop_lost': 'Stop Lost',
    }
    for raw_label, readable_label in exact_labels.items():
        annotated.loc[annotated['Consequence'] == raw_label, 'Consequence'] = readable_label

    is_splice = annotated['Consequence'].str.contains('splic', na = False)
    annotated.loc[is_splice, 'Consequence'] = 'Splice'

    #Gets missense variants only
    return annotated.loc[annotated['Consequence'].isin(['Missense'])]

In [None]:
def percent_missense(df): #Quick numerical statistics to compare and contrast structured domains to disordered domains
    """Print loss-of-function (LoF) missense statistics per domain.

    Compares the structured domains (RING, ARD, BRCT) with the IDRs: the
    fraction of variants labelled ``functionally_abnormal`` in each group, the
    share of all abnormal variants that fall in structured domains, the
    structured-vs-IDR enrichment, and a Fisher's exact test on the
    abnormal / not-abnormal counts of the two groups.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Domain`` column (``'RING'``, ``'ARD'``, ``'BRCT'``,
        ``'IDRs'``) and a ``functional_consequence`` column.

    Returns
    -------
    None
        Results are printed. Ratios with a zero denominator are reported as
        ``nan`` (0/0) or ``inf`` (x/0) instead of raising.
    """
    structured_domains = ('RING', 'ARD', 'BRCT')

    def _count(frame, label):
        # Missing values never equal a label, so they are not counted.
        return int((frame['functional_consequence'] == label).sum())

    def _ratio(numerator, denominator):
        # Division that tolerates empty groups instead of raising ZeroDivisionError.
        if denominator == 0:
            return float('inf') if numerator > 0 else float('nan')
        return numerator / denominator

    by_domain = {name: df.loc[df['Domain'].isin([name])]
                 for name in structured_domains + ('IDRs',)}

    #Counts number of variants that are LoF in each domain
    sensitive = {name: _count(subset, 'functionally_abnormal')
                 for name, subset in by_domain.items()}
    total_sensitive = _count(df, 'functionally_abnormal')
    idr_sensitive = sensitive['IDRs']
    idr_int = _count(by_domain['IDRs'], 'indeterminate')

    domain_sensitive = sum(sensitive[name] for name in structured_domains) #Gets total count of LoF missense variants in domains

    #Gets total number of variants in each group
    n_vars = {name: len(subset) for name, subset in by_domain.items()}
    structured_total = sum(n_vars[name] for name in structured_domains)
    idr_vars = n_vars['IDRs']

    #Tabulates percentages
    ring_perc = _ratio(sensitive['RING'], n_vars['RING'])
    ard_perc = _ratio(sensitive['ARD'], n_vars['ARD'])
    brct_perc = _ratio(sensitive['BRCT'], n_vars['BRCT'])
    idr_perc = _ratio(idr_sensitive, idr_vars)
    structured_perc = _ratio(domain_sensitive, total_sensitive)

    #Calculates how many times more LoF missense variants are in the structured domains vs. IDR
    structured_enrichment = _ratio(_ratio(domain_sensitive, structured_total), idr_perc)

    print('RING % Sensitive Missense: ', str(100 * ring_perc), '\n',
          'ARD % Sensitive Missense: ', str(100 * ard_perc), '\n',
          'BRCT % Sensitive Missense: ', str(100 * brct_perc), '\n',
          'IDR % Sensitive Missense: ', str(100 * idr_perc), '\n',
          'IDR Missense Vars: ', str(idr_vars), '\n',
          'IDR Abnormal: ', str(idr_sensitive), '\n',
          'IDR Indeterminate: ', str(idr_int), '\n',
          'Total Structure Missense Vars: ', str(structured_total), '\n',
          'Total LOF Missense: ', str(total_sensitive), '\n',
          'LOF in Domains: ', str(domain_sensitive), '\n',
          '%LOF in Domains: ', str(structured_perc * 100), '\n',
          'Domain LOF Enrichment', str(structured_enrichment)
         )

    #Fisher exact testing. Gives odds of finding LoF missense variant in the structured domains vs. IDRs with p-value
    #Rows: structured domains, IDRs. Columns: abnormal, not abnormal.
    contingency_table = np.array([[domain_sensitive, (structured_total - domain_sensitive)],
                                  [idr_sensitive, (idr_vars - idr_sensitive)]])

    odds_ratio, p_value = stats.fisher_exact(contingency_table)

    print(f"Fisher's exact test p-value: {p_value}")
    print(f"Odds ratio: {odds_ratio}")

In [None]:
def strip_plot(df, thresholds): #Builds strip plot showing distribution of missense variants across the different domains
    """Display a strip (tick) plot of SGE scores, one row per protein domain.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a quantitative ``score`` column and a ``Domain`` column.
    thresholds : sequence
        ``thresholds[0]`` is drawn as a red vertical rule and ``thresholds[1]``
        as a blue vertical rule. ``read_scores`` returns these as
        ``[abnormal_threshold, normal_threshold]``.

    Returns
    -------
    None
        The layered chart is rendered with ``plot.display()``.
    """
    # Vertical reference lines at the two score thresholds.
    nf_line = alt.Chart(pd.DataFrame({'x': [thresholds[0]]})).mark_rule(color = 'red').encode(
        x = 'x')

    func_line = alt.Chart(pd.DataFrame({'x': [thresholds[1]]})).mark_rule(color = 'blue').encode(
        x = 'x')

    # Row order on the y axis. Labels must match the values of the Domain
    # column exactly (case-sensitive), otherwise the entry is ignored by the sort.
    domain_order = ['RING', 'ARD', 'BRCT', 'IDRs']

    plot = alt.Chart(df).mark_tick(opacity = 1, color = 'gray').encode(
        x = alt.X('score:Q',
                  axis = alt.Axis(title = 'SGE Score (Missense Vars. Only)',
                                  titleFontSize = 20,
                                  labelFontSize = 16,
                                  values = [-0.8, -0.6, -0.4, -0.2, 0, 0.2]),
                  scale = alt.Scale(domain = [-0.8, 0.3])
                 ),
        y = alt.Y('Domain:N',
                  sort = domain_order,
                  axis = alt.Axis(title = '',
                                  labelFontSize = 20)
                 )
        ).properties(
            width = 800,
            height = 400
        )

    plot = plot + nf_line + func_line

    plot = plot.configure_axis(
        grid = False
    ).configure_view(
        stroke = None
    )

    # To export the figure, call plot.save(<output path>, ppi = 500) before displaying.
    plot.display()

In [None]:
def cumulator_plot(df,thresholds): #Cumulator plot showing cumulative % of SNVs vs. SGE score
    """Display, per domain, the cumulative percentage of variants against SGE score.

    ``df`` needs ``score`` and ``Domain`` columns. ``thresholds[0]`` and
    ``thresholds[1]`` are drawn as dashed dark-red and dark-green rules, each
    with a faint shaded rectangle. The input frame is left untouched because
    sorting produces a new frame that receives the extra column.
    """
    lower_cut = thresholds[0]
    upper_cut = thresholds[1]

    # Rank every variant within its domain by ascending score, then express
    # the rank as a percentage of that domain's variant count.
    ranked = df.sort_values(by="score")
    scores_by_domain = ranked.groupby('Domain')['score']
    rank_in_domain = scores_by_domain.cumcount()  + 1
    domain_size = scores_by_domain.transform('count')
    ranked["cumulative_percentage"] = rank_in_domain / domain_size * 100.0

    # Colour assigned to each domain label (insertion order defines the scale).
    domain_colours = {
        'RING': '#B9DBF4',
        'IDRs': '#b1b1b1',
        'ARD': '#9abd9a',
        'BRCT': '#F6BF93',
    }

    colour_encoding = alt.Color(
        'Domain:N',
        title = 'Domain',
        sort = ['RING', 'ARD', 'BRCT', 'IDRs'],
        scale = alt.Scale(range = list(domain_colours.values()),
                          domain = list(domain_colours.keys())),
        legend = alt.Legend(symbolStrokeWidth = 6, symbolSize = 250),
    )

    # One line per domain.
    curves = alt.Chart(ranked, height=300, width=300).mark_line(size=5).encode(
        x=alt.X('score:Q', title='SGE score').axis(labelFlush=False),
        y=alt.Y('cumulative_percentage:Q', title='Cumulative percentage of missense SNVs'),
        color = colour_encoding,
    )

    def _dashed_rule(position, colour):
        # Thin dashed vertical line at a fixed score.
        return alt.Chart().mark_rule(color=colour, strokeDash=[8,8], strokeWidth=1).encode(
            x=alt.datum(position)
        )

    lower_rule = _dashed_rule(lower_cut, 'darkred')
    upper_rule = _dashed_rule(upper_cut, 'darkgreen')

    # Faint shading anchored at each threshold.
    lower_band = alt.Chart().mark_rect(color='red').encode(
        x=alt.datum(lower_cut),
        opacity=alt.value(0.05),
    )
    upper_band = alt.Chart().mark_rect(color='green').encode(
        x=alt.datum(upper_cut),
        x2=alt.value(300),  # pixel position; equals the chart width set above
        opacity=alt.value(0.05),
    )

    layered = curves + lower_rule + upper_rule + lower_band + upper_band
    layered.display()

In [None]:
def main():
    """Run the analysis end to end: load scores, annotate, report, and plot."""
    score_table, gmm_thresholds = read_scores(scores)
    missense_by_domain = prep_data(score_table)

    # Printed statistics first, then the two figures.
    percent_missense(missense_by_domain)
    strip_plot(missense_by_domain, gmm_thresholds)
    cumulator_plot(missense_by_domain, gmm_thresholds)


In [None]:
# Run the whole pipeline: read scores, annotate domains, print statistics, draw both plots.
main()