In [None]:
import pandas as pd
import altair as alt
import numpy as np
from natsort import natsorted
import re

In [None]:
# Lift Altair's default row limit so the full variant table can be embedded in a chart.
alt.data_transformers.disable_max_rows()

# Excel workbook with the RNA-classified variants; read_output() reads its
# 'data' and 'thresholds' sheets.
rna_output = '/Users/ivan/Desktop/test_excel_outputs/20250811_BARD1_202505_data_RNAclassified_noOverlapCollapse_wholeGene.xlsx'

# Tab-separated SGE thresholds file.
sge_threshold_file = '../Data/20250813_BARD1_thresholds.tsv'

In [None]:
def read_output(file, threshold_file):
    """Load the RNA-classified variant table together with the RNA and SGE thresholds.

    Parameters
    ----------
    file : str or path-like
        Excel workbook with a 'data' sheet (one row per variant) and a
        'thresholds' sheet whose first row holds the 'min' and 'max'
        RNA/DNA cut-offs.
    threshold_file : str or path-like
        Tab-separated file whose first row provides '#mu_neut', 'std_neut',
        'lthresh' and 'uthresh'.

    Returns
    -------
    abnormal_df : pandas.DataFrame
        Rows whose 'RNA_classification' is 'high' or 'low'.
    normal_df : pandas.DataFrame
        Rows whose 'RNA_classification' is 'normal'.
    df : pandas.DataFrame
        All retained rows, with 'Consequence' relabelled and the derived
        'AApos' (int) and 'exon' columns added.
    thresholds : tuple
        (min, max) read from the 'thresholds' sheet.
    sge_thresholds : list
        [lower, upper], each computed as thresh * std_neut + mu_neut.

    Notes
    -----
    Rows whose 'AAsub' is missing, or whose middle part is '-', carry no
    amino-acid position and are dropped.
    """
    df = pd.read_excel(file, sheet_name = 'data')

    # Relabelling rules, applied strictly in this order: later rules see the
    # labels written by earlier ones, exactly as a chain of assignments would.
    consequence_rules = [
        ('contains', 'missense', 'Missense'),
        ('equals', 'synonymous_variant', 'Synonymous'),
        ('equals', 'intron_variant', 'Intron'),
        ('equals', 'stop_gained', 'Stop Gained'),
        ('equals', 'stop_lost', 'Stop Lost'),
        ('contains', 'site', 'Canonical Splice'),
        ('contains', 'ing_var', 'Splice Region'),
        ('contains', 'UTR', 'UTR Variant'),
        ('equals', 'start_lost', 'Start Lost'),
    ]
    for how, pattern, label in consequence_rules:
        if how == 'contains':
            # na = False keeps the mask strictly boolean when a consequence is
            # missing; a mask containing NaN makes .loc raise.
            hit = df['Consequence'].str.contains(pattern, na = False)
        else:
            hit = df['Consequence'] == pattern
        df.loc[hit, 'Consequence'] = label

    rna_threshold_df = pd.read_excel(file, sheet_name = 'thresholds')
    thresholds = (rna_threshold_df['min'].iloc[0], rna_threshold_df['max'].iloc[0])

    # Strip the first and last character of AAsub to obtain the position.
    df['AApos'] = df['AAsub'].str[1:-1]
    has_position = df['AApos'].notna() & ~df['AApos'].isin(['-'])
    # .copy() so the column assignments below write to an independent frame
    # instead of a slice of the original (avoids SettingWithCopyWarning).
    df = df.loc[has_position].copy()
    df['AApos'] = df['AApos'].astype(int)
    # expand = False returns a Series rather than a one-column DataFrame.
    df['exon'] = df['target'].str.extract(r'(BARD1_X\d+)', expand = False)

    abnormal_df = df[df['RNA_classification'].isin(['high', 'low'])]
    normal_df = df[df['RNA_classification'].isin(['normal'])]

    sge_threshold_df = pd.read_csv(threshold_file, sep = '\t')
    mu_neut = sge_threshold_df['#mu_neut'].iloc[0]
    std_neut = sge_threshold_df['std_neut'].iloc[0]

    lwr = sge_threshold_df['lthresh'].iloc[0] * std_neut + mu_neut
    uppr = sge_threshold_df['uthresh'].iloc[0] * std_neut + mu_neut

    sge_thresholds = [lwr, uppr]

    return abnormal_df, normal_df, df, thresholds, sge_thresholds

In [None]:
def all_rna_performance(df, thresholds, sge_threshold):
    """Draw the RNA/DNA-by-position scatter and the SGE-score vs RNA-score scatter.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns referenced below: 'AApos', 'RNA/DNA',
        'snv_score', 'Consequence', 'functional_consequence',
        'RNA_classification', 'target' and 'AAsub'.
    thresholds : sequence of two numbers
        (min, max) RNA/DNA cut-offs, drawn as vertical red / blue rules.
    sge_threshold : sequence of two numbers
        (lower, upper) SGE score cut-offs, drawn as horizontal red / blue rules.

    Returns
    -------
    The layered SGE-vs-RNA chart. Both charts are displayed as a side effect.
    """
    min_threshold, max_threshold = thresholds

    palette = [
        '#006616', # dark green
        '#81B4C7', # dusty blue
        '#ffcd3a', # yellow
        '#93C47D', # light green
        '#888888', # med gray
        '#000000', # black
        '#CFCFCF', # light gray
    ]

    # Colour domain; palette[i] is the colour of variant_types[i].
    variant_types = [
        'Synonymous',
        'Missense',
        'Stop Gained',
        'UTR Variant',
        'Stop Lost',
        'Start Lost',
        'Splice Region',
    ]

    def threshold_rule(field, value, color, channel):
        # A single full-span rule at `value` on the given positional channel.
        return alt.Chart(pd.DataFrame({field: [value]})).mark_rule(color = color).encode(
            **{channel: field}
        )

    def hover_fields():
        return [alt.Tooltip('target', title = 'SGE Region: '),
                alt.Tooltip('AAsub', title = 'Amino Acid Substitution: ')
               ]

    plot = alt.Chart(df).mark_point().encode(
        x = 'AApos',
        y = alt.Y('RNA/DNA',
                  scale = alt.Scale(domain = [0, 3])
                 ),
        color = alt.Color('functional_consequence',
                          sort = ['functionally_normal', 'indeterminate', 'functionally_abnormal']
                         ),
        shape = 'RNA_classification',
        tooltip = hover_fields()
    ).properties(
        width = 1200,
        height = 400,
        title = 'Distribution of RNA/DNA Across BARD1'
    ).interactive()

    # Not made interactive here: pan/zoom is attached once, to the layered
    # chart below, so only one scale-bound selection exists on the view.
    scatter = alt.Chart(df).mark_point().encode(
        x = alt.X('RNA/DNA',
                  title = 'RNA Score',
                  axis = alt.Axis(labelFontSize = 16,
                                  titleFontSize = 20,
                                  values = [0, 0.2, 0.4, 0.6, 0.8, 1, 1.2, 1.4, 1.6, 1.8, 2, 2.2, 2.4, 2.6]
                                 ),
                  scale = alt.Scale(domain = [-0.05, 2.6])
                 ),
        y = alt.Y('snv_score',
                  title = 'SGE Score',
                  axis = alt.Axis(labelFontSize = 16,
                                  titleFontSize = 20,
                                  values = [-0.6, -0.5, -0.4, -0.3, -0.2, -0.1, 0, 0.1, 0.2, 0.3]
                                 ),
                  scale = alt.Scale(domain = [-0.6, 0.3])
                 ),
        color = alt.Color('Consequence',
                          scale = alt.Scale(range = palette,
                                            domain = variant_types
                                           ),
                          legend = alt.Legend(titleFontSize = 16,
                                              labelFontSize = 14
                                             )
                         ),
        tooltip = hover_fields()
    ).properties(
        width = 600,
        height = 400,
        title = alt.TitleParams(text = 'SGE Score vs. RNA Score',
                                fontSize = 22
                               )
    )

    # Red marks the lower cut-off and blue the upper one, on both axes.
    nf_line = threshold_rule('snv_score', sge_threshold[0], 'red', 'y')
    func_line = threshold_rule('snv_score', sge_threshold[1], 'blue', 'y')
    rna_func_line = threshold_rule('RNA/DNA', max_threshold, 'blue', 'x')
    rna_nf_line = threshold_rule('RNA/DNA', min_threshold, 'red', 'x')

    chart = (scatter + nf_line + func_line + rna_func_line + rna_nf_line).configure_axis(
        grid = False
    ).configure_view(
        stroke = None
    ).interactive()

    plot.display()
    chart.display()

    return chart

In [None]:
def _facet_by_exon(layered_chart, exons):
    """Facet a chart by 'exon' (4 columns, independent axes) and make it interactive."""
    return layered_chart.facet(
        facet = alt.Facet('exon:N',
                          sort = exons
                         ),
        columns = 4
    ).resolve_scale(
        x = 'independent',
        y = 'independent'
    ).resolve_axis(
        x = 'independent',
        y = 'independent'
    ).interactive()


def stem_plot(df, thresholds):
    """Per-exon stem plot of RNA/DNA against CDS position.

    Every variant is drawn as a point. Variants that are both
    'functionally_abnormal' and RNA-classified 'low' additionally get a stem
    (rule) ending at their RNA/DNA value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide 'RNA/DNA', 'CDSpos', 'exon', 'Consequence',
        'functional_consequence', 'RNA_classification', 'target', 'AAsub'
        and 'snv_score'. The caller's frame is not modified.
    thresholds : sequence of two numbers
        (min, max) RNA/DNA cut-offs; only the minimum is used here. When it
        is non-zero, stems start at the minimum and a red rule marks it.

    Returns
    -------
    The faceted chart, which is also displayed as a side effect.
    """
    # Only rows with RNA/DNA below 10 are plotted; copy so the columns added
    # below never touch the caller's frame.
    df = df.loc[df['RNA/DNA'] < 10].copy()

    min_threshold, _max_threshold = thresholds

    # Computed on the raw labels, before they are prettified below.
    has_stem = (df['functional_consequence'] == 'functionally_abnormal') & (df['RNA_classification'] == 'low')
    # NaN everywhere except on the rows that get a stem.
    df['rule_value'] = df['RNA/DNA'].where(has_stem)

    display_labels = {
        'functionally_abnormal': 'Functionally Abnormal',
        'functionally_normal': 'Functionally Normal',
        'indeterminate': 'Indeterminate',
    }
    for raw_label, display_label in display_labels.items():
        df.loc[df['functional_consequence'] == raw_label, 'functional_consequence'] = display_label

    # Must match the display labels exactly, otherwise the sort is ignored
    # for the mismatching class.
    consequence_sort = ['Functionally Normal', 'Functionally Abnormal', 'Indeterminate']

    # Natural order (X2 before X10); rows without an exon label are skipped.
    exons = natsorted(df['exon'].dropna().unique().tolist())

    palette = [
        '#006616', # dark green
        '#81B4C7', # dusty blue
        '#ffcd3a', # yellow
        '#93C47D', # light green
        '#888888', # med gray
        '#000000', # black
        '#CFCFCF', # light gray
    ]

    # Colour domain; palette[i] is the colour of variant_types[i].
    variant_types = [
        'Synonymous',
        'Missense',
        'Stop Gained',
        'UTR Variant',
        'Stop Lost',
        'Start Lost',
        'Splice Region',
    ]

    tooltips = [alt.Tooltip('target', title = 'SGE Target: '),
                alt.Tooltip('AAsub', title = 'Amino Acid Sub: '),
                alt.Tooltip('Consequence', title = 'Consequence: '),
                alt.Tooltip('snv_score', title = 'SGE Score: ')
               ]

    if min_threshold == 0:
        base = alt.Chart(df)
        base_dot = base.mark_point(filled = True, size = 50).encode(
            x = alt.X('CDSpos:Q',
                      scale = alt.Scale(zero = False,
                                        padding = 5
                                       )
                     ),
            y = alt.Y('RNA/DNA:Q',
                      title = 'RNA/DNA',
                      scale = alt.Scale(domain = [-2.1, 3],
                                        padding = 5)
                     ),
            color = alt.Color('Consequence',
                              scale = alt.Scale(
                                  range = palette,
                                  domain = variant_types
                              )
                             ),
            shape = alt.Shape('functional_consequence',
                              sort = consequence_sort
                             ),
            tooltip = tooltips
        ).properties(
            width = 800,
            height = 400
        )

        rule = base.mark_rule().encode(
            x = alt.X('CDSpos:Q',
                      scale = alt.Scale(zero = False,
                                        padding = 5
                                       )
                     ),
            y = alt.Y('rule_value:Q',
                      scale = alt.Scale(padding = 5)
                     )
        )

        chart = _facet_by_exon(rule + base_dot, exons)

    else:
        # Constant column so the stems can start at the threshold.
        df['min_threshold'] = min_threshold
        base = alt.Chart(df)
        base_dot = base.mark_point(filled = True, size = 75).encode(
            x = alt.X('CDSpos:Q',
                      title = 'CDS Position',
                      axis = alt.Axis(labelFontSize = 16,
                                      titleFontSize = 20
                                     ),
                      scale = alt.Scale(zero = False,
                                        padding = 5
                                       )
                     ),
            y = alt.Y('RNA/DNA:Q',
                      title = 'RNA/DNA',
                      axis = alt.Axis(labelFontSize = 16,
                                      titleFontSize = 20,
                                      values = [0, 0.2, 0.4, 0.6, 0.8, 1, 1.2, 1.4, 1.6, 1.8, 2, 2.2, 2.4, 2.6, 2.8]
                                     ),
                      scale = alt.Scale(domain = [0, 2.8],
                                        padding = 5)
                     ),
            color = alt.Color('Consequence',
                              scale = alt.Scale(
                                  range = palette,
                                  domain = variant_types
                              ),
                              legend = alt.Legend(
                                  titleFontSize = 16,
                                  labelFontSize = 14
                              )
                             ),
            shape = alt.Shape('functional_consequence',
                              sort = consequence_sort,
                              legend = alt.Legend(
                                  titleLimit = 500,
                                  title = 'Functional Consequence',
                                  titleFontSize = 16,
                                  labelFontSize = 14
                              )
                             ),
            tooltip = tooltips
        ).properties(
            width = 800,
            height = 400
        )

        # Stem from the threshold (y) to the variant's RNA/DNA value (y2).
        rule = base.mark_rule().encode(
            x = alt.X('CDSpos:Q',
                      scale = alt.Scale(zero = False, padding = 5)
                     ),
            y = alt.Y('min_threshold:Q'),
            y2 = alt.Y2('rule_value:Q')
        )

        rna_nf_line = alt.Chart(pd.DataFrame({'y': [min_threshold]})).mark_rule(color = 'red').encode(
            y = 'y')

        chart = _facet_by_exon(rule + base_dot + rna_nf_line, exons)

        chart = chart.configure_axis(
            grid = False
        ).configure_view(
            stroke = None
        )

    chart.display()

    return chart

In [None]:
def nmd_across_gene(df):
    """Display RNA/DNA against amino-acid position for 'Stop Gained' variants only.

    Points are coloured by 'functional_consequence' and shaped by
    'RNA_classification'. Nothing is returned; the chart is only displayed.
    """
    stop_gained = df.loc[df['Consequence'].isin(['Stop Gained'])]

    # NOTE(review): only two functional classes are given a colour here —
    # confirm whether 'indeterminate' rows can occur in this subset.
    class_colors = alt.Scale(
        domain = ['functionally_normal', 'functionally_abnormal'],
        range = ['#1f77b4', '#d62728'],
    )

    hover_fields = [
        alt.Tooltip('target', title = 'SGE Region: '),
        alt.Tooltip('AApos', title = 'Amino Acid Position: '),
    ]

    scatter = (
        alt.Chart(stop_gained)
        .mark_point()
        .encode(
            x = 'AApos',
            y = 'RNA/DNA',
            color = alt.Color('functional_consequence', scale = class_colors),
            shape = 'RNA_classification',
            tooltip = hover_fields,
        )
        .properties(width = 800, height = 400, title = 'NMD Across BARD1')
        .interactive()
    )

    scatter.display()
    

In [None]:
def missense_rna_performance(df):
    """Display RNA/DNA against amino-acid position for 'Missense' variants only.

    Points are coloured by 'functional_consequence' and shaped by
    'RNA_classification'. Nothing is returned; the chart is only displayed.
    """
    df = df.loc[df['Consequence'].isin(['Missense'])]

    plot = alt.Chart(df).mark_point().encode(
        x = 'AApos',
        y = 'RNA/DNA',
        color = alt.Color('functional_consequence',
                          sort = ['functionally_normal', 'indeterminate', 'functionally_abnormal']
                         ),
        shape = 'RNA_classification',
        tooltip = [alt.Tooltip('target', title = 'SGE Region: '),
                   # Label spelling corrected (was "Substituion").
                   alt.Tooltip('AAsub', title = 'Amino Acid Substitution: ')
                  ]
    ).properties(
        width = 800,
        height = 400,
        title = 'RNA Performance of Missense Variants'
    ).interactive()

    plot.display()

In [None]:
def synonymous_rna_performance(df):
    """Display RNA/DNA against amino-acid position for 'Synonymous' variants only.

    Points are coloured by 'functional_consequence' and shaped by
    'RNA_classification'; the tooltip also shows the SGE score. Nothing is
    returned; the chart is only displayed.
    """
    synonymous = df.loc[df['Consequence'].isin(['Synonymous'])]

    # NOTE(review): only two functional classes are given a colour here —
    # confirm whether 'indeterminate' rows can occur in this subset.
    class_colors = alt.Scale(
        domain = ['functionally_normal', 'functionally_abnormal'],
        range = ['#1f77b4', '#d62728'],
    )

    hover_fields = [
        alt.Tooltip('target', title = 'SGE Region: '),
        alt.Tooltip('AApos', title = 'Amino Acid Position: '),
        alt.Tooltip('snv_score', title = 'SGE Score: '),
    ]

    scatter = (
        alt.Chart(synonymous)
        .mark_point()
        .encode(
            x = 'AApos',
            y = 'RNA/DNA',
            color = alt.Color('functional_consequence', scale = class_colors),
            shape = 'RNA_classification',
            tooltip = hover_fields,
        )
        .properties(
            width = 800,
            height = 400,
            title = 'RNA Performance of Synonymous Variants',
        )
        .interactive()
    )

    scatter.display()

In [None]:
def main():
    """Load the RNA-classified data once, then render every figure."""
    loaded = read_output(rna_output, sge_threshold_file)
    rna_abnormal, _rna_normal, all_variants, rna_cutoffs, sge_cutoffs = loaded

    # Figures built from every retained variant.
    sge_vs_rna = all_rna_performance(all_variants, rna_cutoffs, sge_cutoffs)
    rna_stem_plot = stem_plot(all_variants, rna_cutoffs)
    nmd_across_gene(all_variants)

    # Figures restricted to variants whose RNA classification is high or low.
    missense_rna_performance(rna_abnormal)
    synonymous_rna_performance(rna_abnormal)

    # Uncomment to export the two returned charts as PNG files.
    #sge_vs_rna.save('/Users/ivan/Desktop/BARD1_draft_figs/fig_3a.png', ppi = 500)
    #rna_stem_plot.save('/Users/ivan/Desktop/BARD1_draft_figs/fig_3b.png', ppi = 500)

In [None]:
# Run the whole pipeline: load the data, then display every figure.
main()