In [None]:
import pandas as pd
import altair as alt
import numpy as np
from natsort import natsorted
import re

In [None]:
# Let Altair embed data sets of any size instead of enforcing its row limit.
alt.data_transformers.disable_max_rows()

# Excel workbook that main() hands to read_output().
# NOTE: this is an absolute, machine-specific path -- point it at your own copy.
rna_output = '/Users/ivan/Desktop/test_excel_outputs/20250710_BARD1_202505_data_RNAclassified_beta_collapsed.xlsx'

In [None]:
def read_output(file):
    """Load the RNA-classified output workbook and split it by RNA class.

    Two columns are derived after reading the sheet:

    * ``AApos`` -- the characters of ``AAsub`` between its first and last
      character, converted to ``int``.
    * ``exon``  -- the ``BARD1_X<digits>`` prefix extracted from ``target``.

    Rows whose ``AAsub`` cell is empty, or whose derived position is the
    placeholder ``'-'``, are dropped before the conversion.

    Parameters
    ----------
    file : str or path-like
        Excel file handed to ``pandas.read_excel``.

    Returns
    -------
    tuple of (DataFrame, DataFrame, DataFrame)
        ``(abnormal_df, normal_df, df)`` -- rows whose ``RNA_classification``
        is 'high' or 'low', rows where it is 'normal', and every retained row.

    Raises
    ------
    ValueError
        If a retained ``AAsub`` value has a non-numeric middle section.
    """
    df = pd.read_excel(file)

    # ``.str`` slicing yields NaN for empty cells instead of raising the way
    # a plain ``x[1:-1]`` does on a float/None value.
    aa_pos = df['AAsub'].str[1:-1]
    keep = aa_pos.notna() & ~aa_pos.isin(['-'])

    # Copy so the column assignments below write to an independent frame
    # rather than a slice of the freshly read sheet.
    df = df.loc[keep].copy()
    df['AApos'] = aa_pos.loc[keep].astype(int)
    df['exon'] = df['target'].str.extract(r'(BARD1_X\d+)', expand = False)

    rna_class = df['RNA_classification']
    abnormal_df = df[rna_class.isin(['high', 'low'])]
    normal_df = df[rna_class.isin(['normal'])]

    return abnormal_df, normal_df, df

In [None]:
def all_rna_performance(df):
    """Display two overview charts for every variant in ``df``.

    1. ``RNA/DNA`` against ``AApos``, coloured by ``functional_consequence``
       and shaped by ``RNA_classification``.
    2. ``snv_score`` against ``RNA/DNA`` (x domain fixed to [-1.5, 2.5]),
       coloured by ``Consequence``, overlaid with horizontal rules at
       y = -0.089 (red) and y = -0.077 (blue) and vertical red rules at
       x = 0 and x = 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``AApos``, ``RNA/DNA``, ``snv_score``,
        ``functional_consequence``, ``RNA_classification``, ``Consequence``,
        ``target`` and ``AAsub``.

    Returns
    -------
    None
        Both charts are rendered through ``.display()``.
    """

    def variant_tooltip():
        # Build fresh Tooltip objects per chart so the two specs share no state.
        return [alt.Tooltip('target', title = 'SGE Region: '),
                alt.Tooltip('AAsub', title = 'Amino Acid Substitution: ')
               ]

    plot = alt.Chart(df).mark_point().encode(
        x = 'AApos',
        y = 'RNA/DNA',
        color = alt.Color('functional_consequence',
                          sort = ['functionally_normal', 'indeterminate', 'functionally_abnormal']
                         ),
        shape = 'RNA_classification',
        tooltip = variant_tooltip()
    ).properties(
        width = 1200,
        height = 400,
        title = 'Distribution of RNA/DNA Across BARD1'
    ).interactive()

    chart = alt.Chart(df).mark_point().encode(
        x = alt.X('RNA/DNA',
                  scale = alt.Scale(domain = [-1.5, 2.5])
                 ),
        y = alt.Y('snv_score'),
        color = 'Consequence',
        tooltip = variant_tooltip()
    ).properties(
        width = 600,
        height = 400,
        title = 'SGE Score vs. RNA Score'
    ).interactive()

    # Horizontal reference lines on the snv_score axis.
    # NOTE(review): presumably the SGE score cut-offs for the non-functional
    # and functional classes -- confirm the values against the scoring run.
    nf_line = alt.Chart(pd.DataFrame({'y': [-0.089]})).mark_rule(color = 'red').encode(
        y = 'y')
    func_line = alt.Chart(pd.DataFrame({'y': [-0.077]})).mark_rule(color = 'blue').encode(
        y = 'y')

    # Vertical reference lines on the RNA/DNA axis (both drawn in red).
    rna_func_line = alt.Chart(pd.DataFrame({'x': [1]})).mark_rule(color = 'red').encode(
        x = 'x')
    rna_nf_line = alt.Chart(pd.DataFrame({'x': [0]})).mark_rule(color = 'red').encode(
        x = 'x')

    chart = chart + nf_line + func_line + rna_func_line + rna_nf_line

    plot.display()
    chart.display()

In [None]:
def stem_plot(df):
    """Display per-exon charts of ``RNA/DNA`` against ``CDSpos``.

    Every variant is drawn as a filled point coloured by ``Consequence`` and
    shaped by ``functional_consequence``. Variants that are both
    'functionally_abnormal' and RNA-classified 'low' additionally get a rule
    mark at their ``CDSpos`` / ``RNA/DNA`` value. The layered chart is
    faceted by ``exon`` (natural sort order, four columns) with independent
    x/y scales and axes per facet.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``RNA/DNA``, ``CDSpos``, ``exon``, ``Consequence``,
        ``functional_consequence``, ``RNA_classification``, ``target``,
        ``AAsub`` and ``snv_score``. The caller's frame is not modified.

    Returns
    -------
    None
        The faceted chart is rendered through ``.display()``.
    """

    # Only rows with RNA/DNA below 10 are plotted; copy so the helper column
    # added below never touches the caller's frame.
    df = df.loc[df['RNA/DNA'] < 10].copy()

    # The rule layer reads 'rule_value': the RNA/DNA value for low-RNA,
    # functionally abnormal variants and NaN for everything else.
    is_low_abnormal = (
        (df['functional_consequence'] == 'functionally_abnormal')
        & (df['RNA_classification'] == 'low')
    )
    df['rule_value'] = df['RNA/DNA'].where(is_low_abnormal)

    exons = natsorted(set(df['exon'].tolist()))

    # Consequence -> colour, kept as one mapping so the scale's domain and
    # range cannot drift out of step.
    consequence_colors = {
        'synonymous_variant': '#006616',   # dark green
        'missense_variant': '#81B4C7',     # dusty blue
        'stop_gained': '#ffcd3a',          # yellow
        'intron_variant': '#6AA84F',       # med green
        'UTR_variant': '#93C47D',          # light green
        'stop_lost': '#888888',            # med gray
        'start_lost': '#000000',           # black
        'splice_site_variant': '#1170AA',  # darker blue
        'splicing_variant': '#CFCFCF',     # light gray
    }
    variant_types = list(consequence_colors.keys())
    palette = list(consequence_colors.values())

    base = alt.Chart(df)
    base_dot = base.mark_point(filled = True, size = 50).encode(
        x = alt.X('CDSpos:Q',
                  scale = alt.Scale(zero = False,
                                    padding = 5
                                   )
                 ),
        y = alt.Y('RNA/DNA:Q',
                  title = 'RNA/DNA',
                  scale = alt.Scale(domain = [-2.1, 3],
                                    padding = 5)
                 ),
        color = alt.Color('Consequence',
                          scale = alt.Scale(
                              range = palette,
                              domain = variant_types
                          )
                         ),
        shape = alt.Shape('functional_consequence',
                          sort = ['functionally_normal',
                                  'functionally_abnormal',
                                  'indeterminate'
                                 ]
                         ),
        tooltip = [alt.Tooltip('target', title = 'SGE Target: '),
                   alt.Tooltip('AAsub', title = 'Amino Acid Sub: '),
                   alt.Tooltip('Consequence', title = 'Consequence: '),
                   alt.Tooltip('snv_score', title = 'SGE Score: ')
                  ]
    ).properties(
        width = 800,
        height = 400
    )

    rule = base.mark_rule().encode(
        x = alt.X('CDSpos:Q',
                  scale = alt.Scale(zero = False,
                                    padding = 5
                                   )
                 ),
        y = alt.Y('rule_value:Q',
                  scale = alt.Scale(padding = 5)
                 )
    )

    # Rules go first so the points are drawn on top of them.
    chart = (rule + base_dot)

    chart = chart.facet(facet = alt.Facet('exon:N',
                                         sort = exons
                                         ),
                        columns = 4
                       ).resolve_scale(
        x = 'independent',
        y = 'independent'
    ).resolve_axis(
        x = 'independent',
        y = 'independent'
    ).interactive()

    chart.display()

In [None]:
def nmd_across_gene(df):
    """Display ``RNA/DNA`` against ``AApos`` for the 'stop_gained' variants.

    Points are coloured by ``functional_consequence`` and shaped by
    ``RNA_classification``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``Consequence``, ``AApos``, ``RNA/DNA``,
        ``functional_consequence``, ``RNA_classification`` and ``target``.

    Returns
    -------
    None
        The chart is rendered through ``.display()``.
    """

    df = df.loc[df['Consequence'].isin(['stop_gained'])]

    # List every functional class explicitly: a value missing from an explicit
    # ordinal domain gets no colour from the scale, so 'indeterminate'
    # variants would otherwise lose their colour encoding.
    color_scale = alt.Scale(
        domain = ['functionally_normal', 'indeterminate', 'functionally_abnormal'],
        range = ['#1f77b4', '#7f7f7f', '#d62728']
    )

    plot = alt.Chart(df).mark_point().encode(
        x = 'AApos',
        y = 'RNA/DNA',
        color = alt.Color('functional_consequence',
                          scale = color_scale,
                         ),
        shape = 'RNA_classification',
        tooltip = [alt.Tooltip('target', title = 'SGE Region: '),
                   alt.Tooltip('AApos', title = 'Amino Acid Position: ')
                  ]
    ).properties(
        width = 800,
        height = 400,
        title = 'NMD Across BARD1'
    ).interactive()

    plot.display()
    

In [None]:
def missense_rna_performance(df):
    """Display ``RNA/DNA`` against ``AApos`` for the 'missense_variant' rows.

    Points are coloured by ``functional_consequence`` and shaped by
    ``RNA_classification``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``Consequence``, ``AApos``, ``RNA/DNA``,
        ``functional_consequence``, ``RNA_classification``, ``target`` and
        ``AAsub``.

    Returns
    -------
    None
        The chart is rendered through ``.display()``.
    """

    df = df.loc[df['Consequence'].isin(['missense_variant'])]

    plot = alt.Chart(df).mark_point().encode(
        x = 'AApos',
        y = 'RNA/DNA',
        color = alt.Color('functional_consequence',
                          sort = ['functionally_normal', 'indeterminate', 'functionally_abnormal']
                         ),
        shape = 'RNA_classification',
        tooltip = [alt.Tooltip('target', title = 'SGE Region: '),
                   alt.Tooltip('AAsub', title = 'Amino Acid Substitution: ')
                  ]
    ).properties(
        width = 800,
        height = 400,
        title = 'RNA Performance of Missense Variants'
    ).interactive()

    plot.display()

In [None]:
def synonymous_rna_performance(df):
    """Display ``RNA/DNA`` against ``AApos`` for the 'synonymous_variant' rows.

    Points are coloured by ``functional_consequence`` and shaped by
    ``RNA_classification``; the tooltip also shows ``snv_score``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``Consequence``, ``AApos``, ``RNA/DNA``,
        ``functional_consequence``, ``RNA_classification``, ``target`` and
        ``snv_score``.

    Returns
    -------
    None
        The chart is rendered through ``.display()``.
    """

    df = df.loc[df['Consequence'].isin(['synonymous_variant'])]

    # List every functional class explicitly: a value missing from an explicit
    # ordinal domain gets no colour from the scale, so 'indeterminate'
    # variants would otherwise lose their colour encoding.
    color_scale = alt.Scale(
        domain = ['functionally_normal', 'indeterminate', 'functionally_abnormal'],
        range = ['#1f77b4', '#7f7f7f', '#d62728']
    )

    plot = alt.Chart(df).mark_point().encode(
        x = 'AApos',
        y = 'RNA/DNA',
        color = alt.Color('functional_consequence',
                          scale = color_scale,
                         ),
        shape = 'RNA_classification',
        tooltip = [alt.Tooltip('target', title = 'SGE Region: '),
                   alt.Tooltip('AApos', title = 'Amino Acid Position: '),
                   alt.Tooltip('snv_score', title = 'SGE Score: ')
                  ]
    ).properties(
        width = 800,
        height = 400,
        title = 'RNA Performance of Synonymous Variants'
    ).interactive()

    plot.display()

In [None]:
def main():
    """Read the workbook at ``rna_output`` and display every chart in turn.

    The frame holding all retained rows feeds the overview, stem and NMD
    charts; the missense and synonymous charts receive only the rows that
    ``read_output`` returned as RNA-classified 'high' or 'low'.
    """
    abnormal_rows, _normal_rows, all_rows = read_output(rna_output)

    # Charts drawn from every retained variant.
    for draw in (all_rna_performance, stem_plot, nmd_across_gene):
        draw(all_rows)

    # Charts restricted to variants with an abnormal RNA classification.
    for draw in (missense_rna_performance, synonymous_rna_performance):
        draw(abnormal_rows)

In [None]:
# Run the whole pipeline: load the workbook and render every chart.
main()