In [None]:
import pandas as pd
import altair as alt

In [None]:
# Input workbooks consumed by process_vep() and merge_wRNA() below.
# NOTE(review): vep_file and rna_file are absolute paths into one user's home
# directory, so a fresh checkout on another machine will not find them; only
# sge_file is relative to the notebook. Consider moving all three under ../Data.
vep_file = '/Users/ivan/Downloads/20250707_BARD1_All_MisSynSpliceRegion_VEPrun.xlsx'
sge_file = '../Data/20250508_BARD1scores_update_FILTERED.xlsx'
rna_file = '/Users/ivan/Desktop/20250708_BARD1_202505_data_RNAclassified_beta.xlsx'
# Turn off Altair's maximum-row check so the full tables can be passed to charts.
alt.data_transformers.disable_max_rows()

In [None]:
def process_vep(vep_file, sge_file):
    """Load VEP annotations and SGE scores and join them on ``pos_id``.

    Parameters
    ----------
    vep_file : str or path-like
        Excel workbook holding at least the columns ``Location``, ``Allele``,
        ``BLOSUM62`` and the four ``SpliceAI_pred_DS_*`` delta-score columns.
    sge_file : str or path-like
        Excel workbook with a ``pos_id`` column to join against.

    Returns
    -------
    merged_df : pandas.DataFrame
        Left join of the trimmed VEP table onto the SGE table (every VEP row is
        kept; SGE columns are NaN where no ``pos_id`` matches).
    vep_df : pandas.DataFrame
        The trimmed VEP table: ``BLOSUM62``, the four SpliceAI columns,
        ``pos_id`` (``"<position>:<allele>"``) and ``max_SpliceAI``.
    """
    splice_cols = ['SpliceAI_pred_DS_AG', 'SpliceAI_pred_DS_AL', 'SpliceAI_pred_DS_DG', 'SpliceAI_pred_DS_DL']

    vep_df = pd.read_excel(vep_file)
    sge_df = pd.read_excel(sge_file)

    # Explicit copy: the column assignments below must act on an independent
    # frame, not on a selection of the raw table (avoids SettingWithCopyWarning).
    vep_df = vep_df[['Location', 'Allele', 'BLOSUM62'] + splice_cols].copy()

    # Take the trailing run of digits of Location as the position. For 9-digit
    # coordinates this equals the previous fixed ``[-9:]`` slice, but it no
    # longer drags in a ':' or '-' when the coordinate is shorter, and a missing
    # Location yields NaN instead of raising.
    position = vep_df['Location'].str.extract(r'(\d+)$', expand = False)
    vep_df['pos_id'] = position + ':' + vep_df['Allele']
    vep_df = vep_df.drop(columns = ['Location', 'Allele'])

    # Coerce only for the max so a non-numeric placeholder in a SpliceAI column
    # becomes NaN (and is skipped) rather than breaking the comparison; the
    # stored SpliceAI columns themselves are left as read.
    vep_df['max_SpliceAI'] = vep_df[splice_cols].apply(pd.to_numeric, errors = 'coerce').max(axis = 1)

    merged_df = pd.merge(vep_df, sge_df, on = 'pos_id', how = 'left')

    return merged_df, vep_df

In [None]:
def scatterplot(df):
    """Display an interactive scatter of ``score`` against ``max_SpliceAI``.

    Points are coloured by ``simplified_consequence``. Three horizontal rules
    are layered on top at max_SpliceAI = 0.8 / 0.5 / 0.2 and two vertical rules
    at score = -0.077 and -0.089.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``score``, ``max_SpliceAI``, ``simplified_consequence``,
        ``pos_id``, ``amino_acid_change`` and ``target``.
    """
    tooltip_fields = [('pos_id', 'Variant: '),
                      ('amino_acid_change', 'Amino Acid Sub.: '),
                      ('target', 'SGE Target: '),
                      ('score', 'SGE Score: '),
                      ('max_SpliceAI', 'Max SpliceAI DS: ')
                     ]

    # (axis, position, colour) for each reference rule, in layering order.
    # NOTE(review): the y values look like the usual SpliceAI high-precision /
    # recommended / high-recall cut-offs and the x values like SGE score
    # thresholds -- confirm and consider naming them in the config cell.
    rule_specs = [('y', 0.8, 'red'),
                  ('y', 0.5, 'yellow'),
                  ('y', 0.2, 'green'),
                  ('x', -0.077, 'red'),
                  ('x', -0.089, 'red')
                 ]

    def _rule(axis, position, colour):
        # A rule mark pinned to a constant value on the requested axis.
        if axis == 'y':
            return alt.Chart(df).mark_rule(color = colour).encode(y = alt.Y(datum = position))
        return alt.Chart(df).mark_rule(color = colour).encode(x = alt.X(datum = position))

    consequence_colour = alt.Color('simplified_consequence',
                                   sort = ['synonymous_variant', 'missense_variant', 'splicing_variant']
                                  )

    points = alt.Chart(df).mark_circle().encode(
        x = 'score',
        y = 'max_SpliceAI',
        color = consequence_colour,
        tooltip = [alt.Tooltip(field, title = label) for field, label in tooltip_fields]
    ).properties(
        width = 600,
        height = 400
    ).interactive()

    # Stack the rules on the scatter one at a time, preserving their order.
    layered = points
    for axis, position, colour in rule_specs:
        layered = layered + _rule(axis, position, colour)

    layered.display()

In [None]:
def merge_wRNA(rna_file, vep_df, faceted = True):
    """Join RNA classifications onto the VEP table and display the RNA charts.

    Two figures are displayed, in this order:

    1. Three side-by-side scatters of ``RNA/DNA`` against ``max_SpliceAI``
       (all variants; variants whose ``RNA_classification`` is ``'low'``; and
       those that are additionally ``'functionally_abnormal'``), each with
       horizontal rules at max_SpliceAI = 0.8 / 0.5 / 0.2.
    2. The all-variants scatter, either faceted by ``functional_consequence``
       (default) or, with ``faceted=False``, as a single panel layered with the
       same horizontal rules plus vertical rules at RNA/DNA = 0 and 1.

    Parameters
    ----------
    rna_file : str or path-like
        Excel workbook with a ``pos_id`` column to join on.
    vep_df : pandas.DataFrame
        Table with ``pos_id`` and ``max_SpliceAI``, e.g. the second value
        returned by ``process_vep``.
    faceted : bool, default True
        Selects the form of the second figure (see above). The default matches
        what this function has always displayed; ``False`` shows the layered
        single-panel view that was previously built and then discarded.

    Notes
    -----
    After the inner join the table must provide ``RNA_classification``,
    ``functional_consequence``, ``Consequence``, ``AAsub``, ``target`` and
    ``RNA/DNA`` in addition to the two ``vep_df`` columns above.
    """
    rna_df = pd.read_excel(rna_file)

    rna_merged_df = pd.merge(vep_df, rna_df, on = 'pos_id', how = 'inner')
    rna_abnormal = rna_merged_df.loc[rna_merged_df['RNA_classification'].isin(['low'])]
    rna_dna_abnormal = rna_abnormal.loc[rna_abnormal['functional_consequence'].isin(['functionally_abnormal'])]

    def _rna_scatter(data, title = None):
        # Single definition of the RNA/DNA vs. max SpliceAI scatter used by every panel.
        props = {'width': 600, 'height': 400}
        if title is not None:
            props['title'] = title
        return alt.Chart(data).mark_circle().encode(
            x = 'RNA/DNA',
            y = 'max_SpliceAI',
            color = alt.Color('Consequence',
                              sort = ['synonymous_variant', 'missense_variant', 'splicing_variant']
                             ),
            shape = 'functional_consequence',
            tooltip = [alt.Tooltip('pos_id', title = 'Variant: '),
                       alt.Tooltip('AAsub', title = 'Amino Acid Sub.: '),
                       alt.Tooltip('target', title = 'SGE Target: '),
                       alt.Tooltip('RNA/DNA', title = 'RNA Score: '),
                       alt.Tooltip('max_SpliceAI', title = 'Max SpliceAI DS: ')
                      ]
        ).properties(**props).interactive()

    def _spliceai_rules():
        # Fresh horizontal rules at the three max_SpliceAI cut-offs.
        # NOTE(review): 0.8 / 0.5 / 0.2 look like the high-precision /
        # recommended / high-recall SpliceAI thresholds -- confirm.
        return [alt.Chart(rna_merged_df).mark_rule(color = colour).encode(y = alt.Y(datum = cutoff))
                for cutoff, colour in [(0.8, 'red'), (0.5, 'yellow'), (0.2, 'green')]]

    def _ratio_rules():
        # Vertical reference rules at RNA/DNA = 0 and RNA/DNA = 1.
        return [alt.Chart(rna_merged_df).mark_rule(color = 'red').encode(x = alt.X(datum = position))
                for position in (0, 1)]

    def _layer(base, overlays):
        # Stack overlays on base one at a time, preserving their order.
        for overlay in overlays:
            base = base + overlay
        return base

    to_plot = [('All Variants', rna_merged_df),
               ('RNA Abnormal Variants', rna_abnormal),
               ('Functionally Abnormal and Abnormal RNA', rna_dna_abnormal)
              ]

    panels = [_layer(_rna_scatter(data, plot_title), _spliceai_rules())
              for plot_title, data in to_plot]

    final_scatter = panels[0]
    for panel in panels[1:]:
        final_scatter = final_scatter | panel
    final_scatter.display()

    overview = _rna_scatter(rna_merged_df)
    if faceted:
        overview.facet('functional_consequence').display()
    else:
        # Build the rules here rather than reusing loop variables from above.
        _layer(overview, _spliceai_rules() + _ratio_rules()).display()


In [None]:
def main():
    """Run the whole analysis using the paths set in the configuration cell.

    Builds the VEP/SGE tables, displays the SGE-score scatter, then displays
    the RNA charts from the trimmed VEP table.
    """
    merged_scores, vep_table = process_vep(vep_file, sge_file)
    scatterplot(merged_scores)
    merge_wRNA(rna_file, vep_table)

In [None]:
# Run the full pipeline when this cell executes; the charts render as cell output.
main()