In [None]:
import pandas as pd
import altair as alt

In [None]:
# Input workbooks, given relative to the notebook's working directory.
# sge_file is read by both get_thresholds() and process_vep(); vep_file only by process_vep().
sge_file = '../Data/20250825_BARD1snvscores_filtered.xlsx'
vep_file = '../Data/20250825_BARD1snvscores_filtered_VEP.xlsx'
# Remove Altair's row limit on chart datasets so the full tables can be plotted.
alt.data_transformers.disable_max_rows()

In [None]:
def get_thresholds(thresholds, target_value=0.950):
    """Find the SGE scores whose GMM densities sit closest to a target value.

    Parameters
    ----------
    thresholds : str, path-like or pandas.DataFrame
        Excel workbook to load with ``pd.read_excel``, or an already-loaded
        table. It must provide the columns 'score', 'gmm_density_normal' and
        'gmm_density_abnormal'.
    target_value : float, default 0.950
        Density value to search for in each density column.

    Returns
    -------
    list
        ``[lwr, uppr]`` where ``lwr`` is the 'score' of the row whose
        'gmm_density_abnormal' is closest to ``target_value`` and ``uppr`` is
        the 'score' of the row whose 'gmm_density_normal' is closest to it.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Accept a preloaded table so callers that already hold the data do not
    # have to read the workbook a second time.
    if isinstance(thresholds, pd.DataFrame):
        table = thresholds
    else:
        table = pd.read_excel(thresholds)

    def closest_score(density_column):
        # Row label with the smallest absolute distance to the target density;
        # idxmin skips missing densities and returns the first label on ties.
        gap = (table[density_column] - target_value).abs()
        return table.loc[gap.idxmin(), 'score']

    uppr = closest_score('gmm_density_normal')
    lwr = closest_score('gmm_density_abnormal')

    # NOTE(review): the lwr/uppr naming presumes the abnormal cut-off is the
    # lower score of the two; nothing here enforces that ordering.
    return [lwr, uppr]

In [None]:
def process_vep(vep, sge):
    """Join missense VEP predictor scores onto SGE scores.

    Parameters
    ----------
    vep : str, path-like or pandas.DataFrame
        VEP table (Excel workbook path or loaded frame). Columns used:
        'Consequence', 'Location', 'Allele', 'SIFT', 'PolyPhen', 'ClinPred',
        'REVEL', 'CADD_PHRED', 'CADD_RAW', 'am_class', 'am_pathogenicity'.
    sge : str, path-like or pandas.DataFrame
        SGE table (Excel workbook path or loaded frame). Columns used:
        'pos_id', 'amino_acid_change', 'CDS_position', 'score'.

    Returns
    -------
    merged : pandas.DataFrame
        Inner join of the missense VEP rows and the SGE rows on 'pos_id',
        with an extra 'SIFT_transformed' column equal to ``1 - SIFT``.
    min_df : pandas.DataFrame
        One row per 'CDS_position': first 'amino_acid', minimum 'SIFT' and
        'score', and maximum of 'SIFT_transformed', 'PolyPhen', 'ClinPred',
        'REVEL', 'CADD' and 'AM'.
    """
    # Frames passed in are copied so the caller's objects are never modified.
    raw_vep = vep.copy() if isinstance(vep, pd.DataFrame) else pd.read_excel(vep)
    raw_sge = sge.copy() if isinstance(sge, pd.DataFrame) else pd.read_excel(sge)

    # na=False: rows without a Consequence are dropped instead of putting NaN
    # into the boolean mask, which would make the row selection fail.
    is_missense = raw_vep['Consequence'].str.contains('missense_variant', na=False)
    miss_vep = raw_vep.loc[is_missense].copy()

    # 'pos' is whatever follows the first '-' in Location; the key built from
    # it must match the 'pos_id' column of the SGE table.
    miss_vep['pos'] = miss_vep['Location'].str.split('-').str[1]
    miss_vep['pos_id'] = miss_vep['pos'].astype(str) + ':' + miss_vep['Allele']

    # Predictions are stored as "<class>(<score>)". The vectorised .str
    # accessors yield NaN for a missing or malformed cell instead of raising,
    # so missense rows without a prediction no longer abort the run.
    for predictor in ('SIFT', 'PolyPhen'):
        parts = miss_vep[predictor].str.split('(')
        miss_vep[predictor + '_CLASS'] = parts.str[0]
        miss_vep[predictor] = parts.str[1].str.split(')').str[0]

    miss_vep = miss_vep[['pos_id', 'SIFT_CLASS', 'SIFT', 'PolyPhen_CLASS', 'PolyPhen', 'ClinPred', 'REVEL', 'CADD_PHRED', 'CADD_RAW', 'am_class', 'am_pathogenicity']]
    miss_vep = miss_vep.rename(columns = {'CADD_PHRED': 'CADD', 'am_pathogenicity': 'AM'})
    miss_vep = miss_vep.astype({'SIFT': 'float', 'PolyPhen': 'float'})

    # 'amino_acid' is 'amino_acid_change' with its last character removed.
    raw_sge['amino_acid'] = raw_sge['amino_acid_change'].str[:-1]

    merged = pd.merge(miss_vep, raw_sge, on = 'pos_id', how = 'inner')

    # Flip SIFT so that, like the other predictors, larger values are the ones
    # kept by the 'max' aggregation below.
    merged['SIFT_transformed'] = 1 - merged['SIFT']

    min_df = merged.groupby('CDS_position').agg({
        'amino_acid': 'first',
        'SIFT': 'min',
        'SIFT_transformed': 'max',
        'PolyPhen': 'max',
        'ClinPred': 'max',
        'REVEL': 'max',
        'CADD': 'max',
        'AM': 'max',
        'score': 'min'
    }).reset_index()

    return merged, min_df

In [None]:
def vep_v_sge(df, thresholds):
    """Display a 2 x 3 grid of predictor-versus-SGE-score scatter plots.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with 'score', 'consequence', 'pos_id', 'amino_acid_change' and
        the six predictor columns plotted below.
    thresholds : sequence
        ``thresholds[0]`` is drawn as a red vertical rule and
        ``thresholds[1]`` as a blue one on every panel.

    Returns
    -------
    None
        The combined figure is displayed as a side effect.
    """
    predictor_columns = ['SIFT_transformed', 'PolyPhen', 'ClinPred', 'REVEL', 'CADD', 'AM']
    red_cutoff, blue_cutoff = thresholds[0], thresholds[1]

    def cutoff_rule(cutoff, colour):
        # A one-row frame lets the rule share the 'score' x encoding.
        rule_data = pd.DataFrame({'score': [cutoff]})
        return alt.Chart(rule_data).mark_rule(color = colour).encode(x = 'score')

    def build_panel(column):
        # Zoomable scatter of one predictor against the SGE score.
        points = alt.Chart(df).mark_circle().encode(
            x = alt.X('score', axis = alt.Axis(title = 'SGE Score')),
            y = column,
            color = 'consequence',
            tooltip = [
                alt.Tooltip('pos_id', title = 'Position ID: '),
                alt.Tooltip('amino_acid_change', title = 'AA Sub: '),
            ],
        ).interactive()
        return points + cutoff_rule(red_cutoff, 'red') + cutoff_rule(blue_cutoff, 'blue')

    panels = [build_panel(column) for column in predictor_columns]

    # First three predictors on the top row, last three on the bottom row.
    upper_row = panels[0] | panels[1] | panels[2]
    lower_row = panels[3] | panels[4] | panels[5]

    grid = upper_row & lower_row
    grid.display()

In [None]:
def scores_across_gene(df):
    """Display one scatter track per score column, stacked vertically.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with 'CDS_position', 'amino_acid_change' and the seven score
        columns listed below.

    Returns
    -------
    None
        The stacked figure is displayed as a side effect.
    """
    track_columns = ['score', 'SIFT', 'PolyPhen', 'ClinPred', 'REVEL', 'CADD', 'AM']

    def build_track(column):
        # Wide, short panel; x tick labels are hidden on every track.
        base = alt.Chart(df).mark_circle()
        encoded = base.encode(
            x = alt.X('CDS_position:Q', axis = alt.Axis(labels = False)),
            y = column,
            tooltip = [alt.Tooltip('amino_acid_change', title = 'AA Change: ')],
        )
        return encoded.properties(width = 1750, height = 200)

    tracks = [build_track(column) for column in track_columns]

    # Stack the tracks top to bottom in list order.
    stacked = tracks[0]
    for track in tracks[1:]:
        stacked = stacked & track

    stacked.display()

In [None]:
def min_heatmap(df, threshold):
    """Display a heatmap of per-position predictor scores layered with SGE scores.

    Parameters
    ----------
    df : pandas.DataFrame
        Per-position table with 'CDS_position', 'amino_acid', 'PolyPhen',
        'ClinPred', 'REVEL', 'AM', 'score', plus 'CADD', 'SIFT' and
        'SIFT_transformed' (dropped before plotting).
    threshold :
        Accepted but not used by this function.

    Returns
    -------
    None
        The layered heatmap is displayed as a side effect.
    """
    # NOTE(review): the title and tooltips say "Min.", but process_vep
    # aggregates the predictor columns with 'max' -- confirm the labels.
    key_columns = ['CDS_position', 'amino_acid']
    trimmed = df.drop(columns = ['CADD', 'SIFT', 'SIFT_transformed'])

    # Long format: one row per (position, predictor) pair.
    predictor_long = pd.melt(trimmed, id_vars = key_columns, value_vars = ['PolyPhen', 'ClinPred', 'REVEL',  'AM'])

    # The SGE score gets its own long table so it can use a separate colour scale.
    sge_long = pd.melt(trimmed, id_vars = key_columns, value_vars = ['score'])
    sge_long['variable'] = 'SGE'

    def heat_layer(long_table, colour):
        # Both layers share the same x/y/tooltip encodings and size; only the
        # data and the colour encoding differ.
        return alt.Chart(long_table).mark_rect().encode(
            x = alt.X(
                'CDS_position:Q',
                title = 'CDS Position',
                bin = alt.Bin(maxbins = 2335, minstep = 1),
                axis = alt.Axis(values = list(range(0, 2335, 100))),
            ),
            y = alt.Y('variable:O', title = ''),
            color = colour,
            tooltip = [
                alt.Tooltip('CDS_position', title = 'CDS Pos: '),
                alt.Tooltip('amino_acid', title = 'Amino Acid: '),
                alt.Tooltip('value', title = 'Min. Score'),
            ],
        ).properties(width = 1750, height = 300)

    predictor_colour = alt.Color(
        'value:Q',
        scale = alt.Scale(domain = [0, 1], reverse = False, scheme = 'bluepurple'),
        legend = alt.Legend(title = 'Predictor Score'),
    )
    # Colour domain is [-0.2, 0] with clamping, and the scheme is reversed
    # relative to the predictor layer.
    sge_colour = alt.Color(
        'value:Q',
        scale = alt.Scale(domain = [-0.2, 0], clamp = True, reverse = True, scheme = 'bluepurple'),
        legend = alt.Legend(title = 'SGE Score'),
    )

    predictor_layer = heat_layer(predictor_long, predictor_colour)
    sge_layer = heat_layer(sge_long, sge_colour)

    # Independent colour scales keep the two legends and domains separate.
    layered = alt.layer(predictor_layer, sge_layer).resolve_scale(color = 'independent')
    final_map = layered.properties(title = 'Min. Predictor Score vs. Min. Missense SGE Score')

    final_map.display()
    #final_map.save('/Users/ivan/Desktop/BARD1_draft_figs/supp_figs/vep_heatmap.pdf', dpi = 400)

In [None]:
def main():
    """Run the full workflow: load thresholds and scores, then draw all three figures."""
    # Both the thresholds and the SGE scores come from the same workbook.
    score_cutoffs = get_thresholds(sge_file)
    merged_scores, per_position_scores = process_vep(vep_file, sge_file)

    # The per-variant table feeds the scatter and track plots;
    # the per-position summary feeds the heatmap.
    vep_v_sge(merged_scores, score_cutoffs)
    scores_across_gene(merged_scores)
    min_heatmap(per_position_scores, score_cutoffs)
    

In [None]:
# Run the whole analysis; each plotting function displays its own figure.
main()