In [None]:
import pandas as pd
import altair as alt

In [None]:
# Remove Altair's row-count limit so charts can be built from the full score table.
alt.data_transformers.disable_max_rows()

# Tab-separated SNV score table consumed by read_data(); the path is relative,
# so it resolves against the notebook's working directory.
score_file = '../Data/20250825_BARD1.snvscores.tsv'

In [None]:
def _score_nearest_density(df, density_column, target_value):
    """Return the ``score`` of the row whose ``density_column`` is closest to ``target_value``."""
    nearest_index = (df[density_column] - target_value).abs().idxmin()
    return df.loc[nearest_index, 'score']


def read_data(file):
    """Load the SNV score table and prepare the inputs for the density figure.

    Parameters
    ----------
    file : str or path-like
        Tab-separated file with at least the columns ``score``,
        ``gmm_density_normal``, ``gmm_density_abnormal``, ``consequence``
        and ``exon``.

    Returns
    -------
    dfs : list of pandas.DataFrame
        ``[abnormal_df, neutral_df]``. ``abnormal_df`` holds the
        ``stop_gained`` rows outside the excluded exons, relabelled
        ``'Stop Gained'``. ``neutral_df`` holds the synonymous/intron rows
        whose score lies within the 2.5th-97.5th percentile of that group,
        relabelled ``'Intron/Syn. Variant'``.
    thresholds : list
        ``[lwr, uppr]``: the scores of the rows whose abnormal / normal GMM
        density is closest to 0.95, in that order.
    """
    df = pd.read_csv(file, sep = '\t')

    # GMM thresholds: the score at which each component's density is closest
    # to the target value. Rows from every exon and consequence are considered.
    target_value = 0.950
    uppr = _score_nearest_density(df, 'gmm_density_normal', target_value)
    lwr = _score_nearest_density(df, 'gmm_density_abnormal', target_value)
    thresholds = [lwr, uppr]

    # Neutral set: synonymous + intronic variants, trimmed to the central 95%
    # of their own score distribution.
    is_neutral = df['consequence'].isin(['synonymous_variant', 'intron_variant'])
    lower, upper = df.loc[is_neutral, 'score'].quantile([0.025, 0.975])
    # .assign() returns a new frame, so relabelling never writes into a slice
    # of ``df`` (the original .loc assignment triggered SettingWithCopyWarning).
    neutral_df = (
        df.loc[is_neutral & df['score'].between(lower, upper)]
        .assign(consequence = 'Intron/Syn. Variant')
    )

    # Abnormal set: stop-gained variants, skipping the listed exons.
    excluded_exons = ['BARD1_X1', 'BARD1_X4', 'BARD1_X11']
    is_stop_gained = df['consequence'].isin(['stop_gained'])
    abnormal_df = (
        df.loc[is_stop_gained & ~df['exon'].isin(excluded_exons)]
        .assign(consequence = 'Stop Gained')
    )

    dfs = [abnormal_df, neutral_df]

    return dfs, thresholds

In [None]:
def _threshold_marker(score, color, align, dx):
    """Build the vertical rule and the numeric label for one threshold score."""
    marker_data = pd.DataFrame({'score': [score]})

    rule = alt.Chart(marker_data).mark_rule(color = color).encode(
        x = 'score')

    # dx/dy offset the label from the rule; the values look hand-tuned for the
    # 600x400 canvas used below (assumption: adjust them if the size changes).
    label = alt.Chart(marker_data).mark_text(
        align = align,
        dx = dx,
        dy = -190,
        color = 'black',
        fontSize = 16
    ).encode(
        x = 'score',
        text = alt.Text('score:Q', format = '.3f')
    )

    return rule, label


def density_plot(dfs, thresholds):
    """Display layered score-density curves with the two GMM thresholds marked.

    Parameters
    ----------
    dfs : sequence of pandas.DataFrame
        Frames to plot; each needs ``score`` and ``consequence`` columns.
        One density area is drawn per frame, grouped by ``consequence``.
        Any number of frames is accepted (the previous version indexed
        exactly ``plots[0]`` and ``plots[1]``).
    thresholds : sequence
        ``thresholds[0]`` is drawn as a red rule and ``thresholds[1]`` as a
        blue rule, each annotated with its value to three decimals.

    Returns
    -------
    None
        The chart is rendered with ``display()``.
    """
    nf_line, nf_text = _threshold_marker(thresholds[0], 'red', 'left', -55)
    func_line, func_text = _threshold_marker(thresholds[1], 'blue', 'right', 55)

    plots = []
    for df in dfs:
        plot = alt.Chart(df).transform_density(
            'score',
            as_ = ['score', 'density'],
            groupby = ['consequence']
        ).mark_area(
            fillOpacity = 0.7,
            interpolate = 'monotone'
        ).encode(
            x = alt.X('score:Q',
                      title = 'SGE Score',
                      axis = alt.Axis(titleFontSize = 20,
                                      labelFontSize = 16)),
            y = alt.Y('density:Q',
                      title = 'Density',
                      axis = alt.Axis(titleFontSize = 20,
                                      labelFontSize = 16)),
            color = alt.Color('consequence',
                              legend = alt.Legend(title = 'Consequence',
                                                  titleFontSize = 20,
                                                  labelFontSize = 16))
        ).properties(
            width = 600,
            height = 400
        )

        plots.append(plot)

    # Layer order matters: densities first, then rules, then labels on top.
    final_plot = alt.layer(
        *plots, nf_line, func_line, nf_text, func_text
    ).configure_axis(
        grid = False
    ).configure_view(
        stroke = None
    )
    final_plot.display()

    # To export the figure, call final_plot.save(<output path>) here; keep the
    # path configurable rather than hard-coding a user-specific directory.

In [None]:
def main():
    """Read the score table at ``score_file`` and render the density figure."""
    variant_frames, gmm_thresholds = read_data(score_file)
    density_plot(variant_frames, gmm_thresholds)

In [None]:
# Run the pipeline: load the scores, then draw the density figure.
main()