In [None]:
import pandas as pd
import altair as alt

In [None]:
# Tab-separated SGE score table; the path is relative (note the leading '..')
file = "../Data/20250508_BARD1scores_update.tsv"

In [None]:
def read_scores(file):
    """Load the score table and keep only the columns used for plotting.

    The input is parsed as tab-separated text. Only the 'exon',
    'simplified_consequence' and 'score' columns are kept; the last two are
    renamed to 'Consequence' and 'snv_score' respectively.

    Parameters
    ----------
    file : path or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: 'exon', 'Consequence', 'snv_score'.
    """
    wanted_columns = ['exon', 'simplified_consequence', 'score']
    new_names = {'score': 'snv_score', 'simplified_consequence': 'Consequence'}

    table = pd.read_csv(file, sep = '\t')
    return table[wanted_columns].rename(columns = new_names)

In [None]:
def prep_data(df):
    """Return a copy of ``df`` with 'Consequence' relabelled for the plot legend.

    The rules below are applied top to bottom to the 'Consequence' column.
    'contains' rules match a literal substring; 'equals' rules match the whole
    value. Values matching no rule are left as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string-valued 'Consequence' column. Missing values are
        allowed and are passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is not modified, so the raw labels
        stay available to the caller and re-running the cell is safe.
    """
    # (match kind, pattern, legend label) -- order is significant because each
    # rule sees the labels already rewritten by the rules above it.
    rules = [
        ('contains', 'missense', 'Missense'),
        ('equals', 'synonymous_variant', 'Synonymous'),
        ('equals', 'intron_variant', 'Intron'),
        ('equals', 'stop_gained', 'Stop Gained'),
        ('equals', 'stop_lost', 'Stop Lost'),
        ('contains', 'site', 'Canonical Splice'),
        ('contains', 'ing_var', 'Splice Region'),
        ('contains', 'UTR', 'UTR Variant'),
        ('equals', 'start_lost', 'Start Lost'),
    ]

    relabelled = df.copy()

    for kind, pattern, label in rules:
        current = relabelled['Consequence']
        if kind == 'contains':
            # na=False: without it a missing consequence yields NaN in the
            # mask and .loc refuses to index with it.
            mask = current.str.contains(pattern, regex = False, na = False)
        else:
            mask = current == pattern
        relabelled.loc[mask, 'Consequence'] = label

    return relabelled


In [None]:
def make_histogram(df, bins = 50):
    """Build and display an interactive histogram of SGE scores by consequence.

    Bars are stacked counts of 'snv_score', coloured by 'Consequence'.
    Clicking a legend entry keeps that category opaque and fades the others.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs an 'snv_score' column (binned on the x axis) and a
        'Consequence' column (bar colour / legend entries).
    bins : int, default 50
        Passed to ``alt.Bin(maxbins=...)``, i.e. an upper bound on the number
        of bins.

    Returns
    -------
    None
        The chart is shown via ``.display()``; nothing is returned.
    """
    # Side effect: lifts Altair's row limit for the whole session, not just
    # for this chart.
    alt.data_transformers.disable_max_rows()

    chart_title = f'Distribution of BARD1 SGE Scores (n = {len(df)})'

    # Legend order. These must match the labels produced by prep_data exactly;
    # the previous list used "Splice" and "UTR", which match no category, so
    # the splice and UTR categories were not positioned by the list.
    legend_order = [
        "Intron", "Missense", "Synonymous", "Stop Gained",
        "Canonical Splice", "Splice Region",
        "Start Lost", "Stop Lost", "UTR Variant",
    ]
    selection = alt.selection_point(fields=['Consequence'], bind='legend')

    # Builds histogram with interactive legend
    histogram = alt.Chart(df).mark_bar().encode(
            alt.X('snv_score',
                  axis = alt.Axis(title = 'SGE Score',
                                  labelFontSize = 16,
                                  titleFontSize = 20,
                                  values = [-0.5,-0.4, -0.3, -0.2, -0.1, 0, 0.1]
                                 ),
                  bin = alt.Bin(maxbins = bins)),
            alt.Y('count()', axis = alt.Axis(title = 'Number of Variants', labelFontSize = 16, titleFontSize = 20)),
            color = alt.Color('Consequence:N',
                             scale = alt.Scale(scheme = 'category10'),
                             sort = legend_order,
                             legend = alt.Legend(titleFontSize = 16,
                                                 labelFontSize = 14,
                                                 orient = 'top-left')),
            opacity=alt.condition(selection, alt.value(1), alt.value(0.2))  # Highlight selected categories
    ).add_params(
        selection
    ).properties(
        width = 800,
        height = 400,
        title = alt.TitleParams(text = chart_title, fontSize = 22)
    ).configure_axis(
        grid = False
    ).configure_view(
        stroke = None
    ).interactive()

    # NOTE(review): a disabled, string-quoted draft used to sit here. It drew a
    # Stop Gained / Synonymous-only histogram with vertical rules at
    # x = -0.089 (red) and x = -0.077 (blue) and saved a PNG to an absolute
    # local path. It never executed; restore it from version control as its
    # own function if that figure is needed again.

    # To export this figure, call histogram.save(<path>, ppi = 500) here.
    histogram.display()
    
    

In [None]:
def main():
    """Run the whole pipeline: load the scores, relabel, draw the histogram."""
    # read_scores -> prep_data -> make_histogram, fed by the module-level `file`
    make_histogram(prep_data(read_scores(file)))

In [None]:
# Run the load -> relabel -> plot pipeline defined in main()
main()