In [None]:
import pandas as pd
import altair as alt
import numpy as np
from scipy.stats import gaussian_kde

In [None]:
# Path to the Excel workbook holding the SGE scores; main() passes it to read_scores().
# Previous input, kept for reference: '/Users/ivan/Downloads/20240828_BARD1_AllScores.xlsx'
file = '../Data/20250122_BARD1_SGEscores_wAAsub.xlsx'

In [None]:
def read_scores(file, min_score=-2, max_score=2): #reads SGE scores
    """Load SGE scores from an Excel workbook and trim extreme outliers.

    Parameters
    ----------
    file : str or path-like
        Workbook handed to ``pd.read_excel``. It must contain the columns
        'target', 'Consequence' and 'snv_score'.
    min_score, max_score : float, optional
        A row is kept when ``min_score < snv_score <= max_score``.
        The defaults (-2 and 2) reproduce the original fixed window.

    Returns
    -------
    pandas.DataFrame
        An independent copy holding only the three columns listed above,
        restricted to rows whose score lies inside the window. Rows with a
        missing score fail both comparisons and are therefore dropped.

    Raises
    ------
    KeyError
        If one of the three required columns is absent from the workbook.
    """
    df = pd.read_excel(file)
    df = df[['target','Consequence','snv_score']]

    #Needed to control outliers in the data - due to extreme outliers, the plot will not display correctly
    in_range = (df['snv_score'] <= max_score) & (df['snv_score'] > min_score)

    # Return an explicit copy: downstream code assigns into this frame with
    # .loc, which would otherwise warn about writing to a slice.
    return df.loc[in_range].copy()

In [None]:
def prep_data(df): #Renames categories to be nicer
    """Collapse raw consequence terms into short display categories.

    The rules are applied in this order, each one on the result of the
    previous ones, so a term already renamed is not matched again:

    1. contains 'missense'                              -> 'Missense'
    2. equals 'synonymous_variant'                      -> 'Synonymous'
    3. equals 'intron_variant'                          -> 'Intron'
    4. equals 'stop_gained' / 'stop_lost' /
       'stop_retained_variant'                          -> 'Stop'
    5. contains 'splice'                                -> 'Splice'
    6. contains 'UTR'                                   -> 'UTR'
    7. equals 'start_lost'                              -> 'Start'

    Terms matching no rule, and missing values, are left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a string-valued 'Consequence' column.

    Returns
    -------
    pandas.DataFrame
        A relabeled copy; the frame passed in is not modified.
    """
    # Work on a copy so the caller's frame keeps its raw labels.
    relabeled = df.copy()

    def contains(pattern):
        # na=False: a missing consequence matches nothing instead of
        # producing an NA mask that .loc refuses to index with.
        return relabeled['Consequence'].str.contains(pattern, na=False)

    relabeled.loc[contains('missense'), 'Consequence'] = 'Missense'
    relabeled.loc[relabeled['Consequence'] == 'synonymous_variant', 'Consequence'] = 'Synonymous'
    relabeled.loc[relabeled['Consequence'] == 'intron_variant', 'Consequence'] = 'Intron'
    stop_terms = ['stop_gained', 'stop_lost', 'stop_retained_variant']
    relabeled.loc[relabeled['Consequence'].isin(stop_terms), 'Consequence'] = 'Stop'
    relabeled.loc[contains('splice'), 'Consequence'] = 'Splice'
    relabeled.loc[contains('UTR'), 'Consequence'] = 'UTR'
    relabeled.loc[relabeled['Consequence'] == 'start_lost', 'Consequence'] = 'Start'
    return relabeled


In [None]:
def compute_density(df, category, num_points=200): #used to calculate the density - mostly GPT code
    """Estimate the score density of one consequence category.

    A Gaussian kernel density estimate is fitted to the non-missing
    'snv_score' values and evaluated on an evenly spaced grid running from
    the smallest to the largest observed score.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single category; must have an 'snv_score' column.
    category : str
        Label written into the 'Consequence' column of the result.
    num_points : int, optional
        Number of grid points (default 200, the original fixed value).

    Returns
    -------
    pandas.DataFrame
        Columns 'SGE_score' (grid), 'density' (KDE value) and 'Consequence'.
        The frame is empty, with the same columns, when a density cannot be
        estimated: fewer than two scores, or all scores identical.
    """
    values = df['snv_score'].dropna().to_numpy(dtype=float)

    # gaussian_kde needs at least two points with non-zero spread; for such a
    # category return an empty frame so the caller can still concatenate.
    if values.size < 2 or values.min() == values.max():
        return pd.DataFrame({
            'SGE_score': np.array([], dtype=float),
            'density': np.array([], dtype=float),
            'Consequence': np.array([], dtype=object),
        })

    density = gaussian_kde(values)
    xs = np.linspace(values.min(), values.max(), num_points)
    ys = density(xs)

    return pd.DataFrame({'SGE_score': xs, 'density': ys, 'Consequence': category})


In [None]:
def make_plot(df,rawdf): #creates plot
    """Build and display the per-consequence density plot with score ticks.

    Parameters
    ----------
    df : pandas.DataFrame
        Density table; the chart reads its 'SGE_score', 'density' and
        'Consequence' columns.
    rawdf : pandas.DataFrame
        Raw scores; the chart reads its 'snv_score' and 'Consequence' columns.

    Returns
    -------
    None
        The chart is displayed with ``show()`` and not returned.
    """
    # Both layers must read from one table for the faceting below, so the
    # density rows and the raw rows are stacked into a single frame.
    plot_data = pd.concat([df, rawdf], ignore_index = True)

    # Density layer: one filled, outlined area per consequence.
    score_axis = alt.Axis(title = 'SGE Score', titleFontSize = 20, labelFontSize = 16)
    density_layer = (
        alt.Chart(plot_data)
        .mark_area(interpolate = 'monotone', line = True)
        .encode(
            x = alt.X('SGE_score:Q', axis = score_axis),
            y = alt.Y('density:Q', axis = None),
            color = alt.Color('Consequence:N', legend = None),
        )
    )

    # Tick layer: one thin black tick per raw score, drawn at a fixed y of 77.5
    # (presumably chosen to sit just under the 75-high panel - confirm).
    tick_layer = (
        alt.Chart(plot_data)
        .mark_tick(color = 'black', thickness = 0.5, size = 5)
        .encode(
            x = alt.X('snv_score:Q', title = ''),
            y = alt.value(77.5),
        )
    )

    # Ticks first, densities on top; every panel is 1000 x 75.
    panel = alt.layer(tick_layer, density_layer).properties(width = 1000, height = 75)

    # One row per consequence category.
    faceted = panel.facet(
        row = alt.Row('Consequence:N', title = 'Consequence'),
        spacing = 2,
    ).properties(title = '', bounds = 'flush')

    # Global styling: tight rows, larger header fonts, no grid, no panel border.
    styled = (
        faceted
        .configure_facet(spacing= 1)
        .configure_header(titleFontSize = 20, labelFontSize = 16)
        .configure_title(anchor='start')
        .configure_axis(grid=False)
        .configure_view(stroke = None)
    )

    styled.show()

In [None]:
def main():
    """Run the pipeline: load scores, relabel, compute densities, and plot.

    Reads the workbook named by the module-level ``file`` variable and
    displays the resulting chart; nothing is returned.
    """
    # Turn off Altair's max-rows check before building the chart.
    alt.data_transformers.disable_max_rows()

    scores = read_scores(file)
    relabeled_scores = prep_data(scores)

    # One density table per category, in order of first appearance.
    density_tables = []
    for category in relabeled_scores['Consequence'].unique():
        in_category = relabeled_scores[relabeled_scores['Consequence'] == category]
        density_tables.append(compute_density(in_category, category))
    density_data = pd.concat(density_tables)

    make_plot(density_data, relabeled_scores)
    
    

In [None]:
# Run the whole pipeline (read scores, relabel, compute densities, show plot).
main()