In [None]:
import pandas as pd
import altair as alt
from natsort import natsorted

In [None]:
#Input for the whole notebook: tab-separated SGE score file, given relative to the notebook's working directory
file = '../Data/20250423_BARD1_snvscores_IGVFupload.tsv' #SGE score file

In [None]:
def read_data(path): #Reads data
    """Load the SGE score table and keep only the variant types of interest.

    Parameters
    ----------
    path : str or path-like
        Location of the tab-separated SGE score file.

    Returns
    -------
    pandas.DataFrame
        Columns 'exon', 'Consequence', 'snv_score' and 'functional_consequence',
        restricted to missense, synonymous and stop-gained variants, with a
        fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If any of the expected columns is absent from the file.
    """
    #Read from the argument, not the notebook-level `file` variable, so the function honours whatever path the caller passes
    df = pd.read_csv(path, sep = '\t') #Reads SGE score file

    df = df.rename(columns = {'consequence': 'Consequence', 'score': 'snv_score'}) #Renames to harmonize old code with new column names

    df = df[['exon','Consequence','snv_score', 'functional_consequence']] #pulls out relevant columns in the dataframe

    filtered_consequences = ['missense_variant','synonymous_variant','stop_gained'] #Focusing on these variant types only

    df = df.loc[df['Consequence'].isin(filtered_consequences)] #Exact-match filter for desired variant types

    df = df.reset_index(drop = True) #Resets index so rows are numbered contiguously after filtering

    return df

In [None]:
def prep_data(df): #renames VEP consequence categories to be more human friendly
    """Relabel VEP consequence terms in 'Consequence' with short display names.

    The frame is modified in place and the same object is returned. Values
    that match none of the terms below are left untouched.
    """
    #Ordered (VEP term, display label) pairs. Order matters: each pass re-reads the
    #already-updated column, so a value containing several terms takes the first matching label.
    relabel = (
        ('missense_variant', 'Missense'),
        ('synonymous_variant', 'Synonymous'),
        ('stop_gained', 'Stop'),
    )

    for vep_term, label in relabel:
        hits = df['Consequence'].str.contains(vep_term) #Substring match, not equality
        df.loc[hits, 'Consequence'] = label

    return df

In [None]:
def rewrite_targets(df, prefix_len = 7): #Rewrites SGE target to get exon number only
    """Add a 'target' column holding each 'exon' label without its leading prefix.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column 'exon'. Modified in place.
    prefix_len : int, default 7
        Number of leading characters to drop from every 'exon' label. The
        default reproduces the original hard-coded slice; presumably it is the
        length of a fixed target-name prefix -- confirm against the score file.

    Returns
    -------
    pandas.DataFrame
        The same frame, with the new 'target' column.
    """
    #Vectorised string slice; missing labels stay missing instead of raising
    df['target'] = df['exon'].str[prefix_len:]

    return df

In [None]:
def exon_stats(df): #Generates summary dataframe with % of variants in each functional class for each exon
    """Summarise, per exon and consequence type, the share of each functional class.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'target', 'Consequence' and 'functional_consequence'.

    Returns
    -------
    pandas.DataFrame
        Long-format table with columns 'Exon', 'Consequence', 'Function Type'
        and 'Percent'. Each (target, Consequence) group contributes three rows,
        in the order Nonfunctional, Indeterminate, Functional. If `df` has no
        groups, an empty frame with those four columns is returned.
    """
    columns = ['Exon', 'Consequence', 'Function Type', 'Percent']
    records = [] #One dict per output row; the frame is built once at the end instead of concatenating many one-row frames

    for (exon, var_type), group_df in df.groupby(['target','Consequence']):
        calls = group_df['functional_consequence']
        total = len(group_df) #Never zero: groupby only yields non-empty groups

        non = int((calls == 'functionally_abnormal').sum()) #Number of non-functional variants
        inter = int((calls == 'indeterminate').sum()) #Number of indeterminate variants

        non_per = (non / total) * 100 #Gets % non-functional
        inter_per = (inter / total) * 100 #Gets % indeterminate
        #"Functional" is the remainder, so any label other than the two above (including missing values) is counted here
        same_per = (100 - (non_per + inter_per)) #Gets % functional

        for func_type, percent in (('Nonfunctional', non_per),
                                   ('Indeterminate', inter_per),
                                   ('Functional', same_per)):
            records.append({'Exon': exon, 'Consequence': var_type, 'Function Type': func_type, 'Percent': percent})

    #Passing `columns` keeps the schema stable even when there are no records
    all_exons_df = pd.DataFrame(records, columns = columns)

    return all_exons_df

In [None]:
def stacked_bars(df): #Creates the stacked bar chart
    """Draw one stacked bar per exon, faceted by consequence type, and display it.

    Expects the columns 'Exon', 'Consequence', 'Function Type' and 'Percent'.
    Nothing is returned; the chart is shown via `chart.show()`.
    """
    exon_order = natsorted(set(df['Exon'].tolist())) #Unique exons in natural sort order, used for the x axis

    axis_fonts = dict(labelFontSize = 16, titleFontSize = 20) #Shared by both axes

    #Encodings
    x_enc = alt.X('Exon:O', axis = alt.Axis(labelAngle = 0, **axis_fonts), sort = exon_order)
    y_enc = alt.Y('Percent', axis = alt.Axis(**axis_fonts))
    hover = [alt.Tooltip('Function Type', title = 'Functional Class: '),
             alt.Tooltip('Percent', title = 'Percent: ')]
    color_enc = alt.Color('Function Type', title = 'Functional Class',
                          legend = alt.Legend(titleFontSize = 18, labelFontSize = 16))

    #Single panel: bars stack because several rows share the same x value
    bars = alt.Chart(df).mark_bar().encode(x = x_enc, y = y_enc, tooltip = hover, color = color_enc)
    bars = bars.properties(width = 600, height = 400)

    #One panel per consequence type, stacked vertically in a fixed order
    panel_spec = alt.Facet('Consequence', sort = ['Synonymous', 'Missense', 'Stop'])
    chart = bars.facet(facet = panel_spec, columns = 1)

    chart = chart.configure_header(titleFontSize = 20, labelFontSize = 16)

    #To export an image instead, call chart.save(<output path>, ppi = 500) here
    chart.show()
    

In [None]:
def main():
    """Run the full pipeline: load scores, relabel, derive targets, summarise, plot."""
    data = read_data(file)
    reannotated = prep_data(data)
    #Pass each stage's return value to the next rather than relying on prep_data having mutated `data` in place
    num_exons = rewrite_targets(reannotated)
    all_stats = exon_stats(num_exons)
    stacked_bars(all_stats)

In [None]:
#Entry point: reads the score file named in `file` and displays the stacked bar chart
main()