In [None]:
import pandas as pd
import altair as alt

In [None]:
file = '../Data/20250122_BARD1_SGEscores_wAAsub.xlsx' #SGE score file (Excel workbook, path relative to the working directory)

#GMM Cutoffs applied to snv_score when assigning variants to a functional class
path_max = 0.689682159032362 #scores <= path_max are counted as nonfunctional
benign_min = 0.807231141721117 #scores >= benign_min are counted as functional; scores strictly in between are intermediate

In [None]:
def read_data(path):
    """Load the SGE score table and keep only the variant types of interest.

    Parameters
    ----------
    path : str or path-like
        Location of the Excel workbook; handed straight to ``pd.read_excel``.

    Returns
    -------
    pandas.DataFrame
        The ``target``, ``Consequence``, ``pos_id`` and ``snv_score`` columns
        for every row whose ``Consequence`` mentions ``missense_variant``,
        ``synonymous_variant`` or ``stop_gained``, re-indexed from 0.

    Raises
    ------
    KeyError
        If any of the four expected columns is missing from the workbook.
    """
    columns = ['target', 'Consequence', 'pos_id', 'snv_score']
    df = pd.read_excel(path)[columns] #Reads SGE score file and pulls out relevant columns

    filtered_consequences = ['missense_variant', 'synonymous_variant', 'stop_gained'] #Focusing on these variant types only

    #Joined with | so the terms act as regex alternatives in str.contains
    filter_string = '|'.join(filtered_consequences)

    #na=False: a row without a Consequence annotation is simply dropped.
    #Without it the mask contains NaN and boolean indexing raises ValueError.
    keep = df['Consequence'].str.contains(filter_string, na=False)

    return df.loc[keep].reset_index(drop=True)

In [None]:
def prep_data(df):
    """Swap VEP consequence terms for short, human-friendly labels.

    Any ``Consequence`` value containing one of the VEP terms below is
    overwritten with the matching label. The terms are applied in the order
    listed, so a value naming several terms takes the label of the first one
    that matches. Values matching none of them are left as they are; intron,
    splice and UTR consequences get no label here.

    The frame is modified in place and the same object is returned.
    """
    relabel = (
        ('missense_variant', 'Missense'),
        ('synonymous_variant', 'Synonymous'),
        ('stop_gained', 'Stop'),
    )

    for vep_term, label in relabel:
        #Mask is recomputed each pass, so rows relabelled earlier no longer match
        hits = df['Consequence'].str.contains(vep_term)
        df.loc[hits, 'Consequence'] = label

    return df

In [None]:
def _exon_from_target(target):
    """Return the exon number encoded in one SGE target name, else the value unchanged."""
    if not isinstance(target, str):
        return target #already numeric (or missing): nothing to parse

    fields = target.split("_") #expected form: [Gene name, X(exon#)(region letter)]
    if len(fields) < 2:
        return target

    region = fields[1]
    digits = ''
    for char in region[1:]: #skip the leading letter, then read the run of digits
        if not char.isdecimal():
            break
        digits += char

    return int(digits) if digits else target


def rewrite_targets(df): #Rewrites SGE target to get exon number only
    """Replace each SGE target name in ``df['target']`` with its exon number.

    A target such as ``GENE_X4A`` becomes ``4`` and ``GENE_X10B`` becomes
    ``10``. Values that do not follow that pattern (no underscore, or no digit
    directly after the first character of the second field) are left untouched.

    Every row is converted from its own ``target`` value in a single pass, so
    the result does not depend on the frame's index labels or on ``pos_id``
    being unique, and exon numbers of any length are read in full.

    The frame is modified in place and the same object is returned.
    """
    df['target'] = [_exon_from_target(target) for target in df['target']]

    return df

In [None]:
def exon_stats(df, path_max, benign_min): #Generates summary dataframe with % of variants in each functional class for each exon
    """Summarise, per exon and consequence, the share of variants in each functional class.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``target``, ``Consequence`` and ``snv_score``.
    path_max : float
        Scores less than or equal to this value count as Nonfunctional.
    benign_min : float
        Scores strictly between ``path_max`` and this value count as
        Intermediate; every other scored variant counts as Functional.

    Returns
    -------
    pandas.DataFrame
        Columns ``Exon``, ``Consequence``, ``Function Type`` and ``Percent``,
        with three rows (Nonfunctional, Intermediate, Functional, in that
        order) per (target, Consequence) group. Empty, but with those columns,
        when there is nothing to summarise.

    Notes
    -----
    Variants with a missing ``snv_score`` are left out of both the counts and
    the denominator instead of being counted as Functional; a group with no
    scored variants produces no rows.
    """
    columns = ['Exon', 'Consequence', 'Function Type', 'Percent']
    records = [] #One dict per output row; turned into a DataFrame once at the end

    for (exon, var_type), group_df in df.groupby(['target', 'Consequence']):
        scores = group_df['snv_score'].dropna()
        total = len(scores)
        if total == 0: #nothing scored in this group, so no percentages to report
            continue

        non = int((scores <= path_max).sum()) #number of nonfunctional variants
        inter = int(((scores > path_max) & (scores < benign_min)).sum()) #number of intermediate variants
        same = total - non - inter #everything else is functional

        for function_type, count in (('Nonfunctional', non), ('Intermediate', inter), ('Functional', same)):
            records.append({
                'Exon': exon,
                'Consequence': var_type,
                'Function Type': function_type,
                'Percent': (count / total) * 100,
            })

    return pd.DataFrame(records, columns = columns)

In [None]:
def stacked_bars(df):
    """Show a stacked bar chart of functional-class percentages per exon.

    ``df`` must carry the columns ``Exon``, ``Percent``, ``Function Type`` and
    ``Consequence``. Bars are stacked by ``Function Type`` along an ordinal
    exon axis, with one facet column per ``Consequence`` value. The chart is
    displayed; nothing is returned.
    """
    ordered = df.sort_values(by = 'Exon')

    #Encoding channels, built up front so the chart definition stays short
    exon_axis = alt.X('Exon:O', axis = alt.Axis(labelAngle = 0))
    class_color = alt.Color('Function Type', title = 'Functional Class')
    hover = [
        alt.Tooltip('Function Type', title = 'Functional Class: '),
        alt.Tooltip('Percent', title = 'Percent: '),
    ]

    bars = alt.Chart(ordered).mark_bar().encode(
        x = exon_axis,
        y = 'Percent',
        tooltip = hover,
        color = class_color,
    )

    faceted = bars.facet(column = 'Consequence:N')
    faceted.show()

In [None]:
def main():
    """Run the analysis end to end: load, relabel, reduce targets to exons, summarise, plot.

    Reads the module-level ``file``, ``path_max`` and ``benign_min`` settings.
    """
    data = read_data(file)
    reannotated = prep_data(data)
    #Each stage receives the value returned by the previous one, so the pipeline
    #does not depend on prep_data having modified `data` in place.
    num_exons = rewrite_targets(reannotated)
    all_stats = exon_stats(num_exons, path_max, benign_min)
    stacked_bars(all_stats)

In [None]:
#Runs the full pipeline and displays the stacked bar chart
main()