In [None]:
import pandas as pd
import altair as alt

In [None]:
# Flag kept from the original notebook; none of the functions visible in this notebook read it.
combine = True

# Folder that holds the input workbooks (absolute, machine-specific location).
_input_dir = '/Users/ivan/Library/CloudStorage/OneDrive-UW/Research/Miscellaneous/BARD1_Figure_Data_Inputs/'

# Workbook handed to read_scores() as the main score file.
most_scores = _input_dir + '20241004_BARD1_PillarProject_DataFreeze.xlsx'
# Workbook handed to read_scores() as the source of the additional BARD1_X1B rows.
one_b = _input_dir + '20240828_BARD1_AllScores.xlsx'

In [None]:
def read_scores(file, file_2):
    """Load SGE scores from two Excel workbooks and stack them into one DataFrame.

    Parameters
    ----------
    file : path-like
        Workbook read in full (the main score set).
    file_2 : path-like
        Workbook from which only rows whose ``target`` is ``'BARD1_X1B'`` are taken.

    Returns
    -------
    pandas.DataFrame
        Columns ``target``, ``pos``, ``Consequence`` and ``snv_score_minmax``;
        rows from ``file`` first, then the ``BARD1_X1B`` rows from ``file_2``.
        Each part keeps the row index it had in its workbook (no re-indexing).
    """
    wanted_cols = ['target', 'pos', 'Consequence', 'snv_score_minmax']

    def _within_plot_range(frame):
        # Keep only -2 < score <= 2: extreme outliers stop the plots from
        # displaying correctly.
        scores = frame['snv_score_minmax']
        return frame.loc[scores.gt(-2) & scores.le(2)]

    # Main score set: relevant columns only, outliers removed.
    primary = _within_plot_range(pd.read_excel(file)[wanted_cols])

    # Extra scores: only the BARD1_X1B region is pulled from the second workbook.
    supplemental = pd.read_excel(file_2)
    is_x1b = supplemental['target'].isin(['BARD1_X1B'])
    supplemental = _within_plot_range(supplemental.loc[is_x1b, wanted_cols])

    return pd.concat([primary, supplemental])

In [None]:
# Default destination of the RING-domain export written by prep_data. The
# machine-specific path is kept so existing calls behave exactly as before.
_RING_EXPORT_PATH = '/Users/ivan/Desktop/20241021_BARD1_RING_Domain_PillarPjct_Plus1B.xlsx'


def prep_data(df, export_path=_RING_EXPORT_PATH):
    """Annotate each variant with its BARD1 protein domain and tidy consequence labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``target`` (SGE region name) and ``Consequence``.
        The input frame is not modified.
    export_path : path-like or None, optional
        Where the RING-domain subset is written with ``DataFrame.to_excel``.
        Defaults to the path used by the original notebook; pass ``None`` to
        skip the export (e.g. on another machine).

    Returns
    -------
    pandas.DataFrame
        Only rows whose ``target`` belongs to the RING, ANK Repeat or BRCT
        regions, re-indexed from 0, with a new ``Domain`` column and with
        ``Consequence`` values renamed to short human-readable labels.
    """
    # SGE regions grouped by the protein domain they cover.
    domain_targets = {
        'RING': ['BARD1_X1B', 'BARD1_X2', 'BARD1_X3A', 'BARD1_X3B'],
        'ANK Repeat': ['BARD1_X4L', 'BARD1_X5A', 'BARD1_X5B', 'BARD1_X6A',
                       'BARD1_X6B', 'BARD1_X6C', 'BARD1_X7A', 'BARD1_X7B'],
        'BRCT': ['BARD1_X8A', 'BARD1_X8B', 'BARD1_X9A', 'BARD1_X9B', 'BARD1_X10A',
                 'BARD1_X10B', 'BARD1_X11A', 'BARD1_X11B', 'BARD1_X11C',
                 # The original list only had 'BARD1_11D' (no "X"), which looks
                 # like a typo given every other region name; both spellings are
                 # accepted so BARD1_X11D rows are no longer dropped.
                 'BARD1_X11D', 'BARD1_11D'],
    }
    target_to_domain = {
        target: domain
        for domain, targets in domain_targets.items()
        for target in targets
    }

    # Raw consequence terms -> labels shown in the figures. Terms not listed
    # here are left unchanged.
    consequence_labels = {
        'missense_variant': 'Missense',
        'synonymous_variant': 'Synonymous',
        'intron_variant': 'Intron',
        'stop_gained': 'Stop',
        'stop_lost': 'Stop',
        'stop_retained_variant': 'Stop',
        'splice_polypyrimidine_tract_variant': 'Splice',
        'splice_region_variant': 'Splice',
        'splice_acceptor_variant': 'Splice',
        'splice_donor_region_variant': 'Splice',
        'splice_donor_5th_base_variant': 'Splice',
        'splice_donor_variant': 'Splice',
        '3_prime_UTR_variant': "3' UTR",
        '5_prime_UTR_variant': "5' UTR",
        'start_lost': 'Start',
    }

    # Work on a copy so the caller's DataFrame does not gain a 'Domain' column.
    annotated = df.copy()
    annotated['Domain'] = annotated['target'].map(target_to_domain)

    # Keep only variants that fall in one of the three domains; clean up indices.
    annotated = annotated.loc[annotated['Domain'].notna()].reset_index(drop=True)

    # Export the RING-domain rows. This happens before the consequence labels
    # are renamed, so the exported file carries the raw consequence terms.
    if export_path is not None:
        annotated.loc[annotated['Domain'] == 'RING'].to_excel(export_path)

    annotated['Consequence'] = annotated['Consequence'].replace(consequence_labels)

    return annotated

In [None]:
def histogram(df):
    """Show a histogram of SGE scores, overall and faceted by protein domain.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``snv_score_minmax``, ``Consequence`` and ``Domain``.

    Two charts are displayed with ``.show()``: the overall histogram, then the
    same histogram faceted by ``Domain``. Nothing is returned.
    """
    alt.data_transformers.disable_max_rows()  # lift Altair's row limit for large frames

    n_bins = 50                       # upper bound on the number of bins
    score_domain = [-2, 2]            # x-axis extent
    tick_values = list(range(-2, 3))  # -2..2 inclusive, so the upper bound gets a tick too

    # Order of the consequence classes in the colour legend.
    legend_order = ["Intron", "Missense", "Synonymous", "Stop", "Splice", "Start", "5' UTR", "3' UTR"]

    chart = alt.Chart(df).mark_bar().encode(
        alt.X(
            'snv_score_minmax',
            axis=alt.Axis(values=tick_values, title='SGE Score', labelFontSize=16, titleFontSize=20),
            bin=alt.Bin(maxbins=n_bins),
            scale=alt.Scale(domain=score_domain),
        ),
        alt.Y('count()', axis=alt.Axis(title='Number of Variants', labelFontSize=16, titleFontSize=20)),
        color=alt.Color(
            'Consequence:N',
            scale=alt.Scale(scheme='category10'),
            sort=legend_order,
            legend=alt.Legend(titleFontSize=16, labelFontSize=14),
        ),
    ).properties(
        width=800,
        height=400,
        # The chart title text is left empty, as in the original figure.
        title=alt.TitleParams(text='', fontSize=22),
    )

    chart.show()

    # Same histogram split into one panel per domain. The original built this
    # chart but never displayed it.
    faceted = chart.facet(
        alt.Facet(
            'Domain:N',
            sort=['RING', 'ANK Repeat', 'BRCT'],
            title='Distribution of BARD1 SGE Scores by Functional Domain',
        ),
        columns=2,
    ).configure_header(
        titleFontSize=24,
        labelFontSize=20,
    )

    faceted.show()



In [None]:
def interact_histogram(df):
    """Show an SGE-score histogram filtered by a domain chosen from a dropdown.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``snv_score_minmax``, ``Consequence`` and ``Domain``.
        The dropdown options are the unique values of ``Domain``.

    The chart is displayed with ``.show()``; nothing is returned.
    """
    alt.data_transformers.disable_max_rows()  # lift Altair's row limit for large frames

    # Dropdown-bound selection on the 'Domain' column; starts on 'RING'.
    domain_picker = alt.selection_point(
        fields=['Domain'],
        bind=alt.binding_select(options=df['Domain'].unique().tolist(), name='Select Domain: '),
        value='RING',
    )

    score_domain = [-2, 2]            # x-axis extent
    tick_values = list(range(-2, 3))  # -2..2 inclusive, so the upper bound gets a tick too

    # Order of the consequence classes in the colour legend.
    legend_order = ["Intron", "Missense", "Synonymous", "Stop", "Splice", "Start", "5' UTR", "3' UTR"]

    chart = alt.Chart(df).mark_bar().encode(
        alt.X(
            'snv_score_minmax',
            axis=alt.Axis(values=tick_values, title='SGE Score', labelFontSize=16, titleFontSize=20),
            bin=alt.Bin(step=0.05),  # fixed bin width rather than a bin count
            scale=alt.Scale(domain=score_domain),
        ),
        alt.Y('count()', axis=alt.Axis(title='Number of Variants', labelFontSize=16, titleFontSize=20)),
        color=alt.Color(
            'Consequence:N',
            scale=alt.Scale(scheme='category10'),
            sort=legend_order,
            legend=alt.Legend(titleFontSize=16, labelFontSize=14),
        ),
    ).add_params(
        domain_picker
    ).transform_filter(
        domain_picker  # only rows matching the selected domain are binned
    ).properties(
        width=800,
        height=400,
        # The chart title text is left empty, as in the original figure.
        title=alt.TitleParams(text='', fontSize=22),
    )

    #chart.save('interactive_histogram.html')
    chart.show()

In [None]:
def violinplot(df):
    """Show one violin (mirrored density area) of SGE scores per protein domain.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``snv_score_minmax`` and ``Domain``.

    The chart is displayed with ``.show()``; nothing is returned.
    """
    alt.data_transformers.disable_max_rows()

    domain_order = ['RING', 'ANK Repeat', 'BRCT']  # left-to-right order of the violins

    # Encodings, built separately so the chart assembly below stays short.
    score_axis = alt.Y('snv_score_minmax:Q', axis=alt.Axis(title='SGE Score'))
    density_axis = alt.X(
        'density:Q',
        stack='center',  # centre the area around the axis to get the violin shape
        impute=None,
        title=None,
        axis=alt.Axis(labels=False, values=[0], grid=False, ticks=True),
    )
    domain_color = alt.Color('Domain:N', sort=domain_order)
    domain_column = alt.Column(
        'Domain:N',
        sort=domain_order,
        header=alt.Header(
            titleOrient='bottom',
            labelOrient='bottom',
            labelPadding=0,
        ),
    )

    # Density estimate of the scores, computed separately for each domain
    # over the score range -1.75 to 1.5.
    densities = alt.Chart(df).transform_density(
        'snv_score_minmax',
        as_=['snv_score_minmax', 'density'],
        extent=[-1.75, 1.5],
        groupby=['Domain'],
    )

    violin = densities.mark_area(orient='horizontal').encode(
        x=density_axis,
        y=score_axis,
        color=domain_color,
        column=domain_column,
    )
    violin = violin.properties(
        height=500,
        width=250,
        title=alt.TitleParams(text='', fontSize=12),
    )
    # No gap between the columns and no frame around each panel.
    violin = violin.configure_facet(spacing=0).configure_view(stroke=None)

    violin.show()

In [None]:
def main():
    """Load the scores, annotate them with domains, and render every figure.

    Reads the two workbooks named by the module-level ``most_scores`` and
    ``one_b`` paths, passes the combined scores through ``prep_data``, then
    draws the histogram, the violin plot and the interactive histogram, in
    that order.
    """
    scores = read_scores(most_scores, one_b)
    annotated = prep_data(scores)

    # Each plotting function takes the annotated frame and displays its chart.
    for render in (histogram, violinplot, interact_histogram):
        render(annotated)


In [None]:
# Run the whole pipeline: read the scores, annotate domains, and display all figures.
main()