In [None]:
import pandas as pd
import altair as alt

In [None]:
#Relative path to the BARD1 score table (TSV); main() hands it to read_scores
scores = '../Data/QC_dev_data/BARD1.scores.eval.tsv'
#Turns off Altair's max-rows check so data frames of any length can be charted
alt.data_transformers.disable_max_rows()

In [None]:
def read_scores(file):
    """Load a tab-separated score table and keep the columns used for plotting.

    Parameters
    ----------
    file : str, path or file-like
        Anything pandas.read_csv accepts; the content is parsed with a tab
        separator.

    Returns
    -------
    pandas.DataFrame
        The columns 'exon', 'pos', 'Consequence' and 'snv_score_minmax', in
        that order. 'Consequence' is the input column 'simplified_consequence'
        and 'snv_score_minmax' is the input column 'score'.
    """
    #input column name -> name used by the rest of the notebook
    renamed_columns = {'simplified_consequence': 'Consequence', 'score': 'snv_score_minmax'}
    #columns carried forward, in output order
    kept_columns = ['exon', 'pos', 'Consequence', 'snv_score_minmax']

    table = pd.read_csv(file, sep = '\t')
    return table.rename(columns = renamed_columns)[kept_columns]

In [None]:
def prep_data(df):
    """Label each variant with its BARD1 domain and tidy the consequence names.

    Rows whose 'pos' lies in none of the RING, ADR or BRCT ranges are dropped,
    the index is renumbered from 0, and 'Consequence' values are replaced by
    short display labels ('Missense', 'Synonymous', 'Intron', 'Stop Gained',
    'Stop Lost', 'Splice'). Values matching none of the rules, including
    missing values, are left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'pos' column and a 'Consequence' column. The frame is
        copied first, so the caller's object is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with an added 'Domain' column ('RING', 'BRCT' or 'ADR').
    """
    #Inclusive (start, end) ranges of 'pos' values for each domain.
    #Labels are applied in the order listed here.
    domain_ranges = {
        'RING': [(214809412,214809494),(214797061, 214797117), (214792298,214792445)],
        'BRCT': [(214745722,214745830),(214745067,214745159),(214730411,214730508),(214728685,214729008)],
        'ADR': [(214780560,214780601),(214769232,214769312),(214767482,214767654),(214752486,214752555)],
    }

    df = df.copy() #work on a copy so the caller's frame does not gain a 'Domain' column
    df['Domain'] = None #creates empty column for domain

    #assigns domains for each datapoint
    for label, ranges in domain_ranges.items():
        positions = [pos for start, end in ranges for pos in range(start, end + 1)]
        df.loc[df['pos'].isin(positions), 'Domain'] = label

    df = df.loc[df['Domain'].isin(list(domain_ranges))] #gets data just from domains
    df = df.reset_index(drop = True) #cleans up indices

    #Renames variant effects to be more human readable.
    #Order matters: the missense rule runs first, so a value containing both
    #'missense' and 'splic' ends up as 'Missense'.
    #na=False keeps rows with a missing Consequence out of the mask; without it
    #the mask contains NaN and the .loc assignment raises.
    df.loc[df['Consequence'].str.contains('missense', na = False), 'Consequence'] = 'Missense'

    exact_labels = {
        'synonymous_variant': 'Synonymous',
        'intron_variant': 'Intron',
        'stop_gained': 'Stop Gained',
        'stop_lost': 'Stop Lost',
    }
    for raw_value, label in exact_labels.items():
        df.loc[df['Consequence'] == raw_value, 'Consequence'] = label

    df.loc[df['Consequence'].str.contains('splic', na = False), 'Consequence'] = 'Splice'

    return df

In [None]:
def histogram(df):
    """Draw a stacked histogram of SGE scores, then the same chart faceted by domain.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'snv_score_minmax', 'Consequence' and 'Domain'
        (the frame returned by prep_data).

    Returns
    -------
    None
        Both charts are displayed as a side effect.
    """
    bins = 50 #number of bins

    #Order for the legend. The entries have to match the Consequence labels
    #written by prep_data exactly; a label that is not listed here is not
    #covered by this explicit ordering.
    consequence_order = ['Synonymous', 'Missense', 'Stop Gained', 'Stop Lost', 'Splice', 'Intron']

    #Builds histogram for variants in structured domains
    #NOTE(review): the title text is empty, so no title is drawn; put a string
    #in TitleParams(text = ...) if one is wanted.
    chart = alt.Chart(df).mark_bar().encode(
            alt.X('snv_score_minmax', axis = alt.Axis(title = 'SGE Score', labelFontSize = 16, titleFontSize = 20), bin = alt.Bin(maxbins = bins)),
            alt.Y('count()', axis = alt.Axis(title = 'Number of Variants', labelFontSize = 16, titleFontSize = 20)),
            color = alt.Color('Consequence:N', scale = alt.Scale(scheme = 'category10'), sort = consequence_order, legend = alt.Legend(titleFontSize = 16, labelFontSize = 14))
    ).properties(
        width = 800,
        height = 400,
        title = alt.TitleParams(text = '', fontSize = 22)
    )

    chart.show()

    #facets histogram by domain: one panel per domain, three panels per row
    faceted = chart.facet(
        alt.Facet('Domain:N',
                  sort = ['RING', 'ADR', 'BRCT'],
                  title = 'Distribution of BARD1 SGE Scores by Functional Domain'
    ),columns = 3).configure_header(
        titleFontSize = 24,
        labelFontSize = 20
    )

    faceted.display()


In [None]:
def interact_histogram(df):
    """Draw an interactive histogram of SGE scores filtered by a domain dropdown.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'snv_score_minmax', 'Consequence' and 'Domain'
        (the frame returned by prep_data). The dropdown options are the
        distinct values found in 'Domain'.

    Returns
    -------
    None
        The chart is displayed as a side effect.
    """
    #Builds domain selection dropdown menu
    #NOTE(review): point selections usually take their initial value as a
    #mapping such as {'Domain': 'RING'}; confirm that the bare string
    #preselects RING in the Altair version in use.
    category_selection = alt.selection_point(
        fields=['Domain'],  # Column to filter by
        bind=alt.binding_select(options=df['Domain'].unique().tolist(), name='Select Domain: '), # Dropdown menu
        value= 'RING'  # Initial value for the selection
    )

    bins = 50 #number of bins

    #Order for the legend. The entries have to match the Consequence labels
    #written by prep_data exactly; a label that is not listed here is not
    #covered by this explicit ordering.
    consequence_order = ['Synonymous', 'Missense', 'Stop Gained', 'Stop Lost', 'Splice', 'Intron']

    #Builds the histogram; only rows matching the dropdown selection are counted
    chart = alt.Chart(df).mark_bar().encode(
        alt.X('snv_score_minmax', axis = alt.Axis(title = 'SGE Score', labelFontSize = 16, titleFontSize = 20), bin = alt.Bin(maxbins = bins)),
        alt.Y('count()', axis = alt.Axis(title = 'Number of Variants', labelFontSize = 16, titleFontSize = 20)),
        color = alt.Color('Consequence:N', scale = alt.Scale(scheme = 'category10'), sort = consequence_order, legend = alt.Legend(titleFontSize = 16, labelFontSize = 14))
        ).add_params(
            category_selection
        ).transform_filter(category_selection
        ).properties(
            width = 800,
            height = 400,
            title = alt.TitleParams(text = '', fontSize = 22)
        ).interactive()

    #chart.save('interactive_histogram.html')
    chart.show()

In [None]:
def violinplot(df):
    """Draw one violin of SGE scores per domain, side by side.

    A density of 'snv_score_minmax' is computed separately for each 'Domain'
    over the fixed score range -0.6 to 0.15 and drawn as a centred horizontal
    area, one column per domain.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'snv_score_minmax' and 'Domain'.

    Returns
    -------
    None
        The chart is displayed as a side effect.
    """
    domain_order = ['RING', 'ADR', 'BRCT'] #left-to-right order of the violins

    #Per-domain density estimate of the score
    densities = alt.Chart(df).transform_density(
        'snv_score_minmax',
        as_ = ['snv_score_minmax', 'density'],
        extent = [-0.6, 0.15],
        groupby = ['Domain'],
    )

    #Encoding channels, built separately so the chart call below stays short
    score_channel = alt.Y('snv_score_minmax:Q', axis = alt.Axis(title = 'SGE Score'))
    domain_color = alt.Color('Domain:N', sort = domain_order)
    density_channel = alt.X(
        'density:Q',
        stack = 'center', #centres each area on its baseline, giving the violin outline
        impute = None,
        title = None,
        axis = alt.Axis(labels = False, values = [0], grid = False, ticks = True),
    )
    domain_columns = alt.Column(
        'Domain:N',
        sort = domain_order,
        header = alt.Header(titleOrient = 'bottom', labelOrient = 'bottom', labelPadding = 0),
    )

    panels = densities.mark_area(orient = 'horizontal').encode(
        y = score_channel,
        color = domain_color,
        x = density_channel,
        column = domain_columns,
    ).properties(
        height = 500,
        width = 250,
        title = alt.TitleParams(text = '', fontSize = 12),
    )

    #No gap between the columns and no frame around each one
    violin = panels.interactive().configure_facet(spacing = 0).configure_view(stroke = None)

    violin.show()

In [None]:
def main():
    """Run the whole notebook: load the scores, annotate domains, draw every plot."""
    domain_variants = prep_data(read_scores(scores))

    #Drawn in this order: histograms, violin plot, interactive histogram
    for draw in (histogram, violinplot, interact_histogram):
        draw(domain_variants)


In [None]:
#Runs the full load -> annotate -> plot pipeline defined in main()
main()