In [None]:
import pandas as pd
import altair as alt

In [None]:
# Saturation genome editing (SGE) inputs: one (gene label, path to .tsv) pair per gene.
sge_genes = [
    ('BARD1', '../Data/filtered_ppj_data/SGE/BARD1.tsv'),
    ('BRCA2', '../Data/filtered_ppj_data/SGE/BRCA2.tsv'),
    ('CTCF', '../Data/filtered_ppj_data/SGE/CTCF.tsv'),
    ('PALB2', '../Data/filtered_ppj_data/SGE/PALB2.tsv'),
    ('RAD51D', '../Data/filtered_ppj_data/SGE/RAD51D.tsv'),
    ('SFPQ', '../Data/filtered_ppj_data/SGE/SFPQ.tsv'),
    ('XRCC2', '../Data/filtered_ppj_data/SGE/XRCC2.tsv'),
]

# VAMP-seq inputs: one (dataset label, path to .csv) pair per dataset.
vampseq = [
    ('F9 Ab001', '../Data/filtered_ppj_data/VAMPseq/F9_ab001.csv'),
    ('F9 Ab102', '../Data/filtered_ppj_data/VAMPseq/F9_ab102.csv'),
    ('F9 Ab124', '../Data/filtered_ppj_data/VAMPseq/F9_ab124.csv'),
    ('F9 Ab3570', '../Data/filtered_ppj_data/VAMPseq/F9_ab3570.csv'),
    ('F9 Strep', '../Data/filtered_ppj_data/VAMPseq/F9_strep.csv'),
    ('G6PD', '../Data/filtered_ppj_data/VAMPseq/G6PD_scores_consequence.csv'),
    ('TSC2', '../Data/filtered_ppj_data/VAMPseq/TSC2_scores_consequences.csv'),
]

# Turn off Altair's maximum-row check so frames of any length can be charted.
alt.data_transformers.disable_max_rows()

In [None]:
def _score_nearest_density(df, consequence_label, density_column, target_value):
    """Return the score of the row in one functional class whose density is closest to the target."""
    class_rows = df.loc[df['functional_consequence'].isin([consequence_label])]
    if class_rows.empty:
        # idxmin() on an empty Series also raises ValueError, but with a message
        # that gives no hint about which class was missing.
        raise ValueError(
            f"cannot derive a threshold: no rows with functional_consequence == '{consequence_label}'"
        )

    # Label of the row whose density is nearest to the target value.
    nearest_label = (class_rows[density_column] - target_value).abs().idxmin()
    return class_rows.loc[nearest_label]['score']


def get_gmm_threshold(df, target_value = 0.950):
    """Find the lower and upper score thresholds from the GMM density columns.

    The lower threshold is the ``score`` of the ``functionally_abnormal`` row
    whose ``gmm_density_abnormal`` is closest to ``target_value``; the upper
    threshold is the ``score`` of the ``functionally_normal`` row whose
    ``gmm_density_normal`` is closest to ``target_value``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``functional_consequence``, ``score``,
        ``gmm_density_normal`` and ``gmm_density_abnormal``.
    target_value : float, optional
        Density value to match against (default 0.950).

    Returns
    -------
    list
        ``[lower_threshold, upper_threshold]``.

    Raises
    ------
    ValueError
        If ``df`` has no ``functionally_abnormal`` or no ``functionally_normal`` rows.
    """
    lwrthresh = _score_nearest_density(
        df, 'functionally_abnormal', 'gmm_density_abnormal', target_value
    )
    upprthresh = _score_nearest_density(
        df, 'functionally_normal', 'gmm_density_normal', target_value
    )

    return [lwrthresh, upprthresh]

In [None]:
def _relabel_sge_consequences(df):
    """Rewrite raw SGE consequence strings in ``df['Consequence']`` to display labels, in place."""
    # Rules are applied top to bottom, so the order matters when a raw value
    # matches more than one pattern.
    relabel_rules = [
        ('contains', 'missense', 'Missense'),
        ('equals', 'synonymous_variant', 'Synonymous'),
        ('equals', 'intron_variant', 'Intron'),
        ('equals', 'stop_gained', 'Stop Gained'),
        ('equals', 'stop_lost', 'Stop Lost'),
        ('contains', 'site', 'Canonical Splice'),
        ('contains', 'ing_var', 'Splice Region'),
        ('contains', 'UTR', 'UTR Variant'),
        ('equals', 'start_lost', 'Start Lost'),
    ]

    for match_kind, pattern, label in relabel_rules:
        if match_kind == 'contains':
            # na=False: a missing consequence matches nothing instead of putting
            # NaN into the mask, which .loc cannot index with.
            mask = df['Consequence'].str.contains(pattern, na = False)
        else:
            mask = df['Consequence'] == pattern
        df.loc[mask, 'Consequence'] = label

    # Applied last: any variant whose ref allele is longer than one character is
    # labelled as a deletion, overriding whatever label it received above.
    df.loc[df['ref'].str.len() > 1, 'Consequence'] = '3bp Deletion'


def read_data(sge, vampseq):
    """Load the SGE and VAMP-seq score tables and harmonise their consequence labels.

    Parameters
    ----------
    sge : iterable of (str, str)
        ``(gene_name, path)`` pairs; each path is read as a tab-separated file
        that must contain ``variant_qc_flag``, ``consequence`` and ``ref``.
    vampseq : iterable of (str, str)
        ``(dataset_name, path)`` pairs; each path is read as a comma-separated
        file. Its ``average_score`` and ``type`` columns are renamed to
        ``score`` and ``Consequence``.

    Returns
    -------
    tuple of (dict, dict)
        ``(sge_dict, vampseq_dict)``, each mapping name -> DataFrame.
        SGE frames keep only rows with ``variant_qc_flag == 'PASS'``;
        VAMP-seq frames have ``'Nonsense'`` relabelled to ``'Stop Gained'``
        and ``'Deletion'`` rows removed.
    """
    sge_dict = {}
    for gene_name, data_path in sge:
        df = pd.read_csv(data_path, sep = '\t')
        # .copy() gives an independent frame so the label assignments below do
        # not write into a slice of the frame that was just read.
        df = df.loc[df['variant_qc_flag'].isin(['PASS'])].copy()
        df = df.rename(columns = {'consequence': 'Consequence'})
        _relabel_sge_consequences(df)

        sge_dict[gene_name] = df

    vampseq_dict = {}
    for gene_name, data_path in vampseq:
        gene_df = pd.read_csv(data_path)
        gene_df = gene_df.rename(columns = {'average_score': 'score', 'type': 'Consequence'})
        gene_df.loc[gene_df['Consequence'] == 'Nonsense', 'Consequence'] = 'Stop Gained'
        # .copy() so callers receive a standalone frame rather than a filtered slice.
        gene_df = gene_df.loc[~(gene_df['Consequence'].isin(['Deletion']))].copy()
        vampseq_dict[gene_name] = gene_df

    return sge_dict, vampseq_dict

In [None]:
def sge_histogram(df, gene_name, thresholds, thresholding = True): #makes histogram
    """Build an Altair histogram of SGE functional scores for one gene.

    Bars are coloured by ``Consequence``. A point selection bound to the legend
    drives bar opacity (1 for selected categories, 0.2 otherwise).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``score`` and ``Consequence`` columns. Rows whose
        ``Consequence`` is not one of the labels coloured below are dropped
        before plotting.
    gene_name : str
        Shown in the chart title.
    thresholds : sequence
        ``thresholds[0]`` is drawn as a red vertical rule and ``thresholds[1]``
        as a blue one. Only read when ``thresholding`` is true.
    thresholding : bool, optional
        Whether to overlay the two threshold rules (default True).

    Returns
    -------
    The histogram chart, or a layered chart of histogram + rules when
    ``thresholding`` is true.
    """
    # NOTE(review): n is counted before the consequence filter below, so it can
    # exceed the number of variants actually drawn — confirm this is intended.
    final_tital = f' {gene_name} (n = {len(df)})'
    bins = 50 #number of bins

    selection = alt.selection_point(fields=['Consequence'], bind='legend')

    # Consequence label -> bar colour. Insertion order is the colour-scale
    # domain order, so label and colour cannot drift out of step.
    consequence_colors = {
        'Synonymous': '#006616',        # dark green
        'Missense': '#81B4C7',          # dusty blue
        'Stop Gained': '#ffcd3a',       # yellow
        'Intron': '#6AA84F',            # med green
        'UTR Variant': '#93C47D',       # light green
        'Stop Lost': '#888888',         # med gray
        'Start Lost': '#000000',        # black
        'Canonical Splice': '#1170AA',  # darker blue
        'Splice Region': '#CFCFCF',     # light gray
        '3bp Deletion': '#FF9A00',      # orange
    }
    variant_types = list(consequence_colors.keys())
    palette = list(consequence_colors.values())

    df = df.loc[df['Consequence'].isin(variant_types)]

    # Builds histogram with interactive legend
    histogram = alt.Chart(df).mark_bar().encode(
            alt.X('score',
                  axis = alt.Axis(title = 'Functional Score',
                                  labelFontSize = 16,
                                  titleFontSize = 20,
                                  values = [-0.6, -0.5, -0.4, -0.3, -0.2, -0.1, 0, 0.1, 0.2]
                                 ),
                  bin = alt.Bin(maxbins = bins)),
            alt.Y('count()',
                  axis = alt.Axis(title = 'Number of Variants',
                                  labelFontSize = 16,
                                  titleFontSize = 20)),
            color = alt.Color('Consequence:N',
                             scale = alt.Scale(range = palette,
                                               domain = variant_types),
                             legend = alt.Legend(titleFontSize = 16,
                                                 labelFontSize = 14,
                                                 orient = 'top-left',
                                                 offset = 10)),
            opacity=alt.condition(selection, alt.value(1), alt.value(0.2))  # Highlight selected categories
    ).add_params(
        selection
    ).properties(
        width = 400,
        height = 300,
        title = alt.TitleParams(text = final_tital, fontSize = 22)
    ).interactive()

    if not thresholding:
        return histogram

    nf_line = alt.Chart(pd.DataFrame({'Functional Score': [thresholds[0]]})).mark_rule(color = 'red').encode(
        x = 'Functional Score')

    func_line = alt.Chart(pd.DataFrame({'Functional Score': [thresholds[1]]})).mark_rule(color = 'blue').encode(
        x = 'Functional Score')

    return alt.layer(histogram, nf_line, func_line)
    

In [None]:
def vamp_histogram(df, gene_name, thresholds, thresholding = True):
    """Build an Altair histogram of VAMP-seq scores for one dataset.

    Bars are coloured by ``Consequence``. A point selection bound to the legend
    drives bar opacity (1 for selected categories, 0.2 otherwise).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``score`` and ``Consequence`` columns. The frame is
        charted as passed in; no rows are filtered here.
    gene_name : str
        Shown in the chart title together with the row count of ``df``.
    thresholds : sequence
        ``thresholds[0]`` is drawn as a red vertical rule and ``thresholds[1]``
        as a blue one. Only read when ``thresholding`` is true.
    thresholding : bool, optional
        Whether to overlay the two threshold rules (default True).

    Returns
    -------
    The histogram chart, or a layered chart of histogram + rules when
    ``thresholding`` is true.
    """
    final_tital = f'{gene_name} (n = {len(df)})' #used to build title
    bins = 50 #number of bins

    selection = alt.selection_point(fields=['Consequence'], bind='legend')

    # Consequence label -> bar colour. Insertion order is the colour-scale
    # domain order, so label and colour cannot drift out of step.
    consequence_colors = {
        'Synonymous': '#006616',   # dark green
        'Missense': '#81B4C7',     # dusty blue
        'Stop Gained': '#ffcd3a',  # yellow
    }
    variant_types = list(consequence_colors.keys())
    palette = list(consequence_colors.values())

    # Builds histogram with interactive legend
    histogram = alt.Chart(df).mark_bar().encode(
            alt.X('score',
                  axis = alt.Axis(title = 'Functional Score',
                                  labelFontSize = 16,
                                  titleFontSize = 20,
                                  values = [-0.2, 0, 0.2, 0.4, 0.6, 0.8, 1, 1.2, 1.4, 1.6]
                                 ),
                  bin = alt.Bin(maxbins = bins)),
            alt.Y('count()',
                  axis = alt.Axis(title = 'Number of Variants',
                                  labelFontSize = 16,
                                  titleFontSize = 20)),
            color = alt.Color('Consequence:N',
                             scale = alt.Scale(range = palette,
                                               domain = variant_types),
                             legend = alt.Legend(titleFontSize = 16,
                                                 labelFontSize = 14,
                                                 orient = 'top-left',
                                                 offset = 10)),
            opacity=alt.condition(selection, alt.value(1), alt.value(0.2))  # Highlight selected categories
    ).add_params(
        selection
    ).properties(
        width = 600,
        height = 400,
        title = alt.TitleParams(text = final_tital, fontSize = 22)
    ).interactive()

    if not thresholding:
        return histogram

    nf_line = alt.Chart(pd.DataFrame({'Functional Score': [thresholds[0]]})).mark_rule(color = 'red').encode(
        x = 'Functional Score')

    func_line = alt.Chart(pd.DataFrame({'Functional Score': [thresholds[1]]})).mark_rule(color = 'blue').encode(
        x = 'Functional Score')

    return alt.layer(histogram, nf_line, func_line)
    

In [None]:
def sge_strip_plot(df, thresholds, thresholding = True):
    """Build an Altair tick (strip) plot of SGE scores, one row per consequence.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``score`` and ``Consequence`` columns. Rows whose
        ``Consequence`` is not one of the labels coloured below are dropped.
        The caller's frame is not modified.
    thresholds : sequence
        ``thresholds[0]`` is drawn as a red vertical rule and ``thresholds[1]``
        as a blue one. Only read when ``thresholding`` is true.
    thresholding : bool, optional
        Whether to overlay the two threshold rules (default True).

    Returns
    -------
    The tick chart, or a layered chart of ticks + rules when ``thresholding``
    is true.
    """
    # Consequence label -> tick colour. Insertion order is both the colour-scale
    # domain order and the row order on the y axis.
    consequence_colors = {
        'Synonymous': '#006616',        # dark green
        'Missense': '#81B4C7',          # dusty blue
        'Stop Gained': '#ffcd3a',       # yellow
        'Intron': '#6AA84F',            # med green
        'UTR Variant': '#93C47D',       # light green
        'Stop Lost': '#888888',         # med gray
        'Start Lost': '#000000',        # black
        'Canonical Splice': '#1170AA',  # darker blue
        'Splice Region': '#CFCFCF',     # light gray
        '3bp Deletion': '#FF9A00',      # orange
    }
    variant_types = list(consequence_colors.keys())
    palette = list(consequence_colors.values())

    # .copy() so the column assignment below writes to an independent frame
    # instead of a slice of the caller's data (avoids SettingWithCopyWarning).
    df = df.loc[df['Consequence'].isin(variant_types)].copy()
    df['Consequence'] = pd.Categorical(df['Consequence'],
                                       categories=variant_types,
                                       ordered=True)

    plot = alt.Chart(df).mark_tick(opacity = 1).encode(
        x = alt.X('score:Q',
                  axis = alt.Axis(title = '',
                                  titleFontSize = 18,
                                  labelFontSize = 16,
                                  values = [-0.6, -0.5, -0.4, -0.3, -0.2, -0.1, 0, 0.1, 0.2])
                 ),
        y = alt.Y('Consequence:N',
                  sort = variant_types,
                  axis = alt.Axis(title = '',
                                  labelFontSize = 24)
                 ),
        color = alt.Color('Consequence:N',
                legend=None,
                scale = alt.Scale(range = palette,
                                  domain = variant_types)
                         )
        ).properties(
            width = 800,
            height = 400
        ).interactive()

    if not thresholding:
        return plot

    # Rules are built only when requested, so placeholder thresholds are never indexed.
    nf_line = alt.Chart(pd.DataFrame({'SGE Score': [thresholds[0]]})).mark_rule(color = 'red').encode(
        x = 'SGE Score')

    func_line = alt.Chart(pd.DataFrame({'SGE Score': [thresholds[1]]})).mark_rule(color = 'blue').encode(
        x = 'SGE Score')

    return alt.layer(plot, nf_line, func_line).resolve_scale(y = 'shared')
        

In [None]:
def vamp_strip(df, thresholds, thresholding = True):
    """Build an Altair tick (strip) plot of VAMP-seq scores, one row per consequence.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``score`` and ``Consequence`` columns. The caller's frame
        is not modified; the categorical conversion is applied to a new frame.
    thresholds : sequence
        ``thresholds[0]`` is drawn as a red vertical rule and ``thresholds[1]``
        as a green one. Only read when ``thresholding`` is true.
    thresholding : bool, optional
        Whether to overlay the two threshold rules (default True).

    Returns
    -------
    The tick chart, or a layered chart of ticks + rules when ``thresholding``
    is true.
    """
    # Consequence label -> tick colour. Insertion order is both the colour-scale
    # domain order and the row order on the y axis.
    consequence_colors = {
        'Synonymous': '#006616',   # dark green
        'Missense': '#81B4C7',     # dusty blue
        'Stop Gained': '#ffcd3a',  # yellow
    }
    variant_types = list(consequence_colors.keys())
    palette = list(consequence_colors.values())

    # assign() returns a new frame. Writing the column onto `df` directly would
    # overwrite the caller's data, where labels outside `variant_types` become NaN.
    df = df.assign(Consequence = pd.Categorical(df['Consequence'],
                                                categories=variant_types,
                                                ordered=True))

    plot = alt.Chart(df).mark_tick(opacity = 1).encode(
        x = alt.X('score:Q',
                  axis = alt.Axis(title = '',
                                  titleFontSize = 18,
                                  labelFontSize = 16,
                                  values = [-0.2, 0, 0.2, 0.4, 0.6, 0.8, 1, 1.2, 1.4, 1.6])
                 ),
        y = alt.Y('Consequence:N',
                  sort = variant_types,
                  axis = alt.Axis(title = '',
                                  labelFontSize = 24)
                 ),
        color = alt.Color('Consequence:N',
                legend=None,
                scale = alt.Scale(range = palette,
                                  domain = variant_types)
                         )
        ).properties(
            width = 800,
            height = 400
        ).interactive()

    if not thresholding:
        return plot

    # Rules are built only when requested, so placeholder thresholds are never indexed.
    nf_line = alt.Chart(pd.DataFrame({'VAMP-seq Score': [thresholds[0]]})).mark_rule(color = 'red').encode(
        x = 'VAMP-seq Score')

    func_line = alt.Chart(pd.DataFrame({'VAMP-seq Score': [thresholds[1]]})).mark_rule(color = 'green').encode(
        x = 'VAMP-seq Score')

    return alt.layer(plot, nf_line, func_line).resolve_scale(y = 'shared')
        

In [None]:
def sge_plots(sge_data):
    """Build and display the SGE histogram for every gene.

    For each gene except ``'BRCA2'``, thresholds are computed with
    ``get_gmm_threshold`` on the gene's rows excluding ``'3bp Deletion'``,
    printed, and overlaid on the plots. ``'BRCA2'`` is plotted without
    threshold rules.

    Parameters
    ----------
    sge_data : dict
        Maps gene name -> DataFrame, as returned in the first element of
        ``read_data``.

    Returns
    -------
    None. Charts are rendered with ``display()``.
    """
    histograms = {}
    strip_plots = {}

    # Pass 1: build every chart (and print thresholds) before anything is displayed.
    for gene, df in sge_data.items():
        if gene == 'BRCA2':
            # Placeholder values; never drawn because thresholding is off.
            gene_thresholds = [0, 0]
            show_thresholds = False
        else:
            threshold_df = df.loc[~df['Consequence'].isin(['3bp Deletion'])]
            gene_thresholds = get_gmm_threshold(threshold_df)
            print(f'{gene} Thresholds are: {gene_thresholds}')
            show_thresholds = True

        histograms[gene] = sge_histogram(df, gene, gene_thresholds, thresholding = show_thresholds)
        strip_plots[gene] = sge_strip_plot(df, gene_thresholds, thresholding = show_thresholds)

    # Pass 2: style and display.
    for gene, histogram in histograms.items():
        # Only the histogram is shown. To stack the strip plot underneath, use
        # `histogram & strip_plots[gene]` here instead.
        final = histogram

        final = final.configure_axis(
            grid = False
        ).configure_view(
            stroke = None
        ).resolve_scale(
            x = 'shared'
        )

        # To export a figure, call final.save(<output path ending in .svg>).
        final.display()

In [None]:
def vampseq_figs(vampseq_data):
    """Build and display the VAMP-seq histogram for every dataset.

    No thresholds are computed here: every chart is built with
    ``thresholding = False`` and a placeholder threshold pair.

    Parameters
    ----------
    vampseq_data : dict
        Maps dataset name -> DataFrame, as returned in the second element of
        ``read_data``.

    Returns
    -------
    None. Charts are rendered with ``display()``.
    """
    histograms = {}
    strip_plots = {}

    # Pass 1: build every chart before anything is displayed.
    for gene, df in vampseq_data.items():
        # Placeholder values; never drawn because thresholding is off.
        gene_threshold = [0, 0]
        histograms[gene] = vamp_histogram(df, gene, gene_threshold, thresholding = False)
        strip_plots[gene] = vamp_strip(df, gene_threshold, thresholding = False)

    # Pass 2: style and display.
    for gene, histogram in histograms.items():
        # Only the histogram is shown. To stack the strip plot underneath, use
        # `histogram & strip_plots[gene]` here instead.
        final = histogram

        final = final.configure_axis(
            grid = False
        ).configure_view(
            stroke = None
        ).resolve_scale(
            x = 'shared'
        )

        # To export a figure, call final.save(<output path ending in .svg>).
        final.display()

        

In [None]:
def main():
    """Load both datasets, then render the SGE figures followed by the VAMP-seq figures."""
    sge_frames, vampseq_frames = read_data(sge_genes, vampseq)

    sge_plots(sge_frames)
    vampseq_figs(vampseq_frames)

In [None]:
# Run the whole pipeline: read the input tables and display every figure.
main()