In [None]:
import pandas as pd
import altair as alt
from pathlib import Path
import numpy as np
import re
from natsort import natsorted

In [None]:
sge_file = '../Data/BARD1_SGE_final_table.xlsx' # Path to the SGE Excel workbook, relative to this notebook
alt.data_transformers.disable_max_rows() # Turn off Altair's max-rows check on chart data

In [None]:
def read_sge_data(file):
    """Load every table used by the figure from the SGE Excel workbook.

    The workbook is opened once and all required sheets are parsed in a
    single ``pd.read_excel`` call (a list of sheet names returns a dict of
    DataFrames keyed by sheet name).

    Parameters
    ----------
    file : str or path-like
        Path to the workbook. It must contain the sheets 'scores',
        'thresholds', 'cutsites', 'median_pos_depth' and 'min_aa_depth'.

    Returns
    -------
    tuple
        ``(snv_df, del_df, thresholds, cutsites, median_depth, aa_depth)``:
        - snv_df: 'scores' rows with var_type 'snv' whose variant_qc_flag
          is not 'WARN'
        - del_df: 'scores' rows with var_type '3bp_del'
        - thresholds: ``[min, max]`` taken from the first row of 'thresholds'
        - cutsites, median_depth, aa_depth: the remaining sheets, unmodified
    """
    sheets = pd.read_excel(
        file,
        sheet_name = ['scores', 'thresholds', 'cutsites', 'median_pos_depth', 'min_aa_depth']
    )

    # Rename predictor columns to the display names used downstream
    scores = sheets['scores'].rename(columns = {'am_score': 'AM Mean',
                                                'revel_score': 'REVEL Mean',
                                                'cadd_score': 'CADD',
                                                'MutPred2': 'MP2 Mean'
                                               })

    is_snv = scores['var_type'].isin(['snv'])
    qc_warned = scores['variant_qc_flag'].isin(['WARN'])
    snv_df = scores.loc[is_snv & ~qc_warned]

    del_df = scores.loc[scores['var_type'].isin(['3bp_del'])]

    # Only the first row of the thresholds sheet is used
    threshold_df = sheets['thresholds']
    thresholds = [threshold_df['min'].iloc[0], threshold_df['max'].iloc[0]]

    return (snv_df, del_df, thresholds,
            sheets['cutsites'], sheets['median_pos_depth'], sheets['min_aa_depth'])

In [None]:
def make_heatmap_input(df):
    """Build the long-format tables drawn by the heatmap.

    Parameters
    ----------
    df : pandas.DataFrame
        SNV score table. Columns read here: 'amino_acid_change',
        'consequence', 'score', 'max_SpliceAI', 'AM Mean', 'CADD',
        'REVEL Mean' and 'MP2 Mean'.

    Returns
    -------
    (heat_df, vep_final) : tuple of pandas.DataFrame
        heat_df holds one row per variant plus, for every residue, a
        'Mis. Min.' and a 'Mis. Mean' row summarising all variants that are
        not 'stop_gained'. Those summary rows have no 'AAsub' value (NaN).
        vep_final holds the per-residue mean of each predictor, with the
        predictor name stored in 'og_AA', 'AA_change' and 'AAsub'.
        Both frames use the columns
        ['AApos', 'og_AA', 'AA_change', 'score', 'AAsub'], and 'AApos' is an
        integer in every row.
    """
    # Variants whose amino-acid annotation contains '-' cannot be parsed below
    df = df.loc[~(df['amino_acid_change'].str.contains('-'))]

    df = df.rename(columns = {'amino_acid_change': 'AAsub'})

    df['og_AA'] = df['AAsub'].transform(lambda x: x[0]) # original amino acid (first character)
    df['AA_change'] = df['AAsub'].transform(lambda x: x[-1]) # substituted amino acid (last character)
    # Residue position is everything between the two letters. Cast to int
    # BEFORE any subset is taken so every derived table (missense summary,
    # predictor summary) shares the same integer positions.
    df['AApos'] = df['AAsub'].transform(lambda x: x[1: len(x)-1]).astype(int)

    # Everything except stop-gained variants feeds the per-residue min/mean rows
    mis_df = df.loc[~(df['consequence'].isin(['stop_gained']))]

    # Predictor summary: only variants with a MutPred2 score and SpliceAI <= 0.2
    vep_df = df.dropna(subset = ['MP2 Mean'])
    vep_df = vep_df.loc[vep_df['max_SpliceAI'] <= 0.2]

    vep_summary = vep_df.groupby('AApos').agg({
        'AM Mean': 'mean',
        'CADD': 'mean',
        'REVEL Mean': 'mean',
        'MP2 Mean': 'mean'
    }).reset_index()

    vep_melted = vep_summary.melt(
        id_vars = ['AApos'],
        var_name = 'Predictor',
        value_name = 'score'
    )

    # Reuse the predictor name as the row label so the frame matches the
    # column layout of the variant table
    vep_melted['og_AA'] = vep_melted['Predictor']
    vep_melted['AA_change'] = vep_melted['Predictor']
    vep_melted['AAsub'] = vep_melted['Predictor']

    vep_final = vep_melted[['AApos', 'og_AA', 'AA_change', 'score', 'AAsub']]

    df = df[['AApos', 'og_AA', 'AA_change', 'score', 'AAsub']]

    min_df = mis_df.groupby('AApos')['score'].min().reset_index()
    min_df['og_AA'] = 'Mis. Min.'
    min_df['AA_change'] = 'Mis. Min.'

    mean_df = mis_df.groupby('AApos')['score'].mean().reset_index()
    mean_df['og_AA'] = 'Mis. Mean'
    mean_df['AA_change'] = 'Mis. Mean'

    df = pd.concat([df, min_df, mean_df])

    return df, vep_final

In [None]:
def graph_rep_depth(df,median_depth):
    """Display per-replicate line plots of normalized depth along the CDS.

    `df` must provide 'pos', 'target', 'normdepth', 'amino_acid' and 'repl';
    `median_depth` supplies the 'pos' -> 'CDSpos' mapping. One facet row is
    drawn per value of 'repl'. The chart is displayed; nothing is returned.
    This function is not called by main() in this notebook.
    """
    # Attach the CDS coordinate to every depth row
    pos_to_cds = median_depth[['pos', 'CDSpos']]
    depth = pd.merge(df, pos_to_cds, on = 'pos', how = 'left')

    # Natural sort so the targets appear in a human-friendly order in the legend
    target_order = natsorted(set(depth['target'].tolist()))

    x_enc = alt.X('CDSpos',
                  axis = alt.Axis(title = 'CDS Position'),
                  scale = alt.Scale(domain = [-50, 2384]))
    y_enc = alt.Y('normdepth',
                  axis = alt.Axis(title = 'Normalized Depth'),
                  scale = alt.Scale(domain = [0, 1.03]))
    color_enc = alt.Color('target',
                          sort = target_order,
                          legend = alt.Legend(title = 'SGE Target', symbolLimit = 0))
    hover = [alt.Tooltip('target', title = 'SGE Target: '),
             alt.Tooltip('amino_acid', title = 'Amino Acid: '),
             alt.Tooltip('normdepth', title = 'Normalized Depth: ')]

    lines = alt.Chart(depth).mark_line(point = True).encode(
        x = x_enc,
        y = y_enc,
        color = color_enc,
        tooltip = hover
    ).properties(
        width = 1000,
        height = 400
    ).interactive()

    faceted = lines.facet('repl', columns = 1)
    faceted.display()

In [None]:
def graph_aapos_alphafold_rects(df, aa_df, cut_coords):
    """Build the median read-depth plot with structural annotation tracks.

    The returned chart layers four things: the per-target median depth
    lines, dashed rules marking cut sites, annotation rectangles (AlphaFold2
    structured/disordered regions, BARD1 domains, secondary structure of the
    RING/ARD/BRCT domains) and the rectangle labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Median depth per position. Columns used: 'target', 'pos', 'AApos',
        'median_depth', 'amino_acid'.
    aa_df : pandas.DataFrame
        Depth per amino acid. Columns used: 'target', 'AApos', 'median_depth'.
    cut_coords : pandas.DataFrame
        Cut sites; must contain 'AApos' so it can be joined to `aa_df`.

    Returns
    -------
    (plot, legend)
        `plot` is the layered chart WITHOUT any top-level `configure_*`
        call. `legend` is a stand-alone secondary-structure legend chart
        that carries its own configuration.
    """
    excluded_targets = ['BARD1_X4J'] #Target removed from final data
    df = df.loc[~(df['target'].isin(excluded_targets))]
    aa_df = aa_df.loc[~(aa_df['target'].isin(excluded_targets))]

    df = df.dropna(subset = ['AApos']) #Drops rows without amino acid position
    df = df.copy()
    aa_df = aa_df.copy()

    #Sets data types to int for plotting
    df['pos'] = df['pos'].astype(int)
    df['AApos'] = df['AApos'].astype(int)
    aa_df['AApos'] = aa_df['AApos'].astype(int)

    depth_only = aa_df[['median_depth', 'AApos']] #Pulls necessary columns

    #Gives every cut site the depth at its amino acid position (top of its rule)
    cds_cut_df = cut_coords.drop_duplicates()
    cds_cut_df = pd.merge(cds_cut_df, depth_only, on = 'AApos', how = 'inner')
    cds_cut_df = cds_cut_df.drop_duplicates()
    cds_cut_df['start_point'] = 0 #Sets start point for cutsite lines at y = 0

    target_order = natsorted(set(df['target'].tolist())) #Naturally sorted target list

    #Colour range for the targets: both palettes concatenated, duplicates dropped
    #while keeping first-seen order
    tableau20 = ['#4E79A7', '#F28E2C', '#E15759', '#76B7B2', '#59A14F',
                 '#EDC949', '#AF7AA1', '#FF9DA7', '#9C755F', '#BAB0AB',
                 '#4E79A7', '#F28E2C', '#E15759', '#76B7B2', '#59A14F',
                 '#EDC949', '#AF7AA1', '#FF9DA7', '#9C755F', '#BAB0AB']

    category20 = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
                  '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf',
                  '#aec7e8', '#ffbb78', '#98df8a', '#ff9896', '#c5b0d5',
                  '#c49c94', '#f7b6d2', '#c7c7c7', '#dbdb8d', '#9edae5']

    combined_colors = list(dict.fromkeys(tableau20 + category20))

    plot_domain = [1, 778]
    plot_ticks = list(range(0, 776, 25))

    #Builds median read depth plot
    plot = alt.Chart(df).mark_line(point = True).encode(
        x = alt.X('AApos',
                  axis = alt.Axis(title = '',
                                  labelFontSize = 16,
                                  titleFontSize = 20,
                                  values = plot_ticks,
                                 ),
                  scale = alt.Scale(domain = plot_domain)
                 ),
        y = alt.Y('median_depth',
                  axis = alt.Axis(title = 'Median Normalized Depth (D13)',
                                  values = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
                                  labelFontSize = 16,
                                  titleFontSize = 20
                                 ),
                  #Domain extends past 1.0 to leave room for the annotation tracks
                  scale = alt.Scale(domain = [0.05, 1.7])
                 ),
        color = alt.Color('target',
                          sort = target_order,
                          scale = alt.Scale(range = combined_colors[:34]),
                          legend = None
                         ),
        tooltip = [alt.Tooltip('target', title = 'SGE Target: '),
                   alt.Tooltip('amino_acid', title = 'Amino Acid: '),
                   alt.Tooltip('median_depth', title = 'Median Normalized Depth: ')]
    ).properties(
        width = 1750,
        height = 300,
        title = alt.TitleParams(text = 'Median Read Depth by Target',
                                fontSize = 22
                               )
    ).interactive()

    #Adds dashed rules to mark cutsites, running from y = 0 up to the depth at that residue
    ticks = alt.Chart(cds_cut_df).mark_rule(
        color='gray',
        strokeWidth = 2,
        strokeDash = [5,5]
    ).encode(
        x = alt.X('AApos:Q',
                  scale = alt.Scale(domain = plot_domain)
                 ),
        y = 'start_point',
        y2 = 'median_depth:Q'
    )

    #Annotation rectangles. Each track is described once (starts, ends, fill
    #colours, labels, bottom, top); building the table per track guarantees the
    #parallel columns always have matching lengths.
    y_max = 1

    #AlphaFold2 structure coordinates
    alphafold_x = [1, 26, 120, 141, 165, 210, 220, 421, 548, 565]
    alphafold_x2 = [26, 120, 141, 165, 210, 220, 421, 548, 565, 777]
    alphafold_colors = ['#b1b1b1', '#DEDBEE'] * 5
    alphafold_labels = ['', 'Structured', '', '', '', '', 'Disordered', '', '', '']

    #BARD1 domains
    domain_x = [26, 425, 568]
    domain_x2 = [122, 545, 777]
    domain_colors = ['#B9DBF4', '#C8DBC8', '#F6BF93']
    domain_labels = ['RING', 'ARD', 'BRCT']

    #RING secondary structure (1JM7)
    ring_x = [26, 34, 48, 61, 63, 68, 70, 74, 80, 97, 117]
    ring_x2 = [34, 48, 61, 63, 68, 70, 74, 80, 97, 117, 122]
    ring_colors = ['white', 'blue', 'white', 'green', 'white', 'green',
                   'white', 'blue', 'white', 'blue', 'white']

    #ARD secondary structure (3C5R)
    ard_x = [425, 430, 439, 441, 450, 463, 471, 473,
             483, 497, 504, 507, 516, 529, 534, 536]
    ard_x2 = [430, 439, 441, 450, 463, 471, 473, 483,
              497, 504, 507, 516, 529, 534, 536, 545]
    ard_colors = ['white', 'blue'] * 8

    #BRCT secondary structure (3FA2)
    brct_x = [568, 571, 574, 578, 592, 595, 597, 606, 608, 617, 626,
              629, 631, 643, 655, 666, 676, 679, 687, 698, 700, 702,
              712, 717, 735, 739, 750, 752, 755, 760, 770]
    brct_x2 = [571, 574, 578, 592, 595, 597, 606, 608, 617, 626, 629,
               631, 643, 655, 666, 676, 679, 687, 698, 700, 702, 712,
               717, 735, 739, 750, 752, 755, 760, 770, 777]
    brct_colors = ['white', 'green', 'white', 'blue', 'white', 'green', 'white', 'green',
                   'white', 'blue', 'white', 'green', 'blue', 'white', 'blue', 'white',
                   'green', 'white', 'blue', 'white', 'green', 'white', 'blue', 'white',
                   'green', 'white', 'green', 'white', 'green', 'blue', 'white']

    af_band = (y_max * 1.05, y_max * 1.2)     #AlphaFold2 track
    ss_band = (y_max * 1.25, y_max * 1.4)     #secondary-structure track
    domain_band = (y_max * 1.45, y_max * 1.6) #domain track

    tracks = [
        (alphafold_x, alphafold_x2, alphafold_colors, alphafold_labels, af_band),
        (domain_x, domain_x2, domain_colors, domain_labels, domain_band),
        (ring_x, ring_x2, ring_colors, [''] * len(ring_x), ss_band),
        (ard_x, ard_x2, ard_colors, [''] * len(ard_x), ss_band),
        (brct_x, brct_x2, brct_colors, [''] * len(brct_x), ss_band),
    ]

    rect_data = pd.concat(
        [pd.DataFrame({'x': starts,     # Start x-position of each rectangle
                       'x2': ends,      # End x-position of each rectangle
                       'y': band[0],    # Bottom of the track
                       'y2': band[1],   # Top of the track
                       'color': fills,
                       'label': labels})
         for starts, ends, fills, labels, band in tracks],
        ignore_index = True
    )

    rectangles = alt.Chart(rect_data).mark_rect(
        opacity=0.7,
        stroke = 'black',
        strokeWidth = 2
    ).encode(
        x = 'x:Q',
        x2 = 'x2:Q',
        y = 'y:Q',
        y2 = 'y2:Q',
        color = alt.Color('color:N', scale = None) #scale=None: use the literal colour values
    )

    #Labels are centred inside their rectangle
    text = alt.Chart(rect_data).mark_text(
        align='center',
        baseline='middle',
        fontWeight = 'bold',
        fontSize=24,
        angle=0,
        color='black',
        limit=1000
    ).encode(
        x=alt.X('x_center:Q'),
        y=alt.Y('y_center:Q'),
        text='label:N'
    ).transform_calculate(
        x_center='(datum.x + datum.x2) / 2',
        y_center='(datum.y + datum.y2) / 2'
    )

    #Stand-alone legend for the secondary-structure colours
    legend_data = pd.DataFrame({
        'label': ['α-Helix', 'β-Sheet', 'Loop'],
        'color': ['blue', 'green', 'white']
    })

    legend = alt.Chart(legend_data).mark_square(
        size=100,
        filled=True,
        stroke = 'black',
        strokeWidth = 1
    ).encode(
        x = alt.value(80),
        y=alt.Y('label:N', axis=alt.Axis(title=None, labelFontSize=16, orient = 'right', ticks = False, domain = False, labelAlign = 'left', labelPadding = 1),
               sort = ['α-Helix', 'β-Sheet', 'Loop']),
        color=alt.Color('color:N', scale=None, legend=None)
    ).properties(
        width=200,
        title=alt.TitleParams(text = '2ᵒ Structure', fontSize = 20)
    ).configure_axis(
        grid = False
    ).configure_view(
        stroke = None
    )

    #No configure_* call here: this chart is returned unconfigured and any
    #chart-level configuration is left to the caller.
    plot = alt.layer(plot, ticks, rectangles, text).encode(
        x = alt.X(scale = alt.Scale(domain = plot_domain))
    )

    return plot, legend

In [None]:
def deletions(df):
    """Plot 3bp deletions as horizontal rules spanning their residue range.

    Parameters
    ----------
    df : pandas.DataFrame
        Deletion score table. Columns used: 'amino_acid_change',
        'CDS_position' (a 'start-end' string), 'consequence' and 'score'.

    Returns
    -------
    Altair chart with one rule per deletion, x from 'ps_aa_start' to
    'ps_aa_end', y at the SGE score, coloured by consequence.

    Rows with amino_acid_change '---' or with '?' as either CDS coordinate
    are dropped. Each filter is followed by `.copy()` so the column
    assignments below operate on an independent frame rather than on a
    filtered slice of the caller's data.
    """
    df = df.loc[~(df['amino_acid_change'].isin(['---']))].copy()

    #Split 'start-end' into its two coordinates (still strings at this point)
    df['cds_start'] = df['CDS_position'].transform(lambda x: x.split('-')[0])
    df['cds_end'] = df['CDS_position'].transform(lambda x: x.split('-')[1])

    #'?' marks a coordinate that cannot be converted to a number
    resolved = ~df['cds_start'].isin(['?']) & ~df['cds_end'].isin(['?'])
    df = df.loc[resolved].copy()

    df['cds_start'] = df['cds_start'].astype(int)
    df['cds_end'] = df['cds_end'].astype(int)
    #Fractional residue coordinate: CDS bases 1, 2, 3 -> 1.0, 1.33, 1.67
    df['ps_aa_start'] = round((df['cds_start'] + 2) / 3, 2)
    df['ps_aa_end'] = round((df['cds_end'] + 2) / 3, 2)

    #Human-readable consequence labels for the legend
    df.loc[df['consequence'] == 'inframe_indel', 'consequence'] = 'Inframe Indel'
    df.loc[df['consequence'] == 'stop_gained', 'consequence'] = 'Stop Gained'
    df.loc[df['consequence'] == 'start_lost', 'consequence'] = 'Start Lost'
    df.loc[df['consequence'] == 'stop_lost', 'consequence'] = 'Stop Lost'

    #palette[i] is the colour of variant_types[i]
    palette = [
        '#81B4C7', # dusty blue
        '#ffc721', # yellow
        '#888888', # med gray
        '#000000'  # black
    ]

    variant_types = [
        'Inframe Indel',
        'Stop Gained',
        'Stop Lost',
        'Start Lost',
    ]

    fig = alt.Chart(df).mark_rule(strokeWidth = 10).encode(
        x = alt.X('ps_aa_start',
                  title = '',
                  axis = alt.Axis(
                      titleFontSize = 20,
                      labelFontSize = 16,
                      ticks = False,
                      labels = False
                  ),
                  scale = alt.Scale(domain = [0, 778])
                 ),
        x2 = 'ps_aa_end',
        y = alt.Y('score',
                  title = 'SGE Score',
                  axis = alt.Axis(
                      titleFontSize = 20,
                      labelFontSize = 16
                  )
                 ),
        color = alt.Color('consequence',
                          scale = alt.Scale(
                              range = palette,
                              domain = variant_types
                          ),
                          legend = alt.Legend(title = 'Consequence',
                                              labelFontSize = 16,
                                              titleFontSize = 20
                                             )
                         )
    ).properties(
        width = 1750,
        height = 300,
        title = alt.TitleParams(text = '3bp Deletions in BARD1',
                                fontSize = 22
                               )
    )

    return fig

In [None]:
def dels_wrect(df):
    """Plot 3bp deletions as points under a BARD1 domain annotation track.

    Only 'inframe_indel' and 'stop_gained' deletions are drawn; the two are
    distinguished by point shape. Domain rectangles (RING, ARD, BRCT) sit
    above the data and dashed vertical rules mark each domain boundary.

    Parameters
    ----------
    df : pandas.DataFrame
        Deletion score table. Columns used: 'amino_acid_change',
        'CDS_position' (a 'start-end' string), 'consequence' and 'score'.

    Returns
    -------
    Interactive layered Altair chart (points, rectangles, labels, rules).

    Each filter is followed by `.copy()` so the column assignments below
    operate on an independent frame rather than on a filtered slice.
    """
    df = df.loc[~(df['amino_acid_change'].isin(['---']))].copy()

    #Split 'start-end' into its two coordinates (still strings at this point)
    df['cds_start'] = df['CDS_position'].transform(lambda x: x.split('-')[0])
    df['cds_end'] = df['CDS_position'].transform(lambda x: x.split('-')[1])

    #'?' marks a coordinate that cannot be converted to a number
    resolved = ~df['cds_start'].isin(['?']) & ~df['cds_end'].isin(['?'])
    df = df.loc[resolved].copy()

    df['cds_start'] = df['cds_start'].astype(int)
    df['cds_end'] = df['cds_end'].astype(int)
    #Fractional residue coordinate: CDS bases 1, 2, 3 -> 1.0, 1.33, 1.67
    df['ps_aa_start'] = round((df['cds_start'] + 2) / 3, 2)
    df['ps_aa_end'] = round((df['cds_end'] + 2) / 3, 2)

    df = df.loc[df['consequence'].isin(['inframe_indel', 'stop_gained'])].copy()
    df.loc[df['consequence'] == 'inframe_indel', 'consequence'] = 'Inframe Indel'
    df.loc[df['consequence'] == 'stop_gained', 'consequence'] = 'Stop Gained'

    size_map  = {'Inframe Indel': 5,
                 'Stop Gained': 10,
                 'Start Lost': 5,
                 'Stop Lost': 5
                }

    #Kept in the chart data; no encoding below currently reads 'rule_size'
    df['rule_size'] = df['consequence'].map(size_map)

    #Domain annotation track, drawn just above the score range
    y_max = 0.075
    track_bottom = y_max + .05
    track_top = y_max + 0.15
    domain_starts = [26, 425, 568]
    domain_ends = [122, 545, 777]

    rect_data = pd.DataFrame({'x': domain_starts,
                              'x2': domain_ends,
                              'y': [track_bottom] * 3,
                              'y2': [track_top] * 3,
                              'color': ['#B9DBF4', '#C8DBC8', '#F6BF93'],
                              'label': ['RING', 'ARD', 'BRCT']
                             })

    #One dashed rule per domain boundary, from the top of the track down to the axis minimum
    lines_df = pd.DataFrame({'x': domain_starts + domain_ends,
                             'y': [track_top] * 6,
                             'y2': [-0.6] * 6})

    rectangles = alt.Chart(rect_data).mark_rect(
        opacity=1,
        stroke = 'black',
        strokeWidth = 2
    ).encode(
        x = 'x:Q',
        x2 = 'x2:Q',
        y = 'y:Q',
        y2 = 'y2:Q',
        color = alt.Color('color:N', scale = None) #scale=None: use the literal colour values
    )

    #Labels are centred inside their rectangle
    text = alt.Chart(rect_data).mark_text(
        align='center',
        baseline='middle',
        fontWeight = 'bold',
        fontSize=24,
        angle=0,
        color='black',
        limit=1000
    ).encode(
        x=alt.X('x_center:Q'),
        y=alt.Y('y_center:Q'),
        text='label:N'
    ).transform_calculate(
        x_center='(datum.x + datum.x2) / 2',
        y_center='(datum.y + datum.y2) / 2'
    )

    #NOTE(review): `thickness` is documented for tick marks; confirm it has the
    #intended effect on a rule (strokeWidth may be what was meant).
    lines = alt.Chart(lines_df).mark_rule(
        strokeDash = [5,5],
        thickness = 25,
        color = 'black'
    ).encode(
        x = alt.X('x:Q',
                  axis = alt.Axis(values = list(range(0, 781, 50))),
                  scale = alt.Scale(domain = [0, 780])
                 ),
        y = alt.Y('y:Q'),
        y2 = 'y2:Q'
    )

    fig = alt.Chart(df).mark_point(color = 'black').encode(
        x = alt.X('ps_aa_start',
                  title = '',
                  axis = alt.Axis(
                      titleFontSize = 20,
                      labelFontSize = 16,
                      ticks = False,
                      labels = False
                  ),
                  scale = alt.Scale(domain = [0, 780],
                                    nice = False
                                   )
                 ),
        y = alt.Y('score',
                  title = 'SGE Score',
                  axis = alt.Axis(
                      titleFontSize = 20,
                      labelFontSize = 16,
                      values = [-0.6,-0.5, -0.4, -0.3, -0.2, -0.1, 0, 0.1]
                  ),
                  #Upper bound leaves room for the domain track above the data
                  scale = alt.Scale(domain = [-0.6, 0.25])
                 ),
        shape = alt.Shape('consequence',
                          scale = alt.Scale(domain = ['Inframe Indel', 'Stop Gained'],
                                            range = ['circle', 'diamond']
                                           ),
                          legend = alt.Legend(title = 'Consequence',
                                              labelFontSize = 16,
                                              titleFontSize = 20
                                             )
                         )
    ).properties(
        width = 1750,
        height = 300,
        title = alt.TitleParams(text = '3bp Deletions in BARD1',
                                fontSize = 22
                               )
    )

    fig = alt.layer(fig, rectangles, text, lines).interactive()

    return fig

In [None]:
def heatmap(df,vep_df, thresholds):
    """Draw the residue-by-substitution heatmap with predictor rows.

    Two rect layers share the x (residue) and y (row label) encodings but
    use independent colour scales: SGE scores from `df` and predictor scores
    from `vep_df` (rows labelled 'CADD' are excluded). Dashed vertical rules
    at the domain boundaries are layered on top.

    `thresholds` is accepted but not used by this function.
    """
    # NOTE(review): 'Stop', 'Min.' and 'Mean' are listed here, while the
    # summary rows built in make_heatmap_input are labelled 'Mis. Min.' and
    # 'Mis. Mean' - confirm the intended row order is actually applied.
    row_order = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', 'Stop', 'Min.', 'Mean']

    def residue_axis():
        # x encoding shared by both heatmap layers: one bin per residue, no axis labels
        return alt.X('AApos:Q',
                     title = '',
                     axis = alt.Axis(labels = False, ticks = False),
                     scale = alt.Scale(domain = [0,778]),
                     bin = alt.Bin(maxbins = 778, minstep = 1))

    def substitution_axis():
        # y encoding shared by both heatmap layers
        return alt.Y('AA_change',
                     title = 'Amino Acid Substitution',
                     axis = alt.Axis(labelFontSize = 16, titleFontSize = 20),
                     sort = row_order)

    def score_legend():
        return alt.Legend(titleFontSize = 20, labelFontSize = 16)

    # Dashed rules at the start and end of each domain
    boundary_df = pd.DataFrame({'x': [26, 425, 568, 122, 545, 777]})
    boundaries = alt.Chart(boundary_df).mark_rule(
        opacity = 1,
        strokeDash = [5,5],
        thickness = 25,
        color = 'black'
    ).encode(
        x = alt.X('x:Q', scale = alt.Scale(domain = [0, 778]))
    )

    # Leave out the '*778*' substitution
    sge_rows = df.loc[~(df['AAsub'].isin(['*778*']))]

    sge_layer = alt.Chart(sge_rows).mark_rect().encode(
        x = residue_axis(),
        y = substitution_axis(),
        color = alt.Color('score',
                          title = 'SGE Score',
                          scale = alt.Scale(domain = [-0.2, 0],
                                            clamp = True,
                                            scheme = 'bluepurple',
                                            reverse = True),
                          legend = score_legend())
    ).properties(
        height = 800,
        width = 1750
    )

    predictor_rows = vep_df.loc[~vep_df['AA_change'].isin(['CADD'])]

    predictor_layer = alt.Chart(predictor_rows).mark_rect().encode(
        x = residue_axis(),
        y = substitution_axis(),
        color = alt.Color('score',
                          title = 'Predictor Score',
                          scale = alt.Scale(domain = [0,1],
                                            clamp = True,
                                            scheme = 'bluepurple'),
                          legend = score_legend())
    ).properties(
        height = 800,
        width = 1750
    )

    # Independent colour scales: SGE and predictor scores live on different ranges
    both_layers = alt.layer(sge_layer, predictor_layer).resolve_scale(
        color = 'independent'
    )

    return alt.layer(both_layers, boundaries)

In [None]:
def phylop_plot(df):
    """Draw a smoothed phyloP track along the protein.

    For every residue the maximum 'phyloP' among its variants is taken, then
    a rolling mean over an 11-residue window (5 on each side) is computed
    separately within each exon. Dashed vertical rules at the domain
    boundaries are layered on top. Rows whose 'amino_acid_change' is '---'
    are ignored.
    """
    annotated = df.loc[~df['amino_acid_change'].isin(['---'])].copy()

    # Residue number = the annotation without its first and last character
    annotated['AApos'] = annotated['amino_acid_change'].transform(lambda sub: int(sub[1: len(sub)-1]))

    per_residue = annotated.groupby('AApos').agg({
        'phyloP': 'max',
        'exon': 'first'
    }).reset_index()

    # Dashed rules at the start and end of each domain; this layer also
    # carries the x-axis title for the track
    boundary_df = pd.DataFrame({'AApos': [26, 425, 568, 122, 545, 777]})
    boundary_x = alt.X('AApos:Q',
                       axis = alt.Axis(title = 'Amino Acid Position',
                                       titleFontSize = 20,
                                       labelFontSize = 16),
                       scale = alt.Scale(domain = [0, 778]))
    boundaries = alt.Chart(boundary_df).mark_rule(
        opacity = 1,
        strokeDash = [5,5],
        thickness = 25,
        color = 'black'
    ).encode(x = boundary_x)

    track_x = alt.X('AApos:Q',
                    axis = alt.Axis(values = list(range(0, 778, 50))),
                    scale = alt.Scale(domain = [0,778]))
    track_y = alt.Y('rolling_mean:Q',
                    title = '',
                    axis = alt.Axis(labelFontSize = 16))

    track = alt.Chart(per_residue).mark_line(
        size = 2,
        color = 'black',
    ).transform_window(
        rolling_mean = 'mean(phyloP)',
        frame = [-5,5],
        groupby = ['exon']
    ).encode(
        x = track_x,
        y = track_y
    ).properties(
        height = 25,
        width = 1750
    )

    return alt.layer(track, boundaries)

In [None]:
def combine_plots(depth, dels, heat):
    """Stack the depth plot, deletion plot and heatmap into one figure.

    The panels are concatenated vertically with a shared x scale and
    independent colour scales; grid lines and the view border are switched
    off for the whole figure.
    """
    stacked = alt.vconcat(depth, dels, heat, spacing = 3)
    stacked = stacked.resolve_scale(x = 'shared', color = 'independent')

    final_plot = stacked.configure_axis(grid = False).configure_view(stroke = None)
    return final_plot

In [None]:
def short_combine(dels, heat, phylop):
    """Stack the deletion plot, heatmap and phyloP track into one figure.

    The panels are concatenated vertically with a shared x scale and
    independent colour scales; grid lines and the view border are switched
    off for the whole figure.
    """
    stacked = alt.vconcat(dels, heat, phylop, spacing = 3)
    stacked = stacked.resolve_scale(x = 'shared', color = 'independent')

    final_plot = stacked.configure_axis(grid = False).configure_view(stroke = None)
    return final_plot

In [None]:
def main():
    """Load the SGE workbook, build every panel and display the short figure.

    Two composite figures are assembled: the full one (median read depth,
    3bp deletions, heatmap) and the short one (3bp deletions with domain
    track, heatmap, phyloP). Only the short figure is displayed.
    """
    (snv_scores, deletion_scores, thresholds,
     cutsites, median_depth, min_aa_depth) = read_sge_data(sge_file)
    heatmap_df, vep_df = make_heatmap_input(snv_scores)

    # Individual panels
    del_plot = deletions(deletion_scores)
    del_plot_wdomain = dels_wrect(deletion_scores)
    depth_plot, depth_legend = graph_aapos_alphafold_rects(median_depth, min_aa_depth, cutsites)
    heat = heatmap(heatmap_df, vep_df, thresholds)
    phylop_line = phylop_plot(snv_scores)

    # Composite figures
    combined_plot = combine_plots(depth_plot, del_plot, heat)        # depth + 3bp dels + heatmap
    short_plot = short_combine(del_plot_wdomain, heat, phylop_line)  # 3bp dels + heatmap + phyloP

    short_plot.display()

    # combined_plot and depth_legend (legend for the median read depth plot)
    # are built but not shown; call .display() or .save(<path>, ppi = 400)
    # on them when those outputs are needed.

In [None]:
main() # Build all panels and display the short figure