In [None]:
import pandas as pd
import altair as alt
from pathlib import Path
import re
from natsort import natsorted

In [None]:
#Directory that process_depth() globs for per-replicate '*.tsv' depth files
read_depth_path = '../Data/depth_data/depth_files'
#Excel sheet with 'start'/'end' columns that read_input() expands into coding coordinates
gsp_input_file = '../Data/depth_data/deletion_inputs.xlsx'
#SGE score table; process_depth() reads its 'pos' and 'amino_acid_change' columns for annotation
sge_file = '../Data/20250813_BARD1scores_final_FILTERED.xlsx'
#Score table plotted by mini_heatmap()
#NOTE(review): absolute, user-specific path - the notebook will not run on another machine without editing this
heatmap_input = '/Users/ivan/Desktop/test_excel_outputs/20250821_BARD1scores_final_FILTERED_heatmap.xlsx'
#Workbook whose 'targets' sheet gives 'start'/'end' coordinates per SGE target
target_coords = '../Data/SNV_filtering_inputs/20250415_BARD1_filter_entry.xlsx'
#sgRNA cut-site table; read_input() indexes it by its 'target' column
cut_sites = '../Data/20241217_BARD1_sgRNA_cutsites.xlsx'
#Lifts Altair's default row limit so the full depth tables can be embedded in charts
alt.data_transformers.disable_max_rows()

In [None]:
def read_input(file, cut_sites): #Reads input file containing coordinates for all exons
    """Load the coding-sequence coordinates and the sgRNA cut-site table.

    Args:
        file: Path to an Excel file with one row per interval and
            'start' / 'end' columns (both bounds are inclusive).
        cut_sites: Path to an Excel file with a 'target' column.

    Returns:
        (cds_coords, cut_site_table) where cds_coords is a flat list of every
        coordinate covered by the intervals, in file order, and
        cut_site_table is the cut-site sheet indexed by 'target'.
    """
    exon_table = pd.read_excel(file) #One row per interval

    #Expands every inclusive [start, end] interval into individual coordinates
    cds_coords = []
    for first, last in zip(exon_table['start'], exon_table['end']):
        cds_coords.extend(range(first, last + 1))

    #Cut sites are looked up by target name downstream
    cut_site_table = pd.read_excel(cut_sites).set_index('target')

    return cds_coords, cut_site_table

In [None]:
def process_depth(depth_path, sge,coding_coords, target_coords, cut_df): #Processes all depth files in directory and annotates them
    """Read every depth file in a directory and return one annotated table.

    Each '*.tsv' file is expected to be named
    '<gene>_<region>_<replicates>_D13.depth.tsv' and to hold three columns
    (region string, 1-based offset, depth).

    Args:
        depth_path: Directory containing the depth files.
        sge: Path to the SGE score Excel file ('pos', 'amino_acid_change').
        coding_coords: Iterable of genomic coordinates to keep.
        target_coords: Path to an Excel workbook whose 'targets' sheet has
            'target', 'start' and 'end' columns.
        cut_df: DataFrame indexed by target name with a 'pos' column
            (cut-site coordinate).

    Returns:
        DataFrame with one row per (file, position) restricted to the target's
        edited region and to coding_coords, with columns region, offset,
        depth, normdepth, pos, target, exon, repl, day, cut_site_distance and
        amino_acid (NaN where the score table has no annotation).

    Raises:
        ValueError: no depth files were found, a file name does not follow the
            expected pattern, or its replicate label is not recognised.
    """
    #Pooled-replicate label in the file name -> replicate number
    replicate_labels = {
        'R1R4': 'R1', 'R1R2R3': 'R1',
        'R2R5': 'R2', 'R4R5R6': 'R2',
        'R3R6': 'R3', 'R7R8R9': 'R3',
    }
    columns = ['region', 'offset', 'depth'] #Column names for renaming depth files

    depth_files = sorted(Path(depth_path).glob('*.tsv')) #Gets all depth files
    if not depth_files:
        raise ValueError(f'No .tsv depth files found in {depth_path}')

    #SGE target coordinates, looked up by target name
    target_coordinates = pd.read_excel(target_coords, sheet_name = 'targets').set_index('target')

    all_dfs = [] #Holds one processed dataframe per file

    for file in depth_files:
        #Parses '<gene>_<region>_<replicates>' from the file name (separator independent)
        name_match = re.search(r'^(.+)_D13\.depth\.tsv$', file.name)
        if name_match is None:
            raise ValueError(f'Unexpected depth file name: {file.name}')

        region_rep_split = name_match.group(1).split('_')
        if len(region_rep_split) < 3:
            raise ValueError(f'Unexpected depth file name: {file.name}')

        target = region_rep_split[0] + '_' + region_rep_split[1] #Target name
        exon = region_rep_split[1][1:-1] or '2' #Strips leading letter and trailing suffix; empty only for X2

        full_rep = region_rep_split[2]
        if full_rep not in replicate_labels:
            #Previously this left 'rep' unbound or silently reused the previous file's value
            raise ValueError(f'Unrecognised replicate label {full_rep!r} in {file.name}')
        rep = replicate_labels[full_rep]

        #NOTE(review): read_csv treats the first line as a header; if the depth files
        #have no header row the first position of every file is lost - confirm
        df = pd.read_csv(file, sep = '\t')
        df = df.set_axis(columns, axis = 1) #Renames columns

        df['normdepth'] = df['depth'] / df['depth'].max() #Depth as a proportion of the file's maximum

        #Amplicon start is taken from the 'chr:start-end' style region string
        amplicon_start = int(re.findall(r':(\d+)-', df['region'].iloc[0])[0])
        df['pos'] = amplicon_start + df['offset'] - 1 #Genomic coordinate of every row

        #Sets columns with identifying information
        df['target'] = target
        df['exon'] = exon
        df['repl'] = rep
        df['day'] = 'D13'

        #'end'/'start' are flipped because the gene is antisense
        region_start = target_coordinates.loc[target, 'end']
        region_end = target_coordinates.loc[target, 'start']
        cut_site = cut_df.loc[target, 'pos']

        #Keeps the edited region only; .copy() so the new column is not written to a slice
        df = df.loc[df['pos'].between(region_start, region_end)].copy()
        df['cut_site_distance'] = -(df['pos'] - cut_site)

        all_dfs.append(df)

    final_df = pd.concat(all_dfs) #All dataframes concatenated
    final_df = final_df.loc[final_df['pos'].isin(coding_coords)] #Coding sequence only

    raw_sge = pd.read_excel(sge) #Reads SGE score file
    annotation_df = pd.DataFrame({
        'pos': raw_sge['pos'],
        'amino_acid': raw_sge['amino_acid_change'].str[:-1], #Drops the substituted residue
    })
    annotation_df = annotation_df.loc[annotation_df['amino_acid'] != '--'] #Drops rows without amino acid
    #One annotation per position, so the merge cannot multiply depth rows. This replaces
    #the old post-merge dedup on pos + depth + normdepth, whose sum is not unique and
    #could drop real rows from other replicates or targets.
    annotation_df = annotation_df.drop_duplicates(subset = 'pos', keep = 'first')

    final_df = pd.merge(final_df, annotation_df, on = 'pos', how = 'left')

    return final_df

In [None]:
def process_read_depth(df): #Depth dataframe processed for visualization
    """Collapse replicate-level depth to per-position and per-residue summaries.

    Args:
        df: Replicate-level depth table with columns 'pos', 'target', 'exon',
            'amino_acid', 'normdepth' and 'cut_site_distance'.
            NOTE: df is modified in place - 'pos' is converted to str and a
            'target_id' column is added. graph_rep_depth() merges on the
            string 'pos', so this side effect is kept.

    Returns:
        (median_depth, min_depth_aa_level)
        median_depth: one row per target/position with the median normalized
            depth ('median_depth'), 'CDSpos' and 'AApos' (amino_acid without
            its first character).
        min_depth_aa_level: one row per 'AApos' holding the minimum
            'median_depth' across its positions.
    """
    df['pos'] = df['pos'].astype(str) #Sets 'pos' column to string datatype
    df['target_id'] = df['target'] + ':' + df['pos'] #Key for target-based collapsing to median

    #CDS numbering: positions are ordered numerically (not as strings, which would
    #mis-order coordinates with different digit counts) and numbered in reverse
    #because the gene is on the negative strand
    genomic_positions = sorted(df['pos'].unique(), key = int)
    cds_annotated = pd.DataFrame({
        'pos': genomic_positions,
        'CDSpos': range(len(genomic_positions), 0, -1),
    })

    #Collapses depth calculations to median based on shared target and position across replicates
    median_depth = df.groupby('target_id').agg({
        'normdepth': 'median',
        'pos': 'first',
        'target': 'first',
        'exon': 'first',
        'amino_acid': 'first',
        'cut_site_distance': 'first'
    })

    median_depth = pd.merge(median_depth, cds_annotated, on = 'pos', how = 'left') #Adds CDS position
    median_depth = median_depth.rename(columns = {'normdepth': 'median_depth'}) #depth column renamed

    median_depth['AApos'] = median_depth['amino_acid'].str[1:] #Residue number without the leading residue letter

    #Lowest median depth observed among the positions of each residue
    min_depth_aa_level = median_depth.groupby('AApos').agg({
        'median_depth': 'min',
        'target': 'first',
        'exon': 'first',
        'amino_acid': 'first',
    })
    min_depth_aa_level = min_depth_aa_level.reset_index(names = ['AApos'])

    return median_depth, min_depth_aa_level

In [None]:
def graph_rep_depth(df,median_depth): #Generates line plots pre-median faceted by replicate
    """Display normalized depth along the CDS, one facet row per replicate.

    Args:
        df: Replicate-level depth table ('pos', 'target', 'repl',
            'normdepth', 'amino_acid').
        median_depth: Table providing the 'pos' -> 'CDSpos' mapping; 'pos'
            must have the same dtype as df['pos'].

    Returns:
        None. The faceted chart is displayed.
    """
    #median_depth has one row per target and position, so the same position can appear
    #several times; de-duplicating keeps the left merge from multiplying depth rows
    cds_lookup = median_depth[['pos', 'CDSpos']].drop_duplicates()
    df = pd.merge(df, cds_lookup, on = 'pos', how = 'left')

    target_order = natsorted(set(df['target'].tolist())) #Natural order for the colour legend

    x_encoding = alt.X('CDSpos',
                       axis = alt.Axis(title = 'CDS Position'),
                       scale = alt.Scale(domain = [-50, 2384]))
    y_encoding = alt.Y('normdepth',
                       axis = alt.Axis(title = 'Normalized Depth'),
                       scale = alt.Scale(domain = [0, 1.03]))
    color_encoding = alt.Color('target',
                               sort = target_order,
                               legend = alt.Legend(title = 'SGE Target',
                                                   symbolLimit = 0))
    tooltips = [alt.Tooltip('target', title = 'SGE Target: '),
                alt.Tooltip('amino_acid', title = 'Amino Acid: '),
                alt.Tooltip('normdepth', title = 'Normalized Depth: ')]

    plot = alt.Chart(df).mark_line(point = True).encode(
        x = x_encoding,
        y = y_encoding,
        color = color_encoding,
        tooltip = tooltips
    ).properties(
        width = 1000,
        height = 400
    ).interactive()

    plot = plot.facet('repl', columns = 1) #One row per replicate
    plot.display()
In [None]:
def graph_median_depth(df, cut_coords): #Generates final line plot based on median collapsed depth
    """Display median normalized depth along the CDS with cut sites and domains.

    Args:
        df: Median-collapsed depth table ('pos', 'CDSpos', 'target',
            'amino_acid', 'median_depth'). It is copied, so the caller's
            frame is no longer modified.
        cut_coords: Cut-site table with an integer 'pos' column.

    Returns:
        None. The layered chart is displayed.
    """
    df = df.copy() #Avoids overwriting 'pos' on the caller's dataframe
    df['pos'] = df['pos'].astype(int)

    #Cut sites translated to CDS positions through the shared genomic coordinate
    cds_cut_df = pd.merge(cut_coords, df[['pos', 'CDSpos']], on = 'pos', how = 'inner')
    cds_cut_df = cds_cut_df.drop_duplicates()

    target_order = natsorted(set(df['target'].tolist()))

    tableau10 = ['#4E79A7', '#F28E2C', '#E15759', '#76B7B2', '#59A14F',
                 '#EDC949', '#AF7AA1', '#FF9DA7', '#9C755F', '#BAB0AB']

    category20 = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
                  '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf',
                  '#aec7e8', '#ffbb78', '#98df8a', '#ff9896', '#c5b0d5',
                  '#c49c94', '#f7b6d2', '#c7c7c7', '#dbdb8d', '#9edae5']

    #30 distinct colours (same list the de-duplicated tableau20 + category20 produced)
    combined_colors = tableau10 + category20

    plot_domain = [-25, 2359]
    depth_axis_values = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

    #Builds median read depth plot
    plot = alt.Chart(df).mark_line(point = True).encode(
        x = alt.X('CDSpos',
                  axis = alt.Axis(title = 'CDS Position',
                                  labelFontSize = 16,
                                  titleFontSize = 20),
                  scale = alt.Scale(domain = plot_domain)),
        y = alt.Y('median_depth',
                  axis = alt.Axis(title = 'Median Normalized Depth',
                                  values = depth_axis_values,
                                  labelFontSize = 16,
                                  titleFontSize = 20),
                  scale = alt.Scale(domain = [0.05, 1.15])),
        color = alt.Color('target',
                          sort = target_order,
                          scale = alt.Scale(range = combined_colors[:34]), #Slice exceeds the 30 available colours, so all are used
                          legend = None),
        tooltip = [alt.Tooltip('target', title = 'SGE Target: '),
                   alt.Tooltip('CDSpos', title = 'CDS Position: '),
                   alt.Tooltip('pos', title = 'Genomic Coordinate: '),
                   alt.Tooltip('amino_acid', title = 'Amino Acid: '),
                   alt.Tooltip('median_depth', title = 'Median Normalized Depth: ')]
    ).properties(
        width = 1700,
        height = 300,
        title = alt.TitleParams(text = 'Median Read Depth by Target',
                                fontSize = 22)
    ).interactive()

    #Adds ticks to mark cutsites
    ticks = alt.Chart(cds_cut_df).mark_tick(
        color = 'red',
        thickness = 2,
        size = 20,
        dy = 10
    ).encode(
        x = alt.X('CDSpos:Q',
                  scale = alt.Scale(domain = plot_domain)),
        y = alt.datum(0)
    )

    #Adds rectangles to specify BARD1 domains (CDS coordinates), drawn above the depth trace
    y_max = 1
    domain_rects = pd.DataFrame({
        'x': [76, 1273, 1696],       # Start x-position of each rectangle
        'x2': [363, 1636, 2331],     # End x-position of each rectangle
        'y': [y_max * 1.05] * 3,     # Bottom of rectangles
        'y2': [y_max * 1.15] * 3,    # Top of rectangles
        'color': ['#B9DBF4', '#C8DBC8', '#F6BF93']
    })
    rectangles = alt.Chart(domain_rects).mark_rect(
        opacity = 0.7,
        stroke = 'black',
        strokeWidth = 2
    ).encode(
        x = 'x:Q',
        x2 = 'x2:Q',
        y = 'y:Q',
        y2 = 'y2:Q',
        color = alt.Color('color:N', scale = None) #Colour column holds literal hex values
    )

    plot = alt.layer(plot, ticks, rectangles).encode(
        x = alt.X(scale = alt.Scale(domain = plot_domain))
    ).configure_axis(
        grid = False
    ).configure_view(
        stroke = None
    )

    #plot.save('/Users/ivan/Desktop/BARD1_draft_figs/fig_2a_dels.png', ppi = 500)
    plot.display()

In [None]:
def graph_aa_pos_median_depth(df, cut_coords): #Generates final line plot based on median collapsed depth
    """Display median normalized depth by amino-acid position.

    Args:
        df: Median-collapsed depth table ('pos', 'AApos', 'CDSpos', 'target',
            'amino_acid', 'median_depth'). Rows without 'AApos' are dropped.
        cut_coords: Cut-site table with an integer 'pos' column.

    Returns:
        None. The layered chart is displayed.
    """
    #Works on a private copy restricted to annotated residues
    df = df.dropna(subset = ['AApos']).copy()
    df['pos'] = df['pos'].astype(int)
    df['AApos'] = df['AApos'].astype(int)

    #Cut sites translated to residue numbers through the shared genomic coordinate
    cut_marks = pd.merge(cut_coords, df[['pos', 'AApos']], on = 'pos', how = 'inner').drop_duplicates()

    target_order = natsorted(set(df['target'].tolist()))

    tableau10 = ['#4E79A7', '#F28E2C', '#E15759', '#76B7B2', '#59A14F',
                 '#EDC949', '#AF7AA1', '#FF9DA7', '#9C755F', '#BAB0AB']
    category20 = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
                  '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf',
                  '#aec7e8', '#ffbb78', '#98df8a', '#ff9896', '#c5b0d5',
                  '#c49c94', '#f7b6d2', '#c7c7c7', '#dbdb8d', '#9edae5']
    palette = tableau10 + category20 #30 distinct colours

    plot_domain = [-10, 787]
    shared_x_scale = alt.Scale(domain = plot_domain)

    x_axis = alt.Axis(title = 'Amino Acid Position',
                      labelFontSize = 16,
                      titleFontSize = 20)
    y_axis = alt.Axis(title = 'Median Normalized Depth',
                      values = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
                      labelFontSize = 16,
                      titleFontSize = 20)
    hover = [alt.Tooltip('target', title = 'SGE Target: '),
             alt.Tooltip('CDSpos', title = 'CDS Position: '),
             alt.Tooltip('pos', title = 'Genomic Coordinate: '),
             alt.Tooltip('amino_acid', title = 'Amino Acid: '),
             alt.Tooltip('median_depth', title = 'Median Normalized Depth: ')]

    #Median read depth trace, one line per target
    depth_lines = alt.Chart(df).mark_line(point = True).encode(
        x = alt.X('AApos', axis = x_axis, scale = shared_x_scale),
        y = alt.Y('median_depth', axis = y_axis, scale = alt.Scale(domain = [0.05, 1.15])),
        color = alt.Color('target',
                          sort = target_order,
                          scale = alt.Scale(range = palette[:34]),
                          legend = None),
        tooltip = hover
    ).properties(
        width = 1700,
        height = 300,
        title = alt.TitleParams(text = 'Median Read Depth by Target', fontSize = 22)
    ).interactive()

    #Red ticks at the bottom of the panel mark the cut sites
    cut_ticks = alt.Chart(cut_marks).mark_tick(
        color = 'red', thickness = 2, size = 20, dy = 10
    ).encode(
        x = alt.X('AApos:Q', scale = alt.Scale(domain = plot_domain)),
        y = alt.datum(0)
    )

    #BARD1 domain boxes (residue numbers), drawn above the depth trace
    y_max = 1
    box_bottom = y_max * 1.05
    box_top = y_max * 1.15
    domain_boxes = pd.DataFrame({
        'x': [26, 425, 568],
        'x2': [122, 545, 777],
        'y': [box_bottom, box_bottom, box_bottom],
        'y2': [box_top, box_top, box_top],
        'color': ['#B9DBF4','#C8DBC8', '#F6BF93']
    })
    rectangles = alt.Chart(domain_boxes).mark_rect(
        opacity = 0.7, stroke = 'black', strokeWidth = 2
    ).encode(
        x = 'x:Q',
        x2 = 'x2:Q',
        y = 'y:Q',
        y2 = 'y2:Q',
        color = alt.Color('color:N', scale = None)
    )

    layered = alt.layer(depth_lines, cut_ticks, rectangles).encode(
        x = alt.X(scale = alt.Scale(domain = plot_domain))
    )
    layered = layered.configure_axis(grid = False).configure_view(stroke = None)

    #layered.save('/Users/ivan/Desktop/BARD1_draft_figs/fig_2a_dels.png', ppi = 500)
    layered.display()

In [None]:
def graph_aapos_alphafold_rects(df, aa_df, cut_coords): #Generates final line plot based on median collapsed depth
    """Build the depth-by-residue panel with structure annotation tracks.

    Args:
        df: Median-collapsed depth table ('pos', 'AApos', 'target',
            'amino_acid', 'median_depth').
        aa_df: Per-residue table ('AApos', 'median_depth', 'target').
        cut_coords: Cut-site table; must contain an 'AApos' column because it
            is merged with aa_df on that key.

    Returns:
        A layered Altair chart (depth lines, cut-site rules, annotation
        rectangles and their labels). Nothing is displayed here.
    """
    excluded_targets = ['BARD1_X4J']
    df = df.loc[~(df['target'].isin(excluded_targets))]
    aa_df = aa_df.loc[~(aa_df['target'].isin(excluded_targets))]

    df = df.dropna(subset = ['AApos']).copy()
    aa_df = aa_df.copy()
    df['pos'] = df['pos'].astype(int)
    df['AApos'] = df['AApos'].astype(int)
    aa_df['AApos'] = aa_df['AApos'].astype(int)

    #Cut-site rules run from 0 up to the per-residue depth at the cut site
    cut_marks = pd.merge(cut_coords.drop_duplicates(),
                         aa_df[['median_depth', 'AApos']],
                         on = 'AApos', how = 'inner').drop_duplicates()
    cut_marks['start_point'] = 0

    target_order = natsorted(set(df['target'].tolist()))

    tableau10 = ['#4E79A7', '#F28E2C', '#E15759', '#76B7B2', '#59A14F',
                 '#EDC949', '#AF7AA1', '#FF9DA7', '#9C755F', '#BAB0AB']
    category20 = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
                  '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf',
                  '#aec7e8', '#ffbb78', '#98df8a', '#ff9896', '#c5b0d5',
                  '#c49c94', '#f7b6d2', '#c7c7c7', '#dbdb8d', '#9edae5']
    palette = tableau10 + category20 #30 distinct colours

    plot_domain = [1, 778]
    plot_ticks = list(range(0, 776, 25))

    #Median read depth trace; x labels are hidden on this panel
    depth_lines = alt.Chart(df).mark_line(point = True).encode(
        x = alt.X('AApos',
                  axis = alt.Axis(title = '',
                                  labelFontSize = 16,
                                  titleFontSize = 20,
                                  values = plot_ticks,
                                  labels = False,
                                  ticks = False),
                  scale = alt.Scale(domain = plot_domain)),
        y = alt.Y('median_depth',
                  axis = alt.Axis(title = 'Median Normalized Depth (D13)',
                                  values = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
                                  labelFontSize = 16,
                                  titleFontSize = 20),
                  scale = alt.Scale(domain = [0.05, 1.7])),
        color = alt.Color('target',
                          sort = target_order,
                          scale = alt.Scale(range = palette[:34]),
                          legend = None),
        tooltip = [alt.Tooltip('target', title = 'SGE Target: '),
                   alt.Tooltip('amino_acid', title = 'Amino Acid: '),
                   alt.Tooltip('median_depth', title = 'Median Normalized Depth: ')]
    ).properties(
        width = 1750,
        height = 300,
        title = alt.TitleParams(text = 'Median Read Depth by Target', fontSize = 22)
    ).interactive()

    #Dashed grey rules mark the cut sites
    cut_rules = alt.Chart(cut_marks).mark_rule(
        color = 'gray',
        strokeWidth = 2,
        strokeDash = [5,5]
    ).encode(
        x = alt.X('AApos:Q', scale = alt.Scale(domain = plot_domain)),
        y = 'start_point',
        y2 = 'median_depth:Q'
    )

    #Annotation tracks. Segmented tracks are stored as boundary lists: segment i spans
    #bounds[i] -> bounds[i + 1]. Row order matches the original hand-written lists.
    y_max = 1

    alphafold_bounds = [1, 26, 120, 141, 165, 210, 220, 421, 548, 565, 777]
    ring_bounds = [26, 34, 48, 61, 63, 68, 70, 74, 80, 97, 117, 122]
    ard_bounds = [425, 430, 439, 441, 450, 463, 471, 473, 483, 497, 504, 507, 516, 529, 534, 536, 545]
    brct_bounds = [568, 571, 574, 578, 592, 595, 597, 606, 608, 617, 626, 629, 631, 643, 655, 666,
                   676, 679, 687, 698, 700, 702, 712, 717, 735, 739, 750, 752, 755, 760, 770, 777]

    alphafold_colors = ['#b1b1b1', '#DEDBEE'] * 5 #AlphaFold structural colors
    domain_colors = ['#B9DBF4', '#C8DBC8', '#F6BF93'] #Domain colors
    ring_colors = ['white','blue','white','green','white','green','white','blue','white','blue','white'] #RING secondary structure coloring (based on 1JM7)
    ard_colors = ['white', 'blue'] * 8 #ARD secondary structure coloring (based on 3C5R)
    brct_colors = ['white', 'green','white', 'blue', 'white', 'green', 'white', 'green', 'white', 'blue',
                   'white', 'green', 'blue', 'white', 'blue', 'white', 'green', 'white', 'blue', 'white',
                   'green', 'white', 'blue', 'white', 'green', 'white', 'green', 'white', 'green', 'blue',
                   'white'] #BRCT secondary structure coloring (based on 3FA2)

    alphafold_labels = ['', 'Structured', '', '', '', '', 'Disordered', '', '', '']
    domain_labels = ['RING', 'ARD', 'BRCT']

    #(starts, ends, bottom, top, colours, labels) per track
    tracks = [
        (alphafold_bounds[:-1], alphafold_bounds[1:], y_max * 1.05, y_max * 1.2, alphafold_colors, alphafold_labels),
        ([26, 425, 568], [122, 545, 777], y_max * 1.45, y_max * 1.6, domain_colors, domain_labels),
        (ring_bounds[:-1], ring_bounds[1:], y_max * 1.25, y_max * 1.4, ring_colors, [''] * len(ring_colors)),
        (ard_bounds[:-1], ard_bounds[1:], y_max * 1.25, y_max * 1.4, ard_colors, [''] * len(ard_colors)),
        (brct_bounds[:-1], brct_bounds[1:], y_max * 1.25, y_max * 1.4, brct_colors, [''] * len(brct_colors)),
    ]

    rect_columns = {'x': [], 'x2': [], 'y': [], 'y2': [], 'color': [], 'label': []}
    for starts, ends, bottom, top, fills, labels in tracks:
        rect_columns['x'].extend(starts)          # Start x-position of each rectangle
        rect_columns['x2'].extend(ends)           # End x-position of each rectangle
        rect_columns['y'].extend([bottom] * len(starts))  # Bottom of the track
        rect_columns['y2'].extend([top] * len(starts))    # Top of the track
        rect_columns['color'].extend(fills)
        rect_columns['label'].extend(labels)

    rect_data = pd.DataFrame(rect_columns)

    rectangles = alt.Chart(rect_data).mark_rect(
        opacity = 0.7,
        stroke = 'black',
        strokeWidth = 2
    ).encode(
        x = 'x:Q',
        x2 = 'x2:Q',
        y = 'y:Q',
        y2 = 'y2:Q',
        color = alt.Color('color:N', scale = None)
    )

    #Labels are centred on their rectangle; unlabeled rectangles carry an empty string
    text = alt.Chart(rect_data).mark_text(
        align = 'center',
        baseline = 'middle',
        fontWeight = 'bold',
        fontSize = 24,
        angle = 0,
        color = 'black',
        limit = 1000
    ).encode(
        x = alt.X('x_center:Q'),
        y = alt.Y('y_center:Q'),
        text = 'label:N'
    ).transform_calculate(
        x_center = '(datum.x + datum.x2) / 2',
        y_center = '(datum.y + datum.y2) / 2'
    )

    #No configure_* calls here: the chart is concatenated later and configured by the caller
    layered = alt.layer(depth_lines, cut_rules, rectangles, text).encode(
        x = alt.X(scale = alt.Scale(domain = plot_domain))
    )

    #layered.save('/Users/ivan/Desktop/BARD1_draft_figs/fig_2a_dels.png', ppi = 500)
    #layered.display()

    return layered

In [None]:
def mini_heatmap(input):
    """Build a three-row heatmap of SGE scores by amino-acid position.

    Args:
        input: Path to an Excel file with 'AApos', 'AAsub', 'score' and
            'full_sub' columns. Only the 'Stop', 'Mis. Min.' and 'Mis. Mean'
            rows of 'AAsub' are plotted.

    Returns:
        An interactive Altair chart (not displayed here).
    """
    track_order = ['Stop', 'Mis. Min.', 'Mis. Mean'] #Top-to-bottom row order

    scores = pd.read_excel(input)
    scores = scores.loc[scores['AAsub'].isin(track_order)] #Summary tracks only

    residue_numbers = list(range(1,779))

    x_encoding = alt.X(
        'AApos:Q',
        bin = alt.Bin(maxbins = 778, minstep = 1),
        axis = alt.Axis(
            values = list(range(0, 777, 50)),
            title = 'Amino Acid Position', titleFontSize = 20, labelFontSize = 16,
            labels = True,
            ticks = True,
            domain = True
        ),
        scale = alt.Scale(domain = residue_numbers)
    )
    y_encoding = alt.Y(
        'AAsub:N', sort = track_order,
        axis = alt.Axis(title = '', titleFontSize = 20, labelFontSize = 16, labelFontWeight = 'bold', ticks = False)
    )
    #Red at -0.3 fading to grey at 0; scores between 0 and 0.15 stay grey
    color_encoding = alt.Color(
        'score:Q', title = 'SGE Score',
        scale = alt.Scale(
            domain = [-0.3, 0, 0.15],
            range = ['#ff0000', '#a6a6a6', '#a6a6a6']
        ),
        legend = alt.Legend(titleFontSize = 20,
                            labelFontSize = 18)
    )
    hover = [alt.Tooltip('AApos', title = 'Position: '),
             alt.Tooltip('full_sub', title = 'Substitution: '),
             alt.Tooltip('score', title = 'SGE Score: ')]

    short_map = alt.Chart(scores).mark_rect().encode(
        x = x_encoding,
        y = y_encoding,
        color = color_encoding,
        tooltip = hover
    ).properties(
        height = 100,
        width = 1750
    ).interactive()

    return short_map

In [None]:
def graph_cutsite_depth(df):
    """Display median depth against distance to the cut site, one facet per target.

    Args:
        df: Median-collapsed depth table ('cut_site_distance', 'median_depth',
            'target', 'CDSpos', 'pos', 'amino_acid').

    Returns:
        None. The faceted chart is displayed.
    """
    target_order = natsorted(set(df['target'].tolist())) #Natural order for the facets

    hover = [alt.Tooltip('target', title = 'SGE Target: '),
             alt.Tooltip('CDSpos', title = 'CDS Position: '),
             alt.Tooltip('pos', title = 'Genomic Coordinate: '),
             alt.Tooltip('amino_acid', title = 'Amino Acid: '),
             alt.Tooltip('median_depth', title = 'Median Normalized Depth: ')]

    per_target = alt.Chart(df).mark_line(point = True).encode(
        x = alt.X('cut_site_distance',
                  axis = alt.Axis(title = 'Distance to Cut Site')),
        y = alt.Y('median_depth',
                  axis = alt.Axis(title = 'Median Normalized Depth'),
                  scale = alt.Scale(domain = [0, 1.03])),
        tooltip = hover
    ).properties(
        width = 600,
        height = 400,
        title = 'Median Read Depth by Target'
    ).interactive()

    #Two facets per row; every facet gets its own x and y scale
    faceted = per_target.facet(
        facet = alt.Facet('target:N', sort = target_order),
        columns = 2
    ).resolve_scale(
        x = 'independent',
        y = 'independent'
    )
    faceted.display()

In [None]:
def combine_plots(depth, heat):
    """Stack the depth panel above the heatmap on a shared x scale.

    Args:
        depth: Chart placed on top.
        heat: Chart placed underneath.

    Returns:
        The vertically concatenated chart with grid lines and the view
        border switched off.
    """
    stacked = alt.vconcat(depth, heat, spacing = 5)
    stacked = stacked.resolve_scale(x = 'shared') #Both panels share one x scale

    final_plot = stacked.configure_axis(grid = False).configure_view(stroke = None)
    return final_plot

In [None]:
def main():
    """Run the pipeline: load inputs, collapse depth, build and display the figure."""
    #Inputs
    cds_positions, cut_site_table = read_input(gsp_input_file, cut_sites)

    #Replicate-level depth, then per-position and per-residue summaries
    replicate_depth = process_depth(read_depth_path, sge_file, cds_positions, target_coords, cut_site_table)
    per_position_depth, per_residue_depth = process_read_depth(replicate_depth)

    #Panels
    depth_panel = graph_aapos_alphafold_rects(per_position_depth, per_residue_depth, cut_site_table)
    heatmap_panel = mini_heatmap(heatmap_input)

    figure = combine_plots(depth_panel, heatmap_panel)
    figure.display()

    #figure.save('/Users/ivan/Desktop/BARD1_draft_figs/fig_2ab_dels_short_heatmap.png', ppi = 500)

    #Old plots, kept for reference:
    #per_position_depth.to_excel('/Users/ivan/Desktop/test_excel_outputs/20250717_BARD1_CollapsedMedianDepth.xlsx', index = False)
    #graph_rep_depth(replicate_depth, per_position_depth)
    #graph_median_depth(per_position_depth, cut_site_table)
    #graph_aa_pos_median_depth(per_position_depth, cut_site_table)
    #graph_cutsite_depth(per_position_depth)

In [None]:
#Runs the full pipeline and displays the combined depth + heatmap figure
main()