This notebook generates the numerical values that are quoted in the publication text. These include values such as the number of variants designed, the percentage of each variant type that is LoF, etc.

In [None]:
import pandas as pd
import math

In [None]:
sge_file = '../Data/final_tables/supplementary_file_1_BARD1_SGE_final_table.xlsx' #Final SGE table; main() passes it to the stats functions, which read its 'scores' sheet

target_coords = '../Data/supp_table_inputs/20250415_BARD1_Filter_entry.xlsx' #Annotated Excel file with start and end coordinates for all BARD1 SGE targets

oligos = '../Data/supp_table_inputs/20250912_BARD1_SGEoligos.xlsx' #SGE oligos; main() passes it to designed_vars, which reads it and uses the 'Sequence' column

In [None]:
def count_overlapping_numbers(ranges): #Counts positions that are covered by two or more ranges
    """Return how many integer positions lie inside at least two of ``ranges``.

    Parameters
    ----------
    ranges : iterable of (start, end) pairs, both ends inclusive.

    Returns
    -------
    The number of positions whose coverage depth is >= 2. A position covered
    by three or more ranges is still counted only once.
    """
    # Each range contributes an "open" marker at start and a "close" marker
    # one past its inclusive end. Sorting tuples puts closes (-1) ahead of
    # opens (+1) at the same position.
    boundaries = sorted(
        marker
        for first, last in ranges
        for marker in ((first, 1), (last + 1, -1))
    )

    shared_positions = 0
    depth = 0
    # Walk consecutive boundary pairs: after applying the left boundary's
    # delta, `depth` is the coverage of the half-open segment [here, there).
    for (here, delta), (there, _) in zip(boundaries, boundaries[1:]):
        depth += delta
        if depth >= 2:
            shared_positions += there - here

    return shared_positions

In [None]:
def get_3bp_dels(df, del_size=3): #Gets count of non-unique and unique 3bp deletions
    """Enumerate every contiguous deletion of ``del_size`` bases in each sequence.

    Parameters
    ----------
    df : pandas.DataFrame with a 'Sequence' column of nucleotide strings.
        The frame is NOT modified; upper-casing is done on a temporary copy
        of the column. Missing (NaN/None) sequences are skipped.
    del_size : number of consecutive bases removed per deletion (default 3).

    Returns
    -------
    (all_count, unique_count) : tuple of int
        ``all_count`` is the number of deletion products over all sequences
        (one per start offset per sequence); ``unique_count`` is the number
        of distinct resulting sequences.

    Raises
    ------
    ValueError
        If ``del_size`` is smaller than 1.
    """
    if del_size < 1:
        raise ValueError('del_size must be a positive integer')

    # Work on a derived Series so the caller's DataFrame keeps its original
    # casing; blank spreadsheet cells are dropped instead of crashing .upper().
    sequences = df['Sequence'].dropna().astype(str).str.upper().tolist()

    # Sequences shorter than del_size give an empty range and contribute nothing.
    all_del_variants = [
        sequence[:offset] + sequence[offset + del_size:]
        for sequence in sequences
        for offset in range(len(sequence) - del_size + 1)
    ]
    unique_del_variants = set(all_del_variants)

    print('All Dels: ', len(all_del_variants))
    print('Unique Dels: ', len(unique_del_variants))

    return len(all_del_variants), len(unique_del_variants) #Returns number of non-unique and unique deletions

In [None]:
def designed_vars(targets, oligos): #Does math to estimate number of designed variants
    """Print estimated counts of designed SNVs and 3 bp deletions.

    Parameters
    ----------
    targets : path to an Excel workbook whose 'targets' sheet has 'start'
        and 'end' columns (one row per target region).
    oligos : path to an Excel workbook of oligos; it is read with
        pd.read_excel defaults and handed to get_3bp_dels.

    Returns
    -------
    None. All results are written to stdout.
    """
    regions = pd.read_excel(targets, sheet_name = 'targets') #Target coordinates
    oligo_table = pd.read_excel(oligos) #Oligo sequences

    all_dels, unique_dels = get_3bp_dels(oligo_table) #Deletion counts (also prints them)

    # Pairs are built as (end, start) and consumed below as (lower, upper).
    # NOTE(review): presumably the sheet lists the larger coordinate under
    # 'start' (e.g. minus-strand ordering) -- confirm against the workbook.
    coord_pairs = list(zip(regions['end'], regions['start']))

    non_covered_fixed_edits = 70 #Number of non-covered fixed edits (counted)

    # Raw number of bases covered, counting overlapping regions once per region.
    total_bp_covered = sum(upper - lower + 1 for lower, upper in coord_pairs)

    # Three alternative bases per position, minus positions taken by fixed edits.
    total_snvs_designed = total_bp_covered * 3 - (3 * non_covered_fixed_edits)

    count_overlapping_coords = count_overlapping_numbers(coord_pairs) #Bases shared by 2+ regions
    double_cover_designed = 3 * int(count_overlapping_coords) #SNVs designed in shared bases

    # Assumes the unique/all deletion ratio measured over all oligos also
    # holds inside the overlapping regions.
    unique_fraction = unique_dels/all_dels
    estimated_overlapping_dels = unique_fraction * count_overlapping_coords

    # NOTE(review): the "Total Designed Variants" line adds total_bp_covered
    # (not a deletion count) to the SNV total -- presumably approximating one
    # deletion per covered base; confirm this is intended.
    report = [
        ('Total Bases Covered: ', total_bp_covered),
        ('Total SNVs Designed (Overlaps included): ', total_snvs_designed),
        ('Total Unique 3BP Dels Designed (Overlaps Included): ', unique_dels),
        ('Total Designed Variants w/ Overlaps: ', total_snvs_designed + total_bp_covered),
        ('Overlapping Bases: ', count_overlapping_coords),
        ('Overlapping SNVs: ', double_cover_designed),
        ('Total Non-Overlapping SNVs: ', total_snvs_designed - double_cover_designed),
        ('Total Non-Overlapping Dels: ', unique_dels - estimated_overlapping_dels),
    ]

    # Emit everything through a single print call: label, value, then a '\n'
    # separator between entries (none after the last one).
    pieces = []
    for position, (label, value) in enumerate(report):
        if position:
            pieces.append('\n')
        pieces.append(label)
        pieces.append(str(value))
    print(*pieces)


In [None]:
def var_count_stats(file): #Gets statistics for each type of variant
    """Print per-consequence variant counts and return the QC-filtered table.

    Reads the 'scores' sheet of ``file``, drops rows whose 'variant_qc_flag'
    is 'WARN', then for each consequence category records the total count,
    the 'functionally_abnormal' count, the 'functionally_normal' count and
    both as percentages of the category total (rounded to 2 decimals).

    Parameters
    ----------
    file : path to an Excel workbook with a 'scores' sheet containing the
        columns 'variant_qc_flag', 'consequence', 'var_type' and
        'functional_consequence'.

    Returns
    -------
    pandas.DataFrame
        The filtered table, with 'consequence' renamed to 'Consequence'.

    Notes
    -----
    * A category with no rows reports NaN percentages instead of raising
      ZeroDivisionError.
    * '<consequence>_norm_percent' is computed from the 'functionally_normal'
      count, so it stays consistent with the reported normal count even when
      some variants carry a third classification.
    * Consequence tallies are taken over every retained row (all 'var_type'
      values), exactly as before.
    """
    df = pd.read_excel(file, sheet_name = 'scores')
    df = df.loc[~df['variant_qc_flag'].isin(['WARN'])]
    df = df.rename(columns = {'consequence': 'Consequence'})

    snv_count = len(df.loc[df['var_type'].isin(['snv'])])
    del_count = len(df.loc[df['var_type'].isin(['3bp_del'])])

    # (consequence label, [total key, abnormal key, normal key])
    stats_list = [('missense_variant',['missense_count', 'missense_abnormal', 'missense_normal']),
                  ('synonymous_variant', ['synonymous_count', 'synonymous_abnormal', 'synonymous_normal']),
                  ('intron_variant', ['intron_count', 'intron_abnormal', 'intron_normal']),
                  ('stop_gained',['stop_gained_count', 'stop_gained_abnormal', 'stop_gained_normal']),
                  ('splice_site_variant',['splicing_count', 'splicing_abnormal', 'splicing_normal']),
                  ('splicing_variant', ['splice_region_count', 'splice_region_abnormal', 'splice_region_normal'])
                 ]

    # Class masks do not depend on the consequence, so build them once.
    is_abnormal = df['functional_consequence'] == 'functionally_abnormal'
    is_normal = df['functional_consequence'] == 'functionally_normal'

    stats_dict = {}

    for var_type, (count_key, abnormal_key, normal_key) in stats_list:
        of_type = df['Consequence'] == var_type

        # int() keeps plain Python ints in the displayed dictionary.
        full_count = int(of_type.sum())
        ab_count = int((of_type & is_abnormal).sum())
        norm_count = int((of_type & is_normal).sum())

        if full_count:
            ab_perc = (ab_count / full_count) * 100
            norm_perc = (norm_count / full_count) * 100
        else:
            # Percentages are undefined for an empty category.
            ab_perc = norm_perc = float('nan')

        stats_dict[count_key] = full_count
        stats_dict[abnormal_key] = ab_count
        stats_dict[normal_key] = norm_count
        stats_dict[var_type + '_abnormal_percent'] = round(ab_perc,2)
        stats_dict[var_type + '_norm_percent'] = round(norm_perc,2)

    print('Number of SNVs: ', str(snv_count), '\n',
          'Number of Deletions: ', str(del_count)
         )

    print('General Count of Variants: ')
    display(stats_dict)

    return df

In [None]:
def rna_count_stats(rna, filtered): #Gets statistics from RNA output
    """Print RNA-score tallies for selected consequence categories.

    Parameters
    ----------
    rna : path to an Excel workbook with a 'scores' sheet.
    filtered : accepted for call compatibility; not used by this function.

    Returns
    -------
    None. The total is printed and the tallies dictionary is displayed.
    """
    scores = pd.read_excel(rna, sheet_name = 'scores')

    # Keep rows that are not QC-flagged 'WARN' and that have an RNA score.
    not_warned = ~scores['variant_qc_flag'].isin(['WARN'])
    scores = (
        scores.loc[not_warned]
        .dropna(subset = ['RNAscore'])
        .rename(columns = {'consequence': 'Consequence', 'RNA_consequence': 'RNA_classification'})
    )

    total_rna_scored = len(scores)

    # (consequence label, [total, low RNA, abnormal & low RNA, normal RNA, abnormal & normal RNA])
    stats_list = [('stop_gained', ['stop_count', 'stop_lowRNA', 'stop_abnormal_lowRNA', 'stop_normRNA', 'stop_abnormal_normRNA']),
                  ('missense_variant', ['missense_count', 'missense_lowRNA', 'missense_abnormal_lowRNA', 'missense_normRNA', 'missense_abnormal_normRNA']),
                  ('synonymous_variant', ['syn_count', 'syn_lowRNA', 'syn_abnormal_lowRNA', 'syn_normRNA', 'syn_abnormal_normRNA']),
                  ('splicing_variant', ['splicing_count', 'splicing_lowRNA', 'splicing_abnormal_lowRNA', 'splicing_normRNA', 'splicing_abnormal_normRNA'])
                 ]

    # Masks that are the same for every consequence category.
    low_rna = scores['RNA_classification'] == 'low'
    normal_rna = scores['RNA_classification'] == 'normal'
    abnormal = scores['functional_consequence'] == 'functionally_abnormal'

    def tally(mask):
        # Number of True entries, as a plain Python int.
        return int(mask.sum())

    stats_dict = {}
    for var_type, labels in stats_list:
        of_type = scores['Consequence'] == var_type
        values = (
            tally(of_type),
            tally(of_type & low_rna),
            tally(of_type & low_rna & abnormal),
            tally(of_type & normal_rna),
            tally(of_type & normal_rna & abnormal),
        )
        # Labels and values are positionally matched, five of each.
        stats_dict.update(zip(labels, values))

    print('RNA Count Stats', '\n',
          'Total RNA Vars.: ', str(total_rna_scored)
         )

    display(stats_dict)

In [None]:
def spliceai_count_stats(file): #Statistics for comparison with SpliceAI
    """Print tallies comparing low-RNA, functionally abnormal variants with SpliceAI bins.

    Parameters
    ----------
    file : path to an Excel workbook with a 'scores' sheet.

    Returns
    -------
    None. The retained row count is printed and the tallies dictionary is
    displayed.

    SpliceAI bins (on 'max_SpliceAI'): high is >= 0.5, intermediate is
    >= 0.2 and < 0.5, low is everything else.
    """
    scores = pd.read_excel(file, sheet_name = 'scores')
    scores = scores.loc[~scores['variant_qc_flag'].isin(['WARN'])]

    # Only rows missing BOTH RNAscore and max_SpliceAI are discarded.
    # NOTE(review): rows that have an RNAscore but no max_SpliceAI are kept,
    # fall into the "low" bin below, and are included in the printed total --
    # confirm this is intended.
    lacks_both = scores['RNAscore'].isna() & scores['max_SpliceAI'].isna()
    scores = scores.loc[~lacks_both]

    scores = scores.rename(columns = {'consequence': 'Consequence', 'RNA_consequence': 'RNA_classification'})

    # Bin membership as boolean masks; a missing score compares False to
    # both thresholds and therefore lands in the low bin.
    splice_score = scores['max_SpliceAI']
    high_bin = splice_score >= 0.5
    inter_bin = (splice_score >= 0.2) & ~high_bin
    low_bin = ~(splice_score >= 0.2)

    # (consequence label, [total, abnormal & low RNA, ...split by high / intermediate / low SpliceAI])
    stats_list = [('missense_variant', ['miss_count', 'miss_lowmRNA_Ab','miss_lowmRNA_Ab_highSpliceAI', 'miss_lowmRNA_Ab_InterSpliceAI', 'miss_lowmRNA_Ab_lowSpliceAI']),
                  ('synonymous_variant', ['syn_count', 'syn_lowmRNA_Ab', 'syn_lowmRNA_Ab_highSpliceAI','syn_lowmRNA_Ab_InterSpliceAI','syn_lowmRNA_Ab_lowSpliceAI']),
                  ('splicing_variant', ['splice_count', 'splice_lowmRNA_Ab', 'splice_lowmRNA_Ab_highSpliceAI', 'splice_lowmRNA_Ab_interSpliceAI', 'splice_lowmRNA_lowSpliceAI'])
                 ]

    # Low RNA and functionally abnormal: shared by every tally except the total.
    abnormal_low_rna = (
        (scores['RNA_classification'] == 'low')
        & (scores['functional_consequence'] == 'functionally_abnormal')
    )

    def tally(mask):
        # Number of True entries, as a plain Python int.
        return int(mask.sum())

    stats_dict = {}
    for var_type, labels in stats_list:
        of_type = scores['Consequence'] == var_type
        hits = of_type & abnormal_low_rna
        values = (
            tally(of_type),
            tally(hits),
            tally(hits & high_bin),
            tally(hits & inter_bin),
            tally(hits & low_bin),
        )
        # Labels and values are positionally matched, five of each.
        stats_dict.update(zip(labels, values))


    print('Total Vars. Annotated by SpliceAI: ', str(len(scores)))

    display(stats_dict)

In [None]:
def main():
    """Run each statistics routine in turn; results are printed/displayed, nothing is returned."""
    # Design-level estimates from the target coordinates and oligo sequences.
    designed_vars(target_coords, oligos)
    # Per-consequence counts; also returns the QC-filtered score table.
    filtered_df = var_count_stats(sge_file)
    # filtered_df fills the `filtered` parameter, which rna_count_stats does not
    # read; that function reloads the workbook from sge_file itself.
    rna_count_stats(sge_file, filtered_df)
    # SpliceAI comparison, again read directly from sge_file.
    spliceai_count_stats(sge_file)

In [None]:
# Run the full set of statistics; output appears below this cell.
main()