In [None]:
import pandas as pd
import math

In [None]:
# Input files consumed by main(). Paths starting with '../Data' are relative to the notebook's working directory.
# NOTE(review): rna_file and splice_ai_rna_file are absolute, machine-specific paths; the notebook will not run
# elsewhere until they point at a shared location.
raw_sge_file = '../Data/20250508_BARD1scores_update.tsv' #Raw unfiltered SGE scores (no PAM filtering, impossible missense changes due to fixed edits still present)
filtered_sge_file = '../Data/20250825_BARD1snvscores_filtered.xlsx' #Filtered SGE data (variants creating new PAM sites or impossible missense changes removed)
rna_file = '/Users/ivan/Desktop/20250710_BARD1_202505_data_RNAclassified_beta_collapsed.xlsx' #Annotated file with RNA classifications
splice_ai_rna_file = '/Users/ivan/Desktop/20250711_20250626_RNAClassified_wSpliceAI.xlsx' #Annotated RNA file with SpliceAI predictions
target_coords = '../Data/SNV_filtering_inputs/20250415_BARD1_Filter_entry.xlsx' #Annotated Excel file with start and end coordinates for all BARD1 SGE targets
del_file = '../Data/20250829_BARD1delscores.tsv' #File for scored deletions

In [None]:
def count_overlapping_numbers(ranges):
    """Count the integer positions covered by two or more inclusive ranges.

    Parameters
    ----------
    ranges : iterable of (int, int)
        Inclusive coordinate pairs. The endpoints of a pair may be given in
        either order; each pair is normalised to (low, high) before counting,
        so a reversed pair such as (5, 1) is treated as the range 1..5.

    Returns
    -------
    int
        Number of distinct integer positions lying inside at least two of the
        ranges. An empty input yields 0.
    """
    # Sweep-line events: +1 where a range opens, -1 one past where it closes
    # (the close is exclusive so that the inclusive end position is counted).
    events = []
    for first, second in ranges:
        low, high = (first, second) if first <= second else (second, first)
        events.append((low, 1))
        events.append((high + 1, -1))

    # Sort events by position
    events.sort()

    overlap_count = 0
    current_coverage = 0
    last_pos = None

    for pos, change in events:
        # The segment [last_pos, pos) was covered by current_coverage ranges;
        # it contributes only when at least two ranges covered it.
        if current_coverage >= 2 and last_pos is not None:
            overlap_count += pos - last_pos

        current_coverage += change
        last_pos = pos

    return overlap_count

In [None]:
def designed_vars(targets, non_covered_fixed_edits=70):
    """Print design totals (bases, SNVs, 3 bp deletions, overlaps) for the target regions.

    Parameters
    ----------
    targets : str
        Path to an Excel workbook whose 'targets' sheet has 'start' and 'end'
        columns, one row per target region.
    non_covered_fixed_edits : int, optional
        Number of positions whose three SNVs are subtracted from the designed
        SNV total. Defaults to 70, the value that used to be hard-coded here.

    Returns
    -------
    None
        The totals are printed, not returned.
    """
    df = pd.read_excel(targets, sheet_name = 'targets')

    # The 'end' column is read as the lower coordinate and 'start' as the upper one.
    # NOTE(review): presumably the sheet lists coordinates in transcript orientation
    # for a minus-strand gene - confirm against the workbook.
    coord_pairs = list(zip(df['end'], df['start']))

    # Each region is inclusive on both sides, hence the +1.
    total_bp_covered = sum(high - low + 1 for low, high in coord_pairs)

    # Three possible SNVs per covered base, minus those at the excluded positions.
    total_snvs_designed = total_bp_covered * 3 - (3 * non_covered_fixed_edits)

    # Bases covered by more than one region, and the SNVs designed at those bases.
    count_overlapping_coords = count_overlapping_numbers(coord_pairs)
    double_cover_designed = 3 * int(count_overlapping_coords)

    print('Total Bases Covered: ', str(total_bp_covered), '\n',
            'Total SNVs Designed (Overlaps included): ', str(total_snvs_designed), '\n',
          'Total 3BP Dels Designed (Overlaps Included): ', str(total_bp_covered), '\n',
          'Total Designed Variants w/ Overlaps: ', str(total_snvs_designed + total_bp_covered), '\n',
          'Overlapping Bases (overlapping 3BP dels): ', str(count_overlapping_coords), '\n',
          'Overlapping SNVs: ', str(double_cover_designed), '\n',
          'Total Non-Overlapping SNVs: ', str(total_snvs_designed - double_cover_designed), '\n',
          'Total Non-Overlapping Dels: ', str(total_bp_covered - count_overlapping_coords)
         )



In [None]:
def var_count_stats(raw, filtered, dels):
    """Print variant totals and display per-consequence functional-class counts.

    Parameters
    ----------
    raw : str
        Path to a tab-separated file of raw scores; must contain a
        'snvlib_lib2' column (non-null entries are reported as overlapping).
    filtered : str
        Path to an Excel file of filtered scores with a 'consequence' (or
        'Consequence') column and a 'functional_consequence' column.
    dels : str
        Path to a tab-separated file of scored deletions; only its row count is used.

    Returns
    -------
    pandas.DataFrame
        The filtered table, with 'consequence' renamed to 'Consequence'.

    Notes
    -----
    For every consequence type the displayed dict holds the total, abnormal and
    normal counts plus '<type>_abnormal_percent' and '<type>_norm_percent'.
    Both percentages are computed from their own counts, so variants that are
    neither 'functionally_abnormal' nor 'functionally_normal' are not reported
    as normal. A consequence type with no rows gets NaN percentages instead of
    raising ZeroDivisionError.
    `display` is expected to be available as it is in an IPython/Jupyter session.
    """
    raw_df = pd.read_csv(raw, sep = '\t')
    del_df = pd.read_csv(dels, sep = '\t')
    df = pd.read_excel(filtered).rename(columns = {'consequence': 'Consequence'})

    raw_overlapping = raw_df['snvlib_lib2'].notna().sum()
    raw_count = len(raw_df)
    filtered_count = len(df)

    # (consequence value, [total key, abnormal key, normal key])
    # NOTE(review): 'splice_site_variant' feeds the 'splicing_*' keys and
    # 'splicing_variant' feeds the 'splice_region_*' keys - confirm this mapping is intended.
    stats_list = [('missense_variant',['missense_count', 'missense_abnormal', 'missense_normal']),
                  ('synonymous_variant', ['synonymous_count', 'synonymous_abnormal', 'synonymous_normal']),
                  ('intron_variant', ['intron_count', 'intron_abnormal', 'intron_normal']),
                  ('stop_gained',['stop_gained_count', 'stop_gained_abnormal', 'stop_gained_normal']),
                  ('splice_site_variant',['splicing_count', 'splicing_abnormal', 'splicing_normal']),
                  ('splicing_variant', ['splice_region_count', 'splice_region_abnormal', 'splice_region_normal'])
                 ]

    # Functional-class masks do not depend on the consequence type, so build them once.
    is_abnormal = df['functional_consequence'] == 'functionally_abnormal'
    is_normal = df['functional_consequence'] == 'functionally_normal'

    stats_dict = {}

    for var_type, (count_key, abnormal_key, normal_key) in stats_list:
        is_type = df['Consequence'] == var_type

        # int(...) keeps plain Python ints in the displayed dict.
        full_count = int(is_type.sum())
        ab_count = int((is_type & is_abnormal).sum())
        norm_count = int((is_type & is_normal).sum())

        if full_count:
            ab_perc = round((ab_count / full_count) * 100, 2)
            norm_perc = round((norm_count / full_count) * 100, 2)
        else:
            # No variants of this type: a percentage is undefined.
            ab_perc = norm_perc = float('nan')

        stats_dict[count_key] = full_count
        stats_dict[abnormal_key] = ab_count
        stats_dict[normal_key] = norm_count
        stats_dict[var_type + '_abnormal_percent'] = ab_perc
        stats_dict[var_type + '_norm_percent'] = norm_perc

    print('Number of Raw Variants: ', str(raw_count), '\n',
          'Number of Raw Overlapping Variant: ', str(raw_overlapping), '\n',
          'Number of Filtered Variants: ', str(filtered_count), '\n',
          'Number of Deletions: ', str(len(del_df))
         )

    print('General Count of Variants: ')
    display(stats_dict)

    return df

In [None]:
def rna_count_stats(rna, filtered):
    """Print the number of RNA-scored variants and display per-consequence RNA counts.

    Parameters
    ----------
    rna : str
        Path to an Excel file with 'Consequence', 'RNA_classification' and
        'functional_consequence' columns.
    filtered : object
        Accepted for the caller's convenience; not read by this function.

    For each consequence type the displayed dict holds, in order: the total,
    the 'low' RNA count, the functionally abnormal 'low' RNA count, the
    'normal' RNA count and the functionally abnormal 'normal' RNA count.
    """
    rna_df = pd.read_excel(rna)
    n_scored = len(rna_df)

    # consequence value -> output keys (total, low, abnormal+low, normal, abnormal+normal)
    keys_by_type = {
        'stop_gained': ['stop_count', 'stop_lowRNA', 'stop_abnormal_lowRNA', 'stop_normRNA', 'stop_abnormal_normRNA'],
        'missense_variant': ['missense_count', 'missense_lowRNA', 'missense_abnormal_lowRNA', 'missense_normRNA', 'missense_abnormal_normRNA'],
        'synonymous_variant': ['syn_count', 'syn_lowRNA', 'syn_abnormal_lowRNA', 'syn_normRNA', 'syn_abnormal_normRNA'],
        'splicing_variant': ['splicing_count', 'splicing_lowRNA', 'splicing_abnormal_lowRNA', 'splicing_normRNA', 'splicing_abnormal_normRNA'],
    }

    def tally(mask):
        # Number of rows selected by a boolean mask, as a plain int.
        return int(mask.sum())

    stats_dict = {}
    for var_type, out_keys in keys_by_type.items():
        of_type = rna_df['Consequence'] == var_type
        low_rna = of_type & (rna_df['RNA_classification'] == 'low')
        normal_rna = of_type & (rna_df['RNA_classification'] == 'normal')
        abnormal = rna_df['functional_consequence'] == 'functionally_abnormal'

        counts = [
            tally(of_type),
            tally(low_rna),
            tally(low_rna & abnormal),
            tally(normal_rna),
            tally(normal_rna & abnormal),
        ]
        for out_key, value in zip(out_keys, counts):
            stats_dict[out_key] = value

    print('RNA Count Stats', '\n',
          'Total RNA Vars.: ', str(n_scored)
         )

    display(stats_dict)

In [None]:
def spliceai_count_stats(spliceai):
    """Print the number of SpliceAI-annotated variants and display per-consequence counts.

    Parameters
    ----------
    spliceai : str
        Path to an Excel file with 'Consequence', 'RNA_classification',
        'functional_consequence' and 'splice_prediction' columns.

    For each consequence type the displayed dict holds, in order: the total,
    the count that is both 'low' RNA and functionally abnormal, and that same
    subset split by 'splice_prediction' value ('abnormal', 'intermediate', 'normal').
    """
    annotated = pd.read_excel(spliceai)

    # consequence value -> output keys (total, low+abnormal, then the three splice_prediction splits)
    keys_by_type = {
        'missense_variant': ['miss_count', 'miss_lowmRNA_Ab','miss_lowmRNA_Ab_highSpliceAI', 'miss_lowmRNA_Ab_InterSpliceAI', 'miss_lowmRNA_Ab_lowSpliceAI'],
        'synonymous_variant': ['syn_count', 'syn_lowmRNA_Ab', 'syn_lowmRNA_Ab_highSpliceAI','syn_lowmRNA_Ab_InterSpliceAI','syn_lowmRNA_Ab_lowSpliceAI'],
        'splicing_variant': ['splice_count', 'splice_lowmRNA_Ab', 'splice_lowmRNA_Ab_highSpliceAI', 'splice_lowmRNA_Ab_interSpliceAI', 'splice_lowmRNA_lowSpliceAI'],
    }

    # splice_prediction labels in output order; per the original comments these
    # correspond to SpliceAI >= 0.5, 0.2 <= SpliceAI < 0.5, and SpliceAI < 0.2.
    prediction_labels = ('abnormal', 'intermediate', 'normal')

    stats_dict = {}
    for var_type, out_keys in keys_by_type.items():
        of_type = annotated['Consequence'] == var_type
        low_and_abnormal = (
            of_type
            & (annotated['RNA_classification'] == 'low')
            & (annotated['functional_consequence'] == 'functionally_abnormal')
        )

        counts = [int(of_type.sum()), int(low_and_abnormal.sum())]
        for label in prediction_labels:
            with_label = low_and_abnormal & (annotated['splice_prediction'] == label)
            counts.append(int(with_label.sum()))

        stats_dict.update(zip(out_keys, counts))


    print('Total Vars. Annotated by SpliceAI: ', str(len(annotated)))

    display(stats_dict)

In [None]:
def main():
    """Run every summary in order: design totals, variant counts, RNA counts, SpliceAI counts."""
    designed_vars(target_coords)

    # var_count_stats returns the filtered table, which is handed on to rna_count_stats.
    scored_variants = var_count_stats(raw_sge_file, filtered_sge_file, del_file)
    rna_count_stats(rna_file, scored_variants)

    spliceai_count_stats(splice_ai_rna_file)

In [None]:
# Run all summaries; the functions and input paths it uses must already be defined.
main()