In [None]:
import os

import pandas as pd

In [None]:
# Input files for the BARD1 summary statistics.
# The SGE score tables are addressed relative to the notebook's working directory.
raw_sge_file = '../Data/20250508_BARD1scores_update.tsv'
filtered_sge_file = '../Data/20250508_BARD1scores_update_FILTERED.xlsx'

# The RNA workbooks are read from the current user's Desktop. The home directory
# is resolved at run time so the notebook is not tied to one user's account.
desktop_dir = os.path.join(os.path.expanduser('~'), 'Desktop')
rna_file = os.path.join(desktop_dir, '20250626_BARD1_202505_data_RNAclassified_beta.xlsx')
splice_ai_rna_file = os.path.join(desktop_dir, '20250710_20250626_RNAClassified_wSpliceAI.xlsx')

In [None]:
def var_count_stats(raw, filtered):
    """Print row counts and display per-consequence variant statistics.

    For each consequence class the displayed dictionary holds the total
    number of variants, the number classified ``functionally_abnormal``, the
    number classified ``functionally_normal``, and both numbers as a
    percentage of the class total (rounded to two decimals).

    The normal percentage is computed from the normal count itself, so
    variants carrying any other ``functional_consequence`` value are not
    silently counted as normal. A class with no variants gets NaN for both
    percentages instead of raising ``ZeroDivisionError``.

    Parameters
    ----------
    raw : str or path-like
        Tab-separated file of unfiltered scores; only its row count is used.
    filtered : str or path-like
        Excel workbook of filtered scores. After ``simplified_consequence``
        is renamed it must contain ``Consequence`` and
        ``functional_consequence`` columns.

    Returns
    -------
    pandas.DataFrame
        The filtered table with ``simplified_consequence`` renamed to
        ``Consequence``.
    """
    # Only the number of rows is needed from the raw table.
    raw_count = len(pd.read_csv(raw, sep='\t'))

    df = pd.read_excel(filtered).rename(columns={'simplified_consequence': 'Consequence'})
    filtered_count = len(df)

    # (value in the Consequence column, [total key, abnormal key, normal key])
    # NOTE(review): 'splice_site_variant' is reported under 'splicing_*' keys and
    # 'splicing_variant' under 'splice_region_*' keys; kept as-is, confirm intended.
    stats_list = [('missense_variant', ['missense_count', 'missense_abnormal', 'missense_normal']),
                  ('synonymous_variant', ['synonymous_count', 'synonymous_abnormal', 'synonymous_normal']),
                  ('intron_variant', ['intron_count', 'intron_abnormal', 'intron_normal']),
                  ('stop_gained', ['stop_gained_count', 'stop_gained_abnormal', 'stop_gained_normal']),
                  ('splice_site_variant', ['splicing_count', 'splicing_abnormal', 'splicing_normal']),
                  ('splicing_variant', ['splice_region_count', 'splice_region_abnormal', 'splice_region_normal'])
                 ]

    # The functional-class masks do not depend on the consequence, so build them once.
    is_abnormal = df['functional_consequence'] == 'functionally_abnormal'
    is_normal = df['functional_consequence'] == 'functionally_normal'

    stats_dict = {}

    for var_type, (count_key, abnormal_key, normal_key) in stats_list:
        of_type = df['Consequence'] == var_type

        # int(...) keeps plain Python ints in the displayed dictionary.
        full_count = int(of_type.sum())
        ab_count = int((of_type & is_abnormal).sum())
        norm_count = int((of_type & is_normal).sum())

        if full_count:
            ab_perc = round((ab_count / full_count) * 100, 2)
            norm_perc = round((norm_count / full_count) * 100, 2)
        else:
            # No variants of this class: the percentages are undefined.
            ab_perc = norm_perc = float('nan')

        stats_dict[count_key] = full_count
        stats_dict[abnormal_key] = ab_count
        stats_dict[normal_key] = norm_count
        stats_dict[var_type + '_abnormal_percent'] = ab_perc
        stats_dict[var_type + '_norm_percent'] = norm_perc

    print('Number of Raw Variants: ', str(raw_count), '\n',
          'Number of Filtered Variants: ', str(filtered_count), '\n'
         )

    print('General Count of Variants: ')
    display(stats_dict)

    return df

In [None]:
def rna_count_stats(rna, filtered):
    """Print the number of RNA-scored rows and display per-consequence counts.

    For each consequence class the displayed dictionary holds five counts:
    all rows of that class, rows with ``RNA_classification`` ``'low'``, the
    ``'low'`` rows that are also ``functionally_abnormal``, rows with
    ``RNA_classification`` ``'normal'``, and the ``'normal'`` rows that are
    also ``functionally_abnormal``.

    Parameters
    ----------
    rna : str or path-like
        Excel workbook with ``Consequence``, ``RNA_classification`` and
        ``functional_consequence`` columns.
    filtered : object
        Accepted for interface compatibility; not used by this function.
    """
    rna_df = pd.read_excel(rna)
    n_scored = len(rna_df)

    # Consequence value -> prefix of the keys written to the result dictionary.
    key_prefixes = {
        'stop_gained': 'stop',
        'missense_variant': 'missense',
        'synonymous_variant': 'syn',
        'splicing_variant': 'splicing',
    }

    consequence = rna_df['Consequence']
    rna_class = rna_df['RNA_classification']
    abnormal = rna_df['functional_consequence'] == 'functionally_abnormal'
    low_rna = rna_class == 'low'
    normal_rna = rna_class == 'normal'

    tallies = {}
    for var_type, prefix in key_prefixes.items():
        of_type = consequence == var_type
        # Insertion order here fixes the key order of the displayed dictionary.
        selections = {
            '_count': of_type,
            '_lowRNA': of_type & low_rna,
            '_abnormal_lowRNA': of_type & low_rna & abnormal,
            '_normRNA': of_type & normal_rna,
            '_abnormal_normRNA': of_type & normal_rna & abnormal,
        }
        for suffix, mask in selections.items():
            tallies[prefix + suffix] = int(mask.sum())

    print('RNA Count Stats', '\n',
          'Total RNA Vars.: ', str(n_scored)
         )

    display(tallies)

In [None]:
def spliceai_count_stats(spliceai):
    """Print the number of rows read from the Excel workbook at ``spliceai``."""
    n_rows = len(pd.read_excel(spliceai))
    print(n_rows)

In [None]:
def main():
    """Run the variant, RNA and SpliceAI count reports on the configured input files."""
    # var_count_stats returns the filtered table, which is handed to the RNA report.
    filtered_variants = var_count_stats(raw_sge_file, filtered_sge_file)
    rna_count_stats(rna_file, filtered_variants)

    spliceai_count_stats(splice_ai_rna_file)

In [None]:
# Run all three count reports.
main()