In [None]:
import pandas as pd
import altair as alt
import numpy as np
import statistics
from scipy import stats
from pathlib import Path
from natsort import natsorted

In [None]:
# Input locations for all regions excluding the X1A ATG double mutant library.
# All paths are relative to the notebook's working directory.
rna_count_folder = '../Data/RNAscoring_dev_data/RNA_counts/' #folder globbed for RNA count *.tsv files
dna_count_folder = '../Data/RNAscoring_dev_data/DNA_counts/' #folder globbed for DNA count *.tsv files
sge_score = '../Data/20250813_BARD1scores_final_FILTERED.xlsx' #most up-to-date SGE scores (Excel workbook)
sge_threshold_file = '../Data/20250813_BARD1_thresholds.tsv' #tab-separated SGE thresholds file
cds_pos = '../Data/BARD1_GenomicCoords_wCDS.xlsx' #CDS-annotated BARD1 coordinates (Excel workbook)
# Turn off Altair's row-count limit so the full variant tables can be charted.
alt.data_transformers.disable_max_rows()

In [None]:
def read_counts(rna_counts, dna_counts, sge_file, threshold_file): #reads RNA/DNA count files and merges them. Also reads SGE scores
    """Load SGE scores and thresholds and build one merged RNA/DNA count table per replicate.

    Parameters
    ----------
    rna_counts : str or Path
        Folder holding the RNA count ``*.tsv`` files.
    dna_counts : str or Path
        Folder holding the DNA count ``*.tsv`` files.
    sge_file : str or Path
        Excel workbook with the SGE scores. Columns ``consequence``, ``score`` and
        ``amino_acid_change`` are renamed to ``Consequence``, ``snv_score`` and ``AAsub``.
    threshold_file : str or Path
        Tab-separated file providing ``lthresh``, ``uthresh``, ``std_neut`` and ``#mu_neut``;
        only its first row is used.

    Returns
    -------
    all_rep_counts : dict
        Maps ``<target><rep>`` (e.g. ``'X2rep_1'``) to a dataframe with the columns
        ``target_rep``, ``target``, ``pos_id``, ``RNAcount`` and ``DNAcount``.
    dna_scores : pandas.DataFrame
        The SGE score table with renamed columns and ``pos`` cast to ``str``.
    sge_thresholds : list
        ``[lower, upper]`` computed as ``thresh * std_neut + mu_neut``.

    Raises
    ------
    ValueError
        If there are fewer DNA files than RNA files, if an RNA file has no rows, or if
        the sample ID of an RNA file contains none of the known replicate tags.

    Notes
    -----
    RNA and DNA files are paired by their position in the alphabetically sorted file
    lists, so the file names of the two folders must sort in the same replicate order.
    """
    import warnings

    # Replicate tags are checked in this order; the first match wins.
    replicate_tags = (
        ('rep_3', ('R3R6', 'R7R8R9')),
        ('rep_2', ('R2R5', 'R4R5R6')),
        ('rep_1', ('R1R4', 'R1R2R3')),
    )

    dna_scores = pd.read_excel(sge_file) #Reads DNA SGE Scores
    dna_scores = dna_scores.rename(columns = {'consequence': 'Consequence', 'score': 'snv_score', 'amino_acid_change': 'AAsub'})
    dna_scores['pos'] = dna_scores['pos'].astype(str)

    # SGE thresholds are stored as multiples of the neutral standard deviation
    threshold_df = pd.read_csv(threshold_file, sep = '\t')
    std_neut = threshold_df['std_neut'].iloc[0]
    mu_neut = threshold_df['#mu_neut'].iloc[0]
    lwr = threshold_df['lthresh'].iloc[0] * std_neut + mu_neut
    uppr = threshold_df['uthresh'].iloc[0] * std_neut + mu_neut
    sge_thresholds = [lwr, uppr]

    # Sorted so that the i-th RNA file lines up with the i-th DNA file
    rna_outputs = sorted(Path(rna_counts).glob('*.tsv'))
    dna_outputs = sorted(Path(dna_counts).glob('*.tsv'))

    if len(dna_outputs) < len(rna_outputs):
        raise ValueError(
            f'Found {len(rna_outputs)} RNA count files but only {len(dna_outputs)} DNA count files; '
            'cannot pair every RNA file with a DNA file.'
        )
    if len(dna_outputs) > len(rna_outputs):
        # Extra DNA files were silently ignored before; keep that, but make it visible.
        warnings.warn(
            f'Found {len(dna_outputs)} DNA count files for {len(rna_outputs)} RNA count files; '
            'the surplus DNA files (last in sort order) are ignored.'
        )

    all_rep_counts = {} #Dictionary to hold merged RNA/DNA count dataframes

    for rna_file, dna_file in zip(rna_outputs, dna_outputs):
        rna_df = pd.read_csv(rna_file, sep = '\t') #Reads RNA count file
        if rna_df.empty:
            raise ValueError(f'RNA count file {rna_file} contains no rows.')

        sampleid = rna_df['sampleid'].iloc[0] #Sample ID from first row encodes the replicate

        rep = None
        for rep_name, tags in replicate_tags:
            if any(tag in sampleid for tag in tags):
                rep = rep_name
                break
        if rep is None:
            # Previously this either raised NameError or reused the previous file's replicate.
            raise ValueError(f'Cannot determine replicate from sample ID {sampleid!r} ({rna_file}).')

        rna_df['target_rep'] = rna_df['target'] + rep #Target and replicate number combined
        rna_df = rna_df.rename(columns = {'count': 'RNAcount'})
        rna_df['pos_id'] = rna_df['pos'].astype(str) + ':' + rna_df['allele'] #genomic coordinate : allele

        dna_df = pd.read_csv(dna_file, sep = '\t') #Reads DNA file
        dna_df = dna_df.rename(columns = {'count': 'DNAcount'})
        dna_df['pos_id'] = dna_df['pos'].astype(str) + ':' + dna_df['allele']

        # Only variants present in both files are kept; 'target' is taken from the DNA file
        merged = pd.merge(
            rna_df[['target_rep', 'pos_id', 'RNAcount']],
            dna_df[['target', 'pos_id', 'DNAcount']],
            how = 'inner',
            on = 'pos_id',
        )
        merged = merged.loc[:, ['target_rep', 'target', 'pos_id', 'RNAcount', 'DNAcount']]

        all_rep_counts[rna_df['target_rep'].iloc[0]] = merged

    return all_rep_counts, dna_scores, sge_thresholds

In [None]:
def rna_v_dna_plot(all_counts): #Generates scatter plots of RNA count vs. DNA count
    """Display RNA-count vs. DNA-count scatter plots, one facet per target/replicate.

    Parameters
    ----------
    all_counts : dict
        Maps a target/replicate key to a dataframe with at least the columns
        ``target_rep``, ``pos_id``, ``RNAcount`` and ``DNAcount``.

    The chart is displayed as a side effect; nothing is returned.
    """
    # Stack every per-replicate table into one long dataframe
    pooled = pd.concat([all_counts[key] for key in all_counts])

    # Discard variants where either library has fewer than 50 reads
    too_few_reads = (pooled['RNAcount'] < 50) | (pooled['DNAcount'] < 50)
    pooled = pooled[~too_few_reads]

    hover = [
        alt.Tooltip('RNAcount', title = 'RNA Count: '),
        alt.Tooltip('DNAcount', title = 'DNA Count: '),
        alt.Tooltip('pos_id', title = 'Var: '),
    ]

    # One point per variant
    points = alt.Chart(pooled).mark_circle().encode(
        x = alt.X('RNAcount', title = 'RNA Count'),
        y = alt.Y('DNAcount', title = 'DNA Count'),
        tooltip = hover
    )

    # Regression line fitted separately for each target/replicate
    fit = alt.Chart(pooled).transform_regression(
        'RNAcount', 'DNAcount',
        groupby=['target_rep']
    )
    fit = fit.mark_line(color='blue').encode(x='RNAcount', y='DNAcount')

    # Layer points and fit, then facet by target/replicate, wrapping after 6 columns,
    # with axes scaled independently per facet
    layered = points + fit
    faceted = layered.facet(
        facet=alt.Facet('target_rep', title = 'DNA vs. RNA Counts')
    )
    faceted = faceted.resolve_scale(x = 'independent', y = 'independent')
    faceted = faceted.properties(columns = 6)

    faceted.interactive().display()


In [None]:
def get_rna_scores(rep_dict, min_count = 25): #Handles RNA scoring of variants (note, as of 4/25/2025, RNA scores generated are no longer used downstream, but RNA/DNA counts/frequencies are)
    """Convert per-replicate RNA/DNA counts into frequencies and RNA/DNA ratios.

    Parameters
    ----------
    rep_dict : dict
        Maps a target/replicate key to a dataframe with the columns ``target_rep``,
        ``target``, ``RNAcount`` and ``DNAcount``.
    min_count : int, optional
        Variants with an RNA or DNA count below this value are dropped (default 25,
        which also removes zero counts).

    Returns
    -------
    rna_scored : dict
        Maps each replicate's ``target_rep`` value to its filtered dataframe with the
        added columns ``DNAfreq``, ``RNAfreq`` and ``RNA/DNA``.
    targets : list
        SGE target names in order of first appearance.

    Notes
    -----
    Frequencies are computed against the read totals of the variants that survive the
    count filter. Replicates with no surviving variants are skipped with a warning
    instead of raising. The input dataframes are not modified.
    """
    import warnings

    rna_scored = {} #Dictionary to hold RNA-scored dataframes
    targets = [] #list of SGE targets present

    for key, counts in rep_dict.items():
        # Drop variants with too few reads in either library
        low_reads = (counts['RNAcount'] < min_count) | (counts['DNAcount'] < min_count)
        df = counts[~low_reads].copy().reset_index(drop = True)

        if df.empty:
            warnings.warn(f'{key}: no variants with at least {min_count} RNA and DNA reads; replicate skipped.')
            continue

        df['DNAfreq'] = df['DNAcount'] / df['DNAcount'].sum() #DNA frequency
        df['RNAfreq'] = df['RNAcount'] / df['RNAcount'].sum() #RNA frequency
        df['RNA/DNA'] = df['RNAfreq'] / df['DNAfreq'] #Ratio of RNA to DNA frequency

        rna_scored[df['target_rep'].iloc[0]] = df

        target = df['target'].iloc[0]
        if target not in targets:
            targets.append(target)

    return rna_scored, targets
        

In [None]:
def rna_dna_freq_corr_plots(dict, sge_scores): #Builds scatterplots of RNA frequency vs. DNA frequency for all 3 replicates for an SGE target
    """Annotate RNA-scored variants with SGE data and compute per-target RNA/DNA correlations.

    Parameters
    ----------
    dict : dict
        Maps target/replicate keys to RNA-scored dataframes (columns ``target_rep``,
        ``target``, ``pos_id``, ``RNAfreq``, ``DNAfreq``, ``RNA/DNA``). The parameter
        name shadows the builtin; it is kept for backward compatibility.
    sge_scores : pandas.DataFrame
        SGE score table; must provide ``pos_id``, ``AAsub``, ``Consequence`` (or
        ``simplified_consequence``), ``snv_score`` and ``functional_consequence``.

    Returns
    -------
    df : pandas.DataFrame
        All replicates concatenated, left-merged with the SGE scores and restricted to
        the consequence types listed in ``kept_consequences``.
    rna_dna_corr : dict
        Maps each SGE target to the median, over replicates 1-3, of the Pearson
        correlation between ``DNAfreq`` and ``RNAfreq``.

    Notes
    -----
    The per-replicate scatter plots are assembled but not displayed; uncomment
    ``final_chart.display()`` to show them.
    """
    # Consequence types carried forward. 'UTR_variant' was previously misspelled
    # ('UTR_vairant'), which filtered every UTR variant out.
    # NOTE(review): confirm 'UTR_variant' is the exact label used in the SGE score file.
    kept_consequences = ['missense_variant', 'synonymous_variant', 'stop_gained', 'splicing_variant', 'stop_lost', 'start_lost', 'UTR_variant']

    # (substring matched in 'target_rep', label used in the chart title)
    replicates = [('rep_1', 'Rep 1'), ('rep_2', 'Rep 2'), ('rep_3', 'Rep 3')]
    x_col, y_col = 'DNAfreq', 'RNAfreq'

    df = pd.concat(list(dict.values())) #Complete dataframe

    df = pd.merge(df, sge_scores, on = 'pos_id', how = 'left') #Merges with SGE scores
    df = df.rename(columns = {'target_x': 'target'}) #Keeps the target name from the count data
    df = df.rename(columns = {'simplified_consequence': 'Consequence'}) #May be removed in future input data release
    df = df[['target_rep', 'target', 'pos_id', 'RNAfreq', 'DNAfreq', 'RNA/DNA', 'AAsub', 'Consequence', 'snv_score', 'functional_consequence']] #Columns pulled for downstream functions
    df = df.loc[df['Consequence'].isin(kept_consequences)]

    charts = [] #One row of replicate plots per target
    rna_dna_corr = {}

    #Iterates through SGE targets and builds plots
    for target, data in df.groupby('target'):
        target_charts = [] #Charts for single SGE target
        target_corr = [] #Pearson r of each replicate

        for rep_tag, replicate in replicates:
            rep = data.loc[data['target_rep'].str.contains(rep_tag)].reset_index(drop = True)

            corr_pearson = np.corrcoef(rep[x_col], rep[y_col])[0,1]
            target_corr.append(corr_pearson)

            # End points of the y = x reference line
            min_val = min(rep[x_col].min(), rep[y_col].min())
            max_val = max(rep[x_col].max(), rep[y_col].max())
            line_df = pd.DataFrame({
                'x': [min_val, max_val],
                'y': [min_val, max_val]
            })

            # Scatter plot of the replicate
            scatter = alt.Chart(rep).mark_circle(opacity=0.7).encode(
                x=alt.X(x_col, title=f'{x_col} '),
                y=alt.Y(y_col, title=f'{y_col} '),
                color = alt.Color('Consequence', sort = ['synonymous_variant', 'missense_variant', 'stop_gained']),
                tooltip = [alt.Tooltip('pos_id', title = 'Variant: '),
                           alt.Tooltip(x_col),
                           alt.Tooltip(y_col),
                           alt.Tooltip('AAsub', title = 'Substitution: '),
                           alt.Tooltip('snv_score', title = 'Score: ')
                          ]
            )

            # Dashed red y = x reference line
            line = alt.Chart(line_df).mark_line(
                color='red',
                strokeDash=[4, 4]
            ).encode(
                x='x:Q',
                y='y:Q'
            )

            # Optional per-replicate trendline (disabled):
            # trend_line = alt.Chart(rep).transform_regression(
            #     'DNAfreq', 'RNAfreq', groupby=['target_rep']
            # ).mark_line(color='blue').encode(x='DNAfreq', y='RNAfreq')

            # Combine and add title with correlation
            combined = (scatter + line).properties(
                width=250,
                height=250,
                title= target + ' ' + replicate + ' ' + f'RNAfreq vs DNAfreq (r = {corr_pearson:.3f})'
            ).interactive()

            target_charts.append(combined)

        charts.append(alt.hconcat(*target_charts)) #Replicate charts side by side
        rna_dna_corr[target] = statistics.median(target_corr)

    # Stack the per-target rows vertically
    final_chart = alt.vconcat(*charts).properties(
        title=''
    )

    # Display the chart
    #final_chart.display()

    return df, rna_dna_corr

In [None]:
def agg_rnafreq(df): #Aggregates replicate dataframes into one target-level dataframe for each target
    """Collapse replicate-level rows into one row per (target, variant).

    Parameters
    ----------
    df : pandas.DataFrame
        Replicate-level table with the columns ``target``, ``pos_id``, ``RNAfreq``,
        ``DNAfreq``, ``RNA/DNA``, ``AAsub``, ``Consequence``, ``snv_score`` and
        ``functional_consequence``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per ``target_id`` (``<target>:<pos_id>``) holding the median ``RNAfreq``,
        ``DNAfreq`` and ``RNA/DNA`` across replicates, the standard error of ``RNA/DNA``
        (``RNA/DNA_sem``), the first value of each annotation column, and
        ``L2RNA/DNA`` (log2 of the median ratio; not in use downstream).
    """
    columns = ['target', 'pos_id', 'RNAfreq', 'DNAfreq', 'RNA/DNA', 'AAsub', 'Consequence', 'snv_score', 'functional_consequence']
    df = df[columns].copy() #Explicit copy so the new column is not written onto a slice of the caller's frame

    df['target_id'] = df['target'] + ':' + df['pos_id'] #Unique ID that also carries the target, used for aggregation

    # Named aggregation yields flat column names directly (the keys contain '/', hence the dict)
    summary_df = df.groupby('target_id').agg(**{
        'target': ('target', 'first'),
        'pos_id': ('pos_id', 'first'),
        'RNAfreq': ('RNAfreq', 'median'),
        'DNAfreq': ('DNAfreq', 'median'),
        'RNA/DNA': ('RNA/DNA', 'median'),
        'RNA/DNA_sem': ('RNA/DNA', 'sem'),
        'AAsub': ('AAsub', 'first'),
        'Consequence': ('Consequence', 'first'),
        'snv_score': ('snv_score', 'first'),
        'functional_consequence': ('functional_consequence', 'first'),
    }).reset_index()

    summary_df['L2RNA/DNA'] = np.log2(summary_df['RNA/DNA']) #Log2 of RNA/DNA ratio (not in use)

    return summary_df

In [None]:
def normalize_scores_mean(df): #Beta normalization method based on distribution of synonymous variants in each SGE target rather than distribution of all synonymous variants
    """Rescale ``RNA/DNA`` per SGE target using that target's synonymous variants.

    For every target, the synonymous variants of that target are trimmed to their own
    1st-99th percentile range, and the mean and standard deviation of the trimmed
    values define ``lower = mean - 2*std`` and ``upper = mean + 2*std``. Every variant
    of the target is then mapped with ``(x - lower) / (upper - lower)``, so ``lower``
    becomes 0 and ``upper`` becomes 1.

    Previously the percentiles were taken from the synonymous variants of the whole
    gene and the mean/std from all variant types of the target, which contradicted the
    per-target synonymous normalisation this function is meant to perform.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the columns ``target``, ``Consequence`` and ``RNA/DNA``. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` (rows ordered by target, index reset) with ``RNA/DNA`` replaced
        by the normalised value. Targets whose trimmed synonymous set has no positive
        standard deviation trigger a warning and get non-finite values.
    """
    import warnings

    normalized_dfs = []
    for target, data in df.groupby('target'):
        data = data.copy() #groupby hands out slices; never write into them

        # Synonymous variants of THIS target only
        syn_ratio = data.loc[data['Consequence'].isin(['synonymous_variant']), 'RNA/DNA']
        percentile_99 = syn_ratio.quantile(0.99) #99th percentile RNA/DNA
        percentile_1 = syn_ratio.quantile(0.01) #1st percentile RNA/DNA

        # Outlier-trimmed synonymous distribution
        trimmed = syn_ratio[(syn_ratio >= percentile_1) & (syn_ratio <= percentile_99)]

        mean_rnadna = trimmed.mean()
        rnadna_std = trimmed.std()

        if not rnadna_std > 0: #also true for NaN (fewer than two usable synonymous variants)
            warnings.warn(f'{target}: too few or constant synonymous variants; normalised RNA/DNA will not be finite.')

        upper_thresh = mean_rnadna + 2 * rnadna_std #2 standard deviations above mean
        lower_thresh = mean_rnadna - 2 * rnadna_std #2 standard deviations below mean

        data['RNA/DNA'] = (data['RNA/DNA'] - lower_thresh)/(upper_thresh - lower_thresh) #min-max normalization

        normalized_dfs.append(data)

    if not normalized_dfs: #empty input: nothing to normalise
        return df.copy().reset_index(drop = True)

    return pd.concat(normalized_dfs).reset_index(drop = True)

In [None]:
def rnafreq_vs_sge(df, sge_threshold): #Gets thresholds for classifying RNA performance of a variant and various visualizations
    """Derive whole-gene RNA/DNA thresholds from synonymous variants and display QC charts.

    The thresholds are the mean +/- 2 standard deviations of ``RNA/DNA`` over all
    synonymous variants in ``df`` after trimming them to their 1st-99th percentile.

    Parameters
    ----------
    df : pandas.DataFrame
        Target-level table with the columns ``target``, ``pos_id``, ``Consequence``,
        ``AAsub``, ``snv_score``, ``RNAfreq``, ``DNAfreq`` and ``RNA/DNA``.
    sge_threshold : sequence
        ``(lower, upper)`` SGE score thresholds, drawn as vertical lines.

    Returns
    -------
    tuple
        ``(lower_thresh, upper_thresh)`` for the RNA/DNA ratio.

    Side effects: displays a faceted RNA/DNA vs. SGE-score scatter plot and a histogram
    of RNA/DNA ratios.
    """
    consequence_order = ['synonymous_variant', 'missense_variant', 'stop_gained']

    #Threshold setting based on distribution of all scored synonymous variants in data set
    threshold_df = df.loc[df['Consequence'].isin(['synonymous_variant'])]
    percentile_99 = threshold_df['RNA/DNA'].quantile(0.99) #Gets 99th percentile RNA/DNA
    percentile_1 = threshold_df['RNA/DNA'].quantile(0.01) #Gets 1st percentile RNA/DNA

    # Outliers removed before estimating mean and spread.
    # (Alternative, not used: take the percentiles over all variants instead of synonymous only.)
    threshold_df = threshold_df[(threshold_df['RNA/DNA'] >= percentile_1) &
            (threshold_df['RNA/DNA'] <= percentile_99)]

    mean_rnadna = threshold_df['RNA/DNA'].mean() #Gets mean RNA/DNA ratio
    rnadna_std = threshold_df['RNA/DNA'].std() #Gets standard deviation of RNA/DNA ratio

    upper_thresh = mean_rnadna + 2 * rnadna_std #2 standard deviations above mean
    lower_thresh = mean_rnadna - 2 * rnadna_std #2 standard deviations below mean

    thresholds = (lower_thresh, upper_thresh) #Tuple to be passed onto next functions

    #Vertical lines for upper and lower thresholds (histogram)
    minus_std = alt.Chart(pd.DataFrame({'x': [lower_thresh]})).mark_rule(color = 'red').encode(
        x = 'x')
    plus_std = alt.Chart(pd.DataFrame({'x': [upper_thresh]})).mark_rule(color = 'red').encode(
        x = 'x')

    #Horizontal lines for upper and lower thresholds (scatter)
    minus_std_horz = alt.Chart(df).mark_rule(color = 'red').encode(
        y = alt.Y(datum = lower_thresh)
    )

    plus_std_horz = alt.Chart(df).mark_rule(color = 'red').encode(
        y = alt.Y(datum = upper_thresh)
    )

    #Vertical Lines for SGE Score Thresholds
    LFClower = alt.Chart(df).mark_rule(color = 'red').encode(
        x = alt.X(datum = (sge_threshold[0]))
    )

    LFCupper = alt.Chart(df).mark_rule(color = 'red').encode(
        x = alt.X(datum = (sge_threshold[1]))
    )

    #Base scatter plot of RNA/DNA ratio vs. SGE score
    chart = alt.Chart(df).mark_circle().encode(
        x = alt.X('snv_score',
                  scale = alt.Scale(domain = [-0.4, 0.1])
                 ),
        y = alt.Y('RNA/DNA',
                  scale = alt.Scale(domain = [-0.5, 2.5])
                 ),
        color = alt.Color('Consequence',
                          sort = consequence_order
                         ),
        tooltip = [alt.Tooltip('target', title = 'SGE Target: '),
                   alt.Tooltip('snv_score', title = 'SGE Score: '),
                   alt.Tooltip('RNAfreq', title = 'D05 RNA Frequency: '),
                   alt.Tooltip('Consequence', title = 'Consequence: '),
                   alt.Tooltip('AAsub', title = 'Substitution: '),
                   alt.Tooltip('pos_id', title = 'Variant: ')
                  ]
    )

    #Scatter of median collapsed RNA freq vs. median collapsed DNA freq (optional, see display below)
    scatter = alt.Chart(df).mark_circle().encode(
        x = 'DNAfreq',
        y = 'RNAfreq',
        color = alt.Color('Consequence',
                          sort = consequence_order
                         )
    ).facet(
        facet = alt.Facet('target')
    ).resolve_scale(
        x = 'independent',
        y = 'independent'
    ).interactive()

    #Base histogram for visualizing distribution of RNA/DNA ratios
    base_histogram = alt.Chart(df).mark_bar().encode(
        alt.X('RNA/DNA:Q',
                  bin = alt.Bin(maxbins = 100, step = 0.1),
                  scale = alt.Scale(
                      domain = [0,3]
                  )
                 ),
        y = 'count()',
        color = alt.Color('Consequence',
                          sort = consequence_order
                         )
    ).interactive()

    histogram = base_histogram + minus_std + plus_std #Threshold lines added to histogram

    #Base scatter plot faceted by target (natural sort order), 5 facets per row
    targets = natsorted(list(set(df['target'].tolist())))
    chart = chart + minus_std_horz + plus_std_horz + LFCupper + LFClower
    chart = chart.facet(facet = alt.Facet(
        'target',
        sort = targets
        ),columns = 5
    ).interactive()

    chart.display()
    #scatter.display()
    histogram.display()

    return thresholds

In [None]:
def rnafreq_vs_sge_beta(df, sge_threshold):
    """Display QC charts for target-normalised RNA/DNA ratios and return fixed thresholds.

    The thresholds are the constants ``(0, 1)`` rather than values estimated from
    ``df``; they are printed, drawn on the charts and returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Target-level table with the columns ``target``, ``pos_id``, ``Consequence``,
        ``AAsub``, ``snv_score``, ``RNAfreq``, ``DNAfreq`` and ``RNA/DNA``.
    sge_threshold : sequence
        ``(lower, upper)`` SGE score thresholds, drawn as vertical lines.

    Returns
    -------
    tuple
        ``(0, 1)``.

    Side effects: prints the thresholds and displays a faceted RNA/DNA vs. SGE-score
    scatter plot and a histogram of RNA/DNA ratios.
    """
    consequence_order = ['synonymous_variant', 'missense_variant', 'stop_gained']

    upper_thresh = 1 #Fixed upper threshold
    lower_thresh = 0 #Fixed lower threshold

    thresholds = (lower_thresh, upper_thresh) #Tuple to be passed onto next functions
    print(thresholds)

    #Vertical lines for upper and lower thresholds (histogram)
    minus_std = alt.Chart(pd.DataFrame({'x': [lower_thresh]})).mark_rule(color = 'red').encode(
        x = 'x')
    plus_std = alt.Chart(pd.DataFrame({'x': [upper_thresh]})).mark_rule(color = 'red').encode(
        x = 'x')

    #Horizontal lines for upper and lower thresholds (scatter)
    minus_std_horz = alt.Chart(df).mark_rule(color = 'red').encode(
        y = alt.Y(datum = lower_thresh)
    )

    plus_std_horz = alt.Chart(df).mark_rule(color = 'red').encode(
        y = alt.Y(datum = upper_thresh)
    )

    #Vertical Lines for SGE Score Thresholds
    LFClower = alt.Chart(df).mark_rule(color = 'red').encode(
        x = alt.X(datum = (sge_threshold[0]))
    )

    LFCupper = alt.Chart(df).mark_rule(color = 'red').encode(
        x = alt.X(datum = (sge_threshold[1]))
    )

    #Base scatter plot of RNA/DNA ratio vs. SGE score
    chart = alt.Chart(df).mark_circle().encode(
        x = alt.X('snv_score',
                  scale = alt.Scale(domain = [-0.4, 0.1])
                ),
        y = alt.Y('RNA/DNA',
                  scale = alt.Scale(domain = [-0.5, 2.5])
                 ),
        color = alt.Color('Consequence',
                          sort = consequence_order
                         ),
        tooltip = [alt.Tooltip('target', title = 'SGE Target: '),
                   alt.Tooltip('snv_score', title = 'SGE Score: '),
                   alt.Tooltip('RNAfreq', title = 'D05 RNA Frequency: '),
                   alt.Tooltip('Consequence', title = 'Consequence: '),
                   alt.Tooltip('AAsub', title = 'Substitution: '),
                   alt.Tooltip('pos_id', title = 'Variant: ')
                  ]
    )

    #Scatter of median collapsed RNA freq vs. median collapsed DNA freq (optional, see display below)
    scatter = alt.Chart(df).mark_circle().encode(
        x = 'DNAfreq',
        y = 'RNAfreq',
        color = alt.Color('Consequence',
                          sort = consequence_order
                         )
    ).facet(
        facet = alt.Facet('target')
    ).resolve_scale(
        x = 'independent',
        y = 'independent'
    ).interactive()

    #Base histogram for visualizing distribution of RNA/DNA ratios
    base_histogram = alt.Chart(df).mark_bar().encode(
        alt.X('RNA/DNA:Q',
                  bin = alt.Bin(maxbins = 100, step = 0.1),
                  scale = alt.Scale(
                      domain = [0,3]
                  )
                 ),
        y = 'count()',
        color = alt.Color('Consequence',
                          sort = consequence_order
                         )
    ).interactive()

    histogram = base_histogram + minus_std + plus_std #Threshold lines added to histogram

    #Base scatter plot faceted by target (natural sort order), 5 facets per row
    targets = natsorted(list(set(df['target'].tolist())))
    chart = chart + minus_std_horz + plus_std_horz + LFCupper + LFClower
    chart = chart.facet(facet = alt.Facet(
        'target',
        sort = targets
        ),columns = 5
    ).interactive()

    chart.display()
    #scatter.display()
    histogram.display()

    return thresholds

In [None]:
def get_abnormal_rna(df, cds, corr_dict, thresholds): #Gets final dataframe of variants with abnormal RNA performance
    """Deduplicate double-covered variants and classify each variant's RNA performance.

    Parameters
    ----------
    df : pandas.DataFrame
        Target-level table with the columns ``target_id`` (``<target>:<pos_id>``),
        ``target``, ``pos_id``, ``AAsub``, ``Consequence``, ``snv_score``,
        ``functional_consequence``, ``RNA/DNA`` and ``RNA/DNA_sem``. It is not modified.
    cds : str or Path
        Excel workbook with the columns ``pos`` and ``CDSpos``.
    corr_dict : dict
        Maps each SGE target to its RNA/DNA frequency correlation.
    thresholds : tuple
        ``(min_threshold, max_threshold)`` for the ``RNA/DNA`` ratio.

    Returns
    -------
    pandas.DataFrame
        One row per variant that also has an entry in the CDS table (inner merge on
        ``pos``), with ``RNA_classification`` set to ``'low'``, ``'high'`` or ``'normal'``.

    Notes
    -----
    For a variant covered by several targets, only the row from the target with the
    highest correlation is kept. Ties and missing (NaN) correlations are resolved
    deterministically in favour of the alphabetically first target. The number and
    percentage of low-RNA, functionally normal variants is printed; the percentage is
    taken over the deduplicated table before the CDS merge.
    """
    min_threshold, max_threshold = thresholds #Gets thresholds

    def _corr(lib):
        # NaN correlations rank lowest so that a defined correlation always wins
        value = corr_dict[lib]
        return float('-inf') if value != value else value

    # Collect the target_ids of every losing library first, then filter once
    to_remove = []
    for position, data in df.groupby('pos_id'):
        libs = sorted(set(data['target'].tolist()))
        if len(libs) < 2:
            continue
        best = max(libs, key = _corr) #first of the sorted libs wins a tie
        to_remove.extend(lib + ':' + position for lib in libs if lib != best)

    df = df.loc[~df['target_id'].isin(to_remove)].copy() #copy: the caller's frame stays untouched

    cds_pos = pd.read_excel(cds)

    #Classifies variant RNA performance
    df['RNA_classification'] = 'normal'
    df.loc[df['RNA/DNA'] > max_threshold, 'RNA_classification'] = 'high'
    df.loc[df['RNA/DNA'] < min_threshold, 'RNA_classification'] = 'low'

    lowRNA_funcNormal = len(df.loc[(df['RNA_classification'] == 'low') & (df['functional_consequence'] == 'functionally_normal')])
    perc_misclassified = (lowRNA_funcNormal / len(df)) * 100 if len(df) else 0.0

    # Genomic position is the part of pos_id before the first ':'
    df['pos'] = df['pos_id'].str.split(':').str[0].astype(int)

    df = pd.merge(df, cds_pos, on = 'pos', how = 'inner')

    df = df[['target', 'pos_id', 'AAsub', 'Consequence', 'snv_score', 'functional_consequence', 'RNA/DNA', 'RNA/DNA_sem', 'RNA_classification', 'pos', 'CDSpos']]

    print('% Low RNA & Functionally Normal: ', str(perc_misclassified),'\n',
         '# Low RNA & Functionally Normal: ', str(lowRNA_funcNormal))
    df = df.reset_index(drop = True)

    return df

In [None]:
def merge_dna(dict, gdna_scores): #This function merges the DNA SGE scores the median RNA scores for each variant
    """Merge per-target RNA scores with the DNA SGE scores and add a cross-target median.

    Parameters
    ----------
    dict : dict
        Maps keys to RNA-score dataframes with the columns ``target``, ``pos_id`` and
        ``RNAscore_med``. The parameter name shadows the builtin; it is kept for
        backward compatibility.
    gdna_scores : pandas.DataFrame
        DNA SGE score table with ``pos_id``, ``Consequence``, ``AAsub``, ``snv_score``
        plus the columns ``chrom``, ``pos``, ``allele``, ``R1_score``, ``R2_score``,
        ``R3_score`` and ``target``, which are dropped before merging. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``target``, ``pos_id``, ``Consequence``, ``AAsub``, ``snv_score``,
        ``target_RNAscore`` (the score within that target) and ``RNAscore`` (median of
        the target scores of the variant).
    """
    # The body used to read an undefined name ('dna_scores') instead of the parameter.
    dna_scores = gdna_scores.drop(columns = ['chrom', 'pos', 'allele', 'R1_score', 'R2_score', 'R3_score', 'target']) #Drops these columns

    df = pd.concat(list(dict.values())) #Concatenates all RNAscore dataframes

    merged = pd.merge(df, dna_scores, how = 'inner', on = 'pos_id') #Merges based on position ID

    # Median RNA score per variant across every target that covers it
    multi_target_vars = merged.groupby('pos_id').agg({'RNAscore_med': 'median'}).reset_index()

    merged = pd.merge(multi_target_vars, merged, how = 'left', on = 'pos_id')
    merged = merged.rename(columns = {'RNAscore_med_x': 'RNAscore', 'RNAscore_med_y': 'target_RNAscore'})
    final_df = merged.loc[:, ['target', 'pos_id', 'Consequence', 'AAsub', 'snv_score', 'target_RNAscore', 'RNAscore']]

    return final_df

In [None]:
def main():
    """Run the RNA-scoring pipeline with the module-level input paths.

    Steps: read counts and SGE data, compute RNA/DNA frequencies and ratios, annotate
    and correlate them per target, aggregate replicates, derive thresholds with both
    the whole-gene and the target-by-target normalisation, and classify variants with
    each. The result selected by ``output`` is printed; writing to Excel is currently
    disabled (see the commented block at the end).
    """
    all_counts, sge_scores, sge_thresholds = read_counts(rna_count_folder, dna_count_folder, sge_score, sge_threshold_file)

    #rna_v_dna_plot(all_counts)

    rna_scored, sge_targets = get_rna_scores(all_counts)
    merged_df, rna_dna_corr = rna_dna_freq_corr_plots(rna_scored, sge_scores)

    #Code for Whole-Gene Synonymous Normalization
    final_df = agg_rnafreq(merged_df)
    print('Whole gene, syn. distribution: ')
    threshold = rnafreq_vs_sge(final_df, sge_thresholds) #SGE thresholds are required for the chart's vertical lines

    #Code for Target-by-Target Synonymous Normalization
    final_df_mean_norm = normalize_scores_mean(final_df)
    print('Target by target, syn. distribution: ')
    target_threshold = rnafreq_vs_sge_beta(final_df_mean_norm, sge_thresholds)

    # get_abnormal_rna takes (df, cds, corr_dict, thresholds); it has no SGE-threshold parameter
    print('Whole Gene QC: ')
    whole_gene_output = get_abnormal_rna(final_df, cds_pos, rna_dna_corr, threshold)
    print('Target-by-Target QC: ')
    target_by_target_output = get_abnormal_rna(final_df_mean_norm, cds_pos, rna_dna_corr, target_threshold)

    output = 'Whole' #'Whole' or 'Target': which normalisation to report

    if output == 'Whole':
        final_df = whole_gene_output
        final_threshold = threshold
        output_path_str = '../Data/20250813/20250813_BARD1_finaldata_RNA_wholeGene.xlsx'
    elif output == 'Target':
        final_df = target_by_target_output #was '==' (a comparison), which left final_df unchanged
        final_threshold = target_threshold
        output_path_str = '../Data/20250813_BARD1_finaldata_RNA_Targetbased.xlsx'
    else:
        raise ValueError(f"output must be 'Whole' or 'Target', got {output!r}")

    print(final_df)
    min_threshold, max_threshold = final_threshold

    threshold_df = pd.DataFrame({'min': [min_threshold], 'max': [max_threshold]})

    dfs = {'data': final_df,
           'thresholds': threshold_df
          }

    # Excel export (disabled). Uncomment to write one sheet per entry of `dfs`:
    # with pd.ExcelWriter(output_path_str) as writer:
    #     for sheet_name, df in dfs.items():
    #         df.to_excel(writer, sheet_name=sheet_name, index=False)

    #whole_gene_output.to_excel('../Data/20250813/20250813_BARD1_finaldata_RNA_wholeGene.xlsx', index = False)
    #target_by_target_output.to_excel('../Data/20250813_BARD1_finaldata_RNA_Targetbased.xlsx', index = False)
    

In [None]:
# Run the whole pipeline: reads the input files configured above and prints the selected result.
main()