In [None]:
import pandas as pd
import altair as alt
import numpy as np
from scipy import stats
from pathlib import Path

In [None]:
#Notebook configuration: input locations that main() passes to read_counts()
rna_count_folder = '../Data/RNAscoring_dev_data/RNA_counts/' #Folder searched for RNA count files (*.tsv)
dna_count_folder = '../Data/RNAscoring_dev_data/DNA_counts/' #Folder searched for DNA count files (*.tsv)
#Previously used score file, kept for reference:
#sge_score = '../Data/20250122_BARD1_SGEscores_wAAsub.xlsx' #path to SGE score file
#NOTE(review): absolute, machine-specific path -- the notebook only runs unchanged on the author's machine; consider a path relative to the repository like the two folders above
sge_score = '/Users/ivan/Desktop/20250428_BARD1_snvscores_IGVFupload_filtered.xlsx'
#Disables Altair's maximum-row check so the full variant tables can be charted
alt.data_transformers.disable_max_rows()

In [None]:
def read_counts(rna_counts, dna_counts, sge_file): #Reads RNA/DNA count files and merges them. Also reads SGE scores
    """Read the SGE score table and build one merged RNA/DNA count table per RNA count file.

    The ``*.tsv`` files of both folders are sorted by path and paired by position,
    so the two folders must sort into the same replicate order. DNA files beyond
    the number of RNA files are ignored (same as the previous behaviour).

    Parameters
    ----------
    rna_counts : str or Path
        Folder holding the RNA count files. Each file needs the columns
        ``sampleid``, ``target``, ``chrom``, ``pos``, ``allele`` and ``count``.
    dna_counts : str or Path
        Folder holding the DNA count files (same columns as the RNA files).
    sge_file : str or Path
        Excel workbook with the SGE scores; it must contain a ``pos`` column.

    Returns
    -------
    tuple
        ``(all_rep_counts, dna_scores)``. ``all_rep_counts`` maps the RNA target
        name joined with the replicate label (e.g. ``'<target>rep_1'``) to a
        dataframe with the columns ``target_rep``, ``target`` (taken from the DNA
        file), ``pos_id``, ``RNAcount`` and ``DNAcount``. ``dna_scores`` is the
        score table with ``consequence``/``score``/``amino_acid_change`` renamed
        to ``Consequence``/``snv_score``/``AAsub`` and ``pos`` cast to ``str``.

    Raises
    ------
    ValueError
        If there are fewer DNA than RNA count files, if an RNA count file has no
        rows, or if the sample ID of an RNA file matches none of the known
        replicate patterns.
    """

    dna_scores = pd.read_excel(sge_file) #Reads DNA SGE Scores
    dna_scores = dna_scores.rename(columns = {'consequence': 'Consequence', 'score': 'snv_score', 'amino_acid_change': 'AAsub'})
    dna_scores['pos'] = dna_scores['pos'].astype(str)

    #Sorted so that RNA and DNA files line up replicate by replicate
    rna_outputs = sorted(Path(rna_counts).glob('*.tsv'))
    dna_outputs = sorted(Path(dna_counts).glob('*.tsv'))

    if len(dna_outputs) < len(rna_outputs):
        raise ValueError(
            f'Found {len(rna_outputs)} RNA count files but only {len(dna_outputs)} DNA count files; '
            'every RNA file needs a matching DNA file.'
        )

    #Sample-ID fragments identifying each replicate; checked in this order, first match wins
    replicate_patterns = (
        ('rep_3', ('R3R6', 'R7R8R9')),
        ('rep_2', ('R2R5', 'R4R5R6')),
        ('rep_1', ('R1R4', 'R1R2R3')),
    )

    all_rep_counts = {} #Dictionary to hold merged RNA/DNA count dataframes

    for rna_file, dna_file in zip(rna_outputs, dna_outputs): #Iterates through RNA/DNA files paired by sort position
        rna_df = pd.read_csv(rna_file, sep = '\t') #Reads RNA count file
        if rna_df.empty:
            raise ValueError(f'RNA count file {rna_file} contains no rows.')

        sampleid = rna_df['sampleid'].iloc[0] #Gets sample ID from first row

        #Replicate label is derived from the sample ID. An unknown ID is an error: guessing
        #would silently mislabel the replicate and overwrite another entry of the dictionary.
        rep = next(
            (label for label, fragments in replicate_patterns
             if any(fragment in sampleid for fragment in fragments)),
            None
        )
        if rep is None:
            raise ValueError(f'Cannot determine the replicate from sample ID {sampleid!r} in {rna_file}.')

        rna_df['target_rep'] = rna_df['target'] + rep #Column that has target and replicate number
        rna_df = rna_df.rename(columns = {'count': 'RNAcount'}) #Renames count column in RNA file
        rna_df['pos'] = rna_df['pos'].astype(str) #String data type needed to build the position ID
        rna_df['pos_id'] = rna_df['pos'] + ':' + rna_df['allele'] #Position ID is genomic coordinate : allele

        dna_df = pd.read_csv(dna_file, sep = '\t') #Reads DNA file
        dna_df = dna_df.rename(columns = {'count': 'DNAcount'}) #Renames count column in DNA file
        dna_df['pos'] = dna_df['pos'].astype(str)
        dna_df['pos_id'] = dna_df['pos'] + ':' + dna_df['allele']

        #Inner merge keeps only the variants counted in both RNA and DNA
        merged = pd.merge(rna_df, dna_df, how = 'inner', on = 'pos_id')
        merged = merged.drop(columns = ['sampleid_x', 'sampleid_y', 'target_x', 'chrom_x', 'chrom_y', 'pos_x', 'pos_y', 'allele_x', 'allele_y']) #Drops duplicate columns
        merged = merged.rename(columns = {'target_y': 'target'}) #Keeps the target name of the DNA file
        merged = merged.loc[:, ['target_rep', 'target', 'pos_id', 'RNAcount', 'DNAcount']] #Columns reordered

        all_rep_counts[rna_df['target_rep'].iloc[0]] = merged #Merged dataframe added to dictionary

    return all_rep_counts, dna_scores

In [None]:
def rna_v_dna_plot(all_counts): #Scatter plots of DNA count against RNA count, one panel per target/replicate
    """Display a faceted scatter plot (with per-panel regression line) of DNA vs. RNA counts.

    Parameters
    ----------
    all_counts : dict
        Maps a target/replicate name to a dataframe with the columns
        ``target_rep``, ``pos_id``, ``RNAcount`` and ``DNAcount``.

    Returns
    -------
    None
        The chart is displayed as a side effect.
    """

    #One long table holding every replicate/region
    stacked = pd.concat([all_counts[name] for name in all_counts.keys()])

    #Removes variants with fewer than 50 reads in either library
    enough_rna = ~(stacked['RNAcount'] < 50)
    enough_dna = ~(stacked['DNAcount'] < 50)
    stacked = stacked[enough_rna & enough_dna]

    hover = [
        alt.Tooltip('RNAcount', title = 'RNA Count: '),
        alt.Tooltip('DNAcount', title = 'DNA Count: '),
        alt.Tooltip('pos_id', title = 'Var: ')
    ]

    #RNA count (x) vs. DNA count (y) for every variant
    points = alt.Chart(stacked).mark_circle().encode(
        x = alt.X('RNAcount', title = 'RNA Count'),
        y = alt.Y('DNAcount', title = 'DNA Count'),
        tooltip = hover
    )

    #Regression line fitted separately for each target/replicate
    fit_line = alt.Chart(stacked).transform_regression(
        'RNAcount', 'DNAcount',
        groupby = ['target_rep']
    ).mark_line(color = 'blue').encode(
        x = 'RNAcount',
        y = 'DNAcount'
    )

    #One panel per target/replicate, 6 panels per row, each with its own axes
    layered = points + fit_line
    faceted = layered.facet(
        facet = alt.Facet('target_rep', title = 'DNA vs. RNA Counts')
    )
    faceted = faceted.resolve_scale(x = 'independent', y = 'independent')
    faceted = faceted.properties(columns = 6).interactive()

    faceted.display()


In [None]:
def get_rna_scores(rep_dict, min_count = 50): #Handles RNA scoring of variants (note, as of 4/25/2025, RNA scores generated are no longer used downstream, but RNA/DNA counts/frequencies are)
    """Compute RNA/DNA frequencies, ratios and log2 RNA scores for every replicate table.

    Parameters
    ----------
    rep_dict : dict
        Maps a target/replicate name to a dataframe with the columns
        ``target_rep``, ``target``, ``RNAcount`` and ``DNAcount``.
    min_count : int, optional
        Minimum read count a variant needs in both RNA and DNA to be kept
        (default 50). Variants with a zero count are always dropped.

    Returns
    -------
    tuple
        ``(rna_scored, targets)``. ``rna_scored`` maps the ``target_rep`` value of
        each table to the filtered table extended with ``DNAfreq``, ``RNAfreq``,
        ``RNA/DNA``, ``RNA/DNA_freq``, ``RNA/DNA_freq/corr`` and ``RNAscore``.
        ``targets`` lists the distinct ``target`` values in first-seen order.

    Notes
    -----
    A replicate with no variant left after filtering is skipped with a warning;
    it previously caused an error in the regression step.
    """
    import warnings

    rna_scored = {} #Dictionary to hold RNA-scored dataframes
    targets = [] #List of SGE targets present

    for rep_name, counts in rep_dict.items():
        #Zero counts are dropped explicitly so that min_count = 0 cannot lead to division by zero
        nonzero = counts[~((counts['RNAcount'] == 0) | (counts['DNAcount'] == 0))]
        df = nonzero[~((nonzero['RNAcount'] < min_count) | (nonzero['DNAcount'] < min_count))].copy()
        df = df.reset_index(drop = True) #Resets index

        if df.empty:
            warnings.warn(f'No variants in {rep_name!r} pass the read-count filter; replicate skipped.')
            continue

        df['DNAfreq'] = df['DNAcount'] / df['DNAcount'].sum() #DNA frequency within the replicate
        df['RNAfreq'] = df['RNAcount'] / df['RNAcount'].sum() #RNA frequency within the replicate

        #Slope of DNA frequency regressed on RNA frequency, used to scale the frequency ratio
        slope = stats.linregress(df['RNAfreq'], df['DNAfreq']).slope

        df['RNA/DNA'] = df['RNAcount'] / df['DNAcount']
        df['RNA/DNA_freq'] = df['RNAfreq'] / df['DNAfreq']
        df['RNA/DNA_freq/corr'] = df['RNA/DNA_freq'] / slope
        df['RNAscore'] = np.log2(df['RNAfreq'] / df['DNAfreq']) #RNA score is the log2 frequency ratio

        rna_scored[df['target_rep'].iloc[0]] = df #RNA-scored dataframe is added to the dictionary

        target = df['target'].iloc[0]
        if target not in targets: #Records each SGE target once
            targets.append(target)

    return rna_scored, targets
        

In [None]:
def get_rna_scores_beta(all_counts): #RNA scoring based on residual distance from trendline instead of RNA to DNA ratio on day 5. (Deprecated 4/25/2025)
    """Score variants by their residual from the per-replicate DNA-vs-RNA frequency regression.

    Parameters
    ----------
    all_counts : dict
        Maps a target/replicate name to a dataframe with the columns
        ``target_rep``, ``RNAcount`` and ``DNAcount``.

    Returns
    -------
    dict
        Maps every ``target_rep`` value to its scored rows. Added columns are
        ``DNAfreq``, ``RNAfreq``, ``y_pred``, ``residuals`` and ``RNAscore``
        (residual centred on the replicate median and divided by the replicate
        interquartile range; NaN when that range is not positive).
    """

    def calculate_residuals(data):
        """Filter one replicate, fit DNAfreq ~ RNAfreq and standardise the residuals."""
        #Drops zero-count rows, then rows with fewer than 100 reads in either library
        data = data[~((data['RNAcount'] == 0) | (data['DNAcount'] == 0))]
        data = data[~((data['RNAcount'] < 100) | (data['DNAcount'] < 100))].copy()

        data['DNAfreq'] = data['DNAcount'] / data['DNAcount'].sum()
        data['RNAfreq'] = data['RNAcount'] / data['RNAcount'].sum()

        fit = stats.linregress(data['RNAfreq'], data['DNAfreq']) #Performs linear regression

        data['y_pred'] = fit.slope * data['RNAfreq'] + fit.intercept #Predicted DNAfreq for each RNAfreq
        data['residuals'] = -(data['DNAfreq'] - data['y_pred']) #Signed vertical distance from the trend line

        residual_median = data['residuals'].median()
        residual_iqr = stats.iqr(data['residuals'])

        #The interquartile range is the divisor, so it is the quantity that must be positive
        #(it can be zero even when the standard deviation is not)
        if residual_iqr > 0:
            data['std_per_distance'] = (data['residuals'] - residual_median) / residual_iqr
        else:
            data['std_per_distance'] = np.nan

        return data

    df = pd.concat(list(all_counts.values()))

    #Explicit loop instead of groupby().apply(): apply() on the grouping column is deprecated
    #and would drop 'target_rep', which is needed to regroup below
    scored = pd.concat([calculate_residuals(group) for _, group in df.groupby('target_rep')])
    scored = scored.reset_index(drop = True)
    scored = scored.rename(columns = {'std_per_distance': 'RNAscore'})

    return {rep: data for rep, data in scored.groupby('target_rep')}

In [None]:
def rna_dna_freq_corr_plots(dict, sge_scores): #Builds scatterplots of RNA frequency vs. DNA frequency for all 3 replicates for an SGE target
    """Annotate the scored replicate tables with SGE scores and plot RNAfreq against DNAfreq.

    Parameters
    ----------
    dict : dict
        Maps a target/replicate name to a dataframe with (at least) the columns
        ``target_rep``, ``target``, ``pos_id``, ``RNAfreq`` and ``DNAfreq``.
    sge_scores : pandas.DataFrame
        Score table joined on ``pos_id``; it has to provide ``AAsub``,
        ``Consequence``, ``snv_score`` and ``functional_consequence``.

    Returns
    -------
    pandas.DataFrame
        The annotated rows restricted to missense, synonymous and stop-gained
        variants. The chart (one row per target, one panel per replicate) is
        displayed as a side effect.
    """

    output_columns = ['target_rep', 'target', 'pos_id', 'RNAfreq', 'DNAfreq', 'AAsub', 'Consequence', 'snv_score', 'functional_consequence']
    plotted_consequences = ['missense_variant', 'synonymous_variant', 'stop_gained']

    df = pd.concat(list(dict.values())) #Complete dataframe of all replicates
    df = pd.merge(df, sge_scores, on = 'pos_id', how = 'left') #Left merge keeps variants without an SGE score
    df = df.rename(columns = {'target_x': 'target'}) #When both tables carry 'target', the count-table one is kept
    df = df[output_columns]
    df = df.loc[df['Consequence'].isin(plotted_consequences)]

    #Axis assignment and labels shared by every panel
    x_col, y_col = 'DNAfreq', 'RNAfreq'
    rna, dna = 'RNAfreq', 'DNAfreq'
    replicate_labels = (('rep_1', 'Rep 1'), ('rep_2', 'Rep 2'), ('rep_3', 'Rep 3'))

    charts = [] #One row of panels per SGE target
    for target, data in df.groupby('target'):
        target_charts = []

        for rep_tag, replicate in replicate_labels:
            rep = data.loc[data['target_rep'].str.contains(rep_tag)].reset_index(drop = True)

            corr_pearson = np.corrcoef(rep[x_col], rep[y_col])[0, 1]

            #End points of the y = x reference line
            low = min(rep[x_col].min(), rep[y_col].min())
            high = max(rep[x_col].max(), rep[y_col].max())
            line_df = pd.DataFrame({'x': [low, high], 'y': [low, high]})

            hover = [
                alt.Tooltip('pos_id', title = 'Variant: '),
                alt.Tooltip(x_col),
                alt.Tooltip(y_col),
                alt.Tooltip('AAsub', title = 'Substitution: '),
                alt.Tooltip('snv_score', title = 'Score: ')
            ]

            scatter = alt.Chart(rep).mark_circle(opacity = 0.7).encode(
                x = alt.X(x_col, title = f'{x_col} '),
                y = alt.Y(y_col, title = f'{y_col} '),
                color = alt.Color('Consequence', sort = ['synonymous_variant', 'missense_variant', 'stop_gained']),
                tooltip = hover
            )

            line = alt.Chart(line_df).mark_line(
                color = 'red',
                strokeDash = [4, 4]
            ).encode(x = 'x:Q', y = 'y:Q')

            #Optional trendline layer (not added to the chart):
            #   alt.Chart(rep).transform_regression('DNAfreq', 'RNAfreq', groupby=['target_rep'])
            #       .mark_line(color='blue').encode(x='DNAfreq', y='RNAfreq')

            panel_title = target + ' ' + replicate + ' ' + f'{rna} vs {dna} (r = {corr_pearson:.3f})'
            panel = (scatter + line).properties(
                width = 250,
                height = 250,
                title = panel_title
            ).interactive()

            target_charts.append(panel)

        charts.append(alt.hconcat(*target_charts)) #Three replicate panels side by side

    #Targets stacked vertically
    final_chart = alt.vconcat(*charts).properties(title = '')
    final_chart.display()

    return df

In [None]:
def agg_rnafreq(df): #Aggregates replicate dataframes into one target-level dataframe for each target
    """Collapse replicate rows into one row per variant and target.

    Parameters
    ----------
    df : pandas.DataFrame
        Replicate-level rows with the columns ``target``, ``pos_id``, ``RNAfreq``,
        ``DNAfreq``, ``AAsub``, ``Consequence``, ``snv_score`` and
        ``functional_consequence``. The dataframe is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per ``target_id`` (``target:pos_id``) with the median ``RNAfreq``
        and ``DNAfreq`` across replicates, the first value of every annotation
        column, and the derived columns ``RNA/DNA`` (ratio of the medians) and
        ``L2RNA/DNA`` (its log2).
    """

    value_columns = ['target', 'pos_id', 'RNAfreq', 'DNAfreq', 'AAsub', 'Consequence', 'snv_score', 'functional_consequence']

    #assign() returns a new frame: the column subset is not written to in place,
    #which avoids pandas' SettingWithCopyWarning
    per_replicate = df[value_columns].assign(
        target_id = lambda frame: frame['target'] + ':' + frame['pos_id']
    )

    summary_df = per_replicate.groupby('target_id').agg({
        'target': 'first',
        'pos_id': 'first',
        'RNAfreq': 'median', #Median RNA frequency between replicates
        'DNAfreq': 'median', #Median DNA frequency between replicates
        'AAsub': 'first',
        'Consequence': 'first',
        'snv_score': 'first',
        'functional_consequence': 'first'
    }).reset_index()

    summary_df['RNA/DNA'] = summary_df['RNAfreq'] / summary_df['DNAfreq'] #Gets RNA/DNA ratio
    summary_df['L2RNA/DNA'] = np.log2(summary_df['RNA/DNA']) #Log2 of RNA/DNA ratio (not in use)

    return summary_df

In [None]:
def normalize_scores(df):
    """Rescale ``RNA/DNA`` within each target by a reference level of its synonymous/missense variants.

    For every ``target`` the reference level is the mean of the 75th and 90th
    percentile of ``RNA/DNA`` among synonymous and missense variants; all
    ``RNA/DNA`` values of that target are divided by it.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``target``, ``Consequence`` and ``RNA/DNA``. The
        dataframe is not modified.

    Returns
    -------
    pandas.DataFrame
        A new dataframe, ordered by target, with a fresh integer index and the
        rescaled ``RNA/DNA`` column. A target without any synonymous or missense
        variant gets NaN ratios because its reference level is undefined.
    """
    reference_consequences = ['synonymous_variant', 'missense_variant']

    normalized_dfs = []
    for _, data in df.groupby('target'):
        reference = data.loc[data['Consequence'].isin(reference_consequences), 'RNA/DNA']
        percentile_90 = reference.quantile(0.90)
        percentile_75 = reference.quantile(0.75)
        reference_level = (percentile_75 + percentile_90) / 2

        #assign() builds a new frame instead of writing into the groupby chunk
        normalized_dfs.append(data.assign(**{'RNA/DNA': data['RNA/DNA'] / reference_level}))

    if not normalized_dfs: #No rows/targets at all: nothing to rescale
        return df.reset_index(drop = True)

    return pd.concat(normalized_dfs).reset_index(drop = True)
        

In [None]:
def qc_plots(dict): #Replicate vs. replicate correlation plots of RNA frequencies (not in use)
    """Display replicate-vs-replicate scatter plots of ``RNAfreq`` for every SGE target.

    Parameters
    ----------
    dict : dict
        Maps a target/replicate name to a dataframe with the columns
        ``target``, ``target_rep``, ``pos_id`` and ``RNAfreq``.

    Returns
    -------
    None
        The chart (one row per target, one panel per replicate pair, Pearson r
        in the panel title) is displayed as a side effect.
    """

    df = pd.concat(list(dict.values())) #Complete dataframe of all replicates

    replicate_tags = (('rep1', 'rep_1'), ('rep2', 'rep_2'), ('rep3', 'rep_3'))
    replicate_pairs = (('rep1', 'rep2'), ('rep1', 'rep3'), ('rep2', 'rep3'))

    charts = []
    for target, data in df.groupby('target'):
        #Rows of each replicate, keyed by the short label used in column suffixes and titles
        replicates = {
            label: data.loc[data['target_rep'].str.contains(tag)].reset_index(drop = True)
            for label, tag in replicate_tags
        }

        target_charts = []
        for x_rep, y_rep in replicate_pairs:
            #Variants present in both replicates; shared columns get a replicate suffix
            merged_df = pd.merge(
                replicates[x_rep], replicates[y_rep],
                on = 'pos_id', how = 'inner',
                suffixes = (f'_{x_rep}', f'_{y_rep}')
            )
            x_col = f'RNAfreq_{x_rep}'
            y_col = f'RNAfreq_{y_rep}'

            corr_pearson = np.corrcoef(merged_df[x_col], merged_df[y_col])[0, 1] #Pearson correlation

            #End points of the y = x reference line
            min_val = min(merged_df[x_col].min(), merged_df[y_col].min())
            max_val = max(merged_df[x_col].max(), merged_df[y_col].max())
            line_df = pd.DataFrame({
                'x': [min_val, max_val],
                'y': [min_val, max_val]
            })

            scatter = alt.Chart(merged_df).mark_circle(opacity = 0.7).encode(
                x = alt.X(x_col, title = f'{x_col} '),
                y = alt.Y(y_col, title = f'{y_col} '),
                tooltip = [alt.Tooltip('pos_id', title = 'Variant: '),
                           alt.Tooltip(x_col),
                           alt.Tooltip(y_col)
                          ]
            )

            line = alt.Chart(line_df).mark_line(
                color = 'red',
                strokeDash = [4, 4]
            ).encode(
                x = 'x:Q',
                y = 'y:Q'
            )

            combined = (scatter + line).properties(
                width = 250,
                height = 250,
                title = target + ' ' + f'{x_rep} vs {y_rep} (r = {corr_pearson:.3f})'
            ).interactive()

            target_charts.append(combined)

        charts.append(alt.hconcat(*target_charts)) #Three replicate-pair panels side by side

    #Targets stacked vertically
    final_chart = alt.vconcat(*charts).properties(
        title = ''
    )

    final_chart.display()



In [None]:
def oldvsnew_scores(old_scores, new_scores):
    """Display a scatter plot comparing two RNA score tables and print the joined table.

    Parameters
    ----------
    old_scores, new_scores : pandas.DataFrame
        Both need the columns ``pos_id`` and ``RNAscore``. Every row of
        ``old_scores`` is kept (left merge on ``pos_id``).

    Returns
    -------
    None
    """
    #After the merge 'RNAscore_x' comes from old_scores and 'RNAscore_y' from new_scores
    comparison = pd.merge(old_scores, new_scores, how = 'left', on = 'pos_id')

    alt.Chart(comparison).mark_circle().encode(
        x = 'RNAscore_x',
        y = 'RNAscore_y'
    ).display()

    print(comparison)

In [None]:
def collapse_scores(rna_scored, targets): #Collapses RNA scores between replicates to a median
    """Reduce the replicate-level RNA scores of each SGE target to one median score per variant.

    Parameters
    ----------
    rna_scored : dict
        Maps a target/replicate name to a dataframe with the columns ``target``,
        ``pos_id`` and ``RNAscore``.
    targets : list
        SGE target names to collapse.

    Returns
    -------
    dict
        Maps every target to a dataframe with the columns ``target``, ``pos_id``
        and ``RNAscore_med`` (median ``RNAscore`` across the target's replicates).

    Notes
    -----
    Replicates are assigned to a target through their ``target`` column. Matching
    the target name as a substring of the dictionary key would also pick up other
    targets that merely start with the same text (e.g. ``X1`` and ``X10``).
    """

    collapsed_scores = {} #Dictionary to hold dataframes with collapsed scores

    for elem in targets: #Iterates through provided SGE targets
        target_scores = [scores for scores in rna_scored.values() if (scores['target'] == elem).any()]

        concat_scores = pd.concat(target_scores) #All replicates of the target

        #Median RNA score for each variant
        summary_df = concat_scores.groupby('pos_id').agg({
            'target': 'first',
            'RNAscore': 'median'
        }).reset_index()

        #Tidies up the collapsed RNA score dataframe
        summary_df = summary_df.loc[:, ['target', 'pos_id', 'RNAscore']]
        summary_df = summary_df.rename(columns = {'RNAscore': 'RNAscore_med'})

        collapsed_scores[elem] = summary_df

    return collapsed_scores

In [None]:
def rnafreq_vs_sge(df): #Gets thresholds for classifying RNA performance of a variant and various visualizations
    """Derive RNA/DNA classification thresholds and display three diagnostic charts.

    The thresholds are the mean +/- 2 standard deviations of ``RNA/DNA``, computed
    after removing values below the 1st or above the 99th percentile.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``target``, ``pos_id``, ``RNAfreq``, ``DNAfreq``,
        ``RNA/DNA``, ``snv_score``, ``Consequence`` and ``AAsub``.

    Returns
    -------
    tuple
        ``(lower_thresh, upper_thresh)``. Displayed as a side effect: RNA/DNA vs.
        SGE score per target, RNA frequency vs. DNA frequency per target, and a
        histogram of RNA/DNA ratios, all with the thresholds drawn in red where
        applicable.
    """

    consequence_order = ['synonymous_variant', 'missense_variant', 'stop_gained'] #Shared colour/legend order

    percentile_99 = df['RNA/DNA'].quantile(0.99) #Gets 99th percentile RNA/DNA
    percentile_1 = df['RNA/DNA'].quantile(0.01) #Gets 1st percentile RNA/DNA

    #Outliers are excluded only for setting the thresholds; the charts show every variant
    threshold_df = df[(df['RNA/DNA'] >= percentile_1) &
            (df['RNA/DNA'] <= percentile_99)]

    mean_rnadna = threshold_df['RNA/DNA'].mean() #Gets mean RNA/DNA ratio
    rnadna_std = threshold_df['RNA/DNA'].std() #Gets standard deviation of RNA/DNA ratio

    upper_thresh = mean_rnadna + 2 * rnadna_std #2 standard deviations above the mean
    lower_thresh = mean_rnadna - 2 * rnadna_std #2 standard deviations below the mean

    thresholds = (lower_thresh, upper_thresh) #Tuple to be passed onto next functions

    #Vertical lines for upper and lower thresholds (histogram)
    minus_std = alt.Chart(pd.DataFrame({'x': [lower_thresh]})).mark_rule(color = 'red').encode(
        x = 'x')
    plus_std = alt.Chart(pd.DataFrame({'x': [upper_thresh]})).mark_rule(color = 'red').encode(
        x = 'x')

    #Horizontal lines for upper and lower thresholds (built on df like the scatter they are layered with)
    minus_std_horz = alt.Chart(df).mark_rule(color = 'red').encode(
        y = alt.Y(datum = lower_thresh)
    )

    plus_std_horz = alt.Chart(df).mark_rule(color = 'red').encode(
        y = alt.Y(datum = upper_thresh)
    )

    #Base scatter plot of RNA/DNA ratio vs. SGE score
    chart = alt.Chart(df).mark_circle().encode(
        x = 'snv_score',
        y = 'RNA/DNA',
        color = alt.Color('Consequence', sort = consequence_order),
        tooltip = [alt.Tooltip('target', title = 'SGE Target: '),
                   alt.Tooltip('snv_score', title = 'SGE Score: '),
                   alt.Tooltip('RNAfreq', title = 'D05 RNA Frequency: '),
                   alt.Tooltip('Consequence', title = 'Consequence: '),
                   alt.Tooltip('AAsub', title = 'Substitution: '),
                   alt.Tooltip('pos_id', title = 'Variant: ')
                  ]
    )

    #Scatter of median collapsed RNA freq vs. median collapsed DNA freq
    scatter = alt.Chart(df).mark_circle().encode(
        x = 'DNAfreq',
        y = 'RNAfreq',
        color = alt.Color('Consequence', sort = consequence_order)
    ).facet(
        facet = alt.Facet('target')
    ).resolve_scale(
        x = 'independent',
        y = 'independent'
    ).interactive()

    #Base histogram for visualizing distribution of RNA/DNA ratios
    base_histogram = alt.Chart(df).mark_bar().encode(
        alt.X('RNA/DNA:Q',
                  bin = alt.Bin(maxbins = 100, step = 0.1),
                  scale = alt.Scale(
                      domain = [0,3]
                  )
                 ),
        y = 'count()',
        color = alt.Color('Consequence', sort = consequence_order)
    ).interactive()

    histogram = base_histogram + minus_std + plus_std #Threshold lines added to histogram

    #Base scatter plot with threshold lines, faceted by target
    chart = chart + minus_std_horz + plus_std_horz
    chart = chart.facet('target').resolve_scale(
        x = 'independent',
        y = 'independent'
    ).interactive()

    chart.display()
    scatter.display()
    histogram.display()

    return thresholds

In [None]:
def get_abnormal_rna(df, thresholds): #Gets final dataframe of variants classified by RNA performance
    """Label every variant as ``'low'``, ``'normal'`` or ``'high'`` by its ``RNA/DNA`` ratio.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the column ``RNA/DNA``. The dataframe is not modified.
    thresholds : tuple
        ``(min_threshold, max_threshold)``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with a fresh integer index and the extra column
        ``RNA_classification``: ``'high'`` above ``max_threshold``, ``'low'``
        below ``min_threshold``, otherwise ``'normal'`` (this includes values
        equal to a threshold and missing ratios).
    """

    min_threshold, max_threshold = thresholds #Gets thresholds

    #Works on a copy so the caller's dataframe does not silently gain a column
    classified = df.copy()

    #To return abnormal variants only, filter first:
    #   classified = classified[(classified['RNA/DNA'] > max_threshold) |
    #                           (classified['RNA/DNA'] < min_threshold)]

    classified['RNA_classification'] = 'normal'

    #Classifies variant RNA performance
    classified.loc[classified['RNA/DNA'] > max_threshold, 'RNA_classification'] = 'high'
    classified.loc[classified['RNA/DNA'] < min_threshold, 'RNA_classification'] = 'low'

    return classified.reset_index(drop = True)

In [None]:
def merge_dna(dict, gdna_scores): #This function merges the DNA SGE scores with the median RNA scores for each variant
    """Join the collapsed RNA scores with the SGE score table.

    Parameters
    ----------
    dict : dict
        Maps an SGE target to a dataframe with the columns ``target``,
        ``pos_id`` and ``RNAscore_med``.
    gdna_scores : pandas.DataFrame
        SGE score table with the columns ``pos_id``, ``Consequence``, ``AAsub``
        and ``snv_score``. The columns ``chrom``, ``pos``, ``allele``,
        ``R1_score``, ``R2_score``, ``R3_score`` and ``target`` are discarded
        when present. The dataframe is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per variant and target with the columns ``target``, ``pos_id``,
        ``Consequence``, ``AAsub``, ``snv_score``, ``target_RNAscore`` (median
        score within the target) and ``RNAscore`` (median of the target-level
        scores over all targets covering the variant). Variants without an SGE
        score are dropped (inner merge).
    """

    #The parameter is the score table; the previous body read an undefined local instead
    dna_scores = gdna_scores.drop(
        columns = ['chrom', 'pos', 'allele', 'R1_score', 'R2_score', 'R3_score', 'target'],
        errors = 'ignore'
    )

    df = pd.concat(list(dict.values())) #Concatenates all RNAscore dataframes

    merged = pd.merge(df, dna_scores, how = 'inner', on = 'pos_id') #Merges based on position ID

    #Variants covered by several targets get one overall score: the median of their target-level scores
    multi_target_vars = merged.groupby('pos_id').agg({'RNAscore_med': 'median'}).reset_index()

    merged = pd.merge(multi_target_vars, merged, how = 'left', on = 'pos_id')
    merged = merged.rename(columns = {'RNAscore_med_x': 'RNAscore', 'RNAscore_med_y': 'target_RNAscore'})
    final_df = merged.loc[:, ['target', 'pos_id', 'Consequence', 'AAsub', 'snv_score', 'target_RNAscore', 'RNAscore']]

    return final_df

In [None]:
def visualize_scores(df): #Visualizes log2-ratio-based RNA scores vs. SGE scores (not in use)
    """Display RNA score vs. DNA (SGE) score charts.

    Three charts are displayed in this order: a scatter plot of all variants, a
    histogram of RNA scores, and the scatter plot faceted by SGE target.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``target``, ``AAsub``, ``Consequence``, ``snv_score``
        and ``RNAscore``.

    Returns
    -------
    None
    """

    hover = [
        alt.Tooltip('target', title = 'SGE Target: '),
        alt.Tooltip('AAsub', title = 'AA Substitution: '),
        alt.Tooltip('snv_score', title = 'DNA Score: '),
        alt.Tooltip('RNAscore', title = 'RNA Score: ')
    ]

    #RNA score against DNA score for every variant
    points = alt.Chart(df).mark_circle().encode(
        x = alt.X('snv_score', axis = alt.Axis(title = 'DNA Score')),
        y = alt.Y('RNAscore', axis = alt.Axis(title = 'RNA Score')),
        color = 'Consequence',
        tooltip = hover
    )
    scatter_all = points.properties(
        height = 400,
        width = 600,
        title = alt.TitleParams('RNA Score vs. DNA Score')
    ).interactive()
    scatter_all.display()

    #Distribution of RNA scores, coloured by consequence
    bars = alt.Chart(df).mark_bar().encode(
        x = alt.X('RNAscore:Q', title = 'RNA Score', bin = alt.Bin(maxbins = 30)),
        y = alt.Y('count()', title = 'Number of Variants'),
        color = 'Consequence'
    )
    histogram = bars.properties(
        title = 'Distribution of RNA Scores',
        width = 600,
        height = 400
    )
    histogram.display()

    #Same scatter plot split by SGE target, 3 panels per row, each with its own axes
    by_target = scatter_all.facet(
        facet = alt.Facet('target', title = 'RNA vs. DNA Scores by Region')
    )
    by_target = by_target.resolve_scale(x = 'independent', y = 'independent')
    scatter_faceted = by_target.properties(columns = 3)
    scatter_faceted.display()

In [None]:
def main():
    """Run the RNA classification pipeline on the configured input files.

    Reads the count and score files, computes per-replicate frequencies,
    collapses replicates to per-variant medians, derives RNA/DNA thresholds and
    classifies every variant. Charts are displayed along the way by the called
    functions. Nothing is returned and nothing is written to disk (the Excel
    export below is commented out).
    """
    #Inputs come from the module-level configuration variables
    all_counts, sge_scores = read_counts(rna_count_folder, dna_count_folder, sge_score)

    #rna_v_dna_plot(all_counts)
    
    rna_scored, sge_targets = get_rna_scores(all_counts)
    merged_df = rna_dna_freq_corr_plots(rna_scored, sge_scores)
    final_df = agg_rnafreq(merged_df)
    final_df_normalized = normalize_scores(final_df)
    #Thresholds used for classification come from the un-normalized table
    threshold = rnafreq_vs_sge(final_df)
    #Called for its charts of the normalized table; the returned thresholds ('test') are not used
    test = rnafreq_vs_sge(final_df_normalized)
    output = get_abnormal_rna(final_df, threshold)

    
    #NOTE(review): absolute, machine-specific output path
    #output.to_excel('/Users/ivan/Desktop/20250428_BARD1_IGVFupload_RNAclassified_plusX4.xlsx', index = False)
    
    #Disabled legacy workflow based on log2/residual RNA scores, kept as a string literal.
    #NOTE(review): it passes the file path 'sge_score' to merge_dna, which calls DataFrame
    #methods on its second argument -- presumably the 'sge_scores' dataframe was intended; confirm before re-enabling.
    '''
    rna_scored_beta = get_rna_scores_beta(all_counts)

    qc_plots(rna_scored)
    med_rna_scores = collapse_scores(rna_scored, sge_targets)
    med_rna_scores_new = collapse_scores(rna_scored_beta, sge_targets)

    dna_rna_df = merge_dna(med_rna_scores, sge_score)
    dna_rna_df_new = merge_dna(med_rna_scores_new, sge_score)
    
    oldvsnew_scores(dna_rna_df, dna_rna_df_new)
    visualize_scores(dna_rna_df)
    '''

In [None]:
#Runs the whole pipeline; charts are displayed as a side effect
main()