In [None]:
import pandas as pd
import altair as alt
import numpy as np
from scipy import stats
from pathlib import Path

In [None]:
# Input locations used by main(); all paths are relative to the notebook's working directory.
rna_count_folder = '../Data/RNAscoring_dev_data/RNA_counts/' #path to the folder of RNA count .tsv files
dna_count_folder = '../Data/RNAscoring_dev_data/DNA_counts/' #path to the folder of DNA count .tsv files
sge_score = '../Data/20250122_BARD1_SGEscores_wAAsub.xlsx' #path to the SGE score Excel file

In [None]:
def read_counts(rna_counts, dna_counts):
    """Read per-replicate RNA and DNA count tables and merge them by variant.

    Every ``*.tsv`` file in the two folders is listed and sorted by path, and
    the i-th RNA file is paired with the i-th DNA file. Each pair is
    inner-joined on ``pos_id`` (``"<pos>:<allele>"``), so only variants present
    in both tables are kept.

    Parameters
    ----------
    rna_counts : str or path-like
        Folder holding the RNA count files. Each file must provide the columns
        ``sampleid``, ``target``, ``pos``, ``allele`` and ``count``.
    dna_counts : str or path-like
        Folder holding the DNA count files. Each file must provide the columns
        ``target``, ``pos``, ``allele`` and ``count``.

    Returns
    -------
    dict
        Maps ``"<RNA target><rep>"`` (taken from the first RNA row, e.g.
        ``"X1rep_1"``) to a DataFrame with the columns ``target_rep``,
        ``target``, ``pos_id``, ``RNAcount`` and ``DNAcount``. The ``target``
        column is the one from the DNA table.

    Raises
    ------
    ValueError
        If the folders hold different numbers of ``.tsv`` files, if an RNA file
        has no rows, or if the sample ID of an RNA file matches none of the
        known replicate tags.
    """
    rna_files = sorted(Path(rna_counts).glob('*.tsv')) #Sorted so replicates line up between folders
    dna_files = sorted(Path(dna_counts).glob('*.tsv'))

    # Pairing is purely positional, so it is only meaningful when both folders
    # hold the same number of files.
    if len(rna_files) != len(dna_files):
        raise ValueError(
            f'Cannot pair count files: found {len(rna_files)} RNA file(s) '
            f'but {len(dna_files)} DNA file(s)'
        )

    # Replicate label -> sample-ID tags that identify it. Checked in this order.
    rep_tags = (
        ('rep_3', ('R3R6', 'R7R8R9')),
        ('rep_2', ('R2R5', 'R4R5R6')),
        ('rep_1', ('R1R4', 'R1R2R3')),
    )

    all_rep_counts = {} #Dictionary to hold merged RNA/DNA count dataframes

    for rna_file, dna_file in zip(rna_files, dna_files):
        rna_df = pd.read_csv(rna_file, sep = '\t')
        if rna_df.empty:
            raise ValueError(f'RNA count file {rna_file} contains no rows')

        sampleid = rna_df['sampleid'].iloc[0] #Sample ID is read from the first row

        # Resolve the replicate afresh for every file so that an unrecognised
        # sample ID can never inherit the label of the previous file.
        rep = next(
            (label for label, tags in rep_tags if any(tag in sampleid for tag in tags)),
            None,
        )
        if rep is None:
            raise ValueError(
                f'Cannot determine replicate from sample ID {sampleid!r} in {rna_file}'
            )

        rna_df['target_rep'] = rna_df['target'] + rep #Target name followed by replicate label
        rna_df['pos_id'] = rna_df['pos'].astype(str) + ':' + rna_df['allele'] #Merge key
        rna_df = rna_df.rename(columns = {'count': 'RNAcount'})

        dna_df = pd.read_csv(dna_file, sep = '\t')
        dna_df['pos_id'] = dna_df['pos'].astype(str) + ':' + dna_df['allele'] #Merge key
        dna_df = dna_df.rename(columns = {'count': 'DNAcount'})

        # Only carry the columns that survive into the output; 'target' comes
        # from the DNA table.
        merged = pd.merge(
            rna_df[['target_rep', 'pos_id', 'RNAcount']],
            dna_df[['target', 'pos_id', 'DNAcount']],
            how = 'inner',
            on = 'pos_id',
        )
        merged = merged.loc[:, ['target_rep', 'target', 'pos_id', 'RNAcount', 'DNAcount']]

        all_rep_counts[rna_df['target_rep'].iloc[0]] = merged

    return all_rep_counts

In [None]:
def rna_v_dna_plot(all_counts):
    """Display faceted DNA-vs-RNA count scatter plots with a regression line.

    All merged count tables are concatenated, variants with an RNA or DNA count
    below 50 are removed, and one panel per ``target_rep`` is drawn with
    independent axes. Nothing is returned; the chart is displayed.

    Parameters
    ----------
    all_counts : dict
        Maps a target/replicate key to a DataFrame with at least the columns
        ``target_rep``, ``pos_id``, ``RNAcount`` and ``DNAcount``.
    """
    if not all_counts:
        return  # nothing to plot; pd.concat would raise on an empty list

    min_count = 50  # variants below this count in either library are not plotted

    all_df = pd.concat(list(all_counts.values()))
    all_df = all_df[~((all_df['RNAcount'] < min_count) | (all_df['DNAcount'] < min_count))]

    scatter = alt.Chart(all_df).mark_circle().encode(
        x = alt.X('RNAcount', title = 'RNA Count'),
        y = alt.Y('DNAcount', title = 'DNA Count'),
        tooltip = [alt.Tooltip('RNAcount', title = 'RNA Count: '),
                   alt.Tooltip('DNAcount', title = 'DNA Count: '),
                   alt.Tooltip('pos_id', title = 'Var: ')
                  ]
    )

    # One regression line per target/replicate so each facet gets its own fit
    trend_line = alt.Chart(all_df).transform_regression(
        'RNAcount', 'DNAcount',
        groupby=['target_rep']
    ).mark_line(color='blue').encode(
        x='RNAcount',
        y='DNAcount'
    )

    combined = (scatter + trend_line).facet(
        facet=alt.Facet('target_rep', title = 'DNA vs. RNA Counts')
        ).resolve_scale(
            x = 'independent',
            y = 'independent'
        ).properties(
            columns = 6  # at most 6 panels per row
        ).interactive()

    combined.display()

In [None]:
def get_rna_scores(rep_dict, min_count = 100): #Handles RNA scoring of variants
    """Score every variant as the log2 ratio of its RNA and DNA frequencies.

    For each replicate table, variants with a zero count or with an RNA or DNA
    count below ``min_count`` are removed. Frequencies are then computed
    against the totals of the remaining rows and
    ``RNAscore = log2(RNAfreq / DNAfreq)``.

    Parameters
    ----------
    rep_dict : dict
        Maps a target/replicate key to a DataFrame with the columns
        ``target_rep``, ``target``, ``RNAcount`` and ``DNAcount``. The input
        frames are not modified.
    min_count : int, optional
        Minimum RNA and DNA count a variant needs to be scored (default 100).

    Returns
    -------
    tuple
        ``(rna_scored, targets)``. ``rna_scored`` maps the ``target_rep`` value
        of each replicate to its scored DataFrame (added columns: ``DNAfreq``,
        ``RNAfreq``, ``RNA/DNA``, ``RNAscore``). ``targets`` lists the distinct
        ``target`` values (taken from the first row of each scored replicate)
        in order of first appearance. Replicates with no variant left after
        filtering are skipped with a warning.
    """
    import warnings

    rna_scored = {} #Dictionary to hold RNA-scored dataframes
    targets = [] #List of SGE targets present

    for key, counts in rep_dict.items():
        # The explicit zero filter stays separate from the threshold so that
        # log2 never sees a zero even when min_count is set to 0.
        df = counts[~((counts['RNAcount'] == 0) | (counts['DNAcount'] == 0))]
        df = df[~((df['RNAcount'] < min_count) | (df['DNAcount'] < min_count))]
        df = df.reset_index(drop = True).copy()

        if df.empty:
            # Nothing to score; reading the first row below would fail.
            warnings.warn(f'No variants passed the count filters for {key!r}; replicate skipped')
            continue

        df['DNAfreq'] = df['DNAcount'] / df['DNAcount'].sum() #Frequency among retained variants
        df['RNAfreq'] = df['RNAcount'] / df['RNAcount'].sum() #Frequency among retained variants

        df['RNA/DNA'] = df['RNAcount'] / df['DNAcount']
        df['RNAscore'] = np.log2(df['RNAfreq'] / df['DNAfreq']) #log2 ratio of frequencies

        rna_scored[df['target_rep'].iloc[0]] = df

        target = df['target'].iloc[0]
        if target not in targets:
            targets.append(target)

    return rna_scored, targets
        

In [None]:
def get_rna_scores_beta(all_counts, min_count = 100):
    """Score variants by their standardized perpendicular distance to a fit line.

    For each ``target_rep`` group, variants with a zero count or with an RNA or
    DNA count below ``min_count`` are removed and RNA/DNA frequencies are
    computed over the remaining rows. ``DNAfreq`` is regressed on ``RNAfreq``
    and every variant is scored as

        -(DNAfreq - slope * RNAfreq - intercept) / sqrt(slope**2 + 1)

    divided by the standard deviation of that quantity within the group.

    Parameters
    ----------
    all_counts : dict
        Maps a target/replicate key to a DataFrame with the columns
        ``target_rep``, ``RNAcount`` and ``DNAcount``. The input frames are not
        modified.
    min_count : int, optional
        Minimum RNA and DNA count a variant needs to be scored (default 100).

    Returns
    -------
    dict
        Maps each ``target_rep`` value to its scored DataFrame: the input
        columns plus ``DNAfreq``, ``RNAfreq`` and ``RNAscore``, with a fresh
        0..n-1 index. Groups with no variant left after filtering are omitted.
        ``RNAscore`` is NaN when a line cannot be fitted (fewer than two
        variants or a single distinct ``RNAfreq``) or when all distances are
        identical.
    """
    rna_scored = {}

    frames = list(all_counts.values())
    if not frames:
        return rna_scored  # pd.concat would raise on an empty list

    df = pd.concat(frames)

    # Iterate over the groups explicitly instead of groupby.apply so the
    # grouping column is guaranteed to stay in every result frame.
    for rep, group in df.groupby('target_rep'):
        data = group[~((group['RNAcount'] == 0) | (group['DNAcount'] == 0))]
        data = data[~((data['RNAcount'] < min_count) | (data['DNAcount'] < min_count))].copy()

        if data.empty:
            continue  # nothing to score for this replicate

        data['DNAfreq'] = data['DNAcount'] / data['DNAcount'].sum()
        data['RNAfreq'] = data['RNAcount'] / data['RNAcount'].sum()

        if len(data) < 2 or data['RNAfreq'].nunique() < 2:
            # A regression line is undefined for these inputs.
            data['RNAscore'] = np.nan
        else:
            fit = stats.linregress(data['RNAfreq'], data['DNAfreq'])
            slope, intercept = fit.slope, fit.intercept

            # Signed point-to-line distance. The whole residual is divided by
            # the norm, not just the intercept term.
            per_distance = -(data['DNAfreq'] - slope * data['RNAfreq'] - intercept) / np.sqrt(slope**2 + 1)

            # Avoid division by zero if all points are exactly on the line
            perp_std = per_distance.std()
            data['RNAscore'] = per_distance / perp_std if perp_std > 0 else np.nan

        rna_scored[rep] = data.reset_index(drop = True)

    return rna_scored

In [None]:
def qc_plots(dict):
    """Display pairwise replicate RNA-score scatter plots for every target.

    The scored tables are concatenated and grouped by ``target``. For each
    target the three replicates (rows whose ``target_rep`` contains ``rep_1``,
    ``rep_2`` or ``rep_3``) are compared pairwise. The two replicates of a pair
    are inner-joined on ``pos_id`` so each point compares the same variant in
    both replicates. Variants missing from either replicate, or with a missing
    score, are left out of that comparison. Pairs with fewer than two shared
    variants are not drawn. Nothing is returned; one row of charts is
    displayed per target.

    Parameters
    ----------
    dict : dict
        Maps a target/replicate key to a scored DataFrame with the columns
        ``target``, ``target_rep``, ``pos_id``, ``RNAscore``, ``RNAcount`` and
        ``DNAcount``. The name shadows the built-in and is kept only for
        backward compatibility.
    """
    frames = list(dict.values())
    if not frames:
        return  # nothing to plot; pd.concat would raise on an empty list

    df = pd.concat(frames)

    rep_tags = {'rep1': 'rep_1', 'rep2': 'rep_2', 'rep3': 'rep_3'}  # axis label -> target_rep tag
    pairs = [('rep1', 'rep2'), ('rep1', 'rep3'), ('rep2', 'rep3')]
    value_cols = ['pos_id', 'RNAscore', 'RNAcount', 'DNAcount']

    for target, data in df.groupby('target'):
        reps = {
            label: data.loc[data['target_rep'].str.contains(tag), value_cols]
            for label, tag in rep_tags.items()
        }

        charts = []

        for x_rep, y_rep in pairs:
            # Align the two replicates by variant. Positional alignment is
            # wrong because each replicate is filtered independently.
            temp_df = (
                pd.merge(reps[x_rep], reps[y_rep], how = 'inner', on = 'pos_id', suffixes = ('_1', '_2'))
                .rename(columns = {'RNAscore_1': 'x_value', 'RNAscore_2': 'y_value'})
                .dropna(subset = ['x_value', 'y_value'])
            )

            if len(temp_df) < 2:
                continue  # a correlation needs at least two shared variants

            # Calculate min/max for diagonal line
            min_val = min(temp_df['x_value'].min(), temp_df['y_value'].min())
            max_val = max(temp_df['x_value'].max(), temp_df['y_value'].max())

            line_df = pd.DataFrame({
                'x': [min_val, max_val],
                'y': [min_val, max_val]
            })

            # Calculate correlation coefficient
            corr = np.corrcoef(temp_df['x_value'], temp_df['y_value'])[0, 1]

            # Create scatter plot
            scatter = alt.Chart(temp_df).mark_circle(opacity=0.7).encode(
                x=alt.X('x_value:Q', title=f'{x_rep} RNAscore'),
                y=alt.Y('y_value:Q', title=f'{y_rep} RNAscore'),
                tooltip=[alt.Tooltip('pos_id', title = 'Var: '),
                         alt.Tooltip('x_value:Q', title =f'{x_rep} RNAscore'),
                         alt.Tooltip('y_value:Q', title = f'{y_rep} RNAscore'),
                         alt.Tooltip('RNAcount_1', title = f'{x_rep} RNAcount'),
                         alt.Tooltip('RNAcount_2', title = f'{y_rep} RNAcount'),
                         alt.Tooltip('DNAcount_1', title = f'{x_rep} DNAcount'),
                         alt.Tooltip('DNAcount_2', title = f'{y_rep} DNAcount')]
            )

            # Create reference line (y = x)
            line = alt.Chart(line_df).mark_line(
                color='red',
                strokeDash=[4, 4]
            ).encode(
                x='x:Q',
                y='y:Q'
            )

            # Combine and add title with correlation
            combined = (scatter + line).properties(
                width=250,
                height=250,
                title= target + ' ' + f'{x_rep} vs {y_rep} (r = {corr:.3f})'
            ).interactive()

            charts.append(combined)

        if not charts:
            continue  # no replicate pair shares enough variants for this target

        # Concatenate all charts horizontally
        final_chart = alt.hconcat(*charts).properties(
            title=''
        )

        # Display the chart
        final_chart.display()



In [None]:
def oldvsnew_scores(old_scores, new_scores):
    """Compare two score tables variant by variant.

    ``new_scores`` is left-joined onto ``old_scores`` by ``pos_id``. The
    ``RNAscore`` column of the old table (``RNAscore_x``) is plotted against
    that of the new table (``RNAscore_y``), the chart is displayed and the
    joined table is printed. Nothing is returned.
    """
    comparison = old_scores.merge(new_scores, how = 'left', on = 'pos_id')

    # x axis: old scoring method, y axis: new scoring method
    axes = {'x': 'RNAscore_x', 'y': 'RNAscore_y'}
    chart = alt.Chart(comparison).mark_circle().encode(**axes)

    chart.display()
    print(comparison)

In [None]:
def collapse_scores(rna_scored, targets): #Collapses RNA scores between replicates to a median
    """Collapse replicate RNA scores into one median score per variant.

    Parameters
    ----------
    rna_scored : dict
        Maps a target/replicate key to a scored DataFrame with the columns
        ``target``, ``pos_id`` and ``RNAscore``.
    targets : list
        SGE target names to collapse.

    Returns
    -------
    dict
        Maps each target to a DataFrame with the columns ``target``,
        ``pos_id`` and ``RNAscore_med`` (in that order), one row per
        ``pos_id``. ``RNAscore_med`` is the median ``RNAscore`` over all frames
        selected for the target, and ``target`` is the first value seen for
        the variant. Targets with no matching frame are omitted.
    """
    collapsed_scores = {} #Dictionary to hold dataframes with collapsed scores

    for elem in targets:
        # NOTE(review): frames are selected by substring match on the key, so
        # a target whose name is a prefix of another (e.g. 'X1' and 'X10')
        # would also pick up the other target's replicates. Confirm the target
        # naming scheme makes this safe.
        target_scores = [frame for key, frame in rna_scored.items() if elem in key]

        if not target_scores:
            continue  # pd.concat would raise on an empty list

        summary_df = (
            pd.concat(target_scores)
            .groupby('pos_id')
            .agg({'target': 'first', 'RNAscore': 'median'}) #Median RNA score for each variant
            .reset_index()
            .loc[:, ['target', 'pos_id', 'RNAscore']] #Column order: target, variant, score
            .rename(columns = {'RNAscore': 'RNAscore_med'})
        )

        collapsed_scores[elem] = summary_df

    return collapsed_scores

In [None]:
def merge_dna(dict, gdna_scores): #This function merges the DNA SGE scores with the median RNA scores for each variant
    """Attach DNA SGE scores to the collapsed RNA scores.

    The Excel sheet at ``gdna_scores`` is read and inner-joined by ``pos_id``
    with the concatenation of all frames in ``dict``. A variant may occur
    under several targets, so a pooled score (the median of its per-target
    ``RNAscore_med`` values) is computed as well.

    Returns a DataFrame with the columns ``target``, ``pos_id``,
    ``Consequence``, ``AAsub``, ``snv_score``, ``target_RNAscore`` (the
    per-target median) and ``RNAscore`` (the pooled median).
    """
    # Sheet columns that are not carried into the merged table
    unused_cols = ['chrom', 'pos', 'allele', 'R1_score', 'R2_score', 'R3_score', 'target']
    sge_table = pd.read_excel(gdna_scores).drop(columns = unused_cols)

    # One row per (target, variant) that also has a DNA score
    rna_table = pd.concat(list(dict.values()))
    per_target = pd.merge(rna_table, sge_table, how = 'inner', on = 'pos_id')

    # One row per variant: median across the targets it appears in
    pooled = (
        per_target
        .groupby('pos_id')
        .agg({'RNAscore_med': 'median'})
        .reset_index()
    )

    # '_x' is the pooled score, '_y' the per-target score
    combined = (
        pd.merge(pooled, per_target, how = 'left', on = 'pos_id')
        .rename(columns = {'RNAscore_med_x': 'RNAscore', 'RNAscore_med_y': 'target_RNAscore'})
    )

    output_cols = ['target', 'pos_id', 'Consequence', 'AAsub', 'snv_score', 'target_RNAscore', 'RNAscore']
    return combined.loc[:, output_cols]

In [None]:
def visualize_scores(df):
    """Display three summary charts for a merged RNA/DNA score table.

    In order: an interactive RNA-score vs DNA-score scatter plot coloured by
    ``Consequence``, a histogram of RNA scores coloured by ``Consequence``, and
    the same scatter plot faceted by ``target``. Nothing is returned.

    ``df`` must provide the columns ``target``, ``AAsub``, ``Consequence``,
    ``snv_score`` and ``RNAscore``.
    """
    # --- Scatter plot over all variants ---
    hover_fields = [
        alt.Tooltip('target', title = 'SGE Target: '),
        alt.Tooltip('AAsub', title = 'AA Substitution: '),
        alt.Tooltip('snv_score', title = 'DNA Score: '),
        alt.Tooltip('RNAscore', title = 'RNA Score: '),
    ]
    dna_axis = alt.X('snv_score', axis = alt.Axis(title = 'DNA Score'))
    rna_axis = alt.Y('RNAscore', axis = alt.Axis(title = 'RNA Score'))

    scatter_all = alt.Chart(df).mark_circle().encode(
        x = dna_axis,
        y = rna_axis,
        color = 'Consequence',
        tooltip = hover_fields,
    )
    scatter_all = scatter_all.properties(
        height = 400,
        width = 600,
        title = alt.TitleParams('RNA Score vs. DNA Score'),
    )
    scatter_all = scatter_all.interactive()
    scatter_all.display()

    # --- Histogram of RNA scores ---
    score_bins = alt.X('RNAscore:Q', title = 'RNA Score', bin = alt.Bin(maxbins = 30))
    variant_count = alt.Y('count()', title = 'Number of Variants')

    score_hist = alt.Chart(df).mark_bar().encode(
        x = score_bins,
        y = variant_count,
        color = 'Consequence',
    )
    score_hist = score_hist.properties(
        title = 'Distribution of RNA Scores',
        width = 600,
        height = 400,
    )
    score_hist.display()

    # --- Scatter plot split by SGE target, three panels per row ---
    by_target = scatter_all.facet(
        facet = alt.Facet('target', title = 'RNA vs. DNA Scores by Region')
    )
    by_target = by_target.resolve_scale(x = 'independent', y = 'independent')
    by_target = by_target.properties(columns = 3)
    by_target.display()

In [None]:
def main():
    """Run the RNA-scoring pipeline and compare the two scoring methods.

    Steps: load and merge the counts, plot DNA vs RNA counts, score every
    replicate with both methods (log2 ratio and regression-based), show
    replicate QC plots for the regression-based scores, collapse replicates
    to medians, join the DNA SGE scores and plot old vs new RNA scores.
    """
    counts_by_rep = read_counts(rna_count_folder, dna_count_folder)
    rna_v_dna_plot(counts_by_rep)

    # Score each replicate with both methods
    log_ratio_scores, sge_targets = get_rna_scores(counts_by_rep)
    regression_scores = get_rna_scores_beta(counts_by_rep)

    qc_plots(regression_scores)

    # Median across replicates, per target
    log_ratio_medians = collapse_scores(log_ratio_scores, sge_targets)
    regression_medians = collapse_scores(regression_scores, sge_targets)

    # Attach DNA SGE scores
    log_ratio_table = merge_dna(log_ratio_medians, sge_score)
    regression_table = merge_dna(regression_medians, sge_score)

    oldvsnew_scores(log_ratio_table, regression_table)
    # Currently disabled: visualize_scores(regression_table)


In [None]:
# Run the full pipeline; reads the folders/file configured at the top of the notebook and displays the plots.
main()