In [None]:
import pandas as pd
import altair as alt
import numpy as np
from scipy import stats
from pathlib import Path

In [None]:
rna_count_folder = '../Data/RNAscoring_dev_data/RNA_counts/' #path to the folder holding the RNA count .tsv files
dna_count_folder = '../Data/RNAscoring_dev_data/DNA_counts/' #path to the folder holding the DNA count .tsv files
sge_score = '../Data/20250122_BARD1_SGEscores_wAAsub.xlsx' #path to the SGE score Excel file

In [None]:
def _replicate_from_sampleid(sampleid):
    """Return the replicate label ('rep_1', 'rep_2' or 'rep_3') encoded in a sample ID.

    Raises:
        ValueError: if the sample ID contains none of the known replicate tags.
    """
    #Tags are tested in the order rep_3, rep_2, rep_1; the first match wins
    rep_tags = (
        ('rep_3', ('R3R6', 'R7R8R9')),
        ('rep_2', ('R2R5', 'R4R5R6')),
        ('rep_1', ('R1R4', 'R1R2R3')),
    )
    for rep, tags in rep_tags:
        if any(tag in sampleid for tag in tags):
            return rep

    raise ValueError(f'Cannot determine replicate from sample ID {sampleid!r}')


def read_counts(rna_counts, dna_counts): #reads RNA/DNA count files and merges them
    """Read paired RNA/DNA count .tsv files and merge each pair on variant position ID.

    The .tsv files in each folder are sorted by path and paired by position in
    that sorted order, so the n-th RNA file is merged with the n-th DNA file.

    Args:
        rna_counts: path to the folder containing the RNA count .tsv files.
        dna_counts: path to the folder containing the DNA count .tsv files.

    Returns:
        Dictionary mapping '<RNA target><replicate>' (taken from the first row of
        each RNA file) to a dataframe with the columns
        ['target_rep', 'target', 'pos_id', 'RNAcount', 'DNAcount'], where 'target'
        comes from the DNA file and only variants present in both files are kept.

    Raises:
        ValueError: if there are fewer DNA files than RNA files, if an RNA file
            has no rows, or if a sample ID carries no known replicate tag.
    """
    import warnings #local import: only needed for the unpaired-DNA-file warning

    rna_files = sorted(Path(rna_counts).glob('*.tsv')) #Sorted so replicates line up between folders
    dna_files = sorted(Path(dna_counts).glob('*.tsv'))

    if len(dna_files) < len(rna_files):
        raise ValueError(
            f'Found {len(rna_files)} RNA count files but only {len(dna_files)} DNA count files; '
            'every RNA file needs a matching DNA file'
        )
    if len(dna_files) > len(rna_files):
        #Extra DNA files are left unpaired; warn because positional pairing may be misaligned
        warnings.warn(
            f'Found {len(dna_files)} DNA count files but only {len(rna_files)} RNA count files; '
            'the extra DNA files are ignored'
        )

    all_rep_counts = {} #Dictionary to hold merged RNA/DNA count dataframes

    for rna_file, dna_file in zip(rna_files, dna_files):
        rna_df = pd.read_csv(rna_file, sep = '\t') #Reads RNA count file
        if rna_df.empty:
            raise ValueError(f'RNA count file {rna_file} contains no rows')

        #Replicate is resolved per file so a label can never leak from the previous file
        rep = _replicate_from_sampleid(rna_df['sampleid'].iloc[0])

        rna_df = rna_df.rename(columns = {'count': 'RNAcount'})
        rna_df['target_rep'] = rna_df['target'] + rep #Target and replicate number in one label
        rna_df['pos_id'] = rna_df['pos'].astype(str) + ':' + rna_df['allele'] #genomic coordinate : allele

        dna_df = pd.read_csv(dna_file, sep = '\t') #Reads DNA count file
        dna_df = dna_df.rename(columns = {'count': 'DNAcount'})
        dna_df['pos_id'] = dna_df['pos'].astype(str) + ':' + dna_df['allele']

        #Only the needed columns are merged; 'target' is taken from the DNA file (SGE target name)
        merged = pd.merge(
            rna_df[['target_rep', 'pos_id', 'RNAcount']],
            dna_df[['target', 'pos_id', 'DNAcount']],
            how = 'inner',
            on = 'pos_id'
        )
        merged = merged.loc[:, ['target_rep', 'target', 'pos_id', 'RNAcount', 'DNAcount']] #Columns reordered

        all_rep_counts[rna_df['target_rep'].iloc[0]] = merged

    return all_rep_counts

In [None]:
def rna_v_dna_plot(all_counts):
    """Display faceted scatter plots of DNA count against RNA count.

    Args:
        all_counts: dictionary of dataframes, each with at least the columns
            'RNAcount', 'DNAcount' and 'target_rep'.

    All dataframes are stacked into one table, which is drawn as one panel per
    'target_rep' value (3 panels per row, independent axes) with a per-panel
    regression line. The chart is displayed; nothing is returned.
    """
    #Stack every dataframe in the dictionary into one long table for plotting
    counts_long = pd.concat([all_counts[key] for key in all_counts])

    hover_fields = [
        alt.Tooltip('RNAcount', title = 'RNA Count: '),
        alt.Tooltip('DNAcount', title = 'DNA Count: ')
    ]

    points = alt.Chart(counts_long).mark_circle().encode(
        x = alt.X('RNAcount', title = 'RNA Count'),
        y = alt.Y('DNAcount', title = 'DNA Count'),
        tooltip = hover_fields
    )

    #Regression of DNA count on RNA count, fitted separately for each target/replicate
    fit_line = alt.Chart(counts_long).transform_regression(
        'RNAcount', 'DNAcount',
        groupby=['target_rep']
    ).mark_line(color='blue').encode(
        x='RNAcount',
        y='DNAcount'
    )

    layered = points + fit_line

    faceted = layered.facet(
        facet=alt.Facet('target_rep', title = 'DNA vs. RNA Counts')
    )
    faceted = faceted.resolve_scale(x = 'independent', y = 'independent')
    faceted = faceted.properties(columns = 3) #Maximum of 3 panels per row
    faceted = faceted.interactive()

    faceted.display()

In [None]:
def get_rna_scores(rep_dict): #Handles RNA scoring of variants
    """Compute a log2(RNA frequency / DNA frequency) score for every variant.

    Args:
        rep_dict: dictionary of dataframes, each with the columns 'target_rep',
            'target', 'RNAcount' and 'DNAcount'.

    Returns:
        Tuple (rna_scored, targets):
          * rna_scored - dictionary keyed by the dataframe's first 'target_rep'
            value, holding a copy of the dataframe without zero-count rows and
            with the added columns 'DNAfreq', 'RNAfreq' and 'RNAscore'.
          * targets - list of the distinct first-row 'target' values, in the
            order they were encountered.

    Dataframes that have no rows left after removing zero counts are skipped
    with a warning, since no frequency can be calculated for them.
    """
    import warnings #local import: only needed for the empty-replicate warning

    rna_scored = {} #Dictionary to hold RNA-scored dataframes
    targets = [] #list of SGE targets present

    for rep_name, counts in rep_dict.items():
        #Keeps only rows where both counts are non-zero (a zero would give a log2 of +/-inf)
        has_counts = (counts['RNAcount'] != 0) & (counts['DNAcount'] != 0)
        df = counts[has_counts].copy().reset_index(drop = True)

        if df.empty:
            warnings.warn(f'{rep_name}: no variants with non-zero RNA and DNA counts; skipping')
            continue

        total_dna = df['DNAcount'].sum() #Total DNA counts across the remaining variants
        total_rna = df['RNAcount'].sum() #Total RNA counts across the remaining variants

        df['DNAfreq'] = df['DNAcount'] / total_dna
        df['RNAfreq'] = df['RNAcount'] / total_rna

        df['RNAscore'] = np.log2(df['RNAfreq'] / df['DNAfreq']) #log2 fold change of RNA over DNA frequency

        rna_scored[df['target_rep'].iloc[0]] = df

        target = df['target'].iloc[0]
        if target not in targets:
            targets.append(target)

    return rna_scored, targets
        

In [None]:
def collapse_scores(rna_scored, targets): #Collapses RNA scores between replicates to a median
    """Collapse per-replicate RNA scores into one median score per variant and target.

    Args:
        rna_scored: dictionary of RNA-scored dataframes, each with at least the
            columns 'target', 'pos_id' and 'RNAscore'.
        targets: iterable of SGE target names to summarise.

    Returns:
        Dictionary mapping each target to a dataframe with the columns
        ['target', 'pos_id', 'RNAscore_med'], one row per 'pos_id'.

    Raises:
        ValueError: if no dataframe in rna_scored belongs to a requested target.
    """
    collapsed_scores = {} #Dictionary to hold dataframes with collapsed scores

    for target in targets:
        #Match on the dataframe's own target value rather than a substring of the
        #dictionary key, so that e.g. 'X1' does not also pick up 'X10' replicates
        target_frames = [
            frame for frame in rna_scored.values()
            if not frame.empty and frame['target'].iloc[0] == target
        ]
        if not target_frames:
            raise ValueError(f'No RNA-scored replicates found for target {target!r}')

        #Only these columns are needed for the summary
        replicate_scores = pd.concat(target_frames)[['target', 'pos_id', 'RNAscore']]

        #Median RNA score for each variant across replicates
        summary_df = replicate_scores.groupby('pos_id').agg({
            'target': 'first',
            'RNAscore': 'median'
        }).reset_index()

        #Tidies up the collapsed RNA score dataframe
        summary_df = summary_df.loc[:, ['target', 'pos_id', 'RNAscore']]
        summary_df = summary_df.rename(columns = {'RNAscore': 'RNAscore_med'})

        collapsed_scores[target] = summary_df

    return collapsed_scores

In [None]:
def merge_dna(dict, gdna_scores): #This function merges the DNA SGE scores with the median RNA scores for each variant
    """Join the SGE score table to the collapsed RNA scores on 'pos_id'.

    Args:
        dict: dictionary whose values are RNA score dataframes containing a
            'pos_id' column.
        gdna_scores: path to the Excel file of SGE scores. The file must provide
            'pos_id' plus the columns that are discarded here ('chrom', 'pos',
            'allele', 'R1_score', 'R2_score', 'R3_score', 'target').

    Returns:
        Dataframe holding only the variants found in both tables (inner join).
    """
    unused_columns = ['chrom', 'pos', 'allele', 'R1_score', 'R2_score', 'R3_score', 'target']

    #Reads the SGE scores and discards the columns that are not carried forward
    sge_table = pd.read_excel(gdna_scores).drop(columns = unused_columns)

    #Stacks every RNA score dataframe into a single table
    rna_table = pd.concat([frame for frame in dict.values()])

    return pd.merge(rna_table, sge_table, how = 'inner', on = 'pos_id')

In [None]:
def visualize_scores(df):
    """Display an interactive scatter plot of median RNA score against DNA score.

    Args:
        df: dataframe with the columns 'snv_score', 'RNAscore_med',
            'Consequence', 'target' and 'AAsub'.

    Points are colored by 'Consequence'. The chart is displayed; nothing is
    returned.
    """
    #Fields shown when hovering over a point
    hover_fields = [
        alt.Tooltip('target', title = 'SGE Target: '),
        alt.Tooltip('AAsub', title = 'AA Substitution: '),
        alt.Tooltip('snv_score', title = 'DNA Score: '),
        alt.Tooltip('RNAscore_med', title = 'RNA Score: ')
    ]

    dna_axis = alt.X('snv_score', axis = alt.Axis(title = 'DNA Score'))
    rna_axis = alt.Y('RNAscore_med', axis = alt.Axis(title = 'RNA Score'))

    chart = alt.Chart(df).mark_circle().encode(
        x = dna_axis,
        y = rna_axis,
        color = 'Consequence',
        tooltip = hover_fields
    )
    chart = chart.properties(
        height = 400,
        width = 600,
        title = alt.TitleParams('RNA Score vs. DNA Score')
    )

    chart.interactive().display()

In [None]:
def main():
    """Run the whole analysis using the paths set in the configuration cell.

    Steps: read and merge the count files, plot DNA vs. RNA counts, score the
    variants, collapse replicate scores to a median, merge with the SGE scores
    and display the RNA-vs-DNA score scatter plot.
    """
    counts_by_rep = read_counts(rna_count_folder, dna_count_folder)
    rna_v_dna_plot(counts_by_rep)

    scored_by_rep, targets = get_rna_scores(counts_by_rep)
    median_scores = collapse_scores(scored_by_rep, targets)

    visualize_scores(merge_dna(median_scores, sge_score))

In [None]:
#Runs the full pipeline: reads counts, plots them, scores variants and displays the RNA vs. DNA score plot
main()