In [None]:
import pandas as pd
import numpy as np
import altair as alt

In [None]:
# Path to the Excel workbook of SGE scores; main() passes it to read_scores().
# NOTE(review): absolute, machine-specific path - edit it before running this notebook elsewhere.
file = '/Users/ivan/Library/CloudStorage/OneDrive-UW/Research/Miscellaneous/BARD1_Figure_Data_Inputs/20240828_BARD1_AllScores.xlsx' #SGE score file
# Alternative input (swap with the line above to use it):
#file = '/Users/ivan/Downloads/20240828_BARD1_AllScores_noX4.xlsx' #no X4A-K file

In [None]:
def read_scores(file):
    """Load the SGE score workbook and keep only the columns used downstream.

    Parameters
    ----------
    file : str or path-like
        Anything ``pandas.read_excel`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        The ``target`` column followed by the twelve replicate frequency
        columns, in a fixed order.

    Raises
    ------
    KeyError
        If any of the expected columns is missing from the workbook.
    """
    scores = pd.read_excel(file)

    # Frequency columns for regions whose replicates are labelled R1R4 / R2R5 / R3R6.
    paired_label_cols = [
        'D05_R1R4_freq', 'D05_R2R5_freq', 'D05_R3R6_freq',
        'D13_R1R4_freq', 'D13_R2R5_freq', 'D13_R3R6_freq',
    ]
    # Frequency columns for regions whose replicates are labelled R1R2R3 / R4R5R6 / R7R8R9.
    triple_label_cols = [
        'D05_R1R2R3_freq', 'D05_R4R5R6_freq', 'D05_R7R8R9_freq',
        'D13_R1R2R3_freq', 'D13_R4R5R6_freq', 'D13_R7R8R9_freq',
    ]

    wanted = ['target'] + paired_label_cols + triple_label_cols
    return scores[wanted]

In [None]:
def process_data(df):
    """Collapse the two replicate-labelling schemes into one set of columns.

    Each SGE region (one value of ``target``) has its frequencies stored either
    in the ``R1R4/R2R5/R3R6`` columns or in the ``R1R2R3/R4R5R6/R7R8R9`` columns.
    For every region the populated set is selected and renamed to
    ``D05 Rep 1`` ... ``D13 Rep 3``, and ``target`` is shortened to the text
    after the ``X`` in the original name.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``target`` column and the twelve ``*_freq`` columns.

    Returns
    -------
    pandas.DataFrame
        Rows of all kept regions (in groupby order, original index preserved)
        with columns ``target`` and the six ``Dxx Rep n`` columns. ``df`` itself
        is not modified.

    Raises
    ------
    ValueError
        If no region could be assigned to a labelling scheme.
    IndexError
        If a target name contains no ``'X'``.
    """
    # Source column -> display name, one mapping per labelling scheme.
    paired_names = {
        'D05_R1R4_freq': 'D05 Rep 1', 'D05_R2R5_freq': 'D05 Rep 2', 'D05_R3R6_freq': 'D05 Rep 3',
        'D13_R1R4_freq': 'D13 Rep 1', 'D13_R2R5_freq': 'D13 Rep 2', 'D13_R3R6_freq': 'D13 Rep 3',
    }
    triple_names = {
        'D05_R1R2R3_freq': 'D05 Rep 1', 'D05_R4R5R6_freq': 'D05 Rep 2', 'D05_R7R8R9_freq': 'D05 Rep 3',
        'D13_R1R2R3_freq': 'D13 Rep 1', 'D13_R4R5R6_freq': 'D13 Rep 2', 'D13_R7R8R9_freq': 'D13 Rep 3',
    }

    frames = []  # one tidy frame per SGE region
    for target, group in df.groupby(by='target'):
        triple_missing = group['D05_R1R2R3_freq'].isna()
        paired_missing = group['D05_R1R4_freq'].isna()

        if paired_missing.all() and not triple_missing.all():
            # Only the R1R2R3 columns carry data. Checked first so that a few
            # missing values in an R1R2R3-labelled region do not send it to the
            # (entirely empty) R1R4 columns.
            names = triple_names
        elif triple_missing.any():
            names = paired_names  # region labelled R1R4 etc.
        elif paired_missing.any():
            names = triple_names  # region labelled R1R2R3 etc.
        else:
            # Both schemes are fully populated: the region cannot be assigned
            # to one scheme and is left out of the result.
            continue

        # 'GeneName_X4A' -> '4A': keep the exon number and sub-target letter.
        short_name = target.split('X')[1]

        # Select/rename on a new frame instead of writing into the groupby slice.
        tidy = (
            group[['target'] + list(names)]
            .rename(columns=names)
            .assign(target=short_name)
        )
        frames.append(tidy)

    if not frames:
        raise ValueError(
            'process_data: no SGE region could be assigned to a replicate labelling scheme'
        )

    return pd.concat(frames)

In [None]:
def group_targets(df):
    """Compute replicate-vs-replicate correlations for every SGE region.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``target`` column and the six ``Dxx Rep n`` columns.

    Returns
    -------
    list of list
        One ``[target, 'A vs B', r]`` entry per region and replicate pair,
        ordered by region and then by the pair order below. ``r`` is whatever
        ``compute_r`` returns for that pair.
    """
    # Replicate pairs compared within each time point.
    replicate_pairs = (
        ('D05 Rep 1', 'D05 Rep 2'), ('D05 Rep 1', 'D05 Rep 3'), ('D05 Rep 2', 'D05 Rep 3'),
        ('D13 Rep 1', 'D13 Rep 2'), ('D13 Rep 1', 'D13 Rep 3'), ('D13 Rep 2', 'D13 Rep 3'),
    )
    freq_cols = ['D05 Rep 1', 'D05 Rep 2', 'D05 Rep 3',
                 'D13 Rep 1', 'D13 Rep 2', 'D13 Rep 3']

    # Restrict each region's frame to the frequency columns once, lazily.
    per_region = (
        (region, frame[freq_cols]) for region, frame in df.groupby('target')
    )

    return [
        [region, first + ' vs ' + second, compute_r(freqs, first, second)]
        for region, freqs in per_region
        for first, second in replicate_pairs
    ]
    

In [None]:
def compute_r(group, col1, col2):
    """Return the correlation between two columns of ``group``.

    Uses ``Series.corr`` with its default settings.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame holding both columns.
    col1, col2 : str
        Names of the columns to correlate.
    """
    left, right = group[col1], group[col2]
    return left.corr(right)

In [None]:
def parse_r_lists(r_list):
    """Turn ``[target, test, r]`` entries into a long-format DataFrame.

    Parameters
    ----------
    r_list : iterable of sequences
        Each entry holds the SGE target name at index 0, the replicate
        comparison label at index 1 and the correlation value at index 2.

    Returns
    -------
    pandas.DataFrame
        Columns ``Targets``, ``Tests`` and ``r_correlation``, one row per entry.
    """
    rows = list(r_list)  # materialise once so the input is only iterated a single time

    return pd.DataFrame({
        'Targets': [row[0] for row in rows],
        'Tests': [row[1] for row in rows],
        'r_correlation': [row[2] for row in rows],
    })

In [None]:
def heatmap(pivoted):
    """Draw the replicate-correlation heatmap and display it.

    Rows (SGE targets) are ordered by the numeric part of the target name and
    then alphabetically by its letter part; columns are the replicate
    comparisons; colour encodes ``r_correlation``.

    Parameters
    ----------
    pivoted : pandas.DataFrame
        Long-format frame with ``Targets``, ``Tests`` and ``r_correlation``
        columns. It is not modified.

    Returns
    -------
    None
        The chart is displayed through ``graph.show()``.
    """
    targets = pivoted['Targets']

    # Temporary sort keys live on a new frame so the caller's DataFrame is left alone.
    # Raw-string patterns avoid the invalid '\d' escape in a normal string literal.
    keyed = pivoted.assign(
        Exon=targets.str.extract(r'(\d+)', expand=False).astype(float),   # numeric part of the target name
        Subregion=targets.str.extract(r'([A-Za-z]+)', expand=False),      # letter part of the target name
    )

    ordered = (
        keyed
        .sort_values(by=['Exon', 'Subregion'], ascending=[True, True])  # numerically, then alphabetically
        .drop(columns=['Exon', 'Subregion'])                            # sort keys are not plotted
    )

    # Each target appears once per test; the axis order only needs each name once.
    target_order = ordered['Targets'].drop_duplicates().tolist()

    graph = alt.Chart(ordered, title = alt.TitleParams(text = 'Correlation of Replicates', fontSize = 40)).mark_rect().encode(
                x = alt.X('Tests:N', axis = alt.Axis(title = '', titleFontSize = 32, labelFontSize = 24, labelLimit = 300, labelAngle = 45)),
                y = alt.Y('Targets', axis = alt.Axis(title = 'SGE Target Region', titleFontSize = 32, labelFontSize = 24), sort = target_order),
                color = alt.Color('r_correlation:Q', scale = alt.Scale(domain = [.15, 1]), legend = alt.Legend(title = "Pearson's r", titleFontSize = 26,labelFontSize = 22))
    ).properties(
        width = 1150,
        height = 700)

    graph.show()
    

In [None]:
def main():
    """Run the whole pipeline: load scores, tidy them, correlate replicates, plot.

    Reads the workbook named by the module-level ``file`` variable.
    """
    scores = read_scores(file)
    tidy_scores = process_data(scores)
    correlations = parse_r_lists(group_targets(tidy_scores))
    heatmap(correlations)

In [None]:
# Run the full pipeline defined in main().
main()