In [None]:
import pandas as pd
import numpy as np
import altair as alt
from natsort import natsorted

In [None]:
# Path of the score table that main() hands to read_scores(); read_scores parses it as tab-separated text.
# The path is relative, so it resolves against the notebook's working directory.
file = '../Data/20250423_BARD1_snvscores_IGVFupload.tsv' #SGE score file
# NOTE(review): the alternative below points at an .xlsx workbook; read_scores uses pd.read_csv(sep='\t'),
# so it presumably cannot be swapped in without changing the reader -- confirm before re-enabling.
#file = '/Users/ivan/Downloads/20240828_BARD1_AllScores_noX4.xlsx' #no X4A-K file

In [None]:
def read_scores(file): #Reads SGE scores
    """Load the SGE score table, keeping only the target and replicate columns.

    Parameters
    ----------
    file : str or path-like
        Path to a tab-separated file that contains a ``target`` column and the
        six ``D05_*``/``D13_*`` replicate columns listed in ``renamed`` below.

    Returns
    -------
    pandas.DataFrame
        Columns ``target``, ``D05 R1``, ``D05 R2``, ``D05 R3``, ``D13 R1``,
        ``D13 R2``, ``D13 R3`` (in that order). Each ``target`` value has its
        first seven characters removed; missing targets stay missing.

    Raises
    ------
    KeyError
        If any of the required columns is absent from the file.
    """
    target_prefix_len = 7 #number of leading characters dropped from every target value

    renamed = {
        'D05_R1_lib1': 'D05 R1',
        'D05_R2_lib1': 'D05 R2',
        'D05_R3_lib1': 'D05 R3',
        'D13_R1_lib1': 'D13 R1',
        'D13_R2_lib1': 'D13 R2',
        'D13_R3_lib1': 'D13 R3',
    } #Raw replicate column name -> name expected by downstream code
    all_columns = ['target', *renamed] #Columns to retain, in output order

    df = pd.read_csv(file, sep = '\t')

    # Vectorised string slicing: unlike a per-element lambda, .str[...] leaves
    # missing targets as NaN instead of aborting the whole load.
    return (
        df[all_columns]
        .rename(columns = renamed)
        .assign(target = lambda scores: scores['target'].str[target_prefix_len:])
    )

In [None]:
def group_targets(df): #Groups targets for correlation testing
    """Correlate every same-day replicate pair within each SGE target.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``target`` column plus the six replicate columns
        ``D05 R1`` ... ``D13 R3``.

    Returns
    -------
    list of list
        One ``[target, 'A vs B', r]`` entry per (target, replicate pair), with
        targets in groupby order and pairs in the order listed below; ``r`` is
        whatever ``compute_r`` returns for that pair.
    """
    replicate_cols = ['D05 R1', 'D05 R2', 'D05 R3',
                      'D13 R1', 'D13 R2', 'D13 R3'] #replicate columns fed to the correlation
    replicate_pairs = (
        ('D05 R1', 'D05 R2'), ('D05 R1', 'D05 R3'), ('D05 R2', 'D05 R3'),
        ('D13 R1', 'D13 R2'), ('D13 R1', 'D13 R3'), ('D13 R2', 'D13 R3'),
    ) #within-day pairs only; day 5 is never compared against day 13

    results = [] #accumulates one [target, label, r] entry per comparison
    for region, region_df in df.groupby('target'): #one sub-frame per SGE region
        replicates = region_df[replicate_cols]
        results.extend(
            [region, first + ' vs ' + second, compute_r(replicates, first, second)]
            for first, second in replicate_pairs
        )

    return results
    

In [None]:
def compute_r(group,col1,col2): #does the correlation math
    """Return the correlation between two columns of ``group``.

    Uses ``Series.corr`` with its default settings (Pearson in pandas).

    Parameters
    ----------
    group : pandas.DataFrame
        Frame holding both columns.
    col1, col2 : str
        Names of the columns to correlate.

    Returns
    -------
    float
        Correlation coefficient of ``group[col1]`` against ``group[col2]``.
    """
    left, right = group[col1], group[col2]
    return left.corr(right)

In [None]:
def parse_r_lists(r_list): #Turns the grouped correlation lists into a tidy dataframe
    """Convert ``[target, test label, r]`` entries into a long-format DataFrame.

    Parameters
    ----------
    r_list : iterable of sequence
        Each entry holds the target name at index 0, the replicate-pair label
        at index 1 and the correlation value at index 2.

    Returns
    -------
    pandas.DataFrame
        Columns ``Targets``, ``Tests`` and ``r_correlation``; the correlation
        is rounded to three decimal places with the built-in ``round``.
    """
    # Single pass over the input, rounding r as each entry is read
    rows = [(entry[0], entry[1], round(entry[2], 3)) for entry in r_list]

    columns = {
            'Targets' : [row[0] for row in rows],
            'Tests' : [row[1] for row in rows],
            'r_correlation': [row[2] for row in rows]
    } #column name -> values, in input order

    return pd.DataFrame(columns)

In [None]:
def heatmap(pivoted):
    """Draw the replicate-correlation heatmap and display it.

    Parameters
    ----------
    pivoted : pandas.DataFrame
        Long-format frame with ``Targets``, ``Tests`` and ``r_correlation``
        columns (the shape produced by ``parse_r_lists``).

    Returns
    -------
    None
        The chart is rendered via ``display()``; nothing is returned.
    """
    # Unique target names in natural order, used as the explicit y-axis ordering
    targets = natsorted(set(pivoted['Targets'].tolist()))

    chart_title = alt.TitleParams(text = 'Correlation of Replicates', fontSize = 40)

    x_tests = alt.X(
        'Tests:N',
        axis = alt.Axis(title = '', titleFontSize = 32, labelFontSize = 24, labelLimit = 300, labelAngle = 45),
    )
    y_targets = alt.Y(
        'Targets',
        axis = alt.Axis(title = 'SGE Target Region', titleFontSize = 32, labelFontSize = 24),
        sort = targets,
    )
    r_color = alt.Color(
        'r_correlation:Q',
        scale = alt.Scale(domain = [.2, 1]), #colour scale domain is fixed to 0.2-1
        legend = alt.Legend(title = "Pearson's r", titleFontSize = 26,labelFontSize = 22),
    )
    r_tooltip = [alt.Tooltip('r_correlation', title = "Pearson's r: ")]

    graph = (
        alt.Chart(pivoted, title = chart_title)
        .mark_rect()
        .encode(x = x_tests, y = y_targets, color = r_color, tooltip = r_tooltip)
        .properties(width = 1150, height = 700)
    )

    #To export the figure, call graph.save(<output path>, ppi = 500) here
    graph.display()


In [None]:
def main():
    """Run the notebook pipeline: load scores, correlate replicates, plot.

    Reads the module-level ``file`` path, computes the per-target replicate
    correlations, reshapes them into a DataFrame and displays the heatmap.
    """
    scores = read_scores(file)
    correlations = parse_r_lists(group_targets(scores))
    heatmap(correlations)

In [None]:
# Entry point: runs main(), which loads the scores, computes replicate correlations and displays the heatmap.
main()