In [None]:
import pandas as pd
import numpy as np
import altair as alt
from natsort import natsorted

In [None]:
# Path to the final data table (an Excel workbook). It is passed to read_scores,
# which loads the workbook's "snv_counts" sheet.
file = '../Data/BARD1_SGE_final_table.xlsx' #Final data table

In [None]:
def read_scores(file):
    """Load the SNV count table and normalise its labels for downstream code.

    Reads the "snv_counts" sheet of the Excel workbook at ``file``, drops the
    first seven characters of every ``target`` value so that only the target
    name remains, and renames the six replicate count columns so that the
    underscore between day and replicate becomes a space.

    Returns the resulting DataFrame.
    """
    spaced_names = {
        'D05_R1': 'D05 R1', 'D05_R2': 'D05 R2', 'D05_R3': 'D05 R3',
        'D13_R1': 'D13 R1', 'D13_R2': 'D13 R2', 'D13_R3': 'D13 R3',
    }

    counts = pd.read_excel(file, sheet_name = "snv_counts")

    # Everything from the eighth character onward is the target name.
    counts['target'] = counts['target'].map(lambda label: label[7:])

    # Column names with a space are what the downstream functions look up.
    return counts.rename(columns = spaced_names)

In [None]:
def group_targets(df):
    """Compute pairwise replicate correlations within each SGE target.

    ``df`` is grouped on its ``target`` column. For every group, each of the
    three replicate pairs at D05 and the three at D13 is handed to
    ``compute_r``.

    Returns a list of ``[target, "<rep A> vs <rep B>", r]`` lists, ordered by
    group and, within a group, by replicate pair.
    """
    count_columns = ['D05 R1', 'D05 R2', 'D05 R3',
                     'D13 R1', 'D13 R2', 'D13 R3']
    # Replicates are only compared within the same day.
    replicate_pairs = [
        ('D05 R1','D05 R2'), ('D05 R1','D05 R3'), ('D05 R2','D05 R3'),
        ('D13 R1','D13 R2'), ('D13 R1','D13 R3'), ('D13 R2','D13 R3'),
    ]

    results = []
    for target_name, target_rows in df.groupby('target'):
        counts = target_rows[count_columns]  # count columns only
        results.extend(
            [target_name, first + ' vs ' + second, compute_r(counts, first, second)]
            for first, second in replicate_pairs
        )

    return results
    

In [None]:
def compute_r(group,col1,col2):
    """Return the correlation between columns ``col1`` and ``col2`` of ``group``.

    Uses ``Series.corr`` with its default arguments (Pearson in pandas).
    """
    first = group[col1]
    second = group[col2]
    return first.corr(second)

In [None]:
def parse_r_lists(r_list):
    """Turn correlation records into a tidy DataFrame.

    Parameters
    ----------
    r_list : iterable of sequences
        Each item is ``[target, test, r]``: the SGE target name, a label for
        the replicate pair that was compared, and the correlation value.

    Returns
    -------
    pandas.DataFrame
        One row per record with columns ``Targets``, ``Tests``,
        ``r_correlation`` (``r`` rounded to three decimals) and ``exon``
        (the leading number of the target name, stored as a string).
    """
    records = list(r_list)  # materialise once so each column can be built from it

    to_map = pd.DataFrame({
        'Targets': [record[0] for record in records],
        'Tests': [record[1] for record in records],
        'r_correlation': [round(record[2], 3) for record in records],
    })

    # expand=False returns a Series, so the assignment does not depend on
    # pandas accepting a one-column DataFrame for a single column.
    to_map['exon'] = (
        to_map['Targets']
        .str.extract(r'^(\d+\.?\d*)', expand = False)
        .astype(str)
    )

    return to_map

In [None]:
def heatmap(pivoted):
    """Draw and return the replicate-correlation heat map.

    ``pivoted`` needs the columns ``Tests`` (x axis), ``Targets`` (y axis) and
    ``r_correlation`` (cell colour and tooltip). The chart is displayed and
    then returned so the caller can save it.
    """
    # Natural sort of the distinct target names fixes the y-axis order.
    target_order = natsorted(set(pivoted['Targets'].tolist()))

    chart_title = alt.TitleParams(text = 'Correlation of Replicates', fontSize = 32)
    tests_axis = alt.Axis(title = '', titleFontSize = 28, labelFontSize = 24,
                          labelLimit = 300, labelAngle = 45)
    targets_axis = alt.Axis(title = 'SGE Target Region', titleFontSize = 28,
                            labelFontSize = 24)
    r_legend = alt.Legend(title = "Pearson's r", titleFontSize = 24, labelFontSize = 22)

    graph = (
        alt.Chart(pivoted, title = chart_title)
        .mark_rect()
        .encode(
            x = alt.X('Tests:N', axis = tests_axis),
            y = alt.Y('Targets', axis = targets_axis, sort = target_order),
            color = alt.Color('r_correlation:Q',
                              scale = alt.Scale(domain = [.2, 1]),
                              legend = r_legend),
            tooltip = [alt.Tooltip('r_correlation', title = "Pearson's r: ")],
        )
        .properties(width = 600, height = 900)
    )

    graph.display()

    return graph

In [None]:
def _structure_boxplot(data, x_field, x_order, *, box_size, y_label_font_size,
                       width, height, x_axis_kwargs = None, title = None):
    """Build one black-and-white box plot of ``r_correlation`` per ``x_field`` category.

    ``x_order`` is the explicit category order for the x axis. ``x_axis_kwargs``
    are extra keyword arguments for the x ``alt.Axis``; ``title`` is only set
    on the chart when given. Returns the configured, interactive chart.
    """
    chart = alt.Chart(data).mark_boxplot(
        size = box_size,
        box = {'fill': 'white', 'stroke': 'black', 'strokeWidth': 1},
        median = {'stroke': 'black', 'strokeWidth': 2},
        outliers = {'fill': 'black', 'stroke': 'black'},
        ticks = {'stroke': 'black', 'strokeWidth': 1},
        rule = {'stroke': 'black', 'strokeWidth': 1}
    ).encode(
        y = alt.Y('r_correlation',
                  scale = alt.Scale(domain = [0.45, 1.02]),
                  axis = alt.Axis(title = "",
                                  labelFontSize = y_label_font_size,
                                  values = [0.5, 0.6, 0.7, 0.8, 0.9, 1.0])
                 ),
        x = alt.X(x_field,
                  axis = alt.Axis(title = '', ticks = False, **(x_axis_kwargs or {})),
                  sort = x_order
                 )
    )

    chart_properties = {'width': width, 'height': height}
    if title is not None:
        chart_properties['title'] = title

    return chart.properties(
        **chart_properties
    ).configure_view(
        stroke = None
    ).configure_axis(
        grid = False
    ).interactive()


def boxplot(df):
    """Display box plots of replicate correlations by structural class.

    ``df`` needs the columns ``Targets``, ``Tests`` and ``r_correlation``.
    Every row is used twice: once pooled under 'All Data' and once labelled
    by whether its target is in the exon-4 disordered region (targets 4A-4K)
    or not. Three charts are displayed: all time points together, Day 5 only
    (tests containing 'D05') and Day 13 only (tests containing 'D13').

    The input frame is left untouched; nothing is returned.
    """
    idr_targets = ['4A', '4B', '4C', '4D', '4E', '4F', '4G', '4H', '4I', '4J', '4K']
    domain_targets = {
        # Four separate names; a single comma-joined string never matches in isin().
        'RING': ['1B', '2', '3A', '3B'],
        'IDR': idr_targets,
        'ARD': ['4L', '5A', '5B', '6A', '6B', '7A', '7B'],
        'BRCT': ['8A','8B', '9A', '9B', '10A', '10B', '11A', '11B', '11C', '11D'],
    }

    structured_label = 'Structured\nDomains'
    disordered_label = ' X4 Disordered\nDomain'
    # The sort order reuses the exact label strings so every category matches.
    x_order = ['All Data', structured_label, disordered_label]

    pooled = df.copy()
    pooled['Domain'] = 'All Data'
    pooled['Structure'] = 'All Data'

    # Label a copy so the caller's DataFrame does not gain extra columns.
    labelled = df.copy()
    labelled['Domain'] = None
    for domain_name, targets in domain_targets.items():
        labelled.loc[labelled['Targets'].isin(targets), 'Domain'] = domain_name

    labelled['Structure'] = structured_label
    labelled.loc[labelled['Targets'].isin(idr_targets), 'Structure'] = disordered_label

    combined = pd.concat([pooled, labelled])

    combined['Day'] = None
    combined.loc[combined['Tests'].str.contains('D05'), 'Day'] = 'Day 5'
    combined.loc[combined['Tests'].str.contains('D13'), 'Day'] = 'Day 13'

    overall_plot = _structure_boxplot(
        combined, 'Structure', x_order,
        box_size = 75, y_label_font_size = 16,
        width = 300, height = 500,
        x_axis_kwargs = {'labelAngle': 0},
    )
    overall_plot.display()

    for i, day_label in enumerate(["Day 5", "Day 13"]):
        day_df = combined.loc[combined['Day'] == day_label].copy()

        # Each per-day chart plots a uniquely named copy of the Structure
        # column (kept from the original code; presumably avoids clashes
        # between the separately displayed interactive charts - TODO confirm).
        x_field = f'Structure_{i}'
        day_df[x_field] = day_df['Structure']

        day_plot = _structure_boxplot(
            day_df, x_field, x_order,
            box_size = 50, y_label_font_size = 18,
            width = 200, height = 400,
            title = day_label,
        )
        day_plot.display()


In [None]:
def main():
    """Run the analysis end to end: load counts, correlate replicates, draw figures."""
    counts = read_scores(file)
    correlations = parse_r_lists(group_targets(counts))

    # heatmap returns the chart, so it can be exported with .save(...) if needed.
    supp_heatmap = heatmap(correlations)
    boxplot(correlations)

In [None]:
# Run the pipeline: read the counts, compute replicate correlations, display the heat map and box plots.
main()