In [None]:
import pandas as pd
import altair as alt
from natsort import natsorted

from pydeseq2.dds import DeseqDataSet
from pydeseq2.default_inference import DefaultInference
from pydeseq2.ds import DeseqStats

In [None]:
# Workbook with the per-variant table; read by get_rescore / get_rescore_x4_only.
original_data = '../Data/20250508_BARD1scores_update_FILTERED.xlsx'

# Workbook with per-target correlations ('Targets', 'exon', 'r_correlation' columns are read).
# NOTE(review): absolute, machine-specific path -- the notebook will not run elsewhere as is.
agg_correlation = '/Users/ivan/Desktop/test_excel_outputs/20250721_AggMedianRCorrelation.xlsx'

# Both cutoffs are produced by the same linear transform: raw * scale + offset.
# cutoffs[0] is drawn as the red rule and cutoffs[1] as the blue rule in strip_plot.
# NOTE(review): the constants look like thresholds mapped back onto the score
# scale (raw * sd + mean) -- provenance is not shown here, confirm.
_cutoff_scale = 0.028675
_cutoff_offset = 0.009242
cutoffs = [raw * _cutoff_scale + _cutoff_offset for raw in (-3.438968, -3.018904)]

# Lift Altair's default row limit so the full variant table can be plotted.
alt.data_transformers.disable_max_rows()

In [None]:
def get_rescore(data_file, corr_file):
    """Load the variant table and split it on whether `snvlib_lib2` is populated.

    Parameters
    ----------
    data_file : path to the Excel workbook holding the variant table.
    corr_file : path to the Excel workbook holding per-target correlations
        (columns 'Targets', 'exon' and 'r_correlation' are read).

    Returns
    -------
    tuple of
        rescore_df     -- rows with a non-null `snvlib_lib2`, restricted to the
                          target / pos_id / lib1 / lib2 count columns,
        r_dict         -- {'BARD1_X' + target: r_correlation},
        targets_sorted -- natural-sorted list of the prefixed target names,
        no_rescore     -- rows whose `snvlib_lib2` is null (all columns kept),
        df             -- the full table as read from `data_file`.
    """
    replicates = ['D05_R1', 'D05_R2', 'D05_R3', 'D13_R1', 'D13_R2', 'D13_R3']
    count_columns = (
        ['target', 'pos_id', 'snvlib_lib1']
        + [rep + '_lib1' for rep in replicates]
        + ['snvlib_lib2']
        + [rep + '_lib2' for rep in replicates]
    )

    full_table = pd.read_excel(data_file)
    print('Full Dataset Length: ', len(full_table))

    has_second_library = full_table['snvlib_lib2'].notna()
    with_lib2 = full_table.loc[has_second_library, count_columns]
    without_lib2 = full_table.loc[~has_second_library]

    correlations = pd.read_excel(corr_file).rename(columns={'Targets': 'target'})

    # Prefix so the names line up with the 'BARD1_X...' labels used in the variant table.
    correlations['target'] = 'BARD1_X' + correlations['target']
    correlations['exon'] = 'BARD1_X' + correlations['exon'].astype(str)

    r_by_target = dict(zip(correlations['target'], correlations['r_correlation']))
    ordered_targets = natsorted(correlations['target'].tolist())

    print('Rescore after get_rescore: ', len(with_lib2))
    print('No rescore after get_rescore: ', len(without_lib2))

    return with_lib2, r_by_target, ordered_targets, without_lib2, full_table

In [None]:
def get_rescore_x4_only(data_file, corr_file):
    """Like `get_rescore`, but only exon 'BARD1_X4' rows are selected for rescoring.

    A row is selected for rescoring when its `exon` equals 'BARD1_X4' and its
    `snvlib_lib2` is non-null. Every other row is returned in `no_rescore`.
    Both sets come from one boolean mask and its complement, so each input row
    lands in exactly one of them. (Previously the no-rescore set was built from
    a `target` substring match while the rescore set used `exon`, so rows where
    the two columns disagreed could be duplicated or lost downstream.)

    Parameters
    ----------
    data_file : path to the Excel workbook holding the variant table.
    corr_file : path to the Excel workbook holding per-target correlations
        (columns 'Targets', 'exon' and 'r_correlation' are read).

    Returns
    -------
    tuple of
        rescore_df     -- selected rows, restricted to the target / pos_id /
                          lib1 / lib2 count columns,
        r_dict         -- {'BARD1_X' + target: r_correlation},
        targets_sorted -- natural-sorted list of the prefixed target names,
        no_rescore     -- all rows not selected, in their original order,
        df             -- the full table as read from `data_file`.
    """
    count_columns = ['target', 'pos_id', 'snvlib_lib1', 'D05_R1_lib1', 'D05_R2_lib1', 'D05_R3_lib1',
                     'D13_R1_lib1', 'D13_R2_lib1', 'D13_R3_lib1',
                     'snvlib_lib2', 'D05_R1_lib2', 'D05_R2_lib2', 'D05_R3_lib2',
                     'D13_R1_lib2', 'D13_R2_lib2', 'D13_R3_lib2']

    df = pd.read_excel(data_file)
    print('Full Dataset Length: ', len(df))

    # Single source of truth for the split: rescore set and its exact complement.
    rescore_mask = df['exon'].isin(['BARD1_X4']) & df['snvlib_lib2'].notna()
    rescore_df = df.loc[rescore_mask, count_columns]
    no_rescore = df.loc[~rescore_mask]

    r_df = pd.read_excel(corr_file)
    r_df = r_df.rename(columns={'Targets': 'target'})

    # Prefix so the names line up with the 'BARD1_X...' labels used in the variant table.
    r_df['target'] = 'BARD1_X' + r_df['target']
    r_df['exon'] = 'BARD1_X' + r_df['exon'].astype(str)

    r_dict = dict(zip(r_df['target'], r_df['r_correlation']))
    targets_sorted = natsorted(r_df['target'].tolist())

    print('Rescore after get_rescore: ', len(rescore_df))
    print('No rescore after get_rescore: ', len(no_rescore))

    return rescore_df, r_dict, targets_sorted, no_rescore, df

In [None]:
def prep_rescore(rescore_df, r_dict, targets_sorted):
    """Pick, per target, which library's counts to keep and give them generic names.

    For each `target` group in `rescore_df` the correlation of that target
    (`r_dict[target]`) is compared with the correlation of the entry that
    follows it in `targets_sorted`. If the target's own correlation is strictly
    greater, the `*_lib1` columns are kept; otherwise (including ties) the
    `*_lib2` columns are kept. Kept columns are renamed to 'D05_R1' ... 'D13_R3'
    and 'lib_count'; the other library's columns are dropped.

    Parameters
    ----------
    rescore_df : DataFrame with 'target', 'pos_id' and the lib1 / lib2 count columns.
    r_dict : mapping of target name -> correlation value.
    targets_sorted : ordered list of target names; the entry after a target is
        used as its lib2 counterpart.

    Returns
    -------
    DataFrame with columns target, pos_id, lib_count, D05_R1..D13_R3 and a fresh
    0..n-1 index. An empty input yields an empty frame with those columns.

    Raises
    ------
    KeyError   : a target (or its successor) is missing from `r_dict`.
    IndexError : a target is the last entry of `targets_sorted`, so it has no successor.
    ValueError : either correlation is missing/NaN. Previously such targets
        matched neither comparison branch and their rows were silently dropped.
    """
    replicate_cols = ['D05_R1', 'D05_R2', 'D05_R3', 'D13_R1', 'D13_R2', 'D13_R3']

    def _keep_library(frame, keep, discard):
        # Rename the kept library's columns to generic names, drop the other library's.
        renames = {f'{col}_{keep}': col for col in replicate_cols}
        renames[f'snvlib_{keep}'] = 'lib_count'
        unwanted = [f'snvlib_{discard}'] + [f'{col}_{discard}' for col in replicate_cols]
        return frame.rename(columns=renames).drop(columns=unwanted)

    ready_to_score = []
    for target, target_rows in rescore_df.groupby('target'):
        lib1_r = r_dict[target]

        lib2_index = targets_sorted.index(target) + 1
        if lib2_index >= len(targets_sorted):
            raise IndexError(
                f"target {target!r} is the last entry of targets_sorted; "
                "there is no following target to compare against"
            )
        lib2_target = targets_sorted[lib2_index]
        lib2_r = r_dict[lib2_target]

        # NaN compares False both ways, which used to drop the whole target silently.
        if pd.isna(lib1_r) or pd.isna(lib2_r):
            raise ValueError(
                f"cannot choose a library for target {target!r}: correlation is missing "
                f"({target!r}={lib1_r!r}, {lib2_target!r}={lib2_r!r})"
            )

        if lib1_r > lib2_r:
            ready_to_score.append(_keep_library(target_rows, 'lib1', 'lib2'))
        else:  # ties go to lib2, as before
            ready_to_score.append(_keep_library(target_rows, 'lib2', 'lib1'))

    if not ready_to_score:
        # pd.concat([]) raises; return an empty frame with the output schema instead.
        return _keep_library(rescore_df, 'lib1', 'lib2').reset_index(drop=True)

    return pd.concat(ready_to_score).reset_index(drop=True)

In [None]:
def score(df, all_df):
    """Run PyDESeq2 on per-variant counts and return annotated scores.

    The count columns of `df` (any column whose name contains 'D13', 'D05' or
    'lib_count') are reshaped to a samples x variants matrix keyed by `pos_id`,
    fitted with the design `~time` (D05 -> 5, D13 -> 13, lib_count -> 0) and the
    contrast ["time", 1, 0] is extracted. `log2FoldChange` is returned as
    'score' and `lfcSE` as 'standard_error'.
    NOTE(review): presumably the contrast gives a per-unit-time log2 fold
    change -- confirm against the PyDESeq2 version in use.

    Parameters
    ----------
    df : DataFrame with 'pos_id' plus the D05/D13/lib_count count columns.
    all_df : DataFrame supplying the annotation columns merged back onto the results.

    Returns
    -------
    DataFrame with columns pos, pos_id, ref, allele, exon, target,
    simplified_consequence, score, standard_error, amino_acid_change.
    """
    annotation_cols = ["pos", 'ref', 'exon', "allele", "pos_id", 'amino_acid_change',
                       'simplified_consequence', 'target']
    annotations = all_df[annotation_cols]

    count_cols = [name for name in df.columns
                  if any(tag in name for tag in ("D13", "D05", "lib_count"))]

    # Long -> wide: one row per sample column, one column per variant.
    long_counts = df.melt(id_vars=['pos_id'], value_vars=count_cols)
    counts = (
        long_counts
        .pivot(index='variable', columns=['pos_id'], values='value')
        .rename_axis(None, axis=1)
        .rename_axis(None, axis=0)
    )

    sample_days = {
        'D05_R1': 5, 'D05_R2': 5, 'D05_R3': 5,
        'D13_R1': 13, 'D13_R2': 13, 'D13_R3': 13,
        'lib_count': 0,
    }
    metadata = (
        pd.DataFrame({'sample_name': list(sample_days), 'time': list(sample_days.values())})
        .set_index('sample_name')
        .rename_axis(None, axis=0)
    )

    inference = DefaultInference(n_cpus=1)
    dataset = DeseqDataSet(
        counts=counts,
        metadata=metadata,
        design="~time",
        refit_cooks=True
    )
    dataset.deseq2()

    stats = DeseqStats(dataset, contrast=["time", 1, 0], inference=inference)
    stats.summary()  # populates results_df

    # Only 'pos_id' is shared between the results and the annotations, so it is the merge key.
    annotated = stats.results_df.reset_index(names='pos_id').merge(annotations)
    annotated = annotated.rename(columns={'log2FoldChange': 'score', 'lfcSE': 'standard_error'})

    output_order = ['pos', 'pos_id', 'ref', 'allele', 'exon', 'target',
                    'simplified_consequence', 'score', 'standard_error', 'amino_acid_change']
    return annotated.loc[:, output_order]

In [None]:
def remake_data(rescored, no_rescore):
    """Stack the untouched variants on top of the rescored ones.

    `no_rescore` is first narrowed to the score/annotation columns, then
    concatenated with `rescored` (original index labels are kept). The row
    count of the result is printed.

    Parameters
    ----------
    rescored : DataFrame of newly scored variants.
    no_rescore : DataFrame of variants whose existing scores are kept.

    Returns
    -------
    DataFrame holding the rows of `no_rescore` followed by those of `rescored`.
    """
    shared_columns = ["pos", 'ref', 'exon', "allele", "pos_id", 'score', 'standard_error',
                      'amino_acid_change', 'simplified_consequence', 'target']

    combined = pd.concat([no_rescore[shared_columns], rescored])
    print('# Variants After Rescore: ', len(combined))

    return combined

In [None]:
def rename_df(df):
    """Return a copy of `df` with a display-friendly 'Consequence' column.

    The 'simplified_consequence' column is renamed to 'Consequence' and its
    values are relabelled by the rules below, applied in order. Values matching
    no rule are left unchanged. Missing values (NaN/None) are left as they are;
    previously they made the substring masks contain NaN, so `.loc` raised
    ValueError.

    Parameters
    ----------
    df : DataFrame with a 'simplified_consequence' (or already renamed
        'Consequence') column of strings.

    Returns
    -------
    A new DataFrame; the frame passed in is not modified.
    """
    df = df.rename(columns={'simplified_consequence': 'Consequence'})

    # (match kind, pattern, replacement label) -- order matters and mirrors the
    # original statement sequence.
    relabel_rules = [
        ('contains', 'missense', 'Missense'),
        ('equals', 'synonymous_variant', 'Synonymous'),
        ('equals', 'intron_variant', 'Intron'),
        ('equals', 'stop_gained', 'Stop Gained'),
        ('equals', 'stop_lost', 'Stop Lost'),
        ('contains', 'site', 'Canonical Splice'),
        ('contains', 'ing_var', 'Splice Region'),
        ('contains', 'UTR', 'UTR Variant'),
        ('equals', 'start_lost', 'Start Lost'),
    ]

    for kind, pattern, label in relabel_rules:
        if kind == 'contains':
            # na=False keeps the mask strictly boolean when values are missing.
            hits = df['Consequence'].str.contains(pattern, na=False)
        else:
            hits = df['Consequence'] == pattern
        df.loc[hits, 'Consequence'] = label

    return df

In [None]:
def strip_plot(df, cutoffs, plot_title):
    """Build a tick (strip) plot of `score` per consequence class with two cutoff rules.

    Parameters
    ----------
    df : DataFrame with 'score', 'Consequence', 'target' and 'amino_acid_change'
        columns (the latter two feed the tooltip).
    cutoffs : sequence of two x positions; cutoffs[0] is drawn as a red vertical
        rule and cutoffs[1] as a blue one.
    plot_title : title placed on the chart.

    Returns
    -------
    A layered Altair chart: ticks + red rule + blue rule.
    """
    nf_line = alt.Chart(pd.DataFrame({'x': [cutoffs[0]]})).mark_rule(color = 'red').encode(
        x = 'x')

    func_lin = alt.Chart(pd.DataFrame({'x': [cutoffs[1]]})).mark_rule(color = 'blue').encode(
        x = 'x')

    # Row/colour order. The labels must match what rename_df writes into
    # 'Consequence' ('Canonical Splice', 'Splice Region', 'UTR Variant'); the
    # older 'Splice' / 'UTR' labels are kept alongside so frames that still use
    # them sort in the same place. Named so it no longer shadows builtin sorted().
    consequence_order = ["Intron", "Missense", "Synonymous", "Stop Gained",
                         "Splice", "Canonical Splice", "Splice Region",
                         "Start Lost", 'Stop Lost', 'UTR', 'UTR Variant']

    plot = alt.Chart(df).mark_tick(opacity = 1).encode(
        x = alt.X('score:Q',
                  axis = alt.Axis(title = '',
                                  titleFontSize = 20,
                                  labelFontSize = 24),
                  # fixed x range so stacked panels are directly comparable
                  scale = alt.Scale(domain = [-0.5, 0.15])
                 ),
        y = alt.Y('Consequence:N',
                  sort = consequence_order,
                  axis = alt.Axis(title = '',
                                  labelFontSize = 24)
                 ),
        color = alt.Color('Consequence:N',
                          legend = None,
                          sort = consequence_order,
                          scale = alt.Scale(scheme = 'category10')
                         ),
        tooltip = [alt.Tooltip('target', title = 'SGE Target: '),
                   alt.Tooltip('amino_acid_change', title = 'Amino Acid Change: ')]
        ).properties(
            width = 800,
            height = 400,
            title = plot_title
        )

    return plot + nf_line + func_lin

In [None]:
def visualize(og_df, rescore_df, x4_df, cutoffs):
    """Display three vertically stacked strip plots, one per scoring variant.

    Each frame is passed through `rename_df` and then `strip_plot`; the panels
    are titled 'No Rescore', 'All Overlaps Rescored' and
    'X4 Overlaps Rescored Only', in that order from top to bottom.

    Parameters
    ----------
    og_df : the original variant table.
    rescore_df : table with all overlap variants rescored.
    x4_df : table with only the X4 overlap variants rescored.
    cutoffs : two-element sequence forwarded to `strip_plot`.
    """
    panels = (
        (og_df, 'No Rescore'),
        (rescore_df, 'All Overlaps Rescored'),
        (x4_df, 'X4 Overlaps Rescored Only'),
    )

    strips = [strip_plot(rename_df(frame), cutoffs, heading) for frame, heading in panels]

    top, middle, bottom = strips
    stacked = top & middle & bottom  # `&` stacks Altair charts vertically
    stacked.display()

In [None]:
def main():
    """Run both rescoring variants end to end and plot them next to the original scores.

    Steps, in order: split the table (all overlaps, then X4 only), choose the
    library per target, score with PyDESeq2, merge the new scores with the
    untouched variants, and display the three strip plots. Uses the
    module-level `original_data`, `agg_correlation` and `cutoffs`.
    """
    # The full table returned by the first loader is superseded by the second
    # loader's copy, which is the one used for annotation and plotting.
    overlap_counts, overlap_r, overlap_targets, overlap_untouched, _ = get_rescore(
        original_data, agg_correlation)
    x4_counts, x4_r, x4_targets, x4_untouched, variants = get_rescore_x4_only(
        original_data, agg_correlation)

    overlap_ready = prep_rescore(overlap_counts, overlap_r, overlap_targets)
    x4_ready = prep_rescore(x4_counts, x4_r, x4_targets)

    overlap_scores = score(overlap_ready, variants)
    x4_scores = score(x4_ready, variants)

    overlap_final = remake_data(overlap_scores, overlap_untouched)
    x4_final = remake_data(x4_scores, x4_untouched)

    visualize(variants, overlap_final, x4_final, cutoffs)

In [None]:
# Run the whole pipeline: load, rescore (all overlaps and X4 only), and display the plots.
main()