In [None]:
import pandas as pd
import altair as alt

from pydeseq2.dds import DeseqDataSet
from pydeseq2.default_inference import DefaultInference
from pydeseq2.ds import DeseqStats

In [None]:
# Input workbooks: the filtered BARD1 score table and the per-position median depth table.
# NOTE(review): depth_file is an absolute path on one machine — confirm before running elsewhere.
bard1_data = '../Data/20250508_BARD1scores_update_FILTERED.xlsx'
depth_file = '/Users/ivan/Desktop/test_excel_outputs/20250717_BARD1_CollapsedMedianDepth.xlsx'

# Score cutoffs as [lower, upper]: scores <= cutoffs[0] are classed functionally abnormal and
# scores >= cutoffs[1] functionally normal (see merge_rescored).
# Each cutoff is a multiplier applied to a shared scale plus a shared offset.
# NOTE(review): presumably multiples of a fitted SD around a fitted mean — confirm provenance.
_cutoff_scale = 0.028675
_cutoff_offset = 0.009242
cutoffs = [multiplier * _cutoff_scale + _cutoff_offset
           for multiplier in (-3.438968, -3.018904)]

In [None]:
def process_data(file):
    """Load the score workbook and keep BARD1_X4 missense/synonymous variants.

    Parameters
    ----------
    file : str or path-like
        Excel workbook readable by ``pandas.read_excel``. It must contain the
        columns ``exon``, ``simplified_consequence``, ``functional_consequence``
        and ``snvlib_lib2``.

    Returns
    -------
    df : pandas.DataFrame
        The filtered variants with two added columns: ``overlapping`` ('Y' when
        ``snvlib_lib2`` is not null, otherwise 'N') and ``consequence_overlap``
        (``functional_consequence`` + '+' + ``overlapping``).
    not_normal : pandas.DataFrame
        Rows of ``df`` whose ``functional_consequence`` is
        'functionally_abnormal' or 'indeterminate' and whose ``snvlib_lib2`` is
        not null.
    """
    scores = pd.read_excel(file)

    in_exon = scores['exon'].isin(['BARD1_X4'])
    wanted_consequence = scores['simplified_consequence'].isin(['missense_variant', 'synonymous_variant'])
    # Take an explicit copy: the columns added below must land on an independent
    # frame, not on a slice of the workbook (avoids SettingWithCopyWarning).
    df = scores.loc[in_exon & wanted_consequence].copy()

    has_lib2_count = df['snvlib_lib2'].notna()
    df['overlapping'] = 'N'
    df.loc[has_lib2_count, 'overlapping'] = 'Y'

    df['consequence_overlap'] = df['functional_consequence'] + '+' + df['overlapping']

    flagged = df['functional_consequence'].isin(['functionally_abnormal', 'indeterminate'])
    # Independent copy so later edits to one returned frame cannot leak into the other.
    not_normal = df.loc[flagged & has_lib2_count].copy()

    return df, not_normal

In [None]:
def std_error(df, depth, region = (214780602, 214781509)):
    """Display four vertically stacked, position-aligned diagnostic charts.

    The charts appear in this order: standard error, median depth (read from
    ``depth``), score, and the minimum count across the twelve D05/D13
    replicate count columns. All four share a reversed, label-free ``pos`` axis.

    Parameters
    ----------
    df : pandas.DataFrame
        Variant table containing ``pos``, ``standard_error``, ``score``,
        ``functional_consequence``, ``target``, ``amino_acid_change`` and the
        twelve ``D05_*``/``D13_*`` count columns. It is not modified.
    depth : str or path-like
        Excel workbook with ``pos``, ``median_depth``, ``target`` and
        ``amino_acid`` columns.
    region : tuple of (int, int), optional
        Inclusive ``(start, end)`` positions used as the x-axis domain and to
        filter the depth table. Defaults to the range previously hard-coded.

    Returns
    -------
    None
        The combined chart is rendered with ``display()``.
    """
    tc_columns = ['D05_R1_lib1', 'D05_R1_lib2', 'D05_R2_lib1', 'D05_R2_lib2', 'D05_R3_lib1', 'D05_R3_lib2',
                  'D13_R1_lib1', 'D13_R1_lib2', 'D13_R2_lib1', 'D13_R2_lib2', 'D13_R3_lib1', 'D13_R3_lib2']
    start, end = region

    # assign() builds a new frame, so the caller's DataFrame is left untouched.
    plot_df = df.assign(min_tc_count = df[tc_columns].min(axis = 1))

    def position_axis():
        # Shared x encoding: fixed domain, reversed, tick labels hidden.
        return alt.X('pos',
                     axis = alt.Axis(labels = False),
                     scale = alt.Scale(domain = [start, end],
                                       reverse = True)
                    )

    def variant_tooltips():
        # Fresh list per chart so one chart's additions never reach another.
        return [alt.Tooltip('target', title = 'SGE Target: '),
                alt.Tooltip('amino_acid_change', title = 'Amino Acid Sub.: '),
                alt.Tooltip('score', title = 'SGE Score: '),
                alt.Tooltip('functional_consequence', title = 'Functional Class: ')
               ]

    def sized(panel, title):
        # Every panel uses the same dimensions so the stacked x axes line up.
        return panel.properties(width = 2000, height = 400, title = title)

    variants = alt.Chart(plot_df)

    chart = sized(
        variants.mark_point().encode(
            x = position_axis(),
            y = 'standard_error',
            color = 'functional_consequence',
            tooltip = variant_tooltips()
        ),
        'Std. Error vs. Position'
    )

    count_chart = sized(
        variants.mark_point().encode(
            x = position_axis(),
            y = alt.Y('min_tc_count',
                      scale = alt.Scale(domain = [0, 1000])
                     ),
            color = 'functional_consequence',
            tooltip = variant_tooltips() + [alt.Tooltip('min_tc_count', title = 'Min. Count in TC Library: ')]
        ),
        'Min. Count vs. Position'
    )

    score_chart = sized(
        variants.mark_point().encode(
            x = position_axis(),
            y = 'score',
            color = 'functional_consequence',
            tooltip = variant_tooltips()
        ),
        'SGE Score vs. Position'
    )

    depth_df = pd.read_excel(depth)
    # between() is inclusive on both ends, matching the plotted domain.
    depth_df = depth_df.loc[depth_df['pos'].between(start, end)]

    depth_chart = sized(
        alt.Chart(depth_df).mark_line().encode(
            x = position_axis(),
            y = 'median_depth',
            color = 'target',
            tooltip = [alt.Tooltip('target', title = 'SGE Target: '),
                       alt.Tooltip('amino_acid', title = 'Amino Acid: ')
                      ]
        ),
        'Median Normalized Depth vs. Position'
    )

    # Vertical stacking order: standard error, depth, score, minimum count.
    final_plot = chart & depth_chart & score_chart & count_chart

    final_plot = final_plot.interactive()
    final_plot.display()

In [None]:
def counts_for_abnormal(ab_df):
    """Display a faceted bar chart of per-library counts, one facet per amino-acid change.

    Parameters
    ----------
    ab_df : pandas.DataFrame
        Variant table containing ``amino_acid_change``, ``snvlib_lib1``,
        ``snvlib_lib2`` and the twelve ``D05_*``/``D13_*`` count columns.

    Returns
    -------
    None
        The faceted chart is rendered with ``display()``.
    """
    timepoint_columns = [
        'D05_R1_lib1', 'D05_R1_lib2', 'D05_R2_lib1', 'D05_R2_lib2', 'D05_R3_lib1', 'D05_R3_lib2',
        'D13_R1_lib1', 'D13_R1_lib2', 'D13_R2_lib1', 'D13_R2_lib2', 'D13_R3_lib1', 'D13_R3_lib2',
    ]

    # The snvlib_* columns are selected but not melted; only timepoint counts are plotted.
    selected = ab_df[['snvlib_lib1', 'snvlib_lib2', *timepoint_columns, 'amino_acid_change']]

    # Long format: one row per (amino_acid_change, library) pair.
    long_counts = selected.melt(
        id_vars = ['amino_acid_change'],
        value_vars = timepoint_columns,
        var_name = 'library',
        value_name = 'count'
    )

    bars = alt.Chart(long_counts).mark_bar().encode(
        x = alt.X('library:N', axis = alt.Axis(labelAngle = -45)),
        y = 'count:Q',
        color = 'library:N',
        tooltip = [
            alt.Tooltip('library', title = 'Library: '),
            alt.Tooltip('count', title = 'Count: '),
        ]
    ).properties(height = 150, width = 200)

    # Wrap the facets into a grid 14 panels wide.
    grid = bars.facet(facet = 'amino_acid_change:N', columns = 14)
    grid.display()

In [None]:
def overlap_nonoverlap_barchart(df):
    """Print and plot variant counts per functional class, split by overlap status.

    Parameters
    ----------
    df : pandas.DataFrame
        Variant table containing ``functional_consequence`` and ``overlapping``
        columns.

    Returns
    -------
    None
        The summary table is printed and the stacked bar chart is rendered
        with ``display()``.
    """
    summary_df = df.groupby(['functional_consequence', 'overlapping']).size().reset_index(name = 'count')

    # Share of each (class, overlap) cell within its functional class ...
    class_totals = summary_df.groupby('functional_consequence')['count'].transform('sum')
    summary_df['percentage_normal/ab'] = (summary_df['count'] / class_totals) * 100
    # ... and within its overlap group.
    overlap_totals = summary_df.groupby('overlapping')['count'].transform('sum')
    summary_df['percentage_overlapping'] = (summary_df['count'] / overlap_totals) * 100

    # Extra "All Variants" rows: the per-overlap totals across every functional class.
    totals = summary_df.groupby('overlapping')['count'].sum().reset_index()
    totals['functional_consequence'] = 'All Variants'
    totals['percentage_normal/ab'] = 100
    totals['percentage_overlapping'] = (totals['count'] / totals['count'].sum()) * 100

    summary_df = pd.concat([summary_df, totals], ignore_index = True)
    print(summary_df)

    # NOTE(review): the title says "Missense" but this function does not filter on
    # consequence type — confirm the input only holds what the title claims.
    chart = alt.Chart(summary_df).mark_bar().encode(
        x = alt.X('functional_consequence',
                  sort = ['All Variants', 'functionally_normal', 'functionally_abnormal'],
                  axis = alt.Axis(labelAngle = 0)
                 ),
        y = alt.Y('count'),
        color = alt.Color('overlapping',
                          sort = ['Y', 'N'],
                          legend = alt.Legend(title = 'In Overlap?')
                         )
    ).properties(
        width = 300,
        height = 300,
        title = 'X4 Missense Variants'
    )

    # The chart was previously built and then dropped; render it like the
    # other plotting functions in this notebook do.
    chart.display()

    


In [None]:
def overlapping_scatter(df):
    """Display D13 replicate-vs-replicate count scatters, faceted by target.

    Only rows with a non-null ``snvlib_lib2`` and a ``functional_consequence``
    of 'functionally_abnormal' are plotted. For each library, one interactive
    chart is shown per replicate pair (R1/R2, R1/R3, R2/R3).

    Parameters
    ----------
    df : pandas.DataFrame
        Variant table containing ``snvlib_lib2``, ``functional_consequence``,
        ``target`` and the ``D13_R*_lib1``/``D13_R*_lib2`` count columns.

    Returns
    -------
    None
        Charts are rendered with ``display()``.
    """
    has_lib2_count = df['snvlib_lib2'].notna()
    is_abnormal = df['functional_consequence'].isin(['functionally_abnormal'])
    abnormal_overlap = df.loc[has_lib2_count & is_abnormal]

    # (printed heading, replicate column pairs) for each library.
    d13_pairs_by_library = (
        ('Library 1: ', [('D13_R1_lib1', 'D13_R2_lib1'),
                         ('D13_R1_lib1', 'D13_R3_lib1'),
                         ('D13_R2_lib1', 'D13_R3_lib1')]),
        ('Library 2: ', [('D13_R1_lib2', 'D13_R2_lib2'),
                         ('D13_R1_lib2', 'D13_R3_lib2'),
                         ('D13_R2_lib2', 'D13_R3_lib2')]),
    )

    for heading, replicate_pairs in d13_pairs_by_library:
        print(heading)
        for x_rep, y_rep in replicate_pairs:
            scatter = alt.Chart(abnormal_overlap).mark_point().encode(
                x = x_rep,
                y = y_rep
            ).interactive()

            # Facets share x/y scales; independent scales are intentionally not enabled.
            scatter.facet('target').display()
        

In [None]:
def _library_counts(df, lib):
    """Return ``pos_id`` plus one library's counts under library-agnostic column names."""
    samples = ['D05_R1', 'D05_R2', 'D05_R3', 'D13_R1', 'D13_R2', 'D13_R3']
    library_column = 'snvlib_' + lib
    renames = {sample + '_' + lib: sample for sample in samples}
    renames[library_column] = 'lib_count'
    selected = ['pos_id', library_column] + [sample + '_' + lib for sample in samples]
    return df[selected].rename(columns = renames)


def _deseq_rescore(counts_df, annotations):
    """Fit the ``~time`` PyDESeq2 model to one library's counts and return per-variant scores."""
    sample_days = {'D05_R1': 5, 'D05_R2': 5, 'D05_R3': 5,
                   'D13_R1': 13, 'D13_R2': 13, 'D13_R3': 13,
                   'lib_count': 0}

    # Reshape to a samples x variants matrix (rows = samples, columns = pos_id).
    melted = pd.melt(counts_df, id_vars = ['pos_id'], value_vars = list(sample_days))
    counts = pd.pivot(melted, index = 'variable', columns = ['pos_id'], values = 'value')
    counts = counts.rename_axis(None, axis = 1).rename_axis(None, axis = 0)

    metadata = pd.DataFrame({'sample_name': list(sample_days),
                             'time': list(sample_days.values())})
    metadata = metadata.set_index('sample_name').rename_axis(None, axis = 0)
    # Keep count rows in the same sample order as the metadata rows.
    counts = counts.loc[metadata.index]

    # One inference object for both the fit and the statistics, so n_cpus=1 applies to both.
    inference = DefaultInference(n_cpus = 1)
    dds = DeseqDataSet(
        counts = counts,
        metadata = metadata,
        design = "~time",
        refit_cooks = True,
        inference = inference
    )
    dds.deseq2()

    # NOTE(review): `time` is numeric (0, 5, 13), so the 1-vs-0 contrast presumably
    # yields a per-unit-time log2 fold change — confirm against the installed pydeseq2.
    stat_res = DeseqStats(dds, contrast = ["time", 1, 0], inference = inference)
    stat_res.summary()

    results = stat_res.results_df.reset_index(names = 'pos_id')
    results = results.merge(
        annotations[['pos', 'allele', 'pos_id', 'amino_acid_change', 'simplified_consequence']],
        on = 'pos_id'
    )
    # NOTE(review): the same target label is applied to both libraries — confirm this is intended.
    results['target'] = 'BARD1_X4A'

    results = results[['pos_id', 'log2FoldChange', 'lfcSE', 'pos', 'allele',
                       'amino_acid_change', 'simplified_consequence', 'target']]
    return results.rename(columns = {'log2FoldChange': 'score', 'lfcSE': 'standard_error'})


def _pilot_scatter(base, y_field, y_domain, title):
    """Build one position-vs-``y_field`` scatter panel on the shared reversed x axis."""
    return base.mark_point().encode(
        x = alt.X('pos',
                  axis = alt.Axis(labels = False),
                  scale = alt.Scale(domain = [214780602,214781509],
                                    reverse = True)
                 ),
        y = alt.Y(y_field,
                  scale = alt.Scale(domain = y_domain)
                 ),
        color = alt.Color('simplified_consequence',
                          sort = ['synonymous_variant', 'missense_variant']
                         ),
        tooltip = [alt.Tooltip('target', title = 'SGE Target: '),
                   alt.Tooltip('amino_acid_change', title = 'Amino Acid Sub.: '),
                   alt.Tooltip('score', title = 'SGE Score: ')
                  ]
    ).properties(
        width = 2000,
        height = 400,
        title = title
    )


def rescore_pilot(df, cutoffs):
    """Re-score variants from each library's counts alone and plot the comparison.

    Each library (lib1, lib2) is scored independently with PyDESeq2. Two
    stacked figures are then displayed, one for scores and one for standard
    errors, each with three panels: the original values in ``df``, the
    lib1-only values, and the lib2-only values. Score panels also draw the two
    cutoffs as horizontal rules (red = ``cutoffs[0]``, blue = ``cutoffs[1]``).

    Side effect: the renamed lib2 count table is written to a hard-coded
    Excel path before lib2 is scored.

    Parameters
    ----------
    df : pandas.DataFrame
        Variant table containing ``pos_id``, ``pos``, ``allele``,
        ``amino_acid_change``, ``simplified_consequence``, ``target``,
        ``score``, ``standard_error``, ``snvlib_lib1``/``snvlib_lib2`` and the
        ``D05_*``/``D13_*`` count columns for both libraries.
    cutoffs : sequence of float
        ``[lower, upper]`` score thresholds drawn on the score panels.

    Returns
    -------
    pandas.DataFrame
        The lib2-only scores with columns ``pos_id``, ``score``,
        ``standard_error``, ``pos``, ``allele``, ``amino_acid_change``,
        ``simplified_consequence`` and ``target``. The lib1-only scores are
        plotted but not returned.
    """
    lib1_scored_df = _deseq_rescore(_library_counts(df, 'lib1'), df)

    lib2_counts = _library_counts(df, 'lib2')
    # NOTE(review): absolute, machine-specific output path — confirm before running elsewhere.
    lib2_counts.to_excel('/Users/ivan/Desktop/test_excel_outputs/20250718_X4Overlap_lib2Counts.xlsx', index = None)
    lib2_scored_df = _deseq_rescore(lib2_counts, df)

    panels = [('Lib1 & Lib2 Counts', df), ('Lib1 Counts', lib1_scored_df), ('Lib2 Counts', lib2_scored_df)]

    score_plots = []
    se_plots = []
    # Distinct loop variable so the `df` parameter is not shadowed.
    for title, panel_df in panels:
        base = alt.Chart(panel_df)

        score_chart = _pilot_scatter(base, 'score', [-0.25, 0.1], 'With ' + title + ' SGE Scores')
        nf_line = base.mark_rule(color='red').encode(
            y=alt.datum(cutoffs[0])
        )
        func_line = base.mark_rule(color='blue').encode(
            y=alt.datum(cutoffs[1])
        )
        score_plots.append(score_chart + nf_line + func_line)

        # Title typo fixed ("Stanard" -> "Standard").
        se_plots.append(_pilot_scatter(base, 'standard_error', [0, 0.1], 'With ' + title + ' Standard Error'))

    score_plot = score_plots[0] & score_plots[1] & score_plots[2]
    se_plot = se_plots[0] & se_plots[1] & se_plots[2]
    score_plot.display()
    se_plot.display()

    return lib2_scored_df

In [None]:
def merge_rescored(data_path, rescored, cutoffs):
    """Swap re-scored variants into the original score table and re-classify every row.

    Rows of the original workbook whose ``pos_id`` appears in ``rescored`` are
    dropped and the ``rescored`` rows are appended in their place. Every row is
    then labelled 'functionally_abnormal' (score <= ``cutoffs[0]``),
    'functionally_normal' (score >= ``cutoffs[1]``) or 'indeterminate'
    (anything else, including missing scores).

    Parameters
    ----------
    data_path : str or path-like
        Excel workbook with the original scores. It must contain ``exon``,
        ``pos_id``, ``score``, ``standard_error``, ``pos``, ``allele``,
        ``amino_acid_change``, ``simplified_consequence`` and ``target``.
    rescored : pandas.DataFrame
        Replacement rows with the same columns except ``exon``, which is filled
        with 'BARD1_X4'. The caller's frame is not modified.
    cutoffs : sequence of float
        ``[lower, upper]`` score thresholds.

    Returns
    -------
    pandas.DataFrame
        The merged, re-classified table with a fresh 0..n-1 index.
    """
    score_columns = ['exon', 'pos_id', 'score', 'standard_error', 'pos', 'allele',
                     'amino_acid_change', 'simplified_consequence', 'target']

    # assign() returns a new frame, so the caller's `rescored` is left untouched.
    rescored = rescored.assign(exon = 'BARD1_X4')

    original_df = pd.read_excel(data_path)[score_columns]
    superseded = original_df['pos_id'].isin(rescored['pos_id'].tolist())

    # ignore_index avoids duplicate row labels in the combined table.
    df = pd.concat([original_df.loc[~superseded], rescored], ignore_index = True)

    df['functional_consequence'] = 'indeterminate'
    df.loc[df['score'] <= cutoffs[0], 'functional_consequence'] = 'functionally_abnormal'
    df.loc[df['score'] >= cutoffs[1], 'functional_consequence'] = 'functionally_normal'

    df = df.loc[:, ['exon', 'target', 'pos', 'allele', 'pos_id', 'score', 'standard_error',
                    'simplified_consequence', 'amino_acid_change', 'functional_consequence']]

    #df.to_excel('/Users/ivan/Desktop/test_excel_outputs/20250718_BARD1scores_X4Rescored_filtered.xlsx', index = False)

    # Previously the merged table was discarded; return it so callers can use or save it.
    return df
    

In [None]:
def main():
    """Run the notebook's analysis end to end: load, plot diagnostics, re-score, merge."""
    # The second frame holds the abnormal/indeterminate variants that have a lib2 count.
    x4_variants, flagged_overlap = process_data(bard1_data)

    # Diagnostic figures.
    std_error(x4_variants, depth_file)
    counts_for_abnormal(flagged_overlap)
    overlap_nonoverlap_barchart(x4_variants)
    #overlapping_scatter(x4_variants)

    # Re-score the flagged variants per library and fold the lib2 scores back in.
    lib2_rescored = rescore_pilot(flagged_overlap, cutoffs)
    merge_rescored(bard1_data, lib2_rescored, cutoffs)


In [None]:
# Execute the full pipeline defined by main(); reads the configured workbooks and renders all figures.
main()