In [None]:
import pandas as pd
import altair as alt
import numpy as np
import statsmodels.api as sm
from scipy import stats
import re

In [None]:
# Locations of every input workbook. These module-level names are passed to read_inputs() by main().
# Per-gene manifest: read_inputs() uses its 'gene' column and get_gene_data() its 'sge_file' column.
input_files = '../Data/filtered_ppj_data/pillar_project_data_inputs.xlsx'
# NOTE(review): absolute, machine-specific path -- the notebook only runs on this machine until the file is relocated.
carriers = '/Users/ivan/Downloads/7_genes_for_Starita_02282025_hg38.xlsx' # CARRIERS case-control table
# BRIDGES workbooks; get_gene_bridges() parses one sheet per gene from each of them.
bridges_all = '../Data/BRIDGES_data/20250815_BRIDGES_missense_all.xlsx' # BRIDGES, all missense variants
bridges_population = '../Data/BRIDGES_data/20250815_BRIDGES_missense_population.xlsx' # BRIDGES, population missense variants
bridges_all_ptv = '../Data/BRIDGES_data/20250815_BRIDGES_PTVs_all.xlsx' # BRIDGES, all PTVs
bridges_pop_ptv = '../Data/BRIDGES_data/20250815_BRIDGES_PTVs_pop.xlsx' # BRIDGES, population PTVs

In [None]:
def read_inputs(input_files, carriers, bridges_all_vars, bridges_population, bridges_all_ptv, bridges_pop_ptv):
    """Open every input workbook needed by the analysis.

    Parameters
    ----------
    input_files : path to the Excel manifest; it must contain a 'gene' column.
    carriers : path to the CARRIERS Excel table; it must contain a 'CAVA_GENE' column.
    bridges_all_vars, bridges_population, bridges_all_ptv, bridges_pop_ptv :
        paths to the four BRIDGES workbooks.

    Returns
    -------
    tuple
        (sge_inputs, sge_genes, carriers_df, bridges_genes,
         bridges_all, bridges_pop, bridges_allptv, bridges_popptv) where
        sge_inputs is the manifest indexed by gene, sge_genes the unique genes
        in manifest order, carriers_df the CARRIERS rows restricted to those
        genes, bridges_genes the sheet names of the first BRIDGES workbook and
        the last four items are open pd.ExcelFile handles (the caller parses
        sheets from them later, so they are intentionally not closed here).
    """
    manifest = pd.read_excel(input_files)

    # De-duplicate while keeping the manifest order. The previous list(set(...))
    # gave a different gene order on every kernel start (string hashing is
    # randomised), which made the order of the output figures irreproducible.
    # Do not name a local `dict` here: the builtin is used on the next line.
    sge_genes = list(dict.fromkeys(manifest['gene'].tolist()))
    sge_inputs = manifest.set_index('gene')

    carriers_df = pd.read_excel(carriers)
    carriers_df = carriers_df.loc[carriers_df['CAVA_GENE'].isin(sge_genes)]

    bridges_all = pd.ExcelFile(bridges_all_vars)
    bridges_genes = bridges_all.sheet_names

    bridges_pop = pd.ExcelFile(bridges_population)
    bridges_allptv = pd.ExcelFile(bridges_all_ptv)
    bridges_popptv = pd.ExcelFile(bridges_pop_ptv)

    return sge_inputs, sge_genes, carriers_df, bridges_genes, bridges_all, bridges_pop, bridges_allptv, bridges_popptv

In [None]:
def get_thresholds(df, target_value = 0.950):
    """Return the scores whose GMM densities lie closest to ``target_value``.

    Parameters
    ----------
    df : DataFrame with 'score', 'gmm_density_normal' and
        'gmm_density_abnormal' columns.
    target_value : density to look for in each density column (default 0.95,
        the value that used to be hard-coded).

    Returns
    -------
    list
        ``[score_abnormal, score_normal]``: the 'score' of the row whose
        abnormal density is nearest ``target_value``, followed by the 'score'
        of the row whose normal density is nearest ``target_value``. When
        several rows are equally close, the first one wins (``idxmin``).
    """
    def _score_nearest(density_column):
        # Absolute distance of every row from the target, then the score of the closest row.
        distance = (df[density_column] - target_value).abs()
        return df.loc[distance.idxmin(), 'score']

    score_n = _score_nearest('gmm_density_normal')
    score_a = _score_nearest('gmm_density_abnormal')

    # Abnormal threshold first, normal threshold second: callers index into this list.
    return [score_a, score_n]

In [None]:
def get_gene_data(gene, sge_inputs, use_one_threshold = False):
    """Load one gene's SGE score table and label every variant 'F' or 'NF'.

    Parameters
    ----------
    gene : key into ``sge_inputs['sge_file']`` giving the Excel file to read.
    sge_inputs : manifest indexed by gene, with an 'sge_file' column.
    use_one_threshold : when True, labels are re-derived from 'score' using
        only the second value returned by get_thresholds(); when False the
        'gmm_consequence_0.95' labels from the file are shortened to F / NF / I.

    Returns
    -------
    (dict, list)
        ``{'all': <all kept variants>, 'miss': <missense_variant rows only>}``
        and the list of its keys, ``['all', 'miss']``.
    """
    sge_scores = pd.read_excel(sge_inputs['sge_file'][gene])

    # The cut-off is computed on the table exactly as read, before any row is dropped.
    normal_cutoff = get_thresholds(sge_scores)[1]

    # The file's own 'functional_consequence' is discarded in favour of the 0.95 GMM call.
    sge_scores = (
        sge_scores
        .drop(columns = ['functional_consequence'])
        .rename(columns = {'gmm_consequence_0.95': 'functional_consequence'})
    )

    # NOTE(review): indeterminate rows are removed *before* the single-threshold
    # relabelling below, so they are never reclassified by it -- confirm this is intended.
    is_indeterminate = sge_scores['functional_consequence'].isin(['indeterminate'])
    sge_scores = sge_scores.loc[~is_indeterminate].rename(
        columns = {'functional_consequence': 'Classification', 'consequence': 'Consequence'}
    )

    if use_one_threshold:
        # NOTE(review): both comparisons are inclusive, so a score exactly equal
        # to the cut-off is first set to "F" and then overwritten with 'NF'.
        sge_scores.loc[sge_scores['score'] >= normal_cutoff, 'Classification'] = "F"
        sge_scores.loc[sge_scores['score'] <= normal_cutoff, 'Classification'] = 'NF'
    else:
        short_labels = {
            'functionally_normal': 'F',
            'functionally_abnormal': 'NF',
            'indeterminate': 'I',
        }
        for long_label, short_label in short_labels.items():
            sge_scores.loc[sge_scores['Classification'] == long_label, 'Classification'] = short_label

    classified = sge_scores.loc[~sge_scores['Classification'].isin(['I'])]
    missense_only = classified.loc[classified['Consequence'].isin(['missense_variant'])]

    frames = {'all': classified, 'miss': missense_only}
    return frames, list(frames)

In [None]:
def get_gene_carriers(gene, cc):
    """Extract one gene's single-nucleotide variants from the CARRIERS table.

    Parameters
    ----------
    gene : value to match in the 'CAVA_GENE' column.
    cc : CARRIERS DataFrame containing the columns listed in ``keep_cols``.

    Returns
    -------
    (DataFrame, DataFrame)
        The gene's SNV rows with 'hg38_start' converted to str and a
        'pos_id' column of the form ``'<hg38_start>:<ALT>'``, followed by the
        subset of those rows whose 'ER_status1' is 0 or 777.
        NOTE(review): the meaning of the 0 / 777 codes is not visible in this
        file (the plots label this subset 'ER-') -- confirm against the data dictionary.
    """
    keep_cols = ['Classification', '#CHROM', 'REF', 'ALT', 'CAVA_GENE', 'CAVA_CSN', 'CAVA_SO',
                 'Sample_AAF', 'Sample_ID', 'CaseControl', 'ER_status1', 'hg38_start']
    gene_cc = cc.loc[cc['CAVA_GENE'].isin([gene]), keep_cols]

    # Keep true SNVs only. Checking ALT alone (as before) let deletions such as
    # REF='AT', ALT='A' through, and their pos_id could then collide with a real
    # substitution at the same position. get_gene_bridges applies the same two-sided test.
    is_snv = (gene_cc['REF'].str.len() == 1) & (gene_cc['ALT'].str.len() == 1)
    gene_cc = gene_cc.loc[is_snv].copy()

    gene_cc['hg38_start'] = gene_cc['hg38_start'].astype(str) # str so it can be concatenated below
    gene_cc['pos_id'] = gene_cc['hg38_start'] + ':' + gene_cc['ALT'] # join key shared with the SGE tables

    gene_cc_er = gene_cc[gene_cc['ER_status1'].isin([0, 777])]

    return gene_cc, gene_cc_er

In [None]:
def get_gene_bridges(gene, all_bridges, pop_bridges, all_ptv_bridges, pop_ptv_bridges):
    """Build per-variant BRIDGES case/control counts for one gene.

    Parameters
    ----------
    gene : sheet name to parse from each workbook.
    all_bridges, pop_bridges, all_ptv_bridges, pop_ptv_bridges :
        objects exposing ``.parse(sheet_name)`` (pd.ExcelFile in this
        notebook) for the all-missense, population-missense, all-PTV and
        population-PTV workbooks. Each sheet must provide the columns
        'Cases', 'Controls', 'chr', 'ref', 'alt' and 'hg38_pos'.

    Returns
    -------
    (DataFrame, DataFrame)
        ``bridges_all`` (missense + PTV rows of the "all" workbooks) and
        ``bridges_pop`` (same for the "population" workbooks). Both have the
        columns 'Cases', 'Controls' and 'pos_id' (``'<hg38_pos>:<alt>'``);
        row indexes are carried over from the source sheets and may repeat.
    """
    def _snv_counts(workbook):
        # One sheet -> SNV rows only, reduced to the counts and the join key.
        sheet = workbook.parse(gene)
        sheet = sheet[['Cases', 'Controls', 'chr', 'ref', 'alt', 'hg38_pos']]
        is_snv = (sheet['ref'].str.len() == 1) & (sheet['alt'].str.len() == 1)
        sheet = sheet.loc[is_snv].copy() # explicit copy: a column is added next
        sheet['pos_id'] = sheet['hg38_pos'].astype(str) + ':' + sheet['alt']
        return sheet[['Cases', 'Controls', 'pos_id']]

    missense_all, missense_pop, ptv_all, ptv_pop = [
        _snv_counts(workbook)
        for workbook in (all_bridges, pop_bridges, all_ptv_bridges, pop_ptv_bridges)
    ]

    bridges_all = pd.concat([missense_all, ptv_all])
    bridges_pop = pd.concat([missense_pop, ptv_pop])

    return bridges_all, bridges_pop

In [None]:
def analyze_carriers(cc, sge):
    """Odds ratio (with confidence interval) of NF vs F carriers between CaseControl groups.

    Parameters
    ----------
    cc : CARRIERS rows with 'pos_id', 'CaseControl' and a 'Classification'
        column (the merge suffixes the SGE one as 'Classification_y').
    sge : SGE variants with 'pos_id' and 'Classification' ('F' / 'NF').

    Returns
    -------
    (float, list)
        ``(odds_ratio, [ci_lower, ci_upper])`` from statsmodels' Table2x2,
        using its default confidence level.

    Raises
    ------
    ValueError
        If the merged data do not yield exactly two CaseControl groups and
        exactly the two classes 'F' and 'NF'.
    """
    merged = pd.merge(cc, sge, on = 'pos_id', how = 'inner') # one row per carrier of a scored variant
    merged = merged.dropna(subset = ['Classification_y']) # drop rows without an SGE classification

    contingency_tab = pd.crosstab(merged['CaseControl'], merged['Classification_y'])

    if contingency_tab.shape[0] != 2 or set(contingency_tab.columns) != {'F', 'NF'}:
        raise ValueError(
            'Expected a 2x2 CaseControl x {NF, F} table, got rows '
            + str(list(contingency_tab.index)) + ' and columns ' + str(list(contingency_tab.columns))
        )

    # Name the column order instead of reversing the alphabetical crosstab order.
    # NOTE(review): rows stay in crosstab's sorted order of the CaseControl labels;
    # the direction of the odds ratio depends on how that column is coded -- confirm.
    table_array = contingency_tab[['NF', 'F']].values

    # The Fisher exact test that used to run here was computed and then discarded.
    table = sm.stats.Table2x2(table_array)
    ci_lower, ci_upper = table.oddsratio_confint()

    return (table.oddsratio, [ci_lower, ci_upper])

In [None]:
def analyze_bridges(sge, df):
    """Odds ratio (with confidence interval) of NF vs F variants in BRIDGES cases vs controls.

    Parameters
    ----------
    sge : SGE variants with 'pos_id' and 'Classification' ('F' / 'NF').
    df : BRIDGES counts with 'pos_id', 'Cases' and 'Controls'.

    Returns
    -------
    (float, list)
        ``(odds_ratio, [ci_lower, ci_upper])`` from statsmodels' Table2x2,
        using its default confidence level.

    Raises
    ------
    KeyError
        If either class ('NF' or 'F') is absent after the merge.
    """
    merged = pd.merge(df, sge, on = 'pos_id', how = 'inner')

    # Total case and control counts per functional class.
    summary = merged.pivot_table(
        values = ['Cases', 'Controls'],
        index = 'Classification',
        aggfunc = 'sum'
    )

    # Rows: cases, controls. Columns: NF, F.
    table_array = summary.loc[['NF', 'F'], ['Cases', 'Controls']].to_numpy().T

    # The Fisher exact test that used to run here was computed and then discarded.
    table = sm.stats.Table2x2(table_array)
    ci_lower, ci_upper = table.oddsratio_confint()

    return (table.oddsratio, [ci_lower, ci_upper])

In [None]:
def process_data_dict(dict, key_dict):
    """Turn the per-analysis odds-ratio results into a tidy DataFrame for plotting.

    Parameters
    ----------
    dict : mapping ``analysis_key -> (odds_ratio, [ci_lower, ci_upper])``.
        The parameter name shadows the builtin and is kept only for
        backward compatibility.
    key_dict : unused; accepted so existing calls keep working.

    Returns
    -------
    DataFrame
        One row per entry of ``dict`` (in its iteration order) with columns
        'analysis', 'OR', 'lwr_ci', 'uppr_ci', 'dataset', 'incl_vars' and
        'full_type' (``dataset + ' ' + incl_vars``).

    Raises
    ------
    ValueError
        If an analysis key has no known dataset / variant-set label.
    """
    # Labels are looked up by key. They used to be assigned by position, which
    # silently mislabelled rows whenever the dict was filled in another order.
    labels = {
        'cc_all': ('CARRIERS (All)', 'All Vars.'),
        'cc_all_er': ('CARRIERS (ER-)', 'All Vars.'),
        'bridges_all_all': ('BRIDGES (All)', 'All Vars.'),
        'bridges_pop_all': ('BRIDGES (Pop.)', 'All Vars.'),
        'cc_miss': ('CARRIERS (All)', 'Mis. Only'),
        'cc_miss_er': ('CARRIERS (ER-)', 'Mis. Only'),
        'bridges_all_miss': ('BRIDGES (All)', 'Mis. Only'),
        'bridges_pop_miss': ('BRIDGES (Pop.)', 'Mis. Only'),
    }

    records = []
    for analysis, (odds, ci_list) in dict.items():
        if analysis not in labels:
            raise ValueError('Unknown analysis key: ' + repr(analysis))
        dataset, incl_vars = labels[analysis]
        records.append({'analysis': analysis, 'OR': odds,
                        'lwr_ci': ci_list[0], 'uppr_ci': ci_list[1],
                        'dataset': dataset, 'incl_vars': incl_vars})

    df = pd.DataFrame(records, columns = ['analysis', 'OR', 'lwr_ci', 'uppr_ci', 'dataset', 'incl_vars'])

    df['full_type'] =  df['dataset'] + ' ' + df['incl_vars']

    return df

In [None]:
def process_data_dict_bridges_only(dict, key_dict):
    """Turn BRIDGES-only odds-ratio results into a tidy DataFrame for plotting.

    Parameters
    ----------
    dict : mapping ``analysis_key -> (odds_ratio, [ci_lower, ci_upper])``.
        The parameter name shadows the builtin and is kept only for
        backward compatibility.
    key_dict : unused; accepted so existing calls keep working.

    Returns
    -------
    DataFrame
        One row per entry of ``dict`` (in its iteration order) with columns
        'analysis', 'OR', 'lwr_ci', 'uppr_ci', 'dataset', 'incl_vars' and
        'full_type' (``dataset + ' ' + incl_vars``).

    Raises
    ------
    ValueError
        If an analysis key has no known dataset / variant-set label.
    """
    # Labels are looked up by key. They used to be assigned by position, which
    # silently mislabelled rows whenever the dict was filled in another order.
    labels = {
        'bridges_all_all': ('BRIDGES (All)', 'All Vars.'),
        'bridges_pop_all': ('BRIDGES (Pop.)', 'All Vars.'),
        'bridges_all_miss': ('BRIDGES (All)', 'Mis. Only'),
        'bridges_pop_miss': ('BRIDGES (Pop.)', 'Mis. Only'),
    }

    records = []
    for analysis, (odds, ci_list) in dict.items():
        if analysis not in labels:
            raise ValueError('Unknown analysis key: ' + repr(analysis))
        dataset, incl_vars = labels[analysis]
        records.append({'analysis': analysis, 'OR': odds,
                        'lwr_ci': ci_list[0], 'uppr_ci': ci_list[1],
                        'dataset': dataset, 'incl_vars': incl_vars})

    df = pd.DataFrame(records, columns = ['analysis', 'OR', 'lwr_ci', 'uppr_ci', 'dataset', 'incl_vars'])

    df['full_type'] =  df['dataset'] + ' ' + df['incl_vars']

    return df

In [None]:
def cc_fig(df, gene, save_path = None):
    """Display a forest-style plot of odds ratios with confidence-interval bars.

    Parameters
    ----------
    df : DataFrame with 'full_type', 'OR', 'lwr_ci' and 'uppr_ci' columns.
    gene : text used as the chart title.
    save_path : optional output file; when given, the chart is also saved
        there. The default (None) saves nothing, which is what this function
        did before (the save call was commented out next to a hard-coded
        absolute path).

    Returns
    -------
    None; the chart is shown with ``plot.display()``.
    """
    base = alt.Chart(df)

    # Fixed top-to-bottom order of the rows on the y axis.
    sort_order = [ 'BRIDGES (All) All Vars.', 'BRIDGES (Pop.) All Vars.', 'CARRIERS (All) All Vars.', 'CARRIERS (ER-) All Vars.',
                    'BRIDGES (All) Mis. Only', 'BRIDGES (Pop.) Mis. Only', 'CARRIERS (All) Mis. Only', 'CARRIERS (ER-) Mis. Only'
                 ]

    points = base.mark_point(
        filled = True,
        size = 50,
        color = 'black'
        ).encode(
        y = alt.Y('full_type:O',
                  scale = alt.Scale(domain = sort_order),
                 axis = alt.Axis(title = '',
                                 labelFontSize = 16,
                                 labelLimit = 1000
                                )
                 ),
        x = alt.X('OR',
                 axis = alt.Axis(
                     title = 'Odds Ratio',
                     labelFontSize = 16,
                     titleFontSize = 18,
                     values = list(range(0, 9)) # tick values 0-8; the scale domain below is 0-6
                                 ),
                  scale = alt.Scale(domain = [0, 6]
                                   )
                 )
        ).properties(
            title = alt.TitleParams(text = gene, fontSize = 20)
    )

    # The x channel is now built with alt.X; it used to be wrapped in alt.Y.
    ci_bars = base.mark_errorbar().encode(
        y = 'full_type',
        x = alt.X('lwr_ci:Q', axis = alt.Axis(title = '')),
        x2 = 'uppr_ci:Q'
        )

    # Vertical reference line at an odds ratio of 1.
    line = alt.Chart(pd.DataFrame({'Odds Ratio': [1]})).mark_rule(color = 'red').encode(
        x = 'Odds Ratio')

    plot = (points + ci_bars + line).configure_view(
        stroke = None
        ).configure_axis(
        grid = False
        ).interactive()

    plot.display()

    if save_path is not None:
        plot.save(save_path, ppi = 300)
    

In [None]:
def cc_fig_bridges_only(df, gene, save_path = None):
    """Display a forest-style plot of BRIDGES-only odds ratios with confidence-interval bars.

    Parameters
    ----------
    df : DataFrame with 'full_type', 'OR', 'lwr_ci' and 'uppr_ci' columns.
    gene : text used as the chart title.
    save_path : optional output file; when given, the chart is also saved
        there. The default (None) saves nothing, which is what this function
        did before (the save call was commented out next to a hard-coded
        absolute path).

    Returns
    -------
    None; the chart is shown with ``plot.display()``.
    """
    base = alt.Chart(df)

    # Fixed top-to-bottom order of the rows on the y axis.
    sort_order = [ 'BRIDGES (All) All Vars.', 'BRIDGES (Pop.) All Vars.',
                    'BRIDGES (All) Mis. Only', 'BRIDGES (Pop.) Mis. Only'
                 ]

    points = base.mark_point(
        filled = True,
        size = 50,
        color = 'black'
        ).encode(
        y = alt.Y('full_type:O',
                  scale = alt.Scale(domain = sort_order),
                 axis = alt.Axis(title = '',
                                 labelFontSize = 16,
                                 labelLimit = 1000
                                )
                 ),
        x = alt.X('OR',
                 axis = alt.Axis(
                     title = 'Odds Ratio',
                     labelFontSize = 16,
                     titleFontSize = 18,
                     values = list(range(0, 9)) # tick values 0-8; the scale domain below is 0-6
                                 ),
                  scale = alt.Scale(domain = [0, 6]
                                   )
                 )
        ).properties(
            title = alt.TitleParams(text = gene, fontSize = 20)
    )

    # The x channel is now built with alt.X; it used to be wrapped in alt.Y.
    ci_bars = base.mark_errorbar().encode(
        y = 'full_type',
        x = alt.X('lwr_ci:Q', axis = alt.Axis(title = '')),
        x2 = 'uppr_ci:Q'
        )

    # Vertical reference line at an odds ratio of 1.
    line = alt.Chart(pd.DataFrame({'Odds Ratio': [1]})).mark_rule(color = 'red').encode(
        x = 'Odds Ratio')

    plot = (points + ci_bars + line).configure_view(
        stroke = None
        ).configure_axis(
        grid = False
        ).interactive()

    plot.display()

    if save_path is not None:
        plot.save(save_path, ppi = 300)

In [None]:
def main():
    """Run the odds-ratio analysis for every gene in the manifest and plot it.

    SFPQ, CTCF and BRCA2 are skipped with a message. PALB2 is analysed
    against BRIDGES only, using the classifications stored in its SGE file.
    Every other gene is analysed against CARRIERS (all samples and the
    ER-status subset) and BRIDGES (all and population), with classifications
    re-derived from a single score threshold (``use_one_threshold = True``).
    """
    (sge_input, sge_genes, carriers_df, bridges_genes,
     bridges_all_excel, bridges_pop, bridges_allptv, bridges_popptv) = read_inputs(
        input_files, carriers, bridges_all, bridges_population, bridges_all_ptv, bridges_pop_ptv
    )

    no_case_control = ('SFPQ', 'CTCF')

    for gene in sge_genes:
        print('Processing ', gene, '...')

        if gene in no_case_control:
            print('No Case-Control Data for ', gene)
            continue

        if gene == 'BRCA2':
            print('No Functional Classes Currently')
            continue

        if gene == 'PALB2':
            print('Only BRIDGES Data for ', gene)
            sge_frames, variant_sets = get_gene_data(gene, sge_input)
            bridges_all_df, bridges_pop_df = get_gene_bridges(
                gene, bridges_all_excel, bridges_pop, bridges_allptv, bridges_popptv
            )

            # Result keys for each variant set: BRIDGES all, BRIDGES population.
            data_keys = {'miss': ['bridges_all_miss', 'bridges_pop_miss'],
                         'all': ['bridges_all_all', 'bridges_pop_all']
                        }

            data_dict = {}
            for variant_set in variant_sets:
                sge_df = sge_frames[variant_set]
                all_key, pop_key = data_keys[variant_set]
                data_dict[all_key] = analyze_bridges(sge_df, bridges_all_df)
                data_dict[pop_key] = analyze_bridges(sge_df, bridges_pop_df)

            cc_fig_bridges_only(process_data_dict_bridges_only(data_dict, data_keys), gene)
            continue

        sge_frames, variant_sets = get_gene_data(gene, sge_input, use_one_threshold = True)
        cc_df, cc_er_df = get_gene_carriers(gene, carriers_df)
        bridges_all_df, bridges_pop_df = get_gene_bridges(
            gene, bridges_all_excel, bridges_pop, bridges_allptv, bridges_popptv
        )

        # Result keys for each variant set: CARRIERS all, CARRIERS ER subset,
        # BRIDGES all, BRIDGES population.
        data_keys = {'miss': ['cc_miss', 'cc_miss_er', 'bridges_all_miss', 'bridges_pop_miss'],
                     'all': ['cc_all', 'cc_all_er', 'bridges_all_all', 'bridges_pop_all']
                    }

        data_dict = {}
        for variant_set in variant_sets:
            sge_df = sge_frames[variant_set]
            cc_key, cc_er_key, all_key, pop_key = data_keys[variant_set]
            data_dict[cc_key] = analyze_carriers(cc_df, sge_df)
            data_dict[cc_er_key] = analyze_carriers(cc_er_df, sge_df)
            data_dict[all_key] = analyze_bridges(sge_df, bridges_all_df)
            data_dict[pop_key] = analyze_bridges(sge_df, bridges_pop_df)

        cc_fig(process_data_dict(data_dict, data_keys), gene)

In [None]:
# Run the whole analysis: reads every input workbook and displays one odds-ratio figure per analysed gene.
main()