In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import altair as alt
import numpy as np
import matplotlib.patches as mpatches

In [None]:
# Absolute, machine-specific locations of the two Excel inputs; these must be
# edited before the notebook can run on another machine.
# gnomAD SNV export: main() passes this path to read_gnomAD().
gnom_path = '/Users/ivan/Library/CloudStorage/OneDrive-UW/Research/Miscellaneous/BARD1_Figure_Data_Inputs/20240905_BARD1_gnomADv4.1.0_SNVs.xlsx'
# SGE score workbook: main() passes this path to add_scores().
scores = '/Users/ivan/Library/CloudStorage/OneDrive-UW/Research/Miscellaneous/BARD1_Figure_Data_Inputs/20240828_BARD1_AllScores.xlsx'


In [None]:
def read_gnomAD(gnomAD_path):
    """Load the gnomAD workbook and keep only the ID and frequency columns.

    Parameters
    ----------
    gnomAD_path : str or path-like
        Location of the Excel file; it must contain the columns
        'gnomAD ID' and 'Allele Frequency'.

    Returns
    -------
    pandas.DataFrame
        An independent frame holding just 'gnomAD ID' and
        'Allele Frequency', in that order.

    Raises
    ------
    KeyError
        If either required column is missing from the workbook.
    """
    unfiltered = pd.read_excel(gnomAD_path)
    # Return an explicit copy rather than a slice of `unfiltered`: later cells
    # write into this frame in place, which on a slice triggers
    # SettingWithCopyWarning and leaves it ambiguous which object is modified.
    filtered = unfiltered[['gnomAD ID', 'Allele Frequency']].copy()
    return filtered

In [None]:
def rewrite(df):
    """Rewrite every 'gnomAD ID' value as ``<chars 2-10>:<char 14>``.

    The conversion is purely positional: characters 2 through 10 of the
    original string, a colon, then character 14. For an ID such as
    '2-214728633-C-T' this yields '214728633:T'.
    NOTE(review): this presumes a one-character chromosome, a nine-digit
    position and single-base alleles -- confirm against the input export.

    The column is replaced in a single vectorised step. The previous
    row-by-row loop matched rows by value, so a duplicated ID was already
    shortened when its second row was visited and indexing character 14
    raised IndexError; it was also quadratic and required a 0..n-1 index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column 'gnomAD ID'. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with 'gnomAD ID' rewritten.
        IDs shorter than 15 characters become missing values instead of
        raising.
    """
    # Nothing to convert; also avoids the .str accessor on an empty,
    # non-string column.
    if len(df) == 0:
        return df

    ids = df['gnomAD ID']
    df['gnomAD ID'] = ids.str[2:11] + ':' + ids.str[14]
    return df

In [None]:
def add_scores(df, scores):
    """Attach SGE score and consequence to the gnomAD variants that have one.

    The score workbook is read from `scores`, and its 'pos_id' column is
    matched against 'gnomAD ID' in `df`. Rows of `df` with no matching
    'pos_id' are dropped. When a 'pos_id' occurs more than once in the
    workbook, the first occurrence supplies the values.

    Unlike the earlier loop-based version, this does not add placeholder
    columns to the caller's frame and does not fail with a length-mismatch
    ValueError when either table contains repeated identifiers.

    Parameters
    ----------
    df : pandas.DataFrame
        Variants with a 'gnomAD ID' column already in the same format as
        the workbook's 'pos_id'. Not modified.
    scores : str or path-like
        Location of the Excel workbook; it must contain the columns
        'pos_id', 'Consequence' and 'snv_score_minmax'.

    Returns
    -------
    pandas.DataFrame
        A new frame, indexed 0..n-1 in the original row order, containing
        the matched rows of `df` plus the columns 'snv_score_minmax' and
        'Consequence'.

    Raises
    ------
    KeyError
        If a required column is missing from `df` or from the workbook.
    """
    score_table = pd.read_excel(scores)

    # One row per pos_id, keyed by pos_id, so it can serve as a lookup table;
    # Series.map requires the mapper's index to be unique.
    sge_scores = (
        score_table[['pos_id', 'Consequence', 'snv_score_minmax']]
        .drop_duplicates(subset='pos_id', keep='first')
        .set_index('pos_id')
    )

    # Keep only variants that have a score, on an independent copy so the
    # column assignments below never write into the caller's frame.
    has_score = df['gnomAD ID'].isin(sge_scores.index)
    df_filtered = df.loc[has_score].copy().reset_index(drop=True)

    # Score first, then consequence, matching the established column order.
    df_filtered['snv_score_minmax'] = df_filtered['gnomAD ID'].map(sge_scores['snv_score_minmax'])
    df_filtered['Consequence'] = df_filtered['gnomAD ID'].map(sge_scores['Consequence'])

    return df_filtered

In [None]:
def graph(df):
    """Show an Altair scatter plot of allele frequency against SGE score.

    The x axis is 'snv_score_minmax' (titled 'SGE Score'), the y axis is
    'Allele Frequency' on a log scale, and points are coloured by a
    coarse consequence group derived from the 'Consequence' column.
    Consequence values that are not keys of the grouping table below map
    to a missing value. The chart title reports the number of rows in `df`.

    The grouping is applied to a copy, so `df` keeps its original
    'Consequence' values and the function can be called repeatedly on the
    same frame. Previously the column was overwritten in place, and a
    second call mapped the already-grouped labels to missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'snv_score_minmax', 'Allele Frequency' and
        'Consequence'. Not modified.

    Returns
    -------
    None
        The chart is displayed via its ``show()`` method.
    """
    # Raw consequence term -> legend group.
    grouped_colormap = {'synonymous_variant': 'Synonymous / Intronic',
                        'intron_variant': 'Synonymous / Intronic',
                        'missense_variant': 'Missense',
                        'stop_gained': 'Stop Gained',
                        'stop_lost': 'Stop Lost / Retained',
                        'stop_retained_variant': 'Stop Lost / Retained',
                        'splice_polypyrimidine_tract_variant': 'Splice',
                        'splice_region_variant': 'Splice',
                        'splice_acceptor_variant': 'Splice',
                        'splice_donor_5th_base_variant': 'Splice',
                        'splice_donor_region_variant': 'Splice',
                        'splice_donor_variant': 'Splice',
                        '3_prime_UTR_variant': 'UTR'}

    # assign() returns a new frame; the caller's column is left as-is.
    plot_df = df.assign(Consequence=df['Consequence'].map(grouped_colormap))

    # Named `chart` so the local does not shadow this function's own name.
    chart = alt.Chart(plot_df).mark_point().encode(
        x=alt.X('snv_score_minmax', axis=alt.Axis(title='SGE Score', labelFontSize=16, titleFontSize=16)),
        y=alt.Y('Allele Frequency', axis=alt.Axis(grid=False, labelFontSize=16, titleFontSize=16), scale=alt.Scale(type='log')),
        color=alt.Color('Consequence', legend=alt.Legend(titleFontSize=16, labelFontSize=14))
    ).properties(
        width=487.2,
        height=238,
        title=alt.TitleParams(text='gnomAD Allele Frequency vs. SGE Scores' + ' (n = ' + str(len(df)) + ')', fontSize=22)
    )

    chart.show()

In [None]:
def main():
    """Run the pipeline end to end: load, reformat IDs, add scores, plot.

    Reads the gnomAD workbook at `gnom_path`, rewrites its IDs, joins the
    SGE scores from the workbook at `scores`, and displays the chart.
    """
    gnomad_variants = rewrite(read_gnomAD(gnom_path))
    scored_variants = add_scores(gnomad_variants, scores)
    graph(scored_variants)


In [None]:
# Execute the whole pipeline (read -> rewrite -> add_scores -> graph).
main()