In [None]:
import pandas as pd
import statistics
import altair as alt

In [None]:
# NOTE(review): absolute, machine-specific path; point this at your own copy of the file before running.
scores = '/Users/ivan/Desktop/AAsubstitutions.withSNVscores.allexons.tsv' #path to SGE scores
# Thresholds passed by main() to process_scores, which compares each substitution's median score against them.
func_cutoff = 0.8 #functional class GMM cutoff (min)
nf_cutoff = 0.65 #non-functional class GMM cutoff (max)
# Wild-type protein sequence, one letter per residue; add_wt_seq numbers the residues starting from 1.
# NOTE(review): chart axes elsewhere in this notebook hard-code 777 - presumably len(BARD1_WT); confirm.
BARD1_WT = 'MPDNRQPRNRQPRIRSGNEPRSAPAMEPDGRGAWAHSRAALDRLEKLLRCSRCTNILREPVCLGGCEHIFCSNCVSDCIGTGCPVCYTPAWIQDLKINRQLDSMIQLCSKLRNLLHDNELSDLKEDKPRKSLFNDAGNKKNSIKMWFSPRSKKVRYVVSKASVQTQPAIKKDASAQQDSYEFVSPSPPADVSERAKKASARSGKKQKKKTLAEINQKWNLEAEKEDGEFDSKEESKQKLVSFCSQPSVISSPQINGEIDLLASGSLTESECFGSLTEVSLPLAEQIESPDTKSRNEVVTPEKVCKNYLTSKKSLPLENNGKRGHHNRLSSPISKRCRTSILSTSGDFVKQTVPSENIPLPECSSPPSCKRKVGGTSGRKNSNMSDEFISLSPGTPPSTLSSSSYRRVMSSPSAMKLLPNMAVKRNHRGETLLHIASIKGDIPSVEYLLQNGSDPNVKDHAGWTPLHEACNHGHLKVVELLLQHKALVNTTGYQNDSPLHDAAKNGHVDIVKLLLSYGASRNAVNIFGLRPVDYTDDESMKSLLLLPEKNESSSASHCSVMNTGQRRDGPLVLIGSGLSSEQQKMLSELAVILKAKKYTEFDSTVTHVVVPGDAVQSTLKCMLGILNGCWILKFEWVKACLRRKVCEQEEKYEIPEGPRRSRLNREQLLPKLFDGCYFYLWGTFKHHPKDNLIKLVTAGGGQILSRKPKPDSDVTQTINTVAYHARPDSDQRFCTQYIIYEDLCNYHPERVRQGKVWKAPSSWFIDCVMSFELLPLDS'
# Lift Altair's default cap on the number of rows a chart's dataset may contain.
alt.data_transformers.disable_max_rows()

In [None]:
def read_scores(file): #Reads SGE scores
    """Load the SGE score table and keep only the variant classes used downstream.

    Parameters
    ----------
    file : str, path-like or file-like
        Tab-separated score table. It must contain a 'Consequence' column.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the rows whose 'Consequence' is one of
        'missense_variant', 'stop_gained' or 'synonymous_variant'. The
        original row index is preserved.
    """
    keep = ['missense_variant', 'stop_gained', 'synonymous_variant']
    df = pd.read_csv(file, sep = '\t')

    # Return an explicit copy: process_scores adds columns to this frame in
    # place, and assigning into a filtered .loc slice raises
    # SettingWithCopyWarning on pandas versions without copy-on-write.
    # (An optional low-score filter, snv_score > -3, was sketched here
    # previously but is not applied.)
    return df.loc[df['Consequence'].isin(keep)].copy()

In [None]:
def _classify_score(score, func, nf): #Maps a median score onto a functional class
    """Return 'Functional', 'Intermediate' or 'Non-Functional' for one score.

    `func` is treated as the inclusive lower bound of the functional class and
    `nf` as the inclusive upper bound of the non-functional class, so a score
    exactly equal to `func` is 'Functional' rather than falling through to
    'Non-Functional'. NaN compares false everywhere and ends up
    'Non-Functional'.
    """
    if score >= func:
        return 'Functional'
    if score > nf:
        return 'Intermediate'
    return 'Non-Functional'


def process_scores(df, func, nf): #Processes scores. Gets original AA, substituted AA, and functional classification
    """Collapse SNV-level scores to one row per amino acid substitution.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs an 'AAsub' column of strings shaped <original AA><position><new AA>
        (e.g. 'M1V') and an 'snv_score' column.
    func : float
        Minimum median score for the 'Functional' class (inclusive).
    nf : float
        Maximum median score for the 'Non-Functional' class (inclusive).

    Returns
    -------
    pandas.DataFrame
        One row per distinct 'AAsub' value, sorted by that string, with columns
        'AApos' (int), 'og_AA', 'AAsub' (the substituted residue), 'score'
        (median snv_score of the group), 'full_sub' (the original AAsub string)
        and 'function'.

    Side effects
    ------------
    Adds 'og_AA', 'AA_change' and 'AApos' (string) columns to the caller's
    `df` in place. get_min_res_score and get_mean_res_score group the same
    frame by 'AApos', so this mutation is kept on purpose.
    """
    df['og_AA'] = df['AAsub'].str[0] #original amino acid
    df['AA_change'] = df['AAsub'].str[-1] #substituted amino acid
    df['AApos'] = df['AAsub'].str[1:-1] #residue position (still a string here)

    records = []
    for aa_sub, group in df.groupby('AAsub'):
        first = group.iloc[0] #position and residues are constant within a group
        median_score = statistics.median(group['snv_score'].tolist())
        records.append({
            'AApos': int(first['AApos']),
            'og_AA': first['og_AA'],
            'AAsub': first['AA_change'],
            'score': median_score,
            'full_sub': aa_sub,
            'function': _classify_score(median_score, func, nf),
        })

    # Passing columns explicitly keeps the schema even when there are no rows.
    return pd.DataFrame(records, columns = ['AApos', 'og_AA', 'AAsub', 'score', 'full_sub', 'function'])

In [None]:
def get_min_res_score(df): #Gets minimum missense scores for each residue position
    """Find the lowest missense score at each residue and draw it as a bar chart.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs 'Consequence', 'AApos' and 'snv_score' columns. In main() the
        'AApos' column is the one process_scores adds to the raw frame.

    Returns
    -------
    tuple
        (Altair bar chart, DataFrame with columns 'AApos' and 'min_snv_score').
    """
    missense = df.loc[df['Consequence'].isin(['missense_variant'])]

    # Lowest missense SGE score observed at each amino acid position
    min_by_pos = (
        missense.groupby('AApos')['snv_score']
        .min()
        .reset_index()
        .rename(columns = {'snv_score': 'min_snv_score'})
    )

    # X axis keeps only its domain line; title, labels and ticks are hidden
    bare_axis = alt.Axis(title = None, labels = False, ticks = False, domain = True)
    x_enc = alt.X('AApos:Q', axis = bare_axis, scale = alt.Scale(domain = [0,777]))
    y_enc = alt.Y(
        'min_snv_score:Q',
        axis = alt.Axis(title = 'Min. Missense Score', labelFontSize = 12, titleFontSize = 14)
    )
    hover = [
        alt.Tooltip('AApos', title = 'Position: '),
        alt.Tooltip('min_snv_score', title = 'Min. Missense Score: ')
    ]

    # Bar chart of minimum score vs. amino acid position
    bars = alt.Chart(min_by_pos).mark_bar().encode(x = x_enc, y = y_enc, tooltip = hover)
    sge_min_bar = bars.properties(
        width = 1500,
        height = 150,
        title = alt.TitleParams(text = '', fontSize = 22)
    )

    return sge_min_bar, min_by_pos

In [None]:
def get_mean_res_score(df): #Gets mean missense score for each residue position
    """Average the missense scores at each residue and draw them as a bar chart.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs 'Consequence', 'AApos' and 'snv_score' columns. In main() the
        'AApos' column is the one process_scores adds to the raw frame.

    Returns
    -------
    tuple
        (Altair bar chart, DataFrame with columns 'AApos' and 'mean_snv_score').
    """
    missense = df.loc[df['Consequence'].isin(['missense_variant'])]

    # Mean missense SGE score at each amino acid position
    mean_by_pos = (
        missense.groupby('AApos')['snv_score']
        .mean()
        .reset_index()
        .rename(columns = {'snv_score': 'mean_snv_score'})
    )

    x_enc = alt.X(
        'AApos:Q',
        scale = alt.Scale(domain = [0,777]),
        axis = alt.Axis(title = 'Amino Acid Position', labelFontSize = 12, titleFontSize = 14)
    )
    y_enc = alt.Y(
        'mean_snv_score:Q',
        axis = alt.Axis(title = 'Mean Missense Score', labelFontSize = 12, titleFontSize = 14)
    )
    hover = [
        alt.Tooltip('AApos', title = 'Position: '),
        alt.Tooltip('mean_snv_score', title = 'Mean Missense Score: ')
    ]

    # Bar chart of mean score vs. amino acid position
    bars = alt.Chart(mean_by_pos).mark_bar().encode(x = x_enc, y = y_enc, tooltip = hover)
    sge_mean_bar = bars.properties(
        width = 1500,
        height = 150,
        title = alt.TitleParams(text = '', fontSize = 22)
    )

    return sge_mean_bar, mean_by_pos

In [None]:
def min_mean_stacked(min_df, mean_df): #Builds min-mean stacked bar chart
    """Build a stacked bar chart of the per-residue minimum and mean scores.

    Parameters
    ----------
    min_df : pandas.DataFrame
        Needs 'AApos' and 'min_snv_score' columns (as returned by
        get_min_res_score). Extra columns are ignored.
    mean_df : pandas.DataFrame
        Needs 'AApos' and 'mean_snv_score' columns (as returned by
        get_mean_res_score). Extra columns are ignored.

    Returns
    -------
    Altair chart with one bar segment per (position, series) pair.
    """
    # Join explicitly on the residue position and carry only the score
    # columns. A bare pd.merge joins on every shared column name, so once
    # append_mean_min has added its 'Min.'/'Mean' label columns to both frames
    # the inner join would match nothing and the chart would be empty.
    df = pd.merge(
        min_df[['AApos', 'min_snv_score']],
        mean_df[['AApos', 'mean_snv_score']],
        on = 'AApos',
        how = 'inner'
    )

    # Long format: one row per position and series, as Altair expects for color encoding
    df_long = df.melt(
        id_vars=['AApos'],
        value_vars=['min_snv_score', 'mean_snv_score'],
        var_name='Series',
        value_name='Value'
    )

    chart = alt.Chart(df_long).mark_bar(opacity = 0.7).encode(
        x=alt.X('AApos:Q', scale = alt.Scale(domain = [0, 777]), title= 'Position'),
        y=alt.Y('Value:Q', stack=True),
        color=alt.Color('Series:N', legend=alt.Legend(title='Values')),
        tooltip=['AApos', 'Series', 'Value']
    ).properties(
        width=1500,
        height=200
    )

    return chart

In [None]:
def _as_summary_rows(res_df, score_col, label): #Reshapes a per-residue summary to the substitution-table schema
    """Return a copy of `res_df` labelled with `label` and with `score_col` renamed to 'score'.

    The label is written to 'AAsub', 'og_AA', 'full_sub' and 'function', and
    'AApos' is cast to int. The input frame is not modified.
    """
    return res_df.assign(
        AAsub = label,
        og_AA = label,
        full_sub = label,
        function = label,
        AApos = res_df['AApos'].astype(int) #set datatype as int
    ).rename(columns = {score_col: 'score'})


def append_mean_min(sub_df, min_df, mean_df): #Appends residue-level min and mean scores to main dataframe
    """Stack the per-residue minimum and mean scores under the substitution table.

    Parameters
    ----------
    sub_df : pandas.DataFrame
        Per-substitution table (as returned by process_scores).
    min_df : pandas.DataFrame
        Needs 'AApos' and 'min_snv_score' columns.
    mean_df : pandas.DataFrame
        Needs 'AApos' and 'mean_snv_score' columns.

    Returns
    -------
    pandas.DataFrame
        `sub_df` rows followed by the minimum rows (labelled 'Min.') and the
        mean rows (labelled 'Mean'), with any 'AAsub' equal to '*' renamed to
        'Stop'. Row labels are carried over from the inputs and are therefore
        not unique.

    None of the three inputs is modified. Previously the label columns were
    written into the caller's min_df and mean_df, which broke any later use of
    those frames (for example a merge of the two on their shared columns).
    """
    min_rows = _as_summary_rows(min_df, 'min_snv_score', 'Min.')
    mean_rows = _as_summary_rows(mean_df, 'mean_snv_score', 'Mean')

    df = pd.concat([sub_df, min_rows, mean_rows]) #concatenates dataframes
    df.loc[df['AAsub'] == '*', 'AAsub'] = 'Stop' #Renames stop-gained variants
    return df

In [None]:
def add_wt_seq(df, wt_seq): #Adds the WT sequence to dataframe
    """Append one wild-type row per residue of `wt_seq` to the score table.

    Parameters
    ----------
    df : pandas.DataFrame
        Score table; expected to share the 'AApos', 'og_AA', 'AAsub', 'score'
        and 'full_sub' columns with the rows added here.
    wt_seq : str
        Wild-type sequence, one character per residue.

    Returns
    -------
    pandas.DataFrame
        The rows of `df` with 'is_wt' set to False, followed by one row per
        residue with 'is_wt' True, 'score' 1, 'full_sub' 'WT', 'AApos' counted
        from 1 and both 'og_AA' and 'AAsub' set to the residue letter. Columns
        present only in `df` are NaN on the wild-type rows. The index is renumbered.

    The input frame is left untouched; previously 'is_wt' was written into it.
    """
    wt = list(wt_seq) #Splits up large string into separate characters

    #Creates DataFrame with WT sequence
    wt_data = pd.DataFrame({
        'AApos': range(1, len(wt) + 1),
        'og_AA': wt,
        'AAsub': wt,
        'score': 1,
        'full_sub': 'WT',
        'is_wt': True
    })

    # assign() flags the SGE rows on a copy instead of mutating the caller's frame
    df_w_WT = pd.concat([df.assign(is_wt = False), wt_data], ignore_index = True)

    return df_w_WT

In [None]:
def heatmap(df): #Builds heatmap without WT sequence
    """Draw the substitution-by-position score heatmap, display it and return it.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs 'AApos', 'AAsub', 'score' and 'full_sub' columns.

    Returns
    -------
    The interactive Altair heatmap. It is also rendered via .display() as a
    side effect, so using the call as the last expression of a cell shows it twice.
    """
    #order for heatmap y-axis
    order = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', 'Stop', 'Min.', 'Mean']

    # Intended colouring: pure red at score <= 0, pure blue at score >= 1,
    # red-gray-blue gradient in between.
    # NOTE(review): alt.condition calls are nested here; depending on the
    # Altair version the outer call may overwrite the inner condition, in
    # which case the ">= 1 -> blue" rule is dropped. Inspect
    # chart.to_dict()['encoding']['color'] to confirm both tests are present.
    color_enc = alt.condition(
        alt.datum.score <= 0,
        alt.value('#ff0000'),
        alt.condition(
            alt.datum.score >= 1,
            alt.value('#0000ff'),
            alt.Color('score:Q',
                     scale=alt.Scale(
                         domain=[0, 1],
                         range=['#ff0000', '#a6a6a6', '#0000ff']
                     ))
        )
    )

    # Local is named heat_chart rather than map to avoid shadowing the builtin
    heat_chart = alt.Chart(df).mark_rect().encode(
        x = alt.X('AApos:N', axis=alt.Axis(
                values=list(range(0, 777, 100)),
                title=None,  # removes title
                labels=False,  # removes labels
                ticks=False,  # removes ticks
                domain=True  # keeps the domain line
        )),
        y = alt.Y('AAsub:N', sort = order,
                  axis = alt.Axis(title = 'Amino Acid Substitution', titleFontSize = 16)),
        color = color_enc,
        tooltip = [alt.Tooltip('AApos', title = 'Position: '),
                   alt.Tooltip('full_sub', title = 'Substitution: '),
                   alt.Tooltip('score', title = 'SGE Score: ')]
    ).properties(
        height = 700,
        width = 1500
    ).interactive()

    # A domain-rectangle overlay was sketched here (spans 26-120, 426-560 and
    # 566-777) but never layered onto the heatmap; the unused chart objects
    # were removed. The original note suggested drawing the domains in by hand.

    heat_chart.display()

    return heat_chart

In [None]:
def heatmap_WT(df): #Builds heatmap with WT sequence in black
    """Display the score heatmap with wild-type cells in black plus a 'Wild Type' legend swatch.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs 'AApos', 'AAsub', 'score', 'full_sub' and 'is_wt' columns
        (as produced by add_wt_seq).

    Returns
    -------
    None. The combined chart is rendered via .display().
    """
    #order for heatmap y-axis
    aa_order = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', 'Stop', 'Min.', 'Mean']

    x_enc = alt.X(
        'AApos:N',
        axis=alt.Axis(
            values=[i for i in range(0, 777, 100)],
            title='Amino Acid Residue', titleFontSize = 16,
            labels=True,
            ticks= True,
            domain=True
        )
    )
    y_enc = alt.Y(
        'AAsub:N', sort = aa_order,
        axis = alt.Axis(title = 'Amino Acid Substitution', titleFontSize = 16)
    )

    # Gradient used for scores between the two saturation thresholds
    gradient = alt.Color(
        'score:Q', title = 'SGE Score',
        scale=alt.Scale(domain=[0, 1], range=['#ff0000', '#a6a6a6', '#0000ff'])
    )
    # Conditions are nested innermost-first: blue at >= 1, red at <= 0, black for WT
    high_or_gradient = alt.condition(alt.datum.score >= 1, alt.value('#0000ff'), gradient)
    low_or_rest = alt.condition(alt.datum.score <= 0, alt.value('#ff0000'), high_or_gradient)
    color_enc = alt.condition(
        'datum.is_wt == true',  # Vega expressions spell the boolean as 'true', not 'True'
        alt.value('#000000'),  # Black for WT
        low_or_rest
    )

    hover = [
        alt.Tooltip('AApos', title = 'Position: '),
        alt.Tooltip('full_sub', title = 'Substitution: '),
        alt.Tooltip('score', title = 'SGE Score: ')
    ]

    #for heatmap
    heat_chart = alt.Chart(df).mark_rect().encode(
        x = x_enc, y = y_enc, color = color_enc, tooltip = hover
    ).properties(height = 700, width = 1500).interactive()

    #for WT legend
    legend_data = pd.DataFrame([{'label': 'Wild Type'}])
    legend = alt.Chart(legend_data).mark_rect().encode(
        y=alt.Y('label:N', title = ''),
        color=alt.value('#000000')
    ).properties(title='')

    # Combine the charts; each keeps its own color scale
    final_chart = alt.hconcat(heat_chart, legend).resolve_scale(color='independent')

    final_chart.display()

In [None]:
def final_fig(map, min_bar, mean_bar, stack):
    """Display two composite figures built from the supplied charts.

    Parameters
    ----------
    map : Altair chart
        Heatmap placed at the top of both figures.
    min_bar, mean_bar : Altair charts
        Bar charts stacked under the heatmap in the first figure.
    stack : Altair chart
        Chart placed under the heatmap in the second figure.

    Returns
    -------
    None. Both figures are rendered via .display().
    """
    # Figure 1: heatmap over the two bar charts, no view border or axis domain lines
    panels = alt.vconcat(map, min_bar, mean_bar, spacing = 0)
    fig = panels.configure_view(strokeWidth = 0).configure_axis(domain = False)
    fig.display()

    # Figure 2: heatmap over `stack`, with scales, legends and axes resolved independently
    fig_2 = alt.vconcat(map, stack, spacing = 0)
    fig_2 = fig_2.resolve_scale(color = 'independent', y = 'independent')
    fig_2 = fig_2.resolve_legend(color = 'independent')
    fig_2 = fig_2.resolve_axis(x = 'independent', y = 'independent')
    fig_2.display()



In [None]:
def main():
    """Run the pipeline: load scores, summarise them and display the WT-annotated heatmap."""
    raw = read_scores(scores)

    # process_scores must run before the per-residue summaries: it adds the
    # 'AApos' column to `raw` in place, and both summaries group by it
    # (unless the TSV already carries such a column).
    substitutions = process_scores(raw, func_cutoff, nf_cutoff)
    min_bar, min_df = get_min_res_score(raw)
    mean_bar, mean_df = get_mean_res_score(raw)

    combined = append_mean_min(substitutions, min_df, mean_df)
    combined_with_wt = add_wt_seq(combined, BARD1_WT)

    heatmap_WT(combined_with_wt)

    # Optional figures, currently switched off:
    #stacked_bar = min_mean_stacked(min_df, mean_df)
    #map = heatmap(substitutions)
    #heatmap(combined_with_wt)
    #final_fig(map, min_bar, mean_bar, stacked_bar)

In [None]:
# Run the whole pipeline when this cell executes; main() ends by displaying the WT-annotated heatmap.
main()