In [None]:
import pandas as pd
import statistics
import altair as alt

In [None]:
# Path to the inputs workbook. read_inputs() loads it and expects a 'gene' column;
# process_gsp_data() then looks up each gene's 'sge_file' entry in it.
sge_input_files = '../Data/filtered_ppj_data/pillar_project_data_inputs.xlsx'
# Turn off Altair's limit on the number of rows a chart dataset may contain.
alt.data_transformers.disable_max_rows()

In [None]:
def read_inputs(inputs):
    """Load the inputs workbook and return it keyed by gene.

    Parameters
    ----------
    inputs : path accepted by ``pd.read_excel``
        Workbook that must contain a 'gene' column.

    Returns
    -------
    tuple
        ``(table, gene_names)`` where ``table`` is the workbook indexed by the
        'gene' column and ``gene_names`` is the list of values from that
        column, in file order.
    """
    raw_table = pd.read_excel(inputs)

    # Capture the gene names before the column is turned into the index.
    gene_names = raw_table['gene'].tolist()
    indexed_table = raw_table.set_index('gene')

    return indexed_table, gene_names

In [None]:
def get_thresholds(df, target_value = 0.95):
    """Find the scores whose GMM densities lie closest to a target density.

    For each of the 'gmm_density_normal' and 'gmm_density_abnormal' columns,
    the row whose density is nearest to ``target_value`` is located and its
    'score' is taken as the threshold. If several rows are equally close, the
    first one in ``df`` is used.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'score', 'gmm_density_normal' and 'gmm_density_abnormal'.
        The row index is assumed to be unique (as produced by
        ``process_gsp_data``).
    target_value : float, optional
        Density to match. Defaults to 0.95.

    Returns
    -------
    list
        ``[abnormal_threshold, normal_threshold]`` -- note the abnormal score
        comes first.

    Raises
    ------
    ValueError
        If a density column has no non-missing values to compare against.
    """

    def score_nearest_target(density_column):
        # Absolute distance of every row's density from the target; missing
        # densities cannot be "closest" to anything, so they are ignored.
        distance = (df[density_column] - target_value).abs().dropna()
        if distance.empty:
            raise ValueError(
                f"cannot determine a threshold: '{density_column}' has no non-missing values"
            )
        return df.loc[distance.idxmin(), 'score']

    score_n = score_nearest_target('gmm_density_normal')
    score_a = score_nearest_target('gmm_density_abnormal')

    return [score_a, score_n]

In [None]:
def process_gsp_data(input_df, gene):
    """Load one gene's SGE workbook and reshape it for plotting.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Inputs table indexed by gene, with an 'sge_file' column holding the
        path of each gene's workbook.
    gene : label
        Row of ``input_df`` to load.

    Returns
    -------
    pandas.DataFrame
        Columns 'AA_change', 'og_AA', 'AApos', 'functional_consequence',
        'consequence', 'score', 'gmm_density_normal', 'gmm_density_abnormal'.
        Rows whose 'amino_acid_change' is '---' and rows whose 'consequence'
        is 'synonymous_variant' are removed.
    """
    input_path = input_df.loc[gene, 'sge_file']

    df = pd.read_excel(input_path)
    # The workbook's own 'functional_consequence' column is discarded and the
    # 'gmm_consequence_0.95' column takes over that name.
    df = (
        df.drop(columns = ['functional_consequence'])
          .rename(columns = {'gmm_consequence_0.95': 'functional_consequence'})
    )

    keep = (
        ~df['amino_acid_change'].isin(['---'])
        & ~df['consequence'].isin(['synonymous_variant'])
    )
    # Explicit copy: columns are added below, and assigning onto a filtered
    # selection is what triggers pandas' SettingWithCopyWarning.
    df = df.loc[keep].copy()

    # 'amino_acid_change' is read as <first char><position digits><last char>.
    aa_change = df['amino_acid_change']
    df['og_AA'] = aa_change.str[0]                   # original amino acid
    df['AA_change'] = aa_change.str[-1]              # substituted amino acid
    df['AApos'] = aa_change.str[1:-1].astype(int)    # residue position

    # Shorten the classification labels; any other value is left untouched.
    df['functional_consequence'] = df['functional_consequence'].replace({
        'functionally_normal': 'Normal',
        'functionally_abnormal': 'Abnormal',
        'indeterminate': 'Indeterminate',
    })

    df = df[['AA_change', 'og_AA', 'AApos', 'functional_consequence', 'consequence', 'score', 'gmm_density_normal', 'gmm_density_abnormal']]

    return df

In [None]:
def get_min_mean_scores(df, thresholds):
    """Append per-position missense summary rows to the variant table.

    For every 'AApos' among the rows whose 'consequence' is
    'missense_variant', the minimum and the mean 'score' are computed. Each
    summary becomes an extra row labelled 'Mis. Min.' or 'Mis. Mean' in
    'AA_change', 'og_AA' and 'full_sub', and is classified against the
    thresholds: 'Normal' at or above the upper one, 'Abnormal' at or below
    the lower one, 'Indeterminate' otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs 'AApos', 'score', 'consequence' and 'AA_change' columns.
    thresholds : sequence
        ``[lower, upper]`` score cut-offs.

    Returns
    -------
    pandas.DataFrame
        ``df`` followed by the minimum rows and then the mean rows, with any
        'AA_change' equal to '*' renamed to 'Stop'. The original row labels
        are kept, so the index is not unique.
    """
    lower_cut, upper_cut = thresholds[0], thresholds[1]

    missense = df.loc[df['consequence'].isin(['missense_variant'])]

    summaries = []
    for label, how in (('Mis. Min.', 'min'), ('Mis. Mean', 'mean')):
        per_position = (
            missense.groupby('AApos')['score']
            .agg(how)
            .reset_index()
            .assign(
                AA_change = label,
                og_AA = label,
                full_sub = label,
                functional_consequence = 'Indeterminate',
            )
        )

        # Order matters: 'Abnormal' is applied last, so it wins if a score
        # happens to satisfy both conditions.
        per_position.loc[per_position['score'] >= upper_cut, 'functional_consequence'] = 'Normal'
        per_position.loc[per_position['score'] <= lower_cut, 'functional_consequence'] = 'Abnormal'

        per_position['AApos'] = per_position['AApos'].astype(int)
        summaries.append(per_position)

    combined = pd.concat([df, *summaries])

    # Stop-gained variants are shown as 'Stop' rather than '*'.
    is_stop = combined['AA_change'] == '*'
    combined.loc[is_stop, 'AA_change'] = 'Stop'

    return combined

In [None]:
def heatmap(df, gene):
    """Display a position-by-substitution heatmap of scores for one gene.

    Rows whose 'AA_change' is 'Mis. Min.' or 'Mis. Mean' are left out of the
    plot. The chart is shown with ``display()``; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs 'AApos', 'AA_change' and 'score' columns.
    gene : str
        Used as the chart title.
    """
    summary_labels = ['Mis. Min.', 'Mis. Mean']
    # y-axis order: the twenty amino acids, then 'Stop', then the summary labels.
    substitution_order = list('ACDEFGHIKLMNPQRSTVWY') + ['Stop'] + summary_labels

    plotted = df.loc[~(df['AA_change'].isin(summary_labels))]

    # NOTE(review): the number of plotted rows (not the largest AApos) drives
    # both the bin limit and the tick range below -- confirm this is intended.
    row_count = len(plotted)

    position_axis = alt.X(
        'AApos:Q',
        title = 'Amino Acid Position',
        bin = alt.Bin(maxbins = row_count, minstep = 1),
        axis = alt.Axis(values = list(range(0, row_count, 50))),
    )
    substitution_axis = alt.Y(
        'AA_change:O',
        title = 'Amino Acid Substitution',
        sort = substitution_order,
    )
    score_colour = alt.Color(
        'score',
        scale = alt.Scale(scheme = 'bluepurple', domain = [-0.2, 0], reverse = True),
    )

    chart = (
        alt.Chart(plotted)
        .mark_rect()
        .encode(
            x = position_axis,
            y = substitution_axis,
            color = score_colour,
            tooltip = ['score'],
        )
        .properties(
            width = 1750,
            height = 300,
            title = alt.TitleParams(gene, fontSize = 20),
        )
    )

    chart.display()

In [None]:
def main():
    """Draw one heatmap per gene listed in the inputs workbook.

    Each gene's workbook is loaded and cleaned, its score thresholds are
    derived, the per-position missense summaries are appended, and the
    result is plotted. 'BRCA2' is skipped.
    """
    input_table, gene_names = read_inputs(sge_input_files)

    for gene_name in gene_names:
        # NOTE(review): BRCA2 is deliberately excluded; the reason is not
        # recorded in this notebook.
        if gene_name != 'BRCA2':
            variants = process_gsp_data(input_table, gene_name)
            cutoffs = get_thresholds(variants)
            with_summaries = get_min_mean_scores(variants, cutoffs)
            heatmap(with_summaries, gene_name)

In [None]:
# Run the whole pipeline; the heatmaps are rendered as this cell's output.
main()