In [2]:
import os

import pandas as pd
import altair as alt

In [3]:
# Input workbook read by read_scores(). The default below is a machine-specific
# absolute path, so it only resolves on the original author's machine; set the
# BARD1_SCORES_FILE environment variable to point at a local copy instead of
# editing this cell.
file = os.environ.get(
    'BARD1_SCORES_FILE',
    '/Users/ivan/Library/CloudStorage/OneDrive-UW/Research/Miscellaneous/BARD1_Figure_Data_Inputs/20241004_BARD1_PillarProject_DataFreeze.xlsx',
)
# Alternative input, kept for reference:
#file = '/Users/ivan/Library/CloudStorage/OneDrive-UW/Research/Miscellaneous/BARD1_Figure_Data_Inputs/20240828_BARD1_AllScores.xlsx'

In [6]:
def read_scores(file):
    """Load the score table from an Excel workbook and drop out-of-range scores.

    Parameters
    ----------
    file
        Anything ``pandas.read_excel`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        The ``target``, ``Consequence`` and ``snv_score_minmax`` columns,
        restricted to rows whose score lies in the half-open interval (-2, 2].
        Rows with a missing score fail both comparisons and are dropped too.
    """
    wanted_columns = ['target', 'Consequence', 'snv_score_minmax']
    scores = pd.read_excel(file)[wanted_columns]

    # Extreme outliers keep the plot from displaying correctly, so only
    # scores strictly above -2 and at most 2 are kept.
    score = scores['snv_score_minmax']
    in_range = (score > -2) & (score <= 2)

    return scores.loc[in_range]

In [8]:
def prep_data(df):
    """Return a copy of ``df`` with raw consequence terms replaced by display labels.

    The input frame is left untouched, so re-running the calling cell gives
    the same result and the raw terms remain available to other cells.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Consequence`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame, identical to ``df`` except that every ``Consequence``
        value listed in the mapping below is replaced by its label. Values
        not listed (including missing values) pass through unchanged.
    """
    labels = {
        'missense_variant': 'Missense',
        'synonymous_variant': 'Synonymous',
        'intron_variant': 'Intron',
        # All stop-related terms share one label.
        'stop_gained': 'Stop',
        'stop_lost': 'Stop',
        'stop_retained_variant': 'Stop',
        # All splice-related terms share one label.
        'splice_polypyrimidine_tract_variant': 'Splice',
        'splice_region_variant': 'Splice',
        'splice_acceptor_variant': 'Splice',
        'splice_donor_region_variant': 'Splice',
        'splice_donor_5th_base_variant': 'Splice',
        'splice_donor_variant': 'Splice',
        '3_prime_UTR_variant': "3' UTR",
        '5_prime_UTR_variant': "5' UTR",
        'start_lost': 'Start',
    }

    # Work on a copy so the caller's frame is not modified as a side effect.
    relabelled = df.copy()
    relabelled['Consequence'] = relabelled['Consequence'].replace(labels)

    return relabelled


In [10]:
def make_histogram(df):
    """Display a histogram of SGE scores, with bars coloured by consequence.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``snv_score_minmax`` (binned on the x axis) and
        ``Consequence`` (used for the bar colour and legend).

    Returns
    -------
    None
        The chart is rendered via ``Chart.display()``.
    """
    # Lift Altair's row limit so larger tables can be charted. This changes
    # a global Altair setting, not just this chart.
    alt.data_transformers.disable_max_rows()

    title_text = f'Distribution of BARD1 SGE Scores (n = {len(df)})'
    bins = 50  # upper bound on the number of bins
    scale = [-2, 2]  # x-axis domain
    # One tick per integer across the whole domain, including the upper end.
    ticks = list(range(scale[0], scale[1] + 1))

    # Order of the categories for the colour encoding / legend.
    legend_order = ["Intron", "Missense", "Synonymous", "Stop", "Splice", "Start", "5' UTR", "3' UTR"]

    histogram = alt.Chart(df).mark_bar().encode(
        alt.X(
            'snv_score_minmax',
            axis = alt.Axis(values = ticks, title = 'SGE Score', labelFontSize = 16, titleFontSize = 20),
            bin = alt.Bin(maxbins = bins),
            scale = alt.Scale(domain = scale),
        ),
        alt.Y('count()', axis = alt.Axis(title = 'Number of Variants', labelFontSize = 16, titleFontSize = 20)),
        color = alt.Color(
            'Consequence:N',
            scale = alt.Scale(scheme = 'category10'),
            sort = legend_order,
            legend = alt.Legend(titleFontSize = 16, labelFontSize = 14),
        ),
    ).properties(
        width = 800,
        height = 400,
        title = alt.TitleParams(text = title_text, fontSize = 22)
    )

    histogram.display()


In [12]:
def main():
    """Run the notebook pipeline: load scores, relabel, export, and plot."""
    scores = read_scores(file)
    labelled = prep_data(scores)

    # NOTE(review): writes the relabelled table to 'test.xlsx' in the working
    # directory on every run - looks like a debugging export; confirm it is
    # still wanted.
    labelled.to_excel('test.xlsx')

    make_histogram(labelled)

In [14]:
# Run the whole pipeline; this raises FileNotFoundError if the workbook named by `file` is missing.
main()

FileNotFoundError: [Errno 2] No such file or directory: '/Users/ivan/Library/CloudStorage/OneDrive-UW/Research/Miscellaneous/BARD1_Figure_Data_Inputs/20241004_BARD1_PillarProject_DataFreeze.xlsx'