In [None]:
import pandas as pd
import altair as alt

In [None]:
# Excel workbook that main() hands to process_data(), which loads it with pd.read_excel.
# The path is relative, so it resolves against the notebook's current working directory.
bard1_data = '../Data/20250508_BARD1scores_update_FILTERED.xlsx'

In [None]:
def process_data(file):
    """Load the score workbook and keep the BARD1_X4 missense variants that have a functional call.

    Parameters
    ----------
    file : str or path-like
        Excel workbook passed straight to ``pandas.read_excel``. It must provide the
        columns ``exon``, ``simplified_consequence``, ``functional_consequence`` and
        ``snvlib_lib2``; a missing column raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        The rows where ``exon`` is ``'BARD1_X4'``, ``simplified_consequence`` is
        ``'missense_variant'`` and ``functional_consequence`` is either
        ``'functionally_normal'`` or ``'functionally_abnormal'``, keeping their original
        index labels, plus two added columns:

        * ``overlapping`` -- ``'Y'`` when ``snvlib_lib2`` is not null, otherwise ``'N'``.
        * ``consequence_overlap`` -- ``functional_consequence + '+' + overlapping``.
    """
    scores = pd.read_excel(file)

    keep_row = (
        scores['exon'].isin(['BARD1_X4'])
        & scores['simplified_consequence'].isin(['missense_variant'])
        & scores['functional_consequence'].isin(['functionally_normal', 'functionally_abnormal'])
    )

    # Take an explicit copy before adding columns: assigning into the result of a
    # filtered .loc selection otherwise depends on pandas' view/copy heuristics and
    # can raise SettingWithCopyWarning while the unfiltered frame is still alive.
    df = scores.loc[keep_row].copy()

    # Default every variant to "not overlapping", then flag the ones with a lib2 value.
    df['overlapping'] = 'N'
    df.loc[df['snvlib_lib2'].notna(), 'overlapping'] = 'Y'

    df['consequence_overlap'] = df['functional_consequence'] + '+' + df['overlapping']
    return df

In [None]:
def _summarize_overlap_counts(df):
    """Count variants per (functional_consequence, overlapping) pair and append 'All Variants' rows.

    Two percentage columns are attached to every row, always with the same meaning:

    * ``percentage_normal/ab`` -- the row's share of its ``functional_consequence`` group
      (values sum to 100 within each ``functional_consequence`` value).
    * ``percentage_overlapping`` -- the row's share of its ``overlapping`` group, computed
      over the per-consequence rows. An 'All Variants' row is the whole of its overlap
      class, so its value is 100.
    """
    summary_df = (
        df.groupby(['functional_consequence', 'overlapping'])
        .size()
        .reset_index(name='count')
    )

    consequence_totals = summary_df.groupby('functional_consequence')['count'].transform('sum')
    overlap_totals = summary_df.groupby('overlapping')['count'].transform('sum')
    summary_df['percentage_normal/ab'] = (summary_df['count'] / consequence_totals) * 100
    summary_df['percentage_overlapping'] = (summary_df['count'] / overlap_totals) * 100

    # One extra row per overlap class that pools both functional consequences.
    totals = summary_df.groupby('overlapping')['count'].sum().reset_index()
    totals['functional_consequence'] = 'All Variants'
    # Keep the columns consistent with the rows above: the share *within the
    # 'All Variants' group* is count / grand total, while each 'All Variants' row
    # accounts for 100% of its own overlap class. (The previous version had these
    # two assignments swapped, so 'percentage_normal/ab' summed to 200 for
    # 'All Variants'.)
    totals['percentage_normal/ab'] = (totals['count'] / totals['count'].sum()) * 100
    totals['percentage_overlapping'] = 100.0

    return pd.concat([summary_df, totals], ignore_index=True)


def overlap_nonoverlap_barchart(df):
    """Print the overlap summary table and display a bar chart of variant counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``functional_consequence`` and ``overlapping``
        (as produced by ``process_data``).

    Returns
    -------
    None
        The summary table is printed and the Altair chart is shown via
        ``chart.display()``; nothing is returned.
    """
    summary_df = _summarize_overlap_counts(df)
    print(summary_df)

    # Bars: one per functional consequence (plus the pooled 'All Variants' bar),
    # bar height = variant count, colour = whether the variant is in the overlap.
    x_axis = alt.X(
        'functional_consequence',
        sort = ['All Variants', 'functionally_normal', 'functionally_abnormal'],
        axis = alt.Axis(labelAngle = 0),
    )
    y_axis = alt.Y('count')
    overlap_color = alt.Color(
        'overlapping',
        sort = ['Y', 'N'],
        legend = alt.Legend(title = 'In Overlap?'),
    )

    chart = alt.Chart(summary_df).mark_bar().encode(
        x = x_axis,
        y = y_axis,
        color = overlap_color,
    ).properties(
        width = 300,
        height = 300,
        title = 'X4 Missense Variants'
    )

    chart.display()


In [None]:
def main():
    """Run the notebook's pipeline: filter the workbook at ``bard1_data``, then chart it."""
    filtered_variants = process_data(bard1_data)
    overlap_nonoverlap_barchart(filtered_variants)


In [None]:
# Execute the pipeline: prints the summary table and displays the bar chart.
main()