In [None]:
import pandas as pd
import numpy as np
import altair as alt

In [None]:
#Input files, given as absolute paths on the original author's machine -- edit both before running elsewhere
file = '/Users/ivan/Downloads/Orthogonal_BARD1_FunctionalAssays.xlsx' #Excel workbook of orthogonal assay results, passed to read_orthogonal() by main()
scores = '/Users/ivan/Desktop/AAsubstitutions.withSNVscores.allexons.tsv' #tab-separated SGE score table, passed to read_scores() by main()

In [None]:
def read_scores(scores):
    """Load the SGE score table and keep only the missense variants.

    Parameters
    ----------
    scores : str or path-like
        Location of the tab-separated score file; handed straight to
        ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Rows whose 'Consequence' is 'missense_variant', without the
        chrom / pos / allele / R1-R3 score columns, with 'snv_score'
        renamed to 'snv_score_minmax' and a fresh 0..n-1 index.
    """
    unused_columns = ['chrom', 'pos', 'allele', 'R1_score', 'R2_score', 'R3_score']

    raw_table = pd.read_csv(scores, sep = '\t')
    is_missense = raw_table['Consequence'] == 'missense_variant'

    return (
        raw_table.loc[is_missense]
        .drop(columns = unused_columns)
        .rename(columns = {'snv_score' : 'snv_score_minmax'})
        .reset_index(drop = True)
    )

In [None]:
def read_orthogonal(file):
    """Load the orthogonal-assay workbook and prepare it for merging.

    Parameters
    ----------
    file : str or path-like
        Location of the Excel workbook; handed straight to
        ``pandas.read_excel``.

    Returns
    -------
    pandas.DataFrame
        The sheet with its 'Unnamed: 0' column renamed to 'AAsub', an
        'Orthogonal Assay' column filled with the placeholder 'ok', and a
        fresh 0..n-1 index. No columns are dropped.
    """
    assay_table = pd.read_excel(file).rename(columns = {'Unnamed: 0': 'AAsub'})

    #Placeholder only: characterize_orthogonal() later fills in the real outcome
    assay_table = assay_table.assign(**{'Orthogonal Assay': 'ok'})

    return assay_table.reset_index(drop = True)

In [None]:
def merge(scores, orthogonal):
    """Join SGE scores with orthogonal assay data on amino-acid substitution.

    Parameters
    ----------
    scores : pandas.DataFrame
        Score table; must contain 'AAsub' and 'pos_id'.
    orthogonal : pandas.DataFrame
        Orthogonal assay table; must contain 'AAsub'.

    Returns
    -------
    pandas.DataFrame
        Inner join on 'AAsub' (only substitutions present in both tables),
        keeping the first row for each 'pos_id'.
    """
    shared_variants = scores.merge(orthogonal, how = 'inner', on = 'AAsub')

    #The same SNV can show up more than once after the join; keep one row per pos_id
    return shared_variants.drop_duplicates(subset = 'pos_id')

In [None]:
def characterize_orthogonal(df):
    """Label each variant with its orthogonal-assay outcome and assay type.

    Parameters
    ----------
    df : pandas.DataFrame
        Merged score / assay table. Must contain 'pos_id' and the assay
        columns 'Adamovich2019_HDR', 'Lee2015_HDR', 'Adamovich2019_western',
        'Dai2021_brca1foci', 'Dai2021_bard1foci', 'Dai2021_Ubbinding' and
        'Witus2021_Ubactivity'. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of the rows that have a 'pos_id', with two columns set:

        * 'Orthogonal Assay' -- 'Abnormal' if any cell of the row equals
          'Abnormal' (after str() and strip()), else 'Intermediate' if any
          cell equals 'Intermediate', else 'Normal'.
        * 'Assay Type' -- which assay(s) produced data for the variant, or a
          real missing value (NaN) when none of the rules below applies.

    Raises
    ------
    KeyError
        If 'pos_id' or one of the assay columns is absent.
    """
    #Drops any variants without score; the explicit copy keeps every write below off the caller's frame
    df = df.dropna(subset = ['pos_id']).copy()

    #Assigned by column name: writing to "the last column" would clobber any column that
    #happens to follow 'Orthogonal Assay' (for example a trailing 'Notes' column)
    df['Orthogonal Assay'] = [
        _orthogonal_outcome(row_values)
        for row_values in df.itertuples(index = False, name = None)
    ]

    #True where the named assay column holds a value for that variant
    adamovich_hdr = df['Adamovich2019_HDR'].notna().to_numpy()
    lee_hdr = df['Lee2015_HDR'].notna().to_numpy()
    western = df['Adamovich2019_western'].notna().to_numpy()
    brca1_foci = df['Dai2021_brca1foci'].notna().to_numpy()
    bard1_foci = df['Dai2021_bard1foci'].notna().to_numpy()
    ub_binding = df['Dai2021_Ubbinding'].notna().to_numpy()
    ub_activity = df['Witus2021_Ubactivity'].notna().to_numpy()

    #Applied top to bottom, so a later rule overrides an earlier one for the same variant
    assay_rules = [
        ('HDR', adamovich_hdr | lee_hdr),
        ('Western Only', ~adamovich_hdr & western),
        ('Nuclear Foci & Ub Binding', brca1_foci & bard1_foci & ub_binding),
        ('Ub Binding Only', ub_binding & ~brca1_foci & ~bard1_foci),
        ('Ub Activity', ub_activity),
    ]

    #Object array keeps NaN as a genuine missing value; mixing text and float NaN through
    #np.where would turn the NaN into the literal string 'nan'
    assay_type = np.full(len(df), np.nan, dtype = object)
    for label, applies in assay_rules:
        assay_type[applies] = label
    df['Assay Type'] = assay_type

    return df


def _orthogonal_outcome(row_values):
    """Return 'Abnormal', 'Intermediate' or 'Normal' for one row of cell values."""
    #Cells are compared as stripped text so padded labels such as ' Abnormal ' still match
    labels = {str(cell).strip() for cell in row_values}
    if 'Abnormal' in labels:
        return 'Abnormal'
    if 'Intermediate' in labels:
        return 'Intermediate'
    return 'Normal'
        

In [None]:
def make_histogram(df):
    """Display a histogram of SGE scores for variants assayed by orthogonal assays.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'snv_score_minmax' (binned on the x axis) and
        'Orthogonal Assay' (bar color).

    Returns
    -------
    None
        The chart is rendered with ``display()``; nothing is returned.
    """
    bins = 50 #maximum number of bins
    scale = [-2,2] #x-axis domain
    #range() excludes its stop value, so add 1 to get a tick at the upper end of the domain too
    ticks = list(range(scale[0], scale[1] + 1))
    legend_order = ['Normal','Abnormal','Intermediate'] #order for the legend

    #To restrict the plot to one assay type, filter first, e.g. df.loc[df['Assay Type'].isin(['HDR'])]
    histogram = alt.Chart(df).mark_bar().encode(
            alt.X('snv_score_minmax', axis = alt.Axis(values = ticks, title = 'SGE Score', labelFontSize = 16, titleFontSize = 20), bin = alt.Bin(maxbins = bins),
             scale = alt.Scale(domain = scale)),
            alt.Y('count()', axis = alt.Axis(title = 'Number of Variants', labelFontSize = 16, titleFontSize = 20)),
            color = alt.Color('Orthogonal Assay:N', scale = alt.Scale(scheme = 'category10'), sort = legend_order, legend = alt.Legend(titleFontSize = 16, labelFontSize = 14))
    ).properties(
        width = 800,
        height = 400,
        title = alt.TitleParams(text = 'SGE Scores for Variants with Orthogonal Assay Data', fontSize = 22)
    )

    histogram.display()

In [None]:
def make_dotplot(df):
    """Display a dot plot of SGE score against orthogonal-assay outcome.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'snv_score_minmax' (x axis), 'Orthogonal Assay'
        (y axis and point color) and 'Assay Type' (point shape).

    Returns
    -------
    None
        The chart is rendered with ``display()``; nothing is returned.
    """
    #To restrict the plot to one assay type, filter first, e.g. df.loc[df['Assay Type'].isin(['HDR'])]
    legend_order = ['Normal','Abnormal','Intermediate'] #order for the legend

    score_axis = alt.X('snv_score_minmax', axis = alt.Axis(title = 'SGE Score', labelFontSize = 16, titleFontSize = 20))
    outcome_axis = alt.Y('Orthogonal Assay:N', axis = alt.Axis(title = 'Orthogonal Assay', labelFontSize = 16, titleFontSize = 20))
    outcome_color = alt.Color('Orthogonal Assay:N', scale = alt.Scale(scheme = 'category10'),sort = legend_order, legend = alt.Legend(titleFontSize = 16, labelFontSize = 14))
    #One marker shape per assay type; labelLimit widens the legend so long assay names are not cut off
    assay_shape = alt.Shape('Assay Type:N', legend = alt.Legend(labelLimit = 200, titleFontSize = 16, labelFontSize = 14))
    chart_title = alt.TitleParams(text = 'SGE Scores for Variants with Orthogonal Assay Data', fontSize = 22)

    points = alt.Chart(df).mark_point(size = 50)
    dotplot = points.encode(
        x = score_axis,
        y = outcome_axis,
        color = outcome_color,
        shape = assay_shape
    ).properties(
        width = 800,
        height = 400,
        title = chart_title
    )

    dotplot.display()

In [None]:
def main():
    """Run the whole analysis: load both inputs, merge, characterize, then plot.

    Reads the module-level ``file`` and ``scores`` paths. Displays the
    histogram first and the dot plot second; returns nothing.
    """
    orthogonal_table = read_orthogonal(file)
    score_table = read_scores(scores)
    characterized = characterize_orthogonal(merge(score_table, orthogonal_table))

    for draw_chart in (make_histogram, make_dotplot):
        draw_chart(characterized)
    #Optional export, currently disabled:
    #characterized.to_excel('/Users/ivan/Desktop/20241205_BARD1_SGEvsOrthogonalAssays.xlsx', index = False)


In [None]:
#Runs the full pipeline defined in main(): read inputs, merge, characterize, and display both charts
main()