In [None]:
import pandas as pd
import altair as alt

In [None]:
# Input locations and notebook-wide options; all paths are relative to the notebook.
# These module-level names are read by main() further down.
del_file = '../Data/20250829_BARD1delscores.tsv'  # SGE deletion scores (tab-separated), read by class_dels
snv_file = '../Data/20250825_BARD1snvscores_filtered.xlsx'  # SGE SNV scores workbook, read by get_thresholds
clinvar_snvs_file = '../Data/20250912_BARD1_ClinVarSNVs_1StarPlus.txt'  # ClinVar SNV export (tab-separated)
clinvar_dels_file = '../Data/20250912_BARD1_ClinVarDels_1StarPlus.txt'  # ClinVar deletion export (tab-separated)
gnom_path = '../Data/20240905_BARD1_gnomADv4.1.0_SNVs.xlsx' #gnomAD data path
reg_path = '../Data/20240802_BARD1_Regeneron_MAF.xlsx' #Regeneron data path
rna_scores = ''  # NOTE(review): empty placeholder; not referenced by any function in this notebook
alt.data_transformers.disable_max_rows()  # Lift Altair's limit on the number of rows a chart dataset may contain

Functions for the initial read-in of SGE data

In [None]:
def get_thresholds(thresholds): #Gets SGE thresholds
    """Load the SNV score workbook and derive the two classification cutoffs.

    Parameters
    ----------
    thresholds : path or file-like
        Excel workbook handed to ``pd.read_excel``. The sheet must contain
        ``score``, ``gmm_density_normal``, ``gmm_density_abnormal`` and the
        bookkeeping columns that are dropped before returning.

    Returns
    -------
    (DataFrame, list)
        The SNV table (tagged ``var_type == 'snv'``, helper columns removed)
        and ``[lower, upper]``: the scores of the rows whose abnormal and
        normal GMM densities, respectively, lie closest to 0.950.
    """
    snv_scores = pd.read_excel(thresholds)

    density_target = 0.950

    def score_nearest_target(density_column):
        # Row whose density is closest to the target; its score is the cutoff.
        distance = (snv_scores[density_column] - density_target).abs()
        nearest_row = snv_scores.loc[distance.idxmin()]
        return nearest_row['score']

    upper_cutoff = score_nearest_target('gmm_density_normal')
    lower_cutoff = score_nearest_target('gmm_density_abnormal')
    cutoffs = [lower_cutoff, upper_cutoff]

    # Any variant scoring at or above 0 is forced to the functionally normal class.
    non_negative = snv_scores['score'] >= 0
    snv_scores.loc[non_negative, 'functional_consequence'] = 'functionally_normal'
    snv_scores['var_type'] = 'snv'

    helper_columns = [
        'functional_consequence_zscore',
        'gmm_density_abnormal',
        'gmm_density_normal',
        'target_pos_id',
        'gmm_consequence_0.99',
    ]
    snv_scores = snv_scores.drop(columns=helper_columns)

    return snv_scores, cutoffs

In [None]:
def class_dels(dels, thresholds):
    """Read the deletion score table and classify each deletion.

    Parameters
    ----------
    dels : path or file-like
        Tab-separated file with at least ``score``, ``start`` and ``end`` columns.
    thresholds : sequence
        ``thresholds[0]`` is the lower cutoff, ``thresholds[1]`` the upper cutoff.

    Returns
    -------
    DataFrame
        The input table plus ``functional_consequence``, ``var_type``
        (always ``'3bp_del'``) and ``pos_id`` (``"<start>-<end>"``).
    """
    deletions = pd.read_csv(dels, sep='\t')

    lower_cutoff = thresholds[0]
    upper_cutoff = thresholds[1]
    at_or_below_lower = deletions['score'] <= lower_cutoff
    at_or_above_upper = deletions['score'] >= upper_cutoff

    # Everything starts as indeterminate; the normal assignment is applied
    # last, so it wins if a score satisfies both conditions.
    deletions['functional_consequence'] = 'indeterminate'
    deletions.loc[at_or_below_lower, 'functional_consequence'] = 'functionally_abnormal'
    deletions.loc[at_or_above_upper, 'functional_consequence'] = 'functionally_normal'

    deletions['var_type'] = '3bp_del'

    start_text = deletions['start'].astype(str)
    end_text = deletions['end'].astype(str)
    deletions['pos_id'] = start_text + '-' + end_text

    return deletions

Functions used to process and merge the ClinVar data

In [None]:
def read_clinvar(snv_file, del_file): #Reads ClinVar data
    """Read the ClinVar SNV and deletion exports.

    Parameters
    ----------
    snv_file : path or file-like
        Tab-separated ClinVar SNV export containing ``Name``, ``Protein change``,
        ``GRCh38Chromosome``, ``GRCh38Location`` and ``Germline classification``.
    del_file : path or file-like
        Tab-separated ClinVar deletion export containing ``GRCh38Location``
        (written as ``"start - end"``) and ``Germline classification``.

    Returns
    -------
    (DataFrame, DataFrame)
        * SNVs that have a GRCh38 coordinate, with integer ``GRCh38Location``,
          a fresh 0..n-1 index and an empty ``pos_id`` column for
          ``get_base_changes`` to fill.
        * Deletions whose end minus start equals 2 (i.e. 3 bp), reduced to
          ``pos_id`` (``"<start>-<end>"``) and ``Germline classification``.
    """
    snv_columns = ['Name', 'Protein change', 'GRCh38Chromosome', 'GRCh38Location', 'Germline classification']
    df = pd.read_csv(snv_file, delimiter='\t')[snv_columns] #reads ClinVar SNV tabular .txt, keeps useful columns
    # Drop variants without a genomic coordinate and renumber the rows so that
    # downstream code indexing 0..len-1 never hits a missing label.
    df = df.dropna(subset=['GRCh38Location']).reset_index(drop=True)
    df['GRCh38Location'] = df['GRCh38Location'].astype(int) #Sets coordinates to integer data type
    df['pos_id'] = None #preps for next function

    del_df = pd.read_csv(del_file, sep='\t') #Reads ClinVar deletions

    # Work on a string view of the coordinates so that missing values or a
    # purely numeric column cannot break the '-' test; na=False keeps the
    # mask strictly boolean.
    locations = del_df['GRCh38Location'].astype(str)
    is_range = locations.str.contains('-', regex=False, na=False)

    # Explicit copy: the columns added below must not be written onto a view.
    del_df = del_df.loc[is_range].copy()

    # Split on the bare '-' and strip, so "a - b" and "a-b" are both accepted.
    bounds = locations.loc[is_range].str.split('-')
    del_df['start'] = bounds.str[0].str.strip().astype(int) #Deletion start coordinate
    del_df['end'] = bounds.str[1].str.strip().astype(int) #Deletion end coordinate

    del_df['del_length'] = del_df['end'] - del_df['start'] #Calculates deletion length

    # Inclusive coordinates: end - start == 2 means three deleted bases.
    del_df = del_df.loc[del_df['del_length'] == 2].copy()
    del_df['pos_id'] = del_df['start'].astype(str) + '-' + del_df['end'].astype(str) #Coordinates spanned by deletion
    del_df = del_df[['pos_id', 'Germline classification']] #Pulls out necessary columns

    return df, del_df

In [None]:
def get_pair(base): #ClinVar gives base changes on negative sense strand, SGE pos_id on positive sense
    """Return the complementary base for ``base``.

    'A' -> 'T', 'T' -> 'A', 'C' -> 'G'. Every other input (including 'G')
    returns 'C'.
    """
    complements = (('A', 'T'), ('T', 'A'), ('C', 'G'))
    for source, partner in complements:
        if base == source:
            return partner
    # Fallback covers 'G' as well as anything unrecognised.
    return 'C'

In [None]:
def get_base_changes(df): #Creates pos_id column in format of SGE datafile for ClinVar data
    """Build SGE-style ``pos_id`` values (``"<coordinate>:<base>"``) for ClinVar SNVs.

    For each row, the ``Name`` string is searched for a ``>`` that has a
    character on both sides. The character right after the last such ``>`` is
    passed through ``get_pair`` and appended to the row's ``GRCh38Location``.
    Rows whose name holds no such ``>`` (or is not a string) keep their
    existing ``pos_id`` value, or ``None`` if the column is absent.

    Rows are processed by position, so the frame's index labels do not need
    to be 0..n-1, and each row's ``pos_id`` is derived from that row only.
    The input frame is not modified.

    Parameters
    ----------
    df : DataFrame
        Must contain ``Name``, ``GRCh38Location`` and ``Germline classification``.

    Returns
    -------
    DataFrame
        Two columns, ``pos_id`` and ``Germline classification``, on the same
        index as ``df``.
    """
    def sense_pos_id(name, coord, fallback):
        if not isinstance(name, str):
            return fallback
        # Walk right-to-left over every '>' position that has a neighbour on
        # each side; the first hit from the right is the last one in the name.
        for idx in range(len(name) - 2, 0, -1):
            if name[idx] == '>':
                return str(coord) + ":" + get_pair(name[idx + 1])
        return fallback

    if 'pos_id' in df.columns:
        fallbacks = df['pos_id'].tolist()
    else:
        fallbacks = [None] * len(df)

    pos_ids = [
        sense_pos_id(name, coord, fallback)
        for name, coord, fallback in zip(df['Name'], df['GRCh38Location'], fallbacks)
    ]

    result = df[['Germline classification']].copy()
    result.insert(0, 'pos_id', pos_ids)
    return result

Functions used to merge allele frequency data from gnomAD and Regeneron Million Exomes 

In [None]:
def read_gnomAD(gnomAD_path): #Reads gnomAD file
    """Load gnomAD allele frequencies keyed by SGE-style ``pos_id``.

    Parameters
    ----------
    gnomAD_path : path or file-like
        Excel workbook with ``gnomAD ID`` and ``Allele Frequency`` columns.

    Returns
    -------
    DataFrame
        Columns ``pos_id`` and ``gnomad_af``.

    Notes
    -----
    ``pos_id`` is cut from fixed character offsets of the ID: characters
    2-10 followed by ':' and character 14.
    NOTE(review): this presumably expects IDs shaped like
    ``<1 char>-<9 digits>-<ref base>-<alt base>``; confirm against the data.
    """
    def id_to_pos_id(gnomad_id):
        coordinate = gnomad_id[2:11]
        alt_base = gnomad_id[14]
        return coordinate + ':' + alt_base

    workbook = pd.read_excel(gnomAD_path)
    frequencies = workbook[['gnomAD ID', 'Allele Frequency']].copy()
    frequencies['pos_id'] = frequencies['gnomAD ID'].transform(id_to_pos_id) #Adds pos_id column for merging

    renamed = frequencies.rename(columns={'Allele Frequency': 'gnomad_af'})
    return renamed[['pos_id', 'gnomad_af']]

In [None]:
def read_regeneron(reg_path): #Reads Regeneron data
    """Load Regeneron allele frequencies keyed by SGE-style ``pos_id``.

    Parameters
    ----------
    reg_path : path or file-like
        Excel workbook with ``Variant`` and ``AAF`` columns.

    Returns
    -------
    DataFrame
        Columns ``pos_id`` and ``regeneron_maf``.

    Notes
    -----
    ``pos_id`` is characters 2-11 of the variant string followed by its
    final character.
    NOTE(review): this presumably expects variants shaped like
    ``<1 char>:<9 digits>:<ref base>:<alt base>``; confirm against the data.
    """
    def variant_to_pos_id(variant):
        coordinate_with_colon = variant[2:12]
        last_character = variant[-1:]
        return coordinate_with_colon + last_character

    workbook = pd.read_excel(reg_path)
    # Rename first so the key column already carries the name used for merging.
    frequencies = workbook[['Variant', 'AAF']].copy().rename(
        columns={'AAF': 'regeneron_maf', 'Variant': 'pos_id'}
    )
    frequencies['pos_id'] = frequencies['pos_id'].transform(variant_to_pos_id)

    return frequencies

In [None]:
def main():
    """Assemble the final SGE table and write it to Excel.

    Steps, in order:
      1. Read SNV scores and cutoffs, classify deletions with the same
         cutoffs, and stack both tables.
      2. Read ClinVar SNVs and 3 bp deletions, convert SNVs to ``pos_id``
         form, and left-merge the classifications onto the SGE table.
      3. Left-merge gnomAD and Regeneron allele frequencies on ``pos_id``.
      4. Print the merged table and write it, together with a one-row
         thresholds sheet, to ``../Data/BARD1_SGE_final_table.xlsx``.
    """
    # SGE scores
    snv_scores, cutoffs = get_thresholds(snv_file)
    deletion_scores = class_dels(del_file, cutoffs)
    sge_scores = pd.concat([snv_scores, deletion_scores])

    # ClinVar annotations
    clinvar_snv_raw, clinvar_deletions = read_clinvar(clinvar_snvs_file, clinvar_dels_file)
    clinvar_snv_keyed = get_base_changes(clinvar_snv_raw)
    clinvar_all = pd.concat([clinvar_snv_keyed, clinvar_deletions])

    annotated = sge_scores.merge(clinvar_all, on='pos_id', how='left')

    # Population allele frequencies
    gnomad_table = read_gnomAD(gnom_path)
    regeneron_table = read_regeneron(reg_path)
    annotated = annotated.merge(gnomad_table, on='pos_id', how='left')
    annotated = annotated.merge(regeneron_table, on='pos_id', how='left')

    cutoff_table = pd.DataFrame({'min': [cutoffs[0]], 'max': [cutoffs[1]]})

    print(annotated)

    # One sheet per table, no index column.
    with pd.ExcelWriter('../Data/BARD1_SGE_final_table.xlsx') as workbook:
        annotated.to_excel(workbook, sheet_name='scores', index=False)
        cutoff_table.to_excel(workbook, sheet_name='thresholds', index=False)


In [None]:
# Run the full pipeline; this prints the merged table and writes ../Data/BARD1_SGE_final_table.xlsx.
main()