In [None]:
import pandas as pd
import altair as alt

In [None]:
# --- Input file configuration -------------------------------------------------
# Gene -> path of the SGE score workbook; each one is loaded by read_sge() with
# pd.read_excel.
sge = {'BARD1': '../Data/filtered_ppj_data/SGE/BARD1.xlsx', 
             'RAD51D': '../Data/filtered_ppj_data/SGE/RAD51D.xlsx', 
             'PALB2': '../Data/filtered_ppj_data/SGE/PALB2.xlsx',
             'SFPQ': '../Data/filtered_ppj_data/SGE/SFPQ.xlsx',
             'XRCC2': '../Data/filtered_ppj_data/SGE/XRCC2.xlsx',
             'CTCF': '../Data/filtered_ppj_data/SGE/CTCF.xlsx',
            }

# Gene -> per-gene VAMP-seq score file.
# NOTE(review): nothing in the visible notebook reads this dict; the VAMP-seq
# scores are loaded from `vampseq_file` below instead. Confirm whether it is
# still needed.
vampseq = {'F9': '../Data/filtered_ppj_data/VAMPseq/F9_ab102.csv',
           'G6PD': '../Data/filtered_ppj_data/VAMPseq/G6PD_scores_consequence.csv',
           'TSC2': '../Data/filtered_ppj_data/VAMPseq/TSC2_Lib1_scores_consequences.csv'
          }

# Combined VAMP-seq table, read by read_vampseq() as tab-separated text.
# NOTE(review): absolute path into one user's Downloads folder -- the notebook
# will not run on another machine without editing this line.
vampseq_file = '/Users/ivan/Downloads/VAMP.funccat.tsv'
# Gene -> ClinVar export. The values come in two shapes:
#   * SGE genes:      (path, sense) tuples, unpacked by read_sge_clinvar().
#                     sense == 0 -> the alternate base taken from the ClinVar
#                     name is complemented with get_pair(); sense == 1 -> it is
#                     used unchanged (see get_base_changes()).
#   * VAMP-seq genes: a bare path string, used directly by read_vamp_clinvar().
clinvar = {'BARD1': ('../Data/filtered_ppj_data/SGE_ClinVar_data/20250827_BARD1_clinvar.txt',0),
         'RAD51D': ('../Data/filtered_ppj_data/SGE_ClinVar_data/20250827_RAD51D_clinvar.txt',0),
         'CTCF': ('../Data/filtered_ppj_data/SGE_ClinVar_data/20250827_CTCF_clinvar.txt',1),
         'PALB2': ('../Data/filtered_ppj_data/SGE_ClinVar_data/20250827_PALB2_clinvar.txt',0),
         'SFPQ': ('../Data/filtered_ppj_data/SGE_ClinVar_data/20250827_SFPQ_clinvar.txt',0),
         'XRCC2': ('../Data/filtered_ppj_data/SGE_ClinVar_data/20250827_XRCC2_clinvar.txt',0),
         'G6PD': '../Data/filtered_ppj_data/SGE_ClinVar_data/20250924_G6PD_ClinVar.txt',
         'TSC2': '../Data/filtered_ppj_data/SGE_ClinVar_data/20250924_TSC2_ClinVar.txt'
        }

# Genes iterated by the SGE readers (every key of `sge`, in insertion order).
sge_genes = list(sge.keys())
# Genes iterated by the VAMP-seq readers. 'F9' appears in `vampseq` above but
# has no entry in `clinvar`, and is not listed here.
vampseq_genes = ['G6PD', 'TSC2']


In [None]:
def read_vampseq(vamp_file, genes=('G6PD', 'TSC2')):
    """Load the combined VAMP-seq table and split it into one frame per gene.

    Parameters
    ----------
    vamp_file : str or path-like
        Tab-separated file containing at least the columns 'Gene', 'aa_pos',
        'aa_ref', 'aa_alt', 'condensed_consequence' and
        'functional_consequence'.
    genes : iterable of str, optional
        Values of the 'Gene' column to return. Defaults to the two genes the
        notebook originally hard-coded.

    Returns
    -------
    dict[str, pandas.DataFrame]
        Gene -> rows for that gene. Rows without 'aa_pos' are dropped,
        'aa_pos' is cast to int, and a 'var_id' column of the form
        aa_ref + aa_pos + aa_alt (e.g. 'A12V') is added.

    Raises
    ------
    KeyError
        If one of the required columns is missing from the file.
    """
    columns = ['Gene', 'aa_pos', 'aa_ref', 'aa_alt',
               'condensed_consequence', 'functional_consequence']

    # .copy() detaches the filtered rows from the frame returned by read_csv so
    # the column assignments below write to an independent object instead of a
    # slice (the pattern that triggers pandas' SettingWithCopyWarning).
    scores = (
        pd.read_csv(vamp_file, sep='\t')[columns]
        .dropna(subset=['aa_pos'])
        .copy()
    )

    # A missing value anywhere in the column makes pandas read it as float;
    # cast back so the identifier reads 'A12V' rather than 'A12.0V'.
    scores['aa_pos'] = scores['aa_pos'].astype(int)
    scores['var_id'] = scores['aa_ref'] + scores['aa_pos'].astype(str) + scores['aa_alt']

    return {gene: scores.loc[scores['Gene'].isin([gene])].copy() for gene in genes}

In [None]:
def read_vamp_clinvar(clinvar, genes=None):
    """Load the ClinVar exports for the VAMP-seq genes.

    Parameters
    ----------
    clinvar : dict
        Gene -> path of a tab-separated ClinVar export with 'Protein change'
        and 'Germline classification' columns. Only the entries for `genes`
        are read, so the dict may contain other genes too.
    genes : iterable of str, optional
        Genes to load. When omitted, the module-level `vampseq_genes` list is
        used.

    Returns
    -------
    dict[str, pandas.DataFrame]
        Gene -> frame with columns 'var_id' and 'Germline classification'.
        Rows without a protein change are dropped; when the protein-change
        field holds a comma-separated list, only the first entry is kept as
        'var_id'.
    """
    if genes is None:
        genes = vampseq_genes

    clinvar_dicts = {}
    for gene in genes:
        table = pd.read_csv(clinvar[gene], sep='\t')

        # Build the result in one chain so nothing is assigned onto a filtered
        # slice (the pattern that triggers pandas' SettingWithCopyWarning).
        clinvar_dicts[gene] = (
            table[['Protein change', 'Germline classification']]
            .dropna(subset=['Protein change'])
            .rename(columns={'Protein change': 'var_id'})
            .assign(var_id=lambda t: t['var_id'].map(lambda change: change.split(',')[0]))
        )

    return clinvar_dicts

In [None]:
def read_sge(sge_dict):
    """Load the SGE score workbook for every gene in `sge_dict`.

    Parameters
    ----------
    sge_dict : dict[str, str]
        Gene -> path of an Excel workbook containing the columns 'target',
        'pos_id' and 'gmm_consequence_0.95'.

    Returns
    -------
    dict[str, pandas.DataFrame]
        Gene -> frame with columns 'target', 'pos_id' and
        'functional_consequence', where 'functional_consequence' holds the
        values of the workbook's 'gmm_consequence_0.95' column (any
        'functional_consequence' column already in the workbook is discarded).

    Raises
    ------
    KeyError
        If a workbook lacks one of the three required columns.
    """
    sge_dfs = {}
    # Iterate the argument itself rather than a module-level gene list, so the
    # function works for whatever mapping the caller passes in.
    for gene, path in sge_dict.items():
        workbook = pd.read_excel(path)
        sge_dfs[gene] = (
            workbook[['target', 'pos_id', 'gmm_consequence_0.95']]
            .rename(columns={'gmm_consequence_0.95': 'functional_consequence'})
        )

    return sge_dfs

In [None]:
def get_pair(base):
    """Return the complementary DNA base for `base`.

    ClinVar gives base changes on the negative-sense strand, while the SGE
    pos_id uses the positive-sense strand, so the base has to be complemented
    before the two can be compared.

    'A' -> 'T', 'T' -> 'A', 'C' -> 'G'; every other input (including 'G')
    yields 'C'.
    """
    for source, partner in (('A', 'T'), ('T', 'A'), ('C', 'G')):
        if base == source:
            return partner
    # Fall-through covers 'G' and, as in the original, anything unrecognised.
    return 'C'

In [None]:
def get_base_changes(df, sense):
    """Add a 'pos_id' column ('<coordinate>:<alt base>') in the SGE file format.

    For each row the alternate base is the character that follows the last
    '>' in 'Name' (e.g. the 'G' of 'c.1A>G'); the '>' must have a character on
    both sides. The coordinate is the row's own 'GRCh38Location'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Name' and 'GRCh38Location'. The frame is modified in
        place (any existing 'pos_id' column is overwritten) and also returned.
    sense : int
        0 -> complement the alternate base with get_pair() before use;
        1 -> use the alternate base exactly as written in 'Name'.

    Returns
    -------
    pandas.DataFrame
        `df`, with 'pos_id' set for every row whose name contains a
        substitution and left missing (NaN) for all other rows.

    Raises
    ------
    ValueError
        If `sense` is neither 0 nor 1.
    """
    if sense not in (0, 1):
        raise ValueError('sense must be 0 (complement) or 1 (as written), got %r' % (sense,))

    def _coord_text(value):
        # A numeric column that contains missing values is read as float;
        # render 214745805.0 as '214745805' so the id can match integer
        # coordinates.
        if isinstance(value, float) and value.is_integer():
            return str(int(value))
        return str(value)

    pos_ids = []
    # Walk the two columns positionally so the result does not depend on the
    # frame having a default 0..n-1 index.
    for name, location in zip(df['Name'].tolist(), df['GRCh38Location'].tolist()):
        pos_id = float('nan')
        if isinstance(name, str):
            # Last '>' that has a character on both sides.
            arrow = name.rfind('>', 1, len(name) - 1)
            if arrow != -1:
                alt_base = name[arrow + 1]
                if sense == 0:
                    alt_base = get_pair(alt_base)
                pos_id = _coord_text(location) + ':' + alt_base
        pos_ids.append(pos_id)

    # Single column assignment: always creates 'pos_id', even when no name
    # contains a substitution, so downstream merges on it cannot KeyError.
    df['pos_id'] = pd.Series(pos_ids, index=df.index, dtype=object)
    return df

In [None]:
def read_sge_clinvar(clinvar_dict):
    """Load the ClinVar exports for the SGE genes and attach a 'pos_id' key.

    Parameters
    ----------
    clinvar_dict : dict
        Gene -> (path, sense) for every gene in the module-level `sge_genes`
        list. `path` is a tab-separated ClinVar export with 'Name',
        'GRCh38Location' and 'Germline classification' columns; `sense` is
        forwarded to get_base_changes().

    Returns
    -------
    dict[str, pandas.DataFrame]
        Gene -> frame with 'Name', 'GRCh38Location',
        'Germline classification' and the 'pos_id' column added by
        get_base_changes().
    """
    wanted = ['Name', 'GRCh38Location', 'Germline classification']

    clinvar_dfs = {}
    for gene in sge_genes:
        path, sense = clinvar_dict[gene]

        # .copy(): get_base_changes() writes the new column into the frame it
        # is given, so hand it an independent frame rather than a column slice
        # of the parsed file (avoids pandas' SettingWithCopyWarning).
        clinvar_df = pd.read_csv(path, sep='\t')[wanted].copy()
        clinvar_df = get_base_changes(clinvar_df, sense)

        if gene == 'PALB2': #One variant for PALB2 is entered twice
            # Keep the first record per pos_id. groupby() also drops rows with
            # a missing pos_id and sorts the result by pos_id (pandas defaults).
            clinvar_df = clinvar_df.groupby('pos_id').agg({
                'Name': 'first',
                'GRCh38Location': 'first',
                'Germline classification': 'first'
                }).reset_index()

        clinvar_dfs[gene] = clinvar_df

    return clinvar_dfs

In [None]:
def _agreement_fraction(hits, total):
    """Return hits / total as a float, or NaN when there is nothing to score."""
    if total == 0:
        return float('nan')
    return hits / total


def clinvar_performance(df, assay_name, save_path=None):
    """Score assay calls against ClinVar, then print, plot and save the result.

    Rows are excluded when the assay call is 'indeterminate', or when the
    ClinVar germline classification is missing, 'Uncertain significance' or
    'Conflicting classifications of pathogenicity'. The remaining ClinVar
    labels are collapsed to Benign (B / LB / B+LB) or Pathogenic (P / LP /
    P+LP).

    Reported metrics:
      * % Benign Correct / % Path. Correct -- share of ClinVar-benign
        (-pathogenic) variants the assay calls functionally_normal
        (functionally_abnormal).
      * NPV / PPV -- share of assay-normal (assay-abnormal) variants that
        ClinVar labels benign (pathogenic). Only variants carrying a benign or
        pathogenic label enter the denominator; rows with any other ClinVar
        label (e.g. 'not provided') are left out instead of being counted as
        disagreements.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'functional_consequence' and 'Germline classification'.
        The frame is not modified.
    assay_name : str
        Used to build the default output file name.
    save_path : str, optional
        Where to save the NPV/PPV chart. Defaults to the notebook's original
        location, '/Users/ivan/Desktop/pillar_project_figs/20251113_<assay_name>_vsClinVar.svg'.

    Returns
    -------
    None
    """
    benign_labels = ['Benign', 'Likely benign', 'Benign/Likely benign']
    pathogenic_labels = ['Pathogenic', 'Likely pathogenic', 'Pathogenic/Likely pathogenic']
    unresolved_labels = ['Uncertain significance', 'Conflicting classifications of pathogenicity']

    # All masks are plain boolean arrays: nothing is written back into a
    # filtered slice, and duplicate index labels (the input is typically a
    # concat of per-gene frames) cannot affect alignment.
    keep = (
        ~df['functional_consequence'].isin(['indeterminate'])
        & ~df['Germline classification'].isin(unresolved_labels)
        & df['Germline classification'].notna()
    ).to_numpy()
    scored = df.loc[keep]

    is_benign = scored['Germline classification'].isin(benign_labels).to_numpy()
    is_path = scored['Germline classification'].isin(pathogenic_labels).to_numpy()
    is_normal = scored['functional_consequence'].isin(['functionally_normal']).to_numpy()
    is_abnormal = scored['functional_consequence'].isin(['functionally_abnormal']).to_numpy()
    has_truth = is_benign | is_path

    # Agreement from the ClinVar side: of the benign / pathogenic variants,
    # how many does the assay call the matching way?
    perc_benign = _agreement_fraction(int((is_benign & is_normal).sum()), int(is_benign.sum())) * 100
    perc_path = _agreement_fraction(int((is_path & is_abnormal).sum()), int(is_path.sum())) * 100

    print('% Benign Correct: ', str(perc_benign), '\n',
        '% Path. Correct: ', str(perc_path), '\n')

    correct_df = pd.DataFrame({'var_type': ['benign', 'pathogenic'],
                               'correct': [perc_benign, perc_path]
                              }
                             )

    plot = alt.Chart(correct_df).mark_bar().encode(
        x = alt.X('var_type:O',
                  title = 'Germline Classification'
                 ),
        y = alt.Y('correct:Q',
                  title = '% Agreement',
                  scale = alt.Scale(domain = [0, 100])
                 )
    )
    plot.display()

    # Agreement from the assay side: of the assay-normal / assay-abnormal
    # variants that have a benign or pathogenic ClinVar label, how many carry
    # the matching label?
    raw_npv = _agreement_fraction(int((is_normal & is_benign).sum()), int((is_normal & has_truth).sum()))
    raw_ppv = _agreement_fraction(int((is_abnormal & is_path).sum()), int((is_abnormal & has_truth).sum()))

    perc_benign_agree = raw_npv * 100
    perc_path_agree = raw_ppv * 100

    print('% Benign Agree: ', str(perc_benign_agree), '\n',
        '% Path. Agree: ', str(perc_path_agree), '\n')

    agree_df = pd.DataFrame({'assay_type': ['NPV', 'PPV'],
                            'correct': [raw_npv, raw_ppv]})

    plot = alt.Chart(agree_df).mark_bar().encode(
        y = alt.Y('assay_type:O',
                  title = '',
                  sort = ['NPV', 'PPV'],
                  axis = alt.Axis(ticks = False,
                                  labelFontSize = 16,
                                  labelFontWeight = 'bold'
                                 )
                 ),
        x = alt.X('correct:Q',
                  title = '',
                  axis = alt.Axis(titleFontSize = 20,
                                  labelFontSize = 16
                                 ),
                  scale = alt.Scale(domain = [0, 1])
                 ),
        color = alt.Color('assay_type',
                          sort = ['NPV', 'PPV'],
                          legend = None
                         )
    ).properties(
        width = 300,
        height = 100
    ).configure_axis(
        grid = False
    ).configure_view(
        stroke = None
    )

    plot.display()

    if save_path is None:
        # Original hard-coded destination, kept as the default so existing
        # calls write to the same place.
        save_path = '/Users/ivan/Desktop/pillar_project_figs/20251113_' + assay_name + '_vsClinVar.svg'

    plot.save(save_path)

In [None]:
def main():
    """Run the full comparison: VAMP-seq and SGE calls versus ClinVar.

    For each assay the per-gene score tables are inner-joined to the matching
    ClinVar table (on 'var_id' for VAMP-seq, on 'pos_id' for SGE), the
    per-gene results are stacked, and clinvar_performance() is run on the
    stacked table -- SGE first, then VAMP-seq.
    """
    # VAMP-seq: variants are keyed by protein change.
    vamp_scores = read_vampseq(vampseq_file)
    vamp_labels = read_vamp_clinvar(clinvar)
    final_vampseq = pd.concat([
        pd.merge(vamp_scores[gene], vamp_labels[gene], on = 'var_id', how = 'inner')
        for gene in vampseq_genes
    ])

    # SGE: variants are keyed by genomic position and alternate base.
    sge_scores = read_sge(sge)
    sge_labels = read_sge_clinvar(clinvar)
    final_sge = pd.concat([
        pd.merge(sge_scores[gene], sge_labels[gene], on = 'pos_id', how = 'inner')
        for gene in sge_genes
    ])

    clinvar_performance(final_sge, 'SGE')
    clinvar_performance(final_vampseq, 'VAMPseq')

In [None]:
# Run the whole analysis: reads every input file configured above, displays the
# agreement charts for SGE and VAMP-seq, and saves one SVG per assay.
main()