In [None]:
import pandas as pd
import altair as alt
import scipy.stats as stats
from Bio import SeqIO
from Bio.Seq import Seq

In [None]:
# --- Input configuration ---------------------------------------------------
# All '../Data/...' paths are relative to the notebook's working directory.

# SGE genes: gene -> list of (SASA csv path, chain ID) tuples. read_sasa() keeps
# only the rows whose 'Chain' column equals the chain ID and concatenates all
# files listed for a gene (BARD1 has three). RAD51D and XRCC2 point at the same
# file and differ only in the chain that is extracted.
sasa = {'BARD1': [('../Data/SASA/SGE/BARD1_1JM7_SASAcalculation_B.csv', 'B'), ('../Data/SASA/SGE/BARD1_3C5R_SASAcalculation_B.csv', 'B'), ('../Data/SASA/SGE/BARD1_3FA2_SASAcalculation_B.csv', 'B')],
        'RAD51D': [('../Data/SASA/SGE/RAD51D_XRCC2_8OUZ_SASAcalculation_C_D.csv', 'C')],
        'XRCC2': [('../Data/SASA/SGE/RAD51D_XRCC2_8OUZ_SASAcalculation_C_D.csv', 'D')],
        'PALB2': [('../Data/SASA/SGE/PALB2_2W18_SASAcalculation_A.csv', 'A')],
        'SFPQ': [('../Data/SASA/SGE/SFPQ_6OWJ_SASAcalculation_A.csv', 'A')]
       }


# SGE score tables: gene -> Excel file consumed by read_sge().
sge_genes = {'BARD1': '../Data/filtered_ppj_data/SGE/BARD1.xlsx',
             'RAD51D': '../Data/filtered_ppj_data/SGE/RAD51D.xlsx',
             'PALB2': '../Data/filtered_ppj_data/SGE/PALB2.xlsx',
             'SFPQ': '../Data/filtered_ppj_data/SGE/SFPQ.xlsx',
             'XRCC2': '../Data/filtered_ppj_data/SGE/XRCC2.xlsx'
            }

# Per-gene VAMP-seq score files: gene -> list of csv paths.
# NOTE(review): main() never reads this dict, and read_vampseq() expects an
# iterable of (gene_name, path) pairs rather than this dict shape.
vampseq = {'F9': ['../Data/filtered_ppj_data/VAMPseq/F9_ab102.csv'],
           'G6PD': ['../Data/filtered_ppj_data/VAMPseq/G6PD_scores_consequence.csv'],
           'TSC2': ['../Data/filtered_ppj_data/VAMPseq/TSC2_Lib1_scores_consequences.csv']
          }

# Tab-separated VAMP-seq functional-category table consumed by read_vampfuncat().
# NOTE(review): absolute, machine-specific path -- the notebook will not run on
# another machine until this points at a local copy of the file.
vamp_funcat = '/Users/ivan/Downloads/VAMP.funccat.tsv'

# VAMP-seq genes: same (SASA csv path, chain ID) structure as `sasa` above.
# The F9 entry is skipped by merge_vampseq().
vamp_sasa = {'F9': [('../Data/SASA/VAMPseq/F9_AF_SASAcalculation_A.csv', 'A')],
        'G6PD': [('../Data/SASA/VAMPseq/G6PD_7UAG_SASAcalculation_B.csv', 'B')],
        'TSC2': [('../Data/SASA/VAMPseq/TSC2_7DL2_SASAcalculation_A.csv', 'A')]
            }

# Turn off Altair's max-rows check on chart data.
alt.data_transformers.disable_max_rows()

In [None]:
def read_sasa(sasa_dict):
    """Load per-residue SASA tables and key them by gene.

    Parameters
    ----------
    sasa_dict : dict
        Maps gene name -> list of ``(csv_path, chain_id)`` tuples. Every CSV
        must provide the columns ``ResidNr``, ``Chain``, ``ResidNe`` and
        ``Q.SASA.``.

    Returns
    -------
    dict
        gene -> DataFrame with the columns ``AAid`` and ``Q.SASA.``. ``AAid``
        is ``'<gene>:<one-letter residue><residue number>'``. Rows from all
        files listed for a gene are concatenated in the order given.

    Notes
    -----
    Rows with ``ResidNr <= 0`` or a different chain are discarded. Residue
    names that are not one of the 20 standard three-letter codes map to NaN,
    which makes the resulting ``AAid`` NaN as well.
    """
    three_to_one = {
        'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D', 'CYS': 'C',
        'GLN': 'Q', 'GLU': 'E', 'GLY': 'G', 'HIS': 'H', 'ILE': 'I',
        'LEU': 'L', 'LYS': 'K', 'MET': 'M', 'PHE': 'F', 'PRO': 'P',
        'SER': 'S', 'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V',
    }

    sasa_dfs = {}
    for gene, file_list in sasa_dict.items():
        gene_sasa_dfs = []
        for path, chain in file_list:
            raw = pd.read_csv(path)
            keep = (raw['ResidNr'] > 0) & raw['Chain'].isin([chain])
            # Explicit copy: the column assignments below must not write into
            # a slice of `raw` (avoids SettingWithCopyWarning).
            sasa_df = raw.loc[keep].copy()
            sasa_df['ResidNe'] = sasa_df['ResidNe'].map(three_to_one)
            sasa_df['AAid'] = gene + ':' + sasa_df['ResidNe'] + sasa_df['ResidNr'].astype(str)
            gene_sasa_dfs.append(sasa_df[['AAid', 'Q.SASA.']])

        sasa_dfs[gene] = pd.concat(gene_sasa_dfs)

    return sasa_dfs

In [None]:
def read_sge(sge_dict):
    """Load SGE score tables and keep the missense variants of each gene.

    Parameters
    ----------
    sge_dict : dict
        Maps gene name -> path of an Excel file readable by ``pd.read_excel``.
        The sheet must provide the columns ``amino_acid_change``,
        ``consequence`` and ``functional_consequence``.

    Returns
    -------
    dict
        gene -> DataFrame with the columns ``amino_acid_change``, ``AAid`` and
        ``functional_consequence``. Only rows whose ``consequence`` is
        ``'missense_variant'`` and whose ``amino_acid_change`` is not ``'---'``
        are kept.

    Notes
    -----
    ``AAid`` is ``'<gene>:'`` followed by ``amino_acid_change`` without its
    last character (presumably the alternate residue, e.g. 'A10V' -> 'A10' --
    confirm against the input format). Rows sharing an ``amino_acid_change``
    are NOT collapsed; each one is returned as its own row.
    """
    sge_dfs = {}
    for gene, path in sge_dict.items():
        raw = pd.read_excel(path)
        keep = (
            ~raw['amino_acid_change'].isin(['---'])
            & raw['consequence'].isin(['missense_variant'])
        )
        # Explicit copy so the AAid assignment does not write into a slice of `raw`.
        sge_df = raw.loc[keep].copy()
        # .str[:-1] is the vectorised form of the per-element slice and yields
        # NaN (instead of raising) for a missing amino_acid_change.
        sge_df['AAid'] = gene + ':' + sge_df['amino_acid_change'].str[:-1]
        sge_dfs[gene] = sge_df[['amino_acid_change', 'AAid', 'functional_consequence']]

    return sge_dfs

In [None]:
def read_vampseq(files):
    """Load per-gene VAMP-seq score tables and tag each missense row with an AAid.

    Parameters
    ----------
    files : iterable
        Each element is unpacked as ``(gene_name, path)``; ``path`` is read
        with ``pd.read_csv``.
        NOTE(review): the module-level ``vampseq`` config is a dict of
        gene -> [path]; iterating it yields only the gene-name strings, so it
        cannot be passed here as-is. ``main()`` does not call this function.

    Returns
    -------
    dict
        gene_name -> DataFrame restricted to rows whose ``type`` is
        'Missense', with a ``Mutation`` column (built here or renamed from
        ``variant``) and an ``AAid`` column of the form
        ``'<gene_name>:<WT><position>'``.
    """

    vampseq_data = {}
    for gene in files:
        gene_name, path = gene
        data = pd.read_csv(path)
        # Keep missense rows only; the 'Deletion' exclusion is already implied
        # by the 'Missense' filter on the next line.
        data = data.loc[~(data['type'].isin(['Deletion']))]
        data = data.loc[data['type'].isin(['Missense'])]

        # F9 files use their own column names (wt_aa / var_aa / antibody_*).
        if 'F9' in gene_name:
            # NOTE(review): [2] is a label lookup on the filtered frame; it
            # raises KeyError if the row labelled 2 was removed by the filters
            # above -- confirm this is the intended way to read the antibody.
            ab = data['antibody_nonnum'][2]
            if ab == 'ab001':
                data = data.rename(columns = {'variant': 'Mutation','wt_aa': 'WT', 'var_aa': 'Mut'})
                data = data.drop(columns = ['antibody_nonnum', 'antibody_label2', 'Unnamed: 8'])
            else:
                # NOTE(review): 'Mutation' is created here and 'variant' is then
                # also renamed to 'Mutation'; if the file has a 'variant' column
                # this leaves two columns named 'Mutation' -- confirm.
                data['Mutation'] = data['wt_aa'] + data['position'].astype(str) + data['var_aa']
                data = data.rename(columns = {'variant': 'Mutation','wt_aa': 'WT', 'var_aa': 'Mut'})
                data = data.drop(columns = ['antibody_nonnum', 'antibody_label2', 'n_exp'])
        else:
            # Other genes: write stops as '*', drop rows without a position,
            # and build the Mutation string as <WT><position><Mut>.
            data.loc[data['Mut'] == 'Stop', 'Mut'] = '*'
            data = data.dropna(subset = ['position'])
            data['position'] = data['position'].astype(int)
            data['Mutation'] = data['WT'] + data['position'].astype(str) + data['Mut']

        # Residue-level key shared with the SASA tables.
        data['AAid'] = gene_name + ':' + data['WT'] + data['position'].astype(str)
        
        vampseq_data[gene_name] = data

    return vampseq_data

In [None]:
def mutate_snvs(dna_sequence):
    """Return every single-nucleotide variant of ``dna_sequence``.

    For each position, the base is replaced by each of the three other bases
    and the resulting full-length sequence is collected. Variants are ordered
    by position; within one position the replacement order depends on the
    reference base (A -> T, C, G; T -> A, C, G; C -> A, T, G).

    Any character other than 'A', 'T' or 'C' is treated like 'G' and replaced
    by A, T and C in that order.
    """
    replacements = {"A": "TCG", "T": "ACG", "C": "ATG"}
    fallback = "ATC"  # used for 'G' and for any unrecognised character

    return [
        dna_sequence[:pos] + new_base + dna_sequence[pos + 1 :]
        for pos in range(len(dna_sequence))
        for new_base in replacements.get(dna_sequence[pos], fallback)
    ]

In [None]:
def reverse_complement_string(seq_string):
    """Return the reverse complement of ``seq_string`` as a plain string.

    The sequence is reversed, then each character is swapped: A -> T, G -> C,
    C -> G. Every other character (including 'T') becomes 'A'.
    """
    partner = {"A": "T", "G": "C", "C": "G"}
    return "".join(partner.get(base, "A") for base in seq_string[::-1])

In [None]:
def read_vampfuncat(file):
    """Load the VAMP-seq functional-category table, restricted to SNV-reachable changes.

    Parameters
    ----------
    file : str or path-like
        Tab-separated file with (at least) the columns ``Gene``, ``aa_pos``,
        ``aa_ref``, ``aa_alt``, ``hgvs_p``, ``ref_allele``, ``consequence``,
        ``condensed_consequence`` and ``functional_consequence``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Gene``, ``AAid``, ``aa_pos``, ``aa_ref``, ``aa_alt``,
        ``condensed_consequence`` and ``functional_consequence``. ``AAid`` is
        ``'<Gene>:<aa_ref><aa_pos>'``. An empty frame with these columns is
        returned when no rows survive the filters.

    Notes
    -----
    Rows lacking ``aa_pos`` or ``consequence`` are dropped. The remaining rows
    are grouped per residue (``AAid``) and handled according to the longest
    ``ref_allele`` string in the group:

    * length 1 -- the group is kept unchanged;
    * length 3 -- the string is treated as the reference codon and only rows
      whose ``aa_alt`` equals the translation of one of its single-nucleotide
      variants are kept. For AAids containing 'G6PD' the codon is
      reverse-complemented first (presumably because the alleles are given on
      the opposite strand -- confirm);
    * any other length -- the whole group is discarded.

    Assumes every group has at least one non-null ``ref_allele`` -- TODO confirm.
    """
    out_columns = ['Gene', 'AAid', 'aa_pos', 'aa_ref', 'aa_alt',
                   'condensed_consequence', 'functional_consequence']

    df = pd.read_csv(file, sep = '\t')

    # Explicit copy so the column assignments below do not write into a slice.
    df = df.dropna(subset = ['aa_pos', 'consequence']).copy()
    df['aa_pos'] = df['aa_pos'].astype(int)
    df['AAid'] = df['Gene'] + ':' + df['aa_ref'] + df['aa_pos'].astype(str)
    # Per-variant identifier; not part of the returned frame.
    df['var_id'] = df['AAid'] + ':' + df['hgvs_p']

    cleaned = []
    for aa_id, group in df.groupby('AAid'):
        longest = group.loc[group['ref_allele'].str.len().idxmax(), 'ref_allele']

        if len(longest) == 1:
            cleaned.append(group.reset_index(drop = True))
        elif len(longest) == 3:
            if 'G6PD' in aa_id:
                longest = reverse_complement_string(longest)
            # Amino acids reachable from the codon by exactly one base change.
            snv_aas = {str(Seq(snv).translate()) for snv in mutate_snvs(longest)}
            kept = group.loc[group['aa_alt'].isin(snv_aas)]
            cleaned.append(kept.reset_index(drop = True))
        # Groups whose longest ref_allele has any other length are dropped.

    if not cleaned:
        # pd.concat raises on an empty list; return an empty, correctly shaped frame.
        return df.iloc[0:0][out_columns]

    return pd.concat(cleaned)[out_columns]

In [None]:
def merge(sge, sasa):
    """Join each gene's SGE variants with its SASA table and label accessibility.

    Parameters
    ----------
    sge : dict
        gene -> DataFrame containing an ``AAid`` column.
    sasa : dict
        gene -> DataFrame containing ``AAid`` and ``Q.SASA.``; must have an
        entry for every gene in ``sge``.

    Returns
    -------
    pandas.DataFrame
        The per-gene inner joins on ``AAid``, stacked in the key order of
        ``sge``, with an added ``accessible`` column: 'Yes' where
        ``Q.SASA. >= 0.2`` and 'No' otherwise.
    """
    def _join_one(gene):
        # Inner join: only residues present in both tables survive.
        joined = pd.merge(sge[gene], sasa[gene], on = 'AAid', how = 'inner')
        joined['accessible'] = 'No'
        exposed = joined['Q.SASA.'] >= 0.2
        joined.loc[exposed, 'accessible'] = 'Yes'
        return joined

    return pd.concat([_join_one(gene) for gene in sge])

In [None]:
def merge_vampseq(vampseq_df, sasa_dfs):
    """Join VAMP-seq variants with the per-residue SASA tables.

    Parameters
    ----------
    vampseq_df : pandas.DataFrame
        Variant table containing an ``AAid`` column.
    sasa_dfs : dict
        gene -> DataFrame containing ``AAid`` and ``Q.SASA.``. The entry keyed
        'F9' is skipped, so F9 variants never appear in the result.
        The input frames are left unmodified.

    Returns
    -------
    pandas.DataFrame
        Inner join of ``vampseq_df`` with the stacked SASA tables on ``AAid``,
        including an ``accessible`` column: 'Yes' where ``Q.SASA. >= 0.2`` and
        'No' otherwise.
    """
    labelled = []
    for key, sasa_df in sasa_dfs.items():
        if key == 'F9':
            continue
        # Work on a copy so the caller's SASA table does not gain an
        # 'accessible' column as a side effect.
        sasa_df = sasa_df.copy()
        sasa_df['accessible'] = 'No'
        sasa_df.loc[sasa_df['Q.SASA.'] >= 0.2, 'accessible'] = 'Yes'
        labelled.append(sasa_df)

    final_sasa_df = pd.concat(labelled)

    return pd.merge(vampseq_df, final_sasa_df, on = 'AAid', how = 'inner')

In [None]:
def bar_chart(sge, vampseq, save_dir = '/Users/ivan/Desktop/pillar_project_figs'):
    """Plot, per assay, the functional-class split of accessible vs. buried variants.

    Parameters
    ----------
    sge, vampseq : pandas.DataFrame
        Variant tables containing the columns ``functional_consequence``,
        ``accessible`` and ``assay``.
    save_dir : str, optional
        Directory the PNG files are written to. Defaults to the location the
        notebook originally hard-coded; pass another directory to run the
        notebook on a different machine.

    Side effects
    ------------
    For each assay ('SGE', then 'VAMP-seq') the function prints the assay name
    and the summary table, displays the chart and saves it as
    ``<save_dir>/20250925_SASAvs<assay>.png``.

    Notes
    -----
    Rows classed 'indeterminate' are excluded. Percentages are computed within
    each ``accessible`` group, so the bars of one facet sum to 100.
    """
    for assay, assay_df in (('SGE', sge), ('VAMP-seq', vampseq)):
        print(assay)
        assay_df = assay_df[['functional_consequence', 'accessible', 'assay']]
        assay_df = assay_df.loc[~assay_df['functional_consequence'].isin(['indeterminate'])]

        summary_df = assay_df.groupby(['accessible', 'functional_consequence']).size().reset_index(name = 'count')
        # Share of each functional class within its accessibility group.
        group_totals = summary_df.groupby('accessible')['count'].transform('sum')
        summary_df['percentage'] = summary_df['count'] / group_totals * 100
        print(summary_df)

        chart = alt.Chart(summary_df).mark_bar().encode(
            x = alt.X('functional_consequence:O',
                      title = '',
                      axis = alt.Axis(labels = False,
                                      ticks = False
                                     ),
                      sort = ['functionally_normal', 'functionally_abnormal']
                     ),
            y = alt.Y('percentage:Q',
                      title = '% of Vars.',
                      scale = alt.Scale(domain = [0, 100]),
                      axis = alt.Axis(values = list(range(0, 110, 10)),
                                     titleFontSize = 20,
                                     labelFontSize = 18)
                     ),
            column = alt.Column('accessible:N',
                                title = 'Surface Accessible?',
                                header = alt.Header(labelFontSize = 18, titleFontSize = 20
                                                       )
                               ),
            color = alt.Color('functional_consequence',
                              sort = ['functionally_normal', 'functionally_abnormal']
                             )
        ).properties(
            title = assay,
            width = 100,
            height = 300
        ).configure_axis(
            grid = False
        ).configure_view(
            stroke = None
        )

        chart.display()
        save_string = save_dir + '/20250925_SASAvs' + assay + '.png'
        chart.save(save_string, ppi = 500)

In [None]:
def main():
    """Run the SASA-vs-function analysis for the SGE and VAMP-seq assays.

    Reads the inputs named in the module-level configuration (``sasa``,
    ``sge_genes``, ``vamp_funcat``, ``vamp_sasa``), joins each assay's
    variants with the SASA tables, draws and saves the bar charts via
    ``bar_chart`` and prints the combined table.
    """
    report_columns = ['functional_consequence', 'AAid', 'accessible', 'assay']

    # SGE: residue-level SASA joined with the missense score tables.
    sge_sasa = read_sasa(sasa)
    sge_tables = read_sge(sge_genes)
    sge_df = merge(sge_tables, sge_sasa)
    sge_df['assay'] = 'sge'

    # VAMP-seq: functional-category table joined with its own SASA tables.
    vamp_variants = read_vampfuncat(vamp_funcat)
    vamp_sasa_dfs = read_sasa(vamp_sasa)
    vampseq_df = merge_vampseq(vamp_variants, vamp_sasa_dfs)
    vampseq_df['assay'] = 'vampseq'

    sge_df = sge_df[report_columns]
    vampseq_df = vampseq_df[report_columns]

    final_df = pd.concat([sge_df, vampseq_df])
    bar_chart(sge_df, vampseq_df)
    print(final_df)

    #final_df.to_excel('/Users/ivan/Desktop/test_excel_outputs/20250925_SASAvsAssays.xlsx', index = False)

In [None]:
# Run the whole analysis: reads the configured input files, displays and saves
# the two bar charts, and prints the combined variant table.
main()