In [None]:
import pandas as pd
import altair as alt
import scipy.stats as stats

In [None]:
# VAMP-seq score tables, one (gene name, CSV path) tuple per gene.
# main() hands this list to read_vampseq, which unpacks each tuple in this order.
vampseq = [('F9','../Data/filtered_ppj_data/VAMPseq/F9_ab102.csv'),
           ('G6PD','../Data/filtered_ppj_data/VAMPseq/G6PD_scores_consequence.csv'),
           ('TSC2', '../Data/filtered_ppj_data/VAMPseq/TSC2_Lib1_scores_consequences.csv')
          ]

# Solvent-accessibility tables, one (gene name, CSV path, chain ID) tuple per gene.
# read_sasa keeps only the rows whose 'Chain' column equals the chain ID given here.
sasa = [('F9', '../Data/SASA/VAMPseq/F9_AF_SASAcalculation_A.csv', 'A'),
        ('G6PD', '../Data/SASA/VAMPseq/G6PD_7UAG_SASAcalculation_B.csv', 'B'),
        ('TSC2', '../Data/SASA/VAMPseq/TSC2_7DL2_SASAcalculation_A.csv', 'A')
       ]

# Turn off Altair's limit on the number of rows a chart's dataset may contain,
# so the full merged table can be plotted.
alt.data_transformers.disable_max_rows()

In [None]:
def read_vampseq(files):
    """Load the VAMP-seq score tables and normalise them to a common layout.

    Only rows whose ``type`` is ``'Missense'`` and whose ``position`` is
    present are kept. Every returned frame has the columns ``WT``, ``Mut``,
    ``position`` (int), ``Mutation`` (``WT`` + position + ``Mut``, except for
    F9 ``ab001`` files where it is the renamed ``variant`` column) and
    ``AAid`` (``WT`` + position), in addition to whatever other columns the
    CSV carried.

    Parameters
    ----------
    files : iterable of (str, path-like or file-like)
        ``(gene_name, csv_path)`` pairs. Genes whose name contains ``'F9'``
        are expected to use the ``wt_aa`` / ``var_aa`` / ``antibody_*`` column
        layout; all other genes are expected to already have ``WT`` / ``Mut``.

    Returns
    -------
    dict
        Maps each gene name to its cleaned ``pandas.DataFrame``.

    Raises
    ------
    KeyError
        If a required column is missing from a CSV.
    """
    vampseq_data = {}
    for gene_name, path in files:
        data = pd.read_csv(path)

        # Keeping only missense rows already excludes deletions. Rows without a
        # position cannot be matched to a residue, so drop them for every gene.
        # The trailing copy() makes the frame independent of the parsed CSV so
        # the column assignments below do not raise SettingWithCopyWarning.
        data = (
            data.loc[data['type'].isin(['Missense'])]
            .dropna(subset=['position'])
            .copy()
        )
        # Integer positions for all genes: a float column would stringify as
        # '12.0' and produce AAid values that never match the SASA table.
        data['position'] = data['position'].astype(int)

        if 'F9' in gene_name:
            # Positional lookup: the original label lookup ([2]) raised
            # KeyError whenever CSV row 2 had been filtered out above.
            ab = data['antibody_nonnum'].iloc[0] if not data.empty else None
            if ab == 'ab001':
                data = data.rename(columns = {'variant': 'Mutation', 'wt_aa': 'WT', 'var_aa': 'Mut'})
                data = data.drop(columns = ['antibody_nonnum', 'antibody_label2', 'Unnamed: 8'])
            else:
                data['Mutation'] = data['wt_aa'] + data['position'].astype(str) + data['var_aa']
                # 'variant' is deliberately not renamed here: 'Mutation' was
                # just built, and renaming would create a duplicate column name.
                data = data.rename(columns = {'wt_aa': 'WT', 'var_aa': 'Mut'})
                data = data.drop(columns = ['antibody_nonnum', 'antibody_label2', 'n_exp'])
        else:
            data.loc[data['Mut'] == 'Stop', 'Mut'] = '*'
            data['Mutation'] = data['WT'] + data['position'].astype(str) + data['Mut']

        # Residue identifier shared with the SASA tables (e.g. 'A123').
        data['AAid'] = data['WT'] + data['position'].astype(str)

        vampseq_data[gene_name] = data

    return vampseq_data

In [None]:
def read_sasa(sasa_files):
    """Load per-residue solvent-accessibility tables, one per gene.

    For each file, only the rows of the requested chain are kept, the
    three-letter residue name in ``ResidNe`` is converted to its one-letter
    code, and an ``AAid`` key (one-letter code + ``ResidNr``, e.g. ``'A123'``)
    is built. Residues whose name is not one of the 20 entries in the
    conversion table are dropped, because they would otherwise carry a NaN
    ``AAid``.

    Parameters
    ----------
    sasa_files : iterable of (str, path-like or file-like, str)
        ``(gene, csv_path, chain)`` triples. Each CSV must provide the
        columns ``Chain``, ``ResidNe``, ``ResidNr`` and ``Q.SASA.``.

    Returns
    -------
    dict
        Maps each gene to a ``pandas.DataFrame`` with exactly the columns
        ``AAid`` and ``Q.SASA.``.
    """
    three_to_one = {
        'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D', 'CYS': 'C',
        'GLN': 'Q', 'GLU': 'E', 'GLY': 'G', 'HIS': 'H', 'ILE': 'I',
        'LEU': 'L', 'LYS': 'K', 'MET': 'M', 'PHE': 'F', 'PRO': 'P',
        'SER': 'S', 'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V',
    }

    sasa_outputs = {}
    for gene, path, chain in sasa_files:
        sasa_df = pd.read_csv(path)

        # copy(): the columns assigned below must not be written onto a slice
        # of the parsed CSV (that is what triggers SettingWithCopyWarning).
        sasa_df = sasa_df.loc[sasa_df['Chain'].isin([chain])].copy()
        sasa_df['ResidNe'] = sasa_df['ResidNe'].map(three_to_one)
        sasa_df['AAid'] = sasa_df['ResidNe'] + sasa_df['ResidNr'].astype(str)

        # Unmapped residue names give a NaN AAid. pandas treats NaN keys as
        # equal in a merge, so such rows must not reach the join on AAid.
        sasa_df = sasa_df.dropna(subset=['AAid'])

        sasa_outputs[gene] = sasa_df[['AAid', 'Q.SASA.']]

    return sasa_outputs


In [None]:
def merge(vampseq, sasa, threshold=0.2):
    """Join VAMP-seq scores with solvent accessibility and label each row.

    For every gene in ``vampseq`` the score table is inner-joined with the
    matching SASA table on ``AAid``, so rows without a counterpart on the
    other side are dropped. Each joined row gets an ``accessible`` column:
    ``'Yes'`` when ``Q.SASA.`` is greater than or equal to ``threshold``,
    ``'No'`` otherwise (including when ``Q.SASA.`` is missing).

    Parameters
    ----------
    vampseq : dict
        Gene -> DataFrame containing at least an ``AAid`` column.
    sasa : dict
        Gene -> DataFrame with ``AAid`` and ``Q.SASA.`` columns. Must contain
        every gene present in ``vampseq``.
    threshold : float, optional
        Inclusive ``Q.SASA.`` cutoff for calling a residue accessible.
        Defaults to 0.2, the value that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        All genes stacked into one frame with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If a gene in ``vampseq`` has no entry in ``sasa``.
    ValueError
        If ``vampseq`` is empty (nothing to concatenate).
    """
    dfs = []
    for gene, vampseq_df in vampseq.items():
        merged = pd.merge(vampseq_df, sasa[gene], on = 'AAid', how = 'inner')
        merged['accessible'] = 'No'
        merged.loc[merged['Q.SASA.'] >= threshold, 'accessible'] = 'Yes'
        dfs.append(merged)

    # ignore_index: every per-gene frame is indexed from 0, so keeping those
    # labels would leave duplicate index values in the combined frame.
    return pd.concat(dfs, ignore_index = True)

In [None]:
def boxplot(df):
    """Plot VAMP-seq score by surface accessibility and print a rank-sum test.

    Draws a black-and-white Altair box plot of ``average_score`` for the
    ``'Yes'`` and ``'No'`` groups of the ``accessible`` column, prints the
    number of rows in each group together with the two-sided Mann-Whitney U
    p-value comparing the two groups, and displays the chart.

    Rows with a missing ``average_score`` are excluded from the test, since
    NaN inputs make the p-value NaN under scipy's default ``nan_policy``. If
    either group has no scored rows the p-value is reported as NaN instead of
    letting the test raise, so the chart is still shown.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``accessible`` and ``average_score``.

    Returns
    -------
    None
    """
    black_line = {'stroke': 'black', 'strokeWidth': 1}
    big_fonts = {'labelFontSize': 18, 'titleFontSize': 18}

    plot = alt.Chart(df).mark_boxplot(
        size = 50,
        box = {'fill': 'white', **black_line},
        median = {'stroke': 'black', 'strokeWidth': 2},
        outliers = {'fill': 'black', 'stroke': 'black'},
        ticks = dict(black_line),
        rule = dict(black_line),
    ).encode(
        x = alt.X(
            'accessible:N',
            title = 'Surface Accessible?',
            sort = ['Yes', 'No'],
            axis = alt.Axis(labelAngle = 0, ticks = False, **big_fonts),
        ),
        y = alt.Y(
            'average_score:Q',
            title = 'VAMP-seq Score',
            axis = alt.Axis(**big_fonts),
        ),
    ).properties(
        width = 200,
        height = 600,
        title = 'VAMP-seq Score vs. Accessibility'
    ).configure_axis(
        grid = False
    ).configure_view(
        stroke = None
    )

    # Only scored rows take part in the test (see docstring).
    scored = df.dropna(subset = ['average_score'])
    yes_scores = scored.loc[scored['accessible'] == 'Yes', 'average_score'].tolist()
    no_scores = scored.loc[scored['accessible'] == 'No', 'average_score'].tolist()

    if yes_scores and no_scores:
        _, p = stats.mannwhitneyu(yes_scores, no_scores, alternative = 'two-sided')
    else:
        p = float('nan')

    # The counts are rows of df per group, computed once and reused.
    counts = df['accessible'].value_counts()
    print('Accessible Residues: ', counts.get('Yes', 0), '\n',
          'Inaccessible Residues: ', counts.get('No', 0), '\n',
          'p-value: ', str(p)
         )

    plot.display()
    # To export the figure, call plot.save(<output path>) here; prefer a path
    # relative to the project rather than an absolute local one.
    

In [None]:
def main():
    """Run the whole analysis on the module-level ``vampseq`` and ``sasa`` file lists.

    Reads the VAMP-seq score tables, reads the SASA tables, joins them, and
    passes the combined frame to ``boxplot``.
    """
    scores_by_gene = read_vampseq(vampseq)
    exposure_by_gene = read_sasa(sasa)
    boxplot(merge(scores_by_gene, exposure_by_gene))

In [None]:
# Run the analysis: read both data sets, merge them, and hand the result to boxplot.
main()