In [None]:
import pandas as pd
import altair as alt
import scipy.stats as stats

In [None]:
# Solvent-accessibility (SASA) tables, keyed by gene.
# Each entry is a (CSV path, chain ID) pair; read_sasa keeps only the rows of
# that chain. RAD51D and XRCC2 come from the same file, on chains C and D.
sasa = {
    'BARD1': [
        ('../Data/SASA/SGE/BARD1_1JM7_SASAcalculation_B.csv', 'B'),
        ('../Data/SASA/SGE/BARD1_3C5R_SASAcalculation_B.csv', 'B'),
        ('../Data/SASA/SGE/BARD1_3FA2_SASAcalculation_B.csv', 'B'),
    ],
    'RAD51D': [
        ('../Data/SASA/SGE/RAD51D_XRCC2_8OUZ_SASAcalculation_C_D.csv', 'C'),
    ],
    'XRCC2': [
        ('../Data/SASA/SGE/RAD51D_XRCC2_8OUZ_SASAcalculation_C_D.csv', 'D'),
    ],
    'PALB2': [
        ('../Data/SASA/SGE/PALB2_2W18_SASAcalculation_A.csv', 'A'),
        ('../Data/SASA/SGE/PALB2_8YAP_SASAcalculation_A_B.csv', 'A'),
    ],
    'SFPQ': [
        ('../Data/SASA/SGE/SFPQ_6OWJ_SASAcalculation_A.csv', 'A'),
    ],
}

# SGE score workbooks, one Excel file per gene (read by read_sge).
sge_genes = {
    'BARD1': '../Data/filtered_ppj_data/SGE/BARD1.xlsx',
    'RAD51D': '../Data/filtered_ppj_data/SGE/RAD51D.xlsx',
    'PALB2': '../Data/filtered_ppj_data/SGE/PALB2.xlsx',
    'SFPQ': '../Data/filtered_ppj_data/SGE/SFPQ.xlsx',
    'XRCC2': '../Data/filtered_ppj_data/SGE/XRCC2.xlsx',
}

# Turn off Altair's max-rows limit so charts can be built from large DataFrames.
alt.data_transformers.disable_max_rows()

In [None]:
def read_sasa(sasa_dict):
    """Load per-residue SASA tables and key every residue by an ``AAid`` string.

    Parameters
    ----------
    sasa_dict : dict
        Maps gene name -> list of ``(path, chain)`` tuples. ``path`` is passed
        to ``pd.read_csv``; ``chain`` is the value of the ``Chain`` column to
        keep. Each CSV must provide the columns ``ResidNr``, ``ResidNe``,
        ``Chain`` and ``Q.SASA.``.

    Returns
    -------
    dict
        Gene name -> DataFrame with columns ``AAid`` (one-letter residue code
        followed by the residue number, e.g. ``'A12'``) and ``Q.SASA.``. Rows
        from all files of a gene are stacked under a fresh 0..n-1 index.

    Notes
    -----
    Rows with ``ResidNr <= 0`` and residues whose three-letter code is not one
    of the 20 standard amino acids are dropped, since no ``AAid`` can be built
    for them.

    NOTE(review): a residue present in more than one file of the same gene is
    kept once per file; an inner merge on ``AAid`` then repeats the matching
    SGE rows. Confirm the structures do not overlap or deduplicate upstream.
    """
    three_to_one = {
        'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D', 'CYS': 'C',
        'GLN': 'Q', 'GLU': 'E', 'GLY': 'G', 'HIS': 'H', 'ILE': 'I',
        'LEU': 'L', 'LYS': 'K', 'MET': 'M', 'PHE': 'F', 'PRO': 'P',
        'SER': 'S', 'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V',
    }

    sasa_dfs = {}
    for gene, file_list in sasa_dict.items():
        gene_frames = []
        for path, chain in file_list:
            raw = pd.read_csv(path)
            one_letter = raw['ResidNe'].map(three_to_one)

            # One combined mask: positive residue number, requested chain, and
            # a residue name that maps to a one-letter code.
            keep = (raw['ResidNr'] > 0) & (raw['Chain'] == chain) & one_letter.notna()
            selected = raw.loc[keep]

            # Build a new frame instead of assigning columns onto a filtered
            # slice, which is what triggers SettingWithCopyWarning.
            # The int cast keeps the id as 'A12' even if ResidNr was read as
            # float (which would otherwise render as 'A12.0').
            residue_number = selected['ResidNr'].astype(int).astype(str)
            gene_frames.append(pd.DataFrame({
                'AAid': one_letter.loc[keep] + residue_number,
                'Q.SASA.': selected['Q.SASA.'],
            }))

        # ignore_index avoids repeated row labels across the stacked files.
        sasa_dfs[gene] = pd.concat(gene_frames, ignore_index=True)

    return sasa_dfs

In [None]:
def read_sge(sge_dict):
    """Load the SGE score workbook of every gene and derive the residue id.

    Parameters
    ----------
    sge_dict : dict
        Maps gene name -> path of an Excel file readable by ``pd.read_excel``.
        Each sheet must provide the columns ``amino_acid_change`` and ``score``.

    Returns
    -------
    dict
        Gene name -> DataFrame with columns ``amino_acid_change``, ``score``
        and ``AAid``, where ``AAid`` is ``amino_acid_change`` without its last
        character (e.g. ``'A12V'`` -> ``'A12'``).

    Notes
    -----
    Rows whose ``amino_acid_change`` is ``'---'`` or missing are dropped. A
    missing value previously slipped past the ``'---'`` filter and crashed the
    string slicing with ``TypeError``.
    """
    sge_dfs = {}
    for gene, path in sge_dict.items():
        raw = pd.read_excel(path)
        changes = raw['amino_acid_change']

        # '---' is the placeholder this analysis excludes (presumably variants
        # without a protein-level change); missing cells cannot yield an AAid.
        keep = changes.notna() & (changes != '---')

        # Select first, then derive the new column with .assign, so nothing is
        # written onto a filtered slice (avoids SettingWithCopyWarning).
        sge_dfs[gene] = (
            raw.loc[keep, ['amino_acid_change', 'score']]
            .assign(AAid=lambda frame: frame['amino_acid_change'].astype(str).str[:-1])
        )

    return sge_dfs

In [None]:
def merge(sge, sasa, threshold=0.2):
    """Join SGE scores with SASA values per gene and label accessibility.

    Parameters
    ----------
    sge : dict
        Gene name -> DataFrame with an ``AAid`` column (as built by read_sge).
    sasa : dict
        Gene name -> DataFrame with ``AAid`` and ``Q.SASA.`` columns (as built
        by read_sasa). Must contain every gene present in ``sge``.
    threshold : float, optional
        Rows with ``Q.SASA. >= threshold`` are labelled ``'Yes'`` in the
        ``accessible`` column, all others ``'No'``. Defaults to 0.2, the value
        that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        Inner join on ``AAid`` for every gene, stacked into one frame with a
        fresh 0..n-1 index and an added ``accessible`` column.

    Raises
    ------
    KeyError
        If a gene in ``sge`` has no entry in ``sasa``.
    """
    merged_dfs = []
    for gene, sge_df in sge.items():
        merged = pd.merge(sge_df, sasa[gene], on='AAid', how='inner')

        # Default to 'No', then flip rows at or above the cutoff. A missing
        # Q.SASA. compares False and therefore stays 'No'.
        merged['accessible'] = 'No'
        merged.loc[merged['Q.SASA.'] >= threshold, 'accessible'] = 'Yes'

        merged_dfs.append(merged)

    # ignore_index: each per-gene merge starts at 0, so plain stacking would
    # repeat row labels across genes.
    return pd.concat(merged_dfs, ignore_index=True)

In [None]:
def boxplot(df):
    """Plot SGE score by accessibility label and report a Mann-Whitney U test.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``accessible`` column holding ``'Yes'`` / ``'No'`` and
        a numeric ``score`` column.

    Returns
    -------
    None
        Prints the number of rows per label and the two-sided Mann-Whitney U
        p-value for 'Yes' versus 'No' scores, then displays the chart.
    """
    # Black-on-white styling for every part of the box mark.
    mark_style = dict(
        size=50,
        box={'fill': 'white', 'stroke': 'black', 'strokeWidth': 1},
        median={'stroke': 'black', 'strokeWidth': 2},
        outliers={'fill': 'black', 'stroke': 'black'},
        ticks={'stroke': 'black', 'strokeWidth': 1},
        rule={'stroke': 'black', 'strokeWidth': 1},
    )

    chart = (
        alt.Chart(df)
        .mark_boxplot(**mark_style)
        .encode(
            x=alt.X('accessible:N', sort=['Yes', 'No']),
            y=alt.Y('score:Q'),
        )
        .properties(
            width=150,
            height=600,
            title='SGE Score vs. Accessibility',
        )
    )

    # Split the scores by label for the rank-sum comparison.
    accessible_scores = df.loc[df['accessible'] == 'Yes', 'score'].tolist()
    buried_scores = df.loc[df['accessible'] == 'No', 'score'].tolist()

    _, p_value = stats.mannwhitneyu(accessible_scores, buried_scores, alternative='two-sided')

    # The counts are rows of df per label, taken from one value_counts call.
    label_counts = df['accessible'].value_counts()
    print('Accessible Residues: ', label_counts.get('Yes', 0), '\n',
          'Inaccessible Residues: ', label_counts.get('No', 0), '\n',
          'p-value: ', str(p_value)
         )
    chart.display()

In [None]:
def main():
    """Run the analysis end to end: load both tables, join them, and plot.

    Reads the module-level ``sasa`` and ``sge_genes`` path dictionaries.
    """
    accessibility_by_gene = read_sasa(sasa)
    scores_by_gene = read_sge(sge_genes)
    combined = merge(scores_by_gene, accessibility_by_gene)
    boxplot(combined)

In [None]:
# Run the analysis: prints the group counts and p-value, then shows the boxplot.
main()