In [8]:
import pandas as pd
import numpy as np

from natsort import natsort_keygen

In [9]:
def read_and_filter_data(
    path, 
    effect_std_filter=2,
    times_seen_filter=2,
    n_selections_filter=2,
    clip_effect=-5 
):
    """Load a functional-effects CSV, quality-filter it, and add wildtype rows.

    The CSV must provide the columns ``site``, ``wildtype``, ``mutant``,
    ``effect``, ``effect_std``, ``times_seen`` and ``n_selections``.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file (passed straight to ``pd.read_csv``).
    effect_std_filter : float, default 2
        Keep rows with ``effect_std <= effect_std_filter``.
    times_seen_filter : float, default 2
        Keep rows with ``times_seen >= times_seen_filter``.
    n_selections_filter : float, default 2
        Keep rows with ``n_selections >= n_selections_filter``.
    clip_effect : float or None, default -5
        Lower bound applied to ``effect``; values below it are raised to it.
        ``None`` disables clipping.

    Returns
    -------
    pandas.DataFrame
        The filtered rows (mutants ``"*"`` and ``"-"`` removed) plus one
        synthetic row per distinct ``(site, wildtype)`` pair in which
        ``mutant == wildtype``, ``effect`` and ``effect_std`` are 0.0 and
        ``times_seen`` / ``n_selections`` are NaN. ``site`` is converted to
        ``str``, so the final ordering by ``['site', 'mutant']`` is
        lexicographic on the site label, not numeric.
    """
    print(f'Reading data from {path}')
    print(
        f"Filtering for:\n"
        f"  effect_std <= {effect_std_filter}\n"
        f"  times_seen >= {times_seen_filter}\n"
        f"  n_selections >= {n_selections_filter}"
    )
    print(f"Clipping effect values at {clip_effect}")

    filtered = pd.read_csv(path).query(
        'effect_std <= @effect_std_filter and '
        'times_seen >= @times_seen_filter and '
        'n_selections >= @n_selections_filter'
    ).query(
        'mutant not in ["*", "-"]' # don't want stop codons/indels
    )

    # Use the caller-supplied floor: the previous hard-coded -5 silently
    # ignored `clip_effect` even though its value was printed above.
    # `.assign` returns a new frame instead of writing into the query result.
    filtered = filtered.assign(
        site=filtered['site'].astype(str),
        effect=filtered['effect'].clip(lower=clip_effect),
    )

    # add wildtype sites with zero effect
    wildtype_rows = filtered[['site', 'wildtype']].drop_duplicates().assign(
        mutant=lambda x: x['wildtype'],
        effect=0.0,
        effect_std=0.0,
        times_seen=np.nan,
        n_selections=np.nan
    )

    return (
        pd.concat([filtered, wildtype_rows], ignore_index=True)
        .sort_values(['site', 'mutant'])
        .reset_index(drop=True)
    )

# Load the three effect tables with the default filters, in this order:
#   H3_effects <- MDCKSIAT1 file, H5_effects <- 293T file, H7_effects <- 293_2-6 file
H3_effects, H5_effects, H7_effects = map(
    read_and_filter_data,
    (
        '../data/MDCKSIAT1_entry_func_effects.csv',
        '../data/293T_entry_func_effects.csv',
        '../data/293_2-6_entry_func_effects.csv',
    ),
)

Reading data from ../data/MDCKSIAT1_entry_func_effects.csv
Filtering for:
  effect_std <= 2
  times_seen >= 2
  n_selections >= 2
Clipping effect values at -5
Reading data from ../data/293T_entry_func_effects.csv
Filtering for:
  effect_std <= 2
  times_seen >= 2
  n_selections >= 2
Clipping effect values at -5
Reading data from ../data/293_2-6_entry_func_effects.csv
Filtering for:
  effect_std <= 2
  times_seen >= 2
  n_selections >= 2
Clipping effect values at -5


In [10]:
# mean effects of mutations at each site (excluding wildtype)

def _mean_site_effects(effects):
    """Return the mean `effect` of non-wildtype mutations per site.

    Rows where ``mutant == wildtype`` are dropped, the remaining rows are
    averaged per ``(site, wildtype)`` pair, and the result is ordered by
    ``site`` using natural sort (so string site labels sort like numbers).
    The returned frame has the columns ``site``, ``wildtype`` and ``effect``
    and a fresh 0..n-1 index.
    """
    return (
        effects.query('mutant != wildtype')
        .groupby(['site', 'wildtype'], as_index=False)['effect']
        .mean()
        .sort_values("site", key=natsort_keygen())
        .reset_index(drop=True)
    )


# One shared helper instead of three copy-pasted pipelines, so the
# per-subtype averages cannot drift apart.
avg_H3_effects = _mean_site_effects(H3_effects)
avg_H5_effects = _mean_site_effects(H5_effects)
avg_H7_effects = _mean_site_effects(H7_effects)

In [11]:
# `h3_site` is forced to object dtype so it is not parsed as a number; later
# cells join it against the string-typed `site` column of the effect tables.
# NOTE(review): `h5_site` / `h7_site` are left to dtype inference although they
# are joined against string `site` keys as well -- confirm they load as object.
structural_aln = pd.read_csv(
    filepath_or_buffer='../results/structural_alignment/structural_alignment.csv',
    dtype={'h3_site': object},
)
structural_aln.head()

Unnamed: 0,struct_site,4o5n_aa,4r8w_aa,4kwm_aa,4o5n_aa_pdb_site,4o5n_aa_RSA,4o5n_aa_SS,4o5n_aa_chain,4kwm_aa_pdb_site,4kwm_aa_RSA,...,4r8w_aa_chain,rmsd_h3h5,rmsd_h3h7,rmsd_h5h7,h3_site,h3_wt_aa,h5_site,h5_wt_aa,h7_site,h7_wt_aa
0,9,P,-,P,9.0,1.084277,-,A,-1.0,1.140252,...,,9.178066,,,9,S,9,K,,
1,10,G,-,G,10.0,0.150962,-,A,0.0,0.175962,...,,8.183151,,,10,T,10,S,,
2,11,A,D,D,11.0,0.050388,E,A,1.0,0.097927,...,A,5.04805,1.735252,4.172437,11,A,11,D,11.0,D
3,12,T,K,Q,12.0,0.268605,E,A,2.0,0.216889,...,A,3.93908,1.489412,4.694308,12,T,12,Q,12.0,K
4,13,L,I,I,13.0,0.0,E,A,3.0,0.0,...,A,3.725425,1.019553,3.75921,13,L,13,I,13.0,I


In [12]:
# combined dataframe with all mutation effects
#
# Built as one left-join chain starting from the structural alignment:
#   1. H3 effects joined on (h3_site, h3_wt_aa); this step introduces `mutant`
#   2. H5 effects joined on (h5_site, h5_wt_aa, mutant)
#   3. H7 effects joined on (h7_site, h7_wt_aa, mutant)
# After each join the helper key columns `site` / `wildtype` are dropped and
# `effect` / `effect_std` get a subtype prefix.
#
# NOTE(review): `mutant` comes from the H3 join only, so alignment rows with no
# H3 match (and mutants filtered out of the H3 table) cannot pick up H5/H7
# effects in steps 2-3 -- confirm this H3-anchored layout is intended.
combined_mutation_effects = (
    structural_aln
    .merge(
        H3_effects[['site', 'wildtype', 'mutant', 'effect', 'effect_std']],
        how='left',
        left_on=['h3_site', 'h3_wt_aa'],
        right_on=['site', 'wildtype'],
    )
    .drop(columns=['site', 'wildtype'])
    .rename(columns={'effect': 'h3_effect', 'effect_std': 'h3_effect_std'})
    .merge(
        H5_effects[['site', 'wildtype', 'mutant', 'effect', 'effect_std']],
        how='left',
        left_on=['h5_site', 'h5_wt_aa', 'mutant'],
        right_on=['site', 'wildtype', 'mutant'],
    )
    .drop(columns=['site', 'wildtype'])
    .rename(columns={'effect': 'h5_effect', 'effect_std': 'h5_effect_std'})
    .merge(
        H7_effects[['site', 'wildtype', 'mutant', 'effect', 'effect_std']],
        how='left',
        left_on=['h7_site', 'h7_wt_aa', 'mutant'],
        right_on=['site', 'wildtype', 'mutant'],
    )
    .drop(columns=['site', 'wildtype'])
    .rename(columns={'effect': 'h7_effect', 'effect_std': 'h7_effect_std'})
    # put `mutant` first, keep every other column in its existing order
    .pipe(
        lambda merged: merged[
            ['mutant'] + [col for col in merged.columns if col != 'mutant']
        ]
    )
)

# NOTE(review): to_csv does not create missing directories, so
# ../results/combined_effects/ must already exist when this cell runs.
combined_mutation_effects.to_csv(
    '../results/combined_effects/combined_mutation_effects.csv',
    index=False,
)
combined_mutation_effects.head()

Unnamed: 0,mutant,struct_site,4o5n_aa,4r8w_aa,4kwm_aa,4o5n_aa_pdb_site,4o5n_aa_RSA,4o5n_aa_SS,4o5n_aa_chain,4kwm_aa_pdb_site,...,h5_site,h5_wt_aa,h7_site,h7_wt_aa,h3_effect,h3_effect_std,h5_effect,h5_effect_std,h7_effect,h7_effect_std
0,A,9,P,-,P,9.0,1.084277,-,A,-1.0,...,9,K,,,0.0151,0.7225,0.0558,0.2918,,
1,C,9,P,-,P,9.0,1.084277,-,A,-1.0,...,9,K,,,-0.408,0.385,-0.4245,0.02737,,
2,D,9,P,-,P,9.0,1.084277,-,A,-1.0,...,9,K,,,0.2361,0.274,0.2039,0.07884,,
3,E,9,P,-,P,9.0,1.084277,-,A,-1.0,...,9,K,,,-0.2463,0.8478,0.1713,0.1021,,
4,F,9,P,-,P,9.0,1.084277,-,A,-1.0,...,9,K,,,0.2061,0.3214,-0.8397,1.348,,


In [13]:
# combined dataframe with average mutation effects at each site
#
# Each avg_*_effects table has one row per (site, wildtype) pair, so these
# left joins add one `avg_<subtype>_effect` column each without multiplying
# the rows of the structural alignment.
combined_site_effects = (
    structural_aln
    .merge(
        avg_H3_effects,
        how='left',
        left_on=['h3_site', 'h3_wt_aa'],
        right_on=['site', 'wildtype'],
    )
    .drop(columns=['site', 'wildtype'])
    .rename(columns={'effect': 'avg_h3_effect'})
    .merge(
        avg_H5_effects,
        how='left',
        left_on=['h5_site', 'h5_wt_aa'],
        right_on=['site', 'wildtype'],
    )
    .drop(columns=['site', 'wildtype'])
    .rename(columns={'effect': 'avg_h5_effect'})
    .merge(
        avg_H7_effects,
        how='left',
        left_on=['h7_site', 'h7_wt_aa'],
        right_on=['site', 'wildtype'],
    )
    .drop(columns=['site', 'wildtype'])
    .rename(columns={'effect': 'avg_h7_effect'})
)

# NOTE(review): to_csv does not create missing directories, so
# ../results/combined_effects/ must already exist when this cell runs.
combined_site_effects.to_csv(
    '../results/combined_effects/combined_site_effects.csv',
    index=False,
)
combined_site_effects.head()

Unnamed: 0,struct_site,4o5n_aa,4r8w_aa,4kwm_aa,4o5n_aa_pdb_site,4o5n_aa_RSA,4o5n_aa_SS,4o5n_aa_chain,4kwm_aa_pdb_site,4kwm_aa_RSA,...,rmsd_h5h7,h3_site,h3_wt_aa,h5_site,h5_wt_aa,h7_site,h7_wt_aa,avg_h3_effect,avg_h5_effect,avg_h7_effect
0,9,P,-,P,9.0,1.084277,-,A,-1.0,1.140252,...,,9,S,9,K,,,-0.050776,-1.062932,
1,10,G,-,G,10.0,0.150962,-,A,0.0,0.175962,...,,10,T,10,S,,,-0.697911,-3.224739,
2,11,A,D,D,11.0,0.050388,E,A,1.0,0.097927,...,4.172437,11,A,11,D,11.0,D,-3.13828,-3.921267,-2.986653
3,12,T,K,Q,12.0,0.268605,E,A,2.0,0.216889,...,4.694308,12,T,12,Q,12.0,K,-1.036219,-0.467449,-1.822106
4,13,L,I,I,13.0,0.0,E,A,3.0,0.0,...,3.75921,13,L,13,I,13.0,I,-3.94105,-3.885729,-3.883289
