In [1]:
import pandas as pd
import numpy as np

from natsort import natsort_keygen

In [2]:
def read_and_filter_data(
    path,
    effect_std_filter=2,
    times_seen_filter=2,
    n_selections_filter=2,
):
    """Load a per-mutation effects CSV, filter it, and add wildtype rows.

    The CSV must contain the columns ``site``, ``wildtype``, ``mutant``,
    ``effect_std``, ``times_seen`` and ``n_selections``. The thresholds in
    use are printed before the file is read.

    Parameters
    ----------
    path : str or path-like
        CSV location, handed unchanged to ``pandas.read_csv``.
    effect_std_filter : number, default 2
        Rows are kept when ``effect_std`` is at most this value.
    times_seen_filter : number, default 2
        Rows are kept when ``times_seen`` is at least this value.
    n_selections_filter : number, default 2
        Rows are kept when ``n_selections`` is at least this value.

    Returns
    -------
    pandas.DataFrame
        The rows that pass every filter (with ``mutant`` values ``"*"`` and
        ``"-"`` removed), plus one added row per distinct
        ``(site, wildtype)`` pair among them whose ``mutant`` equals the
        wildtype, with ``effect`` and ``effect_std`` set to 0.0 and
        ``times_seen`` / ``n_selections`` set to NaN. ``site`` is converted
        to ``str``; rows are sorted by ``site`` then ``mutant`` (plain string
        order, so ``"10"`` sorts before ``"2"``) and re-indexed from 0.
    """
    print(f'Reading data from {path}')
    print(
        f"Filtering for:\n"
        f"  effect_std <= {effect_std_filter}\n"
        f"  times_seen >= {times_seen_filter}\n"
        f"  n_selections >= {n_selections_filter}"
    )

    raw = pd.read_csv(path)

    # Comparisons against NaN are False, so rows with a missing statistic
    # are dropped along with those that fail a threshold.
    passes_thresholds = (
        (raw['effect_std'] <= effect_std_filter)
        & (raw['times_seen'] >= times_seen_filter)
        & (raw['n_selections'] >= n_selections_filter)
    )
    # don't want stop codons/indels
    is_substitution = ~raw['mutant'].isin(['*', '-'])

    measured = raw[passes_thresholds & is_substitution].assign(
        site=lambda frame: frame['site'].astype(str)
    )

    # add wildtype sites with zero effect
    wildtype_rows = (
        measured[['site', 'wildtype']]
        .drop_duplicates()
        .assign(
            mutant=lambda frame: frame['wildtype'],
            effect=0.0,
            effect_std=0.0,
            times_seen=np.nan,
            n_selections=np.nan,
        )
    )

    with_wildtype = pd.concat([measured, wildtype_rows], ignore_index=True)
    return with_wildtype.sort_values(['site', 'mutant']).reset_index(drop=True)

# Load the three effect tables with the default filter thresholds:
#   H3 <- MDCKSIAT1 file, H5 <- 293T file, H7 <- 293_2-6 file.
H3_effects = read_and_filter_data(path='../data/MDCKSIAT1_entry_func_effects.csv')
H5_effects = read_and_filter_data(path='../data/293T_entry_func_effects.csv')
H7_effects = read_and_filter_data(path='../data/293_2-6_entry_func_effects.csv')

Reading data from ../data/MDCKSIAT1_entry_func_effects.csv
Filtering for:
  effect_std <= 2
  times_seen >= 2
  n_selections >= 2
Reading data from ../data/293T_entry_func_effects.csv
Filtering for:
  effect_std <= 2
  times_seen >= 2
  n_selections >= 2
Reading data from ../data/293_2-6_entry_func_effects.csv
Filtering for:
  effect_std <= 2
  times_seen >= 2
  n_selections >= 2


In [3]:
# mean effects of mutations at each site (excluding wildtype)

def _mean_site_effects(effects):
    """Average the ``effect`` of all non-wildtype mutants at each site.

    Parameters
    ----------
    effects : pandas.DataFrame
        Must contain ``site``, ``wildtype``, ``mutant`` and ``effect``.

    Returns
    -------
    pandas.DataFrame
        Columns ``site``, ``wildtype`` and ``effect`` (the per-group mean),
        one row per ``(site, wildtype)`` pair, ordered by natural sort of
        ``site`` with a fresh 0..n-1 index.
    """
    return (
        effects.query('mutant != wildtype')
        .groupby(['site', 'wildtype'], as_index=False)['effect']
        .mean()
        # natural sort so that e.g. site "9" precedes site "10"
        .sort_values("site", key=natsort_keygen())
        .reset_index(drop=True)
    )


# One helper call per table replaces three copy-pasted pipelines.
avg_H3_effects = _mean_site_effects(H3_effects)
avg_H5_effects = _mean_site_effects(H5_effects)
avg_H7_effects = _mean_site_effects(H7_effects)

In [4]:
# Structural alignment table. `h3_site` is read as object dtype rather than
# being parsed as a number; every other column keeps pandas' inferred dtype.
# NOTE(review): `h5_site` / `h7_site` are later merged against string `site`
# keys but have no explicit dtype here -- confirm they load as strings.
structural_aln = pd.read_csv(
    '../results/structural_alignment/structural_alignment.csv',
    dtype={'h3_site': object},
)
structural_aln.head()

Unnamed: 0,struct_site,4o5n_aa,4r8w_aa,4kwm_aa,4o5n_aa_RSA,4o5n_aa_SS,4kwm_aa_RSA,4kwm_aa_SS,4r8w_aa_RSA,4r8w_aa_SS,rmsd_h3h5,rmsd_h3h7,rmsd_h5h7,h3_site,h3_wt_aa,h5_site,h5_wt_aa,h7_site,h7_wt_aa
0,9,P,-,P,1.081761,-,1.138365,-,,,9.178066,,,9,S,9,K,,
1,10,G,-,G,0.153846,-,0.173077,-,,,8.183151,,,10,T,10,S,,
2,11,A,D,D,0.054264,E,0.098446,-,0.259067,-,5.04805,1.735252,4.172437,11,A,11,D,11.0,D
3,12,T,K,Q,0.267442,E,0.217778,E,0.279661,E,3.93908,1.489412,4.694308,12,T,12,Q,12.0,K
4,13,L,I,I,0.0,E,0.0,E,0.005076,E,3.725425,1.019553,3.75921,13,L,13,I,13.0,I


In [5]:
# combined dataframe with all mutation effects
#
# Built as three successive left joins onto structural_aln, so no alignment
# row is ever dropped:
#   1. H3 effects are matched on (h3_site, h3_wt_aa); this join is what
#      introduces the `mutant` column.
#   2. H5 effects are matched on (h5_site, h5_wt_aa, mutant).
#   3. H7 effects are matched on (h7_site, h7_wt_aa, mutant).
# After each join the right-hand `site` / `wildtype` key columns are removed
# and `effect` is renamed to the subtype-specific column.
# NOTE(review): because `mutant` comes only from the H3 join, a mutant that
# is missing from H3_effects at a site gets no row here, and any H5/H7 effect
# for it is not carried over -- confirm this H3-anchored layout is intended.
combined_mutation_effects = (
    structural_aln
    .merge(
        H3_effects[['site', 'wildtype', 'mutant', 'effect']],
        left_on=['h3_site', 'h3_wt_aa'],
        right_on=['site', 'wildtype'],
        how='left',
    )
    .drop(columns=['site', 'wildtype'])
    .rename(columns={'effect': 'h3_effect'})
    .merge(
        H5_effects[['site', 'wildtype', 'mutant', 'effect']],
        left_on=['h5_site', 'h5_wt_aa', 'mutant'],
        right_on=['site', 'wildtype', 'mutant'],
        how='left',
    )
    .drop(columns=['site', 'wildtype'])
    .rename(columns={'effect': 'h5_effect'})
    .merge(
        H7_effects[['site', 'wildtype', 'mutant', 'effect']],
        left_on=['h7_site', 'h7_wt_aa', 'mutant'],
        right_on=['site', 'wildtype', 'mutant'],
        how='left',
    )
    .drop(columns=['site', 'wildtype'])
    .rename(columns={'effect': 'h7_effect'})
    # put `mutant` first; all other columns keep their existing order
    .pipe(
        lambda merged: merged[
            ['mutant', *(name for name in merged.columns if name != 'mutant')]
        ]
    )
)

# Written without the index.
# NOTE(review): assumes ../results/combined_effects/ already exists.
combined_mutation_effects.to_csv(
    '../results/combined_effects/combined_mutation_effects.csv', index=False
)
combined_mutation_effects.head()

Unnamed: 0,mutant,struct_site,4o5n_aa,4r8w_aa,4kwm_aa,4o5n_aa_RSA,4o5n_aa_SS,4kwm_aa_RSA,4kwm_aa_SS,4r8w_aa_RSA,...,rmsd_h5h7,h3_site,h3_wt_aa,h5_site,h5_wt_aa,h7_site,h7_wt_aa,h3_effect,h5_effect,h7_effect
0,A,9,P,-,P,1.081761,-,1.138365,-,,...,,9,S,9,K,,,0.0151,0.0558,
1,C,9,P,-,P,1.081761,-,1.138365,-,,...,,9,S,9,K,,,-0.408,-0.4245,
2,D,9,P,-,P,1.081761,-,1.138365,-,,...,,9,S,9,K,,,0.2361,0.2039,
3,E,9,P,-,P,1.081761,-,1.138365,-,,...,,9,S,9,K,,,-0.2463,0.1713,
4,F,9,P,-,P,1.081761,-,1.138365,-,,...,,9,S,9,K,,,0.2061,-0.8397,


In [6]:
# combined dataframe with average mutation effects at each site
#
# Three successive left joins onto structural_aln, one per averaged table,
# each matched on that subtype's (site, wildtype) column pair. After each
# join the right-hand `site` / `wildtype` key columns are removed and
# `effect` is renamed to the subtype-specific average column.
combined_site_effects = (
    structural_aln
    .merge(
        avg_H3_effects,
        left_on=['h3_site', 'h3_wt_aa'],
        right_on=['site', 'wildtype'],
        how='left',
    )
    .drop(columns=['site', 'wildtype'])
    .rename(columns={'effect': 'avg_h3_effect'})
    .merge(
        avg_H5_effects,
        left_on=['h5_site', 'h5_wt_aa'],
        right_on=['site', 'wildtype'],
        how='left',
    )
    .drop(columns=['site', 'wildtype'])
    .rename(columns={'effect': 'avg_h5_effect'})
    .merge(
        avg_H7_effects,
        left_on=['h7_site', 'h7_wt_aa'],
        right_on=['site', 'wildtype'],
        how='left',
    )
    .drop(columns=['site', 'wildtype'])
    .rename(columns={'effect': 'avg_h7_effect'})
)

# Written without the index.
# NOTE(review): assumes ../results/combined_effects/ already exists.
combined_site_effects.to_csv(
    '../results/combined_effects/combined_site_effects.csv', index=False
)
combined_site_effects.head()

Unnamed: 0,struct_site,4o5n_aa,4r8w_aa,4kwm_aa,4o5n_aa_RSA,4o5n_aa_SS,4kwm_aa_RSA,4kwm_aa_SS,4r8w_aa_RSA,4r8w_aa_SS,...,rmsd_h5h7,h3_site,h3_wt_aa,h5_site,h5_wt_aa,h7_site,h7_wt_aa,avg_h3_effect,avg_h5_effect,avg_h7_effect
0,9,P,-,P,1.081761,-,1.138365,-,,,...,,9,S,9,K,,,-0.050776,-1.193879,
1,10,G,-,G,0.153846,-,0.173077,-,,,...,,10,T,10,S,,,-0.697911,-3.704239,
2,11,A,D,D,0.054264,E,0.098446,-,0.259067,-,...,4.172437,11,A,11,D,11.0,D,-3.13828,-4.348767,-2.983488
3,12,T,K,Q,0.267442,E,0.217778,E,0.279661,E,...,4.694308,12,T,12,Q,12.0,K,-1.036219,-0.523449,-1.986117
4,13,L,I,I,0.0,E,0.0,E,0.005076,E,...,3.75921,13,L,13,I,13.0,I,-3.94105,-4.419553,-4.040841
