In [1]:
import pandas as pd
import numpy as np

from natsort import natsort_keygen

In [2]:
# Parameters for papermill
effect_std_filter = 2
times_seen_filter = 2
n_selections_filter = 2
clip_effect = -5

In [3]:
# Parameters
effect_std_filter = 2
times_seen_filter = 2
n_selections_filter = 2
clip_effect = -5


In [4]:
def read_and_filter_data(
    path, 
    effect_std_filter=2,
    times_seen_filter=2,
    n_selections_filter=2,
    clip_effect=-5 
):
    print(f'Reading data from {path}')
    print(
        f"Filtering for:\n"
        f"  effect_std <= {effect_std_filter}\n"
        f"  times_seen >= {times_seen_filter}\n"
        f"  n_selections >= {n_selections_filter}"
    )
    print(f"Clipping effect values at {clip_effect}")

    df = pd.read_csv(path).query(
        'effect_std <= @effect_std_filter and \
        times_seen >= @times_seen_filter and \
        n_selections >= @n_selections_filter'
    ).query(
        'mutant not in ["*", "-"]' # don't want stop codons/indels
    )

    df['site'] = df['site'].astype(str)
    df['effect'] = df['effect'].clip(clip_effect)

    df = pd.concat([
        df,
        df[['site', 'wildtype']].drop_duplicates().assign(
            mutant=lambda x: x['wildtype'],
            effect=0.0,
            effect_std=0.0,
            times_seen=np.nan,
            n_selections=np.nan
        ) # add wildtype sites with zero effect
    ], ignore_index=True).sort_values(['site', 'mutant']).reset_index(drop=True)
    
    return df

H3_effects = read_and_filter_data(
    '../data/cell_entry_effects/MDCKSIAT1_entry_func_effects.csv',
    effect_std_filter=effect_std_filter,
    times_seen_filter=times_seen_filter,
    n_selections_filter=n_selections_filter,
    clip_effect=clip_effect
)
H5_effects = read_and_filter_data(
    '../data/cell_entry_effects/293T_entry_func_effects.csv',
    effect_std_filter=effect_std_filter,
    times_seen_filter=times_seen_filter,
    n_selections_filter=n_selections_filter,
    clip_effect=clip_effect
)
H7_effects = read_and_filter_data(
    '../data/cell_entry_effects/293_mix_entry_func_effects.csv',
    effect_std_filter=effect_std_filter,
    times_seen_filter=times_seen_filter,
    n_selections_filter=n_selections_filter,
    clip_effect=clip_effect
)

Reading data from ../data/cell_entry_effects/MDCKSIAT1_entry_func_effects.csv
Filtering for:
  effect_std <= 2
  times_seen >= 2
  n_selections >= 2
Clipping effect values at -5
Reading data from ../data/cell_entry_effects/293T_entry_func_effects.csv
Filtering for:
  effect_std <= 2
  times_seen >= 2
  n_selections >= 2
Clipping effect values at -5
Reading data from ../data/cell_entry_effects/293_mix_entry_func_effects.csv
Filtering for:
  effect_std <= 2
  times_seen >= 2
  n_selections >= 2
Clipping effect values at -5


In [5]:
# mean effects of mutations at each site (excluding wildtype)

avg_H3_effects = (
    H3_effects.query('mutant != wildtype')
    .groupby(['site', 'wildtype'], as_index=False)['effect']
    .mean()
    .sort_values("site", key=natsort_keygen())
    .reset_index(drop=True)
)

avg_H5_effects = (
    H5_effects.query('mutant != wildtype')
    .groupby(['site', 'wildtype'], as_index=False)['effect']
    .mean()
    .sort_values("site", key=natsort_keygen())
    .reset_index(drop=True)
)

avg_H7_effects = (
    H7_effects.query('mutant != wildtype')
    .groupby(['site', 'wildtype'], as_index=False)['effect']
    .mean()
    .sort_values("site", key=natsort_keygen())
    .reset_index(drop=True)
)

In [6]:
structural_aln = pd.read_csv(
    '../results/structural_alignment/structural_alignment.csv', dtype={'h3_site': object})
structural_aln.head()

Unnamed: 0,struct_site,h3_site,h5_site,h7_site,h3_wt_aa,h5_wt_aa,h7_wt_aa,rmsd_h3h5,rmsd_h3h7,rmsd_h5h7,4o5n_aa_RSA,4kwm_aa_RSA,6ii9_aa_RSA
0,9,9,9,,S,K,,9.1674,,,1.084277,1.140252,
1,10,10,10,,T,S,,8.157247,,,0.150962,0.175962,
2,11,11,11,11.0,A,D,D,5.04004,2.984626,2.886615,0.050388,0.097927,0.624352
3,12,12,12,12.0,T,Q,K,3.937602,1.626754,3.38435,0.268605,0.216889,0.368644
4,13,13,13,13.0,L,I,I,3.687798,1.734039,2.549524,0.0,0.0,0.0


In [7]:
# combined dataframe with all mutation effects
combined_mutation_effects = pd.merge(
    pd.merge(
        pd.merge(
            structural_aln,
            H3_effects[['site', 'wildtype', 'mutant', 'effect', 'effect_std']],
            left_on=['h3_site', 'h3_wt_aa'],
            right_on=['site', 'wildtype'],
            how='left'
        ).drop(columns=['site', 'wildtype']).rename(
            columns={'effect': 'h3_effect', 'effect_std': 'h3_effect_std'}
        ),
        H5_effects[['site', 'wildtype', 'mutant', 'effect', 'effect_std']],
        left_on=['h5_site', 'h5_wt_aa', 'mutant'],
        right_on=['site', 'wildtype', 'mutant'],
        how='left',
    ).drop(columns=['site', 'wildtype']).rename(
        columns={'effect': 'h5_effect', 'effect_std': 'h5_effect_std'}
    ),
    H7_effects[['site', 'wildtype', 'mutant', 'effect', 'effect_std']],
    left_on=['h7_site', 'h7_wt_aa', 'mutant'],
    right_on=['site', 'wildtype', 'mutant'],
    how='left',
).drop(columns=['site', 'wildtype']).rename(
    columns={'effect': 'h7_effect', 'effect_std': 'h7_effect_std'}
).drop(columns=['h3_site', 'h5_site', 'h7_site'])

combined_mutation_effects = (
    combined_mutation_effects[['mutant'] 
    + [c for c in combined_mutation_effects.columns if c != 'mutant']]
)

combined_mutation_effects.to_csv(
    '../results/combined_effects/combined_mutation_effects.csv', 
    index=False
)
combined_mutation_effects.head()

Unnamed: 0,mutant,struct_site,h3_wt_aa,h5_wt_aa,h7_wt_aa,rmsd_h3h5,rmsd_h3h7,rmsd_h5h7,4o5n_aa_RSA,4kwm_aa_RSA,6ii9_aa_RSA,h3_effect,h3_effect_std,h5_effect,h5_effect_std,h7_effect,h7_effect_std
0,A,9,S,K,,9.1674,,,1.084277,1.140252,,0.0151,0.7225,0.2049,0.2627,,
1,C,9,S,K,,9.1674,,,1.084277,1.140252,,-0.408,0.385,-0.3977,0.1072,,
2,D,9,S,K,,9.1674,,,1.084277,1.140252,,0.2361,0.274,0.2383,0.2087,,
3,E,9,S,K,,9.1674,,,1.084277,1.140252,,-0.2463,0.8478,0.312,0.2815,,
4,F,9,S,K,,9.1674,,,1.084277,1.140252,,0.2061,0.3214,-0.8917,1.202,,


In [8]:
# combined dataframe with average mutation effects at each site
combined_site_effects = pd.merge(
    pd.merge(
        pd.merge(
            structural_aln,
            avg_H3_effects,
            left_on=['h3_site', 'h3_wt_aa'],
            right_on=['site', 'wildtype'],
            how='left'
        ).drop(columns=['site', 'wildtype']).rename(
            columns={'effect': 'avg_h3_effect'}
        ),
        avg_H5_effects,
        left_on=['h5_site', 'h5_wt_aa'],
        right_on=['site', 'wildtype'],
        how='left',
    ).drop(columns=['site', 'wildtype']).rename(
        columns={'effect': 'avg_h5_effect'}
    ),
    avg_H7_effects,
    left_on=['h7_site', 'h7_wt_aa'],
    right_on=['site', 'wildtype'],
    how='left',
).drop(columns=['site', 'wildtype']).rename(
    columns={'effect': 'avg_h7_effect'}
).drop(columns=['h3_site', 'h5_site', 'h7_site'])

combined_site_effects.to_csv(
    '../results/combined_effects/combined_site_effects.csv', 
    index=False
)
combined_site_effects.head()

Unnamed: 0,struct_site,h3_wt_aa,h5_wt_aa,h7_wt_aa,rmsd_h3h5,rmsd_h3h7,rmsd_h5h7,4o5n_aa_RSA,4kwm_aa_RSA,6ii9_aa_RSA,avg_h3_effect,avg_h5_effect,avg_h7_effect
0,9,S,K,,9.1674,,,1.084277,1.140252,,-0.050776,-0.998095,
1,10,T,S,,8.157247,,,0.150962,0.175962,,-0.697911,-3.348267,
2,11,A,D,D,5.04004,2.984626,2.886615,0.050388,0.097927,0.624352,-3.13828,-3.951383,-2.962194
3,12,T,Q,K,3.937602,1.626754,3.38435,0.268605,0.216889,0.368644,-1.036219,-0.342761,-1.705403
4,13,L,I,I,3.687798,1.734039,2.549524,0.0,0.0,0.0,-3.94105,-3.827571,-3.829644
