# Process Flynn et al (2022) Mpro mutational scanning
Process the raw normalized functional scores into mutation effect estimates.

Get variables from `snakemake`:

In [None]:
# Take the file paths from the `snakemake` object when one is defined in this
# namespace; otherwise fall back to the hard-coded paths below.
if "snakemake" not in globals() and "snakemake" not in locals():
    # running interactively
    raw_data_csv, wt_seq_fasta = (
        "../results/dms/flynn_mpro_2022/raw.csv",
        "../data/Mpro.fa",
    )
    processed_csv = "../results/dms/flynn_mpro_2022/processed.csv"
else:
    # inputs
    raw_data_csv, wt_seq_fasta = (
        snakemake.input.raw_data,
        snakemake.input.wt_seq,
    )
    # output
    processed_csv = snakemake.output.processed

Import Python modules:

In [None]:
import Bio.SeqIO

import numpy

import pandas as pd

Read the wildtype sequence:

In [None]:
# parse the FASTA file (`Bio.SeqIO.read` expects exactly one record) and keep
# the sequence as a plain string
wt_record = Bio.SeqIO.read(wt_seq_fasta, "fasta")
wt_seq = str(wt_record.seq)

wt_seq

Read and process the data.
We average the replicates for each assay, and then report the effect of each mutation for each assay as well as its average over assays:

In [None]:
# Read the raw data. Note the input is actually an Excel workbook and not a CSV;
# `sheet_name=None` loads every sheet into a dict keyed by sheet name.
raw_data = pd.read_excel(raw_data_csv, sheet_name=None)

# the assay is the first whitespace-delimited token of each sheet name
assays = sorted({sheet_name.split()[0] for sheet_name in raw_data})
print(f"Found data for {assays=}")

# wildtype residue at each site, numbering sites from 1
wt_by_site = dict(enumerate(wt_seq, start=1))

# stack all sheets into one frame, recording the sheet each row came from
stacked = pd.concat(
    [
        sheet_df.assign(sheet=sheet_name).rename(columns={" aa": "aa"})
        for sheet_name, sheet_df in raw_data.items()
    ]
)

# mean effect of each mutation within each assay, averaging over the rows
# (sheets) that share the same assay
per_assay = (
    stacked
    # WT rows have normalized functional scores of 1 by definition
    .query("Position != 'WT'")
    .assign(
        site=lambda d: d["Position"].astype(int),
        assay=lambda d: d["sheet"].str.split().str[0],
        effect=lambda d: d["functional score (normalized)"],
        wildtype=lambda d: d["site"].map(wt_by_site),
    )
    .query("effect.notnull()")
    .query("wildtype != aa")  # do not keep wildtype to wildtype mutations
    .groupby(["assay", "site", "wildtype", "aa"], as_index=False)
    .aggregate(effect=pd.NamedAgg(column="effect", aggfunc="mean"))
)

# reshape to one row per mutation with one column per assay
by_assay = per_assay.pivot_table(
    index=["site", "wildtype", "aa"],
    columns="assay",
    values="effect",
)

# the overall effect is the mean across the assay columns, skipping missing values
with_overall = by_assay.assign(effect=by_assay.mean(axis=1, skipna=True))

column_order = ["site", "wildtype", "mutant", "effect", "growth", "FRET", "TF"]
df = (
    with_overall
    .reset_index()
    .rename(columns={"aa": "mutant", "Growth": "growth"})
)[column_order]

df

In [None]:
# write the processed effects without the row index, formatting floats to
# five significant digits
df.to_csv(
    processed_csv,
    float_format="%.5g",
    index=False,
)