# Process Iketani et al. Mpro mutational scanning
Convert the raw values into mutation effect estimates.

Get variables from `snakemake`:

In [None]:
# Input/output paths: fall back to fixed relative paths when no `snakemake`
# object is defined (interactive use), otherwise read them from `snakemake`.
if "snakemake" not in globals() and "snakemake" not in locals():
    # running interactively
    raw_data_csv = "../results/dms/iketani_mpro/raw.csv"
    wt_seq_fasta = "../data/Mpro.fa"
    processed_csv = "../results/dms/iketani_mpro/processed.csv"
else:
    # paths supplied by the `snakemake` object
    raw_data_csv = snakemake.input.raw_data
    wt_seq_fasta = snakemake.input.wt_seq
    processed_csv = snakemake.output.processed

Import Python modules:

In [None]:
import Bio.SeqIO

import numpy

import pandas as pd

Read the wildtype sequence:

In [None]:
# Parse the single FASTA record and keep its sequence as a plain string.
wt_record = Bio.SeqIO.read(wt_seq_fasta, "fasta")
wt_seq = str(wt_record.seq)

# Display the sequence as the cell output.
wt_seq

Read the raw data and convert to mutation effects:

In [None]:
raw_data = pd.read_csv(raw_data_csv)

# Wildtype residue at each position, taken from the rows flagged `WT == 1`.
wt_from_data = (
    raw_data.query("WT == 1")[["resid", "mut"]]
    .drop_duplicates()
    .rename(columns={"mut": "wildtype"})
)

# Annotate every measurement with that wildtype, switch to the output column
# names, and keep only the rows that have a site number.
processed = raw_data.merge(
    wt_from_data, on="resid", how="left", validate="many_to_one"
)
processed = processed.rename(
    columns={"resid": "site", "AS": "effect", "mut": "mutant"}
)
processed = processed[["site", "wildtype", "mutant", "effect"]]
processed = processed.query("site.notnull()")
processed = processed.assign(site=processed["site"].astype(int))

# Wildtype at each 1-based site according to the reference sequence.
wt_df = pd.DataFrame(
    list(enumerate(wt_seq, start=1)), columns=["site", "wildtype"]
)

# The raw data lack a wildtype for some sites, so bring in the sequence's
# wildtype for every site; the outer join keeps sites found in either table.
processed = processed.merge(
    wt_df, how="outer", on="site", validate="many_to_one"
)

# Wherever the raw data supplied a wildtype, it must match the sequence.
data_wt = processed["wildtype_x"]
seq_wt = processed["wildtype_y"]
assert (data_wt.isnull() | (data_wt == seq_wt)).all()

# Use the sequence-derived wildtype from here on, and set the effect to zero
# wherever the mutant is the wildtype residue.
processed = processed.rename(columns={"wildtype_y": "wildtype"})
processed = processed[["site", "wildtype", "mutant", "effect"]]
is_mutation = processed["wildtype"] != processed["mutant"]
processed = processed.assign(
    effect=numpy.where(is_mutation, processed["effect"], 0)
)

# Every row must now have a wildtype.
assert processed["wildtype"].notnull().all()

processed

In [None]:
# Write the processed mutation effects to CSV without the index column,
# formatting floats with up to five significant digits.
processed.to_csv(
    processed_csv,
    index=False,
    float_format="%.5g",
)