# Process Iketani et al Mpro mutational scanning
Convert the raw values into mutation effect estimates.

The next cell is tagged `parameters`, so its values will be replaced by `papermill` parameterization:

In [1]:
# Default values; papermill parameterization replaces them at run time.
# Input: CSV of raw mutational scanning values (loaded with pandas below).
raw_data_csv = "../results/dms/iketani_mpro/raw.csv"
# Input: FASTA file with the wildtype sequence (parsed with Bio.SeqIO below).
wt_seq_fasta = "../results/dms/iketani_mpro/wt.fa"
# Output: CSV that the processed mutation effects are written to.
processed_csv = "../results/dms/iketani_mpro/processed.csv"

Import Python modules:

In [2]:
import Bio.SeqIO

import numpy

import pandas as pd

Read the wildtype sequence.
For some reason, the protein they used starts at the seventh residue of the translated wildtype sequence, so drop the first six residues.
Also exclude the final stop codon:

In [3]:
# parse the single FASTA record and translate it to a protein string
wt_record = Bio.SeqIO.read(wt_seq_fasta, "fasta")
wt_protein = str(wt_record.translate().seq)

# skip the first six residues and drop the final stop codon
wt_seq = wt_protein[6:-1]

wt_seq

'SGFRKMAFPSGKVEGCMVQVTCGTTTLNGLWLDDVVYCPRHVICTSEDMLNPNYEDLLIRKSNHNFLVQAGNVQLRVIGHSMQNCVLKLKVDTANPKTPKYKFVRIQPGQTFSVLACYNGSPSGVYQCAMRPNFTIKGSFLNGSCGSVGFNIDYDCVSFCYMHHMELPTGVHAGTDLEGNFYGPFVDRQTAQAAGTDTTITVNVLAWLYAAVINGDRWFLNRFTTTLNDFNLVAMKYNYEPLTQDHVDILGPLSAQTGIAVLDMCASLKELLQNGMNGRTILGSALLEDEFTPFDVVRQCSGVTFQ'

Read the raw data and convert to mutation effects:

In [4]:
# load the raw measurements
raw_data = pd.read_csv(raw_data_csv)

# wildtype residue reported in the data for each site (rows flagged `WT == 1`)
reported_wildtypes = (
    raw_data.loc[raw_data["WT"] == 1, ["resid", "mut"]]
    .drop_duplicates()
    .rename(columns={"mut": "wildtype"})
)

# attach the reported wildtype to every measurement, keep only the needed
# columns under their new names, and drop rows that have no site number
site_effects = (
    pd.merge(
        raw_data,
        reported_wildtypes,
        on="resid",
        how="left",
        validate="many_to_one",
    )
    .rename(columns={"resid": "site", "AS": "effect", "mut": "mutant"})
    .loc[:, ["site", "wildtype", "mutant", "effect"]]
    .dropna(subset=["site"])
    .astype({"site": int})
)

# the data lack a wildtype for some sites, so take the wildtype identity of
# every site from the reference sequence instead (1-based site numbering)
wt_df = pd.DataFrame(
    [(site, aa) for site, aa in enumerate(wt_seq, start=1)],
    columns=["site", "wildtype"],
)

merged = site_effects.merge(wt_df, on="site", how="outer", validate="many_to_one")

# wherever the data do report a wildtype, it must agree with the reference
reported = merged["wildtype_x"]
mismatch = reported.notnull() & (reported != merged["wildtype_y"])
assert not mismatch.any()

# keep the reference-derived wildtype and set the effect of wildtype
# "mutations" (mutant identical to wildtype) to zero
processed = merged.rename(columns={"wildtype_y": "wildtype"})[
    ["site", "wildtype", "mutant", "effect"]
]
is_wildtype = processed["wildtype"] == processed["mutant"]
processed = processed.assign(
    effect=numpy.where(is_wildtype, 0, processed["effect"]),
)

assert not processed["wildtype"].isnull().any()

processed

Unnamed: 0,site,wildtype,mutant,effect
0,1,S,*,-0.662300
1,1,S,A,0.023196
2,1,S,C,0.008618
3,1,S,D,-0.620688
4,1,S,E,-0.780123
...,...,...,...,...
6055,306,Q,S,1.172006
6056,306,Q,T,0.406673
6057,306,Q,V,0.057967
6058,306,Q,W,-0.301491


In [5]:
# write the processed effects to the output CSV, omitting the row index
processed.to_csv(path_or_buf=processed_csv, index=False)