# Test computation of escape scores
Test computation of "escape scores" by `CodonVariantTable.escape_scores`.

First create a miniature test variant table:

In [1]:
import os
import tempfile

import pandas as pd

import dms_variants.codonvarianttable

# Wild-type gene sequence: two codons (ATG, GGC), the sites mutated below.
geneseq = "ATGGGC"

# One record per (library, barcode, sample) giving that barcode's codon
# substitutions and its count in the sample. Only lib1 has a "post2" sample.
variant_counts = pd.DataFrame.from_records(
    [
        ("lib1", "AA", "", "pre", 10),
        ("lib1", "AG", "ATG1CAA", "pre", 20),
        ("lib2", "TT", "ATG1ATC", "pre", 30),
        ("lib2", "TA", "ATG1ATC GGC2GGG", "pre", 40),
        ("lib1", "AA", "", "post", 1),
        ("lib1", "AG", "ATG1CAA", "post", 30),
        ("lib2", "TT", "ATG1ATC", "post", 15),
        ("lib2", "TA", "ATG1ATC GGC2GGG", "post", 5),
        ("lib1", "AA", "", "post2", 20),
        ("lib1", "AG", "ATG1CAA", "post2", 40),
    ],
    columns=["library", "barcode", "codon_substitutions", "sample", "count"],
).assign(variant_call_support=1)  # every variant gets the same call support

# Write the counts to a file inside a temporary directory instead of an open
# `NamedTemporaryFile`: re-opening a still-open named temporary file by name
# is platform dependent (it fails on Windows with the default arguments), and
# the variant table is built by reading the CSV from its path.
with tempfile.TemporaryDirectory() as tmpdir:
    variant_count_csv = os.path.join(tmpdir, "variant_counts.csv")
    variant_counts.to_csv(variant_count_csv, index=False)
    variants = dms_variants.codonvarianttable.CodonVariantTable.from_variant_count_df(
        variant_count_df_file=variant_count_csv, geneseq=geneseq
    )

variants.variant_count_df

Unnamed: 0,library,sample,barcode,count,variant_call_support,codon_substitutions,aa_substitutions,n_codon_substitutions,n_aa_substitutions
0,lib1,pre,AG,20,1,ATG1CAA,M1Q,1,1
1,lib1,pre,AA,10,1,,,0,0
2,lib1,post,AG,30,1,ATG1CAA,M1Q,1,1
3,lib1,post,AA,1,1,,,0,0
4,lib1,post2,AG,40,1,ATG1CAA,M1Q,1,1
5,lib1,post2,AA,20,1,,,0,0
6,lib2,pre,TA,40,1,ATG1ATC GGC2GGG,M1I,2,1
7,lib2,pre,TT,30,1,ATG1ATC,M1I,1,1
8,lib2,post,TT,15,1,ATG1ATC,M1I,1,1
9,lib2,post,TA,5,1,ATG1ATC GGC2GGG,M1I,2,1


Now compute escape scores using a score type of `minus_log_bind`:

In [2]:
# Each row gives a selection name, a library, the names of the pre- and
# post-selection samples to compare, and a `frac_escape` value.
sample_columns = ["name", "library", "pre_sample", "post_sample", "frac_escape"]
sample_records = [
    ("name1", "lib1", "pre", "post", 0.59),
    ("name1", "lib2", "pre", "post", 0.62),
    ("name2", "lib1", "pre", "post2", 0.2),
]
sample_df = pd.DataFrame.from_records(sample_records, columns=sample_columns)

# Per-barcode scores with every optional argument left at its default.
variants.escape_scores(
    sample_df,
    score_type="minus_log_bind",
)

Unnamed: 0,name,library,pre_sample,post_sample,barcode,score,score_var,pre_count,post_count,codon_substitutions,n_codon_substitutions,aa_substitutions,n_aa_substitutions
0,name1,lib1,pre,post,AG,2.740562,0.318338,20,30,ATG1CAA,1,M1Q,1
1,name1,lib1,pre,post,AA,0.122887,0.010479,10,1,,0,,0
2,name1,lib2,pre,post,TA,0.483315,0.032764,40,5,ATG1ATC GGC2GGG,2,M1I,1
3,name1,lib2,pre,post,TT,6.643856,0.0,30,15,ATG1ATC,1,M1I,1
4,name2,lib1,pre,post2,AG,0.323371,0.001021,20,40,ATG1CAA,1,M1Q,1
5,name2,lib1,pre,post2,AA,0.319115,0.007043,10,20,,0,,0


With a different floor:

In [3]:
# Same selections and score type, but pass `floor_B` explicitly instead of
# relying on its default.
variants.escape_scores(
    sample_df,
    score_type="minus_log_bind",
    floor_B=0.001,
)

Unnamed: 0,name,library,pre_sample,post_sample,barcode,score,score_var,pre_count,post_count,codon_substitutions,n_codon_substitutions,aa_substitutions,n_aa_substitutions
0,name1,lib1,pre,post,AG,2.740562,0.318338,20,30,ATG1CAA,1,M1Q,1
1,name1,lib1,pre,post,AA,0.122887,0.010479,10,1,,0,,0
2,name1,lib2,pre,post,TA,0.483315,0.032764,40,5,ATG1ATC GGC2GGG,2,M1I,1
3,name1,lib2,pre,post,TT,9.965784,0.0,30,15,ATG1ATC,1,M1I,1
4,name2,lib1,pre,post2,AG,0.323371,0.001021,20,40,ATG1CAA,1,M1Q,1
5,name2,lib1,pre,post2,AA,0.319115,0.007043,10,20,,0,,0


With a different pseudocount:

In [4]:
# Same selections and score type, but pass `pseudocount` explicitly instead
# of relying on its default.
variants.escape_scores(
    sample_df,
    score_type="minus_log_bind",
    pseudocount=2,
)

Unnamed: 0,name,library,pre_sample,post_sample,barcode,score,score_var,pre_count,post_count,codon_substitutions,n_codon_substitutions,aa_substitutions,n_aa_substitutions
0,name1,lib1,pre,post,AG,2.587813,0.262882,20,30,ATG1CAA,1,M1Q,1
1,name1,lib1,pre,post,AA,0.223114,0.017847,10,1,,0,,0
2,name1,lib2,pre,post,TA,0.55345,0.03341,40,5,ATG1ATC GGC2GGG,2,M1I,1
3,name1,lib2,pre,post,TT,6.643856,0.0,30,15,ATG1ATC,1,M1I,1
4,name2,lib1,pre,post2,AG,0.32706,0.001064,20,40,ATG1CAA,1,M1Q,1
5,name2,lib1,pre,post2,AA,0.312566,0.005906,10,20,,0,,0


By amino-acid substitution (again with `floor_B=0.001`):

In [5]:
# Score by amino-acid substitutions rather than by barcode, keeping the
# explicit `floor_B` of 0.001.
variants.escape_scores(
    sample_df,
    by="aa_substitutions",
    score_type="minus_log_bind",
    floor_B=0.001,
)

Unnamed: 0,name,library,pre_sample,post_sample,aa_substitutions,score,score_var,pre_count,post_count,n_aa_substitutions
0,name1,lib1,pre,post,M1Q,2.740562,0.318338,20,30,1
1,name1,lib1,pre,post,,0.122887,0.010479,10,1,0
2,name1,lib2,pre,post,M1I,1.395929,0.0,70,20,1
3,name2,lib1,pre,post2,M1Q,0.323371,0.001021,20,40,1
4,name2,lib1,pre,post2,,0.319115,0.007043,10,20,0


Now with score type `log_escape`:

In [6]:
# Switch the score type to `log_escape`, leaving all other arguments at
# their defaults.
variants.escape_scores(
    sample_df,
    score_type="log_escape",
)

Unnamed: 0,name,library,pre_sample,post_sample,barcode,score,score_var,pre_count,post_count,codon_substitutions,n_codon_substitutions,aa_substitutions,n_aa_substitutions
0,name1,lib1,pre,post,AG,-0.233831,0.0109,20,30,ATG1CAA,1,M1Q,1
1,name1,lib1,pre,post,AA,-3.614372,0.796134,10,1,,0,,0
2,name1,lib2,pre,post,TA,-1.812649,0.175442,40,5,ATG1ATC GGC2GGG,2,M1I,1
3,name1,lib2,pre,post,TT,0.0,0.0,30,15,ATG1ATC,1,M1I,1
4,name2,lib1,pre,post2,AG,-2.316171,0.016332,20,40,ATG1CAA,1,M1Q,1
5,name2,lib1,pre,post2,AA,-2.333235,0.118633,10,20,,0,,0


With a different floor:

In [7]:
# `log_escape` scores with `floor_E` passed explicitly instead of relying on
# its default.
variants.escape_scores(
    sample_df,
    score_type="log_escape",
    floor_E=0.1,
)

Unnamed: 0,name,library,pre_sample,post_sample,barcode,score,score_var,pre_count,post_count,codon_substitutions,n_codon_substitutions,aa_substitutions,n_aa_substitutions
0,name1,lib1,pre,post,AG,-0.233831,0.0109,20,30,ATG1CAA,1,M1Q,1
1,name1,lib1,pre,post,AA,-3.321928,0.240153,10,1,,0,,0
2,name1,lib2,pre,post,TA,-1.812649,0.175442,40,5,ATG1ATC GGC2GGG,2,M1I,1
3,name1,lib2,pre,post,TT,0.0,0.0,30,15,ATG1ATC,1,M1I,1
4,name2,lib1,pre,post2,AG,-2.316171,0.016332,20,40,ATG1CAA,1,M1Q,1
5,name2,lib1,pre,post2,AA,-2.333235,0.118633,10,20,,0,,0


With no ceiling:

In [8]:
# `log_escape` scores with `ceil_E=None`, i.e. no ceiling.
variants.escape_scores(
    sample_df,
    score_type="log_escape",
    ceil_E=None,
)

Unnamed: 0,name,library,pre_sample,post_sample,barcode,score,score_var,pre_count,post_count,codon_substitutions,n_codon_substitutions,aa_substitutions,n_aa_substitutions
0,name1,lib1,pre,post,AG,-0.233831,0.0109,20,30,ATG1CAA,1,M1Q,1
1,name1,lib1,pre,post,AA,-3.614372,0.796134,10,1,,0,,0
2,name1,lib2,pre,post,TA,-1.812649,0.175442,40,5,ATG1ATC GGC2GGG,2,M1I,1
3,name1,lib2,pre,post,TT,0.091229,0.02946,30,15,ATG1ATC,1,M1I,1
4,name2,lib1,pre,post2,AG,-2.316171,0.016332,20,40,ATG1CAA,1,M1Q,1
5,name2,lib1,pre,post2,AA,-2.333235,0.118633,10,20,,0,,0


Now with score type `frac_escape`:

In [9]:
# Switch the score type to `frac_escape`, leaving all other arguments at
# their defaults.
variants.escape_scores(
    sample_df,
    score_type="frac_escape",
)

Unnamed: 0,name,library,pre_sample,post_sample,barcode,score,score_var,pre_count,post_count,codon_substitutions,n_codon_substitutions,aa_substitutions,n_aa_substitutions
0,name1,lib1,pre,post,AG,0.850373,0.003728,20,30,ATG1CAA,1,M1Q,1
1,name1,lib1,pre,post,AA,0.081652,0.004028,10,1,,0,,0
2,name1,lib2,pre,post,TA,0.284668,0.007665,40,5,ATG1ATC GGC2GGG,2,M1I,1
3,name1,lib2,pre,post,TT,1.0,0.0,30,15,ATG1ATC,1,M1I,1
4,name2,lib1,pre,post2,AG,0.2008,0.000314,20,40,ATG1CAA,1,M1Q,1
5,name2,lib1,pre,post2,AA,0.198439,0.002187,10,20,,0,,0


Changing ceiling and floor:

In [10]:
# `frac_escape` scores with no ceiling (`ceil_E=None`) and an explicit
# `floor_E` of 0.1.
variants.escape_scores(
    sample_df,
    score_type="frac_escape",
    floor_E=0.1,
    ceil_E=None,
)

Unnamed: 0,name,library,pre_sample,post_sample,barcode,score,score_var,pre_count,post_count,codon_substitutions,n_codon_substitutions,aa_substitutions,n_aa_substitutions
0,name1,lib1,pre,post,AG,0.850373,0.003728,20,30,ATG1CAA,1,M1Q,1
1,name1,lib1,pre,post,AA,0.1,0.001532,10,1,,0,,0
2,name1,lib2,pre,post,TA,0.284668,0.007665,40,5,ATG1ATC GGC2GGG,2,M1I,1
3,name1,lib2,pre,post,TT,1.065277,0.015926,30,15,ATG1ATC,1,M1I,1
4,name2,lib1,pre,post2,AG,0.2008,0.000314,20,40,ATG1CAA,1,M1Q,1
5,name2,lib1,pre,post2,AA,0.198439,0.002187,10,20,,0,,0
