### Scoring without CDRs
It is possible to score with AntPack while masking out or disregarding user-specified regions of
the sequence. This notebook generates scores using a variety of different masks and examines the
results. Note that scoring using all of the available masks will take a little time, and then
bootstrapping CIs on the AUC-ROC scores is also moderately time-consuming, so you should expect
this notebook to take a few minutes to run.

In [20]:
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from Bio import SeqIO
from sklearn.metrics import roc_auc_score
from antpack import SequenceScoringTool, SingleChainAnnotator

# When launched from somewhere under a "notebooks" folder, step two directory
# levels up before doing anything else.
if "notebooks" in os.getcwd():
    os.chdir(os.path.join("..", ".."))

# Record the working directory unconditionally. Previously this assignment lived
# inside the branch above, so starting the notebook from any other location left
# `current_dir` undefined and the chdir below (and later cells) raised NameError.
current_dir = os.getcwd()

score_tool = SequenceScoringTool(offer_classifier_option = True)
sc_annotator = SingleChainAnnotator()

# Return to the recorded directory after the tools are built.
# NOTE(review): presumably guards against the constructors changing the cwd -- confirm.
os.chdir(current_dir)

We use IMGT definitions of CDRs to create masks for CDR3 only,
CDR3 + CDR2, or all three CDRs for each chain. These masks are then used for masked scoring
(calculating the score with and without the masked regions).

In [6]:
# IMGT position labels treated as CDR residues, keyed first by chain
# ("heavy" / "light") and then by CDR number ("1", "2", "3"). Each value is a set
# of position strings, including letter-suffixed insertion positions.
imgt = {
    "heavy": {
        "3": {
            "105", "106", "107", "108", "109", "110", "111",
            "111A", "111B", "111C", "111D", "111E", "111F",
            "111G", "111H", "111I", "111J", "111K",
            "112L", "112K", "112J", "112I", "112H", "112G",
            "112F", "112E", "112D", "112C", "112B", "112A",
            "112", "113", "114", "115", "116", "117",
        },
        "2": {
            "56", "57", "58", "59", "60",
            "60A", "60B", "60C", "60D", "60E",
            "61E", "61D", "61C", "61B", "61A",
            "61", "62", "63", "64", "65",
        },
        "1": {
            "27", "28", "29", "30", "31", "32",
            "32A", "32B", "32C",
            "33C", "33B", "33A",
            "33", "34", "35", "36", "37", "38",
        },
    },
    "light": {
        "3": {
            "105", "106", "107", "108", "109", "110", "111",
            "111A", "111B", "111C", "111D",
            "112D", "112C", "112B", "112A",
            "112", "113", "114", "115", "116", "117",
        },
        "2": {
            "56", "57", "58", "59", "60",
            "61", "62", "63", "64", "65",
        },
        "1": {
            "27", "28", "29", "30", "31", "32",
            "32A", "32B",
            "33B", "33A",
            "33", "34", "35", "36", "37", "38",
        },
    },
}

The next function scores a batch of sequences using various masks.

In [11]:
def score_sequence_group(seqs, chain, imgt_mask_grps, scoring_tool, annotation_tool):
    """Score one batch of sequences under several masking schemes.

    The batch is numbered once with ``annotation_tool`` and then scored five times:
    with CDR3 excluded, with CDR3 + CDR2 excluded, with all three CDRs excluded,
    with an all-True mask (labelled ``"masked_gaps"``), and finally through
    ``batch_score_seqs`` with no mask at all (labelled ``"no_mask"``).

    Args:
        seqs: Sequences to score.
        chain: ``"heavy"`` or ``"light"``; selects both the chain code handed to
            the scoring tool and the sub-dictionary of ``imgt_mask_grps``.
        imgt_mask_grps: Mapping ``chain -> CDR number -> collection of position
            labels`` identifying the positions that belong to each CDR.
        scoring_tool: Object providing ``score_masked_sequences`` and
            ``batch_score_seqs``, each returning something with ``.tolist()``.
        annotation_tool: Object providing ``analyze_online_seqs``; the first
            element of each returned item is used as that sequence's numbering.

    Returns:
        A ``(scores, mask_categories)`` pair of equal-length lists. Results are
        concatenated scheme by scheme, in the order listed above, with
        ``mask_categories`` holding the scheme label for every score.
    """
    chain_codes = {"heavy":"H", "light":"L"}
    numberings = [result[0] for result in annotation_tool.analyze_online_seqs(seqs)]
    chain_types = [chain_codes[chain] for _numbering in numberings]
    n_seqs = len(numberings)

    scores = []
    mask_categories = []

    for grp in (["3"], ["3", "2"], ["3", "2", "1"], []):
        print(f"{chain}, {grp}")
        grp_key = f"cdr_{'_'.join(grp)}_masked" if len(grp) > 0 else "masked_gaps"

        # Union of every position label belonging to the CDRs excluded in this pass.
        excluded_positions = set()
        for cdr in grp:
            excluded_positions.update(imgt_mask_grps[chain][cdr])

        # One boolean list per sequence: True everywhere except at excluded positions.
        masks = [[position not in excluded_positions for position in numbering]
                 for numbering in numberings]

        masked_scores = scoring_tool.score_masked_sequences(seqs, chain_types, numberings, masks)
        scores.extend(masked_scores.tolist())
        mask_categories.extend([grp_key] * n_seqs)

    # Reference pass without any mask.
    scores.extend(scoring_tool.batch_score_seqs(seqs).tolist())
    mask_categories.extend(["no_mask"] * n_seqs)
    return scores, mask_categories

We now run the above function on each batch of 50,000 sequences in the test set. There are 450,000 sequences in
all and we are scoring them using 5 different settings, so this will take a few minutes.

In [13]:
# Enter the results folder unless the working directory is already somewhere inside it.
if "results_and_resources" not in os.getcwd():
    os.chdir(os.path.join(current_dir, "results_and_resources"))

# Per-chain accumulators. The four lists of a chain are kept parallel: entry i of
# each list describes the same (sequence, mask scheme) score.
scoring = {
    chain: {"scores":[], "species":[], "labels":[], "mask_codes":[]}
    for chain in ["heavy", "light"]
}

for chain in ["heavy", "light"]:
    # Work from inside the chain's test-set folder so the FASTA files can be
    # listed and opened by bare file name.
    os.chdir(os.path.join(current_dir, "train_test_data_immunogenicity_0.0.1", f"{chain}_chain", "test_sample_sequences"))
    flist = sorted(f for f in os.listdir() if f.endswith("fasta"))

    for fname in flist:
        print(fname)
        with open(fname, "r") as fhandle:
            slist = [str(s.seq) for s in SeqIO.parse(fhandle, "fasta")]

        scores, mask_cats = score_sequence_group(slist, chain, imgt, score_tool, sc_annotator)

        # The species name is whatever sits between "test_subsample_" and ".fasta".
        species = fname.split(".fasta")[0].split("test_subsample_")[1]

        scoring[chain]["species"] += [species] * len(scores)
        scoring[chain]["scores"] += scores
        scoring[chain]["mask_codes"] += mask_cats
        # Binary ground truth for the AUC-ROC analysis: human = 1.0, any other species = 0.0.
        scoring[chain]["labels"] += [1.0 if species == "human" else 0.0] * len(scores)

test_subsample_camel.fasta
heavy, ['3']
heavy, ['3', '2']
heavy, ['3', '2', '1']
heavy, []
test_subsample_human.fasta
heavy, ['3']
heavy, ['3', '2']
heavy, ['3', '2', '1']
heavy, []
test_subsample_mouse_balbc.fasta
heavy, ['3']
heavy, ['3', '2']
heavy, ['3', '2', '1']
heavy, []
test_subsample_mouse_c576.fasta
heavy, ['3']
heavy, ['3', '2']
heavy, ['3', '2', '1']
heavy, []
test_subsample_rabbit.fasta
heavy, ['3']
heavy, ['3', '2']
heavy, ['3', '2', '1']
heavy, []
test_subsample_rat.fasta
heavy, ['3']
heavy, ['3', '2']
heavy, ['3', '2', '1']
heavy, []
test_subsample_rhesus.fasta
heavy, ['3']
heavy, ['3', '2']
heavy, ['3', '2', '1']
heavy, []
test_subsample_human.fasta
light, ['3']
light, ['3', '2']
light, ['3', '2', '1']
light, []
test_subsample_mouse_balbc.fasta
light, ['3']
light, ['3', '2']
light, ['3', '2', '1']
light, []
test_subsample_rhesus.fasta
light, ['3']
light, ['3', '2']
light, ['3', '2', '1']
light, []


In [16]:
# Turn each chain's dictionary of parallel result lists into a DataFrame
# with one column per list.
dfs = {
    chain_name: pd.DataFrame.from_dict(chain_results)
    for (chain_name, chain_results) in scoring.items()
}

In [18]:
# Display the column labels of the heavy-chain results frame.
dfs["heavy"].columns

Index(['scores', 'species', 'labels', 'mask_codes'], dtype='object')

In [21]:
# One row per (chain, mask scheme): the AUC-ROC on the full data plus the lower
# and upper bounds of a bootstrap interval.
combined_results = {"mask":[], "chain":[], "AUC-ROC":[], "AUC-ROC_LCB":[], "AUC-ROC_UCB":[]}

for chain in ["heavy", "light"]:
    print(f"AUC-ROC results, chain {chain}")
    df = dfs[chain]

    for mask in df["mask_codes"].unique():
        subset = df[df["mask_codes"] == mask]
        auc_values = []
        gt_scores = subset["labels"].values
        input_scores = subset["scores"].values
        # Re-seeded for every mask scheme, so each scheme sees the same sequence
        # of random draws and the output is reproducible.
        rng = np.random.default_rng(123)

        # 1000 bootstrap replicates: resample rows with replacement, recompute the AUC.
        for i in range(1000):
            idx = rng.choice(gt_scores.shape[0], gt_scores.shape[0])
            auc_values.append(roc_auc_score(gt_scores[idx], input_scores[idx]))

        auc_values = np.sort(np.array(auc_values))
        # Point estimate on the full data. Computed once and reused below; the
        # original evaluated roc_auc_score on the same arrays a second time.
        score = roc_auc_score(gt_scores, input_scores)
        combined_results["mask"].append(mask)
        combined_results["chain"].append(chain)
        combined_results["AUC-ROC"].append(score)
        # Entries 25 and 975 of the 1000 sorted replicates bound the central ~95%.
        combined_results["AUC-ROC_LCB"].append(auc_values[25])
        combined_results["AUC-ROC_UCB"].append(auc_values[975])

        print(f"{mask}:\t{score}\tLCB: {auc_values[25]}\tUCB:{auc_values[975]}")

    print("\n")

AUC-ROC results, chain heavy
cdr_3_masked:	0.9612810037607522	LCB: 0.9604965088933722	UCB:0.9621391917495131
cdr_3_2_masked:	0.9522230389277856	LCB: 0.9512787597344369	UCB:0.9531633610382578
cdr_3_2_1_masked:	0.9481899895579116	LCB: 0.9472384488185612	UCB:0.9491129788244127
masked_gaps:	0.9482035412682537	LCB: 0.9472351062897556	UCB:0.9492004768419419
no_mask:	0.9537544028005602	LCB: 0.9528452277390939	UCB:0.9547312741157983


AUC-ROC results, chain light
cdr_3_masked:	0.9451390896231235	LCB: 0.9439934772303273	UCB:0.9462360475136109
cdr_3_2_masked:	0.9450598258410052	LCB: 0.9438855849146923	UCB:0.9461759440970852
cdr_3_2_1_masked:	0.9459188170352941	LCB: 0.9447083063865196	UCB:0.947024660736701
masked_gaps:	0.9474444009185896	LCB: 0.946274402015722	UCB:0.9485542969490786
no_mask:	0.9482599997943796	LCB: 0.9471203449587161	UCB:0.9493557608590446


