### Comparison with AntiBERTy classifier
This notebook compares a mixture model-based classifier with the AntiBERTy LLM-based
classifier. Note that in general we prefer not to use classifiers for reasons
discussed in Parkinson et al. (accuracy improves, but at the expense of robustness).
This comparison is still useful to see whether using AntiBERTy provides any
benefit compared with a much simpler model.

In [1]:
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from Bio import SeqIO
from sklearn.metrics import accuracy_score, matthews_corrcoef as MCC
from antpack import SequenceScoringTool

# Establish the base directory that all later cells build their paths from.
# When the kernel was started somewhere under a "notebooks" folder, step up two
# levels first (presumably to the project root -- confirm against the repo layout);
# otherwise the current working directory is used unchanged.
if "notebooks" in os.getcwd():
    os.chdir(os.path.join("..", ".."))

# Record the base directory unconditionally. Previously this assignment lived
# inside the `if`, so starting the kernel outside a "notebooks" folder raised a
# NameError on the os.chdir(current_dir) call below and in every later cell.
current_dir = os.getcwd()

score_tool = SequenceScoringTool(offer_classifier_option = True)

# Return to the base directory after constructing the scoring tool.
os.chdir(current_dir)

We number and score all the test sequences on the fly using AntPack. There are 450,000 sequences,
so this will take a couple of minutes.

In [2]:
# Per-chain accumulators: classifier score, source species and binary label
# (1.0 = human, 0.0 = any other species) for every test sequence.
chain_names = ["heavy", "light"]
scoring = {}
for chain in chain_names:
    scoring[chain] = {"scores":[], "species":[], "labels":[]}

for chain in chain_names:
    # Move into this chain's test-set folder so the fasta files can be opened by name.
    test_dir = os.path.join(current_dir, "train_test_data_immunogenicity_0.0.1",
                            f"{chain}_chain", "test_sample_sequences")
    os.chdir(test_dir)
    chain_results = scoring[chain]

    for fname in os.listdir():
        if not fname.endswith("fasta"):
            continue
        # Camel and rabbit files are excluded from this comparison.
        if "camel" in fname or "rabbit" in fname:
            continue
        print(fname)
        with open(fname, "r") as fhandle:
            seqs = [str(record.seq) for record in SeqIO.parse(fhandle, "fasta")]

        # File names look like "test_subsample_<species>.fasta"; pull out <species>.
        species = fname.split(".fasta")[0].split("test_subsample_")[1]
        label_value = 1.0 if species == "human" else 0.0

        chain_results["scores"].extend(
            score_tool.batch_score_seqs(seqs, mode="classifier").tolist()
        )
        chain_results["species"].extend([species] * len(seqs))
        chain_results["labels"].extend([label_value] * len(seqs))

test_subsample_mouse_balbc.fasta
test_subsample_rat.fasta
test_subsample_mouse_c576.fasta
test_subsample_human.fasta
test_subsample_rhesus.fasta
test_subsample_mouse_balbc.fasta
test_subsample_human.fasta
test_subsample_rhesus.fasta


In [3]:
print("AntPack / SAM scores")
for chain in ("heavy", "light"):
    true_labels = scoring[chain]["labels"]
    # Round each score to the nearest integer to obtain a hard class prediction
    # (presumably the scores lie in [0, 1] -- confirm against the AntPack docs).
    preds = np.asarray(scoring[chain]["scores"]).round().astype(np.int32)
    print(chain)
    print(f"Accuracy: {accuracy_score(true_labels, preds)}")
    print(f"MCC: {MCC(true_labels, preds)}\n")

AntPack / SAM scores
heavy
Accuracy: 0.9987498562334669
MCC: 0.9966684585510319

light
Accuracy: 0.9944602796686899
MCC: 0.9876228050179905



In [4]:
print("AntiBERTy heavy chain scores")

# The results file has no header row; column 0 is compared against the
# lower-case species name and column 1 against the predicted label "Human".
os.chdir(os.path.join(current_dir, "results_and_resources"))
heavy_antiberty = pd.read_csv("heavy_antiberty_classifier.txt", header=None)

# Drop rabbit and camel rows, matching the species skipped when scoring with AntPack.
excluded_rows = heavy_antiberty.iloc[:, 0].isin(["rabbit", "camel"])
heavy_antiberty = heavy_antiberty[~excluded_rows]

# Binary encoding: 1 = human, 0 = everything else.
heavy_antiberty_labels = heavy_antiberty.iloc[:, 0].eq("human").astype(int).tolist()
heavy_antiberty_preds = heavy_antiberty.iloc[:, 1].eq("Human").astype(int).tolist()

print(f"Accuracy: {accuracy_score(heavy_antiberty_labels, heavy_antiberty_preds)}")
print(f"MCC: {MCC(heavy_antiberty_labels, heavy_antiberty_preds)}")

AntiBERTy heavy chain scores
Accuracy: 0.9171554728793812
MCC: 0.8169122090301767


In [5]:
print("AntiBERTy light chain scores")

# The results file has no header row; column 0 is compared against the
# lower-case species name and column 1 against the predicted label "Human".
os.chdir(os.path.join(current_dir, "results_and_resources"))
light_antiberty = pd.read_csv("light_antiberty_classifier.txt", header=None)

# Exclude rabbit and camel rows so the light-chain comparison covers the same
# species as the AntPack scoring loop and the heavy-chain AntiBERTy cell, both
# of which skip them.
# NOTE(review): if this file contains no rabbit/camel rows the filter is a no-op
# and the reported numbers are unchanged -- confirm against the data.
light_antiberty = light_antiberty[~light_antiberty.iloc[:,0].isin(["rabbit", "camel"])]

# Binary encoding: 1 = human, 0 = everything else.
light_antiberty_labels = [1 if r == "human" else 0 for r in light_antiberty.iloc[:,0].tolist()]
light_antiberty_preds = [1 if r == "Human" else 0 for r in light_antiberty.iloc[:,1].tolist()]

print(f"Accuracy: {accuracy_score(light_antiberty_labels, light_antiberty_preds)}")
print(f"MCC: {MCC(light_antiberty_labels, light_antiberty_preds)}")

AntiBERTy light chain scores
Accuracy: 0.9872573019013446
MCC: 0.971418079901737
