### Numbering testing

Sequence numbering in AntPack is orders of magnitude faster than ANARCI, dramatically faster than AbRSA,
and certainly faster than a webserver-only tool like AbNum. But...how does assigned numbering compare?
In this notebook, we load sequences where AbNum and ANARCI agree on assigned numbering and see in how many
cases AntPack *disagrees*, then save the result of those disagreements to file for closer inspection.
As we will show, AntPack agrees with the consensus on all except 6 - 7 out of several thousand sequences,
and in every case the disagreement is minor (placement of a single gap).

In [3]:
import os
import itertools
import time
import pandas as pd
from antpack import SingleChainAnnotator
import pandas as pd

# If the kernel was started somewhere under a "notebooks" folder, step up two
# levels (presumably to the repo root -- verify against the repo layout).
if "notebooks" in os.getcwd():
    os.chdir(os.path.join("..", ".."))

# Record the working directory unconditionally: retrieve_and_compare reads
# `current_dir`, so it must exist even when the kernel was not started from
# inside a "notebooks" folder (previously that case raised a NameError later).
current_dir = os.getcwd()

This function is...a little messy. Basically it loads preassigned AbNum / ANARCI numbering,
eliminates any sequences where AbNum / ANARCI don't match or other serious issues are encountered,
numbers the accepted sequences using AntPack and compares AntPack's results to the comparators.
Mismatching sequences are saved as CSV files in the repo directory under the name
"{numbering_scheme}_{chain}_mismatch_analysis.csv", where {chain} is H, K or L.

In [13]:
def _consensus_numbering(df, chain_code, first_pos_col = 13, min_length = 105):
    """Collect the sequences of one input table on which AbNum and ANARCI agree.

    Args:
        df (pd.DataFrame): One of the preassigned-numbering tables. Columns from
            `first_pos_col` onward are read as numbering positions holding one
            residue (or "-" for a gap) per row; column 0 is read as an
            underscore-delimited string whose fields from the 4th onward are
            the AbNum numbering.
        chain_code (str): "h" for the heavy chain table; any other value is
            treated as a light chain table.
        first_pos_col (int): Index of the first numbering-position column.
        min_length (int): Sequences shorter than this are discarded.

    Returns:
        dict: Maps each accepted ungapped sequence to its list of position
            labels.
    """
    columns = df.columns.tolist()
    # Position label -> residue expected at that highly conserved position.
    if chain_code == "h":
        conserved = (("22", "C"), ("92", "C"), ("36", "W"), ("106", "G"))
    else:
        conserved = (("23", "C"), ("88", "C"), ("35", "W"), ("101", "G"))
    conserved_idx = [(columns.index(label), aa) for label, aa in conserved]
    position_labels = columns[first_pos_col:]

    accepted = {}
    num_not_matching, num_retained = 0, 0

    for row in range(df.shape[0]):
        residues = df.iloc[row, first_pos_col:].tolist()
        sequence = "".join(residues).replace("-", "")

        #Discard severely truncated sequences...
        if len(sequence) < min_length:
            continue

        #and sequences where the highly conserved positions are not conserved.
        if any(df.iloc[row, idx] != aa for idx, aa in conserved_idx):
            continue

        numbering = [label for label, aa in zip(position_labels, residues)
                     if aa != "-"]

        abnum_numbering = df.iloc[row, 0].split("_")[3:]
        # For light chains, a mismatch gets a second chance after cutting the
        # AbNum numbering off at position 108.
        if (abnum_numbering != numbering and chain_code == "l"
                and "108" in abnum_numbering):
            abnum_numbering = abnum_numbering[:abnum_numbering.index("108")]
        if abnum_numbering != numbering:
            num_not_matching += 1
            continue

        accepted[sequence] = numbering
        num_retained += 1

    # Report the row count itself; the loop index would be one short and is
    # undefined for an empty table.
    print(f"Of {df.shape[0]}, {num_not_matching} not matching, {num_retained} retained")
    return accepted


def _residues_by_column(sequence, numbering, columns):
    """Lay a numbered sequence out over `columns`, using "-" where a column is unused.

    Args:
        sequence (str): The ungapped sequence.
        numbering (list): Position label assigned to each residue of `sequence`.
        columns (list): Ordered position labels of the output table.

    Returns:
        list: One residue (or "-") per entry of `columns`.
    """
    # First occurrence wins, matching list.index().
    first_index = {}
    for idx, label in enumerate(numbering):
        first_index.setdefault(label, idx)
    return [sequence[first_index[col]] if col in first_index else "-"
            for col in columns]


def retrieve_and_compare(numbering_scheme = "kabat"):
    """Compare AntPack numbering against the AbNum / ANARCI consensus.

    Loads the preassigned numbering tables for `numbering_scheme` from
    "numbering_test_data" under `current_dir`, keeps the sequences on which
    AbNum and ANARCI agree, numbers them with AntPack, and writes every
    disagreement to "{numbering_scheme}_{chain}_mismatch_analysis.csv" in
    `current_dir` (one file per chain H, K, L). Each mismatch contributes three
    rows: the consensus alignment, the AntPack alignment and a blank spacer.

    Files are addressed by explicit path, so the working directory is never
    changed (and cannot be left pointing at the data folder after an error).

    Args:
        numbering_scheme (str): Scheme name used both for the input file names
            and for the AntPack annotator (e.g. "kabat", "martin").

    Returns:
        tuple: (number of sequences where AntPack matches the consensus,
            total number of sequences numbered by AntPack).
    """
    first_pos_col = 13
    data_dir = os.path.join(current_dir, "numbering_test_data")
    h_anarci = pd.read_csv(os.path.join(data_dir, f"{numbering_scheme}_heavy_H.csv"))
    l_anarci = pd.read_csv(os.path.join(data_dir, f"{numbering_scheme}_light_KL.csv"))

    anarci_data = {}
    for df, cn in ((h_anarci, "h"), (l_anarci, "l")):
        anarci_data.update(_consensus_numbering(df, cn, first_pos_col))

    print(f"Retrieved {len(anarci_data)}")

    tool1 = SingleChainAnnotator(chains=["H", "K", "L"], scheme=numbering_scheme, compress_init_gaps = True)

    sequences = list(anarci_data.keys())
    numberings = [anarci_data[k] for k in sequences]

    st = time.time()
    # Each result is indexed below as [0] numbering, [2] chain, [3] error message.
    antpack_numberings = tool1.analyze_online_seqs(sequences)
    print(f"Time to number {len(sequences)} sequences with AntPack: {time.time() - st}")

    num_matched = sum(1 for i in range(len(numberings))
                      if antpack_numberings[i][0] == numberings[i])

    # All position labels AntPack used, per chain.
    ant_col_list = {cn: set(itertools.chain.from_iterable(
                        k[0] for k in antpack_numberings if k[2] == cn))
                    for cn in ["H", "K", "L"]}

    kcols = {"H": h_anarci.columns[first_pos_col:].tolist(),
             "K": l_anarci.columns[first_pos_col:].tolist(),
             "L": l_anarci.columns[first_pos_col:].tolist()}
    ksets = {k: set(kcols[k]) for k in kcols}

    # Add any label AntPack used that the input tables lack, placed right
    # after the label obtained by dropping its last character.
    # NOTE(review): this assumes every extra label is a known label plus one
    # insertion letter, and several extras sharing a base end up in arbitrary
    # order -- confirm whether column order matters for the output.
    for k in ["H", "K", "L"]:
        for extra in list(ant_col_list[k] - ksets[k]):
            kcols[k].insert(kcols[k].index(extra[:-1])+1, extra)

    out_dicts = {}
    for chain in ["H", "L", "K"]:
        out_dicts[chain] = {"origin":[], "chain":[], "errmess":[]}
        for k in kcols[chain]:
            out_dicts[chain][k] = []

    for i in range(len(antpack_numberings)):
        if antpack_numberings[i][0] == numberings[i]:
            continue

        chain = antpack_numberings[i][2]
        out_dict = out_dicts[chain]
        rows = (("combo", "na", numberings[i]),
                ("**ANTPACK**", antpack_numberings[i][3], antpack_numberings[i][0]))
        for origin, errmess, numbering in rows:
            out_dict["origin"].append(origin)
            out_dict["chain"].append(chain)
            out_dict["errmess"].append(errmess)
            aligned = _residues_by_column(sequences[i], numbering, kcols[chain])
            for k, residue in zip(kcols[chain], aligned):
                out_dict[k].append(residue)

        # Blank spacer row between mismatch pairs.
        for key in out_dict:
            out_dict[key].append(" ")

    dfs = {k:pd.DataFrame.from_dict(out_dicts[k]) for k in out_dicts}

    for key, out_df in dfs.items():
        out_df.to_csv(os.path.join(current_dir,
                                   f"{numbering_scheme}_{key}_mismatch_analysis.csv"))

    return num_matched, len(antpack_numberings)

In [None]:
# Run the comparison for the Kabat scheme and report the agreement rate.
num_matched, total = retrieve_and_compare(numbering_scheme="kabat")
print(
    f"For Kabat, out of {total}, {num_matched}; {100 * num_matched / total} percent"
)
print("Mismatching alignments saved to cloned repo dir.")

In [None]:
# Run the comparison for the Martin scheme and report the agreement rate.
num_matched, total = retrieve_and_compare(numbering_scheme="martin")
print(
    f"For Martin, out of {total}, {num_matched}; {100 * num_matched / total} percent"
)
print("Mismatching alignments saved to cloned repo dir.")