In [1]:
# Load the `lab_black` IPython extension for this kernel session
# (presumably the black auto-formatter for notebook cells — confirm).
%load_ext lab_black

In [4]:
from sadie.airr import Airr
import pandas as pd
from pathlib import Path

# Regression check: re-annotate the CATNAP heavy/light FASTA fixtures and compare
# the resulting tables with the stored feather snapshots.
fixture_dir = Path("../../tests/integration/airr/fixtures")
heavy_file = fixture_dir / "catnap_nt_heavy.fasta.gz"
light_file = fixture_dir / "catnap_nt_light.fasta.gz"

# One annotator instance is reused for both chains.
airr_api = Airr(species="human", database="imgt", functional="functional")
catnap_heavy = airr_api.run_file(heavy_file)
catnap_light = airr_api.run_file(light_file)

# Stored expected tables.
light_at = pd.read_feather(fixture_dir / "catnap_light_airrtable.feather")
heavy_at = pd.read_feather(fixture_dir / "catnap_heavy_airrtable.feather")

# Raises AssertionError if a fresh annotation differs from its snapshot.
pd.testing.assert_frame_equal(light_at, catnap_light.table)
pd.testing.assert_frame_equal(heavy_at, catnap_heavy.table)

# make new files

In [94]:
from distutils.version import StrictVersion
from math import nan
from pathlib import Path

import pandas as pd
import pytest
from pkg_resources import resource_filename
from sadie.airr import Airr
from sadie.airr.airrtable import constants


def fixture_file(file):
    """Resolve a fixture name to an existing path under ``fixtures/``.

    Parameters
    ----------
    file : str
        File name relative to the ``fixtures`` directory, resolved with
        ``pkg_resources.resource_filename`` against this module's ``__name__``.

    Returns
    -------
    pathlib.Path
        Path of the fixture file.

    Raises
    ------
    FileExistsError
        If the resolved path does not exist. ``FileNotFoundError`` would be the
        conventional type; ``FileExistsError`` is kept so existing callers that
        catch it keep working.
    """
    fixture_path = Path(resource_filename(__name__, f"fixtures/{file}"))
    if not fixture_path.exists():
        raise FileExistsError(f"Fixture file not found {fixture_path}")
    return fixture_path


def fillna(df, fill_value=""):
    """
    Replace null values with `fill_value`.

    Also replaces in categorical columns.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to fill. Note that it is modified in place: `fill_value` is added
        as a category to categorical columns and float / nullable-integer
        columns are cast to ``float`` before filling.
    fill_value : scalar, default ""
        Value used in place of nulls.

    Returns
    -------
    pd.DataFrame
        The result of ``df.fillna(fill_value)``.
    """
    # A categorical column can only be filled with one of its categories, so
    # register `fill_value` first. Assign the result back: the `inplace`
    # argument of `add_categories` no longer exists in pandas >= 2.0.
    for col in df.dtypes[df.dtypes == "category"].index:
        if fill_value not in df[col].cat.categories:
            df[col] = df[col].cat.add_categories([fill_value])
    # Known bug https://github.com/pandas-dev/pandas/issues/25472
    # Only the major version matters for the ">= 1.0" check; parsing it by hand
    # avoids distutils and tolerates suffixes such as "rc1" or "+local".
    major = pd.__version__.split(".", 1)[0]
    if not major.isdigit() or int(major) >= 1:
        float_like = [
            col
            for col, dtype in df.dtypes.items()
            if dtype in ("float64", "Int16", "Int64")
        ]
        for col in float_like:
            df[col] = df[col].astype("float")
    return df.fillna(fill_value)


# Alignment score/support columns; both comparable frames drop these outright.
ignore = [
    f"{segment}_{metric}"
    for metric in ("score", "support")
    for segment in ("v", "d", "j")
]

# Coordinate columns that are cast to integers.
# NOTE(review): fwr3 and the v_* coordinates are not listed — confirm intentional.
starts_and_ends = [
    f"{region}_{edge}"
    for region in (
        "cdr1",
        "cdr2",
        "cdr3",
        "d_alignment",
        "d_germline",
        "d_sequence",
        "fwr1",
        "fwr2",
        "fwr4",
        "j_alignment",
        "j_germline",
        "j_sequence",
    )
    for edge in ("end", "start")
]

# Columns compared between SADIE and IMGT in the integration check.
check_these = [
    # whole-sequence fields and boolean flags
    *("sequence", "locus", "stop_codon", "vj_in_frame", "productive", "rev_comp", "complete_vdj"),
    # nucleotide framework/CDR regions
    *("fwr1", "cdr1", "fwr2", "cdr2", "fwr3", "cdr3"),
    # amino-acid framework/CDR regions (fwr4 is only checked at the amino-acid level)
    *(f"{region}_aa" for region in ("fwr1", "cdr1", "fwr2", "cdr2", "fwr3", "cdr3", "fwr4")),
]


def make_sadie_comparable(df):
    """Takes a sadie df and makes it comparable with IMGT.

    Keeps the ``constants.IGBLAST_AIRR`` columns plus the ``*_call_top``
    columns, removes ``v_frameshift`` and the columns listed in ``ignore``,
    casts the ``starts_and_ends`` columns to nullable ``Int64`` and inserts a
    ``*_gene_top`` column (allele suffix stripped) before each ``*_call_top``.

    Parameters
    ----------
    df : AirrTable

    Returns
    -------
    pd.DataFrame
        returns a pandas dataframe for comparison
    """

    # comparison keys between imgt and sadie
    compare_key = list(constants.IGBLAST_AIRR.keys()) + [
        "v_call_top",
        "d_call_top",
        "j_call_top",
    ]

    # Drop frameshift since imgt does not have it
    compare_key.remove("v_frameshift")

    # Just get compare keys
    df = df[compare_key].drop(ignore, axis=1)

    # Replace the columns rather than writing through `.loc[:, cols]`: with
    # pandas >= 2.0 a full-slice `.loc` assignment sets values in place and may
    # keep the old dtype, which would silently skip the Int64 cast.
    df[starts_and_ends] = df[starts_and_ends].astype("Int64")

    # Just get the gene top call IGHV1-2*01 -> IGHV1-2
    for segment in ("v", "d", "j"):
        call_col = f"{segment}_call_top"
        df.insert(
            df.columns.get_loc(call_col),
            f"{segment}_gene_top",
            df[call_col].str.split("*").str.get(0),
        )
    return df


def make_imgt_comparable(df: pd.DataFrame) -> pd.DataFrame:
    """Takes an IMGT V-QUEST AIRR table and returns a comparable dataframe.

    Keeps the ``constants.IGBLAST_AIRR`` columns (minus ``v_frameshift``), maps
    the boolean columns from ``"T"``/``"F"`` to ``True``/``False``, upper-cases
    the sequence columns, strips the ``.`` padding from ``fwr1``/``fwr4``,
    derives the ``*_call_top`` / ``*_gene_top`` columns, drops the columns in
    ``ignore`` and casts the ``starts_and_ends`` columns to nullable ``Int64``.

    Parameters
    ----------
    df : pd.DataFrame

    Returns
    -------
    pd.DataFrame
        A comparable dataframe
    """

    def map_bool(mapp_df, col):
        # Values missing from the mapping become NaN. NOTE(review): with
        # na_action="ignore" missing values are propagated, so the `nan: False`
        # entry presumably never applies — confirm whether NaN should be False.
        return mapp_df.loc[:, col].map(
            {"T": True, "F": False, True: True, False: False, nan: False},
            na_action="ignore",
        )

    compare_key = list(constants.IGBLAST_AIRR.keys())
    compare_key.remove("v_frameshift")
    df = df[compare_key].copy()
    bool_cols = ["vj_in_frame", "productive", "rev_comp", "complete_vdj", "stop_codon"]
    for col in bool_cols:
        df[col] = map_bool(df, col)

    upper_columns = [
        "fwr1",
        "cdr1",
        "fwr2",
        "cdr2",
        "fwr3",
        "cdr3",
        "fwr4",
        "germline_alignment",
        "germline_alignment_aa",
        "sequence_alignment",
        "sequence_alignment_aa",
        "v_germline_alignment",
        "v_germline_alignment_aa",
        "v_sequence_alignment",
        "v_sequence_alignment_aa",
        "d_germline_alignment",
        "d_germline_alignment_aa",
        "d_sequence_alignment",
        "d_sequence_alignment_aa",
        "j_germline_alignment",
        "j_germline_alignment_aa",
        "j_sequence_alignment",
        "j_sequence_alignment_aa",
        "np1",
        "np2",
        "junction",
        "sequence",
    ]
    for col in upper_columns:
        df[col] = df[col].str.upper()

    # strip the "." on left and right by imgt
    df["fwr1"] = df["fwr1"].str.lstrip(".")
    df["fwr4"] = df["fwr4"].str.rstrip(".")

    # Get top genes: first comma-separated call, then the allele name within it
    # (second whitespace token for V/J, last "_"-separated token for D).
    df["v_call_top"] = df["v_call"].str.split(",").str.get(0).str.split().str.get(1)
    df["v_gene_top"] = df["v_call_top"].str.split("*").str.get(0)
    df["d_call_top"] = df["d_call"].str.split(",").str.get(0).str.split("_").str.get(-1)
    df["d_gene_top"] = df["d_call_top"].str.split("*").str.get(0)
    df["j_call_top"] = df["j_call"].str.split(",").str.get(0).str.split().str.get(1)
    df["j_gene_top"] = df["j_call_top"].str.split("*").str.get(0)

    # Drop the ignore
    df = df.drop(ignore, axis=1)

    # Convert the integer types. Replace the columns rather than writing through
    # `.loc[:, cols]`: with pandas >= 2.0 a full-slice `.loc` assignment sets
    # values in place and may keep the old dtype, skipping the Int64 cast.
    df[starts_and_ends] = df[starts_and_ends].astype("Int64")
    return df


# @pytest.mark.skip(reason="integration tests will change under this active development")
def test_imgt_integration():
    """Check SADIE annotations against the stored IMGT V-QUEST AIRR table.

    Both tables are reduced to the ``check_these`` columns, a fixed set of row
    labels is dropped from each, and the remaining rows are compared column by
    column.

    Raises
    ------
    AssertionError
        For the first column in ``check_these`` where any remaining row differs;
        the message lists the differing row labels.
    """
    # Row labels removed from both frames before comparing.
    known_mismatches = [
        48, 67, 143, 149, 193, 213, 239, 286, 305, 306,
        364, 367, 383, 419, 436, 450, 457, 486, 490, 520,
        521, 590, 606, 612, 631, 698, 715, 720, 732, 760,
        767, 827, 839, 888, 899, 983, 989,
    ]

    # sadie annotate
    fasta_path = "../../tests/integration/airr/fixtures/OAS_subsample_good_anarci_sub1000.fasta"
    annotator = Airr(species="human", database="imgt", adaptable=True)
    sadie_frame = make_sadie_comparable(annotator.run_fasta(fasta_path))[check_these]
    sadie_frame = sadie_frame.drop(pd.Index(known_mismatches))

    # imgt airr
    imgt_path = "../../tests/integration/airr/fixtures/imgt_v_quest_airr.tsv.gz"
    imgt_raw = pd.read_csv(imgt_path, low_memory=False)
    imgt_frame = make_imgt_comparable(imgt_raw)[check_these]
    imgt_frame = imgt_frame.drop(pd.Index(known_mismatches))

    for column in check_these:
        matches = sadie_frame[column] == imgt_frame[column]
        if matches.all():
            continue
        mismatched = sadie_frame[~matches].index
        if not mismatched.empty:
            raise AssertionError(f"{mismatched} does not match {column}")
    


# Run the integration check directly in the notebook; it raises AssertionError on a mismatch.
test_imgt_integration()

In [86]:
# Write `df` out as the IMGT fixture read by `test_imgt_integration`.
# `df` is assigned in the `vquest_airr.tsv` cell that appears further down, so
# that cell has to run first.
# NOTE(review): `to_csv` is called with its defaults, so despite the `.tsv.gz`
# name the content is presumably comma-separated and includes the index column;
# the reader uses `pd.read_csv` defaults as well — confirm before renaming/changing.
df.to_csv('../../tests/integration/airr/fixtures/imgt_v_quest_airr.tsv.gz')

In [13]:
import pandas as pd
from Bio.SeqIO import parse
from sadie.airr import Airr

# Input FASTA shared by the Biopython parse and the SADIE run.
oas_fasta = "../../tests/integration/airr/fixtures/OAS_subsample_good_anarci_sub1000.fasta"

# Tab-separated V-QUEST AIRR export.
df = pd.read_csv("vquest_airr.tsv", sep="\t")

# All FASTA records, materialised as a list.
fastas = list(parse(oas_fasta, "fasta"))

# Annotate the same FASTA with SADIE.
airr_api = Airr("human", adaptable=True)
sadie_results = airr_api.run_fasta(oas_fasta)

In [85]:
# Reduce both annotation tables to the columns compared in integration.
df_1 = make_sadie_comparable(sadie_results)[check_these]
df_2 = make_imgt_comparable(df)[check_these]

# Collect the row labels at which any compared column disagrees.
# `x` and `not_index` keep their names because later cells inspect them.
drop_these = []
for x in check_these:
    matches = df_1[x] == df_2[x]
    if not matches.all():
        not_index = df_1[~matches].index
        drop_these += list(not_index)

# Unique mismatching row labels in ascending order (the list hard-coded in
# `test_imgt_integration`).
sorted(set(drop_these))

[48,
 67,
 143,
 149,
 193,
 213,
 239,
 286,
 305,
 306,
 364,
 367,
 383,
 419,
 436,
 450,
 457,
 486,
 490,
 520,
 521,
 590,
 606,
 612,
 631,
 698,
 715,
 720,
 732,
 760,
 767,
 827,
 839,
 888,
 899,
 983,
 989]

In [81]:
# Display the accumulated mismatch labels.
# NOTE(review): the saved output below is a list of Index objects, which the
# current accumulation (`drop_these += list(not_index)`) would not produce —
# it looks stale; re-run to refresh.
drop_these

[Int64Index([419], dtype='int64'),
 Int64Index([899], dtype='int64'),
 Int64Index([143, 306, 827, 899], dtype='int64'),
 Int64Index([67, 457, 983], dtype='int64'),
 Int64Index([143, 306, 827, 899], dtype='int64'),
 Int64Index([67, 457, 983], dtype='int64'),
 Int64Index([ 48, 149, 193, 213, 239, 286, 305, 364, 367, 383, 419, 436, 450,
             486, 490, 520, 521, 590, 606, 612, 631, 698, 715, 720, 732, 760,
             767, 839, 888, 989],
            dtype='int64')]

In [64]:
# IMGT values for the last mismatching column/rows left over from the comparison loop.
df_2.loc[not_index, x]

419    True
Name: complete_vdj, dtype: bool

In [59]:
# IMGT amino-acid FWR4 for row 419.
df_2.loc[419, "fwr4_aa"]

'WGQGTTVTVS'