In [1]:
%load_ext lab_black

In [2]:
import json
import gzip
import pandas as pd
import glob
import tempfile
from importlib import reload

import uuid
import os
from pprint import pprint
from multiprocessing import Pool
from pathlib import Path
from sadie.airr import Airr, AirrTable
from gspread_pandas import Spread, Client
from sadie.airr import AirrTable
from sadie.airr import constants
from distutils.version import StrictVersion
from math import nan
from IPython.display import clear_output
from IPython.core.display import display, HTML

# Handle on the "Sadie" Google spreadsheet that later cells publish tables to.
spread = Spread("Sadie", create_sheet=True, create_spread=True)

# AIRR tables are very wide: raise pandas' display limits so rows are not
# truncated when a frame is rendered in the notebook.
for _display_option, _display_value in (
    ("display.max_rows", 45),
    ("display.max_columns", 500),
    ("display.width", 100000),
):
    pd.set_option(_display_option, _display_value)


def fillna(df, fill_value=""):
    """
    Replace null values with `fill_value`.

    Also replaces in categorical columns, which need `fill_value` registered
    as a category before it can be used as a fill.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to fill. It is not modified; the work happens on a copy.
    fill_value : scalar, default ""
        Value that replaces every null entry.

    Returns
    -------
    pandas.DataFrame
        A new frame with no null values left.
    """
    # Work on a copy so the caller's frame keeps its dtypes and categories.
    filled = df.copy()

    for col, dtype in df.dtypes.items():
        if isinstance(dtype, pd.CategoricalDtype):
            if fill_value not in filled[col].cat.categories:
                # `add_categories(..., inplace=True)` was removed in pandas 2.0;
                # assigning the returned Series works on every version.
                filled[col] = filled[col].cat.add_categories([fill_value])

    # Known bug https://github.com/pandas-dev/pandas/issues/25472
    # Nullable integer columns cannot be filled with a non-integer value, so
    # they are widened to float first. Only the major version is inspected so
    # dev/rc version strings (e.g. "2.1.0rc0") do not break the check.
    if int(pd.__version__.split(".")[0]) >= 1:
        for col, dtype in df.dtypes.items():
            if str(dtype) in ("float64", "Int16", "Int64"):
                filled[col] = filled[col].astype("float")

    return filled.fillna(fill_value)

# Sadie Annotate

In [3]:
# Columns dropped before any comparison: the score and support value of each
# of the V, D and J segments. The *_call columns are not listed here, so they
# stay in the comparable frames.
ignore = [
    f"{segment}_{metric}" for metric in ("score", "support") for segment in "vdj"
]

# Coordinate columns that get cast to (nullable) integers: the end and start
# of every region/alignment named below.
starts_and_ends = [
    f"{region}_{edge}"
    for region in (
        "cdr1",
        "cdr2",
        "cdr3",
        "d_alignment",
        "d_germline",
        "d_sequence",
        "fwr1",
        "fwr2",
        "fwr4",
        "j_alignment",
        "j_germline",
        "j_sequence",
    )
    for edge in ("end", "start")
]


def make_sadie_comparable(df):
    """Reduce a SADIE AIRR result to the columns compared against IMGT.

    Parameters
    ----------
    df : AirrTable or pandas.DataFrame
        SADIE output. An ``AirrTable`` is unwrapped to its ``.table`` frame.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the ``constants.IGBLAST_AIRR`` columns (minus
        ``v_frameshift`` and the ``ignore`` columns) plus ``*_call_top``,
        with the ``starts_and_ends`` columns cast to nullable ``Int64`` and a
        ``*_gene_top`` column inserted directly before each ``*_call_top``.
    """
    if isinstance(df, AirrTable):
        df = df.table
    # comparison keys between imgt and sadie
    compare_key = list(constants.IGBLAST_AIRR.keys()) + [
        "v_call_top",
        "d_call_top",
        "j_call_top",
    ]
    compare_key.remove("v_frameshift")
    df = df[compare_key].drop(ignore, axis=1)
    # Use astype with a column mapping rather than `df.loc[:, cols] = ...`:
    # recent pandas writes `.loc` assignments into the existing columns, which
    # keeps the old dtype and silently discards the integer cast.
    df = df.astype({col: "Int64" for col in starts_and_ends})
    # The gene name is the allele call with everything from "*" on removed
    # (e.g. "IGHV4-39*01" -> "IGHV4-39").
    for segment in ("v", "d", "j"):
        call_column = f"{segment}_call_top"
        df.insert(
            df.columns.get_loc(call_column),
            f"{segment}_gene_top",
            df[call_column].str.split("*").str.get(0),
        )
    return df


def make_imgt_comparable(df):
    """Normalise an IMGT "AIRR" export so it lines up with SADIE output.

    Parameters
    ----------
    df : pandas.DataFrame
        IMGT table; must contain every ``constants.IGBLAST_AIRR`` column
        except ``v_frameshift``.

    Returns
    -------
    pandas.DataFrame
        A new frame in which: boolean flags given as "T"/"F" become
        ``True``/``False``; sequence columns are upper-cased with "." removed;
        ``*_call_top`` / ``*_gene_top`` columns are derived from the first
        call of each segment; the ``ignore`` columns are dropped; and the
        ``starts_and_ends`` columns are cast to nullable ``Int64``.
    """

    def map_bool(mapp_df, col):
        # NOTE(review): per the pandas docs, na_action="ignore" propagates
        # missing values without looking them up, so the `nan: False` entry
        # looks like it is never applied -- confirm whether NaN should really
        # become False here.
        return mapp_df.loc[:, col].map(
            {"T": True, "F": False, True: True, False: False, nan: False},
            na_action="ignore",
        )

    compare_key = list(constants.IGBLAST_AIRR.keys())
    compare_key.remove("v_frameshift")
    df = df[compare_key].copy()
    bool_cols = ["vj_in_frame", "productive", "rev_comp", "complete_vdj", "stop_codon"]
    for col in bool_cols:
        df[col] = map_bool(df, col)

    upper_columns = [
        "fwr1",
        "cdr1",
        "fwr2",
        "cdr2",
        "fwr3",
        "cdr3",
        "fwr4",
        "germline_alignment",
        "germline_alignment_aa",
        "sequence_alignment",
        "sequence_alignment_aa",
        "v_germline_alignment",
        "v_germline_alignment_aa",
        "v_sequence_alignment",
        "v_sequence_alignment_aa",
        "d_germline_alignment",
        "d_germline_alignment_aa",
        "d_sequence_alignment",
        "d_sequence_alignment_aa",
        "j_germline_alignment",
        "j_germline_alignment_aa",
        "j_sequence_alignment",
        "j_sequence_alignment_aa",
        "np1",
        "np2",
        "junction",
        "sequence",
    ]
    for col in upper_columns:
        # regex=False: "." must be removed as a literal character. Leaving the
        # default in place is version dependent and triggers a FutureWarning.
        df[col] = df[col].str.upper().str.replace(".", "", regex=False)

    # V and J calls: take the first comma-separated call and keep its second
    # whitespace-separated token. D calls: keep the text after the last "_".
    df["v_call_top"] = df["v_call"].str.split(",").str.get(0).str.split().str.get(1)
    df["v_gene_top"] = df["v_call_top"].str.split("*").str.get(0)
    df["d_call_top"] = df["d_call"].str.split(",").str.get(0).str.split("_").str.get(-1)
    df["d_gene_top"] = df["d_call_top"].str.split("*").str.get(0)
    df["j_call_top"] = df["j_call"].str.split(",").str.get(0).str.split().str.get(1)
    df["j_gene_top"] = df["j_call_top"].str.split("*").str.get(0)
    df = df.drop(ignore, axis=1)
    # Use astype with a column mapping rather than `df.loc[:, cols] = ...`:
    # recent pandas writes `.loc` assignments into the existing columns, which
    # keeps the old dtype and silently discards the integer cast.
    df = df.astype({col: "Int64" for col in starts_and_ends})
    return df

In [19]:
# FASTA fixture to annotate. NOTE(review): this file is written by the
# "Read OAS" cells further down the notebook, so on a fresh checkout those
# cells may need to run first -- confirm.
sub_sample_file = "../../tests/integration/airr/fixtures/OAS_subsample.fasta"
# SADIE annotator configured for human sequences against the IMGT database.
airr_api = Airr(species="human", database="imgt", functional="all")
# Annotate every record in the FASTA file.
sadie_airr = airr_api.run_file(sub_sample_file)



In [5]:
# Reduce the SADIE result to the columns that are compared against IMGT.
sadie_comparable = make_sadie_comparable(sadie_airr)

In [8]:
# Tab-separated IMGT export for the same fixture sequences.
imgt_airr = "../../tests/integration/airr/fixtures/OAS_airr_from_imgtvquest.tsv"
imgt_df = pd.read_csv(imgt_airr, delimiter="\t", low_memory=False)

# Normalise the IMGT table so its columns line up with `sadie_comparable`.
imgt_comparable = make_imgt_comparable(imgt_df)

  df[col] = df[col].str.upper().str.replace(".", "")


In [117]:
# Cache both comparable frames on disk. The file names (including the
# "compariable" spelling) must stay in sync with the read_feather calls in
# the "Compare" section.
sadie_comparable.to_feather("sadie_compariable.feather")
imgt_comparable.to_feather("imgt_compariable.feather")

### Only sequences with good ANARCI annotations

In [34]:
# FASTA fixture restricted to the "good ANARCI" subsample.
sub_sample_file_good_anarci = (
    "../../tests/integration/airr/fixtures/OAS_subsample_good_anarci.fasta"
)
airr_api = Airr(species="human", database="imgt", functional="all")
# Annotate the good-ANARCI file itself. The original passed `sub_sample_file`,
# which only pointed at this file when an unrelated cell had reassigned it.
sadie_airr_good_anarci = airr_api.run_file(sub_sample_file_good_anarci)

In [35]:
# Show the annotated good-ANARCI table (pandas truncates it to the configured display limits).
sadie_airr_good_anarci

Unnamed: 0,sequence_id,sequence,species,locus,stop_codon,vj_in_frame,v_frameshift,productive,rev_comp,complete_vdj,v_call_top,v_call,d_call_top,d_call,j_call_top,j_call,sequence_alignment,germline_alignment,sequence_alignment_aa,germline_alignment_aa,v_alignment_start,v_alignment_end,d_alignment_start,d_alignment_end,j_alignment_start,j_alignment_end,v_sequence_alignment,v_sequence_alignment_aa,v_germline_alignment,v_germline_alignment_aa,d_sequence_alignment,d_sequence_alignment_aa,d_germline_alignment,d_germline_alignment_aa,j_sequence_alignment,j_sequence_alignment_aa,j_germline_alignment,j_germline_alignment_aa,fwr1,fwr1_aa,cdr1,cdr1_aa,fwr2,fwr2_aa,cdr2,cdr2_aa,fwr3,fwr3_aa,fwr4,fwr4_aa,cdr3,cdr3_aa,junction,junction_length,junction_aa,junction_aa_length,v_score,d_score,j_score,v_cigar,d_cigar,j_cigar,v_support,d_support,j_support,v_identity,d_identity,j_identity,v_sequence_start,v_sequence_end,v_germline_start,v_germline_end,d_sequence_start,d_sequence_end,d_germline_start,d_germline_end,j_sequence_start,j_sequence_end,j_germline_start,j_germline_end,fwr1_start,fwr1_end,cdr1_start,cdr1_end,fwr2_start,fwr2_end,cdr2_start,cdr2_end,fwr3_start,fwr3_end,fwr4_start,fwr4_end,cdr3_start,cdr3_end,np1,np1_length,np2,np2_length,vdj_nt,vdj_aa
0,0_SRR11937609_igblastn_anarci_Heavy_IGHD_55,CAGTTACAATATGGTCTTGGGGGATGCTTTCTGAGAGTCATGGATC...,human,IGH,False,True,False,True,False,True,IGHV4-39*01,IGHV4-39*01,IGHD3-3*01,IGHD3-3*01,IGHJ4*02,IGHJ4*02,CAGCTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGG...,CAGCTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGG...,QLQLQESGPGLVKPSETLSLTCTVSGGSISSSSYYWGWIRQPPGKG...,QLQLQESGPGLVKPSETLSLTCTVSGGSISSSSYYWGWIRQPPGKG...,1,297,308,333,342,382,CAGCTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGG...,QLQLQESGPGLVKPSETLSLTCTVSGGSISSSSYYWGWIRQPPGKG...,CAGCTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGG...,QLQLQESGPGLVKPSETLSLTCTVSGGSISSSSYYWGWIRQPPGKG...,ACGATTTTTGGAGTGGTTATTATACC,DFWSGYYT,ACGATTTTTGGAGTGGTTATTATACC,DFWSGYYT,TGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,DYWGQGTLVTVSS,TGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,DYWGQGTLVTVSS,CAGCTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGG...,QLQLQESGPGLVKPSETLSLTCTVS,GGTGGCTCCATCAGCAGTAGTAGTTACTAC,GGSISSSSYY,TGGGGCTGGATCCGCCAGCCCCCAGGGAAGGGGCTGGAGTGGATTG...,WGWIRQPPGKGLEWIGS,ATCTATTATAGTGGGAGCACC,IYYSGST,TACTACAACCCGTCCCTCAAGAGTCGAGTCACCATATCCGTAGACA...,YYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYC,TGGGGCCAGGGAACCCTGGTCACCGTCTCCTCA,WGQGTLVTVSS,GCGAGAGGCACTACTCACGATTTTTGGAGTGGTTATTATACCGGCC...,ARGTTHDFWSGYYTGPADY,TGTGCGAGAGGCACTACTCACGATTTTTGGAGTGGTTATTATACCG...,63,CARGTTHDFWSGYYTGPADYW,21,566.00,50.687500,66.56250,117S297M121S2N,424S5N26M85S,458S7N41M36S,3.780000e-163,1.837000e-10,3.858000e-15,99.6875,100.0,100.000,118,414,1,297,425,450,6.0,31.0,459,499,8,48,118,192,193,222,223,273,274,294,295,408,466,498,409,465,GGCACTACTC,10,GGCCCAGC,8,CAGCTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGG...,QLQLQESGPGLVKPSETLSLTCTVSGGSISSSSYYWGWIRQPPGKG...
1,1_SRR11937601_igblastn_anarci_Heavy_IGHD_552,GCTTTAACCTAAGTTCTTGGGGGAGCTCTGAGAGAGGAGCCCAGCC...,human,IGH,False,True,False,True,False,True,IGHV3-9*01,IGHV3-9*01,IGHD4-17*01,"IGHD4-17*01,IGHD4-23*01",IGHJ3*02,IGHJ3*02,GAGGTGCAGCTGGTGGAGTCTGGGGGAGGCTTGGTACAGCCTGGCA...,GAAGTGCAGCTGGTGGAGTCTGGGGGAGGCTTGGTACAGCCTGGCA...,EVQLVESGGGLVQPGRSLRLSCAASGFTFDDYAMHWVRQAPGKGLE...,EVQLVESGGGLVQPGRSLRLSCAASGFTFDDYAMHWVRQAPGKGLE...,1,295,301,310,322,367,GAGGTGCAGCTGGTGGAGTCTGGGGGAGGCTTGGTACAGCCTGGCA...,EVQLVESGGGLVQPGRSLRLSCAASGFTFDDYAMHWVRQAPGKGLE...,GAAGTGCAGCTGGTGGAGTCTGGGGGAGGCTTGGTACAGCCTGGCA...,EVQLVESGGGLVQPGRSLRLSCAASGFTFDDYAMHWVRQAPGKGLE...,GACTACGGTG,DYG,GACTACGGTG,DYG,GCTTTTGATATCTGGGGCCAAGGGACAATGGTCACCGTCTCTTCAG,AFDIWGQGTMVTVSS,GCTTTTGATATCTGGGGCCAAGGGACAATGGTCACCGTCTCTTCAG,AFDIWGQGTMVTVSS,GAGGTGCAGCTGGTGGAGTCTGGGGGAGGCTTGGTACAGCCTGGCA...,EVQLVESGGGLVQPGRSLRLSCAAS,GGATTCACCTTTGATGATTATGCC,GFTFDDYA,ATGCACTGGGTCCGGCAAGCTCCAGGGAAGGGCCTGGAGTGGGTCT...,MHWVRQAPGKGLEWVSG,ATTAGTTGGAATAGTGGTAGCATA,ISWNSGSI,GGCTATGCGGACTCTGTGAAGGGCCGATTCACCATCTCCAGAGACA...,GYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTALYYC,TGGGGCCAAGGGACAATGGTCACCGTCTCTTCA,WGQGTMVTVSS,GCAAAAGCCTTAGACTACGGTGCCCCCGGTTGGGCTTTTGATATC,AKALDYGAPGWAFDI,TGTGCAAAAGCCTTAGACTACGGTGCCCCCGGTTGGGCTTTTGATA...,51,CAKALDYGAPGWAFDIW,17,562.00,19.906250,74.50000,159S295M108S3N,459S1N10M93S5N,480S4N46M36S,5.715000e-162,3.520000e-01,1.671000e-17,99.6875,100.0,100.000,160,454,1,295,460,469,2.0,11.0,481,526,5,50,160,234,235,258,259,309,310,333,334,447,493,525,448,492,CCTTA,5,CCCCCGGTTGG,11,GAGGTGCAGCTGGTGGAGTCTGGGGGAGGCTTGGTACAGCCTGGCA...,EVQLVESGGGLVQPGRSLRLSCAASGFTFDDYAMHWVRQAPGKGLE...
2,2_SRR11937602_igblastn_anarci_Heavy_IGHD_23,GACTTAATCTGAAAGCTTGGGGGATCACTCAACAACCACATCTGTC...,human,IGH,False,True,False,True,False,True,IGHV1-8*01,IGHV1-8*01,,,IGHJ4*02,IGHJ4*02,CAGGTGCAGCTGGTGCAGTCTGGGGCTGAGGTGAAGAAGCCTGGGG...,CAGGTGCAGCTGGTGCAGTCTGGGGCTGAGGTGAAGAAGCCTGGGG...,QVQLVQSGAEVKKPGASVKVSCKASGYTFTSYDINWVRQATGQGLE...,QVQLVQSGAEVKKPGASVKVSCKASGYTFTSYDINWVRQATGQGLE...,1,296,,,305,352,CAGGTGCAGCTGGTGCAGTCTGGGGCTGAGGTGAAGAAGCCTGGGG...,QVQLVQSGAEVKKPGASVKVSCKASGYTFTSYDINWVRQATGQGLE...,CAGGTGCAGCTGGTGCAGTCTGGGGCTGAGGTGAAGAAGCCTGGGG...,QVQLVQSGAEVKKPGASVKVSCKASGYTFTSYDINWVRQATGQGLE...,,,,,ACTACGGTGACCACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,YGDHWGQGTLVTVSS,ACTACTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,YFDYWGQGTLVTVSS,CAGGTGCAGCTGGTGCAGTCTGGGGCTGAGGTGAAGAAGCCTGGGG...,QVQLVQSGAEVKKPGASVKVSCKAS,GGATACACCTTCACCAGTTATGAT,GYTFTSYD,ATCAACTGGGTGCGACAGGCCACTGGACAAGGGCTTGAGTGGATGG...,INWVRQATGQGLEWMGW,ATGAACCCTAACAGTGGTAACACA,MNPNSGNT,GGCTATGCACAGAAGTTCCAGGGCAGAGTCACCATGACCAGGAACA...,GYAQKFQGRVTMTRNTSISTAYMELSSLRSEDTAVYYC,TGGGGCCAGGGAACCCTGGTCACCGTCTCCTCA,WGQGTLVTVSS,GCGAGAGGCCTCTCTGACTACGGTGACCAC,ARGLSDYGDH,TGTGCGAGAGGCCTCTCTGACTACGGTGACCACTGG,36,CARGLSDYGDHW,12,570.00,,68.12500,138S296M92S,,442S48M36S,2.584000e-164,,1.263000e-15,100.0000,,93.750,139,434,1,296,,,,,443,490,1,48,139,213,214,237,238,288,289,312,313,426,457,489,427,456,CCTCTCTG,8,,,CAGGTGCAGCTGGTGCAGTCTGGGGCTGAGGTGAAGAAGCCTGGGG...,QVQLVQSGAEVKKPGASVKVSCKASGYTFTSYDINWVRQATGQGLE...
3,3_SRR11937612_igblastn_anarci_Heavy_IGHD_43,GGAATTAAATACGGTCTTGGGGGGATCAGGACTGAACAGAGAGAAC...,human,IGH,False,True,False,True,False,True,IGHV3-23*01,"IGHV3-23*01,IGHV3-23D*01",IGHD3-10*01,"IGHD3-10*01,IGHD3-10*02,IGHD3-16*02",IGHJ4*02,IGHJ4*02,GAGGTGCAGCTGTTGGAGTCTGGGGGAGGCTTGGTACAGCCTGGGG...,GAGGTGCAGCTGTTGGAGTCTGGGGGAGGCTTGGTACAGCCTGGGG...,EVQLLESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLE...,EVQLLESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLE...,1,296,307,312,318,358,GAGGTGCAGCTGTTGGAGTCTGGGGGAGGCTTGGTACAGCCTGGGG...,EVQLLESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLE...,GAGGTGCAGCTGTTGGAGTCTGGGGGAGGCTTGGTACAGCCTGGGG...,EVQLLESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLE...,TCGGGG,SG,TCGGGG,SG,TGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,DYWGQGTLVTVSS,TGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,DYWGQGTLVTVSS,GAGGTGCAGCTGTTGGAGTCTGGGGGAGGCTTGGTACAGCCTGGGG...,EVQLLESGGGLVQPGGSLRLSCAAS,GGATTCACCTTTAGCAGCTATGCC,GFTFSSYA,ATGAGCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTCT...,MSWVRQAPGKGLEWVSA,ATTAGTGGTAGTGGTGGTAGCACA,ISGSGGST,TACTACGCAGACTCCGTGAAGGGCCGGTTCACCATCTCCAGAGACA...,YYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYC,TGGGGCCAGGGAACCCTGGTCACCGTCTCCTCA,WGQGTLVTVSS,GCGAAAGACGCCGCCTTATCGGGGGTGAGTGACTAC,AKDAALSGVSDY,TGTGCGAAAGACGCCGCCTTATCGGGGGTGAGTGACTACTGG,42,CAKDAALSGVSDYW,14,570.00,12.226562,66.56250,108S296M98S,414S13N6M82S12N,425S7N41M36S,2.463000e-164,6.481000e+01,3.611000e-15,100.0000,100.0,100.000,109,404,1,296,415,420,14.0,19.0,426,466,8,48,109,183,184,207,208,258,259,282,283,396,433,465,397,432,CGCCGCCTTA,10,GTGAG,5,GAGGTGCAGCTGTTGGAGTCTGGGGGAGGCTTGGTACAGCCTGGGG...,EVQLLESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLE...
4,4_Donor01+IGH-Clonotypes_igblastn_anarci_Heavy...,CAGGTGCAGTTGGTGCAGTCTGGGGCTGAGGTGAAGAAGCCTGGGG...,human,IGH,False,True,False,True,False,True,IGHV1-46*01,IGHV1-46*01,IGHD3-22*01,"IGHD3-22*01,IGHD3-3*01,IGHD3-3*02",IGHJ4*02,IGHJ4*02,CAGGTGCAGTTGGTGCAGTCTGGGGCTGAGGTGAAGAAGCCTGGGG...,CAGGTGCAGCTGGTGCAGTCTGGGGCTGAGGTGAAGAAGCCTGGGG...,QVQLVQSGAEVKKPGASVKLPCKASGYIFTKYSMHWVRQAPGQGLE...,QVQLVQSGAEVKKPGASVKVSCKASGYTFTSYYMHWVRQAPGQGLE...,1,293,295,301,322,367,CAGGTGCAGTTGGTGCAGTCTGGGGCTGAGGTGAAGAAGCCTGGGG...,QVQLVQSGAEVKKPGASVKLPCKASGYIFTKYSMHWVRQAPGQGLE...,CAGGTGCAGCTGGTGCAGTCTGGGGCTGAGGTGAAGAAGCCTGGGG...,QVQLVQSGAEVKKPGASVKVSCKASGYTFTSYYMHWVRQAPGQGLE...,GGTTATT,GY,GGTTATT,GY,TACTTTGACAACTGGGGCCAGGGAACCCTGGTCACCGTTTCCTCAG,YFDNWGQGTLVTVSS,TACTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,YFDYWGQGTLVTVSS,CAGGTGCAGTTGGTGCAGTCTGGGGCTGAGGTGAAGAAGCCTGGGG...,QVQLVQSGAEVKKPGASVKLPCKAS,GGATACATCTTCACCAAGTACAGT,GYIFTKYS,ATGCACTGGGTGCGACAGGCCCCTGGACAGGGGCTTGAGTGGATGG...,MHWVRQAPGQGLEWMGI,ATCAACTCTAGTGGTGGTGGCACA,INSSGGGT,AGCTTCGCACAGAAGTTCCAGGGCAGAGTCACCATGACCAGGGACA...,SFAQKFQGRVTMTRDTSTRIVYMELSSLRSEDTAVYYC,TGGGGCCAGGGAACCCTGGTCACCGTTTCCTCA,WGQGTLVTVSS,GCGAGGGGTTATTCTGATATTCGTGCTCATTCGTACTTTGACAAC,ARGYSDIRAHSYFDN,TGTGCGAGGGGTTATTCTGATATTCGTGCTCATTCGTACTTTGACA...,51,CARGYSDIRAHSYFDNW,17,471.75,14.148438,68.12500,293M74S3N,294S19N7M66S5N,321S2N46M,5.869000e-135,1.240000e+01,9.037000e-16,94.5625,100.0,95.625,1,293,1,293,295,301,20.0,26.0,322,367,3,48,1,75,76,99,100,150,151,174,175,288,334,366,289,333,G,1,CTGATATTCGTGCTCATTCG,20,CAGGTGCAGTTGGTGCAGTCTGGGGCTGAGGTGAAGAAGCCTGGGG...,QVQLVQSGAEVKKPGASVKLPCKASGYIFTKYSMHWVRQAPGQGLE...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23995,23995_SRR11937611_igblastn_anarci_Light_Bulk_4,TCCATCGGATGGGCTCTTGGGGGGGAGGAACTGCTCAGTTAGGACC...,human,IGK,False,True,False,True,False,True,IGKV3-11*01,IGKV3-11*01,,,IGKJ4*01,IGKJ4*01,GAAATTGTGTTGACACAGTCTCCAGCCACCCTGTCTTTGTCTCCAG...,GAAATTGTGTTGACACAGTCTCCAGCCACCCTGTCTTTGTCTCCAG...,EIVLTQSPATLSLSPGERATLSCRASQSVSSYLAWYQQKPGQAPRL...,EIVLTQSPATLSLSPGERATLSCRASQSVSSYLAWYQQKPGQAPRL...,1,284,,,286,322,GAAATTGTGTTGACACAGTCTCCAGCCACCCTGTCTTTGTCTCCAG...,EIVLTQSPATLSLSPGERATLSCRASQSVSSYLAWYQQKPGQAPRL...,GAAATTGTGTTGACACAGTCTCCAGCCACCCTGTCTTTGTCTCCAG...,EIVLTQSPATLSLSPGERATLSCRASQSVSSYLAWYQQKPGQAPRL...,,,,,CTCACTTTCGGCGGAGGGACCAAGGTGGAGATCAAAC,LTFGGGTKVEIK,CTCACTTTCGGCGGAGGGACCAAGGTGGAGATCAAAC,LTFGGGTKVEIK,GAAATTGTGTTGACACAGTCTCCAGCCACCCTGTCTTTGTCTCCAG...,EIVLTQSPATLSLSPGERATLSCRAS,CAGAGTGTTAGCAGCTAC,QSVSSY,TTAGCCTGGTACCAACAGAAACCTGGCCAGGCTCCCAGGCTCCTCA...,LAWYQQKPGQAPRLLIY,GATGCATCC,DAS,AACAGGGCCACTGGCATCCCAGCCAGGTTCAGTGGCAGTGGGTCTG...,NRATGIPARFSGSGSGTDFTLTISSLEPEDFAVYYC,TTCGGCGGAGGGACCAAGGTGGAGATCAAA,FGGGTKVEIK,CAGCAGCGTAGCAACTGGCCCCTCACT,QQRSNWPLT,TGTCAGCAGCGTAGCAACTGGCCCCTCACTTTC,33,CQQRSNWPLTF,11,546.50,,60.21875,117S284M76S3N,,402S1N37M38S,2.060000e-157,,2.773000e-13,100.0000,,100.000,118,401,1,284,,,,,403,439,2,38,118,195,196,213,214,264,265,273,274,381,409,438,382,408,C,1,,,GAAATTGTGTTGACACAGTCTCCAGCCACCCTGTCTTTGTCTCCAG...,EIVLTQSPATLSLSPGERATLSCRASQSVSSYLAWYQQKPGQAPRL...
23996,23996_ERR3664766_igblastn_anarci_Light_Bulk_62732,GAGAGCTCTGGGGAGTCTGCACCATGGCTTGGACCCCACTCCTCTT...,human,IGL,False,True,False,True,False,True,IGLV4-69*01,IGLV4-69*01,,,IGLJ3*02,IGLJ3*02,CAGCTTGTGCTGACTCAATCGCCCTCTGCCTCTGCCTCCCTGGGAG...,CAGCTTGTGCTGACTCAATCGCCCTCTGCCTCTGCCTCCCTGGGAG...,QLVLTQSPSASASLGASVKLTCTLSSGYNSYAIAWHQQQPGKGPRY...,QLVLTQSPSASASLGASVKLTCTLSSGHSSYAIAWHQQQPEKGPRY...,1,298,,,301,334,CAGCTTGTGCTGACTCAATCGCCCTCTGCCTCTGCCTCCCTGGGAG...,QLVLTQSPSASASLGASVKLTCTLSSGYNSYAIAWHQQQPGKGPRY...,CAGCTTGTGCTGACTCAATCGCCCTCTGCCTCTGCCTCCCTGGGAG...,QLVLTQSPSASASLGASVKLTCTLSSGHSSYAIAWHQQQPEKGPRY...,,,,,GTGTTCGGCGGAGGGACCAAGCTGACCGTCCTAG,VFGGGTKLTVL,GTGTTCGGCGGAGGGACCAAGCTGACCGTCCTAG,VFGGGTKLTVL,CAGCTTGTGCTGACTCAATCGCCCTCTGCCTCTGCCTCCCTGGGAG...,QLVLTQSPSASASLGASVKLTCTLS,AGTGGGTACAACAGCTACGCC,SGYNSYA,ATCGCATGGCATCAGCAGCAGCCAGGGAAGGGCCCTCGGTACTTGA...,IAWHQQQPGKGPRYLMR,CTTAACAGTGATGGCAGCCAC,LNSDGSH,ATCAGAGGGGACGGGATCCCTGATCGCTTCTCAGGCTCCAGCTCTG...,IRGDGIPDRFSGSSSGAERYLIISSLQSVDEADYYC,TTCGGCGGAGGGACCAAGCTGACCGTCCTA,FGGGTKLTVL,CAGACCTGGGGCACTGGCATTCTAGTG,QTWGTGILV,TGTCAGACCTGGGGCACTGGCATTCTAGTGTTC,33,CQTWGTGILVF,11,516.00,,55.46875,83S298M101S1N,,383S4N34M65S,3.795000e-148,,7.570000e-12,96.6250,,100.000,84,381,1,298,,,,,384,417,5,38,84,158,159,179,180,230,231,251,252,359,387,416,360,386,TA,2,,,CAGCTTGTGCTGACTCAATCGCCCTCTGCCTCTGCCTCCCTGGGAG...,QLVLTQSPSASASLGASVKLTCTLSSGYNSYAIAWHQQQPGKGPRY...
23997,23997_Donor01+IGK-SomaticVariants_igblastn_ana...,GAAATAGTGATGACGCAGTCACCAGCCACCCTGTCAGTGTCTCCAG...,human,IGK,False,True,False,True,False,True,IGKV3-15*01,IGKV3-15*01,,,IGKJ4*01,IGKJ4*01,GAAATAGTGATGACGCAGTCACCAGCCACCCTGTCAGTGTCTCCAG...,GAAATAGTGATGACGCAGTCTCCAGCCACCCTGTCTGTGTCTCCAG...,EIVMTQSPATLSVSPGETTRLSCRASQSINSDVAWYQQKVGQTPRL...,EIVMTQSPATLSVSPGERATLSCRASQSVSSNLAWYQQKPGQAPRL...,1,287,,,290,325,GAAATAGTGATGACGCAGTCACCAGCCACCCTGTCAGTGTCTCCAG...,EIVMTQSPATLSVSPGETTRLSCRASQSINSDVAWYQQKVGQTPRL...,GAAATAGTGATGACGCAGTCTCCAGCCACCCTGTCTGTGTCTCCAG...,EIVMTQSPATLSVSPGERATLSCRASQSVSSNLAWYQQKPGQAPRL...,,,,,TCACTTTCGGCGGAGGGACCAAGGTGGAGATCAAAC,TFGGGTKVEIK,TCACTTTCGGCGGAGGGACCAAGGTGGAGATCAAAC,TFGGGTKVEIK,GAAATAGTGATGACGCAGTCACCAGCCACCCTGTCAGTGTCTCCAG...,EIVMTQSPATLSVSPGETTRLSCRAS,CAGAGTATTAACAGCGAC,QSINSD,GTAGCCTGGTACCAACAGAAAGTTGGCCAGACTCCCAGGCTCCTCA...,VAWYQQKVGQTPRLLIH,GGTGCTTCC,GAS,ACCAGGGCCACTGGTATCCCAGCCAGGTTCAGTGGCAGTGGGTCTG...,TRATGIPARFSGSGSGTEFTLTISSLQSEDFAVYYC,TTCGGCGGAGGGACCAAGGTGGAGATCAAA,FGGGTKVEIK,CAGCAGTATAATAATTGGCCTCCGGTCACT,QQYNNWPPVT,TGTCAGCAGTATAATAATTGGCCTCCGGTCACTTTC,36,CQQYNNWPPVTF,12,454.50,,58.65625,287M38S,,289S2N36M,8.365000e-130,,5.795000e-13,94.0625,,100.000,1,287,1,287,,,,,290,325,3,38,1,78,79,96,97,147,148,156,157,264,295,324,265,294,GG,2,,,GAAATAGTGATGACGCAGTCACCAGCCACCCTGTCAGTGTCTCCAG...,EIVMTQSPATLSVSPGETTRLSCRASQSINSDVAWYQQKVGQTPRL...
23998,23998_SRR11937598_igblastn_anarci_Light_Bulk_39,ATATGATGGGGAACATCTTGGGGGAGCCCCAGCTCTGGCACCAGGG...,human,IGL,False,True,False,True,False,True,IGLV7-46*01,IGLV7-46*01,,,IGLJ2*01,"IGLJ2*01,IGLJ3*01",CAGGCTGTGGTGACTCAGGAGACCTCACTGACTGTGTCCCCAGGAG...,CAGGCTGTGGTGACTCAGGAGCCCTCACTGACTGTGTCCCCAGGAG...,QAVVTQETSLTVSPGGTVTLTCGSSTGAVTSGHFPYWFQQKPGQAP...,QAVVTQEPSLTVSPGGTVTLTCGSSTGAVTSGHYPYWFQQKPGQAP...,1,272,,,293,328,CAGGCTGTGGTGACTCAGGAGACCTCACTGACTGTGTCCCCAGGAG...,QAVVTQETSLTVSPGGTVTLTCGSSTGAVTSGHFPYWFQQKPGQAP...,CAGGCTGTGGTGACTCAGGAGCCCTCACTGACTGTGTCCCCAGGAG...,QAVVTQEPSLTVSPGGTVTLTCGSSTGAVTSGHYPYWFQQKPGQAP...,,,,,TGGTATTCGGCGGAGGGACCAAGCTGACCGTCCTAG,VFGGGTKLTVL,TGGTATTCGGCGGAGGGACCAAGCTGACCGTCCTAG,VFGGGTKLTVL,CAGGCTGTGGTGACTCAGGAGACCTCACTGACTGTGTCCCCAGGAG...,QAVVTQETSLTVSPGGTVTLTCGSS,ACTGGAGCTGTCACCAGTGGTCATTTT,TGAVTSGHF,CCCTACTGGTTCCAGCAGAAGCCTGGCCAAGCCCCCAGGACACTGA...,PYWFQQKPGQAPRTLIY,GATACAAGC,DTS,AACAAACACTCCTGGACACCTGCCCGTTTCTCAGACTCCCTCCTTG...,NKHSWTPARFSDSLLGGKAALTLSGAQPEDEAEYYC,TTCGGCGGAGGGACCAAGCTGACCGTCCTA,FGGGTKLTVL,TTACTAACCTACATTGGCGCCATGGTA,LLTYIGAMV,TGCTTACTAACCTACATTGGCGCCATGGTATTC,33,CLLTYIGAMVF,11,500.50,,58.65625,124S272M97S22N,,416S2N36M41S,1.658000e-143,,8.611000e-13,98.5000,,100.000,125,396,1,272,,,,,417,452,3,38,125,199,200,226,227,277,278,286,287,394,422,451,395,421,ACTAACCTACATTGGCGCCA,20,,,CAGGCTGTGGTGACTCAGGAGACCTCACTGACTGTGTCCCCAGGAG...,QAVVTQETSLTVSPGGTVTLTCGSSTGAVTSGHFPYWFQQKPGQAP...


# Compare

In [3]:
# Reload the cached comparable frames written by the to_feather cell above,
# so the comparison can start without re-running the annotation.
sadie_comparable = pd.read_feather("sadie_compariable.feather")

imgt_comparable = pd.read_feather("imgt_compariable.feather")

In [4]:
# Columns compared between IMGT and SADIE, assembled group by group:
# sequence-level fields, top call/gene per segment, nucleotide regions, and
# amino-acid regions (fwr4 appears only in its amino-acid form).
check_these = (
    [
        "sequence",
        "locus",
        "stop_codon",
        "vj_in_frame",
        "productive",
        "rev_comp",
        "complete_vdj",
        "v_call",
    ]
    + [f"{segment}_{level}_top" for segment in "vdj" for level in ("call", "gene")]
    + ["fwr1", "cdr1", "fwr2", "cdr2", "fwr3", "cdr3"]
    + [
        f"{region}_aa"
        for region in ("fwr1", "cdr1", "fwr2", "cdr2", "fwr3", "cdr3", "fwr4")
    ]
)

In [5]:
# Element-wise agreement table: True where IMGT and SADIE hold the same value.
# Missing values never compare equal, so a cell that is NaN on both sides is False.
# NOTE(review): assumes both frames share the same row index -- confirm.
compare = imgt_comparable[check_these] == sadie_comparable[check_these]

## Stop Codon Corrections


These are the rows where IMGT and SADIE disagree on the `stop_codon` call.

In [6]:
# Index labels of the rows whose `stop_codon` values differ between the two tools.
disagreements_at_stop_codon_index = compare[~compare["stop_codon"]].index

In [7]:
# The disagreeing rows as seen by IMGT ...
disagreements_at_stop_codon_imgt = imgt_comparable.loc[
    disagreements_at_stop_codon_index
]

# ... and the same rows as seen by SADIE.
disagreements_at_stop_codon_sadie = sadie_comparable.loc[
    disagreements_at_stop_codon_index
]

In [17]:
from sadie.antibody import AntibodyChainNT


def render_ab_object(row):
    """Build an ``AntibodyChainNT`` from one row of a comparable table.

    ``row`` must provide ``sequence_id``, the nucleotide regions ``fwr1`` to
    ``fwr4`` and ``cdr1`` to ``cdr3``, and the ``v_call_top`` / ``j_call_top``
    calls. The species is fixed to "human".
    """
    region_names = ("fwr1", "cdr1", "fwr2", "cdr2", "fwr3", "cdr3", "fwr4")
    chain_kwargs = {"name": row["sequence_id"]}
    # Each region column feeds the constructor argument of the same name + "_nt".
    chain_kwargs.update((f"{region}_nt", row[region]) for region in region_names)
    chain_kwargs["v_gene"] = row["v_call_top"]
    chain_kwargs["j_gene"] = row["j_call_top"]
    chain_kwargs["species"] = "human"
    return AntibodyChainNT(**chain_kwargs)


# Among the stop_codon disagreements, keep the rows whose IMGT amino-acid
# alignment contains a literal "*". regex=False avoids the invalid "\*" escape
# of the original pattern; na=False keeps rows with a missing alignment from
# producing a NaN mask, which cannot be used for boolean indexing.
dis_w_stop_codon_string_index = disagreements_at_stop_codon_imgt[
    disagreements_at_stop_codon_imgt["sequence_alignment_aa"].str.contains(
        "*", regex=False, na=False
    )
].index

# Page through the rows one at a time, printing the segmented nucleotide and
# amino-acid alignments as rendered from the IMGT and from the SADIE
# annotation. Press Enter to advance; interrupting the kernel stops the loop.
# `imgt_object`, `sadie_object`, `row` and `i` stay defined afterwards for
# inspection in later cells.
for i in dis_w_stop_codon_string_index:
    row = imgt_comparable.loc[i].fillna("")
    imgt_object = render_ab_object(row)
    print(f"IMGT\n{imgt_object.get_segmented_alignment_nt()}")
    print(f"\n{imgt_object.get_segmented_alignment_aa()}")

    row = sadie_comparable.loc[i].fillna("")
    sadie_object = render_ab_object(row)
    print(f"SADIE\n{sadie_object.get_segmented_alignment_nt()}")
    print(f"\n{sadie_object.get_segmented_alignment_aa()}")

    input()
    clear_output()

IMGT
IGHV4-59*05|IGHJ4*02                              CAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCACTGTCTCT G------GTGGCTCCATCAGTAGT
2684_SRR12190270_igblastn_anar                    ------------------------------------------........................T.CAGACTG .TGTCGT.GAA...AGG.GCAGA.

IGHV4-59*05|IGHJ4*02                              TACTAC TGGAGCTGGATCCGGCAGCCGCCGGGGAAGGGACTGGAGTGGATTGGGCGT ATCTATTATAGTGGGAGCACC TACTACAACCCGTCCCTCA
2684_SRR12190270_igblastn_anar                    CGGA.G A.C.C.C.TC.GAAC.CCAGT.ACT.ACCAATCTC.T.TGCCG.CTTCT.C T.GA.AA.A.AAA........ ............A......

IGHV4-59*05|IGHJ4*02                              AGAGTCGAGTCACCATATCCGTAGACACGTCCAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACCGCCGCAGACACGGCTGTGTATTACTGT GCG-
2684_SRR12190270_igblastn_anar                    .........................................C..................................................... ...A

IGHV4-59*05|IGHJ4*02                              ------------------------------------

KeyboardInterrupt: Interrupted by user

In [18]:
# Inspect the last SADIE chain object built by the stop-codon viewer loop.
sadie_object

2684_SRR12190270_igblastn_anarci_Heavy_Bulk_1127
FrameWork1NT 1-75:---------------------------------------------------------------------------
CDR1NT 76-105:------------------------------
FrameWork2NT 106-122:CGTATGCCGTCTTCTGC
CDR2NT 123-143:TTGAAAAAAAAAAGGAGCACC
FrameWork3NT 144-257:TACTACAACCCGACCCTCAAGAGTCGAGTCACCATATCCGTAGACACGTCCAAGAACCAGCTCTCCCTGAAGCTGAGCTCTGTGACCGCCGCAGACACGGCTGTGTATTACTGT
CDR3NT 258-317:GCGACTCTACCCTCAAGTATTACGATTTTTGGAGTGGTTCAGGGGTACTACTTTGACGAC
FrameWork4NT 318-350:TGGGGCCAGGGAACCCTGGTCACCGGCTCCTCA

In [9]:
# Inspect the last IMGT chain object built by the stop-codon viewer loop.
imgt_object

2125_Subject-15_igblastn_anarci_Heavy_IGHA_162
FrameWork1NT 1-0:
CDR1NT 1-0:
FrameWork2NT 1-33:CTGGGGGGCCCTTAAGACCCCCCTNTCACCCAC
CDR2NT 34-57:CATATCAGGTTTCGGCGCCTTTCC
FrameWork3NT 58-171:TATGAACTGGACTCCGTGAAGGGCCGATTCACCATCTCCAGAGACAACTCCAAGGACACGGTGTATCTGCAAATGGACAGCCTGAGACCTGAGGACACGGCTGTGTACTACTGT
CDR3NT 172-207:GCGAGGACAGTTGCAGTGGGTTGGTACTGTGACTAC
FrameWork4NT 208-241:TGGGGCCAGGGAACCCAGGTCACCGTCTCCTCAC

In [95]:
# NOTE(review): `imgt_mab` is not defined anywhere in the visible notebook;
# this cell appears to depend on state from a deleted or unseen cell -- confirm.
print(imgt_mab.get_segmented_alignment_nt())

IGHV5-51*01|IGHJ4*01  GAGGTGCAGCTGGTGCAGTCTGGAGCAGAGGTGAAAAAGCCCGGGGAGTCTCTGAAGATCTCCTGTAAGGGTTCT GGATACAGCTTTACCAGCTACTGG
somecrap              ACAC.TTT.TAAAAT.-------------------------------T.GGGG..................C... .......C....C...AT......

IGHV5-51*01|IGHJ4*01   ATCGGCTGGGTGCGCCAGATGCCCGGGAAAGGCCTGGAGTGGATGGGGATC ATCTATCCTGGTGACTCTGATACC AGATACAGCCCGTCCTTCCAAG
somecrap               ................................................G.. .....C....A..........G.. ........T........TG.G.

IGHV5-51*01|IGHJ4*01  GCCAGGTCACCATCTCAGCCGACAAGTCCATCAGCACCGCCTACCTGCAGTGGAGCAGCCTGAAGGCCTCGGACACCGCCATGTATTACTGT GCGAGAC
somecrap              .T...............................C...............A.....G................................T... .......

IGHV5-51*01|IGHJ4*01  A-----------ACTACTTTGACTAC TGGGGCCAAGGAACCCTGGTCACCGTCTCCTCAG
somecrap              GGAGAGATGGCT..A........... ........G.A.G............C......G.




In [74]:
# Pull one IMGT row for inspection. `i` is whatever value the most recently
# executed loop left behind, so the selected row depends on execution order.
single_entry = imgt_comparable.loc[i]

In [84]:
# Top J gene (allele suffix removed) of the selected IMGT row.
single_entry["j_gene_top"]

'IGHJ4'

In [167]:
# NOTE: fwr4 cannot be trusted in this comparison because it carries the extra nucleotide from the J call.

In [172]:
pd.set_option("display.max_rows", 100)

# Rows where the top V gene differs between IMGT and SADIE. The mask and the
# two selections are computed once here instead of on every loop iteration.
v_gene_mismatches = compare[~compare["v_gene_top"]]
imgt_v_gene_mismatches = imgt_comparable.loc[v_gene_mismatches.index]
sadie_v_gene_mismatches = sadie_comparable.loc[v_gene_mismatches.index]

# Page through mismatches 6..19 (positional), one per Enter press. The upper
# bound is clamped so fewer than 20 mismatches does not raise an IndexError.
for i in range(6, min(20, len(v_gene_mismatches))):
    # Three columns side by side: IMGT value, SADIE value, agreement flag.
    single_entry_diff = (
        imgt_v_gene_mismatches.iloc[i]
        .to_frame()
        .join(
            sadie_v_gene_mismatches.iloc[i].to_frame(),
            rsuffix="_sadie",
            lsuffix="_imgt",
        )
        .join(v_gene_mismatches.iloc[i].to_frame())
        .fillna("")
    ).loc[check_these]
    print(imgt_v_gene_mismatches.iloc[i]["sequence"])
    display(HTML(single_entry_diff._repr_html_()))
    input()
    clear_output()

CGGGGAGTCTCTGAAGATCTCCTGTGCAGCGTCTGGATTCACGTTCAGTTATTTTGGCATGCACTGGGTCCGCCAGGCTCCAGGCAAGGGACTGGAGTGGGTGGCACTGATAAAGTATGATGGAAGTAATACATACCACGCAGACTCCGTGAAGGGCCGATTCACCATCTCCAGAGACAATTCCAAGAACACACTATATTTGCAAATGAACAGCCTGAGACCCGAGGACACGGCCATATATTACTGTGCGAAAGAGATGGATATGCGGTGGCTACAATTTCACGATTTCGGCTGCTTTGACTCCTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGCCTCCACCAAGGGCCCATCGGTCTTCCCCCTGGCGCCCTGCTCCAGGAGCACCTCTGGGGGCACAGCGGCCCTGGGCTGCCTG


Unnamed: 0,431_imgt,431_sadie,431
sequence,CGGGGAGTCTCTGAAGATCTCCTGTGCAGCGTCTGGATTCACGTTC...,CGGGGAGTCTCTGAAGATCTCCTGTGCAGCGTCTGGATTCACGTTC...,True
locus,IGH,IGH,True
stop_codon,True,False,False
vj_in_frame,True,True,True
productive,True,True,True
rev_comp,False,False,True
complete_vdj,False,False,True
v_call,"Homsap IGHV3-30*02 F, or Homsap IGHV3-30-5*02 ...",IGHV3-33*06,False
v_call_top,IGHV3-30*02,IGHV3-33*06,False
v_gene_top,IGHV3-30,IGHV3-33,False


KeyboardInterrupt: Interrupted by user

In [175]:
# Re-annotate a single sequence (the one printed by the mismatch viewer above)
# under the id "test" to inspect SADIE's full output for it.
airr_api.run_single(
    "test",
    "CGGGGAGTCTCTGAAGATCTCCTGTGCAGCGTCTGGATTCACGTTCAGTTATTTTGGCATGCACTGGGTCCGCCAGGCTCCAGGCAAGGGACTGGAGTGGGTGGCACTGATAAAGTATGATGGAAGTAATACATACCACGCAGACTCCGTGAAGGGCCGATTCACCATCTCCAGAGACAATTCCAAGAACACACTATATTTGCAAATGAACAGCCTGAGACCCGAGGACACGGCCATATATTACTGTGCGAAAGAGATGGATATGCGGTGGCTACAATTTCACGATTTCGGCTGCTTTGACTCCTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGCCTCCACCAAGGGCCCATCGGTCTTCCCCCTGGCGCCCTGCTCCAGGAGCACCTCTGGGGGCACAGCGGCCCTGGGCTGCCTG",
)

Unnamed: 0,sequence_id,sequence,species,locus,stop_codon,vj_in_frame,v_frameshift,productive,rev_comp,complete_vdj,v_call_top,v_call,d_call_top,d_call,j_call_top,j_call,sequence_alignment,germline_alignment,sequence_alignment_aa,germline_alignment_aa,v_alignment_start,v_alignment_end,d_alignment_start,d_alignment_end,j_alignment_start,j_alignment_end,v_sequence_alignment,v_sequence_alignment_aa,v_germline_alignment,v_germline_alignment_aa,d_sequence_alignment,d_sequence_alignment_aa,d_germline_alignment,d_germline_alignment_aa,j_sequence_alignment,j_sequence_alignment_aa,j_germline_alignment,j_germline_alignment_aa,fwr1,fwr1_aa,cdr1,cdr1_aa,fwr2,fwr2_aa,cdr2,cdr2_aa,fwr3,fwr3_aa,fwr4,fwr4_aa,cdr3,cdr3_aa,junction,junction_length,junction_aa,junction_aa_length,v_score,d_score,j_score,v_cigar,d_cigar,j_cigar,v_support,d_support,j_support,v_identity,d_identity,j_identity,v_sequence_start,v_sequence_end,v_germline_start,v_germline_end,d_sequence_start,d_sequence_end,d_germline_start,d_germline_end,j_sequence_start,j_sequence_end,j_germline_start,j_germline_end,fwr1_start,fwr1_end,cdr1_start,cdr1_end,fwr2_start,fwr2_end,cdr2_start,cdr2_end,fwr3_start,fwr3_end,fwr4_start,fwr4_end,cdr3_start,cdr3_end,np1,np1_length,np2,np2_length,vdj_nt,vdj_aa
0,test,CGGGGAGTCTCTGAAGATCTCCTGTGCAGCGTCTGGATTCACGTTC...,human,IGH,False,True,False,True,False,False,IGHV3-33*06,IGHV3-33*06,IGHD5-24*01,IGHD5-24*01,IGHJ4*02,IGHJ4*02,CGGGGAGTCTCTGAAGATCTCCTGTGCAGCGTCTGGATTCACGTTC...,TGGGAGGTCCCTGAGACTCTCCTGTGCAGCGTCTGGATTCACCTTC...,GESLKISCAASGFTFSYFGMHWVRQAPGKGLEWVALIKYDGSNTYH...,GRSLRLSCAASGFTFSSYGMHWVRQAPGKGLEWVAVIWYDGSNKYY...,1,255,269,279,292,338,CGGGGAGTCTCTGAAGATCTCCTGTGCAGCGTCTGGATTCACGTTC...,GESLKISCAASGFTFSYFGMHWVRQAPGKGLEWVALIKYDGSNTYH...,TGGGAGGTCCCTGAGACTCTCCTGTGCAGCGTCTGGATTCACCTTC...,GRSLRLSCAASGFTFSSYGMHWVRQAPGKGLEWVAVIWYDGSNKYY...,TGGCTACAATT,WLQ,TGGCTACAATT,WLQ,CTGCTTTGACTCCTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,CFDSWGQGTLVTVSS,CTACTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,YFDYWGQGTLVTVSS,CGGGGAGTCTCTGAAGATCTCCTGTGCAGCGTCT,GESLKISCAAS,GGATTCACGTTCAGTTATTTTGGC,GFTFSYFG,ATGCACTGGGTCCGCCAGGCTCCAGGCAAGGGACTGGAGTGGGTGG...,MHWVRQAPGKGLEWVAL,ATAAAGTATGATGGAAGTAATACA,IKYDGSNT,TACCACGCAGACTCCGTGAAGGGCCGATTCACCATCTCCAGAGACA...,YHADSVKGRFTISRDNSKNTLYLQMNSLRPEDTAIYYC,TGGGGCCAGGGAACCCTGGTCACCGTCTCCTCA,WGQGTLVTVSS,GCGAAAGAGATGGATATGCGGTGGCTACAATTTCACGATTTCGGCT...,AKEMDMRWLQFHDFGCFDS,TGTGCGAAAGAGATGGATATGCGGTGGCTACAATTTCACGATTTCG...,63,CAKEMDMRWLQFHDFGCFDSW,21,343.0,21.84375,69.75,41N255M166S,268S7N11M142S2N,291S1N47M83S,4.067e-96,0.06912,3.339e-16,89.4375,100.0,95.75,1,255,42,296,269,279,8.0,18.0,292,338,2,48,1,34,35,58,59,109,110,133,134,247,305,337,248,304,GATGGATATGCGG,13,TCACGATTTCGG,12,CGGGGAGTCTCTGAAGATCTCCTGTGCAGCGTCTGGATTCACGTTC...,GESLKISCAASGFTFSYFGMHWVRQAPGKGLEWVALIKYDGSNTYH...


In [176]:
# Show the annotator's repr, which prints the IgBLAST command line it is configured with.
airr_api

IgBLAST: env IGDATA=/Users/jordanwillis/repos/personal/sadie/src/sadie/airr/data/germlines/imgt/all/Ig /Users/jordanwillis/repos/personal/sadie/src/sadie/airr/bin/darwin/igblastn -min_D_match 5 -num_alignments_V 5 -num_alignments_J 3 -num_alignments_D 3 -organism human -ig_seqtype Ig -germline_db_V /Users/jordanwillis/repos/personal/sadie/src/sadie/airr/data/germlines/imgt/all/Ig/blastdb/human_V -germline_db_D /Users/jordanwillis/repos/personal/sadie/src/sadie/airr/data/germlines/imgt/all/Ig/blastdb/human_D -germline_db_J /Users/jordanwillis/repos/personal/sadie/src/sadie/airr/data/germlines/imgt/all/Ig/blastdb/human_J -auxiliary_data /Users/jordanwillis/repos/personal/sadie/src/sadie/airr/data/germlines/imgt/aux_db/human_gl.aux -outfmt 19 -domain_system imgt -word_size 11 -gapopen 5 -gapextend 2 -J_penalty -1 -V_penalty -2 -num_threads 12 -show_translation -extend_align5end -extend_align3end

In [112]:
# Show the `sequence` value of the row labelled 55 in `imgt_comparable`.
imgt_comparable.loc[55, "sequence"]

'CTTCGGAGACCCTGTCCCTCACCTGCGCTGTCTATGGTGGGTCCTTCAGTGGTTACTACTGGGGCTGGATCCGCCAGCCCCCAGGGAAGGGGCTGGAGTGGATTGGGAGTATCTATTATAGTGGGAGCACCTACTACAACCCGTCCCTCAAGAGTCGAGTCACCATATCCGTAGACACGTCCAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACCGCCGCAGACACGGCTGTGTATTACTGTGCGAACGGCCCTATGTATAGTGGGAGCTACGGGTGGTTCGACCCCTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGGTAAG'

## Read OAS

In [22]:
# Download the OAS subsample table. `index_col=0` followed by `reset_index()`
# turns the first CSV column back into a regular column and leaves a fresh
# positional index on the frame.
df_bz2 = pd.read_csv(
    "https://sadie.s3.us-east-2.amazonaws.com/integration/OAS_sample_subsample.bz2",
    index_col=0,
).reset_index()

# Write one FASTA record per row. The header is
# "<row number>_<data unit without the .csv.gz suffix>_<data index>".
sub_sample_file = "../../tests/integration/airr/fixtures/OAS_subsample.fasta"
with open(sub_sample_file, "w") as f:
    for record_number, (data_unit, data_index, sequence) in enumerate(
        zip(
            # A multi-character pattern is interpreted as a regular expression by
            # `Series.str.split`, so the dots are escaped to match only a literal
            # ".csv.gz" rather than any character in those positions.
            df_bz2["OAS_dataunit"].str.split(r"\.csv\.gz").str.get(0),
            df_bz2["OAS_dataindex"],
            df_bz2["sequence"],
        )
    ):
        f.write(f">{record_number}_{data_unit}_{data_index}\n{sequence}\n")

In [32]:
# Download the "good anarci" OAS subsample table. `index_col=0` followed by
# `reset_index()` turns the first CSV column back into a regular column and
# leaves a fresh positional index on the frame.
df_bz2 = pd.read_csv(
    "https://sadie.s3.us-east-2.amazonaws.com/integration/OAS_sample_subsample_good_anarci.csv.bz2",
    index_col=0,
).reset_index()

# Write one FASTA record per row. The header is
# "<row number>_<data unit without the .csv.gz suffix>_<data index>".
sub_sample_file = (
    "../../tests/integration/airr/fixtures/OAS_subsample_good_anarci.fasta"
)
with open(sub_sample_file, "w") as f:
    for record_number, (data_unit, data_index, sequence) in enumerate(
        zip(
            # A multi-character pattern is interpreted as a regular expression by
            # `Series.str.split`, so the dots are escaped to match only a literal
            # ".csv.gz" rather than any character in those positions.
            df_bz2["OAS_dataunit"].str.split(r"\.csv\.gz").str.get(0),
            df_bz2["OAS_dataindex"],
            df_bz2["sequence"],
        )
    ):
        f.write(f">{record_number}_{data_unit}_{data_index}\n{sequence}\n")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


## IMGT HighV-QUEST "AIRR" format

In [17]:
# Push the first 100 rows of `imgt_df`, indexed by `sequence_id`, to the "imgt"
# worksheet of the `spread` spreadsheet, replacing its contents and freezing
# the header row and index column.
# NOTE(review): in the visible part of this notebook `imgt_df` is assigned only
# in a cell further down (the read of the IMGT TSV fixture); confirm the cell
# order works on a fresh top-to-bottom run.
spread.df_to_sheet(
    imgt_df.head(100).set_index("sequence_id"),
    sheet="imgt",
    replace=True,
    freeze_headers=True,
    freeze_index=True,
)

In [None]:
# Push `spreadable_df`, indexed by `sequence_id`, to the "sadie" worksheet,
# replacing its contents and freezing the header row and index column.
# NOTE(review): the only assignment of `spreadable_df` in this cell is the
# commented-out line below, so the cell relies on a value already living in the
# kernel; presumably that line should be re-enabled. Confirm `sadie_airr` exists.
# spreadable_df = fillna(sadie_airr.table.head(100))
spread.df_to_sheet(
    spreadable_df.set_index("sequence_id"),
    sheet="sadie",
    replace=True,
    freeze_headers=True,
    freeze_index=True,
)

## Clean up IMGT

## Clean up sadie

In [358]:
# Element-wise equality of `ref` and `target` over the labels in `check_these`.
ref[check_these] == target[check_these]

fwr1     True
cdr1     True
fwr2     True
cdr2    False
fwr3     True
cdr3     True
Name: 99, dtype: bool

In [362]:
# Show the `cdr2_aa` entry of `ref`.
ref["cdr2_aa"]

'ISGSRTYI'

In [363]:
# Show the `cdr2_aa` entry of `target` for comparison with the `ref` value.
target["cdr2_aa"]

'ISGSFRTYI'

In [361]:
# Show the full input `sequence` stored in `ref`.
ref["sequence"]

'TACCTTGACTTAACTCTTGGGGGTCTGGGGGAGGCCTGGTCAAGCCTGGGGGGTCCCTGAGACTCTCATGTGCAGCCTCTGGATTCACCTTCAGTAGTTTTACCATGAATTGGGTCCGCCAGGCTCCAGGGAAGGGACTGGAGTGGGTCTCATCCATTAGCGGCAGTTTCAGAACTTACATATATTATGCCGACTCAGTGAAGGGCCGATTCACCGTCTCCAGAGACAACGCCAAGGAATTGGTGTTTCTGCAGATGGACAACCTGAGAGTCGAAGACACAGGTGTATATTATTGTGCGAGAGACCTAAATACGGTGACTACCCCAGAATACTTCCAACACTGGGGCCTGGGCACCCCGGTCTCCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTG'

In [355]:
# Show the `cdr2` entry of `target`.
target["cdr2"]

'ATCAACCCTAGTGGTGATAACACA'

In [344]:
# Show the `fwr4` entry of `ref`.
ref["fwr4"]

'TGGGGCCAGGGAACC'

In [345]:

# Show the `fwr4` entry of `target` for comparison with the `ref` value.
target["fwr4"]

'TGGGGCCAGGGAACCCT'

In [342]:
# Element-wise equality of `ref` and `target` over the labels in `check_these`.
ref[check_these] == target[check_these]

fwr1     True
cdr1     True
fwr2     True
cdr2     True
fwr3     True
cdr3     True
fwr4    False
Name: 3, dtype: bool

In [328]:
# Length of the `fwr4` entry of `ref`.
len(ref["fwr4"])

33

In [320]:
# Show the `v_call_top` entry of `ref`.
ref["v_call_top"]

'IGHV4-31*02'

In [321]:
# Show the `v_call_top` entry of `target` for comparison with the `ref` value.
target["v_call_top"]

'IGHV4-31*02'

In [303]:
# Show the `j_call_top` entry of `ref`.
ref["j_call_top"]

'IGHJ4*02'

In [304]:
# Show the `j_call_top` entry of `target` for comparison with the `ref` value.
target["j_call_top"]

'IGHJ4*02'

In [330]:
# Length of the `fwr4` entry of `target`.
len(target["fwr4"])

33

In [331]:
# Length of the `fwr4` entry of `ref`.
# NOTE(review): the same expression appears in another cell of this notebook;
# one of the two is probably redundant.
len(ref["fwr4"])

33

In [None]:
# Two nucleotide strings pasted for side-by-side comparison (presumably fwr4
# candidates, given the surrounding cells). They are kept as string literals:
# a bare sequence in a code cell is parsed as an undefined name and raises
# NameError when the notebook is run.
fwr4_with_trailing_g = "TGGGGCCAGGGAACCCTGGTCACCGTCGCCTCAG"
fwr4_without_trailing_g = "TGGGGCCAGGGAACCCTGGTCACCGTCGCCTCA"

# True when the only difference between the two is the final "G".
fwr4_with_trailing_g[:-1] == fwr4_without_trailing_g

In [312]:
# Translate a 33-nucleotide sequence to its amino-acid sequence with Biopython.
# NOTE(review): this import lives in the middle of the notebook rather than in
# the import cell at the top.
from Bio.Seq import Seq

Seq("tggggccagggaaccctggtcaccgtcgcctca").translate()

Seq('WGQGTLVTVAS')

In [308]:
# Show the full input `sequence` stored in `ref`.
ref["sequence"]

'GGCCTCAGTGAAGGTCTCCTGCAAGGCATCTGGATACACCTTCACCAACCACTACATCCACTGGGTGCGACAGGCCCCTGGACAAGGGCTTGAGTGGATGGGAATAATCAACCCTAGTGGTGGTAGGACAAGTCACGTACAGGAGTTCCAGGGCAGAGTCACCATGACCAGGGACACGTCCACGAGTACTGTCTACATGGAGTTGAGTAGCCTGAGATCTGAGGACACGGCCGTGTACTTTTGTGCGAGACAACAGTGTAGTGCTTCCAACTGCTATGAGGAGAGTTTTGACCACTGGGGCCAGGGAACCCTGGTCACCGTCGCCTCAGCCTCCACACAGAGCCCATCCGTCTTC'

In [298]:
# Element-wise equality of `target` and `ref` over the labels in `check_these`.
target[check_these] == ref[check_these]

fwr1     True
cdr1     True
fwr2     True
cdr2     True
fwr3     True
cdr3     True
fwr4    False
Name: 1_705010661_igblastn_anarci_Heavy_IGHE_1125, dtype: bool

'ACTGTCTCTGGTGGCTCCATCAGCAGTGGTGGTTACTACTGGAGCTGGATCCGCCAGCACCCAGGGAAGGGCCTGGAGTGGATTGGGTACATCTATTACAGTGGGAGCACCTACTACAACCCGTCCCTCAAGAGTCGAGTTACCATATCAGTAGACACGTCTAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACTGCCGCGGACACGGCCGTGTATTACTGTGCGANNNNNNNNNAACTCCNNNNTGATGCTTTTGATATCTGGGGCCAAGGGACAATGGTCACCGTCTCTTCA'

In [193]:
# Upload `top_100_sadie_airr_only` to the "sadie_airr_only" worksheet after
# replacing nulls with "" via the notebook's `fillna` helper; the sheet is
# replaced and its header row and index column are frozen.
spread.df_to_sheet(
    fillna(top_100_sadie_airr_only),
    sheet="sadie_airr_only",
    freeze_headers=True,
    freeze_index=True,
    replace=True,
)

  df.dtypes.apply(lambda x: x in ["float64", "Int16"])


In [213]:
# Upload `top_100_imgt_airr_only` to the "imgt_airr_only" worksheet after
# replacing nulls with "" via the notebook's `fillna` helper; the sheet is
# replaced and its header row and index column are frozen.
spread.df_to_sheet(
    fillna(top_100_imgt_airr_only),
    sheet="imgt_airr_only",
    freeze_headers=True,
    freeze_index=True,
    replace=True,
)

  df.dtypes.apply(lambda x: x in ["float64", "Int16", "Int64"])


germline_alignment         ACTGTCTCTGGTGGCTCCATCAGCAGTGGTGGTTACTACTGGAGCT...
germline_alignment_aa      TVSGGSISSGGYYWSWIRQHPGKGLEWIGYIYYSGSTYYNPSLKSR...
v_alignment_end                                                       316.00
v_germline_start                                                        1.00
v_germline_end                                                        316.00
v_identity                                                             82.22
d_alignment_start                                                     325.00
d_alignment_end                                                       334.00
d_germline_alignment                                              CAACTGGAAC
d_germline_alignment_aa                                                  QLE
d_germline_start                                                        5.00
d_germline_end                                                         14.00
d_identity                                                              0.00

## Try to find OAS similarities

I'm suspicious of the entire AIRR file, since it doesn't have fwr4.

In [None]:
# Fetch the OAS subsample table, using the first CSV column as the index while
# parsing ...
df_bz2 = pd.read_csv(
    "https://sadie.s3.us-east-2.amazonaws.com/integration/OAS_sample_subsample.bz2",
    index_col=0,
)
# ... then move that index back into a regular column so the frame carries a
# fresh positional index.
df_bz2 = df_bz2.reset_index()

In [None]:
# Run `airr_api` over `df_bz2`, passing "index" and "sequence" as the column
# names to use (presumably the sequence-id and sequence columns; confirm
# against `run_dataframe`'s signature).
airr_dataframe = airr_api.run_dataframe(df_bz2, "index", "sequence")

In [None]:
# Restrict both tables to the columns they have in common so rows can be
# compared field by field. The shared labels are computed once and applied to
# both frames: taking the intersection separately from each side can yield the
# same labels in a different order, and comparing two Series whose labels are
# ordered differently raises
# "Can only compare identically-labeled Series objects".
shared_columns = df_bz2.columns.intersection(airr_dataframe.table.columns)

# `ref` is the annotation shipped with the OAS subsample; `target` is the
# annotation produced by `airr_api.run_dataframe`. `sequence_id` is dropped
# from both.
ref = df_bz2[shared_columns].drop("sequence_id", axis=1)
target = airr_dataframe.table[shared_columns].drop("sequence_id", axis=1)

In [None]:
# NOTE(review): `ref.re` looks like an unfinished expression (possibly an
# abandoned tab-completion). Nothing visible in the notebook indicates what was
# intended; complete it or delete this cell.
ref.re

In [None]:
# Compare the OAS annotation (`ref`) with Sadie's (`target`) row by row and
# print every field whose values differ, labelled by source.
for index in ref.index:
    sub_ref = ref.loc[index]
    sub_target = target.loc[index]
    # Fields on which both sources agree (not used further in this cell).
    aggree = sub_ref[sub_ref == sub_target].index
    # Fields on which they differ. NaN never compares equal to NaN, so a field
    # that is missing on both sides is reported here as a disagreement.
    disagree = sub_ref[sub_ref != sub_target].index
    if disagree.empty:
        print("yay")
    for dis_index in disagree:
        # The reference rows come from OAS, so label them "OAS" (was "OAR").
        print(f"OAS-{dis_index}:{sub_ref[dis_index]}")
        print(f"Sadie-{dis_index}:{sub_target[dis_index]}\n")

    # Only the first row is inspected: the loop always stops after one pass.
    break

In [None]:
# Show the labels of the fields left in `disagree` by the comparison loop.
disagree

In [None]:
from pandas.testing import assert_series_equal

# Get IMGT airr

In [None]:
# Location of the tab-separated AIRR table stored with the integration fixtures.
imgt_airr = "../../tests/integration/airr/fixtures/OAS_airr_from_imgtvquest.tsv"

# Parse it into a DataFrame, splitting fields on tabs.
imgt_df = pd.read_csv(imgt_airr, sep="\t")

In [None]:
# Upload the whole `imgt_df` table to the "imgt_airr" worksheet of `spread`.
spread.df_to_sheet(imgt_df, sheet="imgt_airr")