In [1]:
%load_ext lab_black

In [1]:
import json
import gzip
import pandas as pd
import glob
import tempfile
from importlib import reload

import uuid
import os
from pprint import pprint
from multiprocessing import Pool
from pathlib import Path
from sadie.airr import Airr, AirrTable
from gspread_pandas import Spread, Client
from sadie.airr import AirrTable
from sadie.airr import constants

spread = Spread("Sadie", create_spread=True, create_sheet=True)
pd.set_option("display.max_rows", 45)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 100000)


def fillna(df, fill_value=""):
    """
    Replace null values with `fill_value`.

    Also replaces in categorical columns.
    """
    for col in df.dtypes[df.dtypes == "category"].index:
        if fill_value not in df[col].cat.categories:
            df[col].cat.add_categories([fill_value], inplace=True)
    # Known bug https://github.com/pandas-dev/pandas/issues/25472
    if StrictVersion(pd.__version__) >= StrictVersion("1.0"):
        for col in df.dtypes[
            df.dtypes.apply(lambda x: x in ["float64", "Int16", "Int64"])
        ].index:
            df[col] = df[col].astype("float")
    return df.fillna(fill_value)

## Sadie Annotate

In [5]:
sub_sample_file = "../../tests/integration/airr/fixtures/OAS_subsample.fasta"
airr_api = Airr(species="human", database="imgt", functional="all")
sadie_airr = airr_api.run_file(sub_sample_file)

Caution - sequences ['4_704010461_igblastn_anarci_Heavy_IGHE_8', '1327_SRR12190303_igblastn_anarci_Heavy_Bulk_136', '1890_SRR12190307_igblastn_anarci_Heavy_Bulk_469', '5664_SRR11937611_igblastn_anarci_Heavy_Bulk_62', '6073_SRR11610505_1_igblastn_anarci_Bulk_256', '8948_Subject-49_igblastn_anarci_Heavy_IGHE_2', '9858_SRR12190266_igblastn_anarci_Heavy_Bulk_1844', '11672_704010461_igblastn_anarci_Heavy_IGHE_10', '11888_SRR12190262_igblastn_anarci_Heavy_Bulk_654', '19830_G_d9_L_igblastn_anarci_Light_Bulk_791'] may need manual inspections


In [6]:
sadie_airr[~sadie_airr["note"].isna()]

Unnamed: 0,sequence_id,sequence,species,locus,stop_codon,vj_in_frame,v_frameshift,productive,rev_comp,complete_vdj,v_call,d_call,j_call,sequence_alignment,germline_alignment,sequence_alignment_aa,germline_alignment_aa,v_alignment_start,v_alignment_end,d_alignment_start,d_alignment_end,j_alignment_start,j_alignment_end,v_sequence_alignment,v_sequence_alignment_aa,v_germline_alignment,v_germline_alignment_aa,d_sequence_alignment,d_sequence_alignment_aa,d_germline_alignment,d_germline_alignment_aa,j_sequence_alignment,j_sequence_alignment_aa,j_germline_alignment,j_germline_alignment_aa,fwr1,fwr1_aa,cdr1,cdr1_aa,fwr2,fwr2_aa,cdr2,cdr2_aa,fwr3,fwr3_aa,fwr4,fwr4_aa,cdr3,cdr3_aa,junction,junction_length,junction_aa,junction_aa_length,v_score,d_score,j_score,v_cigar,d_cigar,j_cigar,v_support,d_support,j_support,v_identity,d_identity,j_identity,v_sequence_start,v_sequence_end,v_germline_start,v_germline_end,d_sequence_start,d_sequence_end,d_germline_start,d_germline_end,j_sequence_start,j_sequence_end,j_germline_start,j_germline_end,fwr1_start,fwr1_end,cdr1_start,cdr1_end,fwr2_start,fwr2_end,cdr2_start,cdr2_end,fwr3_start,fwr3_end,fwr4_start,fwr4_end,cdr3_start,cdr3_end,np1,np1_length,np2,np2_length,note,vdj_nt,vdj_aa
4,4_704010461_igblastn_anarci_Heavy_IGHE_8,CGGGGAGTCTCTGAAGATCTCCTGTGCAGCCTCTGGATTCACCTTC...,human,IGH,False,True,False,True,False,False,IGHV3-30-3*01,"IGHD1-20*01,IGHD1-7*01,IGHD1/OR15-1a*01","IGHJ2*01,IGHJ6*03",CGGGGAGTCTCTGAAGATCTCCTGTGCAGCCTCTGGATTCACCTTC...,TGGGAGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTC...,GESLKISCAASGFTFSSYAMHWVRQAPGKGLEWVAVISYDGSNKYY...,GRSLRLSCAASGFTFSSYAMHWVRQAPGKGLEWVAVISYDGSNKYY...,1,254,341.0,348.0,376,382,CGGGGAGTCTCTGAAGATCTCCTGTGCAGCCTCTGGATTCACCTTC...,GESLKISCAASGFTFSSYAMHWVRQAPGKGLEWVAVISYDGSNKYY...,TGGGAGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTC...,GRSLRLSCAASGFTFSSYAMHWVRQAPGKGLEWVAVISYDGSNKYY...,ATAACTGG,ITG,ATAACTGG,ITG,CTGGTAC,WY,CTGGTAC,WY,CGGGGAGTCTCTGAAGATCTCCTGTGCAGCCTCT,GESLKISCAAS,GGATTCACCTTCAGTAGCTATGCT,GFTFSSYA,ATGCACTGGGTCCGCCAGGCTCCAGGCAAGGGGCTGGAGTGGGTGG...,MHWVRQAPGKGLEWVAV,ATATCATATGATGGAAGCAATAAA,ISYDGSNK,TACTACGCAGACTCCGTGAAGGGCCGATTCACCATCTCCAGAGACA...,YYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYC,,,GCGAGAGCACCCACCAAGGCTCCGGATGTGTTCCCCATCATATCAG...,ARAPTKAPDVFPIISGCRHPKDSSPVVLACLITGYHPTSVTVTWY,TGTGCGAGAGCACCCACCAAGGCTCCGGATGTGTTCCCCATCATAT...,141.0,CARAPTKAPDVFPIISGCRHPKDSSPVVLACLITGYHPTSVTVTWYM,47.0,456.25,16.0625,12.679688,41N254M155S1N,340S3N8M61S6N,375S3N7M27S43N,2.8039999999999997e-130,3.657,48.62,97.25,100.0,100.0,1,254,42,295,341.0,348.0,4.0,11.0,376,382,4,10,1,34,35,58,59,109,110,133,134,247,,,248.0,382.0,CACCCACCAAGGCTCCGGATGTGTTCCCCATCATATCAGGGTGCAG...,86,GTACCACCCAACGTCCGTGACTGTCAC,27.0,potential broken,CGGGGAGTCTCTGAAGATCTCCTGTGCAGCCTCTGGATTCACCTTC...,GESLKISCAASGFTFSSYAMHWVRQAPGKGLEWVAVISYDGSNKYY...
1327,1327_SRR12190303_igblastn_anarci_Heavy_Bulk_136,GTTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCATCTTCAGTAG...,human,IGH,False,True,False,True,False,False,IGHV3-48*03,"IGHD2-21*01,IGHD2-21*02,IGHD4-17*01","IGHJ6*01,IGHJ6*02,IGHJ6*03",GTTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCATCTTCAGTAG...,GGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTCAGTAG...,SLRLSCAASGFIFSSYEMNWVRQAPGKGLEWVSYISNSGDTKYYAD...,SLRLSCAASGFTFSSYEMNWVRQAPGKGLEWVSYISSSGSTIYYAD...,1,249,252.0,256.0,259,282,GTTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCATCTTCAGTAG...,SLRLSCAASGFIFSSYEMNWVRQAPGKGLEWVSYISNSGDTKYYAD...,GGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTCAGTAG...,SLRLSCAASGFTFSSYEMNWVRQAPGKGLEWVSYISSSGSTIYYAD...,GGTGA,G,GGTGA,G,ACTACTACTTCCCCGAACCGGTGA,YYFPEPV,ACTACTACTACTACGGTATGGACG,YYYYGMD,GTTCCCTGAGACTCTCCTGTGCAGCCTCT,SLRLSCAAS,GGATTCATCTTCAGTAGTTATGAA,GFIFSSYE,ATGAACTGGGTCCGCCAGGCTCCAGGGAAGGGGCTAGAGTGGGTTT...,MNWVRQAPGKGLEWVSY,ATTAGTAATAGTGGTGATACCAAG,ISNSGDTK,TACTACGCAGACTCTGTGAAGGGCCGATTCACCATCTCCAGAGACA...,YYADSVKGRFTISRDNAKNTLYLQMSSLRAEDTAVYYC,,,GCGAGAGCGGGTGAAGACTACTACTTCCCCGAACCGGTGA,ARAGEDYYFPEPV,TGTGCGAGAGCGGGTGAAGACTACTACTTCCCCGAACCGGTGACGG,46.0,CARAGEDYYFPEPVT,15.0,418.0,10.304688,15.851562,46N249M55S1N,251S13N5M48S10N,258S3N24M22S36N,7.744e-119,146.8,4.117,95.5625,100.0,58.34375,1,249,47,295,252.0,256.0,14.0,18.0,259,282,4,27,1,29,30,53,54,104,105,128,129,242,,,243.0,282.0,CG,2,AG,2.0,potential broken,GTTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCATCTTCAGTAG...,SLRLSCAASGFIFSSYEMNWVRQAPGKGLEWVSYISNSGDTKYYAD...
1890,1890_SRR12190307_igblastn_anarci_Heavy_Bulk_469,TCGGAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGACTCCGTCA...,human,IGH,False,True,False,True,False,False,"IGHV4-61*01,IGHV4-61*08","IGHD3-3*01,IGHD3-3*02",IGHJ1*01,TCGGAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGACTCCGTCA...,TCGGAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCGTCA...,SETLSLTCTVSGDSVRTGRHHWNWTRQPPGKGLEWIGYIFHSSSSN...,SETLSLTCTVSGGSVSSGSYYWSWIRQPPGKGLEWIGYIYYSGSTN...,1,255,284.0,292.0,418,427,TCGGAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGACTCCGTCA...,SETLSLTCTVSGDSVRTGRHHWNWTRQPPGKGLEWIGYIFHSSSSN...,TCGGAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCGTCA...,SETLSLTCTVSGGSVSSGSYYWSWIRQPPGKGLEWIGYIYYSGSTN...,TTTTGGAGT,LE,TTTTGGAGT,LE,GACTACTTCC,DYF,GAATACTTCC,EYF,TCGGAGACCCTGTCCCTCACCTGCACTGTCTCT,SETLSLTCTVS,GGTGACTCCGTCAGAACTGGTCGTCACCAT,GDSVRTGRHH,TGGAACTGGACCCGGCAGCCCCCAGGGAAGGGACTTGAATGGATTG...,WNWTRQPPGKGLEWIGY,ATCTTTCACAGCTCGAGTTCC,IFHSSSS,AACTATAACCCTTCCCTCGAGAGTCGAGTCGACATGTCAATAGACA...,NYNPSLESRVDMSIDTSNSQFSLTLTSVTAADTAVYYC,,,GCGAGATCTTCTGTTGCAGTGTCGACAATCCCCCTTTTGGAGTCTT...,ARSSVAVSTIPLLESWGTEEIINEKNNEPQKKKGYKRKHKKKKKKK...,TGTGCGAGATCTTCTGTTGCAGTGTCGACAATCCCCCTTTTGGAGT...,184.0,CARSSVAVSTIPLLESWGTEEIINEKNNEPQKKKGYKRKHKKKKKK...,61.0,289.0,17.984375,14.265625,42N255M206S2N,283S10N9M169S12N,417S3N10M34S39N,7.179e-80,1.09,18.37,86.25,100.0,90.0,1,255,43,297,284.0,292.0,11.0,19.0,418,427,4,13,1,33,34,63,64,114,115,135,136,249,,,250.0,427.0,TCTTCTGTTGCAGTGTCGACAATCCCCC,28,CTTGGGGTACGGAAGAAATAATCAACGAAAAAAACAACGAACCCCA...,125.0,potential broken,TCGGAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGACTCCGTCA...,SETLSLTCTVSGDSVRTGRHHWNWTRQPPGKGLEWIGYIFHSSSSN...
5664,5664_SRR11937611_igblastn_anarci_Heavy_Bulk_62,TGCATAAGTTGGTATCTTGGGGGATACTTTCTGAGACTCATGGACC...,human,IGH,False,True,False,True,False,True,IGHV4-4*07,"IGHD2/OR15-2a*01,IGHD2/OR15-2b*01,IGHD3-3*01",IGHJ4*03,CAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGG...,CAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGG...,QVQLQESGPGLVKPSETLSLTCTVSGGSISSYYWSWIRQPAGKGLE...,QVQLQESGPGLVKPSETLSLTCTVSGGSISSYYWSWIRQPAGKGLE...,1,288,312.0,316.0,347,370,CAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGG...,QVQLQESGPGLVKPSETLSLTCTVSGGSISSYYWSWIRQPAGKGLE...,CAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGG...,QVQLQESGPGLVKPSETLSLTCTVSGGSISSYYWSWIRQPAGKGLE...,TTTCT,F,TTTCT,F,GGCCCTCGCTCACCGTCTCCTCAG,PSLTVSS,GGACCCTGGTCACCGTCTCCTCAG,TLVTVSS,CAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGG...,QVQLQESGPGLVKPSETLSLTCTVS,GGTGGCTCCATCAGTAGTTACTAC,GGSISSYY,TGGAGCTGGATCCGGCAGCCCGCCGGGAAGGGACTGGAGGGGATTG...,WSWIRQPAGKGLEGIGR,ATCTATACCAGTGGGTGCACC,IYTSGCT,AACGACAACCCCTCCCTCAAGAGTCGCTTCCCCCTTTCCCTAGACA...,NDNPSLKSRFPLSLDTSKNQFSLKLSSVTAADTAVYYC,,,,,,,,,496.75,10.304688,26.9375,117S288M118S5N,428S21N5M90S5N,463S24N24M36S,2.5329999999999997e-142,256.2,0.003192,96.5,100.0,83.3125,118,405,1,288,429.0,433.0,22.0,26.0,464,487,25,48,118,192,193,216,217,267,268,288,289,402,,,,,TTCCCCCTTCTCCCTCTCTTCTT,23,TCTCCTCCTCTCTTCCCTTCTTCTCCCCCT,30.0,potential broken,CAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGG...,QVQLQESGPGLVKPSETLSLTCTVSGGSISSYYWSWIRQPAGKGLE...
6073,6073_SRR11610505_1_igblastn_anarci_Bulk_256,GGCCTCAGTGAAGGTCTCCTGCAAGGCTTCTGGTTACACCTTTACC...,human,IGH,False,True,False,True,False,False,IGHV1-18*04,IGHD6-19*01,IGHJ2*01,GGCCTCAGTGAAGGTCTCCTGCAAGGCTTCTGGTTACACCTTTACC...,GGCCTCAGTGAAGGTCTCCTGCAAGGCTTCTGGTTACACCTTTACC...,ASVKVSCKASGYTFTSYGISWVRQAPGQGLEWMGWISAYNGNTNYA...,ASVKVSCKASGYTFTSYGISWVRQAPGQGLEWMGWISAYNGNTNYA...,1,249,254.0,267.0,268,277,GGCCTCAGTGAAGGTCTCCTGCAAGGCTTCTGGTTACACCTTTACC...,ASVKVSCKASGYTFTSYGISWVRQAPGQGLEWMGWISAYNGNTNYA...,GGCCTCAGTGAAGGTCTCCTGCAAGGCTTCTGGTTACACCTTTACC...,ASVKVSCKASGYTFTSYGISWVRQAPGQGLEWMGWISAYNGNTNYA...,GGGTATAGCAGTGG,GYSSG,GGGTATAGCAGTGG,GYSSG,CTGGTACCAA,WYQ,CTGGTACTTC,WYF,GGCCTCAGTGAAGGTCTCCTGCAAGGCTTCT,ASVKVSCKAS,GGTTACACCTTTACCAGCTACGGT,GYTFTSYG,ATCAGCTGGGTGCGACAGGCCCCTGGACAAGGGCTTGAGTGGATGG...,ISWVRQAPGQGLEWMGW,ATCAGCGCTTACAATGGTAACACA,ISAYNGNT,AACTATGCACAGAAGCTCCAGGGCAGAGTCACCATGACCACAGACA...,NYAQKLQGRVTMTTDTSTSTAYMELRSLRSDDTAVYYC,,,GCGAGGGGTGGGTATAGCAGTGGCTGGTACCAA,ARGGYSSGWYQ,TGTGCGAGGGGTGGGTATAGCAGTGGCTGGTACCAACCC,39.0,CARGGYSSGWYQP,13.0,479.5,27.609375,12.679688,44N249M61S3N,253S14M43S7N,267S3N10M33S40N,2.381e-137,0.000926,37.83,100.0,100.0,70.0,1,249,45,293,254.0,267.0,1.0,14.0,268,277,4,13,1,31,32,55,56,106,107,130,131,244,,,245.0,277.0,GGGT,4,,0.0,potential broken,GGCCTCAGTGAAGGTCTCCTGCAAGGCTTCTGGTTACACCTTTACC...,ASVKVSCKASGYTFTSYGISWVRQAPGQGLEWMGWISAYNGNTNYA...
8948,8948_Subject-49_igblastn_anarci_Heavy_IGHE_2,CTGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTT...,human,IGH,False,True,False,True,False,False,"IGHV3-21*01,IGHV3-21*02","IGHD1-20*01,IGHD1-7*01,IGHD1/OR15-1a*01","IGHJ2*01,IGHJ6*03",CTGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTT...,CTGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTT...,GGSLRLSCAASGFTFSSYSMNWVRQAPGKGLEWVSSISSSSSYIYY...,GGSLRLSCAASGFTFSSYSMNWVRQAPGKGLEWVSSISSSSSYIYY...,1,255,342.0,349.0,377,383,CTGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTT...,GGSLRLSCAASGFTFSSYSMNWVRQAPGKGLEWVSSISSSSSYIYY...,CTGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTT...,GGSLRLSCAASGFTFSSYSMNWVRQAPGKGLEWVSSISSSSSYIYY...,ATAACTGG,ITG,ATAACTGG,ITG,CTGGTAC,WY,CTGGTAC,WY,CTGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCT,GGSLRLSCAAS,GGATTCACCTTCAGTAGCTATAGC,GFTFSSYS,ATGAACTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTCT...,MNWVRQAPGKGLEWVSS,ATTAGTAGTAGTAGTAGTTACATA,ISSSSSYI,TACTACGCAGACTCAGTGAAGGGCCGATTCACCATCTCCAGAGACA...,YYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYC,,,GCGAGAGCACCCACCAAGGCTCCGGATGTGTTCCCCATCATATCAG...,ARAPTKAPDVFPIISGCRHPKDNSPVVLACLITGYHPTSVTVTWY,TGTGCGAGAGCACCCACCAAGGCTCCGGATGTGTTCCCCATCATAT...,141.0,CARAPTKAPDVFPIISGCRHPKDNSPVVLACLITGYHPTSVTVTWYM,47.0,491.0,16.0625,12.679688,40N255M155S1N,341S3N8M61S6N,376S3N7M27S43N,1.073e-140,3.666,48.75,100.0,100.0,100.0,1,255,41,295,342.0,349.0,4.0,11.0,377,383,4,10,1,35,36,59,60,110,111,134,135,248,,,249.0,383.0,CACCCACCAAGGCTCCGGATGTGTTCCCCATCATATCAGGGTGCAG...,86,GTACCACCCAACGTCCGTGACTGTCAC,27.0,potential broken,CTGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTT...,GGSLRLSCAASGFTFSSYSMNWVRQAPGKGLEWVSSISSSSSYIYY...
9858,9858_SRR12190266_igblastn_anarci_Heavy_Bulk_1844,TCGGAGACCCGGTCCCTCACCTGCACTGTGTCTGGCGGGTCCACCG...,human,IGH,False,True,False,True,False,False,IGHV4-59*08,"IGHD3-9*01,IGHD6-19*01",IGHJ1*01,TCGGAGACCCGGTCCCTCACCTGCACTGTGTCTGGCGGGTCCACCG...,TCGGAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATCA...,SETRSLTCTVSGGSTASHYWNWIRQSPGKRPEWIGYVYYNGDTKYN...,SETLSLTCTVSGGSISSYYWSWIRQPPGKGLEWIGYIYYSGSTNYN...,1,249,258.0,263.0,327,358,TCGGAGACCCGGTCCCTCACCTGCACTGTGTCTGGCGGGTCCACCG...,SETRSLTCTVSGGSTASHYWNWIRQSPGKRPEWIGYVYYNGDTKYN...,TCGGAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATCA...,SETLSLTCTVSGGSISSYYWSWIRQPPGKGLEWIGYIYYSGSTNYN...,TTTTGA,F,TTTTGA,F,GGCCCTGGGCTGCCTGGTCAAGGACTACTTCC,ALGCLVKDYF,GGGCCAGGGCACCCTGGTCACCGTCTCCTCAG,GQGTLVTVSS,TCGGAGACCCGGTCCCTCACCTGCACTGTGTCT,SETRSLTCTVS,GGCGGGTCCACCGCTAGTCACTAC,GGSTASHY,TGGAACTGGATCCGCCAGTCCCCAGGGAAGAGACCAGAATGGATTG...,WNWIRQSPGKRPEWIGY,GTCTATTACAATGGAGACACC,VYYNGDT,AAGTATAATCCCTCCCTCCAGAGTCGAGTCACCATTTCAATAGACA...,KYNPSLQSRVTISIDTSENQFSLRLNSVTAADTAVYFC,,,,,,,,,266.0,12.226562,22.1875,42N249M143S2N,257S11N6M129S14N,326S20N32M34S,5.352e-73,50.32,0.06646,85.125,100.0,65.625,1,249,43,291,258.0,263.0,12.0,17.0,327,358,21,52,1,33,34,57,58,108,109,129,130,243,,,,,GGCTGGGC,8,TCCCAAGGGCCCATCGGTCTTCCCCCTGGCGCCCTGCTCCAGGAGC...,63.0,potential broken,TCGGAGACCCGGTCCCTCACCTGCACTGTGTCTGGCGGGTCCACCG...,SETRSLTCTVSGGSTASHYWNWIRQSPGKRPEWIGYVYYNGDTKYN...
11672,11672_704010461_igblastn_anarci_Heavy_IGHE_10,CTGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTT...,human,IGH,False,True,False,True,False,False,IGHV3-30-3*01,"IGHD1-20*01,IGHD1-7*01,IGHD1/OR15-1a*01","IGHJ2*01,IGHJ6*03",CTGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTT...,CTGGGAGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTT...,GGSLRLSCAASGFTFSSYAMHWVRQAPGKGLEWVAVISYDGSNKYY...,GRSLRLSCAASGFTFSSYAMHWVRQAPGKGLEWVAVISYDGSNKYY...,1,255,342.0,349.0,377,383,CTGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTT...,GGSLRLSCAASGFTFSSYAMHWVRQAPGKGLEWVAVISYDGSNKYY...,CTGGGAGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTT...,GRSLRLSCAASGFTFSSYAMHWVRQAPGKGLEWVAVISYDGSNKYY...,ATAACTGG,ITG,ATAACTGG,ITG,CTGGTAC,WY,CTGGTAC,WY,CTGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCT,GGSLRLSCAAS,GGATTCACCTTCAGTAGCTATGCT,GFTFSSYA,ATGCACTGGGTCCGCCAGGCTCCAGGCAAGGGGCTGGAGTGGGTGG...,MHWVRQAPGKGLEWVAV,ATATCATATGATGGAAGCAATAAA,ISYDGSNK,TACTACGCAGACTCCGTGAAGGGCCGATTCACCATCTCCAGAGACA...,YYADSVKGRFTISRDNSKNTLYLQTNSLRAEDTAVYYW,,,GTGAGAGCACCCACCAAGGCTCCGGATGTGTTCCCCATCATATCAG...,VRAPTKAPDVFPIISGCRHPKDNSPVVLACLITGYHSTSVTVTWY,TGGGTGAGAGCACCCACCAAGGCTCCGGATGTGTTCCCCATCATAT...,141.0,WVRAPTKAPDVFPIISGCRHPKDNSPVVLACLITGYHSTSVTVTWYM,47.0,468.0,16.0625,12.679688,40N255M155S1N,341S3N8M61S6N,376S3N7M27S43N,9.466e-134,3.666,48.75,98.4375,100.0,100.0,1,255,41,295,342.0,349.0,4.0,11.0,377,383,4,10,1,35,36,59,60,110,111,134,135,248,,,249.0,383.0,CACCCACCAAGGCTCCGGATGTGTTCCCCATCATATCAGGGTGCAG...,86,GTACCACTCAACGTCCGTGACTGTCAC,27.0,potential broken,CTGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTT...,GGSLRLSCAASGFTFSSYAMHWVRQAPGKGLEWVAVISYDGSNKYY...
11888,11888_SRR12190262_igblastn_anarci_Heavy_Bulk_654,GTTCCCTTAGACTCTCCTGTGCACTGTCTGGGCTCGCCGTCAGTGG...,human,IGH,False,True,False,True,False,False,IGHV3-53*04,IGHD4-17*01,"IGHJ6*01,IGHJ6*02,IGHJ6*03",GTTCCCTTAGACTCTCCTGTGCACTGTCTGGGCTCGCCGTCAGTGG...,GGTCCCTGAGACTCTCCTGTGCAGCCTCTGGGTTCACCGTCAGTAG...,SLRLSCALSGLAVSGNYMTWVRQAPGKGLERVSVIHRGGTTYYADS...,SLRLSCAASGFTVSSNYMSWVRQAPGKGLEWVSVIYSGGSTYYADS...,1,247,260.0,269.0,278,303,GTTCCCTTAGACTCTCCTGTGCACTGTCTGGGCTCGCCGTCAGTGG...,SLRLSCALSGLAVSGNYMTWVRQAPGKGLERVSVIHRGGTTYYADS...,GGTCCCTGAGACTCTCCTGTGCAGCCTCTGGGTTCACCGTCAGTAG...,SLRLSCAASGFTVSSNYMSWVRQAPGKGLEWVSVIYSGGSTYYADS...,CTACGGTGAC,YGD,CTACGGTGAC,YGD,CTACTACTACTTCCCCGAACCGGTGA,YYYFPEPV,CTACTACTACTACGGTATGGACGTCT,YYYYGMDV,GTTCCCTTAGACTCTCCTGTGCACTGTCT,SLRLSCALS,GGGCTCGCCGTCAGTGGCAACTAC,GLAVSGNY,ATGACCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGCGGGTCT...,MTWVRQAPGKGLERVSV,ATTCACAGGGGTGGAACCACA,IHRGGTT,TATTATGCAGACTCCGTAAAGGGCCGATTCACCATCTCCAGACACA...,YYADSVKGRFTISRHTSRNTLYLQMNSLRSEDTAVYYC,,,GCGAGAGATAAGGGCTTCGGCTACGGTGACCCACAGGGCTACTACT...,ARDKGFGYGDPQGYYYFPEPV,TGTGCGAGAGATAAGGGCTTCGGCTACGGTGACCCACAGGGCTACT...,69.0,CARDKGFGYGDPQGYYYFPEPVT,23.0,316.0,19.90625,19.015625,46N247M78S,259S3N10M56S3N,277S4N26M22S33N,3.936e-88,0.2008,0.491,88.6875,100.0,53.84375,1,247,47,293,260.0,269.0,4.0,13.0,278,303,5,30,1,29,30,53,54,104,105,125,126,239,303.0,303.0,240.0,302.0,TAAGGGCTTCGG,12,CCACAGGG,8.0,potential broken,GTTCCCTTAGACTCTCCTGTGCACTGTCTGGGCTCGCCGTCAGTGG...,SLRLSCALSGLAVSGNYMTWVRQAPGKGLERVSVIHRGGTTYYADS...
19830,19830_G_d9_L_igblastn_anarci_Light_Bulk_791,ACTCAGCCTCCCTCCGCGTCCGGGTCTCCTGGACAGTCAGTCACCG...,human,IGL,False,True,False,True,False,False,IGLV2-8*01,,IGLJ7*01,ACTCAGCCTCCCTCCGCGTCCGGGTCTCCTGGACAGTCAGTCACCG...,ACTCAGCCTCCCTCCGCGTCCGGGTCTCCTGGACAGTCAGTCACCA...,TQPPSASGSPGQSVTVSCTGTSSDVGASNHVSWYQQHPGRAPKLII...,TQPPSASGSPGQSVTISCTGTSSDVGGYNYVSWYQQHPGKAPKLMI...,1,265,,,283,307,ACTCAGCCTCCCTCCGCGTCCGGGTCTCCTGGACAGTCAGTCACCG...,TQPPSASGSPGQSVTVSCTGTSSDVGASNHVSWYQQHPGRAPKLII...,ACTCAGCCTCCCTCCGCGTCCGGGTCTCCTGGACAGTCAGTCACCA...,TQPPSASGSPGQSVTISCTGTSSDVGGYNYVSWYQQHPGKAPKLMI...,,,,,GGAGGCACCAAGTTTTATGTCCTCG,GGTKFYVL,GGAGGCACCCAGCTGACCGTCCTCG,GGTQLTVL,ACTCAGCCTCCCTCCGCGTCCGGGTCTCCTGGACAGTCAGTCACCG...,TQPPSASGSPGQSVTVSCTGT,AGCAGTGACGTTGGTGCCTCTAACCAT,SSDVGASNH,GTCTCCTGGTACCAACAACACCCAGGCAGAGCCCCCAAACTCATAA...,VSWYQQHPGRAPKLIIY,GAAGTCAAC,EVN,AAGCGGCCCTCAGGGGTCCCTGATCGCTTCTCTGGCTCCAAGTCTG...,KRPSGVPDRFSGSKSGNTASLTVSGLQADDEADYYC,,,,,,,,,435.25,,22.1875,12N265M47S20N,,282S13N25M5S,4.917e-124,,0.05225,95.125,,76.0,1,265,13,277,,,,,283,307,14,38,1,63,64,90,91,141,142,150,151,258,,,,,CTGGAGATGAGTCATAT,17,,,potential broken,ACTCAGCCTCCCTCCGCGTCCGGGTCTCCTGGACAGTCAGTCACCG...,TQPPSASGSPGQSVTVSCTGTSSDVGASNHVSWYQQHPGRAPKLII...


In [2]:
airr_api = Airr(species="human", database="imgt", functional="all")
airr_api.igblast.j_penalty = -2

a = airr_api.run_single(
    "test_sequence",
    "GACATCCAGATGACCCAGTCTCCATCCTCCCTGTCTGCATCTGTAGGAGACAGAGTCACCATCACTTGCCAGGCGAGTCAGGACATTAGCAACTATTTAAATTGGTATCAGCAGAAACCAGGGAAAGCCCCTAAGCTCCTGATCTACGATGCATCCAATTTGGAAACAGGGGTCCCATCAAGGTTCAGTGGAAGTGGATCTGGGACAGATTTTACTTTCACCATCAGCAGCCTGCAGCCTGAAGATATTGCAACATATTACTGTCAACAGTATGATAATTTCGGCGGAGGGACCAAGGTGGACATCAAAC",
)

In [3]:
a["j_germline_alignment"]

0    TTCGGCGGAGGGACCAAGGTGGAGATCAAAC
Name: j_germline_alignment, dtype: object

In [17]:
#comparison keys between imgt and sadie
compare_key = list(constants.IGBLAST_AIRR.keys())
compare_key.remove("v_frameshift")

#straight ignore these
ignore = [
    "v_call",
    "d_call",
    "j_call",
    "v_score",
    "d_score",
    "j_score",
    "v_support",
    "d_support",
    "j_support"
]

#cast these to integers
starts_and_ends = [
    "cdr1_end",
    "cdr1_start",
    "cdr2_end",
    "cdr2_start",
    "cdr3_end",
    "cdr3_start",
    "d_alignment_end",
    "d_alignment_start",
    "d_germline_end",
    "d_germline_start",
    "d_sequence_end",
    "d_sequence_start",
    "fwr1_end",
    "fwr1_start",
    "fwr2_end",
    "fwr2_start",
    "fwr4_end",
    "fwr4_start",
    "j_alignment_end",
    "j_alignment_start",
    "j_germline_end",
    "j_germline_start",
    "j_sequence_end",
    "j_sequence_start",
]


In [33]:
a = sadie_airr.table.columns

In [29]:
a.get_loc('v_call')

9

In [31]:
sadie_airr.table.insert(sadie_airr.table.columns.get_loc('v_call'),'v_call_top',sadie_airr.table['v_call'].str.split(',').str.get(0))

In [32]:
sadie_airr

Unnamed: 0,sequence_id,sequence,locus,stop_codon,vj_in_frame,v_frameshift,productive,rev_comp,complete_vdj,v_call_top,v_call,d_call,j_call,sequence_alignment,germline_alignment,sequence_alignment_aa,germline_alignment_aa,v_alignment_start,v_alignment_end,d_alignment_start,d_alignment_end,j_alignment_start,j_alignment_end,v_sequence_alignment,v_sequence_alignment_aa,v_germline_alignment,v_germline_alignment_aa,d_sequence_alignment,d_sequence_alignment_aa,d_germline_alignment,d_germline_alignment_aa,j_sequence_alignment,j_sequence_alignment_aa,j_germline_alignment,j_germline_alignment_aa,fwr1,fwr1_aa,cdr1,cdr1_aa,fwr2,fwr2_aa,cdr2,cdr2_aa,fwr3,fwr3_aa,fwr4,fwr4_aa,cdr3,cdr3_aa,junction,junction_length,junction_aa,junction_aa_length,v_score,d_score,j_score,v_cigar,d_cigar,j_cigar,v_support,d_support,j_support,v_identity,d_identity,j_identity,v_sequence_start,v_sequence_end,v_germline_start,v_germline_end,d_sequence_start,d_sequence_end,d_germline_start,d_germline_end,j_sequence_start,j_sequence_end,j_germline_start,j_germline_end,fwr1_start,fwr1_end,cdr1_start,cdr1_end,fwr2_start,fwr2_end,cdr2_start,cdr2_end,fwr3_start,fwr3_end,fwr4_start,fwr4_end,cdr3_start,cdr3_end,np1,np1_length,np2,np2_length,species,note,vdj_nt,vdj_aa
0,0_G_d22_H_igblastn_anarci_Heavy_IGHA_219,ACTGTGTCTGGTGGTCTCATCACTAGTGATGGTCATTACTGGAGTT...,IGH,False,True,False,True,False,False,IGHV4-31*02,"IGHV4-31*02,IGHV4-31*03",IGHD4-23*01,IGHJ3*02,ACTGTGTCTGGTGGTCTCATCACTAGTGATGGTCATTACTGGAGTT...,ACTGTCTCTGGTGGCTCCATCAGCAGTGGTGGTTACTACTGGAGCT...,TVSGGLITSDGHYWSWIRQSPGKGLEWLGSTYYNGATYYSESLESR...,TVSGGSISSGGYYWSWIRQHPGKGLEWIGYIYYSGSTYYNPSLKSR...,1,229,239,244,249,297,ACTGTGTCTGGTGGTCTCATCACTAGTGATGGTCATTACTGGAGTT...,TVSGGLITSDGHYWSWIRQSPGKGLEWLGSTYYNGATYYSESLESR...,ACTGTCTCTGGTGGCTCCATCAGCAGTGGTGGTTACTACTGGAGCT...,TVSGGSISSGGYYWSWIRQHPGKGLEWIGYIYYSGSTYYNPSLKSR...,AACTCC,L,AACTCC,L,TGATGCTTTTGATATGTGGGGCCAAGGGACATTGGTCACCGTCTCCTCA,DAFDMWGQGTLVTVSS,TGATGCTTTTGATATCTGGGGCCAAGGGACAATGGTCACCGTCTCTTCA,DAFDIWGQGTMVTVSS,ACTGTGTCT,TVS,GGTGGTCTCATCACTAGTGATGGTCATTAC,GGLITSDGHY,TGGAGTTGGATTCGCCAGTCTCCAGGGAAGGGCCTGGAGTGGTTGG...,WSWIRQSPGKGLEWLGS,ACTTATTACAATGGGGCCACC,TYYNGAT,TACTACAGCGAGTCCCTCGAGAGTCGACTCCTCATTTCAATAGACC...,YYSESLESRLLISIDPSQTQFSLKLTSVTAADTAVYYC,TGGGGCCAAGGGACATTGGTCACCGTCTCCTCA,WGQGTLVTVSS,GCGACCTGGACCCAACTCCAACTTGATGCTTTTGATATG,ATWTQLQLDAFDM,TGTGCGACCTGGACCCAACTCCAACTTGATGCTTTTGATATGTGG,45,CATWTQLQLDAFDMW,15,234.00,12.226562,69.750000,66N229M68S4N,238S13N6M53S,248S49M1N,1.589000e-63,37.810,2.408000e-16,82.5625,100.0000,93.8750,1,229,67,295,239,244,14.0,19.0,249,297,1,49,1,9,10,39,40,90,91,111,112,225,265,297,226,264,CCTGGACCC,9,AACT,4,human,,ACTGTGTCTGGTGGTCTCATCACTAGTGATGGTCATTACTGGAGTT...,TVSGGLITSDGHYWSWIRQSPGKGLEWLGSTYYNGATYYSESLESR...
1,1_705010661_igblastn_anarci_Heavy_IGHE_1125,GGCCTCAGTGAAGGTCTCCTGCAAGGCATCTGGATACACCTTCACC...,IGH,False,True,False,True,False,False,IGHV1-46*01,IGHV1-46*01,"IGHD2-2*01,IGHD2-2*03",IGHJ4*02,GGCCTCAGTGAAGGTCTCCTGCAAGGCATCTGGATACACCTTCACC...,GGCCTCAGTGAAGGTTTCCTGCAAGGCATCTGGATACACCTTCACC...,ASVKVSCKASGYTFTNHYIHWVRQAPGQGLEWMGIINPSGGRTSHV...,ASVKVSCKASGYTFTSYYMHWVRQAPGQGLEWMGIINPSGGSTSYA...,1,250,257,278,287,329,GGCCTCAGTGAAGGTCTCCTGCAAGGCATCTGGATACACCTTCACC...,ASVKVSCKASGYTFTNHYIHWVRQAPGQGLEWMGIINPSGGRTSHV...,GGCCTCAGTGAAGGTTTCCTGCAAGGCATCTGGATACACCTTCACC...,ASVKVSCKASGYTFTSYYMHWVRQAPGQGLEWMGIINPSGGSTSYA...,TGTAGTGCTTCCAACTGCTATG,CSASNCY,TGTAGTAGTACCAGCTGCTATG,CSSTSCY,TTTGACCACTGGGGCCAGGGAACCCTGGTCACCGTCGCCTCAG,FDHWGQGTLVTVAS,TTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,FDYWGQGTLVTVSS,GGCCTCAGTGAAGGTCTCCTGCAAGGCATCT,ASVKVSCKAS,GGATACACCTTCACCAACCACTAC,GYTFTNHY,ATCCACTGGGTGCGACAGGCCCCTGGACAAGGGCTTGAGTGGATGG...,IHWVRQAPGQGLEWMGI,ATCAACCCTAGTGGTGGTAGGACA,INPSGGRT,AGTCACGTACAGGAGTTCCAGGGCAGAGTCACCATGACCAGGGACA...,SHVQEFQGRVTMTRDTSTSTVYMELSSLRSEDTAVYFC,TGGGGCCAGGGAACCCTGGTCACCGTCGCCTCA,WGQGTLVTVAS,GCGAGACAACAGTGTAGTGCTTCCAACTGCTATGAGGAGAGTTTTG...,ARQQCSASNCYEESFDH,TGTGCGAGACAACAGTGTAGTGCTTCCAACTGCTATGAGGAGAGTT...,57,CARQQCSASNCYEESFDHW,19,338.50,19.906250,63.406250,44N250M105S2N,256S7N22M77S2N,286S5N43M26S,7.250000e-95,0.220,2.356000e-14,93.1875,81.8125,95.3750,1,250,45,294,257,278,8.0,29.0,287,329,6,48,1,31,32,55,56,106,107,130,131,244,296,328,245,295,CAACAG,6,AGGAGAGT,8,human,,GGCCTCAGTGAAGGTCTCCTGCAAGGCATCTGGATACACCTTCACC...,ASVKVSCKASGYTFTNHYIHWVRQAPGQGLEWMGIINPSGGRTSHV...
2,2_Subject-53_igblastn_anarci_Heavy_IGHD_22592,CGGGGAGTCTCTGAAGATCACCTGTGCAGCCTCTGGATTCACCTTC...,IGH,False,True,False,True,False,False,IGHV3-74*01,"IGHV3-74*01,IGHV3-74*02","IGHD2/OR15-2a*01,IGHD2/OR15-2b*01",IGHJ4*02,CGGGGAGTCTCTGAAGATCACCTGTGCAGCCTCTGGATTCACCTTC...,TGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTC...,GESLKITCAASGFTFSDYWMHWVRQVPGKGLVWVSRINSDGSSTSY...,GGSLRLSCAASGFTFSSYWMHWVRQAPGKGLVWVSRINSDGSSTSY...,1,252,258,263,272,311,CGGGGAGTCTCTGAAGATCACCTGTGCAGCCTCTGGATTCACCTTC...,GESLKITCAASGFTFSDYWMHWVRQVPGKGLVWVSRINSDGSSTSY...,TGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTC...,GGSLRLSCAASGFTFSSYWMHWVRQAPGKGLVWVSRINSDGSSTSY...,AGAATA,N,AGAATA,N,GACTTCTGGGGCCAGGGAACCCTGGTCACCGTCTCGTCAG,DFWGQGTLVTVSS,GACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,DYWGQGTLVTVSS,CGGGGAGTCTCTGAAGATCACCTGTGCAGCCTCT,GESLKITCAAS,GGATTCACCTTCAGTGACTACTGG,GFTFSDYW,ATGCACTGGGTCCGCCAAGTTCCAGGGAAGGGGCTGGTGTGGGTCT...,MHWVRQVPGKGLVWVSR,ATTAACAGCGATGGGAGTAGCACA,INSDGSST,AGCTACGCGGACTCCGTGCAGGGCCGATTCACCATCTCCAGAGACA...,SYADSVQGRFTISRDNAKNTLNLQMNSLRAEDTAVYYC,TGGGGCCAGGGAACCCTGGTCACCGTCTCGTCA,WGQGTLVTVSS,GCAAGTGGGAAGAATACGGCCCTAGACTTC,ASGKNTALDF,TGTGCAAGTGGGAAGAATACGGCCCTAGACTTCTGG,36,CASGKNTALDFW,12,349.25,12.226562,58.656250,41N252M100S3N,257S6M89S25N,271S8N40M41S,3.741000e-98,45.050,6.305000e-13,94.0625,100.0000,95.0000,1,252,42,293,258,263,1.0,6.0,272,311,9,48,1,34,35,58,59,109,110,133,134,247,278,310,248,277,TGGGA,5,CGGCCCTA,8,human,,CGGGGAGTCTCTGAAGATCACCTGTGCAGCCTCTGGATTCACCTTC...,GESLKITCAASGFTFSDYWMHWVRQVPGKGLVWVSRINSDGSSTSY...
3,3_Donor03+IGH-Clonotypes_igblastn_anarci_Heavy...,GCTGTCTCTGGTGGCTCCATCAGCAGTGGTGGTTACTCCTGGAGCT...,IGH,False,True,False,True,False,False,IGHV4-30-2*01,IGHV4-30-2*01,"IGHD2-2*01,IGHD2-2*02,IGHD2-2*03",IGHJ5*02,GCTGTCTCTGGTGGCTCCATCAGCAGTGGTGGTTACTCCTGGAGCT...,GCTGTCTCTGGTGGCTCCATCAGCAGTGGTGGTTACTCCTGGAGCT...,AVSGGSISSGGYSWSWIRQPPGKGLEWIGYIYHSGSTYYNPSLKGR...,AVSGGSISSGGYSWSWIRQPPGKGLEWIGYIYHSGSTYYNPSLKSR...,1,232,233,237,238,260,GCTGTCTCTGGTGGCTCCATCAGCAGTGGTGGTTACTCCTGGAGCT...,AVSGGSISSGGYSWSWIRQPPGKGLEWIGYIYHSGSTYYNPSLKGR...,GCTGTCTCTGGTGGCTCCATCAGCAGTGGTGGTTACTCCTGGAGCT...,AVSGGSISSGGYSWSWIRQPPGKGLEWIGYIYHSGSTYYNPSLKSR...,TACCA,P,TACCA,P,GACCGCTGGGGCCAGGGAACCCT,DRWGQGTL,GACCCCTGGGGCCAGGGAACCCT,DPWGQGTL,GCTGTCTCT,AVS,GGTGGCTCCATCAGCAGTGGTGGTTACTCC,GGSISSGGYS,TGGAGCTGGATCCGGCAGCCACCAGGGAAGGGCCTGGAGTGGATTG...,WSWIRQPPGKGLEWIGY,ATCTATCATAGTGGGAGCACC,IYHSGST,TACTACAACCCGTCCCTCAAGGGTCGAGTCACCATATCAGTAGACA...,YYNPSLKGRVTISVDRSKNQFSLKPSSVTAADTAVYYC,TGGGGCCAGGGAACCCT,WGQGTL,GCCAGAGTACCAGACCGC,ARVPDR,TGTGCCAGAGTACCAGACCGCTGG,24,CARVPDRW,8,354.00,10.304688,34.875000,66N232M28S1N,232S15N5M23S11N,237S11N23M17N,1.058000e-99,124.900,6.555000e-06,98.6875,100.0000,95.6250,1,232,67,298,233,237,16.0,20.0,238,260,12,34,1,9,10,39,40,90,91,111,112,225,244,260,226,243,,0,,0,human,,GCTGTCTCTGGTGGCTCCATCAGCAGTGGTGGTTACTCCTGGAGCT...,AVSGGSISSGGYSWSWIRQPPGKGLEWIGYIYHSGSTYYNPSLKGR...
4,4_704010461_igblastn_anarci_Heavy_IGHE_8,CGGGGAGTCTCTGAAGATCTCCTGTGCAGCCTCTGGATTCACCTTC...,IGH,False,True,False,True,False,False,IGHV3-30-3*01,IGHV3-30-3*01,"IGHD1-20*01,IGHD1-7*01,IGHD1/OR15-1a*01","IGHJ2*01,IGHJ6*03",CGGGGAGTCTCTGAAGATCTCCTGTGCAGCCTCTGGATTCACCTTC...,TGGGAGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTC...,GESLKISCAASGFTFSSYAMHWVRQAPGKGLEWVAVISYDGSNKYY...,GRSLRLSCAASGFTFSSYAMHWVRQAPGKGLEWVAVISYDGSNKYY...,1,254,341,348,376,382,CGGGGAGTCTCTGAAGATCTCCTGTGCAGCCTCTGGATTCACCTTC...,GESLKISCAASGFTFSSYAMHWVRQAPGKGLEWVAVISYDGSNKYY...,TGGGAGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTC...,GRSLRLSCAASGFTFSSYAMHWVRQAPGKGLEWVAVISYDGSNKYY...,ATAACTGG,ITG,ATAACTGG,ITG,CTGGTAC,WY,CTGGTAC,WY,CGGGGAGTCTCTGAAGATCTCCTGTGCAGCCTCT,GESLKISCAAS,GGATTCACCTTCAGTAGCTATGCT,GFTFSSYA,ATGCACTGGGTCCGCCAGGCTCCAGGCAAGGGGCTGGAGTGGGTGG...,MHWVRQAPGKGLEWVAV,ATATCATATGATGGAAGCAATAAA,ISYDGSNK,TACTACGCAGACTCCGTGAAGGGCCGATTCACCATCTCCAGAGACA...,YYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYC,,,GCGAGAGCACCCACCAAGGCTCCGGATGTGTTCCCCATCATATCAG...,ARAPTKAPDVFPIISGCRHPKDSSPVVLACLITGYHPTSVTVTWY,TGTGCGAGAGCACCCACCAAGGCTCCGGATGTGTTCCCCATCATAT...,141,CARAPTKAPDVFPIISGCRHPKDSSPVVLACLITGYHPTSVTVTWYM,47,377.25,16.062500,12.679688,41N254M155S1N,340S3N8M61S6N,375S3N7M27S43N,1.588000e-106,3.657,4.862000e+01,97.2500,100.0000,100.0000,1,254,42,295,341,348,4.0,11.0,376,382,4,10,1,34,35,58,59,109,110,133,134,247,,,248,382,CACCCACCAAGGCTCCGGATGTGTTCCCCATCATATCAGGGTGCAG...,86,GTACCACCCAACGTCCGTGACTGTCAC,27,human,,CGGGGAGTCTCTGAAGATCTCCTGTGCAGCCTCTGGATTCACCTTC...,GESLKISCAASGFTFSSYAMHWVRQAPGKGLEWVAVISYDGSNKYY...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,24995_Donor02+IGL-SomaticVariants_igblastn_ana...,CAGTCTGTGCTGACGCAGTCGCCCTCAGTGTCTGGGGCCCCAGGGC...,IGL,False,True,False,True,False,True,IGLV1-40*01,IGLV1-40*01,,"IGLJ2*01,IGLJ3*01",CAGTCTGTGCTGACGCAGTCGCCCTCAGTGTCTGGGGCCCCAGGGC...,CAGTCTGTGCTGACGCAGCCGCCCTCAGTGTCTGGGGCCCCAGGGC...,QSVLTQSPSVSGAPGQGVTISCTGSSSNIGAGYDAHWYQQLPGTAP...,QSVLTQPPSVSGAPGQRVTISCTGSSSNIGAGYDVHWYQQLPGTAP...,1,297,,,298,333,CAGTCTGTGCTGACGCAGTCGCCCTCAGTGTCTGGGGCCCCAGGGC...,QSVLTQSPSVSGAPGQGVTISCTGSSSNIGAGYDAHWYQQLPGTAP...,CAGTCTGTGCTGACGCAGCCGCCCTCAGTGTCTGGGGCCCCAGGGC...,QSVLTQPPSVSGAPGQRVTISCTGSSSNIGAGYDVHWYQQLPGTAP...,,,,,GTGATATTCGGCGGAGGGACCAAGGTCACCGTCCTA,VIFGGGTKVTVL,GTGGTATTCGGCGGAGGGACCAAGCTGACCGTCCTA,VVFGGGTKLTVL,CAGTCTGTGCTGACGCAGTCGCCCTCAGTGTCTGGGGCCCCAGGGC...,QSVLTQSPSVSGAPGQGVTISCTGS,AGCTCCAACATCGGGGCAGGCTATGAT,SSNIGAGYD,GCCCACTGGTACCAGCAACTTCCAGGAACAGCCCCCAAACTCCTCA...,AHWYQQLPGTAPKLLIF,GGTAATAAC,GNN,AATCGGCCCCCAGGGGTCCCTGATCGCTTTTCTGGCTCCAAGTCTG...,NRPPGVPDRFSGSKSGTSASLAITGLQAEDEADYYC,TTCGGCGGAGGGACCAAGGTCACCGTCCTA,FGGGTKVTVL,CAGTCCTATGACAGCAGCCTGAGTGGTGTGATA,QSYDSSLSGVI,TGCCAGTCCTATGACAGCAGCCTGAGTGGTGTGATATTC,39,CQSYDSSLSGVIF,13,424.00,,49.125000,297M36S2N,,297S1N36M1N,1.079000e-120,,4.335000e-10,95.6250,,91.6875,1,297,1,297,,,,,298,333,2,37,1,75,76,102,103,153,154,162,163,270,304,333,271,303,,0,,,human,,CAGTCTGTGCTGACGCAGTCGCCCTCAGTGTCTGGGGCCCCAGGGC...,QSVLTQSPSVSGAPGQGVTISCTGSSSNIGAGYDAHWYQQLPGTAP...
24996,24996_ERR3664757_igblastn_anarci_Light_Bulk_1364,GGGGTAGAGAAGACAGGACTCAGGACAATCTCCAGCATGGCCAGCT...,IGL,False,True,False,True,False,True,IGLV1-44*01,IGLV1-44*01,,IGLJ3*02,CAGTCTGTACTGACTCAGCCACCCTCAGCGTCTGGGACCCCCGGGC...,CAGTCTGTGCTGACTCAGCCACCCTCAGCGTCTGGGACCCCCGGGC...,QSVLTQPPSASGTPGQRVTISCSGSNSNIGTNAVNWYQQFPGTAPK...,QSVLTQPPSASGTPGQRVTISCSGSSSNIGSNTVNWYQQLPGTAPK...,1,294,,,296,331,CAGTCTGTACTGACTCAGCCACCCTCAGCGTCTGGGACCCCCGGGC...,QSVLTQPPSASGTPGQRVTISCSGSNSNIGTNAVNWYQQFPGTAPK...,CAGTCTGTGCTGACTCAGCCACCCTCAGCGTCTGGGACCCCCGGGC...,QSVLTQPPSASGTPGQRVTISCSGSSSNIGSNTVNWYQQLPGTAPK...,,,,,GGGTGTTCGGCGGAGGGACCAATCTGACCGTCCTTG,VFGGGTNLTVL,GGGTGTTCGGCGGAGGGACCAAGCTGACCGTCCTAG,VFGGGTKLTVL,CAGTCTGTACTGACTCAGCCACCCTCAGCGTCTGGGACCCCCGGGC...,QSVLTQPPSASGTPGQRVTISCSGS,AACTCCAACATCGGAACTAATGCT,NSNIGTNA,GTAAACTGGTACCAGCAGTTCCCAGGAACGGCCCCCAAACTCCTCA...,VNWYQQFPGTAPKLLIY,ATTGATAAT,IDN,CCGCGGCCCCGAGGGGTCCCTGACCGGTTTTATGCCTTCAAGTCTG...,PRPRGVPDRFYAFKSGTSASLAISGLQSEYEAYYYW,TTCGGCGGAGGGACCAATCTGACCGTCCTT,FGGGTNLTVL,GCAGCATGGGATGCCGGCCTGAGTGGTGGGGTG,AAWDAGLSGGV,TGGGCAGCATGGGATGCCGGCCTGAGTGGTGGGGTGTTC,39,WAAWDAGLSGGVF,13,394.50,,52.312500,93S294M102S2N,,388S2N36M65S,1.323000e-111,,6.916000e-11,92.8750,,94.4375,94,387,1,294,,,,,389,424,3,38,94,168,169,192,193,243,244,252,253,360,394,423,361,393,G,1,,,human,,CAGTCTGTACTGACTCAGCCACCCTCAGCGTCTGGGACCCCCGGGC...,QSVLTQPPSASGTPGQRVTISCSGSNSNIGTNAVNWYQQFPGTAPK...
24997,24997_Donor03+IGK-SomaticVariants_igblastn_ana...,GACATCCAGATGACCCAGTCTCCATCTTCCCTGTCTGCATCTGTGG...,IGK,False,True,False,True,False,True,IGKV1-39*01,"IGKV1-39*01,IGKV1D-39*01",,IGKJ5*01,GACATCCAGATGACCCAGTCTCCATCTTCCCTGTCTGCATCTGTGG...,GACATCCAGATGACCCAGTCTCCATCCTCCCTGTCTGCATCTGTAG...,DIQMTQSPSSLSASVGDRVTISCRASENINWHLNWDQQQPGKAPKL...,DIQMTQSPSSLSASVGDRVTITCRASQSISSYLNWYQQKPGKAPKL...,1,283,,,289,325,GACATCCAGATGACCCAGTCTCCATCTTCCCTGTCTGCATCTGTGG...,DIQMTQSPSSLSASVGDRVTISCRASENINWHLNWDQQQPGKAPKL...,GACATCCAGATGACCCAGTCTCCATCCTCCCTGTCTGCATCTGTAG...,DIQMTQSPSSLSASVGDRVTITCRASQSISSYLNWYQQKPGKAPKL...,,,,,ATTACCTTCGGCCAAGGGACACGGCTGGAGATTAAAC,ITFGQGTRLEIK,ATCACCTTCGGCCAAGGGACACGACTGGAGATTAAAC,ITFGQGTRLEIK,GACATCCAGATGACCCAGTCTCCATCTTCCCTGTCTGCATCTGTGG...,DIQMTQSPSSLSASVGDRVTISCRAS,GAGAATATTAACTGGCAT,ENINWH,TTAAATTGGGATCAGCAACAACCAGGGAAAGCCCCTAAGCTCCTGA...,LNWDQQQPGKAPKLLIY,GGTGCATCC,GAS,AGTTTGCAAAATGGGGTGCCGTCAAGATTCAGGGGCGGTGGATCTG...,SLQNGVPSRFRGGGSGTDFTLIITNLQPEDFATYYC,TTCGGCCAAGGGACACGGCTGGAGATTAAA,FGQGTRLEIK,CAACAGAGTTACACTGCCCTTTCCATTACC,QQSYTALSIT,TGTCAACAGAGTTACACTGCCCTTTCCATTACCTTC,36,CQQSYTALSITF,12,361.75,,53.875000,283M42S4N,,288S1N37M,6.067000e-102,,1.565000e-11,90.8125,,94.6250,1,283,1,283,,,,,289,325,2,38,1,78,79,96,97,147,148,156,157,264,295,324,265,294,TTTCC,5,,,human,,GACATCCAGATGACCCAGTCTCCATCTTCCCTGTCTGCATCTGTGG...,DIQMTQSPSSLSASVGDRVTISCRASENINWHLNWDQQQPGKAPKL...
24998,24998_ERR3664763_igblastn_anarci_Light_Bulk_238,GGTGGGTCCAGGAGGCAGAACTCTGGGTGTCTCACCATGGCCTGGA...,IGL,False,True,False,True,False,True,IGLV3-25*03,IGLV3-25*03,,IGLJ3*02,TCCTATGAGGTGACACAGCCACACTCGGTGTCAGTGTCCCCAGGAC...,TCCTATGAGCTGACACAGCCACCCTCGGTGTCAGTGTCCCCAGGAC...,SYEVTQPHSVSVSPGQTARIKCSGDEMPNQYAYLYQQKPGQAPVLI...,SYELTQPPSVSVSPGQTARITCSGDALPKQYAYWYQQKPGQAPVLV...,1,289,,,291,325,TCCTATGAGGTGACACAGCCACACTCGGTGTCAGTGTCCCCAGGAC...,SYEVTQPHSVSVSPGQTARIKCSGDEMPNQYAYLYQQKPGQAPVLI...,TCCTATGAGCTGACACAGCCACCCTCGGTGTCAGTGTCCCCAGGAC...,SYELTQPPSVSVSPGQTARITCSGDALPKQYAYWYQQKPGQAPVLV...,,,,,GGTGTTCGGCGGCGGGACCAAGCTGACCGTCCTAC,VFGGGTKLTVL,GGTGTTCGGCGGAGGGACCAAGCTGACCGTCCTAG,VFGGGTKLTVL,TCCTATGAGGTGACACAGCCACACTCGGTGTCAGTGTCCCCAGGAC...,SYEVTQPHSVSVSPGQTARIKCSGD,GAAATGCCGAATCAATAT,EMPNQY,GCTTATTTGTACCAACAGAAGCCAGGCCAGGCCCCAGTCCTTATCA...,AYLYQQKPGQAPVLIIY,CAGACCATT,QTI,GAGAGGCCCTCAGGGATCCCTGAGCGATTCTCTGGCTCCCTCTCAG...,ERPSGIPERFSGSLSGTTVTLTISGVQAEDEAVYYC,TTCGGCGGCGGGACCAAGCTGACCGTCCTA,FGGGTKLTVL,CAATCGGCAGACAATGGTGGTACTTATCAGGTG,QSADNGGTYQV,TGTCAATCGGCAGACAATGGTGGTACTTATCAGGTGTTC,39,CQSADNGGTYQVF,13,361.75,,52.312500,93S289M101S1N,,383S3N35M65S,9.239000e-102,,6.828000e-11,89.9375,,94.3125,94,382,1,289,,,,,384,418,4,38,94,168,169,186,187,237,238,246,247,354,388,417,355,387,A,1,,,human,,TCCTATGAGGTGACACAGCCACACTCGGTGTCAGTGTCCCCAGGAC...,SYEVTQPHSVSVSPGQTARIKCSGDEMPNQYAYLYQQKPGQAPVLI...


In [19]:
sadie_airr_only = sadie_airr[compare_key].copy()
sadie_airr_only["v_call_top"] = (
    sadie_airr_only["v_call"].str.split(",").str.get(0)
)
sadie_airr_only["d_call_top"] = (
    sadie_airr_only["d_call"].str.split(",").str.get(0)
)
sadie_airr_only["j_call_top"] = (
    sadie_airr_only["j_call"].str.split(",").str.get(0)
)
sadie_airr_only = sadie_airr_only.drop(ignore, axis=1)
sadie_airr_only.loc[:, starts_and_ends] = sadie_airr_only[
    starts_and_ends
].astype("Int64")
sadie_airr_only

Unnamed: 0,sequence_id,sequence,locus,stop_codon,vj_in_frame,productive,rev_comp,complete_vdj,sequence_alignment,sequence_alignment_aa,germline_alignment,germline_alignment_aa,fwr1,fwr1_aa,fwr1_end,fwr1_start,cdr1,cdr1_aa,cdr1_start,cdr1_end,fwr2,fwr2_aa,fwr2_end,fwr2_start,cdr2,cdr2_aa,cdr2_start,cdr2_end,fwr3,fwr3_aa,fwr3_end,fwr3_start,cdr3,cdr3_aa,cdr3_end,cdr3_start,fwr4,fwr4_aa,fwr4_end,fwr4_start,v_alignment_start,v_alignment_end,v_cigar,v_germline_alignment,v_germline_alignment_aa,v_germline_start,v_germline_end,v_identity,v_sequence_alignment,v_sequence_alignment_aa,v_sequence_start,v_sequence_end,d_alignment_start,d_alignment_end,d_cigar,d_germline_alignment,d_germline_alignment_aa,d_germline_start,d_germline_end,d_identity,d_sequence_alignment,d_sequence_alignment_aa,d_sequence_start,d_sequence_end,j_alignment_start,j_alignment_end,j_cigar,j_germline_alignment,j_germline_alignment_aa,j_germline_start,j_germline_end,j_identity,j_sequence_alignment,j_sequence_alignment_aa,j_sequence_start,j_sequence_end,junction,junction_aa,junction_aa_length,junction_length,np1,np1_length,np2,np2_length,v_call_top,d_call_top,j_call_top
0,0_G_d22_H_igblastn_anarci_Heavy_IGHA_219,ACTGTGTCTGGTGGTCTCATCACTAGTGATGGTCATTACTGGAGTT...,IGH,False,True,True,False,False,ACTGTGTCTGGTGGTCTCATCACTAGTGATGGTCATTACTGGAGTT...,TVSGGLITSDGHYWSWIRQSPGKGLEWLGSTYYNGATYYSESLESR...,ACTGTCTCTGGTGGCTCCATCAGCAGTGGTGGTTACTACTGGAGCT...,TVSGGSISSGGYYWSWIRQHPGKGLEWIGYIYYSGSTYYNPSLKSR...,ACTGTGTCT,TVS,9,1,GGTGGTCTCATCACTAGTGATGGTCATTAC,GGLITSDGHY,10,39,TGGAGTTGGATTCGCCAGTCTCCAGGGAAGGGCCTGGAGTGGTTGG...,WSWIRQSPGKGLEWLGS,90,40,ACTTATTACAATGGGGCCACC,TYYNGAT,91,111,TACTACAGCGAGTCCCTCGAGAGTCGACTCCTCATTTCAATAGACC...,YYSESLESRLLISIDPSQTQFSLKLTSVTAADTAVYYC,225,112,GCGACCTGGACCCAACTCCAACTTGATGCTTTTGATATG,ATWTQLQLDAFDM,264,226,TGGGGCCAAGGGACATTGGTCACCGTCTCCTCA,WGQGTLVTVSS,297,265,1,229,66N229M68S4N,ACTGTCTCTGGTGGCTCCATCAGCAGTGGTGGTTACTACTGGAGCT...,TVSGGSISSGGYYWSWIRQHPGKGLEWIGYIYYSGSTYYNPSLKSR...,67,295,82.5625,ACTGTGTCTGGTGGTCTCATCACTAGTGATGGTCATTACTGGAGTT...,TVSGGLITSDGHYWSWIRQSPGKGLEWLGSTYYNGATYYSESLESR...,1,229,239,244,238S13N6M53S,AACTCC,L,14,19,100.0000,AACTCC,L,239,244,249,297,248S49M1N,TGATGCTTTTGATATCTGGGGCCAAGGGACAATGGTCACCGTCTCTTCA,DAFDIWGQGTMVTVSS,1,49,93.8750,TGATGCTTTTGATATGTGGGGCCAAGGGACATTGGTCACCGTCTCCTCA,DAFDMWGQGTLVTVSS,249,297,TGTGCGACCTGGACCCAACTCCAACTTGATGCTTTTGATATGTGG,CATWTQLQLDAFDMW,15,45,CCTGGACCC,9,AACT,4,IGHV4-31*02,IGHD4-23*01,IGHJ3*02
1,1_705010661_igblastn_anarci_Heavy_IGHE_1125,GGCCTCAGTGAAGGTCTCCTGCAAGGCATCTGGATACACCTTCACC...,IGH,False,True,True,False,False,GGCCTCAGTGAAGGTCTCCTGCAAGGCATCTGGATACACCTTCACC...,ASVKVSCKASGYTFTNHYIHWVRQAPGQGLEWMGIINPSGGRTSHV...,GGCCTCAGTGAAGGTTTCCTGCAAGGCATCTGGATACACCTTCACC...,ASVKVSCKASGYTFTSYYMHWVRQAPGQGLEWMGIINPSGGSTSYA...,GGCCTCAGTGAAGGTCTCCTGCAAGGCATCT,ASVKVSCKAS,31,1,GGATACACCTTCACCAACCACTAC,GYTFTNHY,32,55,ATCCACTGGGTGCGACAGGCCCCTGGACAAGGGCTTGAGTGGATGG...,IHWVRQAPGQGLEWMGI,106,56,ATCAACCCTAGTGGTGGTAGGACA,INPSGGRT,107,130,AGTCACGTACAGGAGTTCCAGGGCAGAGTCACCATGACCAGGGACA...,SHVQEFQGRVTMTRDTSTSTVYMELSSLRSEDTAVYFC,244,131,GCGAGACAACAGTGTAGTGCTTCCAACTGCTATGAGGAGAGTTTTG...,ARQQCSASNCYEESFDH,295,245,TGGGGCCAGGGAACCCTGGTCACCGTCGCCTCA,WGQGTLVTVAS,328,296,1,250,44N250M105S2N,GGCCTCAGTGAAGGTTTCCTGCAAGGCATCTGGATACACCTTCACC...,ASVKVSCKASGYTFTSYYMHWVRQAPGQGLEWMGIINPSGGSTSYA...,45,294,93.1875,GGCCTCAGTGAAGGTCTCCTGCAAGGCATCTGGATACACCTTCACC...,ASVKVSCKASGYTFTNHYIHWVRQAPGQGLEWMGIINPSGGRTSHV...,1,250,257,278,256S7N22M77S2N,TGTAGTAGTACCAGCTGCTATG,CSSTSCY,8,29,81.8125,TGTAGTGCTTCCAACTGCTATG,CSASNCY,257,278,287,329,286S5N43M26S,TTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,FDYWGQGTLVTVSS,6,48,95.3750,TTTGACCACTGGGGCCAGGGAACCCTGGTCACCGTCGCCTCAG,FDHWGQGTLVTVAS,287,329,TGTGCGAGACAACAGTGTAGTGCTTCCAACTGCTATGAGGAGAGTT...,CARQQCSASNCYEESFDHW,19,57,CAACAG,6,AGGAGAGT,8,IGHV1-46*01,IGHD2-2*01,IGHJ4*02
2,2_Subject-53_igblastn_anarci_Heavy_IGHD_22592,CGGGGAGTCTCTGAAGATCACCTGTGCAGCCTCTGGATTCACCTTC...,IGH,False,True,True,False,False,CGGGGAGTCTCTGAAGATCACCTGTGCAGCCTCTGGATTCACCTTC...,GESLKITCAASGFTFSDYWMHWVRQVPGKGLVWVSRINSDGSSTSY...,TGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTC...,GGSLRLSCAASGFTFSSYWMHWVRQAPGKGLVWVSRINSDGSSTSY...,CGGGGAGTCTCTGAAGATCACCTGTGCAGCCTCT,GESLKITCAAS,34,1,GGATTCACCTTCAGTGACTACTGG,GFTFSDYW,35,58,ATGCACTGGGTCCGCCAAGTTCCAGGGAAGGGGCTGGTGTGGGTCT...,MHWVRQVPGKGLVWVSR,109,59,ATTAACAGCGATGGGAGTAGCACA,INSDGSST,110,133,AGCTACGCGGACTCCGTGCAGGGCCGATTCACCATCTCCAGAGACA...,SYADSVQGRFTISRDNAKNTLNLQMNSLRAEDTAVYYC,247,134,GCAAGTGGGAAGAATACGGCCCTAGACTTC,ASGKNTALDF,277,248,TGGGGCCAGGGAACCCTGGTCACCGTCTCGTCA,WGQGTLVTVSS,310,278,1,252,41N252M100S3N,TGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTC...,GGSLRLSCAASGFTFSSYWMHWVRQAPGKGLVWVSRINSDGSSTSY...,42,293,94.0625,CGGGGAGTCTCTGAAGATCACCTGTGCAGCCTCTGGATTCACCTTC...,GESLKITCAASGFTFSDYWMHWVRQVPGKGLVWVSRINSDGSSTSY...,1,252,258,263,257S6M89S25N,AGAATA,N,1,6,100.0000,AGAATA,N,258,263,272,311,271S8N40M41S,GACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,DYWGQGTLVTVSS,9,48,95.0000,GACTTCTGGGGCCAGGGAACCCTGGTCACCGTCTCGTCAG,DFWGQGTLVTVSS,272,311,TGTGCAAGTGGGAAGAATACGGCCCTAGACTTCTGG,CASGKNTALDFW,12,36,TGGGA,5,CGGCCCTA,8,IGHV3-74*01,IGHD2/OR15-2a*01,IGHJ4*02
3,3_Donor03+IGH-Clonotypes_igblastn_anarci_Heavy...,GCTGTCTCTGGTGGCTCCATCAGCAGTGGTGGTTACTCCTGGAGCT...,IGH,False,True,True,False,False,GCTGTCTCTGGTGGCTCCATCAGCAGTGGTGGTTACTCCTGGAGCT...,AVSGGSISSGGYSWSWIRQPPGKGLEWIGYIYHSGSTYYNPSLKGR...,GCTGTCTCTGGTGGCTCCATCAGCAGTGGTGGTTACTCCTGGAGCT...,AVSGGSISSGGYSWSWIRQPPGKGLEWIGYIYHSGSTYYNPSLKSR...,GCTGTCTCT,AVS,9,1,GGTGGCTCCATCAGCAGTGGTGGTTACTCC,GGSISSGGYS,10,39,TGGAGCTGGATCCGGCAGCCACCAGGGAAGGGCCTGGAGTGGATTG...,WSWIRQPPGKGLEWIGY,90,40,ATCTATCATAGTGGGAGCACC,IYHSGST,91,111,TACTACAACCCGTCCCTCAAGGGTCGAGTCACCATATCAGTAGACA...,YYNPSLKGRVTISVDRSKNQFSLKPSSVTAADTAVYYC,225,112,GCCAGAGTACCAGACCGC,ARVPDR,243,226,TGGGGCCAGGGAACCCT,WGQGTL,260,244,1,232,66N232M28S1N,GCTGTCTCTGGTGGCTCCATCAGCAGTGGTGGTTACTCCTGGAGCT...,AVSGGSISSGGYSWSWIRQPPGKGLEWIGYIYHSGSTYYNPSLKSR...,67,298,98.6875,GCTGTCTCTGGTGGCTCCATCAGCAGTGGTGGTTACTCCTGGAGCT...,AVSGGSISSGGYSWSWIRQPPGKGLEWIGYIYHSGSTYYNPSLKGR...,1,232,233,237,232S15N5M23S11N,TACCA,P,16,20,100.0000,TACCA,P,233,237,238,260,237S11N23M17N,GACCCCTGGGGCCAGGGAACCCT,DPWGQGTL,12,34,95.6250,GACCGCTGGGGCCAGGGAACCCT,DRWGQGTL,238,260,TGTGCCAGAGTACCAGACCGCTGG,CARVPDRW,8,24,,0,,0,IGHV4-30-2*01,IGHD2-2*01,IGHJ5*02
4,4_704010461_igblastn_anarci_Heavy_IGHE_8,CGGGGAGTCTCTGAAGATCTCCTGTGCAGCCTCTGGATTCACCTTC...,IGH,False,True,True,False,False,CGGGGAGTCTCTGAAGATCTCCTGTGCAGCCTCTGGATTCACCTTC...,GESLKISCAASGFTFSSYAMHWVRQAPGKGLEWVAVISYDGSNKYY...,TGGGAGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTC...,GRSLRLSCAASGFTFSSYAMHWVRQAPGKGLEWVAVISYDGSNKYY...,CGGGGAGTCTCTGAAGATCTCCTGTGCAGCCTCT,GESLKISCAAS,34,1,GGATTCACCTTCAGTAGCTATGCT,GFTFSSYA,35,58,ATGCACTGGGTCCGCCAGGCTCCAGGCAAGGGGCTGGAGTGGGTGG...,MHWVRQAPGKGLEWVAV,109,59,ATATCATATGATGGAAGCAATAAA,ISYDGSNK,110,133,TACTACGCAGACTCCGTGAAGGGCCGATTCACCATCTCCAGAGACA...,YYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYC,247,134,GCGAGAGCACCCACCAAGGCTCCGGATGTGTTCCCCATCATATCAG...,ARAPTKAPDVFPIISGCRHPKDSSPVVLACLITGYHPTSVTVTWY,382,248,,,,,1,254,41N254M155S1N,TGGGAGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTC...,GRSLRLSCAASGFTFSSYAMHWVRQAPGKGLEWVAVISYDGSNKYY...,42,295,97.2500,CGGGGAGTCTCTGAAGATCTCCTGTGCAGCCTCTGGATTCACCTTC...,GESLKISCAASGFTFSSYAMHWVRQAPGKGLEWVAVISYDGSNKYY...,1,254,341,348,340S3N8M61S6N,ATAACTGG,ITG,4,11,100.0000,ATAACTGG,ITG,341,348,376,382,375S3N7M27S43N,CTGGTAC,WY,4,10,100.0000,CTGGTAC,WY,376,382,TGTGCGAGAGCACCCACCAAGGCTCCGGATGTGTTCCCCATCATAT...,CARAPTKAPDVFPIISGCRHPKDSSPVVLACLITGYHPTSVTVTWYM,47,141,CACCCACCAAGGCTCCGGATGTGTTCCCCATCATATCAGGGTGCAG...,86,GTACCACCCAACGTCCGTGACTGTCAC,27,IGHV3-30-3*01,IGHD1-20*01,IGHJ2*01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,24995_Donor02+IGL-SomaticVariants_igblastn_ana...,CAGTCTGTGCTGACGCAGTCGCCCTCAGTGTCTGGGGCCCCAGGGC...,IGL,False,True,True,False,True,CAGTCTGTGCTGACGCAGTCGCCCTCAGTGTCTGGGGCCCCAGGGC...,QSVLTQSPSVSGAPGQGVTISCTGSSSNIGAGYDAHWYQQLPGTAP...,CAGTCTGTGCTGACGCAGCCGCCCTCAGTGTCTGGGGCCCCAGGGC...,QSVLTQPPSVSGAPGQRVTISCTGSSSNIGAGYDVHWYQQLPGTAP...,CAGTCTGTGCTGACGCAGTCGCCCTCAGTGTCTGGGGCCCCAGGGC...,QSVLTQSPSVSGAPGQGVTISCTGS,75,1,AGCTCCAACATCGGGGCAGGCTATGAT,SSNIGAGYD,76,102,GCCCACTGGTACCAGCAACTTCCAGGAACAGCCCCCAAACTCCTCA...,AHWYQQLPGTAPKLLIF,153,103,GGTAATAAC,GNN,154,162,AATCGGCCCCCAGGGGTCCCTGATCGCTTTTCTGGCTCCAAGTCTG...,NRPPGVPDRFSGSKSGTSASLAITGLQAEDEADYYC,270,163,CAGTCCTATGACAGCAGCCTGAGTGGTGTGATA,QSYDSSLSGVI,303,271,TTCGGCGGAGGGACCAAGGTCACCGTCCTA,FGGGTKVTVL,333,304,1,297,297M36S2N,CAGTCTGTGCTGACGCAGCCGCCCTCAGTGTCTGGGGCCCCAGGGC...,QSVLTQPPSVSGAPGQRVTISCTGSSSNIGAGYDVHWYQQLPGTAP...,1,297,95.6250,CAGTCTGTGCTGACGCAGTCGCCCTCAGTGTCTGGGGCCCCAGGGC...,QSVLTQSPSVSGAPGQGVTISCTGSSSNIGAGYDAHWYQQLPGTAP...,1,297,,,,,,,,,,,,,298,333,297S1N36M1N,GTGGTATTCGGCGGAGGGACCAAGCTGACCGTCCTA,VVFGGGTKLTVL,2,37,91.6875,GTGATATTCGGCGGAGGGACCAAGGTCACCGTCCTA,VIFGGGTKVTVL,298,333,TGCCAGTCCTATGACAGCAGCCTGAGTGGTGTGATATTC,CQSYDSSLSGVIF,13,39,,0,,,IGLV1-40*01,,IGLJ2*01
24996,24996_ERR3664757_igblastn_anarci_Light_Bulk_1364,GGGGTAGAGAAGACAGGACTCAGGACAATCTCCAGCATGGCCAGCT...,IGL,False,True,True,False,True,CAGTCTGTACTGACTCAGCCACCCTCAGCGTCTGGGACCCCCGGGC...,QSVLTQPPSASGTPGQRVTISCSGSNSNIGTNAVNWYQQFPGTAPK...,CAGTCTGTGCTGACTCAGCCACCCTCAGCGTCTGGGACCCCCGGGC...,QSVLTQPPSASGTPGQRVTISCSGSSSNIGSNTVNWYQQLPGTAPK...,CAGTCTGTACTGACTCAGCCACCCTCAGCGTCTGGGACCCCCGGGC...,QSVLTQPPSASGTPGQRVTISCSGS,168,94,AACTCCAACATCGGAACTAATGCT,NSNIGTNA,169,192,GTAAACTGGTACCAGCAGTTCCCAGGAACGGCCCCCAAACTCCTCA...,VNWYQQFPGTAPKLLIY,243,193,ATTGATAAT,IDN,244,252,CCGCGGCCCCGAGGGGTCCCTGACCGGTTTTATGCCTTCAAGTCTG...,PRPRGVPDRFYAFKSGTSASLAISGLQSEYEAYYYW,360,253,GCAGCATGGGATGCCGGCCTGAGTGGTGGGGTG,AAWDAGLSGGV,393,361,TTCGGCGGAGGGACCAATCTGACCGTCCTT,FGGGTNLTVL,423,394,1,294,93S294M102S2N,CAGTCTGTGCTGACTCAGCCACCCTCAGCGTCTGGGACCCCCGGGC...,QSVLTQPPSASGTPGQRVTISCSGSSSNIGSNTVNWYQQLPGTAPK...,1,294,92.8750,CAGTCTGTACTGACTCAGCCACCCTCAGCGTCTGGGACCCCCGGGC...,QSVLTQPPSASGTPGQRVTISCSGSNSNIGTNAVNWYQQFPGTAPK...,94,387,,,,,,,,,,,,,296,331,388S2N36M65S,GGGTGTTCGGCGGAGGGACCAAGCTGACCGTCCTAG,VFGGGTKLTVL,3,38,94.4375,GGGTGTTCGGCGGAGGGACCAATCTGACCGTCCTTG,VFGGGTNLTVL,389,424,TGGGCAGCATGGGATGCCGGCCTGAGTGGTGGGGTGTTC,WAAWDAGLSGGVF,13,39,G,1,,,IGLV1-44*01,,IGLJ3*02
24997,24997_Donor03+IGK-SomaticVariants_igblastn_ana...,GACATCCAGATGACCCAGTCTCCATCTTCCCTGTCTGCATCTGTGG...,IGK,False,True,True,False,True,GACATCCAGATGACCCAGTCTCCATCTTCCCTGTCTGCATCTGTGG...,DIQMTQSPSSLSASVGDRVTISCRASENINWHLNWDQQQPGKAPKL...,GACATCCAGATGACCCAGTCTCCATCCTCCCTGTCTGCATCTGTAG...,DIQMTQSPSSLSASVGDRVTITCRASQSISSYLNWYQQKPGKAPKL...,GACATCCAGATGACCCAGTCTCCATCTTCCCTGTCTGCATCTGTGG...,DIQMTQSPSSLSASVGDRVTISCRAS,78,1,GAGAATATTAACTGGCAT,ENINWH,79,96,TTAAATTGGGATCAGCAACAACCAGGGAAAGCCCCTAAGCTCCTGA...,LNWDQQQPGKAPKLLIY,147,97,GGTGCATCC,GAS,148,156,AGTTTGCAAAATGGGGTGCCGTCAAGATTCAGGGGCGGTGGATCTG...,SLQNGVPSRFRGGGSGTDFTLIITNLQPEDFATYYC,264,157,CAACAGAGTTACACTGCCCTTTCCATTACC,QQSYTALSIT,294,265,TTCGGCCAAGGGACACGGCTGGAGATTAAA,FGQGTRLEIK,324,295,1,283,283M42S4N,GACATCCAGATGACCCAGTCTCCATCCTCCCTGTCTGCATCTGTAG...,DIQMTQSPSSLSASVGDRVTITCRASQSISSYLNWYQQKPGKAPKL...,1,283,90.8125,GACATCCAGATGACCCAGTCTCCATCTTCCCTGTCTGCATCTGTGG...,DIQMTQSPSSLSASVGDRVTISCRASENINWHLNWDQQQPGKAPKL...,1,283,,,,,,,,,,,,,289,325,288S1N37M,ATCACCTTCGGCCAAGGGACACGACTGGAGATTAAAC,ITFGQGTRLEIK,2,38,94.6250,ATTACCTTCGGCCAAGGGACACGGCTGGAGATTAAAC,ITFGQGTRLEIK,289,325,TGTCAACAGAGTTACACTGCCCTTTCCATTACCTTC,CQQSYTALSITF,12,36,TTTCC,5,,,IGKV1-39*01,,IGKJ5*01
24998,24998_ERR3664763_igblastn_anarci_Light_Bulk_238,GGTGGGTCCAGGAGGCAGAACTCTGGGTGTCTCACCATGGCCTGGA...,IGL,False,True,True,False,True,TCCTATGAGGTGACACAGCCACACTCGGTGTCAGTGTCCCCAGGAC...,SYEVTQPHSVSVSPGQTARIKCSGDEMPNQYAYLYQQKPGQAPVLI...,TCCTATGAGCTGACACAGCCACCCTCGGTGTCAGTGTCCCCAGGAC...,SYELTQPPSVSVSPGQTARITCSGDALPKQYAYWYQQKPGQAPVLV...,TCCTATGAGGTGACACAGCCACACTCGGTGTCAGTGTCCCCAGGAC...,SYEVTQPHSVSVSPGQTARIKCSGD,168,94,GAAATGCCGAATCAATAT,EMPNQY,169,186,GCTTATTTGTACCAACAGAAGCCAGGCCAGGCCCCAGTCCTTATCA...,AYLYQQKPGQAPVLIIY,237,187,CAGACCATT,QTI,238,246,GAGAGGCCCTCAGGGATCCCTGAGCGATTCTCTGGCTCCCTCTCAG...,ERPSGIPERFSGSLSGTTVTLTISGVQAEDEAVYYC,354,247,CAATCGGCAGACAATGGTGGTACTTATCAGGTG,QSADNGGTYQV,387,355,TTCGGCGGCGGGACCAAGCTGACCGTCCTA,FGGGTKLTVL,417,388,1,289,93S289M101S1N,TCCTATGAGCTGACACAGCCACCCTCGGTGTCAGTGTCCCCAGGAC...,SYELTQPPSVSVSPGQTARITCSGDALPKQYAYWYQQKPGQAPVLV...,1,289,89.9375,TCCTATGAGGTGACACAGCCACACTCGGTGTCAGTGTCCCCAGGAC...,SYEVTQPHSVSVSPGQTARIKCSGDEMPNQYAYLYQQKPGQAPVLI...,94,382,,,,,,,,,,,,,291,325,383S3N35M65S,GGTGTTCGGCGGAGGGACCAAGCTGACCGTCCTAG,VFGGGTKLTVL,4,38,94.3125,GGTGTTCGGCGGCGGGACCAAGCTGACCGTCCTAC,VFGGGTKLTVL,384,418,TGTCAATCGGCAGACAATGGTGGTACTTATCAGGTGTTC,CQSADNGGTYQVF,13,39,A,1,,,IGLV3-25*03,,IGLJ3*02


## Read OAS

In [5]:
df_bz2 = pd.read_csv(
    "https://sadie.s3.us-east-2.amazonaws.com/integration/OAS_sample_subsample.bz2",
    index_col=0,
).reset_index()

sub_sample_file = "../../tests/integration/airr/fixtures/OAS_subsample.fasta"
with open("../../tests/integration/airr/fixtures/OAS_subsample.fasta", "w") as f:
    for index, (i, k, j) in enumerate(
        zip(
            df_bz2["OAS_dataunit"].str.split(".csv.gz").str.get(0),
            df_bz2["OAS_dataindex"],
            df_bz2["sequence"],
        )
    ):
        f.write(f">{index}_{i}_{k}\n{j}\n")

## IMGT HI-Vqeust "AIRR" format

In [6]:
imgt_airr = "../../tests/integration/airr/fixtures/OAS_airr_from_imgtvquest.tsv"
imgt_df = pd.read_csv(imgt_airr, delimiter="\t")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [17]:
spread.df_to_sheet(
    imgt_df.head(100).set_index("sequence_id"),
    sheet="imgt",
    replace=True,
    freeze_headers=True,
    freeze_index=True,
)

In [164]:
top_100_imgt = imgt_df.head(100).copy()
top_100_sadie = sadie_airr.table.head(100).copy()
compariables = []

In [None]:
# spreadable_df = fillna(sadie_airr.table.head(100))
spread.df_to_sheet(
    spreadable_df.set_index("sequence_id"),
    sheet="sadie",
    replace=True,
    freeze_headers=True,
    freeze_index=True,
)

## Clean UP IMGT

In [256]:
def map_bool(df, col):
    return df.loc[:, col].map(
        {"T": True, "F": False, True: True, False: False, nan: False},
        na_action="ignore",
    )


top_100_imgt_airr_only = top_100_imgt[keys].copy()
bool_cols = ["vj_in_frame", "productive", "rev_comp", "complete_vdj", "stop_codon"]
for col in bool_cols:
    top_100_imgt_airr_only[col] = map_bool(top_100_imgt_airr_only, col)


upper_columns = [
    "fwr1",
    "cdr1",
    "fwr2",
    "cdr2",
    "fwr3",
    "cdr3",
    "fwr4",
    "germline_alignment",
    "germline_alignment_aa",
    "sequence_alignment",
    "sequence_alignment_aa",
    "v_germline_alignment",
    "v_germline_alignment_aa",
    "v_sequence_alignment",
    "v_sequence_alignment_aa",
    "d_germline_alignment",
    "d_germline_alignment_aa",
    "d_sequence_alignment",
    "d_sequence_alignment_aa",
    "j_germline_alignment",
    "j_germline_alignment_aa",
    "j_sequence_alignment",
    "j_sequence_alignment_aa",
    "np1",
    "np2",
    "junction",
    "sequence",
]
for col in upper_columns:
    top_100_imgt_airr_only[col] = (
        top_100_imgt_airr_only[col].str.upper().str.replace(".", "")
    )


top_100_imgt_airr_only["v_call_top"] = (
    top_100_imgt_airr_only["v_call"].str.split(",").str.get(0).str.split().str.get(1)
)
top_100_imgt_airr_only["d_call_top"] = (
    top_100_imgt_airr_only["d_call"]
    .str.split(",")
    .str.get(0)
    .str.split("_")
    .str.get(-1)
)
top_100_imgt_airr_only["j_call_top"] = (
    top_100_imgt_airr_only["j_call"].str.split(",").str.get(0).str.split().str.get(1)
)
top_100_imgt_airr_only = top_100_imgt_airr_only.drop(ignore, axis=1)
top_100_imgt_airr_only.loc[:, starts_and_ends] = top_100_imgt_airr_only[
    starts_and_ends
].astype("Int64")
top_100_imgt_airr_only = top_100_imgt_airr_only.set_index("sequence_id")

  top_100_imgt_airr_only[col].str.upper().str.replace(".", "")


## Clean up sadie

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_100_sadie_airr_only["v_call_top"] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_100_sadie_airr_only["d_call_top"] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_100_sadie_airr_only["j_call_top"] = (


In [268]:
ref["v_identity"] // 1

82.0

In [274]:
ref["np1"]
target["np1"]

'CCTGGACCC'

In [275]:
ref["np1"]

'CCTGGACC'

In [335]:
top_100_imgt_airr_onl.

Index(['0_G_d22_H_igblastn_anarci_Heavy_IGHA_219', '1_705010661_igblastn_anarci_Heavy_IGHE_1125', '2_Subject-53_igblastn_anarci_Heavy_IGHD_22592', '3_Donor03+IGH-Clonotypes_igblastn_anarci_Heavy_Bul', '4_704010461_igblastn_anarci_Heavy_IGHE_8', '5_E_d23_H_igblastn_anarci_Heavy_IGHD_3268', '6_BCP9-GC_igblastn_anarci_Heavy_IGHD_269', '7_BCP9-Naive_igblastn_anarci_IGHD_177', '8_BCP3-GC_igblastn_anarci_Heavy_IGHD_2256', '9_SRR12081566_igblastn_anarci_Heavy_Bulk_22358', '10_703010269_igblastn_anarci_Heavy_IGHD_33102', '11_707010476_igblastn_anarci_Heavy_IGHD_5257', '12_703010632_igblastn_anarci_Heavy_IGHG_27951', '13_Donor01+IGH-Clonotypes_igblastn_anarci_Heavy_Bu', '14_705010661_igblastn_anarci_Heavy_IGHA_18220', '15_Subject-50_igblastn_anarci_Heavy_IGHD_68196', '16_BCP3-MBC_igblastn_anarci_IGHG_8632', '17_Subject-19_igblastn_anarci_Heavy_IGHG_32336', '18_Subject-71_igblastn_anarci_Heavy_IGHD_1288', '19_BCP2-GC_igblastn_anarci_IGHG_6595', '20_SRR12081539_igblastn_anarci_Heavy_Bulk_32454', 

In [356]:
target_all = top_100_sadie_airr_only.reset_index()
reference_all = top_100_imgt_airr_only.reset_index()
for index in reference_all.index:
    ref = reference_all.loc[index]
    target = target_all.loc[index]
    if (
        ref["v_call_top"] == target["v_call_top"]
        and ref["j_call_top"] == target["j_call_top"]
    ):
        check_these = ["fwr1", "cdr1", "fwr2", "cdr2", "fwr3", "cdr3"]
        if not (ref[check_these].fillna("") == target[check_these].fillna("")).all():
            print("here")
            break
    else:
        print(
            ref["v_call_top"],
            target["v_call_top"],
            ref["j_call_top"],
            target["j_call_top"],
        )
    # diff_indexes = ref[~(ref == target)].index


#     for different_index in diff_indexes:
#         ref_ = ref[different_index]
#         target_ = target[different_index]
#         if isinstance(ref_, float):
#             if round(ref_, 1) == round(target_, 1):
#                 continue
#         print(f"{different_index}\nIMGT: {ref_}\nSADIE: {target_}\n")
#     break

IGHV3-30-3*01 IGHV3-30-3*01 nan IGHJ2*01
IGHV4-38-2*01 IGHV4-38-2*02 IGHJ5*02 IGHJ5*02
IGHV4-34*08 IGHV4-34*01 IGHJ4*02 IGHJ4*02
IGHV1-2*02 IGHV1-2*02 IGHJ5*01 IGHJ4*03
IGHV5-51*01 IGHV5-51*01 IGHJ5*01 IGHJ4*02
IGHV4-39*01 IGHV4-39*01 IGHJ4*02 IGHJ5*02
IGHV3-30*03 IGHV3-30*18 IGHJ6*02 IGHJ6*02
IGHV1-69-2*01 IGHV1-69-2*01 IGHJ4*03 IGHJ4*01
IGHV4-59*09 IGHV4-61*02 IGHJ4*02 IGHJ4*02
IGHV1-69*07 IGHV1-69*05 nan IGHJ6*01
IGHV4-39*07 IGHV4-59*01 IGHJ4*02 IGHJ4*02
IGHV3-30*03 IGHV3-30*18 IGHJ6*02 IGHJ6*02
IGHV4-38-2*02 IGHV4-30-4*05 IGHJ6*01 IGHJ6*01
here


In [358]:
ref[check_these] == target[check_these]

fwr1     True
cdr1     True
fwr2     True
cdr2    False
fwr3     True
cdr3     True
Name: 99, dtype: bool

In [362]:
ref["cdr2_aa"]

'ISGSRTYI'

In [363]:
target["cdr2_aa"]

'ISGSFRTYI'

In [361]:
ref["sequence"]

'TACCTTGACTTAACTCTTGGGGGTCTGGGGGAGGCCTGGTCAAGCCTGGGGGGTCCCTGAGACTCTCATGTGCAGCCTCTGGATTCACCTTCAGTAGTTTTACCATGAATTGGGTCCGCCAGGCTCCAGGGAAGGGACTGGAGTGGGTCTCATCCATTAGCGGCAGTTTCAGAACTTACATATATTATGCCGACTCAGTGAAGGGCCGATTCACCGTCTCCAGAGACAACGCCAAGGAATTGGTGTTTCTGCAGATGGACAACCTGAGAGTCGAAGACACAGGTGTATATTATTGTGCGAGAGACCTAAATACGGTGACTACCCCAGAATACTTCCAACACTGGGGCCTGGGCACCCCGGTCTCCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTG'

In [355]:
target["cdr2"]

'ATCAACCCTAGTGGTGATAACACA'

In [344]:
ref["fwr4"]

'TGGGGCCAGGGAACC'

In [345]:

target["fwr4"]

'TGGGGCCAGGGAACCCT'

In [342]:
ref[check_these] == target[check_these]

fwr1     True
cdr1     True
fwr2     True
cdr2     True
fwr3     True
cdr3     True
fwr4    False
Name: 3, dtype: bool

In [328]:
len(ref["fwr4"])

33

In [320]:
ref["v_call_top"]

'IGHV4-31*02'

In [321]:
target["v_call_top"]

'IGHV4-31*02'

In [303]:
ref["j_call_top"]

'IGHJ4*02'

In [304]:
target["j_call_top"]

'IGHJ4*02'

In [330]:
len(target["fwr4"])

33

In [331]:
len(ref["fwr4"])

33

In [None]:
TGGGGCCAGGGAACCCTGGTCACCGTCGCCTCAG
TGGGGCCAGGGAACCCTGGTCACCGTCGCCTCA

In [312]:
from Bio.Seq import Seq

Seq("tggggccagggaaccctggtcaccgtcgcctca").translate()

Seq('WGQGTLVTVAS')

In [308]:
ref["sequence"]

'GGCCTCAGTGAAGGTCTCCTGCAAGGCATCTGGATACACCTTCACCAACCACTACATCCACTGGGTGCGACAGGCCCCTGGACAAGGGCTTGAGTGGATGGGAATAATCAACCCTAGTGGTGGTAGGACAAGTCACGTACAGGAGTTCCAGGGCAGAGTCACCATGACCAGGGACACGTCCACGAGTACTGTCTACATGGAGTTGAGTAGCCTGAGATCTGAGGACACGGCCGTGTACTTTTGTGCGAGACAACAGTGTAGTGCTTCCAACTGCTATGAGGAGAGTTTTGACCACTGGGGCCAGGGAACCCTGGTCACCGTCGCCTCAGCCTCCACACAGAGCCCATCCGTCTTC'

In [298]:
target[check_these] == ref[check_these]

fwr1     True
cdr1     True
fwr2     True
cdr2     True
fwr3     True
cdr3     True
fwr4    False
Name: 1_705010661_igblastn_anarci_Heavy_IGHE_1125, dtype: bool

'ACTGTCTCTGGTGGCTCCATCAGCAGTGGTGGTTACTACTGGAGCTGGATCCGCCAGCACCCAGGGAAGGGCCTGGAGTGGATTGGGTACATCTATTACAGTGGGAGCACCTACTACAACCCGTCCCTCAAGAGTCGAGTTACCATATCAGTAGACACGTCTAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACTGCCGCGGACACGGCCGTGTATTACTGTGCGANNNNNNNNNAACTCCNNNNTGATGCTTTTGATATCTGGGGCCAAGGGACAATGGTCACCGTCTCTTCA'

In [193]:
spread.df_to_sheet(
    fillna(top_100_sadie_airr_only),
    sheet="sadie_airr_only",
    freeze_headers=True,
    freeze_index=True,
    replace=True,
)

  df.dtypes.apply(lambda x: x in ["float64", "Int16"])


In [213]:
spread.df_to_sheet(
    fillna(top_100_imgt_airr_only),
    sheet="imgt_airr_only",
    freeze_headers=True,
    freeze_index=True,
    replace=True,
)

  df.dtypes.apply(lambda x: x in ["float64", "Int16", "Int64"])


germline_alignment         ACTGTCTCTGGTGGCTCCATCAGCAGTGGTGGTTACTACTGGAGCT...
germline_alignment_aa      TVSGGSISSGGYYWSWIRQHPGKGLEWIGYIYYSGSTYYNPSLKSR...
v_alignment_end                                                       316.00
v_germline_start                                                        1.00
v_germline_end                                                        316.00
v_identity                                                             82.22
d_alignment_start                                                     325.00
d_alignment_end                                                       334.00
d_germline_alignment                                              CAACTGGAAC
d_germline_alignment_aa                                                  QLE
d_germline_start                                                        5.00
d_germline_end                                                         14.00
d_identity                                                              0.00

## Try and find OAS similariities

I'm suspect of the entire airr file since it doesn't have fwr4

In [None]:
df_bz2 = pd.read_csv(
    "https://sadie.s3.us-east-2.amazonaws.com/integration/OAS_sample_subsample.bz2",
    index_col=0,
).reset_index()

In [None]:
airr_dataframe = airr_api.run_dataframe(df_bz2, "index", "sequence")

In [None]:
ref = df_bz2[df_bz2.columns.intersection(airr_dataframe.table.columns)].drop(
    "sequence_id", axis=1
)
target = airr_dataframe.table[
    airr_dataframe.table.columns.intersection(df_bz2.columns)
].drop("sequence_id", axis=1)

In [None]:
ref.re

In [None]:
for index in ref.index:
    sub_ref = ref.loc[index]
    sub_target = target.loc[index]
    aggree = sub_ref[sub_ref == sub_target].index
    disagree = sub_ref[sub_ref != sub_target].index
    if disagree.empty:
        print('yay')
    for dis_index in disagree:
        print(f"OAR-{dis_index}:{sub_ref[dis_index]}")
        print(f"Sadie-{dis_index}:{sub_target[dis_index]}\n")

    break

In [None]:
disagree

In [None]:
from pandas.testing import assert_series_equal

# Get IMGT airr

In [None]:
imgt_airr = "../../tests/integration/airr/fixtures/OAS_airr_from_imgtvquest.tsv"
imgt_df = pd.read_csv(imgt_airr, delimiter="\t")

In [None]:
spread.df_to_sheet(imgt_df, sheet="imgt_airr")