In [1]:
%load_ext lab_black

# sadie.airr

The primary purpose of `sadie.airr` is the annotation of adaptive receptor nucleotide sequences. The backend uses [IgBLAST](https://www.ncbi.nlm.nih.gov/igblast/) for the initial alignments but abstracts away all the difficult parts, leaving a smooth Pythonic API or command line interface.

Most importantly, the output complies with the standards set by [The AIRR community](https://docs.airr-community.org/en/stable/#table-of-contents). Results are returned as AirrTable objects, which are built on a [pandas dataframe](https://pandas.pydata.org) and can be manipulated with pandas dataframe operations.

#### Run Single Sequence

In [2]:
from sadie.airr import Airr, AirrTable, ScfvAirrTable, __version__

In [3]:
# version string exported by sadie.airr (imported in the cell above)
__version__

'0.2.1'

In [4]:
# Import the API class and the table classes from sadie.airr
from sadie.airr import Airr, AirrTable, ScfvAirrTable

# Initialize the API with a species name ("human" here).
# NOTE(review): the effect of adaptable=False is not shown in this notebook — confirm against the Airr documentation
airr_api = Airr("human", adaptable=False)

In [5]:
# inspect the V penalty value currently set on the API's `igblast` attribute
airr_api.igblast.v_penalty.value

-1

In [None]:
# Run on a single nucleotide sequence
single_seq = "GACATTGTGATGACCCAGTCTCCTGTCTCTCTGTCCGTCACCCTCGGACAGCCGGCCTCCATGTCCTGCAAGTCCAGTCAGACTGTCCGACAGAGTGATGGCAAGACTTTCTTATATTGGTATCGACAGAAGGCAGGCCAGTCTCCACAACTGTTAATATATGAGGGTTCGAATCGATTCTCTGGAGTGTCAGATAGGATCTCTGGCAGCGGGTCGGGGACAGATTTCACACTGAGAATCAGTCGAGTGGAGGCTGAGGATGTTGGCGTTTATTTCTGCCTGCAAACTAAAGACTTCCCCCTCACTTTTGGCGGAGGGACCAGGGTGGATATCAAA"
# NOTE(review): this comment originally read "heavy chain only", but the literal above
# translates from its first base as DIVMTQSP... and ends ...FGGGTRVDIK, which looks
# like a kappa light chain — confirm which chain is intended.
# The first argument is presumably the sequence id; verify against run_single's signature.
airr_table = airr_api.run_single("test_sequnce", single_seq)

In [None]:
# show the j_germline_alignment_aa column of the result table
airr_table.table.j_germline_alignment_aa

In [None]:
# germline_alignment_aa value for the first row of the result table
airr_table.table.germline_alignment_aa.iloc[0]

In [None]:
# v_germline_alignment_aa value for the first row of the result table
airr_table.table.v_germline_alignment_aa.iloc[0]

In [None]:
# NOTE(review): this cell repeats the v_germline_alignment_aa lookup of the previous cell;
# possibly v_sequence_alignment_aa was intended — confirm, or drop the duplicate
airr_table.table.v_germline_alignment_aa.iloc[0]

In [None]:
## Build an inferred-germline (iGL) amino-acid sequence from the first row of the table
# V-segment and full-length alignments, for the germline and the observed ("mature") sequence
v_germline = airr_table.table["v_germline_alignment_aa"].iloc[0]
v_mature = airr_table.table["v_sequence_alignment_aa"].iloc[0]
full_germline = airr_table.table["germline_alignment_aa"].iloc[0]
full_mature = airr_table.table["sequence_alignment_aa"].iloc[0]

# whatever follows the V segment in each full-length alignment
cdr3_j_germline = full_germline[len(v_germline) :]
cdr3_j_mature = full_mature[len(v_mature) :]

# the two tails are paired position-by-position below, so they must be equally long
assert len(cdr3_j_mature) == len(cdr3_j_germline)

# use the germline residue at each position, except where the germline
# alignment has "X": there the mature residue is kept instead
iGL_cdr3 = "".join(
    mature if germline == "X" else germline
    for mature, germline in zip(cdr3_j_mature, cdr3_j_germline)
)

full_igl = v_germline + iGL_cdr3

In [None]:
## The underlying table is exposed through the `.table` property
dataframe = airr_table.table
# display its concrete type (the introduction describes it as a pandas dataframe)
type(dataframe)

#### Run on Fasta File from API

In [None]:
# Import from sadie.airr, matching the imports used at the top of this notebook
from sadie.airr import Airr, AirrTable, ScfvAirrTable

# initialize API with species
airr_api = Airr("human")

# run_file currently accepts only fasta and compressed fasta input
# NOTE(review): this path ends in "/" and so looks like a directory rather than a
# single fasta file — confirm the intended fixture
test_file = "tests/fixtures/fasta_inputs/heavy/"

# Get an airr table
results = airr_api.run_file(test_file)

In [None]:
# keep only the sanitized antibodies, e.g. full reading frame and no stop codons
sanitized_antibodes = results.sanitized_antibodies

In [None]:
# pandas operations work directly on the dataframe:
# take the first entry of each comma-separated v_call and plot how often each one occurs
first_v_call = sanitized_antibodes["v_call"].str.split(",").str[0]
first_v_call.value_counts().plot.bar(color=["blue", "red"])

#### Run on BioPython SeqIO Records

If you have an unusual file format, you can pass BioPython SeqIO records instead.

In [None]:
from Bio import SeqIO

# The input is in FASTQ format; parse it into a list of SeqRecord objects
fastq_path = "tests/fixtures/other_inputs/sample_4_deepcdr_contig_list_trimmed.fq"
fastq_lists = list(SeqIO.parse(fastq_path, "fastq"))

# Only run on a subset: the first 1000 records
fastq_list_sub = fastq_lists[:1000]

In [None]:
# run_multiple is given the list of SeqRecord objects directly
fastq_file_airr = airr_api.run_multiple(fastq_list_sub)

In [None]:
# quick summary: size of the full result vs. size of its sanitized subset
all_len = len(fastq_file_airr)
sanitized_len = len(fastq_file_airr.sanitized_antibodies)
print(f"{all_len} total antibodies, {sanitized_len} sanitized_antibodies")

#### Run on Linked Heavy and Light Chains

When the heavy and light chain are in the same fasta read, pass `scfv=True` to get a joined Airr Table

In [None]:
## Run a fasta file whose reads carry both heavy and light chain;
## scfv=True requests the joined table
scfv_file = (
    "tests/fixtures/fasta_inputs/paired/2545_all_no_umi_contigs_full_length.fasta"
)
scfv_output = airr_api.run_file(scfv_file, scfv=True)

In [None]:
# note that the airr fields carry a _heavy or _light suffix for the heavy and light chain
scfv_output.table.head(3)

In [None]:
# Keep only paired reads, i.e. productive for both heavy and light chain.
# NOTE(review): the "sanatized" spelling in the method name is kept as called —
# presumably it matches the library's method name; verify
joined_and_sanitized = scfv_output.get_joined_and_sanatized()
joined_and_sanitized.head(3)