In [1]:
from Bio import SeqIO

In [2]:
# Read every nucleotide record from the FASTA file into a list.
nt_records = list(SeqIO.parse("../data/10X-VDJ-summary.all_nt.fasta", "fasta"))

In [3]:
# Pick one record (index 1) as a worked example for the cells below.
nt_record = nt_records[1]
nt_record

SeqRecord(seq=Seq('ATGAGCTGCAGGCTTCTCCTCTATGTTTCCCTATGTCTTGTGGAAACAGCACTC...TCC', SingleLetterAlphabet()), id='30_TRB_CASSPGQGYAEQFF_TGTGCCAGCAGCCCGGGACAGGGTTATGCTGAGCAGTTCTTC', name='30_TRB_CASSPGQGYAEQFF_TGTGCCAGCAGCCCGGGACAGGGTTATGCTGAGCAGTTCTTC', description=' 30_TRB_CASSPGQGYAEQFF_TGTGCCAGCAGCCCGGGACAGGGTTATGCTGAGCAGTTCTTC', dbxrefs=[])

In [4]:
# The record id is four underscore-separated fields; unpacking fails with
# ValueError if the id does not split into exactly four parts.
sid, gene, cdr3_aa, cdr3_nt = nt_record.id.split("_")

In [5]:
# Display the CDR3 nucleotide field parsed from the record id.
cdr3_nt

'TGTGCCAGCAGCCCGGGACAGGGTTATGCTGAGCAGTTCTTC'

In [6]:
# Nucleotide sequence of the example record.
one_seq = nt_record.seq
one_seq

Seq('ATGAGCTGCAGGCTTCTCCTCTATGTTTCCCTATGTCTTGTGGAAACAGCACTC...TCC', SingleLetterAlphabet())

In [7]:
# The CDR3 nucleotide string from the id should occur in the full sequence.
cdr3_nt in str(one_seq)

True

In [8]:
# Translate the example nucleotide sequence to protein using the default
# arguments of Seq.translate().
one_aa_seq = one_seq.translate()
one_aa_seq

Seq('MSCRLLLYVSLCLVETALMNTKITQSPRYLILGRANKSLECEQHLGHNAMYWYK...KNS', ExtendedIUPACProtein())

In [9]:
# The CDR3 amino-acid string from the id should occur in the translation.
cdr3_aa in str(one_aa_seq)

True

In [10]:
# Load the mouse TRAC reference; only the first record in the file is used.
alpha_constant = next(SeqIO.parse("../sequences/mouse.TRAC.dna.fasta", "fasta"))
alpha_constant

SeqRecord(seq=Seq('ACATCCAGAACCCAGAACCTGCTGTGTACCAGTTAAAAGATCCTCGGTCTCAGG...TCC', SingleLetterAlphabet()), id='Mouse_TRAC', name='Mouse_TRAC', description=' Mouse_TRAC', dbxrefs=[])

In [11]:
# Load the mouse TRBC reference; only the first record in the file is used.
beta_constant = next(SeqIO.parse("../sequences/mouse.TRBC.dna.fasta", "fasta"))
beta_constant

SeqRecord(seq=Seq('AGGATCTGAGAAATGTGACTCCACCCAAGGTCTCCTTGTTTGAGCCATCAAAAG...TCC', SingleLetterAlphabet()), id='Mouse_TRBC', name='Mouse_TRBC', description=' Mouse_TRBC', dbxrefs=[])

In [12]:
# Constant-region reference records, looked up by the gene field ('TRA' or
# 'TRB') that is parsed out of each record id.
constant_regions = {
    'TRA': alpha_constant,
    'TRB': beta_constant
}

def check_and_translate_tcr(nt_record):
    """Validate one TCR nucleotide record and return its protein translation.

    The record id must consist of four underscore-separated fields,
    ``<sid>_<gene>_<cdr3_aa>_<cdr3_nt>``, where ``gene`` is a key of the
    module-level ``constant_regions`` mapping.

    Checks performed:
      * the nucleotide sequence contains ``cdr3_nt``, starts with ``ATG``
        and ends with the constant-region sequence registered for ``gene``;
      * the translated sequence contains ``cdr3_aa``, starts with ``M`` and
        ends with ``S``.

    Args:
        nt_record: a Biopython ``SeqRecord`` holding the nucleotide sequence.

    Returns:
        The translated ``SeqRecord``, carrying the id, name and description
        of ``nt_record``.

    Raises:
        ValueError: if the id does not split into exactly four fields.
        KeyError: if ``gene`` is not a key of ``constant_regions``.
        AssertionError: if any of the sequence checks fails.
    """
    record_id = nt_record.id
    # The first field is not used below, but unpacking all four enforces the
    # expected id layout.
    _sid, gene, cdr3_aa, cdr3_nt = record_id.split("_")

    constant_nt_seq = str(constant_regions[gene].seq)

    nt_seq = str(nt_record.seq)
    assert cdr3_nt in nt_seq, \
        "%s: CDR3 nucleotide sequence not found in record" % record_id
    assert nt_seq.startswith('ATG'), \
        "%s: nucleotide sequence does not start with ATG" % record_id
    assert nt_seq.endswith(constant_nt_seq), \
        "%s: nucleotide sequence does not end with the %s constant region" % (record_id, gene)

    # SeqRecord.translate() does not carry over id/name/description unless
    # asked to; without these flags every translated record would be written
    # with the same placeholder header and lose its identifying fields.
    aa_record = nt_record.translate(id=True, name=True, description=True)
    aa_seq = str(aa_record.seq)
    assert cdr3_aa in aa_seq, \
        "%s: CDR3 amino-acid sequence not found in translation" % record_id
    assert aa_seq.startswith('M'), \
        "%s: translation does not start with M" % record_id
    assert aa_seq.endswith('S'), \
        "%s: translation does not end with S" % record_id

    return aa_record

# Validate and translate every nucleotide record first; a failed check raises
# here, before the output file is written. SeqIO.write returns the number of
# records written, which is the value this cell displays.
aa_records = [check_and_translate_tcr(rec) for rec in nt_records]
SeqIO.write(aa_records, "../data/10X-VDJ-summary.all_aa.fasta", "fasta")

19