# Libraries

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
import sys

from Bio.Seq import Seq
from Bio import pairwise2
from Bio.pairwise2 import format_alignment

from utils import *
from utils_sequence_backtranslation import *

%matplotlib inline

# Data import

In [2]:
# column names shared by the cells below
ab_seq_col = "Heavy_chain"  # heavy chain amino acid sequence
cdr3_col = "CDRH3"  # CDRH3 amino acid sequence
# column that receives the backtranslated CDRH3 nucleotide sequence
cdr3_nseq_col = f"{cdr3_col}_nucleotide_seq"

## Human gene reference

In [3]:
# reference tables are stored relative to the parent of the working directory
project_dir = os.path.dirname(os.getcwd())
ref_V = pd.read_csv(project_dir + "/data/backtranslation/reference/AIRR_heavy_V_genes.csv")
ref_J = pd.read_csv(project_dir + "/data/backtranslation/reference/AIRR_heavy_J_genes.csv")

# the last two amino acids (= six nucleotides) of each V gene belong to the CDR3
# according to IMGT definitions, so they are cut off from both sequence columns
for column, n_trailing in (("Amino_acid_seq", 2), ("Shortened_seq", 6)):
    ref_V[column] = ref_V[column].apply(lambda seq, n=n_trailing: seq[:-n])

## HH_S5F substitution matrix

In [4]:
# substitution table, one row per trimer (index column "Trimer")
hs5f_path = os.path.dirname(os.getcwd()) + "/data/HS5F/HS5F_Substitution_trimers.csv"
hs5f_trimers = pd.read_csv(hs5f_path, index_col="Trimer")
# show the last five rows as a quick sanity check
hs5f_trimers.tail(5)

Unnamed: 0_level_0,A,C,G,T,Position
Trimer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
TGT,0.252727,0.492952,0.254321,0.0,End
TTA,0.0,0.238534,0.499609,0.261856,End
TTC,0.190568,0.0,0.381139,0.428293,End
TTG,0.50642,0.301703,0.0,0.191878,End
TTT,0.147972,0.532769,0.319258,0.0,End


## Epitope-antibody sequence mapping

In [5]:
antibody_pairs_path = os.path.dirname(os.getcwd()) + "/data/antibody_pairs/Antibody_pairs.csv"
antibody_pairs = pd.read_csv(antibody_pairs_path)
# every epitope ID that occurs on either side of a pair
antibody_pairs_ids = set(antibody_pairs["Epitope_ID_A"]) | set(antibody_pairs["Epitope_ID_B"])

In [6]:
ab_seq_path = os.path.dirname(os.getcwd()) + "/data/antibody_pairs/Epitope_PDB_ID_ABseq_CDRH3.csv"
# load the sequence table and keep only antibodies that take part in a pair
ab_seq = pd.read_csv(ab_seq_path).loc[lambda df: df["Epitope_ID"].isin(antibody_pairs_ids)]
print("Number of unique antibodies:", len(ab_seq))
ab_seq.head(5)

Number of unique antibodies: 54


Unnamed: 0,Epitope_ID,PDB,Heavy_chain,CDRH3,Light_chain,CDRL3
0,164069,1G9N,QVQLLESGAEVKKPGSSVKVSCKASGDTFIRYSFTWVRQAPGQGLE...,AGVYEGEADEGEYRNNGFLKH,ELELTQSPATLSVSPGERATLSCRASESVSSDLAWYQQKPGQAPRL...,QQYNNWPPRYT
1,164067,2I5Y,EVQLVESGAEVKKPGSSVKVSCKASGDTFIRYSFTWVRQAPGQGLE...,AGVYEGEADEGEYDNNGFLKH,DIVMTQSPATLSVSPGERATLSCRASESVSSDLAWYQQKPGQAPRL...,QQYNNWPPRYT
2,164078,3MAC,QVQLVQSGAEVKKPGSSVKVSCKASGGTFNSYAFSWVRQAPGQGLE...,ARYFDTYNNYGFAN,DIELTQPPSVSVVPGQTARISCSGDNIPYEYASWYQQKPGQAPVLV...,ASWDSMTVDGV
3,164079,3MA9,QVQLVQSGAEVKKPGSSVKVSCKASGGTFNSYAFSWVRQAPGQGLE...,ARYFDTYNNYGFAN,DIELTQPPSVSVVPGQTARISCSGDNIPYEYASWYQQKPGQAPVLV...,ASWDSMTVDGV
4,738088,6AZZ,QVQLVQSGAEVKKPGASVKVSCKTSGYTFTDYYIHWVRQAPGQGLE...,ARDRITTAAPFDY,SYVLTQPPSVSVAPGQTARITCGGSNIGSKSVHWYQQKPGQTPMLV...,QVWDSSSDHVWV


# Nucleotide sequence backtranslation

## Define gene and CDR3 regions

In [7]:
# row-wise annotation: first find_cdr3, then get_V_J_gene_fragment on its result
ab_seq_regions = (
    ab_seq
    .apply(find_cdr3, args=(ab_seq_col, cdr3_col), axis=1)
    .apply(get_V_J_gene_fragment, args=(ab_seq_col,), axis=1)
)

# one two-column frame per gene segment, used as input for the alignment ranking
seq_V, seq_J = (
    ab_seq_regions[["Epitope_ID", segment_col]]
    for segment_col in ("Antibody_seq_V", "Antibody_seq_J")
)

## Find best reference gene alignments

In [8]:
# match the V fragments in column "Antibody_seq_V" against the V gene reference table;
# as the output below shows, the call prints each Epitope_ID followed by alignments with
# their score, fraction of matching amino acids and number of mutated positions
seq_V_matched = add_matched_gene_information(seq_V, ref_V, "Antibody_seq_V")

164069
QVQLLESGAEVKKPGSSVKVSCKASGDTFIRYSFTWVRQAPGQGLEWMGRIITILDVAHYAPHLQGRVTITADKSTSTVYLELRNLRSDDTAVYFC
||||..||||||||||||||||||||.||..|...|||||||||||||||||.||..|.||...||||||||||||||.|.||..|||.|||||.|
QVQLVQSGAEVKKPGSSVKVSCKASGGTFSSYTISWVRQAPGQGLEWMGRIIPILGIANYAQKFQGRVTITADKSTSTAYMELSSLRSEDTAVYYC
  Score=75

Fraction of matching amino acids: 78.12%
Number of mutated positions: 21
---------------------
QVQLLESGAEVKKPGSSVKVSCKASGDTFIRYSFTWVRQAPGQGLEWMGRIITILDVAHYAPHLQGRVTITADKSTSTVYLELRNLRSDDTAVYFC
||||..||||||||||||||||||||.||..|...|||||||||||||||||.||..|.||...||||||||||||||.|.||..|||.|||||.|
QVQLVQSGAEVKKPGSSVKVSCKASGGTFSSYAISWVRQAPGQGLEWMGRIIPILGIANYAQKFQGRVTITADKSTSTAYMELSSLRSEDTAVYYC
  Score=75

Fraction of matching amino acids: 78.12%
Number of mutated positions: 21
---------------------
QVQLLESGAEVKKPGSSVKVSCKASGDTFIRYSFTWVRQAPGQGLEWMGRIITILDVAHYAPHLQGRVTITADKSTSTVYLELRNLRSDDTAVYFC
||||..||||||||||||||||||||.||..|...|||||||||||||||||.||..|.||...||||||||||||||.|.||..|||.|||||.|
QVQ

QVQLVQSGGEVKKPGASVKVSCKASGYTFTSYGISWVRQAPGQGLEWMGWISAYNGNRKYAQKVQGRVSMTIDTHTSTANMELRSLRSDDTAVYYC
||||||||.||||||||||||||||||||||||||||||||||||||||||||||||..||||.||||.||.||.||||.||||||||||||||||
QVQLVQSGAEVKKPGASVKVSCKASGYTFTSYGISWVRQAPGQGLEWMGWISAYNGNTNYAQKLQGRVTMTTDTSTSTAYMELRSLRSDDTAVYYC
  Score=88

Fraction of matching amino acids: 91.67%
Number of mutated positions: 8
---------------------
QVQLVQSGGEVKKPGASVKVSCKASGYTFTSYGISWVRQAPGQGLEWMGWISAYNGNRKYAQKVQGRVSMTIDTHTSTANMELRSLRSDDTAVYYC
||||||||.||||||||||||||||||||||||||||||||||||||||||||||||..||||.||||.||.||.||||.||||||||||||||||
QVQLVQSGAEVKKPGASVKVSCKASGYTFTSYGISWVRQAPGQGLEWMGWISAYNGNTNYAQKLQGRVTMTTDTSTSTAYMELRSLRSDDTAVYYC
  Score=88

Fraction of matching amino acids: 91.67%
Number of mutated positions: 8
---------------------
738087
QVQLVQSGAEVKKPGASVKVSCRASGYIFTSYGFSWVRQAPGQGLEWMGWISAYNGNTDYSQKLQGRVTMTTDTSTNTVYMELRTLQSDDTAVYYC
||||||||||||||||||||||.||||.|||||.||||||||||||||||||||||||.|.|||||||||||||||.|.|||||.|.|||||||||
QVQLV

EVQLVESGGGLIQPGGSLRLSCAASGITVSSNYMSWVRQAPGKGLEWVSVIYSGGSTDYADSVKGRFTISRDKSKNTLYLQMNSLRAEDTAVYYC
||||||||||||||||||||||||||.||||||||||||||||||||||||||||||.||||||||||||||.||||||||||||||||||||||
EVQLVESGGGLIQPGGSLRLSCAASGFTVSSNYMSWVRQAPGKGLEWVSVIYSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYC
  Score=92

Fraction of matching amino acids: 96.84%
Number of mutated positions: 3
---------------------
1311248
EVQLVESGGGLIQPGGSLRLSCAASGIIVSSNYMTWVRQAPGKGLEWVSVIYSGGSTFYADSVKGRFTISRDNSKNTLYLQMSSLRAEDTAVYYC
||||||||||||||||||||||||||..||||||.||||||||||||||||||||||.||||||||||||||||||||||||.||||||||||||
EVQLVESGGGLIQPGGSLRLSCAASGFTVSSNYMSWVRQAPGKGLEWVSVIYSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYC
  Score=90

Fraction of matching amino acids: 94.74%
Number of mutated positions: 5
---------------------
1311246
EVQLVESGGGLIQPGGSLRLSCAASEFIVSRNYMSWVRQAPGKGLEWVSVIYSGGSTYYADSVKGRFTISRDNSKNTLNLQMNSLRAEDTAVYYC
|||||||||||||||||||||||||.|.||.|||||||||||||||||||||||||||||||||||||||||||||||.||||||||||||||||
EVQL

EVQLVESGGGLIQPGGSLRLSCAASGFIVSSNYMSWVRQAPGKGLEWVSIIYSGGSTYYADSVKGRFTISRDNSNNTLYLQMNSLRAEDTAVYYC
|||||||||||||||||||||||||||.|||||||||||||||||||||.||||||||||||||||||||||||.||||||||||||||||||||
EVQLVESGGGLIQPGGSLRLSCAASGFTVSSNYMSWVRQAPGKGLEWVSVIYSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYC
  Score=92

Fraction of matching amino acids: 96.84%
Number of mutated positions: 3
---------------------
2134977
EVQLVESGGGLIQPGGSLRLSCAASGFTVSSNYMSWVRQAPGKGLEWVSFIYSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYC
|||||||||||||||||||||||||||||||||||||||||||||||||.|||||||||||||||||||||||||||||||||||||||||||||
EVQLVESGGGLIQPGGSLRLSCAASGFTVSSNYMSWVRQAPGKGLEWVSVIYSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYC
  Score=94

Fraction of matching amino acids: 98.95%
Number of mutated positions: 1
---------------------
2134997
EVQLVESGGGLIQPGGSLRLSCAASGFTVSSNYMSWVRQAPGKGLEWVSVIYSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
EVQL

-VQLVESGGGLIQPGGSLRLSCAASGITVSSNYMTWVRQPPGKGLEWVSVIYSGGSTFYADPVKGRFTISRDNSRNTLYLQMNSLRAEDTAVYYC
 |||||||||||||||||||||||||.|||||||.||||.|||||||||||||||||.|||.||||||||||||.||||||||||||||||||||
EVQLVESGGGLIQPGGSLRLSCAASGFTVSSNYMSWVRQAPGKGLEWVSVIYSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYC
  Score=88

Fraction of matching amino acids: 92.63%
Number of mutated positions: 7
---------------------
2218456
EVQLVESGGGLIQPGGSLRLSCVVSGFTVSSNYMSWIRQAPGKGLEWVSVIYSGGSTFYADSVKGRFTISRDSSQNTLYLQMNSLRAEDTAVYYC
||||||||||||||||||||||..||||||||||||.||||||||||||||||||||.||||||||||||||.|.||||||||||||||||||||
EVQLVESGGGLIQPGGSLRLSCAASGFTVSSNYMSWVRQAPGKGLEWVSVIYSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYC
  Score=89

Fraction of matching amino acids: 93.68%
Number of mutated positions: 6
---------------------


In [9]:
# match the J fragments in column "Antibody_seq_J" against the J gene reference table;
# as the output below shows, the call prints each Epitope_ID followed by alignments with
# their score, fraction of matching amino acids and number of mutated positions
seq_J_matched = add_matched_gene_information(seq_J, ref_J, "Antibody_seq_J")

164069
WGQGTLVTVTS
|||||||||.|
WGQGTLVTVSS
  Score=10

Fraction of matching amino acids: 90.91%
Number of mutated positions: 1
---------------------
WGQGTLVTVTS
|||||||||.|
WGQGTLVTVSS
  Score=10

Fraction of matching amino acids: 90.91%
Number of mutated positions: 1
---------------------
WGQGTLVTVTS
|||||||||.|
WGQGTLVTVSS
  Score=10

Fraction of matching amino acids: 90.91%
Number of mutated positions: 1
---------------------
164067
WGQGTLVTVSS
|||||||||||
WGQGTLVTVSS
  Score=11

Fraction of matching amino acids: 100.00%
Number of mutated positions: 0
---------------------
WGQGTLVTVSS
|||||||||||
WGQGTLVTVSS
  Score=11

Fraction of matching amino acids: 100.00%
Number of mutated positions: 0
---------------------
WGQGTLVTVSS
|||||||||||
WGQGTLVTVSS
  Score=11

Fraction of matching amino acids: 100.00%
Number of mutated positions: 0
---------------------
164078
WGQGTLVTVSS
|||||||||||
WGQGTLVTVSS
  Score=11

Fraction of matching amino acids: 100.00%
Number of mutated positions: 0
---


Fraction of matching amino acids: 100.00%
Number of mutated positions: 0
---------------------
WGQGTLVTVSS
|||||||||||
WGQGTLVTVSS
  Score=11

Fraction of matching amino acids: 100.00%
Number of mutated positions: 0
---------------------
WGQGTLVTVSS
|||||||||||
WGQGTLVTVSS
  Score=11

Fraction of matching amino acids: 100.00%
Number of mutated positions: 0
---------------------
1397524
WGQGTTVTVSS
|||||||||||
WGQGTTVTVSS
  Score=11

Fraction of matching amino acids: 100.00%
Number of mutated positions: 0
---------------------
1397527
WGQGTLVTVSS
|||||||||||
WGQGTLVTVSS
  Score=11

Fraction of matching amino acids: 100.00%
Number of mutated positions: 0
---------------------
WGQGTLVTVSS
|||||||||||
WGQGTLVTVSS
  Score=11

Fraction of matching amino acids: 100.00%
Number of mutated positions: 0
---------------------
WGQGTLVTVSS
|||||||||||
WGQGTLVTVSS
  Score=11

Fraction of matching amino acids: 100.00%
Number of mutated positions: 0
---------------------
1597632
WGQGTMVTVSS
||||||||||

## Compare reference to mismatched antibody sequence at a nucleotide level

In [10]:
# everything printed during the reconstruction goes to a log file instead of the notebook
notebook_stdout = sys.stdout
with open(os.path.dirname(os.getcwd()) + "/data/backtranslation/Backtranslation_heavy_V_log.txt", "wt") as file:
    sys.stdout = file
    try:
        seq_V_reconstructed = seq_V_matched.apply(
            reconstruct_nucleotide_sequence,
            args=(ref_V, "Antibody_nucleotide_seq_V", amino_acid_to_triplets, hs5f_trimers),
            axis=1,
        )
    finally:
        # always hand stdout back to the notebook, even if the reconstruction raises;
        # otherwise all later cells would keep printing into the (closed) log file
        sys.stdout = notebook_stdout

print("Finished.")

Finished.


In [11]:
# everything printed during the reconstruction goes to a log file instead of the notebook
notebook_stdout = sys.stdout
with open(os.path.dirname(os.getcwd()) + "/data/backtranslation/Backtranslation_heavy_J_log.txt", "wt") as file:
    sys.stdout = file
    try:
        seq_J_reconstructed = seq_J_matched.apply(
            reconstruct_nucleotide_sequence,
            args=(ref_J, "Antibody_nucleotide_seq_J", amino_acid_to_triplets, hs5f_trimers),
            axis=1,
        )
    finally:
        # always hand stdout back to the notebook, even if the reconstruction raises;
        # otherwise all later cells would keep printing into the (closed) log file
        sys.stdout = notebook_stdout

print("Finished.")

Finished.


## Choose highest scoring reconstructed sequences for V and J gene

In [12]:
# per antibody, retain the reconstruction with the highest likelihood
# (ties are resolved by keeping the first one)
seq_V_filtered = keep_most_likely_sequence_reconstruction(seq_V_reconstructed)
seq_J_filtered = keep_most_likely_sequence_reconstruction(seq_J_reconstructed)

# both frames should cover exactly the same antibodies
epitope_ids_V = set(seq_V_filtered["Epitope_ID"])
epitope_ids_J = set(seq_J_filtered["Epitope_ID"])

print("Number of antibodies with reconstructed V gene:", len(seq_V_filtered))
print("Number of antibodies with reconstructed J gene:", len(seq_J_filtered))
print("Identical antibody sets in both dataframes?", epitope_ids_V == epitope_ids_J)

Number of antibodies with reconstructed V gene: 54
Number of antibodies with reconstructed J gene: 54
Identical antibody sets in both dataframes? True


### Check that the reconstructed nucleotide sequences translate to the desired amino acid sequences

In [13]:
# (reconstructed frame, nucleotide column, amino acid column) for the V and the J gene
translation_checks = (
    (seq_V_reconstructed, "Antibody_nucleotide_seq_V", "Antibody_seq_V"),
    (seq_J_reconstructed, "Antibody_nucleotide_seq_J", "Antibody_seq_J"),
)

for reconstructed, nucleotide_col, amino_acid_col in translation_checks:
    for _, row in reconstructed.iterrows():
        # translate the reconstructed nucleotides and align them to the antibody fragment
        translated = translate_to_amino_acid(row[nucleotide_col])
        alignment = pairwise2.align.localxs(translated, row[amino_acid_col], -2, -2,
            one_alignment_only=True)
        print(format_alignment(*alignment[0], full_sequences=True))

VQLVESGGGLIQPGGSLRLSCAASGITVSSNYMTWVRQPPGKGLEWVSVIYSGGSTFYADPVKGRFTISRDNSRNTLYLQMNSLRAEDTAVYYC
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
VQLVESGGGLIQPGGSLRLSCAASGITVSSNYMTWVRQPPGKGLEWVSVIYSGGSTFYADPVKGRFTISRDNSRNTLYLQMNSLRAEDTAVYYC
  Score=94

EVQLVESGGGLIQPGGSLRLSCVVSGFTVSSNYMSWIRQAPGKGLEWVSVIYSGGSTFYADSVKGRFTISRDSSQNTLYLQMNSLRAEDTAVYYC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
EVQLVESGGGLIQPGGSLRLSCVVSGFTVSSNYMSWIRQAPGKGLEWVSVIYSGGSTFYADSVKGRFTISRDSSQNTLYLQMNSLRAEDTAVYYC
  Score=95

EVQLVESGGGLVQPGGSLRLSCAASGITVSSNYMTWVRQAPGKGLEWVSVIYSGGSTFYADSVRGRFTISRDNSKNTLYLQMNSLRAEDTAVYYC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
EVQLVESGGGLVQPGGSLRLSCAASGITVSSNYMTWVRQAPGKGLEWVSVIYSGGSTFYADSVRGRFTISRDNSKNTLYLQMNSLRAEDTAVYYC
  Score=95

EVQLVQSGGDLVQPGGSLRLSCAVSGFTVSRNYMTWVRQAPGRGLEWVSLIYPGGSAFYADSVKGRFTISRDNSKNTLYLQMNSLRVEDTAVYYC
|||||||

EVQLVESGGGLIQPGGSLRLSCAASEFIVSRNYMSWVRQAPGKGLEWVSVIYSGGSTYYADSVKGRFTISRDNSKNTLNLQMNSLRAEDTAVYYC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
EVQLVESGGGLIQPGGSLRLSCAASEFIVSRNYMSWVRQAPGKGLEWVSVIYSGGSTYYADSVKGRFTISRDNSKNTLNLQMNSLRAEDTAVYYC
  Score=95

QVQLVETGGGLIQPGGSLRLSCAASGFTVSSNYMSWVRQAPGKGLEWVSVIYSGGSTFYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
QVQLVETGGGLIQPGGSLRLSCAASGFTVSSNYMSWVRQAPGKGLEWVSVIYSGGSTFYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYC
  Score=95

QVQLVESGGGLIQPGGSLRLSCAASGFTVSSNYMSWVRQAPGKGLEWVSVIYSGGSTFYADSVKGRFTISRDNSKSTLYLQMNSLRVEDTAVYYC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
QVQLVESGGGLIQPGGSLRLSCAASGFTVSSNYMSWVRQAPGKGLEWVSVIYSGGSTFYADSVKGRFTISRDNSKSTLYLQMNSLRVEDTAVYYC
  Score=95

EVQLVESGGGLIQPGGSLRLSCAASGLTVSSNYMSWVRQAPGKGLEWVSVIYSGGSTFYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYC
||||

In [14]:
# combine the V and J reconstructions per antibody; shared column names get a _V / _J suffix
seq_V_J = pd.merge(seq_V_filtered, seq_J_filtered, how="inner", on="Epitope_ID", suffixes=("_V", "_J"))
print("Number of antibodies with reconstructed V and J gene:", len(seq_V_J))

Number of antibodies with reconstructed V and J gene: 54


## Reconstruct CDR3 region

Nucleotide backtranslation with: https://www.ebi.ac.uk/jdispatcher/st/emboss_backtranseq

In [15]:
# Disabled on purpose: writes one FASTA record (">Epitope_ID" + CDRH3 sequence) per antibody.
# NOTE(review): presumably run once to create the input for the external backtranslation
# tool linked above - confirm before deleting.
# with open(os.path.dirname(os.getcwd()) + "/data/backtranslation/Antibody_pairs_CDRH3.txt", "w") as f:
#     for i, row in ab_seq.iterrows():
#         f.write(">" + str(row["Epitope_ID"]) + "\n")
#         f.write(row[cdr3_col] + "\n")

In [16]:
file = os.path.dirname(os.getcwd()) + "/data/backtranslation/Antibody_pairs_CDRH3_backtranslated.txt"

# read the backtranslated CDR3 sequences and attach the amino acid CDR3 of each antibody
cdr3_backtranslated = read_cdr3_backtranslation_file(file, cdr3_nseq_col).merge(
    ab_seq[["Epitope_ID", cdr3_col]], on="Epitope_ID", how="inner")

# each residue has to correspond to exactly three nucleotides
for nucleotides, residues in zip(cdr3_backtranslated[cdr3_nseq_col], cdr3_backtranslated[cdr3_col]):
    assert len(nucleotides) == len(residues)*3

In [17]:
# add the backtranslated CDR3 to the V/J reconstructions
seq_V_J_CDR3 = pd.merge(seq_V_J, cdr3_backtranslated, how="inner", on="Epitope_ID")
print("Number of antibodies with all reconstructed segments:", len(seq_V_J_CDR3))
seq_V_J_CDR3.head(5)

Number of antibodies with all reconstructed segments: 54


Unnamed: 0,Epitope_ID,Antibody_seq_V,Gene_name_V,Gene_seq_V,Fraction_match_V,Mutation_num_V,Mismatch_list_V,Antibody_nucleotide_seq_V,Likelihood_V,Mutation_num_nucleotide_V,...,Gene_name_J,Gene_seq_J,Fraction_match_J,Mutation_num_J,Mismatch_list_J,Antibody_nucleotide_seq_J,Likelihood_J,Mutation_num_nucleotide_J,CDRH3_nucleotide_seq,CDRH3
0,2218457,VQLVESGGGLIQPGGSLRLSCAASGITVSSNYMTWVRQPPGKGLEW...,IGHV3-53*01,EVQLVESGGGLIQPGGSLRLSCAASGFTVSSNYMSWVRQAPGKGLE...,92.631579,7.0,"{0: ('E', '-'), 26: ('F', 'I'), 34: ('S', 'T')...",GTGCAGCTGGTGGAGTCTGGAGGAGGCTTGATCCAGCCTGGGGGGT...,0.318278,6,...,IGHJ6*02,WGQGTTVTVSS,100.0,0.0,{},TGGGGCCAAGGGACCACGGTCACCGTCTCCTCA,1.0,0,GCCAGGGACCTGCAGGTGTACGGCATGGACGTG,ARDLQVYGMDV
1,2218456,EVQLVESGGGLIQPGGSLRLSCVVSGFTVSSNYMSWIRQAPGKGLE...,IGHV3-53*01,EVQLVESGGGLIQPGGSLRLSCAASGFTVSSNYMSWVRQAPGKGLE...,93.684211,6.0,"{22: ('A', 'V'), 23: ('A', 'V'), 36: ('V', 'I'...",GAGGTGCAGCTGGTGGAGTCTGGAGGAGGCTTGATCCAGCCTGGGG...,0.378301,6,...,IGHJ1*01,WGQGTLVTVSS,100.0,0.0,{},TGGGGCCAGGGCACCCTGGTCACCGTCTCCTCA,1.0,0,GCCAGGCACCCCTACGGCGACCACGCC,ARHPYGDHA
2,2217933,EVQLVESGGGLVQPGGSLRLSCAASGITVSSNYMTWVRQAPGKGLE...,IGHV3-66*01,EVQLVESGGGLVQPGGSLRLSCAASGFTVSSNYMSWVRQAPGKGLE...,95.789474,4.0,"{26: ('F', 'I'), 34: ('S', 'T'), 57: ('Y', 'F'...",GAGGTGCAGCTGGTGGAGTCTGGGGGAGGCTTGGTCCAGCCTGGGG...,0.270716,4,...,IGHJ3*02,WGQGTMVTVSS,100.0,0.0,{},TGGGGCCAAGGGACAATGGTCACCGTCTCTTCA,1.0,0,GCCAGGGACCTGGAGATGGCCGGCGCCTTCGACATC,ARDLEMAGAFDI
3,2186582,EVQLVQSGGDLVQPGGSLRLSCAVSGFTVSRNYMTWVRQAPGRGLE...,IGHV3-66*01,EVQLVESGGGLVQPGGSLRLSCAASGFTVSSNYMSWVRQAPGKGLE...,88.421053,11.0,"{5: ('E', 'Q'), 9: ('G', 'D'), 23: ('A', 'V'),...",GAGGTGCAGCTGGTGCAGTCTGGGGGAGACTTGGTCCAGCCTGGGG...,0.428361,12,...,IGHJ1*01,WGQGTLVTVSS,100.0,0.0,{},TGGGGCCAGGGCACCCTGGTCACCGTCTCCTCA,1.0,0,GCCAGGGACCCCGTGAGCACCGGCCACTACCACGACAGCGACTAC,ARDPVSTGHYHDSDY
4,2144729,EVQLVESGGGLVQPGGSLRLSCAASGFTVRSNYMSWVRQAPGKGLE...,IGHV3-66*01,EVQLVESGGGLVQPGGSLRLSCAASGFTVSSNYMSWVRQAPGKGLE...,96.842105,3.0,"{29: ('S', 'R'), 49: ('V', 'L'), 67: ('T', 'I')}",GAGGTGCAGCTGGTGGAGTCTGGGGGAGGCTTGGTCCAGCCTGGGG...,0.380967,3,...,IGHJ6*02,WGQGTTVTVSS,100.0,0.0,{},TGGGGCCAAGGGACCACGGTCACCGTCTCCTCA,1.0,0,GCCAGGGACCTGGCCGTGTACGGCATGGACGTG,ARDLAVYGMDV


## Create full-length nucleotide antibody sequences

In [18]:
# row-wise call of create_final_nucleotide_sequences with the CDR3 nucleotide column and the V reference
ab_seq_reconstructed = seq_V_J_CDR3.apply(
    create_final_nucleotide_sequences,
    axis=1,
    args=(cdr3_nseq_col, ref_V),
)

In [19]:
# translate every reconstructed full-length sequence and align it to the original heavy chain
for _, antibody in ab_seq.iterrows():
    epitope_id = antibody["Epitope_ID"]
    print(epitope_id)

    # first reconstructed sequence stored for this epitope
    is_current = ab_seq_reconstructed["Epitope_ID"] == epitope_id
    reconstructed_full_seq = ab_seq_reconstructed.loc[is_current, "Full_nucleotide_seq"].iloc[0]

    translated = translate_to_amino_acid(reconstructed_full_seq)
    alignment = pairwise2.align.localxs(translated, antibody[ab_seq_col], -2, -2,
        one_alignment_only=True)
    print(format_alignment(*alignment[0], full_sequences=True))

164069
QVQLLESGAEVKKPGSSVKVSCKASGDTFIRYSFTWVRQAPGQGLEWMGRIITILDVAHYAPHLQGRVTITADKSTSTVYLELRNLRSDDTAVYFCAGVYEGEADEGEYRNNGFLKHWGQGTLVTVTS
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
QVQLLESGAEVKKPGSSVKVSCKASGDTFIRYSFTWVRQAPGQGLEWMGRIITILDVAHYAPHLQGRVTITADKSTSTVYLELRNLRSDDTAVYFCAGVYEGEADEGEYRNNGFLKHWGQGTLVTVTS
  Score=128

164067
EVQLVESGAEVKKPGSSVKVSCKASGDTFIRYSFTWVRQAPGQGLEWMGRIITILDVAHYAPHLQGRVTITADKSTSTVYLELRNLRSDDTAVYFCAGVYEGEADEGEYDNNGFLKHWGQGTLVTVSS
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
EVQLVESGAEVKKPGSSVKVSCKASGDTFIRYSFTWVRQAPGQGLEWMGRIITILDVAHYAPHLQGRVTITADKSTSTVYLELRNLRSDDTAVYFCAGVYEGEADEGEYDNNGFLKHWGQGTLVTVSS
  Score=128

164078
QVQLVQSGAEVKKPGSSVKVSCKASGGTFNSYAFSWVRQAPGQGLEWMGSIIPLFGFVVYAQKFQGRVTITADESTSTAYMELSSLRSEDTAVYYCARYFDTYNNYGFANWGQGTLVTVSS
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||

EVQLVESGGGLIQPGGSLRLSCAASEFIVSRNYMSWVRQAPGKGLEWVSVIYSGGSTYYADSVKGRFTISRDNSKNTLNLQMNSLRAEDTAVYYCARDYGDYYFDYWGQGTLVTVSS
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
EVQLVESGGGLIQPGGSLRLSCAASEFIVSRNYMSWVRQAPGKGLEWVSVIYSGGSTYYADSVKGRFTISRDNSKNTLNLQMNSLRAEDTAVYYCARDYGDYYFDYWGQGTLVTVSS
  Score=117

1314088
QVQLVESGGGLIQPGGSLRLSCAASGFIVSSNYMSWVRQAPGKGLEWVSVIYSGGSTFYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCARDYGDYYFDYWGQGTLVTVSS
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
QVQLVESGGGLIQPGGSLRLSCAASGFIVSSNYMSWVRQAPGKGLEWVSVIYSGGSTFYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCARDYGDYYFDYWGQGTLVTVSS
  Score=117

1334440
EVQLVESGGGLVQPGGSLRLSCAASGITVSSNYMNWVRQAPGKGLEWVSLIYSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYHCARDLVVYGMDVWGQGTTVTVSS
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
EVQLVESGGGLVQP

VQLVESGGGLIQPGGSLRLSCAASGITVSSNYMTWVRQPPGKGLEWVSVIYSGGSTFYADPVKGRFTISRDNSRNTLYLQMNSLRAEDTAVYYCARDLQVYGMDVWGQGTTVTVSS
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
VQLVESGGGLIQPGGSLRLSCAASGITVSSNYMTWVRQPPGKGLEWVSVIYSGGSTFYADPVKGRFTISRDNSRNTLYLQMNSLRAEDTAVYYCARDLQVYGMDVWGQGTTVTVSS
  Score=116

2218456
EVQLVESGGGLIQPGGSLRLSCVVSGFTVSSNYMSWIRQAPGKGLEWVSVIYSGGSTFYADSVKGRFTISRDSSQNTLYLQMNSLRAEDTAVYYCARHPYGDHAWGQGTLVTVSS
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
EVQLVESGGGLIQPGGSLRLSCVVSGFTVSSNYMSWIRQAPGKGLEWVSVIYSGGSTFYADSVKGRFTISRDSSQNTLYLQMNSLRAEDTAVYYCARHPYGDHAWGQGTLVTVSS
  Score=115



In [20]:
# store the reconstructed sequences without the dataframe index
backtranslated_csv = os.path.dirname(os.getcwd()) + "/data/antibody_pairs/Antibody_pairs_backtranslated_heavy.csv"
ab_seq_reconstructed.to_csv(backtranslated_csv, index=False)

In [21]:
fasta_path = os.path.dirname(os.getcwd()) + "/data/FASTA/Antibody_pairs_nucleotide_heavy_chain.fasta"

# one FASTA record per antibody: header line with the epitope ID, then the nucleotide sequence
with open(fasta_path, "w") as f:
    fasta_records = zip(ab_seq_reconstructed["Epitope_ID"], ab_seq_reconstructed["Full_nucleotide_seq"])
    for epitope_id, nucleotide_seq in fasta_records:
        f.write(">" + str(epitope_id) + "\n")
        f.write(nucleotide_seq + "\n")