# Libraries

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
import sys

from Bio.Seq import Seq
from Bio import pairwise2
from Bio.pairwise2 import format_alignment

from utils import *
from utils_sequence_backtranslation import *

%matplotlib inline

# Data import

In [2]:
# Column names of the light-chain data that the rest of the notebook refers to.
ab_seq_col = "Light_chain"  # column with the light-chain amino acid sequence
cdr3_col = "CDRL3"  # column with the light-chain CDR3 amino acid sequence
# name of the column holding the CDR3 nucleotide sequence
cdr3_nseq_col = f"{cdr3_col}_nucleotide_seq"

## Human gene reference

In [3]:
# The reference gene tables are read relative to the parent of the working directory.
ref_V_path = os.path.dirname(os.getcwd()) + "/data/backtranslation/reference/AIRR_light_V_genes.csv"
ref_J_path = os.path.dirname(os.getcwd()) + "/data/backtranslation/reference/AIRR_light_J_genes.csv"
ref_V = pd.read_csv(ref_V_path)
ref_J = pd.read_csv(ref_J_path)

# According to IMGT definitions the last two amino acids of the V gene belong to
# the CDR3: trim two residues from the amino acid sequence and the matching six
# nucleotides from the shortened sequence.
ref_V["Amino_acid_seq"] = ref_V["Amino_acid_seq"].map(lambda aa_seq: aa_seq[:-2])
ref_V["Shortened_seq"] = ref_V["Shortened_seq"].map(lambda nt_seq: nt_seq[:-6])

## HH_S5F substitution matrix

In [4]:
# Load the HS5F trimer substitution table, indexed by the "Trimer" column.
hs5f_path = os.path.dirname(os.getcwd()) + "/data/HS5F/HS5F_Substitution_trimers.csv"
hs5f_trimers = pd.read_csv(hs5f_path, index_col="Trimer")
# show the last five rows as a quick sanity check
hs5f_trimers.tail(5)

Unnamed: 0_level_0,A,C,G,T,Position
Trimer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
TGT,0.252727,0.492952,0.254321,0.0,End
TTA,0.0,0.238534,0.499609,0.261856,End
TTC,0.190568,0.0,0.381139,0.428293,End
TTG,0.50642,0.301703,0.0,0.191878,End
TTT,0.147972,0.532769,0.319258,0.0,End


## Epitope-antibody sequence mapping

In [5]:
antibody_pairs_path = os.path.dirname(os.getcwd()) + "/data/antibody_pairs/Antibody_pairs.csv"
antibody_pairs = pd.read_csv(antibody_pairs_path)
# all epitope IDs that occur on either side (A or B) of an antibody pair
antibody_pairs_ids = set(antibody_pairs["Epitope_ID_A"]) | set(antibody_pairs["Epitope_ID_B"])

In [6]:
ab_seq_path = os.path.dirname(os.getcwd()) + "/data/antibody_pairs/Epitope_PDB_ID_ABseq_CDRH3.csv"
ab_seq = pd.read_csv(ab_seq_path)
# restrict the table to antibodies that are part of at least one antibody pair
is_in_pair = ab_seq["Epitope_ID"].isin(antibody_pairs_ids)
ab_seq = ab_seq.loc[is_in_pair]
print("Number of unique antibodies:", len(ab_seq))
ab_seq.head(5)

Number of unique antibodies: 54


Unnamed: 0,Epitope_ID,PDB,Heavy_chain,CDRH3,Light_chain,CDRL3
0,164069,1G9N,QVQLLESGAEVKKPGSSVKVSCKASGDTFIRYSFTWVRQAPGQGLE...,AGVYEGEADEGEYRNNGFLKH,ELELTQSPATLSVSPGERATLSCRASESVSSDLAWYQQKPGQAPRL...,QQYNNWPPRYT
1,164067,2I5Y,EVQLVESGAEVKKPGSSVKVSCKASGDTFIRYSFTWVRQAPGQGLE...,AGVYEGEADEGEYDNNGFLKH,DIVMTQSPATLSVSPGERATLSCRASESVSSDLAWYQQKPGQAPRL...,QQYNNWPPRYT
2,164078,3MAC,QVQLVQSGAEVKKPGSSVKVSCKASGGTFNSYAFSWVRQAPGQGLE...,ARYFDTYNNYGFAN,DIELTQPPSVSVVPGQTARISCSGDNIPYEYASWYQQKPGQAPVLV...,ASWDSMTVDGV
3,164079,3MA9,QVQLVQSGAEVKKPGSSVKVSCKASGGTFNSYAFSWVRQAPGQGLE...,ARYFDTYNNYGFAN,DIELTQPPSVSVVPGQTARISCSGDNIPYEYASWYQQKPGQAPVLV...,ASWDSMTVDGV
4,738088,6AZZ,QVQLVQSGAEVKKPGASVKVSCKTSGYTFTDYYIHWVRQAPGQGLE...,ARDRITTAAPFDY,SYVLTQPPSVSVAPGQTARITCGGSNIGSKSVHWYQQKPGQTPMLV...,QVWDSSSDHVWV


# Nucleotide sequence reconstruction

## Define gene and CDR3 regions

In [7]:
# Row-wise annotation in two steps with helpers from the project utilities.
# NOTE(review): presumably find_cdr3 locates the CDR3 within the light chain and
# get_V_J_gene_fragment derives the "Antibody_seq_V" / "Antibody_seq_J" columns
# used below — confirm against the utils module.
ab_seq_regions = (
    ab_seq
    .apply(find_cdr3, args=(ab_seq_col, cdr3_col), axis=1)
    .apply(get_V_J_gene_fragment, args=(ab_seq_col,), axis=1)
)

# partial dataframes (ID + one gene fragment each) for the alignment ranking
seq_V, seq_J = (
    ab_seq_regions[["Epitope_ID", fragment_col]]
    for fragment_col in ("Antibody_seq_V", "Antibody_seq_J")
)

## Find best reference gene alignments

Light chains can be of type kappa or lambda (while heavy chains are all of the same type). The reference gene assigned to the V region determines which J genes are possible, because the V and J genes of one antibody light chain must be of the same type. The workflow therefore differs slightly from the heavy chain backtranslation pipeline.

In [8]:
# Match every V fragment against the reference V genes.
# The helper prints the evaluated alignments per Epitope_ID (see cell output).
seq_V_matched = add_matched_gene_information(
    seq_V,
    ref_V,
    "Antibody_seq_V",
)

164069
ELELTQSPATLSVSPGERATLSCRASESVSSDLAWYQQKPGQAPRLLIYGASTRATGVPARFSGSGSGAEFTLTISSLQSEDFAVYYC-----
|...||||||||||||||||||||||.||||.|||||||||||||||||||||||||.||||||||||.|||||||||||||||||||     
EIVMTQSPATLSVSPGERATLSCRASQSVSSNLAWYQQKPGQAPRLLIYGASTRATGIPARFSGSGSGTEFTLTISSLQSEDFAVYYCQQYNN
  Score=81

Fraction of matching amino acids: 87.10%
Number of mutated positions: 12
---------------------
ELELTQSPATLSVSPGERATLSCRASESVSSDLAWYQQKPGQAPRLLIYGASTRATGVPARFSGSGSGAEFTLTISSLQSEDFAVYYC-----
|...||||||||||||||||||||||.||||.|||||||||||||||||||||||||.||||||||||.|||||||||||||||||||     
EIVMTQSPATLSVSPGERATLSCRASQSVSSNLAWYQQKPGQAPRLLIYGASTRATGIPARFSGSGSGTEFTLTISSLQSEDFAVYYCQQYNN
  Score=81

Fraction of matching amino acids: 87.10%
Number of mutated positions: 12
---------------------
164067
DIVMTQSPATLSVSPGERATLSCRASESVSSDLAWYQQKPGQAPRLLIYGASTRATGVPARFSGSGSGAEFTLTISSLQSEDFAVYYC-----
 |||||||||||||||||||||||||.||||.|||||||||||||||||||||||||.||||||||||.|||||||||||||||||||     
EIVMTQSPATLSVSPGERAT

SSELTQDPAVSVALGQTVRITCQGDSLRGYYASWYQQKPGQAPVLVIYGKNNRPSGIPDRFSGSSSGNTASLTITGAQAEDEADYYC-------
||||||||||||||||||||||||||||.||||||||||||||||||||||||||||||||||||||||||||||||||||||||||       
SSELTQDPAVSVALGQTVRITCQGDSLRSYYASWYQQKPGQAPVLVIYGKNNRPSGIPDRFSGSSSGNTASLTITGAQAEDEADYYCNSRDSSG
  Score=86

Fraction of matching amino acids: 91.49%
Number of mutated positions: 8
---------------------
969168
SSELTQDPAVSVALGQTVRITCQGDSLRGYSASWYQLKPGQAPVLVIYGKNNRPSGIPDRFSGSTSGNRASLIITGTQAEDEADYYC-------
||||||||||||||||||||||||||||.|.|||||.|||||||||||||||||||||||||||.|||.|||.|||.||||||||||       
SSELTQDPAVSVALGQTVRITCQGDSLRSYYASWYQQKPGQAPVLVIYGKNNRPSGIPDRFSGSSSGNTASLTITGAQAEDEADYYCNSRDSSG
  Score=80

Fraction of matching amino acids: 85.11%
Number of mutated positions: 14
---------------------
1075136
DIVMTQSPSFLSASVGDRVTITCRASQGISSYLAWYQQKPGKAPKLLIYAASTLQSGVPSRFSGSGSGTEFTLTISSLQPEDFATYYC-----
||..||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||     
DIQLTQSPSFLSAS

DIVMTQSPSFLSASVGDRVTITCRASQGISSYLAWYQQKPGKAPKLLIQAASTLQSGVPSRFSGSGSGTEFTLTISSLQPEDFATYYC-----
||..||||||||||||||||||||||||||||||||||||||||||||.|||||||||||||||||||||||||||||||||||||||     
DIQLTQSPSFLSASVGDRVTITCRASQGISSYLAWYQQKPGKAPKLLIYAASTLQSGVPSRFSGSGSGTEFTLTISSLQPEDFATYYCQQLNS
  Score=85

Fraction of matching amino acids: 91.40%
Number of mutated positions: 8
---------------------
1346823
AIQLTQSPSFLSASIGDRVTITCRASQGISSYLAWYQQKPGKAPKLLIYAASTLQSGVPSRFSGSGSGTEFTLTISSLQPEDFASYYC-----
 |||||||||||||.|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||.|||     
DIQLTQSPSFLSASVGDRVTITCRASQGISSYLAWYQQKPGKAPKLLIYAASTLQSGVPSRFSGSGSGTEFTLTISSLQPEDFATYYCQQLNS
  Score=85

Fraction of matching amino acids: 91.40%
Number of mutated positions: 8
---------------------
1347915
QSALTQPPSASGSPGQSVTISCTGTSSDVGGYNYVSWYQQHPGKAPKLMIYEVSKRPSGVPDRFSGSKSGNTASLTVSGLQAEDEADYYC-------
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||       
QSALTQPPSASG

DIQLTQSPSFLSASVGDRVTITCRASQGISSYLAWYQQKPGKAPKLLIYGASTLQSGVPSRFSGSGSGTEFKLTISSLQPEDFATYYC-----
|||||||||||||||||||||||||||||||||||||||||||||||||.|||||||||||||||||||||.||||||||||||||||     
DIQLTQSPSFLSASVGDRVTITCRASQGISSYLAWYQQKPGKAPKLLIYAASTLQSGVPSRFSGSGSGTEFTLTISSLQPEDFATYYCQQLNS
  Score=86

Fraction of matching amino acids: 92.47%
Number of mutated positions: 7
---------------------
2186582
DIQMTQSPSALSASVGDRVTITCQASQDINKFLNWYQQKPGKAPKLLIYDASNLETGVPSRFSGGGSGTDFTFTISSLQPEDIATYYC-----
|||||||||.|||||||||||||||||||...||||||||||||||||||||||||||||||||.|||||||||||||||||||||||     
DIQMTQSPSSLSASVGDRVTITCQASQDISNYLNWYQQKPGKAPKLLIYDASNLETGVPSRFSGSGSGTDFTFTISSLQPEDIATYYCQQYDN
  Score=83

Fraction of matching amino acids: 89.25%
Number of mutated positions: 10
---------------------
2217933
EIVLTQSPGTLSLSPGERATLSCRASQSVSSSYLAWYQQKPGQAPRLLIYGASSRATGIPDRFSGSGSGTDFTLTISRLEPEDSAVYYC-----
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||.|||||     
EIVLTQSPGTLSLSPGE

In [9]:
# Split the antibodies by light-chain type using the name of the matched V gene.
matched_V_gene_names = seq_V_matched["Gene_name"]
kappa_antibodies = seq_V_matched.loc[matched_V_gene_names.str.contains("IGK"), "Epitope_ID"]
lambda_antibodies = seq_V_matched.loc[matched_V_gene_names.str.contains("IGL"), "Epitope_ID"]

# every matched V gene must have been counted as kappa or lambda
assert len(kappa_antibodies) + len(lambda_antibodies) == len(seq_V_matched)

seq_J_kappa = seq_J.loc[seq_J["Epitope_ID"].isin(kappa_antibodies)]
seq_J_lambda = seq_J.loc[seq_J["Epitope_ID"].isin(lambda_antibodies)]

# The V gene call determines whether the J fragment is aligned against the
# kappa or the lambda J reference genes.
ref_J_kappa = ref_J.loc[ref_J["chain"] == "IGK"]
seq_J_kappa_matched = add_matched_gene_information(seq_J_kappa, ref_J_kappa, "Antibody_seq_J")
ref_J_lambda = ref_J.loc[ref_J["chain"] == "IGL"]
seq_J_lambda_matched = add_matched_gene_information(seq_J_lambda, ref_J_lambda, "Antibody_seq_J")

# kappa results first, lambda results second
seq_J_matched = pd.concat([seq_J_kappa_matched, seq_J_lambda_matched])

164069
FGQGTRLEIK
||||||||||
FGQGTRLEIK
  Score=10

Fraction of matching amino acids: 100.00%
Number of mutated positions: 0
---------------------
164067
FGQGTRLEIK
||||||||||
FGQGTRLEIK
  Score=10

Fraction of matching amino acids: 100.00%
Number of mutated positions: 0
---------------------
738090
FGQGTTLEIK
|||||.||||
FGQGTKLEIK
  Score=9

Fraction of matching amino acids: 90.00%
Number of mutated positions: 1
---------------------
FGQGTTLEIK
|||||.||||
FGQGTKLEIK
  Score=9

Fraction of matching amino acids: 90.00%
Number of mutated positions: 1
---------------------
FGQGTTLEIK
|||||.||||
FGQGTKLEIK
  Score=9

Fraction of matching amino acids: 90.00%
Number of mutated positions: 1
---------------------
FGQGTTLEIK
|||||.||||
FGQGTRLEIK
  Score=9

Fraction of matching amino acids: 90.00%
Number of mutated positions: 1
---------------------
738086
FGGGTKVEIK
||||||||||
FGGGTKVEIK
  Score=10

Fraction of matching amino acids: 100.00%
Number of mutated positions: 0
---------------------



Fraction of matching amino acids: 90.00%
Number of mutated positions: 1
---------------------
FGGGTTLTVL
|||||.||||
FGGGTQLTVL
  Score=9

Fraction of matching amino acids: 90.00%
Number of mutated positions: 1
---------------------
738089
FGGGTKLTVL
||||||||||
FGGGTKLTVL
  Score=10

Fraction of matching amino acids: 100.00%
Number of mutated positions: 0
---------------------
FGGGTKLTVL
||||||||||
FGGGTKLTVL
  Score=10

Fraction of matching amino acids: 100.00%
Number of mutated positions: 0
---------------------
969167
FGGGTKLTVL
||||||||||
FGGGTKLTVL
  Score=10

Fraction of matching amino acids: 100.00%
Number of mutated positions: 0
---------------------
FGGGTKLTVL
||||||||||
FGGGTKLTVL
  Score=10

Fraction of matching amino acids: 100.00%
Number of mutated positions: 0
---------------------
969166
FGGGTKLTVL
||||||||||
FGGGTKLTVL
  Score=10

Fraction of matching amino acids: 100.00%
Number of mutated positions: 0
---------------------
FGGGTKLTVL
||||||||||
FGGGTKLTVL
  Score=10

F

## Compare reference to mismatched antibody sequence at a nucleotide level

In [10]:
# Reconstruct the V gene nucleotide sequences row by row.
# Everything the reconstruction prints is written to a log file instead of the
# notebook. The `with` block closes the log exactly once, and the try/finally
# restores the notebook's stdout even if the reconstruction raises, so a failure
# cannot leave later cells printing into the log file.
notebook_stdout = sys.stdout
log_path = os.path.dirname(os.getcwd()) + "/data/backtranslation/Backtranslation_light_V_log.txt"

with open(log_path, "wt") as log_file:
    sys.stdout = log_file
    try:
        seq_V_reconstructed = seq_V_matched.apply(
            reconstruct_nucleotide_sequence,
            args=(ref_V, "Antibody_nucleotide_seq_V", amino_acid_to_triplets, hs5f_trimers),
            axis=1,
        )
    finally:
        sys.stdout = notebook_stdout

print("Finished.")

Finished.


In [11]:
# Reconstruct the J gene nucleotide sequences row by row.
# Everything the reconstruction prints is written to a log file instead of the
# notebook. The `with` block closes the log exactly once, and the try/finally
# restores the notebook's stdout even if the reconstruction raises, so a failure
# cannot leave later cells printing into the log file.
notebook_stdout = sys.stdout
log_path = os.path.dirname(os.getcwd()) + "/data/backtranslation/Backtranslation_light_J_log.txt"

with open(log_path, "wt") as log_file:
    sys.stdout = log_file
    try:
        seq_J_reconstructed = seq_J_matched.apply(
            reconstruct_nucleotide_sequence,
            args=(ref_J, "Antibody_nucleotide_seq_J", amino_acid_to_triplets, hs5f_trimers),
            axis=1,
        )
    finally:
        sys.stdout = notebook_stdout

print("Finished.")

Finished.


## Choose highest scoring reconstructed sequences for V and J gene

In [12]:
# Per antibody, keep only the gene reconstruction with the highest likelihood;
# if several reconstructions are equally likely, the first one is kept.
seq_V_filtered = keep_most_likely_sequence_reconstruction(seq_V_reconstructed)
seq_J_filtered = keep_most_likely_sequence_reconstruction(seq_J_reconstructed)

num_V_reconstructed = len(seq_V_filtered)
num_J_reconstructed = len(seq_J_filtered)
print("Number of antibodies with reconstructed V gene:", num_V_reconstructed)
print("Number of antibodies with reconstructed J gene:", num_J_reconstructed)

# both frames should cover exactly the same antibodies
ids_with_V = set(seq_V_filtered["Epitope_ID"])
ids_with_J = set(seq_J_filtered["Epitope_ID"])
print("Identical antibody set in both dataframes:", ids_with_V == ids_with_J)

Number of antibodies with reconstructed V gene: 54
Number of antibodies with reconstructed J gene: 54
Identical antibody set in both dataframes: True


### Check if the reconstructed nucleotide sequences translate to the desired amino acid sequences

In [13]:
# For every reconstructed V gene and then every reconstructed J gene: translate
# the nucleotide sequence and print its alignment to the antibody's amino acid
# fragment (identical sequences yield a score equal to the fragment length).
gene_checks = (
    (seq_V_reconstructed, "Antibody_nucleotide_seq_V", "Antibody_seq_V"),
    (seq_J_reconstructed, "Antibody_nucleotide_seq_J", "Antibody_seq_J"),
)

for reconstructed, nucleotide_col, amino_acid_col in gene_checks:
    for _, entry in reconstructed.iterrows():
        print(entry["Epitope_ID"])
        translated_seq = translate_to_amino_acid(entry[nucleotide_col])
        best_alignment = pairwise2.align.localxs(
            translated_seq, entry[amino_acid_col], -2, -2, one_alignment_only=True
        )
        print(format_alignment(*best_alignment[0], full_sequences=True))

2218457
IQLTQSPSFLSASVGDRVTITCRASQGISNFLAWYQQKPGKAPKLLIYAASTLQGGVPSTFSGSGSGTEFTLTISSLQPEDFATYYC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
IQLTQSPSFLSASVGDRVTITCRASQGISNFLAWYQQKPGKAPKLLIYAASTLQGGVPSTFSGSGSGTEFTLTISSLQPEDFATYYC
  Score=87

2218456
DIQMTQSPSPLSASVGDRVTITCQASQDIRNFLNWYQQKPGKAPKLLIHDASKLEAGVPSRFSGSGSGTDFTFTISSLQPEDIATYYC
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
DIQMTQSPSPLSASVGDRVTITCQASQDIRNFLNWYQQKPGKAPKLLIHDASKLEAGVPSRFSGSGSGTDFTFTISSLQPEDIATYYC
  Score=88

2217933
EIVLTQSPGTLSLSPGERATLSCRASQSVSSSYLAWYQQKPGQAPRLLIYGASSRATGIPDRFSGSGSGTDFTLTISRLEPEDSAVYYC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
EIVLTQSPGTLSLSPGERATLSCRASQSVSSSYLAWYQQKPGQAPRLLIYGASSRATGIPDRFSGSGSGTDFTLTISRLEPEDSAVYYC
  Score=89

2186582
DIQMTQSPSALSASVGDRVTITCQASQDINKFLNWYQQKPGKAPKLLIYDASNLETGVPSRFSGGGSGTDFTFTISSLQPEDIATYYC
||||||||||||||||||||||||||||||||||||||||||

EIVLTQSPGTLSLSPGERATLSCRASQSVSSSYLAWYQQKPGQAPRLLIYGASSRATGIPDRFSGSGSGTDFTLTISRLEPEDFAVYYC
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
EIVLTQSPGTLSLSPGERATLSCRASQSVSSSYLAWYQQKPGQAPRLLIYGASSRATGIPDRFSGSGSGTDFTLTISRLEPEDFAVYYC
  Score=89

1311248
DIQLTQSPSFLSASVGDRVTITCRASQGISSDLAWYQQKPGKAPNLLIYAASTLQSGVPSRFSGSGSGTEFTLTISSLQPEDFATYYC
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
DIQLTQSPSFLSASVGDRVTITCRASQGISSDLAWYQQKPGKAPNLLIYAASTLQSGVPSRFSGSGSGTEFTLTISSLQPEDFATYYC
  Score=88

1311247
AIQLTQSPSSLSASVGDRVTITCRASQGISSYLAWYQQKPGKAPKLLIYAASTLQSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYC
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
AIQLTQSPSSLSASVGDRVTITCRASQGISSYLAWYQQKPGKAPKLLIYAASTLQSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYC
  Score=88

1311246
EIVLTQSPGTLSLSPGERATLSCRASQGVSSFLAWYQQKPGQAPRLLIHGASSRATGIPDRFSGSGSGTDFTLTITRLEPEDFAVYYC
|||||||||||||||||||||||||||||||||||||||||||||||

In [14]:
# Inner join on the antibody ID: only antibodies with both a reconstructed V and
# a reconstructed J gene are kept; shared column names get a _V / _J suffix.
seq_V_J = pd.merge(
    seq_V_filtered,
    seq_J_filtered,
    how="inner",
    on="Epitope_ID",
    suffixes=("_V", "_J"),
)
print("Number of antibodies with reconstructed V and J gene:", len(seq_V_J))

Number of antibodies with reconstructed V and J gene: 54


## Reconstruct CDR3 region

The CDR3 amino acid sequences are backtranslated to nucleotide sequences with EMBOSS Backtranseq: https://www.ebi.ac.uk/Tools/st/emboss_backtranseq/

In [15]:
# Disabled export: writes one FASTA-style record per antibody (">Epitope_ID"
# header line followed by the CDRL3 amino acid sequence) to Antibody_pairs_CDRL3.txt.
# NOTE(review): presumably this file is the input for the external
# backtranslation tool and the code is commented out so the file is not
# rewritten on every run — confirm before removing or re-enabling.
# with open(os.path.dirname(os.getcwd()) + "/data/backtranslation/Antibody_pairs_CDRL3.txt", "w") as f:
#     for i, row in ab_seq.iterrows():
#         f.write(">" + str(row["Epitope_ID"]) + "\n")
#         f.write(row[cdr3_col] + "\n")

In [16]:
cdr3_backtranslation_path = os.path.dirname(os.getcwd()) + "/data/backtranslation/Antibody_pairs_CDRL3_backtranslated.txt"

# read the backtranslated CDR3 nucleotide sequences and attach the CDR3 amino
# acid sequence of the same antibody
cdr3_backtranslated = read_cdr3_backtranslation_file(cdr3_backtranslation_path, cdr3_nseq_col).merge(
    ab_seq[["Epitope_ID", cdr3_col]],
    how="inner",
    on="Epitope_ID",
)

# each amino acid must correspond to exactly three nucleotides
for nucleotide_seq, amino_acid_seq in zip(cdr3_backtranslated[cdr3_nseq_col], cdr3_backtranslated[cdr3_col]):
    assert len(nucleotide_seq) == len(amino_acid_seq)*3

In [17]:
# add the backtranslated CDR3 to the combined V/J table (inner join on the antibody ID)
seq_V_J_CDR3 = pd.merge(seq_V_J, cdr3_backtranslated, how="inner", on="Epitope_ID")
seq_V_J_CDR3.head(5)

Unnamed: 0,Epitope_ID,Antibody_seq_V,Gene_name_V,Gene_seq_V,Fraction_match_V,Mutation_num_V,Mismatch_list_V,Antibody_nucleotide_seq_V,Likelihood_V,Mutation_num_nucleotide_V,...,Gene_name_J,Gene_seq_J,Fraction_match_J,Mutation_num_J,Mismatch_list_J,Antibody_nucleotide_seq_J,Likelihood_J,Mutation_num_nucleotide_J,CDRL3_nucleotide_seq,CDRL3
0,2218457,IQLTQSPSFLSASVGDRVTITCRASQGISNFLAWYQQKPGKAPKLL...,IGKV1-9*01,DIQLTQSPSFLSASVGDRVTITCRASQGISSYLAWYQQKPGKAPKL...,89.247312,10.0,"{0: ('D', '-'), 30: ('S', 'N'), 31: ('Y', 'F')...",ATCCAGTTGACCCAGTCTCCATCCTTCCTGTCTGCATCTGTAGGAG...,0.385235,4,...,IGKJ4*01,FGGGTKVEIK,90.0,1.0,"{5: ('K', 'R')}",TTCGGCGGAGGGACCAGGGTGGAGATCAAA,0.4501,1,CAGCACCTGAACGACTACCCCCTG,QHLNDYPL
1,2218456,DIQMTQSPSPLSASVGDRVTITCQASQDIRNFLNWYQQKPGKAPKL...,IGKV1-33*01,DIQMTQSPSSLSASVGDRVTITCQASQDISNYLNWYQQKPGKAPKL...,88.172043,11.0,"{9: ('S', 'P'), 29: ('S', 'R'), 31: ('Y', 'F')...",GACATCCAGATGACCCAGTCTCCATCCCCCCTGTCTGCATCTGTAG...,0.419136,6,...,IGKJ4*01,FGGGTKVEIK,100.0,0.0,{},TTCGGCGGAGGGACCAAGGTGGAGATCAAA,1.0,0,CAGCAGTACGACAACCTGCCCCTGACC,QQYDNLPLT
2,2217933,EIVLTQSPGTLSLSPGERATLSCRASQSVSSSYLAWYQQKPGQAPR...,IGKV3-20*01,EIVLTQSPGTLSLSPGERATLSCRASQSVSSSYLAWYQQKPGQAPR...,93.617021,6.0,"{83: ('F', 'S'), 89: ('Q', '-'), 90: ('Q', '-'...",GAAATTGTGTTGACGCAGTCTCCAGGCACCCTGTCTTTGTCTCCAG...,0.699252,1,...,IGKJ2*01,FGQGTKLEIK,100.0,0.0,{},TTTGGCCAGGGGACCAAGCTGGAGATCAAA,1.0,0,CAGCAGTACGGCAGCAGCTACACC,QQYGSSYT
3,2186582,DIQMTQSPSALSASVGDRVTITCQASQDINKFLNWYQQKPGKAPKL...,IGKV1-33*01,DIQMTQSPSSLSASVGDRVTITCQASQDISNYLNWYQQKPGKAPKL...,89.247312,10.0,"{9: ('S', 'A'), 29: ('S', 'N'), 30: ('N', 'K')...",GACATCCAGATGACCCAGTCTCCATCCGCCCTGTCTGCATCTGTAG...,0.39441,5,...,IGKJ1*01,FGQGTKVEIK,90.0,1.0,"{8: ('I', 'L')}",TTCGGCCAAGGGACCAAGGTGGAACTCAAA,0.270936,1,CACCAGTACGACAACCTGCCCAGGACC,HQYDNLPRT
4,2144729,DIQLTQSPSFLSASVGDRVTITCRASQGISSYLAWYQQKPGKAPKL...,IGKV1-9*01,DIQLTQSPSFLSASVGDRVTITCRASQGISSYLAWYQQKPGKAPKL...,92.473118,7.0,"{49: ('A', 'G'), 71: ('T', 'K'), 88: ('Q', '-'...",GACATCCAGTTGACCCAGTCTCCATCCTTCCTGTCTGCATCTGTAG...,0.359389,3,...,IGKJ5*01,FGQGTRLEIK,100.0,0.0,{},TTCGGCCAAGGGACACGACTGGAGATTAAA,1.0,0,CAGCAGCTGAACAACTACCCCCCCGTGACC,QQLNNYPPVT


## Create full-length nucleotide antibody sequences

In [18]:
# Build the final nucleotide sequences row by row with the project helper; the
# result provides the "Full_nucleotide_seq" column used in the following cells.
ab_seq_reconstructed = seq_V_J_CDR3.apply(
    create_final_nucleotide_sequences,
    axis=1,
    args=(cdr3_nseq_col, ref_V),
)

In [19]:
# Sanity check: translate every reconstructed full-length nucleotide sequence
# and print its alignment to the original light-chain amino acid sequence.
#
# The Epitope_ID -> sequence lookup is built once instead of filtering the whole
# frame for every antibody. The first entry per ID wins, which is the entry the
# previous per-row `.values[0]` lookup selected.
full_seq_by_id = {}
for epitope_id, full_seq in zip(ab_seq_reconstructed["Epitope_ID"],
                                ab_seq_reconstructed["Full_nucleotide_seq"]):
    full_seq_by_id.setdefault(epitope_id, full_seq)

# Fail early and name the affected antibodies instead of raising an opaque
# IndexError somewhere in the middle of the loop.
missing_ids = [epitope_id for epitope_id in ab_seq["Epitope_ID"] if epitope_id not in full_seq_by_id]
assert not missing_ids, f"No reconstructed sequence for Epitope_ID(s): {missing_ids}"

for _, row in ab_seq.iterrows():
    reconstructed_full_seq = full_seq_by_id[row["Epitope_ID"]]
    alignment = pairwise2.align.localxs(translate_to_amino_acid(reconstructed_full_seq),
        row[ab_seq_col], -2, -2, one_alignment_only=True)
    print(format_alignment(*alignment[0], full_sequences=True))

ELELTQSPATLSVSPGERATLSCRASESVSSDLAWYQQKPGQAPRLLIYGASTRATGVPARFSGSGSGAEFTLTISSLQSEDFAVYYCQQYNNWPPRYTFGQGTRLEIK
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
ELELTQSPATLSVSPGERATLSCRASESVSSDLAWYQQKPGQAPRLLIYGASTRATGVPARFSGSGSGAEFTLTISSLQSEDFAVYYCQQYNNWPPRYTFGQGTRLEIK
  Score=109

DIVMTQSPATLSVSPGERATLSCRASESVSSDLAWYQQKPGQAPRLLIYGASTRATGVPARFSGSGSGAEFTLTISSLQSEDFAVYYCQQYNNWPPRYTFGQGTRLEIK
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
DIVMTQSPATLSVSPGERATLSCRASESVSSDLAWYQQKPGQAPRLLIYGASTRATGVPARFSGSGSGAEFTLTISSLQSEDFAVYYCQQYNNWPPRYTFGQGTRLEIK
  Score=109

DIELTQPPSVSVVPGQTARISCSGDNIPYEYASWYQQKPGQAPVLVIYGDNNRPSGIPERFSGSNSGNTATLTISGTQAEDEADYYCASWDSMTVDGVFGGGTKLTVL
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
DIELTQPPSVSVVPGQTARISCSGDNIPYEYASWYQQKPGQAPVLVIYGDNNRPSGIPERFSGSNSGNTATLTISGTQAEDEADYYCASWDSMTVD

DIQMTQSPSSVSASVGDRVTITCRASQGISSWLAWYQQKPGKAPKLLIYAASSLQSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQANSFPGGTFGPGTKVDIK
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
DIQMTQSPSSVSASVGDRVTITCRASQGISSWLAWYQQKPGKAPKLLIYAASSLQSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQANSFPGGTFGPGTKVDIK
  Score=108

DIQLTQSPSSLSASVGDRVTITCRASQGISSYLAWYQQKPGKAPKLLIYAASTLENGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQLNSYPGTFGQGTKLEIK
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
DIQLTQSPSSLSASVGDRVTITCRASQGISSYLAWYQQKPGKAPKLLIYAASTLENGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQLNSYPGTFGQGTKLEIK
  Score=107

DIQLTQSPSSLSAFVGDRVTITCRASEGISSYLAWYQQKPGNAPKLLIYAASTLESGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQLNSYPGTFGQGTKLEIK
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
DIQLTQSPSSLSAFVGDRVTITCRASEGISSYLAWYQQKPGNAPKLLIYAASTLESGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQLNSYPGTFGQGTKLEIK

IQLTQSPSFLSASVGDRVTITCRASQGISNFLAWYQQKPGKAPKLLIYAASTLQGGVPSTFSGSGSGTEFTLTISSLQPEDFATYYCQHLNDYPLFGGGTRVEIK
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
IQLTQSPSFLSASVGDRVTITCRASQGISNFLAWYQQKPGKAPKLLIYAASTLQGGVPSTFSGSGSGTEFTLTISSLQPEDFATYYCQHLNDYPLFGGGTRVEIK
  Score=105

DIQMTQSPSPLSASVGDRVTITCQASQDIRNFLNWYQQKPGKAPKLLIHDASKLEAGVPSRFSGSGSGTDFTFTISSLQPEDIATYYCQQYDNLPLTFGGGTKVEIK
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
DIQMTQSPSPLSASVGDRVTITCQASQDIRNFLNWYQQKPGKAPKLLIHDASKLEAGVPSRFSGSGSGTDFTFTISSLQPEDIATYYCQQYDNLPLTFGGGTKVEIK
  Score=107



In [20]:
# store the reconstructed light chains (without the dataframe index)
backtranslated_csv_path = os.path.dirname(os.getcwd()) + "/data/antibody_pairs/Antibody_pairs_backtranslated_light.csv"
ab_seq_reconstructed.to_csv(backtranslated_csv_path, index=False)

In [21]:
# Write one FASTA record per antibody: ">Epitope_ID" header line, then the
# full-length nucleotide sequence.
fasta_path = os.path.dirname(os.getcwd()) + "/data/FASTA/Antibody_pairs_nucleotide_light_chain.fasta"
fasta_records = zip(ab_seq_reconstructed["Epitope_ID"], ab_seq_reconstructed["Full_nucleotide_seq"])

with open(fasta_path, "w") as fasta_file:
    for epitope_id, nucleotide_seq in fasta_records:
        fasta_file.write(">" + str(epitope_id) + "\n")
        fasta_file.write(nucleotide_seq + "\n")