## Parsing of the VDJdb data

data from the VDJ database collected on December 11, 2020 is parsed in this file using the following steps:
* Duplicate rows are removed
* Rows with missing values in one of the columns are removed
* CDR3 sequences with non-amino acid characters are removed
* CDR3 sequences not starting with 'C' or ending with 'F' or 'W' are removed because these amino acids are conserved residues
* The V and J columns were checked to see if they contained values not present in the IMGT database
* The J gene is split of the gene columns and the 'TRAJ' identifier and J allele are removed
* J gene is filled to 2 characters by adding a 0 at the front if there is only one character present
* The V column is split in a V family and V gene column, the 'TRAV' identifier and V allele are removed
* The V gene was added to V families with only 1 possible V gene because they were not always given in the db
* Both V family and gene are filled to 2 characters, like the J gene column

In [1]:
import pandas as pd

In [20]:
vdjdb_data = pd.read_csv("vdjdb_data.csv")
vdjdb_data = vdjdb_data.drop_duplicates()
vdjdb_data.head()

Unnamed: 0,Gene,CDR3,V,J,Species,Epitope
0,TRA,CAVGNNAGNMLTF,TRAV20*01,TRAJ39*01,HomoSapiens,NEGVKAAW
1,TRA,CAAKTGGGNKLTF,TRAV29/DV5*01,TRAJ10*01,HomoSapiens,NEGVKAAW
2,TRA,CAAKAGGGNKLTF,TRAV29/DV5*01,TRAJ10*01,HomoSapiens,NEGVKAAW
3,TRA,CAAITGIGGSQGNLIF,TRAV13-1*01,TRAJ42*01,HomoSapiens,NEGVKAAW
4,TRA,CAGTDGGATNKLIF,TRAV36/DV7*01,TRAJ32*01,HomoSapiens,NEGVKAAW


In [21]:
species = vdjdb_data['Species'].unique()
print(species)

['HomoSapiens']


In [22]:
TRAJ = vdjdb_data['J'].unique()
TRAV = vdjdb_data['V'].unique()
print(TRAJ)

['TRAJ39*01' 'TRAJ10*01' 'TRAJ42*01' 'TRAJ32*01' 'TRAJ13*01' 'TRAJ8*01'
 'TRAJ47*01' 'TRAJ29*01' 'TRAJ17*01' 'TRAJ38*01' 'TRAJ44*01' 'TRAJ49*01'
 'TRAJ26*01' 'TRAJ34*01' 'TRAJ40*01' 'TRAJ33*01' 'TRAJ23*01' 'TRAJ57*01'
 'TRAJ43*01' 'TRAJ6*01' 'TRAJ20*01' 'TRAJ30*01' 'TRAJ48*01' 'TRAJ31*01'
 'TRAJ45*01' 'TRAJ3*01' 'TRAJ27*01' 'TRAJ36*01' 'TRAJ52*01' 'TRAJ15*01'
 'TRAJ5*01' 'TRAJ4*01' 'TRAJ50*01' 'TRAJ28*01' 'TRAJ53*01' 'TRAJ37*01'
 'TRAJ22*01' 'TRAJ41*01' 'TRAJ9*01' 'TRAJ12*01' 'TRAJ11*01' 'TRAJ46*01'
 'TRAJ54*01' 'TRAJ18*01' 'TRAJ21*01' 'TRAJ56*01' 'TRAJ7*01' 'TRAJ14*01'
 'TRAJ13*02' 'TRAJ24*02']


In [23]:
print(TRAV) 

['TRAV20*01' 'TRAV29/DV5*01' 'TRAV13-1*01' 'TRAV36/DV7*01' 'TRAV19*01'
 'TRAV8-2*01' 'TRAV27*01' 'TRAV8-6*01' 'TRAV35*01' 'TRAV9-2*01'
 'TRAV23/DV6*01' 'TRAV38-2/DV8*01' 'TRAV18*01' 'TRAV8-3*01' 'TRAV22*01'
 'TRAV13-2*01' 'TRAV21*01' 'TRAV8-4*01' 'TRAV5*01' 'TRAV12-2*01'
 'TRAV38-1*01' 'TRAV12-1*01' 'TRAV1-2*01' 'TRAV39*01' 'TRAV16*01'
 'TRAV17*01' 'TRAV10*01' 'TRAV24*01' 'TRAV41*01' 'TRAV12-3*01' 'TRAV3*01'
 'TRAV8-1*01' 'TRAV26-2*01' 'TRAV4*01' 'TRAV14/DV4*01' 'TRAV2*01'
 'TRAV30*01' 'TRAV25*01' 'TRAV26-1*01' 'TRAV1-1*01' 'TRAV34*01'
 'TRAV40*01' 'TRAV6*01' 'TRAV9-1*01' 'TRAV7*01' 'TRAV14/DV4*02'
 'TRAV8-6*02']


In [24]:
# Parsing of the J-column
# In the first step the 'TRAJ' classifier is removed
vdjdb_data.replace('TRAJ','',regex=True, inplace = True)
# In the second step the J_allele is splitsed from the J_gene based on the * character, the J_allele is then removed as it is not needed in downstream steps
vdjdb_data[['J_gene','J_allele']] = vdjdb_data.J.str.split ("*",expand=True)
vdjdb_data = vdjdb_data.drop('J',1)
vdjdb_data = vdjdb_data.drop('J_allele',1)
# add a leading 0 to all families with only one digit so the length of all classifiers are the same
vdjdb_data['J_gene'] = vdjdb_data['J_gene'].str.zfill(2)
vdjdb_data.head()

Unnamed: 0,Gene,CDR3,V,Species,Epitope,J_gene
0,TRA,CAVGNNAGNMLTF,TRAV20*01,HomoSapiens,NEGVKAAW,39
1,TRA,CAAKTGGGNKLTF,TRAV29/DV5*01,HomoSapiens,NEGVKAAW,10
2,TRA,CAAKAGGGNKLTF,TRAV29/DV5*01,HomoSapiens,NEGVKAAW,10
3,TRA,CAAITGIGGSQGNLIF,TRAV13-1*01,HomoSapiens,NEGVKAAW,42
4,TRA,CAGTDGGATNKLIF,TRAV36/DV7*01,HomoSapiens,NEGVKAAW,32


In [25]:
J = vdjdb_data['J_gene'].unique()
print(J)

['39' '10' '42' '32' '13' '08' '47' '29' '17' '38' '44' '49' '26' '34'
 '40' '33' '23' '57' '43' '06' '20' '30' '48' '31' '45' '03' '27' '36'
 '52' '15' '05' '04' '50' '28' '53' '37' '22' '41' '09' '12' '11' '46'
 '54' '18' '21' '56' '07' '14' '24']


In [26]:
# Parsing of the V-column
# In the first step the 'trav' classifier is removed
vdjdb_data.replace('TRAV','',regex=True, inplace = True)
# In the second step the allele is split from the gene and family, by using the * character 
vdjdb_data[['V','V_allele']] = vdjdb_data.V.str.split("*",expand=True)
vdjdb_data = vdjdb_data.drop('V_allele',1)

# All families where only one gene is possible are supplemented with the gene, because they are not always given in the database
vdjdb_data['V'] = vdjdb_data['V'].str.zfill(2)
vdjdb_data = vdjdb_data.replace({"V": { '02' : "02-01", '03' : "03-01", "04" : "04-01", "05" : "05-01", "06" : "06-01", "07" : "07-01", "10" : "10-01", "15" : "15-01", "16" : "16-01", "17" : "17-01", "18" : "18-01", "19" : "19-01", "20" : "20-01", "21" : "21-01", "22" : "22-01", "24" : "24-01", "25" : "25-01", "27" : "27-01", "28" : "28-01", "30" : "30-01", "31" : "31-01", "32" : "32-01", "33" : "33-01", "34" : "34-01", "35" : "35-01", "37" : "37-01", "39" : "39-01", "40" : "40-01", "41" : "41-01", "46" : "46-01"}})

# Split off the V_family
vdjdb_data[['V_family','V_gene_1','V_gene_2']] = vdjdb_data.V.str.split ("-|/",expand=True)

# Fill V_family to 2 digits
vdjdb_data['V_family'] = vdjdb_data['V_family'].str.zfill(2)
# split gene and family, fill both to 2 digits
vdjdb_data[['V_1','V_2','V_3']] = vdjdb_data.V.str.split ("(-)",expand=True)
vdjdb_data['V_1'] = vdjdb_data['V_1'].str.zfill(2)
vdjdb_data['V_3'] = vdjdb_data['V_3'].str.zfill(2)
# Put family and gene back together
vdjdb_data = vdjdb_data.assign(V = vdjdb_data.V_1.astype(str) + vdjdb_data.V_2.astype(str) + vdjdb_data.V_3.astype(str))
vdjdb_data.replace('nan','',regex=True, inplace = True)

# V_gene_1 and 2 are removed as they are no longer needed
vdjdb_data = vdjdb_data.drop('V_gene_1',1)
vdjdb_data = vdjdb_data.drop('V_gene_2',1)
# column V is renamed to V_gene, so the name of this columns is the same in all datafiles
vdjdb_data = vdjdb_data.rename(columns={'V': 'V_gene'})
vdjdb_data = vdjdb_data[sorted(vdjdb_data.columns)]
vdjdb_data.head()

Unnamed: 0,CDR3,Epitope,Gene,J_gene,Species,V_1,V_2,V_3,V_family,V_gene
0,CAVGNNAGNMLTF,NEGVKAAW,TRA,39,HomoSapiens,20,-,1.0,20,20-01
1,CAAKTGGGNKLTF,NEGVKAAW,TRA,10,HomoSapiens,29/DV5,,,29,29/DV5
2,CAAKAGGGNKLTF,NEGVKAAW,TRA,10,HomoSapiens,29/DV5,,,29,29/DV5
3,CAAITGIGGSQGNLIF,NEGVKAAW,TRA,42,HomoSapiens,13,-,1.0,13,13-01
4,CAGTDGGATNKLIF,NEGVKAAW,TRA,32,HomoSapiens,36/DV7,,,36,36/DV7


In [27]:
V = vdjdb_data['V_gene'].unique()
print(V)

['20-01' '29/DV5' '13-01' '36/DV7' '19-01' '08-02' '27-01' '08-06' '35-01'
 '09-02' '23/DV6' '38-2/DV8' '18-01' '08-03' '22-01' '13-02' '21-01'
 '08-04' '05-01' '12-02' '38-01' '12-01' '01-02' '39-01' '16-01' '17-01'
 '10-01' '24-01' '41-01' '12-03' '03-01' '08-01' '26-02' '04-01' '14/DV4'
 '02-01' '30-01' '25-01' '26-01' '01-01' '34-01' '40-01' '06-01' '09-01'
 '07-01']


In [28]:
# See if CDR3 sequences start with C and end with F
vdjdb_data['CDR3_start'] = vdjdb_data['CDR3'].str[0] 
vdjdb_data['CDR3_end'] = vdjdb_data['CDR3'].str[-1] 
# Remove rows with CDR3 sequences which do not start with a C and end with a F or W
vdjdb_data = vdjdb_data.loc[vdjdb_data['CDR3_end'].str.contains(r'[F,W]') == True]
vdjdb_data = vdjdb_data.loc[vdjdb_data['CDR3_start'].str.contains(r'[C]') == True]

# CDR3 sequences with unknown Aminoacids, weird characters, no CDR3 sequence present or lowercase AA are removed  
weird_characters = [r'[_]',r'[*]',r'[(]',r'[)]',r'[NaN]',r'[a-z]']
for char in weird_characters:
    vdjdb_data = vdjdb_data.loc[vdjdb_data['CDR3'].str.contains(char) == False]

start = vdjdb_data['CDR3_start'].unique()
print(start)
# the following paper states that on the N-terminus the 'C' is conserved and on the C-terminus the 'F' and 'W' are conserved, so the other ones are removed (https://www.jimmunol.org/content/jimmunol/194/1/446.full.pdf)

['C']


In [29]:
end = vdjdb_data['CDR3_end'].unique()
print(end)

['F' 'W']


In [30]:
# Delete columns not needed in further steps
vdjdb_data = vdjdb_data[['CDR3', 'Epitope', 'Gene', 'J_gene', 'V_family', 'V_gene']]

In [31]:
vdjdb_data.head()

Unnamed: 0,CDR3,Epitope,Gene,J_gene,V_family,V_gene
5,CALSEAVSGGYQKVTF,NEGVKAAW,TRA,13,19,19-01
7,CVVSLTGFQKLVF,NEGVKAAW,TRA,8,8,08-02
9,CAVSESLVGTPLVF,NEGVKAAW,TRA,29,8,08-06
16,CAVSDSGTASKLTF,NEGVKAAW,TRA,44,8,08-06
21,CAASGTDKLIF,NEGVKAAW,TRA,34,13,13-01


In [32]:
TRAJ = vdjdb_data['V_family'].unique()
print(TRAJ)

['19' '08' '13' '22' '09' '05' '38' '16' '01' '29' '17' '39' '10' '12'
 '24' '41' '03' '21' '26' '04' '14' '23' '02' '20' '36' '25' '34' '35'
 '27' '18' '06' '30' '40' '07']


In [33]:
vdjdb_data.to_csv('parsed_vdjdb.csv', index=False, sep=",")

In [34]:
vdjdb_data.shape

(8431, 6)

In [35]:
print('Finished')

Finished
