## Parsing of the negative data

data from the iReceptor database collected using the parameters: Case-control studie, Homo Sapiens, TRA and Blood, using these parameters a database containing 2.439.440 TCR-A sequences of healthy control individuals was collected and parsed in this file using the following steps:
* Only unique CDR3 sequences were used
* Rows with missing values in one of the columns are removed
* Methods where mouse strains were used are removed
* CDR3 sequences with non-amino acid characters are removed
* CDR3 sequences not starting with 'C' or ending with 'F' or 'W' are removed because these amino acids are conserved residues
* The V and J columns were checked to see if they contained values not present in the IMGT database, entries not present in the IMGT db are removed
* The J gene is split of the gene columns and the 'TRAJ' identifier and J allele are removed
* J gene is filled to 2 characters by adding a 0 at the front if there is only one character present
* The V column is split in a V family and V gene column, the 'TRAV' identifier and V allele are removed
* The V gene was added to V families with only 1 possible V gene because they were not always given in the db
* Both V family and gene are filled to 2 characters, like the J gene column
* The epitope columns was added and all epitopes were named 'Control'

In [1]:
import pandas as pd

In [2]:
# ImmuneACCESSdb : filters set 'control', 'human', no TCRA data was available, only TCRB data
# iReceptor: filters set 'case-control', 'human', 'TRA', 'Healthy control', 'tissue (blood)' 'ireceptor_public_archive_1
neg_data = pd.read_csv("neg_data.csv")

# Remove CDR3 sequences present more than one time
neg_data = neg_data.drop_duplicates(subset="CDR3")
neg_data.head()

Unnamed: 0,v_call,j_call,Gene,CDR3
0,TRAV41*01,TRAJ58*01,TRA,CAVPLKE_SGSSLTF
1,TRAV12-2*01,TRAJ40*01,TRA,CAVNDPGTYKYIF
2,TRAV17*01,TRAJ47*01,TRA,CATDAEYGNKLVF
3,TRAV41*01,TRAJ32*02,TRA,CAALYGGATNKLIF
4,TRAV20*01,TRAJ39*01,TRA,CAVEGNAGNMLTF


In [3]:
# Parsing of J-column
neg_data.replace('TRAJ','',regex=True, inplace = True)
neg_data[['J_gene','J_allele']] = neg_data.j_call.str.split ("*",expand=True)
del neg_data['j_call']
del neg_data['J_allele']
# add a leading 0 to all families with only one digit
neg_data['J_gene'] = neg_data['J_gene'].str.zfill(2)
neg_data.head()

Unnamed: 0,v_call,Gene,CDR3,J_gene
0,TRAV41*01,TRA,CAVPLKE_SGSSLTF,58
1,TRAV12-2*01,TRA,CAVNDPGTYKYIF,40
2,TRAV17*01,TRA,CATDAEYGNKLVF,47
3,TRAV41*01,TRA,CAALYGGATNKLIF,32
4,TRAV20*01,TRA,CAVEGNAGNMLTF,39


In [4]:
# Parsing of the V-column, in the first step the 'trav' classifier is removed
neg_data.replace('TRAV','',regex=True, inplace = True)
# In the second step the allele is split from the gene and family, by using the * character 
neg_data[['v_call','V_allele']] = neg_data.v_call.str.split("*",expand=True)
neg_data = neg_data.drop('V_allele',1)

# All families where only one gene is possible are supplemented with the gene, because they are not always given in the database
neg_data['v_call'] = neg_data['v_call'].str.zfill(2)
neg_data = neg_data.replace({"v_call": { '02' : "02-01", '03' : "03-01", "04" : "04-01", "05" : "05-01", "06" : "06-01", "07" : "07-01", "10" : "10-01", "15" : "15-01", "16" : "16-01", "17" : "17-01", "18" : "18-01", "19" : "19-01", "20" : "20-01", "21" : "21-01", "22" : "22-01", "24" : "24-01", "25" : "25-01", "27" : "27-01", "28" : "28-01", "30" : "30-01", "31" : "31-01", "32" : "32-01", "33" : "33-01", "34" : "34-01", "35" : "35-01", "37" : "37-01", "39" : "39-01", "40" : "40-01", "41" : "41-01", "46" : "46-01"}})

# Split off the V_family
neg_data[['V_family','V_gene_1','V_gene_2']] = neg_data.v_call.str.split ("-|/",expand=True)

# Fill V_family to 2 digits
neg_data['V_family'] = neg_data['V_family'].str.zfill(2)
# split gene and family, fill both to 2 digits
neg_data[['V_1','V_2','V_3']] = neg_data.v_call.str.split ("(-)",expand=True)
neg_data['V_1'] = neg_data['V_1'].str.zfill(2)
neg_data['V_3'] = neg_data['V_3'].str.zfill(2)
# Put family and gene back together
neg_data = neg_data.assign(V = neg_data.V_1.astype(str) + neg_data.V_2.astype(str) + neg_data.V_3.astype(str))
neg_data.replace('nan','',regex=True, inplace = True)

# Delete rows without a gene
# The third character of each V_gene is given, when this character is NaN, only the family is present and no info of the gene is present so the row is deleted
neg_data['V_gene_present'] = neg_data['v_call'].str[2]
neg_data = neg_data.loc[neg_data['V_gene_present'].str.contains(r'[NaN]') == False]

# Delete columns which are no longer needed
neg_data = neg_data.drop('V_gene_1',1)
neg_data = neg_data.drop('V_gene_2',1)
neg_data.head()

Unnamed: 0,v_call,Gene,CDR3,J_gene,V_family,V_1,V_2,V_3,V,V_gene_present
0,41-01,TRA,CAVPLKE_SGSSLTF,58,41,41,-,1,41-01,-
1,12-2,TRA,CAVNDPGTYKYIF,40,12,12,-,2,12-02,-
2,17-01,TRA,CATDAEYGNKLVF,47,17,17,-,1,17-01,-
3,41-01,TRA,CAALYGGATNKLIF,32,41,41,-,1,41-01,-
4,20-01,TRA,CAVEGNAGNMLTF,39,20,20,-,1,20-01,-


In [6]:
# See if CDR3 sequences start with C and end with F
neg_data['CDR3_start'] = neg_data['CDR3'].str[0] 
neg_data['CDR3_end'] = neg_data['CDR3'].str[-1]
# Remove rows with CDR3 sequences which do not start with a C and end with a F or W
neg_data = neg_data.loc[neg_data['CDR3_end'].str.contains(r'[F,W]') == True]
neg_data = neg_data.loc[neg_data['CDR3_start'].str.contains(r'[C]') == True]

# CDR3 sequences with unknown Aminoacids, weird characters, no CDR3 sequence present and lowercase AA are removed 
weird_characters = [r'[_]',r'[*]',r'[(]',r'[)]',r'[NaN]',r'[a-z]']
for char in weird_characters:
    neg_data = neg_data.loc[neg_data['CDR3'].str.contains(char) == False]

start = neg_data['CDR3_start'].unique()
print(start)

['C']


In [7]:
end = neg_data['CDR3_end'].unique()
print(end)

['F' 'W']


In [9]:
# column V is renamed to V_gene, so the name of this columns is the same in all datafiles
neg_data = neg_data.rename(columns={'V': 'V_gene'})
# The Epitope column added, all apitopes are named 'Control'
neg_data['Epitope']= 'Control'
# Delete columns which are no longer needed in further steps
neg_data = neg_data[['CDR3', 'Epitope', 'Gene', 'J_gene', 'V_family', 'V_gene']]

neg_data = neg_data[sorted(neg_data.columns)]
neg_data.head()

Unnamed: 0,CDR3,Epitope,Gene,J_gene,V_family,V_gene
6,CAGGISSGSARQLTF,Control,TRA,22,25,25-01
11,CAVERGTSGSRLTF,Control,TRA,58,36,36/DV7
15,CATDAEGTYKYIF,Control,TRA,40,17,17-01
16,CAATQGGSEKLVF,Control,TRA,57,25,25-01
17,CALSSLSGTYKYIF,Control,TRA,40,19,19-01


In [10]:
V_gene = neg_data['V_gene'].unique()
print(V_gene)

['25-01' '36/DV7' '17-01' '19-01' '10-01' '02-01' '20-01' '04-01' '03-01'
 '13-01' '06-01' '13-02' '26-02' '41-01' '26-01' '05-01' '27-01' '38-01'
 '29/DV5' '12-01' '08-01' '12-02' '21-01' '34-01' '16-01' '12-03' '30-01'
 '14/DV4' '39-01' '23/DV6' '35-01' '01-02' '38-2/DV8' '08-03' '08-06'
 '09-02' '40-01' '08-04' '01-01' '22-01' '24-01' '08-02' '18-01' '09-01'
 '07-01' '08-07']


In [14]:
neg_data.shape

(77880, 6)

In [12]:
neg_data.to_csv('parsed_neg.csv', index=False, sep=",")

In [13]:
print('Finished')

Finished
