In [1]:
import pandas as pd
import numpy as np
import collections
import skbio 
from skbio.diversity.alpha import shannon 
import re
import IPython

In [118]:
# The same steps as for the VDJdb data are followed for the McPAS data (downloaded on December 14)
McPAS_data = pd.read_csv("McPAS_data.csv")

In [119]:
McPAS_data = McPAS_data.drop('Unnamed: 0',1)
McPAS_data = McPAS_data.drop_duplicates()
McPAS_data.head()

Unnamed: 0,CDR3,Species,antigen_identification_method,Epitope,V,J,Mouse_strain,Gene
0,CAVTIGFGNVLHCGSGTQVIVLPHIQ,Human,2.4,EAAGIGILTV,TRAV2,TRAJ35,,TRA
1,CASGGGADGLTFPYIQF,Human,2.4,EAAGIGILTV,TRAV2,TRAJ45,,TRA
2,CAASPPESGGYNKLIF,Human,2.4,EAAGIGILTV,TRAV21,TRAJ4,,TRA
3,CAAYYGGSOGNLIF,Human,2.4,EAAGIGILTV,TRAV21,TRAJ42,,TRA
4,CAVSRGGGADGLTF,Human,2.4,EAAGIGILTV,TRAV2,TRAJ45,,TRA


In [120]:
strain = McPAS_data['Mouse_strain'].unique()
method = McPAS_data['antigen_identification_method'].unique()
print(strain)

[nan 'C57/BL6']


In [121]:
print(method) # all identification methods are given (2.4 = Stimulation with tumor cells, 2.1 = Stimulation with a peptide, 1 = Peptide-MHC (pMHC) tetramers)

[2.4 2.1 1. ]


In [122]:
McPAS_data = McPAS_data[McPAS_data.Mouse_strain != "C57/BL6"] # methods where mouse strains are used are removed

In [123]:
Species = McPAS_data['Species'].unique() # see is all data is for human 
print(Species)

['Human']


In [124]:
TRAJ = McPAS_data['J'].unique()
TRAV = McPAS_data['V'].unique()
print(TRAJ)

['TRAJ35' 'TRAJ45' 'TRAJ4' 'TRAJ42' 'TRAJ47' 'TRAJ9' 'TRAJ39' 'TRAJ48'
 'TRAJ1' 'TRAJ5-1' 'TRAJ15' 'TRAJ2' 'TRAJ5' 'TRAJ49' 'TRAJ54' 'TRAJ21'
 'TRAJ53' 'TRAJ40' 'TRAJ36' 'TRAJ38' 'TRAJ26' 'TRAJ3' 'TRAJ37' 'TRAJ11'
 'TRAJ31' 'TRAJ34' 'TRAJ23' 'TRAJ16' 'TRAJ1-3' 'TRAJ9-1' 'TRAJ16-5'
 'TRAJ1-8' 'TRAJ10-1' 'TRAJ3-2' 'TRAJ9-14' 'TRAJ14-1' 'TRAJ16-1' 'TRAJ33'
 'TRAJ52' 'TRAJ44' 'TRAJ43' 'TRAJ50' 'TRAJ20' 'TRAJ58' 'TRAJ6' 'TRAJ57'
 'TRAJ8' 'TRAJ13' 'TRAJ29' 'TRAJ27' 'TRAJ30' 'TRAJ32' 'TRAJ10' 'TRAJ24'
 'TRAJ7' 'TRAJ28' 'TRAJ56' 'TRAJ37-2' 'TRAJ41' 'TRAJ17' 'TRAJ22' 'TRAJ12'
 'TRAJ4-01' 'TRAJ5-01' 'TRAJ3-01' 'TRAJ1-01' 'TRAJ2-01' 'TRAJ9-01'
 'TRAJ13:02' 'TRAJ6-01' 'TRAJ49:01' 'TRAJ20:01' 'TRAJ42:01' 'TRAJ24:01'
 'TRAJ21:01' 'TRAJ7:01' 'TRAJ39:01' 'TRAJ36:01' 'TRAJ54:01' 'TRAJ16:01'
 'TRAJ58:01' 'TRAJ18' 'TRAJ53:01' 'TRAJ53:02' 'TRAJ53:05' 'TRAJ43:01'
 'TRAJ2-1' 'TRAJ41:01' 'TRAJ3-1' 'TRAJ34:01' 'TRAJ31:01' 'TRAJ44:01'
 'TRAJ38:01' 'TRAJ24:02' 'TRAJ17:01' 'TRAJ48:01' 'TRAJ9:01' 'TRAJ18:01'
 'TR

In [125]:
print(TRAV)

['TRAV2' 'TRAV21' 'TRAV10' 'TRAV8' 'TRAV8-2' 'TRAV3' 'TRAV9' 'TRAV1-1'
 'TRAV25' 'TRAV1-4' 'TRAV1-2' 'TRAV28' 'TRAV16' 'TRAV2-1' 'TRAV26'
 'TRAV15' 'TRAV12-1' 'TRAV2-3' 'TRAV4-1' 'TRAV15-1' 'TRAV39' 'TRAV22'
 'TRAV26-2' 'TRAV8-3' 'TRAV18' 'TRAV4' 'TRAV23' 'TRAV27' 'TRAV19'
 'TRAV2-2' 'TRAV6' 'TRAV12' 'TRAV5' 'TRAV14' 'TRAV9-2' 'TRAV1' 'TRAV11'
 'TRAV1-01' 'TRAV13-1' 'TRAV8-4' 'TRAV8-6' 'TRAV251' 'TRAV17' 'TRAV40'
 'TRAV12-2' 'TRAV21-1' 'TRAV13-2' 'TRAV8-1' 'TRAV26-1' 'TRAV21-2'
 'TRAV3-1' 'TRAV35-1' 'TRAV29/DV5' 'TRAV14/DV4' 'TRAV20' 'TRAV24'
 'TRAV23/DV6' 'TRAV12-3' 'TRAV2-01' 'TRAV35:01' 'TRAV12-3:01' 'TRAV25:01'
 'TRAV3-01' 'TRAV10-01' 'TRAV1-2:01' 'TRAV38-2/DV8:01' 'TRAV13-2:01'
 'TRAV29/DV5:01' 'TRAV5-01' 'TRAV23/DV6:01' 'TRAV6-01' 'TRAV9-2:01'
 'TRAV14/DV4:01' 'TRAV41:01' 'TRAV38-1:01' 'TRAV14/DV4:02' 'TRAV8-1:01'
 'TRAV12-2:01' 'TRAV22:01' 'TRAV8-6:01' 'TRAV8-3:01' 'TRAV41-01'
 'TRAV8-2:01' 'TRAV21-01' 'TRAV8-6:02' 'TRAV1-1:01' 'TRAV26-2:01'
 'TRAV8-4:01' 'TRAV4:01' 'TRAV36/DV7:

In [126]:
# Parsing of the J-column
# In the first step the 'TRAJ' classifier is removed
McPAS_data.replace('TRAJ','',regex=True, inplace = True)
# In the second step the J_allele is splitsed from the J_gene based on the * character, the J_allele is then removed as it is not needed in downstream steps
McPAS_data[['J_gene','J_allele_1']] = McPAS_data.J.str.split ("-",expand=True)
McPAS_data[['J_gene','J_allele_2']] = McPAS_data.J_gene.str.split (":",expand=True)
McPAS_data = McPAS_data.drop('J',1)
McPAS_data = McPAS_data.drop('J_allele_1',1)
McPAS_data = McPAS_data.drop('J_allele_2',1)
# add a leading 0 to all families with only one digit so the length of each all classifiers are the same
McPAS_data['J_gene'] = McPAS_data['J_gene'].str.zfill(2)
McPAS_data.head()

Unnamed: 0,CDR3,Species,antigen_identification_method,Epitope,V,Mouse_strain,Gene,J_gene
0,CAVTIGFGNVLHCGSGTQVIVLPHIQ,Human,2.4,EAAGIGILTV,TRAV2,,TRA,35
1,CASGGGADGLTFPYIQF,Human,2.4,EAAGIGILTV,TRAV2,,TRA,45
2,CAASPPESGGYNKLIF,Human,2.4,EAAGIGILTV,TRAV21,,TRA,4
3,CAAYYGGSOGNLIF,Human,2.4,EAAGIGILTV,TRAV21,,TRA,42
4,CAVSRGGGADGLTF,Human,2.4,EAAGIGILTV,TRAV2,,TRA,45


In [127]:
# Parsing of the V-column, in the first step the 'trav' classifier is removed
McPAS_data = McPAS_data[~McPAS_data['V'].isin(['mTRAV14D-1','TRAV251'])]
# This identifier was changed because it caused some problems during further parsing, but it has no effect on the further steps since the allele is dropped
McPAS_data = McPAS_data.replace({"V": { 'TRAV24:01/TRAV24:02' : "TRAV24"}})
McPAS_data.replace('TRAV','',regex=True, inplace = True)

# In the second step the allele is split from the gene and family, by using the : character 
McPAS_data[['V','V_allele']] = McPAS_data.V.str.split(":",expand=True)
McPAS_data = McPAS_data.drop('V_allele',1)

# All families where only one gene is possible are supplemented with the gene, because they are not always given in the database
McPAS_data['V'] = McPAS_data['V'].str.zfill(2)
McPAS_data = McPAS_data.replace({"V": { '02' : "02-01", '03' : "03-01", "04" : "04-01", "05" : "05-01", "06" : "06-01", "07" : "07-01", "10" : "10-01", "15" : "15-01", "16" : "16-01", "17" : "17-01", "18" : "18-01", "19" : "19-01", "20" : "20-01", "21" : "21-01", "22" : "22-01", "24" : "24-01", "25" : "25-01", "27" : "27-01", "28" : "28-01", "30" : "30-01", "31" : "31-01", "32" : "32-01", "33" : "33-01", "34" : "34-01", "35" : "35-01", "37" : "37-01", "39" : "39-01", "40" : "40-01", "41" : "41-01", "46" : "46-01"}})

# Split off the V_family
McPAS_data[['V_family','V_gene_1','V_gene_2']] = McPAS_data.V.str.split ("-|/",expand=True)

# Fill V_family to 2 digits
McPAS_data['V_family'] = McPAS_data['V_family'].str.zfill(2)
# split gene and family, fill both to 2 digits
McPAS_data[['V_1','V_2','V_3']] = McPAS_data.V.str.split ("(-)",expand=True)
McPAS_data['V_1'] = McPAS_data['V_1'].str.zfill(2)
McPAS_data['V_3'] = McPAS_data['V_3'].str.zfill(2)
# Put family and gene back together
McPAS_data = McPAS_data.assign(V = McPAS_data.V_1.astype(str) + McPAS_data.V_2.astype(str) + McPAS_data.V_3.astype(str))
McPAS_data.replace('nan','',regex=True, inplace = True)

# Delete rows without a gene
# The third character of each V_gene is given, when this character is NaN, only the family is present and no info of the gene is present so the row is deleted
McPAS_data['V_gene_present'] = McPAS_data['V'].str[2]
McPAS_data = McPAS_data.loc[McPAS_data['V_gene_present'].str.contains(r'[NaN]') == False]

# Delete columns which are no longer needed
McPAS_data = McPAS_data.drop('V_gene_1',1)
McPAS_data = McPAS_data.drop('V_gene_2',1)
# column V is renamed to V_gene, so the name of this columns is the same in all datafiles
McPAS_data = McPAS_data.rename(columns={'V': 'V_gene'})
McPAS_data.head()

Unnamed: 0,CDR3,Species,antigen_identification_method,Epitope,V_gene,Mouse_strain,Gene,J_gene,V_family,V_1,V_2,V_3,V_gene_present
0,CAVTIGFGNVLHCGSGTQVIVLPHIQ,Human,2.4,EAAGIGILTV,02-01,,TRA,35,2,2,-,1,-
1,CASGGGADGLTFPYIQF,Human,2.4,EAAGIGILTV,02-01,,TRA,45,2,2,-,1,-
2,CAASPPESGGYNKLIF,Human,2.4,EAAGIGILTV,21-01,,TRA,4,21,21,-,1,-
3,CAAYYGGSOGNLIF,Human,2.4,EAAGIGILTV,21-01,,TRA,42,21,21,-,1,-
4,CAVSRGGGADGLTF,Human,2.4,EAAGIGILTV,02-01,,TRA,45,2,2,-,1,-


In [128]:
V = McPAS_data['V_gene'].unique()
print(V)

['02-01' '21-01' '10-01' '08-02' '03-01' '01-01' '25-01' '01-04' '01-02'
 '28-01' '16-01' '15-01' '12-01' '02-03' '04-01' '39-01' '22-01' '26-02'
 '08-03' '18-01' '27-01' '19-01' '02-02' '06-01' '05-01' '09-02' '13-01'
 '08-04' '08-06' '17-01' '40-01' '12-02' '13-02' '08-01' '26-01' '21-02'
 '35-01' '29/DV5' '14/DV4' '20-01' '24-01' '23/DV6' '12-03' '38-2/DV8'
 '41-01' '38-01' '36/DV7' '38-02' '30-01' '34-01' '07-01' '09-01']


In [129]:
# See if CDR3 sequences start with C and end with F
McPAS_data['CDR3_start'] = McPAS_data['CDR3'].str[0] 
McPAS_data['CDR3_end'] = McPAS_data['CDR3'].str[-1]  
# Remove rows with CDR3 sequences which do not start with a C and end with a F or W
McPAS_data = McPAS_data.loc[McPAS_data['CDR3_end'].str.contains(r'[F,W]') == True]
McPAS_data = McPAS_data.loc[McPAS_data['CDR3_start'].str.contains(r'[C]') == True]

# CDR3 sequences with unknown Aminoacids, weird characters, no CDR3 sequence present and lowercase AA are removed 
McPAS_data = McPAS_data.loc[McPAS_data['CDR3'].str.contains(r'[_]') == False]
McPAS_data = McPAS_data.loc[McPAS_data['CDR3'].str.contains(r'[*]') == False]
McPAS_data = McPAS_data.loc[McPAS_data['CDR3'].str.contains(r'[(]') == False]
McPAS_data = McPAS_data.loc[McPAS_data['CDR3'].str.contains(r'[)]') == False]
McPAS_data = McPAS_data.loc[McPAS_data['CDR3'].str.contains(r'[NaN]') == False]
McPAS_data = McPAS_data.loc[McPAS_data['CDR3'].str.contains(r'[a-z]') == False]

start = McPAS_data['CDR3_start'].unique()
print(start)

['C']


In [130]:
end = McPAS_data['CDR3_end'].unique()
print(end)

['F' 'W']


In [131]:
# Delete columns which are no longer needed in further steps
McPAS_data = McPAS_data.drop('CDR3_start',1)
McPAS_data = McPAS_data.drop('CDR3_end',1)
McPAS_data = McPAS_data.drop('V_gene_present',1)
McPAS_data = McPAS_data.drop('V_1',1)
McPAS_data = McPAS_data.drop('V_2',1)
McPAS_data = McPAS_data.drop('V_3',1)
McPAS_data = McPAS_data.drop('Species',1)
McPAS_data = McPAS_data.drop('Mouse_strain',1)
McPAS_data = McPAS_data.drop('antigen_identification_method',1)
McPAS_data = McPAS_data[sorted(McPAS_data.columns)]
McPAS_data.head()

Unnamed: 0,CDR3,Epitope,Gene,J_gene,V_family,V_gene
1,CASGGGADGLTFPYIQF,EAAGIGILTV,TRA,45,2,02-01
4,CAVSRGGGADGLTF,EAAGIGILTV,TRA,45,2,02-01
10,CAVKDSTLTF,EAAGIGILTV,TRA,1,2,02-01
12,CAAPQAGTALIF,AARAVFLAL,TRA,15,8,08-02
13,CTDVSTGGFKTIF,AARAVFLAL,TRA,9,3,03-01


In [132]:
McPAS_data.to_csv('parsed_McPAS.csv', index=True, sep=",")

In [133]:
McPAS_data.shape

(452, 6)

In [134]:
print('Finished')

Finished
