In [454]:
import pandas as pd
import collections
import skbio 
from skbio.diversity.alpha import shannon 
import IPython
import re
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

In [455]:
# Eerst alle stappen uitvoeren met de data van de VDJdb (gedownload op 11 december), splitsen + weergeven van data
vdjdb_data = pd.read_csv("vdjdb_data.csv")

In [456]:
del vdjdb_data ['Unnamed: 0']
vdjdb_data = vdjdb_data[sorted(vdjdb_data.columns)]
vdjdb_data.head()

Unnamed: 0,CDR3,Epitope,Gene,J,V
0,CAVGNNAGNMLTF,NEGVKAAW,TRA,TRAJ39*01,TRAV20*01
1,CAAKTGGGNKLTF,NEGVKAAW,TRA,TRAJ10*01,TRAV29/DV5*01
2,CAAKAGGGNKLTF,NEGVKAAW,TRA,TRAJ10*01,TRAV29/DV5*01
3,CAAITGIGGSQGNLIF,NEGVKAAW,TRA,TRAJ42*01,TRAV13-1*01
4,CAGTDGGATNKLIF,NEGVKAAW,TRA,TRAJ32*01,TRAV36/DV7*01


In [457]:
TRAJ = vdjdb_data ['J'].unique()
TRAV = vdjdb_data ['V'].unique()

In [458]:
print (TRAJ) # via deze methode gekeken hoe de data eruitzag om te splitsen

['TRAJ39*01' 'TRAJ10*01' 'TRAJ42*01' 'TRAJ32*01' 'TRAJ13*01' 'TRAJ8*01'
 'TRAJ47*01' 'TRAJ29*01' 'TRAJ17*01' 'TRAJ38*01' 'TRAJ44*01' 'TRAJ49*01'
 'TRAJ26*01' 'TRAJ34*01' 'TRAJ40*01' 'TRAJ33*01' 'TRAJ23*01' 'TRAJ57*01'
 'TRAJ43*01' 'TRAJ6*01' 'TRAJ20*01' 'TRAJ30*01' 'TRAJ48*01' 'TRAJ31*01'
 'TRAJ45*01' 'TRAJ3*01' 'TRAJ27*01' 'TRAJ36*01' 'TRAJ52*01' 'TRAJ15*01'
 'TRAJ5*01' 'TRAJ4*01' 'TRAJ50*01' 'TRAJ28*01' 'TRAJ53*01' 'TRAJ37*01'
 'TRAJ22*01' 'TRAJ41*01' 'TRAJ9*01' 'TRAJ12*01' 'TRAJ11*01' 'TRAJ46*01'
 'TRAJ54*01' 'TRAJ18*01' 'TRAJ21*01' 'TRAJ56*01' 'TRAJ7*01' 'TRAJ14*01'
 'TRAJ13*02' 'TRAJ24*02']


In [459]:
print (TRAV) 

['TRAV20*01' 'TRAV29/DV5*01' 'TRAV13-1*01' 'TRAV36/DV7*01' 'TRAV19*01'
 'TRAV8-2*01' 'TRAV27*01' 'TRAV8-6*01' 'TRAV35*01' 'TRAV9-2*01'
 'TRAV23/DV6*01' 'TRAV38-2/DV8*01' 'TRAV18*01' 'TRAV8-3*01' 'TRAV22*01'
 'TRAV13-2*01' 'TRAV21*01' 'TRAV8-4*01' 'TRAV5*01' 'TRAV12-2*01'
 'TRAV38-1*01' 'TRAV12-1*01' 'TRAV1-2*01' 'TRAV39*01' 'TRAV16*01'
 'TRAV17*01' 'TRAV10*01' 'TRAV24*01' 'TRAV41*01' 'TRAV12-3*01' 'TRAV3*01'
 'TRAV8-1*01' 'TRAV26-2*01' 'TRAV4*01' 'TRAV14/DV4*01' 'TRAV2*01'
 'TRAV30*01' 'TRAV25*01' 'TRAV26-1*01' 'TRAV1-1*01' 'TRAV34*01'
 'TRAV40*01' 'TRAV6*01' 'TRAV9-1*01' 'TRAV7*01' 'TRAV14/DV4*02'
 'TRAV8-6*02']


In [460]:
# splitsen van de J-kolom, eerst wordt TRAJ verwijderd, daarna zal het allel worden afgesplitst en in een andere kolom worden geplaatst, de oorspronkelijke kolom zal worden verwijderd en de data wordt aangevuld zodat zowel de family als het allele steeds 2 digits heeft
vdjdb_data.replace('TRAJ','',regex=True, inplace = True)
vdjdb_data[['J_family','J_allele']] = vdjdb_data.J.str.split ("*",expand=True)
del vdjdb_data ['J']
# add a leading 0 to all families with only one digit
vdjdb_data['J_family'] = vdjdb_data['J_family'].str.zfill(2)
vdjdb_data.head()

Unnamed: 0,CDR3,Epitope,Gene,V,J_family,J_allele
0,CAVGNNAGNMLTF,NEGVKAAW,TRA,TRAV20*01,39,1
1,CAAKTGGGNKLTF,NEGVKAAW,TRA,TRAV29/DV5*01,10,1
2,CAAKAGGGNKLTF,NEGVKAAW,TRA,TRAV29/DV5*01,10,1
3,CAAITGIGGSQGNLIF,NEGVKAAW,TRA,TRAV13-1*01,42,1
4,CAGTDGGATNKLIF,NEGVKAAW,TRA,TRAV36/DV7*01,32,1


In [461]:
# Splitsen van de V kolom, eerst zal de TRAV worden verwijderd en zal terug het allele worden afgesplitst en in een nieuwe kolom worden geplaatst
# daarna de gene van de family scheiden 
vdjdb_data.replace('TRAV','',regex=True, inplace = True)
vdjdb_data[['V','V_allele']] = vdjdb_data.V.str.split ("*",expand=True) # eerst allele afsplitsen, komt na de *
# dan in 3 stappen de gene van de family scheiden
vdjdb_data[['V_family','V_gene_1']] = vdjdb_data.V.str.split ("-",expand=True) # eerst scheiden op -
vdjdb_data[['V_family','V_gene_2']] = vdjdb_data.V_family.str.split ("/",expand=True) # dan op /
vdjdb_data.replace(np.nan,'', regex=True )
# ten slotte de 2 gene kolommen nog mergen en de andere 2 verwijderen
vdjdb_data = vdjdb_data.assign(V_gene = vdjdb_data.V_gene_1.astype(str) + vdjdb_data.V_gene_2.astype(str))

In [462]:
vdjdb_data.replace('nan','',regex=True, inplace = True)

In [463]:
del vdjdb_data ['V']
del vdjdb_data ['V_gene_1']
del vdjdb_data ['V_gene_2']

In [464]:
# add a leading 0 to all Values with only one digit
vdjdb_data['V_family'] = vdjdb_data['V_family'].str.zfill(2)
vdjdb_data['V_gene'] = vdjdb_data['V_gene'].str.zfill(2)

In [465]:
vdjdb_data = vdjdb_data[sorted(vdjdb_data.columns)]
vdjdb_data.head()
# nog vragen: als er geen V_gene is gegeven, moet dit dan aangevuld worden tot 00 of gewoon leeggelaten worden?

Unnamed: 0,CDR3,Epitope,Gene,J_allele,J_family,V_allele,V_family,V_gene
0,CAVGNNAGNMLTF,NEGVKAAW,TRA,1,39,1,20,00
1,CAAKTGGGNKLTF,NEGVKAAW,TRA,1,10,1,29,DV5
2,CAAKAGGGNKLTF,NEGVKAAW,TRA,1,10,1,29,DV5
3,CAAITGIGGSQGNLIF,NEGVKAAW,TRA,1,42,1,13,01
4,CAGTDGGATNKLIF,NEGVKAAW,TRA,1,32,1,36,DV7


In [466]:
# Enkel de epitopen overhouden die 30 keer of meer voorkomen,
vdjdb_data.drop_duplicates
vdjdb_pos = vdjdb_data.groupby('Epitope').filter(lambda x: len(x)> 30) 

In [467]:
## Data exploration (gekopieerd uit de code van de paper + aangevuld voor de allelen)
classes = vdjdb_pos['Epitope'].unique()
stats = collections.defaultdict(list)
for peptide in classes:
    pep_data = vdjdb_pos[vdjdb_pos['Epitope'] == peptide]
    
    stats['total TCRAs'].append(pep_data.shape[0])
    stats['unique CDR3'].append(len(set(pep_data['CDR3'])))
    
    cdr3_count = list(collections.Counter(pep_data['CDR3']).values())
    stats['CDR3 diversity'].append(shannon(cdr3_count))
    
    stats['unique V family'].append(len(pep_data['V_family'].unique()))
    stats['unique J family'].append(len(pep_data['J_family'].unique()))
    stats['unique V gene'].append(len(pep_data['V_gene'].unique()))
    stats['unique J allele'].append(len(pep_data['J_allele'].unique()))
    stats['unique V allele'].append(len(pep_data['V_allele'].unique()))
    v_fam_count = list(collections.Counter(pep_data['V_family']).values())
    j_fam_count = list(collections.Counter(pep_data['J_family']).values())
    v_gene_count = list(collections.Counter(pep_data['V_gene']).values())
    j_allele_count = list(collections.Counter(pep_data['J_allele']).values())
    v_allele_count = list(collections.Counter(pep_data['V_allele']).values())
    stats['V family diversity'].append(shannon(v_fam_count))
    stats['J family diversity'].append(shannon(j_fam_count))
    stats['V gene diversity'].append(shannon(v_gene_count))
    stats['J allele diversity'].append(shannon(j_allele_count))
    stats['V allele diversity'].append(shannon(v_allele_count))

pd.set_option('display.max_columns', None)
IPython.display.display(pd.DataFrame(stats, index=classes, columns=stats.keys()).T)
# Unique J en V allele zeggen hier niet zo veel omdat deze bijna altijd alleen 01 of 02 zijn

Unnamed: 0,NEGVKAAW,KLGGALQAK,ELAGIGILTV,IVTDFSVIK,AVFDRKSDAK,RAKFKQLL,GILGFVFTL,SLFNTVATLY,RLRAEAQVK,AYAQKIFKI,GLCTLVAML,LLFGYPVYV,FLYALALLL,FLASKIGRLV,RTLNAWVKV,FLRGRAYGL,NLVPMVATV,CINGVCWTV,LLWNGPMAV,KLVALGINAV,DPFRLLQNSQVFS,GLIYNRMGAVTTEV,PKYVKQNTLKLAT,QARQMVQAMRTIGTHP,SGPLKAEIAQRLED,GMFNMLSTVLGVS,FRDYVDRFYKTLRAEQASQE,LLLGIGILV,KAFSPEVIPMF,DATYQRTRALVR,YLQPRTFLL
total TCRAs,301.0,13401.0,361.0,710.0,1687.0,1213.0,4472.0,37.0,413.0,39.0,311.0,32.0,38.0,31.0,44.0,41.0,2212.0,74.0,299.0,46.0,43.0,128.0,141.0,111.0,63.0,72.0,248.0,376.0,31.0,112.0,325.0
unique CDR3,118.0,11426.0,332.0,519.0,1534.0,642.0,3004.0,37.0,390.0,39.0,190.0,32.0,22.0,31.0,44.0,29.0,2093.0,71.0,247.0,45.0,42.0,119.0,139.0,102.0,62.0,65.0,141.0,345.0,22.0,88.0,262.0
CDR3 diversity,6.15965,13.212909,8.318918,8.044908,10.433803,8.09786,10.818358,5.209453,8.563207,5.285402,6.908826,5.0,3.812945,4.954196,5.459432,4.448181,10.948642,6.128372,7.743769,5.480084,5.379753,6.859375,7.111183,6.542164,5.945534,5.975481,6.660757,8.380361,4.324849,6.322849,7.817597
unique V family,26.0,33.0,24.0,31.0,32.0,33.0,33.0,16.0,32.0,18.0,25.0,13.0,11.0,14.0,25.0,15.0,33.0,11.0,28.0,10.0,17.0,23.0,27.0,18.0,17.0,17.0,10.0,27.0,7.0,20.0,23.0
unique J family,38.0,48.0,46.0,46.0,47.0,46.0,48.0,25.0,46.0,24.0,43.0,25.0,11.0,25.0,27.0,22.0,46.0,34.0,44.0,25.0,22.0,37.0,42.0,31.0,31.0,34.0,27.0,43.0,9.0,35.0,37.0
unique V gene,10.0,11.0,9.0,11.0,11.0,11.0,11.0,8.0,11.0,8.0,10.0,8.0,5.0,7.0,10.0,7.0,11.0,8.0,10.0,7.0,7.0,10.0,10.0,9.0,6.0,8.0,3.0,11.0,6.0,7.0,8.0
unique J allele,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0
unique V allele,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
V family diversity,3.832061,4.471825,1.239394,4.247684,4.420575,3.745162,4.119155,3.677804,4.478998,3.759511,3.014938,3.237301,2.034864,3.619226,4.388536,3.389675,4.576203,2.495234,2.271051,1.864106,3.631444,3.540532,4.075117,2.856583,2.830259,3.308998,1.011571,2.641717,2.137972,3.478539,1.219611
J family diversity,4.477435,5.390525,5.01356,5.007671,5.392408,4.711714,4.246334,4.465946,5.320349,4.355539,4.594717,4.51532,1.926574,4.518397,4.600922,4.051482,5.3848,4.762461,4.615085,4.246282,4.166246,4.827183,5.110287,4.383797,4.677688,4.65751,3.357188,4.834619,2.382494,4.711158,3.825526


In [468]:
## Zelfde voor de data van McPAS
McPAS_data = pd.read_csv("McPAS_data.csv")

In [469]:
del McPAS_data['Unnamed: 0']
McPAS_data.head()

Unnamed: 0,CDR3,Species,Epitope gene,Epitope,MHC A,V,J,Gene
0,CAVTIGFGNVLHCGSGTQVIVLPHIQ,Human,Melan-A/MART-1,EAAGIGILTV,HLA-A*02,TRAV2,TRAJ35,TRA
1,CASGGGADGLTFPYIQF,Human,Melan-A/MART-1,EAAGIGILTV,HLA-A*02,TRAV2,TRAJ45,TRA
2,CAASPPESGGYNKLIF,Human,Melan-A/MART-1,EAAGIGILTV,HLA-A*02,TRAV21,TRAJ4,TRA
3,CAAYYGGSOGNLIF,Human,Melan-A/MART-1,EAAGIGILTV,HLA-A*02,TRAV21,TRAJ42,TRA
4,CAVSRGGGADGLTF,Human,Melan-A/MART-1,EAAGIGILTV,HLA-A*02,TRAV2,TRAJ45,TRA


In [470]:
TRAJ = McPAS_data ['J'].unique()
TRAV = McPAS_data ['V'].unique()
print (TRAJ) # gebruikt niet de standaard naamgeving


['TRAJ35' 'TRAJ45' 'TRAJ4' 'TRAJ42' 'TRAJ47' 'TRAJ9' 'TRAJ39' 'TRAJ48'
 'TRAJ1' 'TRAJ5-1' 'TRAJ15' nan 'TRAJ2' 'TRAJ5' 'TRAJ49' 'TRAJ54' 'TRAJ21'
 'TRAJ53' 'TRAJ40' 'TRAJ36' 'TRAJ38' 'TRAJ26' 'TRAJ3' 'TRAJ37' 'TRAJ11'
 'TRAJ31' 'TRAJ34' 'TRAJ23' 'TRAJ16' 'TRAJ1-3' 'TRAJ9-1' 'TRAJ16-5'
 'TRAJ1-8' 'TRAJ10-1' 'TRAJ3-2' 'TRAJ9-14' 'TRAJ14-1' 'TRAJ16-1' 'TRAJ33'
 'TRAJ52' 'TRAJ44' 'TRAJ43' 'TRAJ50' 'TRAJ20' 'TRAJ58' 'TRAJ6' 'TRAJ57'
 'TRAJ8' 'TRAJ13' 'TRAJ29' 'TRAJ27' 'TRAJ30' 'TRAJ32' 'TRAJ10' 'TRAJ24'
 'TRAJ7' 'TRAJ28' 'TRAJ56' 'TRAJ37-2' 'TRAJ41' 'TRAJ17' 'TRAJ22' 'TRAJ12'
 'TRAJ4-01' 'TRAJ5-01' 'TRAJ3-01' 'TRAJ1-01' 'TRAJ24:02' 'TRAJ2-01'
 'TRAJ9-01' 'TRAJ6-01' 'TRAJ8-01' 'TRAJ13:02' 'TRAJ49:01' 'TRAJ20:01'
 'TRAJ42:01' 'TRAJ24:01' 'TRAJ21:01' 'TRAJ7:01' 'TRAJ39:01' 'TRAJ36:01'
 'TRAJ54:01' 'TRAJ16:01' 'TRAJ58:01' 'TRAJ18' 'TRAJ53:01' 'TRAJ53:02'
 'TRAJ53:05' 'TRAJ43:01' 'TRAJ2-1' 'TRAJ41:01' 'TRAJ3-1' 'TRAJ34:01'
 'TRAJ31:01' 'TRAJ44:01' 'TRAJ38:01' 'TRAJ17:01' 'TRAJ48:01' 'TRAJ9:01'
 

In [471]:
print (TRAV)

['TRAV2' 'TRAV21' 'TRAV10' 'TRAV8' 'TRAV8-2' 'TRAV3' 'TRAV9' 'TRAV1-1'
 'TRAV25' 'TRAV1-4' 'TRAV1-2' 'TRAV28' 'TRAV16' 'TRAV2-1' 'TRAV26'
 'TRAV39' 'TRAV15' 'TRAV20' 'TRAV12-1' 'TRAV2-3' 'TRAV4-1' 'TRAV15-1'
 'TRAV22' 'TRAV26-2' 'TRAV8-3' 'TRAV18' 'TRAV4' 'TRAV23' 'TRAV27' 'TRAV19'
 'TRAV2-2' 'TRAV6' 'TRAV12' nan 'TRAV5' 'TRAV14' 'TRAV12-4' 'TRAV38-2'
 'TRAV24' 'TRAV17' 'TRAV38-1' 'TRAV29' 'TRAV9-2' 'TRAV31' 'TRAV1' 'TRAV11'
 'TRAV1-01' 'TRAV13-1' 'TRAV8-4' 'TRAV8-6' 'TRAV251' 'TRAV40' 'TRAV12-2'
 'TRAV21-1' 'TRAV13-2' 'TRAV8-1' 'TRAV26-1' 'TRAV21-2' 'TRAV3-1'
 'TRAV35-1' 'TRAV29/DV5' 'TRAV14/DV4' 'TRAV23/DV6' 'TRAV12-2, TRAV21'
 'TRAV12-3' 'TRAV2-01' 'TRAV35:01' 'TRAV12-3:01' 'TRAV25:01' 'TRAV3-01'
 'TRAV10-01' 'TRAV1-2:01' 'TRAV14/DV4:01' 'TRAV29/DV5:01' 'TRAV1-1:01'
 'TRAV6-01' 'TRAV38-2/DV8:01' 'TRAV13-2:01' 'TRAV5-01' 'TRAV23/DV6:01'
 'TRAV9-2:01' 'TRAV41:01' 'TRAV38-1:01' 'TRAV14/DV4:02' 'TRAV8-1:01'
 'TRAV12-2:01' 'TRAV22:01' 'TRAV8-6:02' 'TRAV8-6:01' 'TRAV8-3:01'
 'TRAV13-1:01'

In [472]:
McPAS_data.replace('TRAJ','',regex=True, inplace = True)
McPAS_data[['J_family','J_allele_1']] = McPAS_data.J.str.split ("-",expand=True)
McPAS_data[['J_family','J_allele_2']] = McPAS_data.J_family.str.split (":",expand=True)
#McPAS_data.rename(index={J: 'J_family'})
del McPAS_data ['J']
McPAS_data.replace(np.nan,'', regex=True )
# ten slotte de 2 gene kolommen nog mergen en de andere 2 verwijderen
McPAS_data = McPAS_data.assign(J_allele = McPAS_data.J_allele_1.astype(str) + McPAS_data.J_allele_2.astype(str))


In [473]:
McPAS_data.replace('nan','',regex=True, inplace = True)
del McPAS_data ['J_allele_1']
del McPAS_data ['J_allele_2']

In [474]:
McPAS_data.head()

Unnamed: 0,CDR3,Species,Epitope gene,Epitope,MHC A,V,Gene,J_family,J_allele
0,CAVTIGFGNVLHCGSGTQVIVLPHIQ,Human,Melan-A/MART-1,EAAGIGILTV,HLA-A*02,TRAV2,TRA,35,
1,CASGGGADGLTFPYIQF,Human,Melan-A/MART-1,EAAGIGILTV,HLA-A*02,TRAV2,TRA,45,
2,CAASPPESGGYNKLIF,Human,Melan-A/MART-1,EAAGIGILTV,HLA-A*02,TRAV21,TRA,4,
3,CAAYYGGSOGNLIF,Human,Melan-A/MART-1,EAAGIGILTV,HLA-A*02,TRAV21,TRA,42,
4,CAVSRGGGADGLTF,Human,Melan-A/MART-1,EAAGIGILTV,HLA-A*02,TRAV2,TRA,45,


In [475]:
#McPAS_data.replace('TRAV','',regex=True, inplace = True)
#McPAS_data[['V','V_Fct']] = McPAS_data.V.str.split (" ",expand=True)

In [476]:
McPAS_pos = McPAS_data.groupby('Epitope').filter(lambda x: len(x)> 30)

In [477]:
# negative data
# nog andere data zoeken want deze heeft overal als allele 00 staan, 

# ImmuneACCESSdb : filters set control, human, geen TCRA data beschikbaar
# iReceptor: filters set case-control, human, TRA, en daarna gekeken welke controle data ongeveer de correcte grootte had.
neg_data = pd.read_csv("neg_data.csv")

In [478]:
del neg_data ['Unnamed: 0']
neg_data.head()

Unnamed: 0,sequence,v_call,j_call,locus,junction_aa
0,,TRAV38-2DV8*00,TRAJ47*00,TRA,CAYWEYGNKLVF
1,,TRAV27*00,TRAJ22*00,TRA,CAGAATSGSARQLTF
2,,TRAV20*00,TRAJ6*00,TRA,CAVQAGSYIPTF
3,,TRAV13-2*00,TRAJ16*00,TRA,CAVLTG_GQKLLF
4,,TRAV1-1*00,TRAJ26*00,TRA,CAVTPQGQNFVF


In [479]:
neg_data.replace('TRAJ','',regex=True, inplace = True)
neg_data[['J_family','J_allele']] = neg_data.j_call.str.split ("*",expand=True)
del neg_data ['j_call']
# add a leading 0 to all families with only one digit
neg_data['J_family'] = neg_data['J_family'].str.zfill(2)
neg_data.head()

Unnamed: 0,sequence,v_call,locus,junction_aa,J_family,J_allele
0,,TRAV38-2DV8*00,TRA,CAYWEYGNKLVF,47,0
1,,TRAV27*00,TRA,CAGAATSGSARQLTF,22,0
2,,TRAV20*00,TRA,CAVQAGSYIPTF,6,0
3,,TRAV13-2*00,TRA,CAVLTG_GQKLLF,16,0
4,,TRAV1-1*00,TRA,CAVTPQGQNFVF,26,0


In [480]:
neg_data.replace('TRAV','',regex=True, inplace = True)
neg_data.replace('TRDV','',regex=True, inplace = True)
neg_data[['v_call','V_allele']] = neg_data.v_call.str.split ("*",expand=True) # eerst allele afsplitsen, komt na de *
# dan in 3 stappen de gene van de family scheiden
neg_data[['V_family','V_gene']] = neg_data.v_call.str.split ("-",expand=True) # eerst scheiden op -

In [481]:
del neg_data ['v_call']

In [482]:
# add a leading 0 to all Values with only one digit
neg_data['V_family'] = neg_data['V_family'].str.zfill(2)
neg_data['V_gene'] = neg_data['V_gene'].str.zfill(2)

In [483]:
neg_data = neg_data[sorted(neg_data.columns)]
neg_data.head()

Unnamed: 0,J_allele,J_family,V_allele,V_family,V_gene,junction_aa,locus,sequence
0,0,47,0,38,2DV8,CAYWEYGNKLVF,TRA,
1,0,22,0,27,,CAGAATSGSARQLTF,TRA,
2,0,6,0,20,,CAVQAGSYIPTF,TRA,
3,0,16,0,13,02,CAVLTG_GQKLLF,TRA,
4,0,26,0,1,01,CAVTPQGQNFVF,TRA,


In [484]:
## Feature generation
# gekopieerd uit de code van de paper

# physico-chemical amino acid properties
# = dictionaries
basicity = {'A': 206.4, 'B': 210.7, 'C': 206.2, 'D': 208.6, 'E': 215.6, 'F': 212.1, 'G': 202.7,
            'H': 223.7, 'I': 210.8, 'K': 221.8, 'L': 209.6, 'M': 213.3, 'N': 212.8, 'P': 214.4,
            'Q': 214.2, 'R': 237.0, 'S': 207.6, 'T': 211.7, 'V': 208.7, 'W': 216.1, 'X': 210.2,
            'Y': 213.1, 'Z': 214.9}

hydrophobicity = {'A': 0.16, 'B': -3.14, 'C': 2.50, 'D': -2.49, 'E': -1.50, 'F': 5.00, 'G': -3.31,
                  'H': -4.63, 'I': 4.41, 'K': -5.00, 'L': 4.76, 'M': 3.23, 'N': -3.79, 'P': -4.92,
                  'Q': -2.76, 'R': -2.77, 'S': -2.85, 'T': -1.08, 'V': 3.02, 'W': 4.88, 'X': 4.59,
                  'Y': 2.00, 'Z': -2.13}

helicity = {'A': 1.24, 'B': 0.92, 'C': 0.79, 'D': 0.89, 'E': 0.85, 'F': 1.26, 'G': 1.15, 'H': 0.97,
            'I': 1.29, 'K': 0.88, 'L': 1.28, 'M': 1.22, 'N': 0.94, 'P': 0.57, 'Q': 0.96, 'R': 0.95,
            'S': 1.00, 'T': 1.09, 'V': 1.27, 'W': 1.07, 'X': 1.29, 'Y': 1.11, 'Z': 0.91}

mutation_stability = {'A': 13, 'C': 52, 'D': 11, 'E': 12, 'F': 32, 'G': 27, 'H': 15, 'I': 10,
                      'K': 24, 'L': 34, 'M':  6, 'N':  6, 'P': 20, 'Q': 10, 'R': 17, 'S': 10,
                      'T': 11, 'V': 17, 'W': 55, 'Y': 31}

physchem_properties = {'basicity': basicity, 'hydrophobicity': hydrophobicity,
                       'helicity': helicity, 'mutation stability': mutation_stability}