In [1]:
import glob

import numpy as np
import pandas as pd
import sklearn as skl
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb

# Notebook-wide plotting defaults: white grid background, fonts scaled by 1.2
# and a 10x8 default figure size.
sns.set(
    style='whitegrid',
    font_scale=1.2,
    rc={'figure.figsize': (10, 8)},
)

In [2]:
# Twenty one-letter residue codes that may appear in a peptide sequence.
PROTEIN_ALPHABET = 'ACDEFGHIKLMNPQRSTVWY'
# Extra letter (not part of the alphabet above) used to pad short sequences.
PROTEIN_PLACEHOLDER = 'Z'
# Every symbol a padded sequence can contain: the alphabet followed by the
# placeholder.
PROTEIN_LETTERS = ''.join([PROTEIN_ALPHABET, PROTEIN_PLACEHOLDER])

In [3]:
# Load the raw tab-separated measurement table into `df`.
# NOTE(review): the file name suggests MHC class I binding data - confirm.
df = pd.read_csv(
    '../data/bdata/bdata.20130222.mhci.txt',
    sep='\t',
)

In [4]:
# Keep only the rows whose `species` column equals 'human'.
df = df.loc[df['species'] == 'human']

In [5]:
# Preview the first five rows of the human-only table.
df.head(n=5)

Unnamed: 0,species,mhc,peptide_length,sequence,inequality,meas
5009,human,HLA-A*01:01,8,ASFCGSPY,=,51.4
5010,human,HLA-A*01:01,8,LTDFGLSK,=,739.385479
5011,human,HLA-A*01:01,8,FTSFFYRY,=,1285.0
5012,human,HLA-A*01:01,8,KSVFNSLY,=,1466.0
5013,human,HLA-A*01:01,8,RDWAHNSL,=,1804.675523


In [6]:
# Drop everything except the `mhc`, `sequence` and `meas` columns.
# A list of labels is used as the column selector, keeping all rows.
df = df.loc[:, ['mhc', 'sequence', 'meas']]

In [7]:
# Preview the first five rows again to check which columns remain.
df.head(n=5)

Unnamed: 0,mhc,sequence,meas
5009,HLA-A*01:01,ASFCGSPY,51.4
5010,HLA-A*01:01,LTDFGLSK,739.385479
5011,HLA-A*01:01,FTSFFYRY,1285.0
5012,HLA-A*01:01,KSVFNSLY,1466.0
5013,HLA-A*01:01,RDWAHNSL,1804.675523


In [8]:
# Print every distinct value of the `mhc` column, one per line.
print('\n'.join(df['mhc'].unique()))

HLA-A*01:01
HLA-A*02:01
HLA-A*02:02
HLA-A*02:03
HLA-A*02:04
HLA-A*02:05
HLA-A*02:06
HLA-A*02:07
HLA-A*02:10
HLA-A*02:11
HLA-A*02:12
HLA-A*02:16
HLA-A*02:17
HLA-A*02:19
HLA-A*02:50
HLA-A*03:01
HLA-A*03:02
HLA-A*03:19
HLA-A*11:01
HLA-A*11:02
HLA-A*23:01
HLA-A*24:02
HLA-A*24:03
HLA-A*25:01
HLA-A*26:01
HLA-A*26:02
HLA-A*26:03
HLA-A*29:02
HLA-A*30:01
HLA-A*30:02
HLA-A*31:01
HLA-A*32:01
HLA-A*32:07
HLA-A*32:15
HLA-A*33:01
HLA-A*66:01
HLA-A*68:01
HLA-A*68:02
HLA-A*68:23
HLA-A*69:01
HLA-A*74:01
HLA-A*80:01
HLA-A1
HLA-A11
HLA-A2
HLA-A24
HLA-A26
HLA-A3
HLA-A3/11
HLA-B*07:02
HLA-B*08:01
HLA-B*08:02
HLA-B*08:03
HLA-B*14:01
HLA-B*14:02
HLA-B*15:01
HLA-B*15:02
HLA-B*15:03
HLA-B*15:09
HLA-B*15:17
HLA-B*15:42
HLA-B*18:01
HLA-B*27:01
HLA-B*27:02
HLA-B*27:03
HLA-B*27:04
HLA-B*27:05
HLA-B*27:06
HLA-B*27:10
HLA-B*27:20
HLA-B*35:01
HLA-B*35:03
HLA-B*35:08
HLA-B*37:01
HLA-B*38:01
HLA-B*39:01
HLA-B*40:01
HLA-B*40:02
HLA-B*40:13
HLA-B*42:01
HLA-B*42:02
HLA-B*44:02
HLA-B*44:03
HLA-B*45:01
HLA-B*45:06
HLA-B*46:

In [9]:
# Count how many rows there are for each sequence length.
df['sequence'].map(len).value_counts()

9     122089
10     28731
8       2792
11      2565
13       376
15       272
14       219
12       203
18        38
17        38
30         1
16         1
Name: sequence, dtype: int64

In [10]:
# Inclusive bounds on the sequence lengths kept by the length filter below.
# Lengths 8-11 are the four most frequent ones in the length counts above.
seq_minlen, seq_maxlen = 8, 11

In [11]:
# Keep only the rows whose sequence length lies within
# [seq_minlen, seq_maxlen], both ends included.
df = df[df['sequence'].map(len).between(seq_minlen, seq_maxlen)]

In [12]:
# Right-pad every sequence with PROTEIN_PLACEHOLDER so that all of them are
# exactly seq_maxlen characters long and can be indexed position by position.
#
# `.str.ljust` is the vectorised equivalent of calling str.ljust on each value.
# `assign` returns a new frame rather than writing a column into the frame
# produced by the boolean filtering above, a pattern for which pandas may emit
# SettingWithCopyWarning.
df = df.assign(sequence=df.sequence.str.ljust(seq_maxlen, PROTEIN_PLACEHOLDER))

In [13]:
# One-hot encode the padded sequences: for every position and every letter in
# PROTEIN_LETTERS create a float column that is 1.0 where the sequence has that
# letter at that position and 0.0 otherwise.
#
# Column names are 'fp' + the 1-based position zero-padded to the width of
# seq_maxlen + the letter (e.g. 'fp01A'). `train_features` lists them in
# creation order (position-major, letters in PROTEIN_LETTERS order).
train_features = []
onehot = {}
for pos in range(seq_maxlen):
    colname_prefix = 'fp' + str(pos + 1).zfill(len(str(seq_maxlen)))
    # Extract the character at this position from all sequences once, instead
    # of running a Python-level lambda over every row for every letter.
    # Sequences shorter than pos + 1 yield NaN here and therefore 0.0 below.
    residues = df.sequence.str[pos]
    for aminoacid in PROTEIN_LETTERS:
        colname = colname_prefix + aminoacid
        train_features.append(colname)
        onehot[colname] = (residues == aminoacid).astype(float)

# Attach all indicator columns in a single concat rather than inserting them
# one at a time into `df`. Columns with the same names left over from a
# previous run of this cell are dropped first, so re-running replaces them
# instead of duplicating them.
df = pd.concat(
    [
        df.drop(train_features, axis=1, errors='ignore'),
        pd.DataFrame(onehot, columns=train_features),
    ],
    axis=1,
)

In [14]:
# Append one indicator column per distinct `mhc` value, named 'fmhc_<value>'.
#
# The recorded output of this notebook shows these indicators as 0.0/1.0
# floats, the same as the positional 'fp...' features. The dtype returned by
# get_dummies depends on the pandas version, so cast explicitly to float to
# keep the feature matrix homogeneous regardless of the installed version.
df = df.join(pd.get_dummies(df.mhc, prefix='fmhc').astype(float))

In [15]:
# Write the encoded table to an HDF5 file under the key 'table'.
df.to_hdf('../data/combined.h5', key='table')

In [16]:
# Show the encoded frame as the cell result. In the recorded output the middle
# columns are elided with '...', so only the first and last columns are visible.
df

Unnamed: 0,mhc,sequence,meas,fp01A,fp01C,fp01D,fp01E,fp01F,fp01G,fp01H,...,fmhc_HLA-C*07:01,fmhc_HLA-C*07:02,fmhc_HLA-C*08:02,fmhc_HLA-C*12:03,fmhc_HLA-C*14:02,fmhc_HLA-C*15:02,fmhc_HLA-Cw1,fmhc_HLA-Cw4,fmhc_HLA-E*01:01,fmhc_HLA-E*01:03
5009,HLA-A*01:01,ASFCGSPYZZZ,51.400000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5010,HLA-A*01:01,LTDFGLSKZZZ,739.385479,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5011,HLA-A*01:01,FTSFFYRYZZZ,1285.000000,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5012,HLA-A*01:01,KSVFNSLYZZZ,1466.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5013,HLA-A*01:01,RDWAHNSLZZZ,1804.675523,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5014,HLA-A*01:01,FSSCPVAYZZZ,1939.466630,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5015,HLA-A*01:01,RNWAHSSLZZZ,2201.794454,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5016,HLA-A*01:01,LSCAASGFZZZ,2830.055894,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5017,HLA-A*01:01,LASIDLKYZZZ,3464.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5018,HLA-A*01:01,RAKFKQLLZZZ,5000.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
