In [None]:
import os
# Configure Theano through the THEANO_FLAGS environment variable. This is done
# before `import theano` below so the flags are already in place when Theano
# is first imported.
# NOTE(review): `device=gpu1` and `cuda.root=/usr/local/cuda` are specific to
# one machine; adjust (or drop) them when running elsewhere.
os.environ["THEANO_FLAGS"] = 'cuda.root=/usr/local/cuda,floatX=float32,device=gpu1,force_device=False,lib.cnmem=.75'

import theano
# Report the device from Theano's config as a sanity check on the flags above.
print(theano.config.device)

import mhcflurry, seaborn, numpy, pandas, pickle, sklearn, collections, scipy, time
import mhcflurry.dataset
import fancyimpute, locale

import sklearn.metrics
import sklearn.cross_validation

def print_full(x):
    """
    Print `x` with pandas' row truncation lifted so every row is shown.

    `display.max_rows` is raised to `len(x)` only for the duration of the
    print. `option_context` restores the previous setting on exit, even if
    printing raises, and restores the caller's value rather than the library
    default.

    Parameters
    ----------
    x : object supporting `len()`; typically a pandas DataFrame or Series.
    """
    with pandas.option_context('display.max_rows', len(x)):
        print(x)
    
%matplotlib inline


In [2]:
# Notebook-wide settings; later cells read these as globals.

# Directory (with trailing separator) that holds the input data files.
data_dir = "../data/"

# Alleles with fewer training measurements than this are not modelled.
min_peptides_to_consider_allele = 10

# IC50 ceiling handed to the predictor and to the regression-target transform.
max_ic50 = 50000

In [3]:
# Load the training measurements. os.path.join keeps the path valid whether or
# not `data_dir` ends with a path separator.
all_train_data = mhcflurry.dataset.Dataset.from_csv(
    os.path.join(data_dir, "bdata.2009.mhci.public.1.txt"))

In [4]:
# Fill in missing affinities in the training data with MICE. The imputed
# dataset is only used later as an optional pretraining set when fitting the
# per-allele models.
# NOTE(review): no random seed is set here; if MICE is stochastic the imputed
# values will differ between runs - confirm and seed if reproducibility matters.
imputed_train_data = all_train_data.impute_missing_values(
    fancyimpute.MICE(n_imputations=250, n_burn_in=50),
    # Peptides / alleles with fewer observations than these minimums are
    # dropped before imputation (see the "Dropping ..." lines in the output).
    min_observations_per_peptide=2,
    min_observations_per_allele=2
)


Dropping 12235 peptides with <2 observations
Dropping 9 alleles with <2 observations: ['ELA-A1', 'HLA-B2701', 'HLA-B3508', 'HLA-B44', 'HLA-E0101', 'Mamu-B04', 'Patr-A0602', 'Patr-B0901', 'Patr-B1701']
[MICE] Completing matrix with shape (19304, 97)
[MICE] Starting imputation round 1/300, elapsed time 0.024
[MICE] Starting imputation round 2/300, elapsed time 1.564
[MICE] Starting imputation round 3/300, elapsed time 2.855
[MICE] Starting imputation round 4/300, elapsed time 4.131
[MICE] Starting imputation round 5/300, elapsed time 5.399
[MICE] Starting imputation round 6/300, elapsed time 6.674
[MICE] Starting imputation round 7/300, elapsed time 7.941
[MICE] Starting imputation round 8/300, elapsed time 9.213
[MICE] Starting imputation round 9/300, elapsed time 10.482
[MICE] Starting imputation round 10/300, elapsed time 11.755
[MICE] Starting imputation round 11/300, elapsed time 13.023
[MICE] Starting imputation round 12/300, elapsed time 14.301
[MICE] Starting imputation round 13/

In [5]:
# Inspect the imputed dataset as a DataFrame (shown as the cell's output).
imputed_train_data.to_dataframe()

Unnamed: 0_level_0,Unnamed: 1_level_0,allele,peptide,affinity,sample_weight
allele,peptide,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Mamu-A07,RELYLNSSNV,Mamu-A07,RELYLNSSNV,1615.211000,1.0
Mamu-A07,ICKAAMGLR,Mamu-A07,ICKAAMGLR,1684.829504,1.0
Mamu-A07,SAVTDRETDV,Mamu-A07,SAVTDRETDV,1691.783274,1.0
Mamu-A07,NSHQRSDSS,Mamu-A07,NSHQRSDSS,1694.957645,1.0
Mamu-A07,STATLCLGHH,Mamu-A07,STATLCLGHH,1682.656229,1.0
Mamu-A07,AIYVFCISLK,Mamu-A07,AIYVFCISLK,1659.680822,1.0
Mamu-A07,KPVDTSNSF,Mamu-A07,KPVDTSNSF,1632.351678,1.0
Mamu-A07,SIYIAVANCV,Mamu-A07,SIYIAVANCV,1631.715646,1.0
Mamu-A07,PWLTEKEAM,Mamu-A07,PWLTEKEAM,1698.113597,1.0
Mamu-A07,PSLPSPSR,Mamu-A07,PSLPSPSR,1678.871225,1.0


In [6]:
# Validation set: one row per (allele, peptide) with the measured affinity
# (`meas`) and the predictions of the existing tools it is compared against.
# The path is built from `data_dir` like the other inputs, not hard-coded.
validation_df = pandas.read_csv(
    os.path.join(data_dir, "combined_test_BLIND_dataset_from_kim2013.csv"))
validation_df

Unnamed: 0,allele,peptide,length,meas,netmhc,netmhcpan,smmpmbec_cpp
0,H-2-DB,AAACNVATA,9,657.657837,154.881662,711.213514,438.530698
1,H-2-DB,AAFEFVYV,8,30831.879502,6456.542290,785.235635,10351.421667
2,H-2-DB,AAFVNDYSL,9,77.446180,17.458222,7.516229,28.054336
3,H-2-DB,AAIANQAAV,9,1.999862,9.638290,9.749896,25.703958
4,H-2-DB,AAIANQAVV,9,1.517050,8.550667,8.336812,28.773984
5,H-2-DB,AAIENYVRF,9,37.844258,252.348077,114.815362,187.068214
6,H-2-DB,AAINFITTM,9,3.155005,199.986187,389.045145,200.909281
7,H-2-DB,AAIPAPPPI,9,3243.396173,1059.253725,493.173804,295.120923
8,H-2-DB,AAKLNRPPL,9,654.636174,66.374307,77.268059,38.459178
9,H-2-DB,AALDMVDAL,9,229.614865,547.015963,597.035287,225.423921


In [7]:
# Number of rows per allele in the validation set and in the training set.
validation_allele_counts = validation_df["allele"].value_counts()
train_allele_counts = all_train_data._df["allele"].value_counts()

print(validation_allele_counts)
print(train_allele_counts)

HLA-A0201    2126
HLA-A2601    1333
HLA-B0801     940
HLA-B5101     854
HLA-B5701     815
HLA-B0702     813
HLA-A0301     811
HLA-A3101     724
HLA-A1101     723
HLA-A0101     696
HLA-A0206     682
HLA-A6802     669
HLA-A3001     660
HLA-A0203     651
HLA-B3901     641
HLA-B1501     633
HLA-B1517     582
HLA-A2402     573
H-2-DB        564
H-2-KB        558
HLA-B3501     542
HLA-A6801     527
HLA-B0802     509
HLA-B1801     503
HLA-B5301     485
HLA-A3301     473
HLA-A6901     470
HLA-B1509     466
HLA-A3201     449
HLA-B5801     445
HLA-B2703     441
HLA-A3002     420
HLA-A2501     416
HLA-A2602     413
HLA-B4402     411
HLA-B4001     407
HLA-A2301     391
Mamu-A02      388
HLA-A8001     379
HLA-B4601     378
HLA-B4403     378
HLA-B3801     351
HLA-B2705     314
HLA-A2603     312
Mamu-A01      274
HLA-B0803     234
H-2-KD        229
HLA-B1503     165
HLA-A0202     126
HLA-A2902     118
HLA-B5401      79
HLA-B4002      74
HLA-B4501      65
Name: allele, dtype: int64
HLA-A0201     9565


In [8]:
# Alleles to model: at least `min_peptides_to_consider_allele` training rows
# and present in the validation set, ordered by training-set size (largest
# first).
alleles = list(train_allele_counts.index[
    (train_allele_counts >= min_peptides_to_consider_allele)
    & (train_allele_counts.index.isin(validation_allele_counts.index))
])
alleles.sort(key=lambda name: -train_allele_counts[name])
alleles

['HLA-A0201',
 'HLA-A0301',
 'HLA-A0203',
 'HLA-A1101',
 'HLA-A0206',
 'HLA-A3101',
 'HLA-A6802',
 'HLA-A0202',
 'HLA-A0101',
 'HLA-B0702',
 'H-2-KB',
 'H-2-DB',
 'HLA-B1501',
 'HLA-A6801',
 'HLA-A3301',
 'HLA-B2705',
 'HLA-A2601',
 'HLA-B4001',
 'HLA-B5801',
 'HLA-A2402',
 'HLA-B3501',
 'HLA-A2902',
 'HLA-B0801',
 'Mamu-A01',
 'HLA-A6901',
 'HLA-B1801',
 'HLA-A3001',
 'HLA-A2301',
 'HLA-B5701',
 'HLA-B5101',
 'HLA-B4402',
 'HLA-A3002',
 'HLA-B4601',
 'HLA-B5401',
 'HLA-B5301',
 'Mamu-A02',
 'HLA-B4403',
 'HLA-B4501',
 'HLA-B3901',
 'HLA-B4002',
 'HLA-B1517',
 'HLA-A8001',
 'HLA-A3201',
 'HLA-A2501',
 'HLA-B0802',
 'H-2-KD',
 'HLA-B2703',
 'HLA-B1503',
 'HLA-B1509',
 'HLA-B0803',
 'HLA-A2603',
 'HLA-A2602',
 'HLA-B3801']

In [16]:
# Hyperparameter grid: every combination of the lists below becomes one model
# configuration, in the order the `for` clauses are written.
n_model_replicates = 1  # how many times the whole grid is repeated
fractions_negative = [0, .2]
impute_options = [False, True]
dropout_probabilities = [0.5]
# (embedding_output_dim, layer_sizes) pairs; e.g. add (8, [4]) to also sweep a
# smaller architecture.
embedding_output_dims_and_layer_sizes_list = [(32, [64])]
activations = ["tanh"]

# A comprehension keeps the loop variables out of the notebook namespace.
models_params_list = [
    dict(
        fraction_negative=fraction_negative,
        impute=impute,
        dropout_probability=dropout_probability,
        embedding_output_dim=embedding_output_dim,
        layer_sizes=layer_sizes,
        activation=activation)
    for replicate in range(n_model_replicates)
    for fraction_negative in fractions_negative
    for impute in impute_options
    for dropout_probability in dropout_probabilities
    for (embedding_output_dim, layer_sizes) in embedding_output_dims_and_layer_sizes_list
    for activation in activations
]

print("%d models" % len(models_params_list))
# Names of all hyperparameters that appear in the grid. `set().union(...)`
# yields an empty set for an empty grid, where `set.union(*[])` would raise.
models_params_explored = set().union(*[set(x) for x in models_params_list])
models_params_explored


4 models


{'activation',
 'dropout_probability',
 'embedding_output_dim',
 'fraction_negative',
 'impute',
 'layer_sizes'}

In [10]:
def make_scores(ic50_y, ic50_y_pred, sample_weight=None, threshold_nm=500):
    """
    Score predicted IC50 values against measured IC50 values.

    Parameters
    ----------
    ic50_y : array-like
        Measured IC50 values.
    ic50_y_pred : array-like
        Predicted IC50 values, aligned with `ic50_y`.
    sample_weight : array-like, optional
        Per-sample weights, forwarded to the AUC and F1 computations. The
        Kendall tau computation does not use them.
    threshold_nm : number
        A value counts as a binder when its IC50 is <= this threshold.

    Returns
    -------
    dict with keys:
        auc : ROC AUC of the binder labels (`ic50_y <= threshold_nm`) against
              the predictions mapped through `ic50_to_regression_target`
              using the notebook-level `max_ic50`.
        f1  : F1 score of predicted vs. measured binder labels.
        tau : Kendall tau between predicted and measured IC50.
    A metric that raises ValueError is reported as NaN instead.
    """
    # `import scipy` alone does not guarantee that the `scipy.stats` submodule
    # is loaded; import it explicitly rather than relying on another library
    # having imported it as a side effect.
    import scipy.stats

    y_pred = mhcflurry.regression_target.ic50_to_regression_target(ic50_y_pred, max_ic50)
    try:
        auc = sklearn.metrics.roc_auc_score(ic50_y <= threshold_nm, y_pred, sample_weight=sample_weight)
    except ValueError:
        auc = numpy.nan
    try:
        f1 = sklearn.metrics.f1_score(ic50_y <= threshold_nm, ic50_y_pred <= threshold_nm, sample_weight=sample_weight)
    except ValueError:
        f1 = numpy.nan
    try:
        tau = scipy.stats.kendalltau(ic50_y_pred, ic50_y)[0]
    except ValueError:
        tau = numpy.nan

    return dict(
        auc=auc,
        f1=f1,
        tau=tau,
    )


In [17]:
# Working copy of the validation table; the training loop adds one
# "mhcflurry <j>" prediction column per model configuration to it.
validation_df_with_mhcflurry = validation_df.copy(deep=True)

# allele -> list of (params, scores) tuples, filled in by the training loop.
models_and_scores = dict()


In [None]:
# train and test models, adding columns to validation_df_with_mhcflurry

# Record the hyperparameter grid next to the results. Row j of this file
# corresponds to the "mhcflurry j" prediction columns. The path comes from
# `data_dir` like the other data files.
pandas.DataFrame(models_params_list).to_csv(
    os.path.join(data_dir, "validation_models.csv"), index=False)

def make_and_fit_model(allele, original_params):
    """
    Build a Class1BindingPredictor for `allele` and fit it.

    `original_params` is one entry of the hyperparameter grid. Its "impute"
    and "fraction_negative" entries steer the training procedure (imputed
    pretraining data on/off, number of random negative samples); all other
    entries are passed to `from_hyperparameters`. The caller's dict is not
    modified.

    Prints the training-set sizes and the wall-clock training time, and
    returns the fitted model.
    """
    # Copy first so the two training-only keys can be removed safely.
    hyperparameters = dict(original_params)
    use_imputed_pretraining = hyperparameters.pop("impute")
    negative_fraction = hyperparameters.pop("fraction_negative")

    model = mhcflurry.Class1BindingPredictor.from_hyperparameters(
        max_ic50=max_ic50, **hyperparameters)

    n_measured = len(all_train_data.groupby_allele_dictionary()[allele])
    n_imputed = len(imputed_train_data.groupby_allele_dictionary()[allele])
    print("Fitting model for allele %s (%d + %d): %s" % (
        allele, n_measured, n_imputed, str(original_params)))

    started = time.time()
    training_dataset = all_train_data.get_allele(allele)
    if use_imputed_pretraining:
        pretraining_dataset = imputed_train_data.get_allele(allele)
    else:
        pretraining_dataset = None
    # Random negatives are requested as a fraction of the allele's training size.
    n_random_negatives = int(
        negative_fraction * len(all_train_data.get_allele(allele)))
    model.fit_dataset(
        training_dataset,
        pretraining_dataset=pretraining_dataset,
        verbose=False,
        batch_size=128,
        n_training_epochs=1000,
        n_random_negative_samples=n_random_negatives)
    elapsed = time.time() - started
    print("Trained in %d sec" % elapsed)
    return model

# For every allele: fit each model configuration, store its predictions as a
# new "mhcflurry <j>" column, then rewrite the prediction and score CSVs so
# partial results survive an interrupted run.

# The set of alleles in the validation frame never changes, so compute it once.
validation_alleles = set(validation_df_with_mhcflurry.allele.unique())

for (i, allele) in enumerate(alleles):
    if allele not in validation_alleles:
        print("Skipping allele %s: not in test set" % allele)
        continue
    if allele in models_and_scores:
        # Lets the cell be re-run after an interruption without refitting.
        print("Skipping allele %s: already done" % allele)
        continue

    # One row mask per allele, always taken from the frame being indexed.
    allele_mask = validation_df_with_mhcflurry.allele == allele
    allele_peptides = list(validation_df_with_mhcflurry.loc[allele_mask, "peptide"])

    values_for_allele = []
    for (j, params) in enumerate(models_params_list):
        print("Allele %d model %d" % (i, j))
        model = make_and_fit_model(allele, params)
        predictions = model.predict(allele_peptides)
        print("test set size: %d" % len(predictions))
        validation_df_with_mhcflurry.loc[allele_mask, "mhcflurry %d" % j] = predictions
        scores = make_scores(
            validation_df_with_mhcflurry.loc[allele_mask, "meas"],
            predictions)
        print(scores)
        values_for_allele.append((params, scores))

    models_and_scores[allele] = values_for_allele

    # Write out all data after each allele.
    validation_df_with_mhcflurry_results = validation_df_with_mhcflurry.loc[
        validation_df_with_mhcflurry.allele.isin(list(models_and_scores))]
    validation_df_with_mhcflurry_results.to_csv(
        os.path.join(data_dir, "validation_predictions_full.csv"), index=False)

    # Predictor columns are everything to the right of the measured value.
    # Looking up "meas" by name avoids depending on a fixed column offset.
    result_columns = list(validation_df_with_mhcflurry_results.columns)
    predictors = result_columns[result_columns.index("meas") + 1:]

    # column name -> list of per-allele values, turned into a DataFrame below.
    score_columns = collections.defaultdict(list)
    # `scored_allele` is deliberately distinct from the outer loop's `allele`.
    for (scored_allele, grouped) in validation_df_with_mhcflurry_results.groupby("allele"):
        score_columns["allele"].append(scored_allele)
        score_columns["test_size"].append(len(grouped.meas))
        for predictor in predictors:
            predictor_scores = make_scores(grouped.meas, grouped[predictor])
            for (key, value) in predictor_scores.items():
                score_columns["%s_%s" % (predictor, key)].append(value)

    scores_df = pandas.DataFrame(score_columns)
    # Build the per-allele dictionary once instead of once per scored allele.
    train_data_by_allele = all_train_data.groupby_allele_dictionary()
    scores_df["train_size"] = [
        len(train_data_by_allele[a])
        for a in scores_df.allele
    ]

    scores_df.index = scores_df.allele
    scores_df.to_csv(os.path.join(data_dir, "validation_scores.csv"), index=False)
        


Allele 0 model 0
Fitting model for allele HLA-A0201 (9565 + 19304): {'impute': False, 'dropout_probability': 0.5, 'embedding_output_dim': 32, 'layer_sizes': [64], 'fraction_negative': 0, 'activation': 'tanh'}
Trained in 182 sec
test set size: 2126
{'tau': 0.62319446846736604, 'auc': 0.92948587812088179, 'f1': 0.87857762359063318}
Allele 0 model 1
Fitting model for allele HLA-A0201 (9565 + 19304): {'impute': True, 'dropout_probability': 0.5, 'embedding_output_dim': 32, 'layer_sizes': [64], 'fraction_negative': 0, 'activation': 'tanh'}
Trained in 186 sec
test set size: 2126
{'tau': 0.62514991483552451, 'auc': 0.93029258823948413, 'f1': 0.87989668532070586}
Allele 0 model 2
Fitting model for allele HLA-A0201 (9565 + 19304): {'impute': False, 'dropout_probability': 0.5, 'embedding_output_dim': 32, 'layer_sizes': [64], 'fraction_negative': 0.2, 'activation': 'tanh'}
Trained in 192 sec
test set size: 2126
{'tau': 0.62509913730218769, 'auc': 0.9302694376179792, 'f1': 0.87598253275109161}
Alle

  'recall', 'true', average, warn_for)


Allele 33 model 0
Fitting model for allele HLA-B5401 (1019 + 19304): {'impute': False, 'dropout_probability': 0.5, 'embedding_output_dim': 32, 'layer_sizes': [64], 'fraction_negative': 0, 'activation': 'tanh'}
Trained in 26 sec
test set size: 79
{'tau': 0.29725085376804777, 'auc': 0.85135135135135143, 'f1': 0.88888888888888895}
Allele 33 model 1
Fitting model for allele HLA-B5401 (1019 + 19304): {'impute': True, 'dropout_probability': 0.5, 'embedding_output_dim': 32, 'layer_sizes': [64], 'fraction_negative': 0, 'activation': 'tanh'}
Trained in 29 sec
test set size: 79
{'tau': 0.28260795456765136, 'auc': 0.83783783783783783, 'f1': 0.80000000000000016}
Allele 33 model 2
Fitting model for allele HLA-B5401 (1019 + 19304): {'impute': False, 'dropout_probability': 0.5, 'embedding_output_dim': 32, 'layer_sizes': [64], 'fraction_negative': 0.2, 'activation': 'tanh'}
Trained in 27 sec
test set size: 79
{'tau': 0.33971526144919745, 'auc': 0.81081081081081086, 'f1': 0.74999999999999989}
Allele 33

  'precision', 'predicted', average, warn_for)


{'tau': 0.29954093915773233, 'auc': 0.94419753086419755, 'f1': 0.0}
Allele 49 model 1
Fitting model for allele HLA-B0803 (217 + 19304): {'impute': True, 'dropout_probability': 0.5, 'embedding_output_dim': 32, 'layer_sizes': [64], 'fraction_negative': 0, 'activation': 'tanh'}
Trained in 5 sec
test set size: 234
{'tau': 0.29502362517402508, 'auc': 0.93185185185185182, 'f1': 0.1818181818181818}
Allele 49 model 2
Fitting model for allele HLA-B0803 (217 + 19304): {'impute': False, 'dropout_probability': 0.5, 'embedding_output_dim': 32, 'layer_sizes': [64], 'fraction_negative': 0.2, 'activation': 'tanh'}
Trained in 3 sec
test set size: 234
{'tau': 0.293732964035823, 'auc': 0.93580246913580245, 'f1': 0.0}
Allele 49 model 3
Fitting model for allele HLA-B0803 (217 + 19304): {'impute': True, 'dropout_probability': 0.5, 'embedding_output_dim': 32, 'layer_sizes': [64], 'fraction_negative': 0.2, 'activation': 'tanh'}
Trained in 6 sec
test set size: 234
{'tau': 0.29868049839893096, 'auc': 0.94172839

In [None]:
# Training vs. test set size per allele, smallest training set first.
# `DataFrame.sort` is deprecated/removed; `sort_values` is the replacement and
# returns a new frame by default.
print_full(scores_df[["train_size", "test_size"]].sort_values("train_size"))

In [None]:
# NOTE(review): leftover scratch cell with no effect on the analysis; safe to delete.
2