In [59]:
import mhcflurry
import numpy
import logging
import seaborn
from matplotlib import pyplot

% matplotlib inline
logging.basicConfig(level="DEBUG")

# Predict Using Existing Model

In [4]:
mhcflurry.predict(alleles=["HLA-A0201"], peptides=["SIINFEKL","SIINFEQL"])

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Unnamed: 0,Allele,Peptide,Prediction
0,HLA-A0201,SIINFEKL,10672.347656
1,HLA-A0201,SIINFEQL,7828.974121


# Load Customized Training Set and Model

In [5]:
kim2014_train = mhcflurry.dataset.Dataset.from_csv(mhcflurry.downloads.get_path("data_kim2014", "bdata.2009.mhci.public.1.txt"))

In [9]:
kim2014_train.alleles

array(['ELA-A1', 'ELA-A1', 'ELA-A1', ..., 'Patr-B2401', 'Patr-B2401',
       'Patr-B2401'], dtype=object)

In [10]:
dfrmli_A0201_test = mhcflurry.dataset.Dataset.from_csv("testset/HLA-A0201")
len(dfrmli_A0201_test)

1251

In [13]:
dfrmli_A0201_test.affinities

array(['-', '-', '-', ..., '+', '+', '+'], dtype=object)

In [15]:
A0201_trainingset = kim2014_train.get_allele("HLA-A0201")
A0201_trainingset

Dataset(n=9565, alleles=['HLA-A0201'])

In [16]:
prediction_model = mhcflurry.class1_allele_specific.Class1BindingPredictor()
prediction_model.hyperparameters

{'activation': 'tanh',
 'batch_normalization': True,
 'batch_size': 128,
 'dropout_probability': 0.0,
 'embedding_output_dim': 32,
 'fraction_negative': 0.0,
 'init': 'glorot_uniform',
 'kmer_size': 9,
 'layer_sizes': [64],
 'loss': 'mse',
 'max_ic50': 50000.0,
 'n_training_epochs': 250,
 'optimizer': 'rmsprop',
 'output_activation': 'sigmoid',
 'pretrain_decay': 'numpy.exp(-epoch)'}

In [18]:
predictions = prediction_model.predict(dfrmli_A0201_test.peptides)
predictions

array([  84.33081818,  197.01031494,  172.89077759, ...,   93.08477783,
        135.23834229,  429.84768677], dtype=float32)

# Evalute Prediction Accuracy

In [20]:
mhcflurry.class1_allele_specific.scoring.make_scores(dfrmli_A0201_test.affinities, predictions)

  'recall', 'true', average, warn_for)


{'auc': nan, 'f1': 0.0, 'tau': 0.073076446246030796}

In [21]:
kim2014_test = mhcflurry.dataset.Dataset.from_csv(
    mhcflurry.downloads.get_path("data_kim2014", "bdata.2013.mhci.public.blind.1.txt"))
test_set = kim2014_test.get_allele("HLA-A0201")
len(test_set)

2126

In [22]:
test_set.affinities

array([  6.94444444e+04,   6.61261200e+00,   7.22181500e+00, ...,
         7.81250000e+04,   1.12300000e+03,   1.30000000e+02])

In [23]:
predictions_new = prediction_model.predict(test_set.peptides)
predictions_new

array([ 422.26077271,   91.14395142,  119.12548065, ...,  226.68408203,
        131.43814087,  140.93244934], dtype=float32)

In [25]:
mhcflurry.class1_allele_specific.scoring.make_scores(test_set.affinities, predictions_new)

{'auc': 0.57255260326485824,
 'f1': 0.70411233701103315,
 'tau': 0.087641280225993323}

# Custom Cross-Validation Methods and Model Parameters

In [27]:
help(mhcflurry.class1_allele_specific.cross_validation.cross_validation_folds)

Help on function cross_validation_folds in module mhcflurry.class1_allele_specific.cross_validation:

cross_validation_folds(train_data, alleles=None, n_folds=3, drop_similar_peptides=False, imputer=None, impute_kwargs={'min_observations_per_allele': 2, 'min_observations_per_peptide': 2}, parallel_backend=None)
    Split a Dataset into n_folds cross validation folds for each allele,
    optionally performing imputation.
    
    Parameters
    -----------
    train_data : mhcflurry.Dataset
    
    alleles : string list, optional
        Alleles to run cross validation on. Default: all alleles in
        train_data.
    
    n_folds : int, optional
        Number of cross validation folds for each allele.
    
    drop_similar_peptides : boolean, optional
        For each fold, remove peptides from the test data that are similar
        to peptides in the train data. Similarity is defined as in the
        similar_peptides function.
    
    imputer : fancyimpute.Solver, optional
     

In [28]:
fold1 = mhcflurry.class1_allele_specific.cross_validation.cross_validation_folds(A0201_trainingset,n_folds=3)
fold2 = mhcflurry.class1_allele_specific.cross_validation.cross_validation_folds(A0201_trainingset,n_folds=5)

In [31]:
fold2

[AlleleSpecificTrainTestFold(allele='HLA-A0201', train=Dataset(n=7652, alleles=['HLA-A0201']), imputed_train=None, test=Dataset(n=1913, alleles=['HLA-A0201'])),
 AlleleSpecificTrainTestFold(allele='HLA-A0201', train=Dataset(n=7652, alleles=['HLA-A0201']), imputed_train=None, test=Dataset(n=1913, alleles=['HLA-A0201'])),
 AlleleSpecificTrainTestFold(allele='HLA-A0201', train=Dataset(n=7652, alleles=['HLA-A0201']), imputed_train=None, test=Dataset(n=1913, alleles=['HLA-A0201'])),
 AlleleSpecificTrainTestFold(allele='HLA-A0201', train=Dataset(n=7652, alleles=['HLA-A0201']), imputed_train=None, test=Dataset(n=1913, alleles=['HLA-A0201'])),
 AlleleSpecificTrainTestFold(allele='HLA-A0201', train=Dataset(n=7652, alleles=['HLA-A0201']), imputed_train=None, test=Dataset(n=1913, alleles=['HLA-A0201']))]

In [68]:
models = mhcflurry.class1_allele_specific.train.HYPERPARAMETER_DEFAULTS.models_grid(
    fraction_negative=[.1],
    layer_sizes=[[8],[12]])
models[0]

{'activation': 'tanh',
 'batch_normalization': True,
 'batch_size': 128,
 'dropout_probability': 0.0,
 'embedding_output_dim': 32,
 'fraction_negative': 0.1,
 'impute': False,
 'init': 'glorot_uniform',
 'kmer_size': 9,
 'layer_sizes': [8],
 'loss': 'mse',
 'max_ic50': 50000.0,
 'n_training_epochs': 250,
 'optimizer': 'rmsprop',
 'output_activation': 'sigmoid',
 'pretrain_decay': 'numpy.exp(-epoch)'}

In [72]:
df = mhcflurry.class1_allele_specific.train.train_across_models_and_folds(fold1, models, return_predictors=True)

ValueError: Tensor("cond_4/pred_id:0", dtype=bool) must be from the same graph as Tensor("batchnorm_90/add_1:0", shape=(?, 8), dtype=float32).