In [1]:
import os
os.environ["THEANO_FLAGS"] = 'cuda.root=/usr/local/cuda,floatX=float32,device=gpu1,force_device=False,lib.cnmem=.75'

import theano
print(theano.config.device)

import mhcflurry, seaborn, numpy, pandas, pickle, sklearn, collections, scipy, time
import mhcflurry.dataset
import fancyimpute, locale

import sklearn.metrics
import sklearn.cross_validation

def print_full(x):
    """Print ``x`` with every row visible, bypassing pandas' row truncation.

    The ``display.max_rows`` option is raised to ``len(x)`` only for the
    duration of the call. ``option_context`` puts back whatever value the
    option had before the call (instead of resetting it to the pandas
    default) and does so even if printing raises.

    Parameters
    ----------
    x : sized, printable object
        The object to print; its length is used as the temporary row limit.
    """
    with pandas.option_context('display.max_rows', len(x)):
        print(x)

ERROR (theano.sandbox.cuda): nvcc compiler not found on $PATH. Check your nvcc installation and try again.
ERROR:theano.sandbox.cuda:nvcc compiler not found on $PATH. Check your nvcc installation and try again.


Couldn't import dot_parser, loading of dot files will not be possible.
gpu1

Using Theano backend.







In [2]:
# Maximum IC50 value; handed to the IC50 -> regression-target transform used
# for scoring and to the predictor constructor when models are built.
max_ic50 = 50000
# Alleles need at least this many training measurements to be evaluated.
min_peptides_to_consider_allele = 10
# Directory (relative to this notebook) from which the training data is read.
data_dir = "../data/"

In [None]:
# Read the training measurements from the configured data directory.
training_csv_path = data_dir + "bdata.2009.mhci.public.1.txt"
all_train_data = mhcflurry.dataset.Dataset.from_csv(training_csv_path)

In [None]:
# Fill in missing measurements using MICE. Peptides and alleles with fewer
# than two observations are excluded from the imputation.
mice_imputer = fancyimpute.MICE(n_imputations=250, n_burn_in=50)
imputed_train_data = all_train_data.impute_missing_values(
    mice_imputer,
    min_observations_per_peptide=2,
    min_observations_per_allele=2)


Dropping 12235 peptides with <2 observations
Dropping 9 alleles with <2 observations: ['ELA-A1', 'HLA-B2701', 'HLA-B3508', 'HLA-B44', 'HLA-E0101', 'Mamu-B04', 'Patr-A0602', 'Patr-B0901', 'Patr-B1701']
[MICE] Completing matrix with shape (19304, 97)
[MICE] Starting imputation round 1/300, elapsed time 0.060
[MICE] Starting imputation round 2/300, elapsed time 6.151
[MICE] Starting imputation round 3/300, elapsed time 12.110
[MICE] Starting imputation round 4/300, elapsed time 16.391
[MICE] Starting imputation round 5/300, elapsed time 20.525
[MICE] Starting imputation round 6/300, elapsed time 25.179
[MICE] Starting imputation round 7/300, elapsed time 31.370
[MICE] Starting imputation round 8/300, elapsed time 36.467
[MICE] Starting imputation round 9/300, elapsed time 42.110
[MICE] Starting imputation round 10/300, elapsed time 47.532
[MICE] Starting imputation round 11/300, elapsed time 52.988
[MICE] Starting imputation round 12/300, elapsed time 58.399
[MICE] Starting imputation rou

In [None]:
# Show the imputed training data as returned by to_dataframe() for a visual check.
imputed_train_data.to_dataframe()

In [None]:
# Load the validation measurements. The path is built from `data_dir` so that
# relocating the data only requires changing that one configuration value.
validation_df = pandas.read_csv(data_dir + "combined_test_BLIND_dataset_from_kim2013.csv")
validation_df

In [None]:
# Count measurements per allele in the validation set and in the training set.
validation_allele_counts = validation_df["allele"].value_counts()
train_allele_counts = all_train_data._df["allele"].value_counts()
for allele_counts in (validation_allele_counts, train_allele_counts):
    print(allele_counts)

In [None]:
# Keep alleles that have enough training measurements and also appear in the
# validation set, ordered from most to fewest training measurements.
has_enough_training_data = train_allele_counts >= min_peptides_to_consider_allele
is_in_validation_set = train_allele_counts.index.isin(validation_allele_counts.index)
alleles = sorted(
    train_allele_counts.index[has_enough_training_data & is_in_validation_set],
    key=lambda name: -1 * train_allele_counts[name])
alleles

In [None]:
# Hyperparameter values to explore.
dropout_probabilities = [0.0, 0.5]
embedding_output_dims_and_layer_sizes_list = [(32, [64]), (8, [4])]
activations = ["tanh"]

# One parameter dict per combination, in the same order the equivalent nested
# loops would produce (impute outermost, activation innermost).
models_params_list = [
    dict(
        impute=impute,
        dropout_probability=dropout_probability,
        embedding_output_dim=embedding_output_dim,
        layer_sizes=layer_sizes,
        activation=activation)
    for model_num in range(1)
    for impute in [False, True]
    for dropout_probability in dropout_probabilities
    for (embedding_output_dim, layer_sizes) in embedding_output_dims_and_layer_sizes_list
    for activation in activations
]

print("%d models" % len(models_params_list))
# Names of all hyperparameters that appear in at least one model configuration.
models_params_explored = set.union(*[set(model_params) for model_params in models_params_list])
models_params_explored


In [None]:

def make_scores(ic50_y, ic50_y_pred, sample_weight=None, threshold_nm=500):
    """Score predicted IC50 values against measured IC50 values.

    Parameters
    ----------
    ic50_y : measured IC50 values
    ic50_y_pred : predicted IC50 values
    sample_weight : optional weights forwarded to the AUC and F1 computations
    threshold_nm : values at or below this threshold count as the positive class

    Returns
    -------
    dict with keys "auc", "f1" and "tau". A metric whose computation raises
    ValueError is reported as NaN.
    """
    # AUC is computed on the regression-target transform of the predictions.
    y_pred = mhcflurry.regression_target.ic50_to_regression_target(ic50_y_pred, max_ic50)

    def _or_nan(compute):
        # Evaluate one metric, mapping ValueError to NaN.
        try:
            return compute()
        except ValueError:
            return numpy.nan

    return {
        "auc": _or_nan(lambda: sklearn.metrics.roc_auc_score(
            ic50_y <= threshold_nm, y_pred, sample_weight=sample_weight)),
        "f1": _or_nan(lambda: sklearn.metrics.f1_score(
            ic50_y <= threshold_nm, ic50_y_pred <= threshold_nm, sample_weight=sample_weight)),
        "tau": _or_nan(lambda: scipy.stats.kendalltau(ic50_y_pred, ic50_y)[0]),
    }

def mean_with_std(grouped_column, decimals=3):
    """Format per-group means as "mean +/- std" strings.

    Parameters
    ----------
    grouped_column : grouped pandas column (e.g. ``df.groupby(key)[col]``)
    decimals : number of digits after the decimal point

    Returns
    -------
    pandas.Series of strings indexed by group. Groups whose standard
    deviation is null (e.g. a single observation) show the mean only.
    """
    pattern = "%%0.%df" % decimals
    mean_and_std_pattern = pattern + " +/- " + pattern
    # Compute each aggregate once; the means also supply the result index.
    means = grouped_column.mean()
    stds = grouped_column.std()
    return pandas.Series([
        pattern % m if pandas.isnull(s) else mean_and_std_pattern % (m, s)
        for (m, s) in zip(means, stds)
    ], index=means.index)

def allele_data_to_df(data):
    """Convert a namedtuple-style record into a DataFrame indexed by peptide.

    The "X_index" and "X_binary" fields are turned into lists of rows, so each
    DataFrame cell holds the row belonging to one peptide.
    """
    fields = data._asdict()
    for matrix_field in ("X_index", "X_binary"):
        fields[matrix_field] = list(fields[matrix_field])
    return pandas.DataFrame(fields).set_index('peptides')

def make_2d_array(thing):
    """Build a numpy array from an iterable of iterables, one row per element."""
    rows = list(map(list, thing))
    return numpy.array(rows)

def df_to_allele_data(df):
    """Rebuild an ``mhcflurry.data.AlleleData`` from a peptide-indexed DataFrame.

    Every column is passed through as an array; "X_index" is converted back
    into a 2-D array and "max_ic50" is collapsed to its single distinct value
    (unpacking fails if the column holds more than one distinct value).
    """
    columns = {name: df[name].values for name in df}
    columns["X_index"] = make_2d_array(columns["X_index"])
    distinct_max_ic50 = list(df.max_ic50.unique())
    (columns["max_ic50"],) = distinct_max_ic50
    return mhcflurry.data.AlleleData(peptides=df.index.values, **columns)


In [None]:
# allele -> list of (params, scores) pairs, filled in as alleles are processed.
models_and_scores = dict()
# Working copy of the validation data that receives one prediction column per model.
validation_df_with_mhcflurry = validation_df.copy(deep=True)


In [None]:
# train and test models, adding columns to validation_df_with_mhcflurry
# Record the explored model configurations; the path is built from `data_dir`
# so all outputs follow the configured data directory.
pandas.DataFrame(models_params_list).to_csv(data_dir + "validation_models.csv", index=False)

def make_and_fit_model(allele, original_params):
    """Build and train a Class1BindingPredictor for a single allele.

    Parameters
    ----------
    allele : key into the per-allele training data dictionaries
    original_params : dict of hyperparameters plus an "impute" flag. The flag
        selects whether the imputed data is used as a pretraining dataset;
        the remaining entries are forwarded to ``from_hyperparameters``.
        The dict itself is not modified.

    Returns
    -------
    The fitted model.
    """
    params = dict(original_params)
    # "impute" steers training only; it is not a model hyperparameter.
    impute = params.pop("impute")
    model = mhcflurry.Class1BindingPredictor.from_hyperparameters(max_ic50=max_ic50, **params)

    # Look each per-allele dataset up once and reuse it for logging and fitting.
    train_for_allele = all_train_data.groupby_allele_dictionary()[allele]
    imputed_for_allele = imputed_train_data.groupby_allele_dictionary()[allele]

    print("Fitting model for allele %s (%d + %d): %s" % (
        allele,
        len(train_for_allele),
        len(imputed_for_allele),
        str(original_params)))
    start_time = time.time()
    model.fit_dataset(
        train_for_allele,
        pretraining_dataset=imputed_for_allele if impute else None,
        verbose=False,
        batch_size=128,
        n_training_epochs=250)
    elapsed = time.time() - start_time
    print("Trained in %d sec" % elapsed)
    return model

# Alleles present in the validation data; this does not change while the loop
# adds prediction columns, so compute it once.
validation_alleles = set(validation_df_with_mhcflurry.allele.unique())
# Per-allele training data, used below to report training set sizes.
train_data_by_allele = all_train_data.groupby_allele_dictionary()

for (i, allele) in enumerate(alleles):
    if allele not in validation_alleles:
        print("Skipping allele %s: not in test set" % allele)
        continue
    if allele in models_and_scores:
        # Allows the cell to be re-run and resume where it stopped.
        print("Skipping allele %s: already done" % allele)
        continue

    # Rows of the validation data belonging to this allele.
    allele_mask = validation_df_with_mhcflurry.allele == allele
    values_for_allele = []
    for (j, params) in enumerate(models_params_list):
        print("Allele %d model %d" % (i, j))
        model = make_and_fit_model(allele, params)
        predictions = model.predict(
            list(validation_df_with_mhcflurry.loc[allele_mask].peptide))
        print("test set size: %d" % len(predictions))
        # One prediction column per model configuration.
        validation_df_with_mhcflurry.loc[allele_mask, ("mhcflurry %d" % j)] = predictions
        scores = make_scores(validation_df_with_mhcflurry.loc[allele_mask].meas,
                             predictions)
        print(scores)
        values_for_allele.append((params, scores))

    models_and_scores[allele] = values_for_allele

    # Write out all data after each allele.
    validation_df_with_mhcflurry_results = validation_df_with_mhcflurry.loc[
        validation_df_with_mhcflurry.allele.isin(list(models_and_scores))]
    validation_df_with_mhcflurry_results.to_csv(
        data_dir + "validation_predictions_full.csv", index=False)

    # Re-score every predictor column for every allele processed so far.
    score_columns = collections.defaultdict(list)
    # NOTE(review): assumes the first four columns are not predictor outputs
    # and every later column is one -- confirm against the validation CSV.
    predictors = validation_df_with_mhcflurry_results.columns[4:]

    for (scored_allele, grouped) in validation_df_with_mhcflurry_results.groupby("allele"):
        score_columns["allele"].append(scored_allele)
        score_columns["test_size"].append(len(grouped.meas))
        for predictor in predictors:
            predictor_scores = make_scores(grouped.meas, grouped[predictor])
            for (key, value) in predictor_scores.items():
                score_columns["%s_%s" % (predictor, key)].append(value)

    scores_df = pandas.DataFrame(score_columns)
    scores_df["train_size"] = [
        len(train_data_by_allele[a])
        for a in scores_df.allele
    ]

    scores_df.index = scores_df.allele
    scores_df.to_csv(data_dir + "validation_scores.csv", index=False)
        


In [None]:
# Show train/test set sizes per allele, smallest training set first.
# DataFrame.sort has been removed from pandas; sort_values is its replacement
# and returns a new frame by default.
print_full(scores_df[["train_size", "test_size"]].sort_values("train_size"))