In [1]:
import mhcflurry, seaborn, numpy, pandas, pickle, sklearn, collections, scipy, time, logging
import mhcflurry.data
import mhcflurry.imputation
import fancyimpute

import sklearn.metrics
import sklearn.cross_validation

Using Theano backend.
Using gpu device 0: GeForce GTX TITAN X (CNMeM is enabled with initial size: 75.0% of memory, cuDNN 5004)


In [2]:
# Notebook-wide configuration.
# Minimum dataset size for an allele to be considered; in the visible cells it
# is only referenced from commented-out allele filters.
min_peptides_to_consider_allele = 50
# Upper bound on IC50 (same unit as the `meas` column); used as the base of the
# log transform in log_to_ic50 and passed to the loader and the predictor.
max_ic50 = 50000
# NOTE(review): absolute path on the author's machine; must be edited before
# the notebook can be re-run elsewhere.
data_dir="/home/tim/sinai/data/mhc-binding/"

In [24]:
# Preview the raw 2009 binding data file as loaded by mhcflurry. The result is
# not assigned to a name; it is only displayed as the cell output.
mhcflurry.data.load_dataframe(
        filename=data_dir + "bdata.2009.mhci.public.1.txt",
        max_ic50=max_ic50,
        sep=None,
        species_column_name="species",
        allele_column_name="mhc",
        peptide_column_name=None,
        filter_peptide_length=None,
        ic50_column_name="meas",
        only_human=False)

(           species         mhc  peptide_length   cv      sequence inequality  \
 0             None      ELA-A1              12  TBD  GSQKLTTGNCNW          =   
 1             None      ELA-A1              12  TBD  HVKDETNTTEYW          =   
 2             None      ELA-A1              12  TBD  LVEDVTNTAEYW          =   
 3             None      ELA-A1              12  TBD  RVEDKTNTAEYW          =   
 4             None      ELA-A1              12  TBD  RVEDVKNTAEYW          =   
 5             None      ELA-A1              12  TBD  RVEDVTLTAEYW          =   
 6             None      ELA-A1              12  TBD  RVEDVTNKAEYW          =   
 7             None      ELA-A1              12  TBD  RVEDVTNTAELW          =   
 8             None      ELA-A1              12  TBD  RVEDVTNTAEYL          =   
 9             None      ELA-A1              12  TBD  RVEDVTNTAEYW          =   
 10            None      ELA-A1              12  TBD  RVEDVTNTALYW          =   
 11            None      ELA

In [25]:
# Load the same data file as per-allele datasets. The result is indexed by
# allele name in later cells (e.g. all_train_data[allele]).
all_train_data = mhcflurry.data.load_allele_datasets(
    data_dir + "bdata.2009.mhci.public.1.txt",
    use_multiple_peptide_lengths=True)

In [62]:
# Alleles included in the experiment; every later cell iterates over this list
# (via train_data) rather than over all loaded alleles.
alleles = [
    "HLA-A0201",
    "HLA-A0301",
    "HLA-A0203",
    "HLA-A2602",
    "HLA-A2603",
    'HLA-B7301',
]
# Alternative selections kept for reference: first + last allele only, or every
# allele with at least min_peptides_to_consider_allele measurements.
#alleles = alleles[:1] + alleles[-1:]
#alleles = [allele for allele in all_train_data if len(all_train_data[allele].Y) >= min_peptides_to_consider_allele]

In [28]:
# Sanity check: display the `weights` field of the first selected allele's dataset.
all_train_data[alleles[0]].weights

0.29094172040394206

In [26]:
# Sanity check: distinct peptide lengths present for the first selected allele.
set(len(x) for x in all_train_data[alleles[0]].peptides)

{9}

In [5]:
#sorted(dict((allele, len(all_train_data[allele].Y)) for allele in all_train_data).items(), key=lambda pair: -1*pair[1])

In [63]:
# Alternative: keep every allele that has enough measurements.
#train_data = dict((allele, data)
#                  for (allele, data) in all_train_data.items()
#                  if len(data.Y) >= min_peptides_to_consider_allele)

# Restrict the loaded datasets to the explicitly selected alleles.
train_data = {allele: all_train_data[allele] for allele in alleles}
selected_count = len(train_data)
available_count = len(all_train_data)
print("Training data: %d / %d alleles" % (selected_count, available_count))

#test_data = mhcflurry.data.load_allele_datasets("/Users/tim/sinai/git/mhcflurry/bdata.2013.mhci.public.blind.1.txt")


Training data: 6 / 106 alleles


In [70]:
def log_to_ic50(log_value):
    """
    Map a log-transformed affinity (e.g. neural network output) back to IC50.

    Uses the notebook-level `max_ic50` as the base: a value of 0.0 maps to
    `max_ic50` and a value of 1.0 maps to 1.
    """
    exponent = 1.0 - log_value
    return max_ic50 ** exponent

def make_scores(y, y_pred, weights=None, sample_weight=None, threshold_nm=500):
    """
    Score predictions against measurements, both on the log-transformed scale.

    A peptide counts as a binder when its IC50 (recovered with log_to_ic50)
    is at most `threshold_nm`.

    Returns a dict with:
      auc -- ROC AUC of the raw predictions against the true binder labels
      f1  -- F1 score of predicted binder labels against true binder labels
      tau -- Kendall's tau between predictions and measurements (unweighted)

    `sample_weight` is forwarded to the AUC and F1 computations.
    NOTE(review): `weights` is accepted but never used, so callers that pass
    weights as the third positional argument get unweighted scores -- confirm
    whether that is intentional.
    """
    measured_ic50 = log_to_ic50(y)
    predicted_ic50 = log_to_ic50(y_pred)
    auc = sklearn.metrics.roc_auc_score(
        measured_ic50 <= threshold_nm,
        y_pred,
        sample_weight=sample_weight)
    f1 = sklearn.metrics.f1_score(
        measured_ic50 <= threshold_nm,
        predicted_ic50 <= threshold_nm,
        sample_weight=sample_weight)
    tau = scipy.stats.kendalltau(y_pred, y)[0]
    return dict(auc=auc, f1=f1, tau=tau)

def mean_with_std(grouped_column, decimals=3):
    """
    Format per-group mean and standard deviation as "mean +/- std" strings.

    Parameters
    ----------
    grouped_column : grouped pandas column (e.g. df.groupby(keys).some_column)
    decimals : number of digits after the decimal point

    Returns
    -------
    pandas.Series of strings indexed like the group means. Groups whose
    standard deviation is null (e.g. a single observation) show the mean only.
    """
    number_format = "%%0.%df" % decimals
    with_std_format = number_format + " +/- " + number_format
    # Aggregate once and reuse the means for both the values and the index.
    means = grouped_column.mean()
    stds = grouped_column.std()
    formatted = [
        number_format % mean if pandas.isnull(std) else with_std_format % (mean, std)
        for (mean, std) in zip(means, stds)
    ]
    return pandas.Series(formatted, index=means.index)

def allele_data_to_df(data):
    """
    Turn a namedtuple-style allele dataset into a DataFrame indexed by peptide.

    The array-valued fields `X_index` and `X_binary` are split into one entry
    per row so that each DataFrame cell holds that peptide's encoding.
    """
    columns = data._asdict()
    for field in ("X_index", "X_binary"):
        columns[field] = list(columns[field])
    return pandas.DataFrame(columns).set_index('peptides')

def make_2d_array(thing):
    """Stack an iterable of row-like iterables into a single numpy array."""
    rows = list(map(list, thing))
    return numpy.array(rows)

def df_to_allele_data(df):
    """
    Rebuild an mhcflurry AlleleData from a DataFrame made by allele_data_to_df.

    Every column becomes a field (as its underlying values array), the index
    becomes `peptides`, `X_index` is re-stacked into one array, and `max_ic50`
    collapses to its single distinct value.

    Raises ValueError if the `max_ic50` column does not hold exactly one
    distinct value.
    """
    fields = {}
    for column in df:
        fields[column] = df[column].values
    fields["X_index"] = make_2d_array(fields["X_index"])
    # NOTE(review): X_binary is left as a per-row object array here, unlike
    # X_index -- confirm no consumer needs it re-stacked.
    distinct_max_ic50 = list(df.max_ic50.unique())
    # Single-element unpacking doubles as a check that the value is unique.
    (fields["max_ic50"],) = distinct_max_ic50
    return mhcflurry.data.AlleleData(peptides=df.index.values, **fields)


In [66]:
# Hyperparameter grid: one candidate value list per model parameter.
dropout_probabilities = [0.0, 0.1, 0.5]

embedding_output_dims = [4, 16, 32, 64, 128]
#embedding_output_dims = [4, 32]

#layer_sizes = [[4], [8], [16], [64], [128]]
layer_sizes_list = [[16], [64], [100], [128]]

activations = ["tanh"]

# Full cross product of the lists above; dropout varies slowest and
# activation fastest.
models_params_list = [
    dict(
        dropout_probability=dropout_probability,
        embedding_output_dim=embedding_output_dim,
        layer_sizes=layer_sizes,
        activation=activation)
    for dropout_probability in dropout_probabilities
    for embedding_output_dim in embedding_output_dims
    for layer_sizes in layer_sizes_list
    for activation in activations
]

print("%d models" % len(models_params_list))
# Names of every hyperparameter that appears in at least one configuration.
models_params_explored = set.union(*(set(params) for params in models_params_list))
models_params_explored


60 models


{'activation', 'dropout_probability', 'embedding_output_dim', 'layer_sizes'}

In [None]:
# Hyperparameter search. For every selected allele, run 3-fold stratified
# cross-validation (stratified on IC50 < 500). Within each fold, fit every
# configuration in models_params_list twice: once without pretraining and once
# pretrained on imputed data. One row of scores and timings is appended to
# cv_df per (allele, fold, impute, configuration).
cv_df = collections.defaultdict(list)
start = time.time()
#for (allele, data) in list(train_data.items())[:1]:
for (allele, data) in train_data.items():
    data_df = allele_data_to_df(data)
    cv = sklearn.cross_validation.StratifiedKFold(log_to_ic50(data.Y) < 500, n_folds = 3)
    for (fold_num, (train_indices, test_indices)) in enumerate(cv):
        # The split does not depend on `impute`, so slice once per fold.
        train_df = data_df.iloc[train_indices]
        test_df = data_df.iloc[test_indices]
        for impute in [False, True]:
            if impute:
                # Impute from every loaded allele, but replace this allele's
                # entry with the training fold only, so that held-out peptides
                # cannot leak into the pretraining data.
                full_train_allele_data = dict(all_train_data)
                full_train_allele_data[allele] = df_to_allele_data(train_df)
                train_imputed_dict = mhcflurry.imputation.create_imputed_datasets(
                    full_train_allele_data,
                    fancyimpute.mice.MICE(),
                    min_observations_per_peptide=1,
                    min_observations_per_allele=50)
                X_pretrain = train_imputed_dict[allele].X_index
                Y_pretrain = train_imputed_dict[allele].Y
                sample_weights_pretrain = train_imputed_dict[allele].weights
            else:
                X_pretrain = Y_pretrain = sample_weights_pretrain = None

            for (i, model_params) in enumerate(models_params_list):
                print("%10s fold %3d [%3d / %3d] train_size=%d test_size=%d impute=%s model=%s" %
                      (allele, fold_num, i, len(models_params_list), len(train_indices), len(test_indices), impute, model_params))
                model = mhcflurry.Class1BindingPredictor.from_hyperparameters(
                    max_ic50=max_ic50,
                    **model_params)
                # fit_time ends up as the wall-clock duration of model.fit.
                fit_time = -time.time()
                model.fit(
                    make_2d_array(train_df.X_index),
                    train_df.Y,
                    sample_weights=train_df.weights,
                    X_pretrain=X_pretrain,
                    Y_pretrain=Y_pretrain,
                    sample_weights_pretrain=sample_weights_pretrain,
                    verbose=False
                )
                fit_time += time.time()
                predictions = model.predict(make_2d_array(test_df.X_index))
                train_predictions = model.predict(make_2d_array(train_df.X_index))
                cv_df["allele"].append(allele)
                cv_df["allele_size"].append(len(data.Y))
                cv_df["train_size"].append(len(train_indices))
                cv_df["model_params"].append(model_params)
                #cv_df["model"].append(model)
                cv_df["impute"].append(impute)
                cv_df["imputed_size"].append(len(Y_pretrain) if Y_pretrain is not None else None)
                cv_df["fit_time"].append(fit_time)

                # One column per hyperparameter so results can be grouped later.
                # .items() works on both Python 2 and 3 (iteritems is 2-only).
                for (param, param_value) in model_params.items():
                    cv_df[param].append(param_value)
                for (key, value) in make_scores(test_df.Y, predictions, test_df.weights).items():
                    cv_df["test_%s" % key].append(value)
                    print("test %s: %f" % (key, value))
                for (key, value) in make_scores(train_df.Y, train_predictions, train_df.weights).items():
                    cv_df["train_%s" % key].append(value)
                    print("train %s: %f" % (key, value))


# Turn the column-wise accumulator into a DataFrame and expose the first
# hidden layer's size as its own scalar column.
cv_df = pandas.DataFrame(cv_df)
cv_df["layer0_size"] = [x[0] for x in cv_df.layer_sizes]
# Total elapsed seconds for the whole search.
print(time.time() - start)
cv_df

 HLA-A0201 fold   0 [  0 /  60] train_size=21917 test_size=10959 impute=False model={'activation': 'tanh', 'embedding_output_dim': 4, 'dropout_probability': 0.0, 'layer_sizes': [16]}
test tau: 0.339114
test auc: 0.748178
test f1: 0.620713
train tau: 0.548417
train auc: 0.884989
train f1: 0.748365
 HLA-A0201 fold   0 [  1 /  60] train_size=21917 test_size=10959 impute=False model={'activation': 'tanh', 'embedding_output_dim': 4, 'dropout_probability': 0.0, 'layer_sizes': [64]}
test tau: 0.474557
test auc: 0.848442
test f1: 0.545770
train tau: 0.547662
train auc: 0.885808
train f1: 0.738517
 HLA-A0201 fold   0 [  2 /  60] train_size=21917 test_size=10959 impute=False model={'activation': 'tanh', 'embedding_output_dim': 4, 'dropout_probability': 0.0, 'layer_sizes': [100]}
test tau: 0.312977
test auc: 0.730126
test f1: 0.604546
train tau: 0.544632
train auc: 0.883024
train f1: 0.745066
 HLA-A0201 fold   0 [  3 /  60] train_size=21917 test_size=10959 impute=False model={'activation': 'tanh'

In [None]:
# Sanity check: shape of the X_index array for HLA-A0201.
train_data["HLA-A0201"].X_index.shape

In [None]:
# Repeats the final post-processing of the cross-validation cell above:
# convert cv_df to a DataFrame and add the first layer size as a column.
# NOTE(review): presumably kept so partial results can be converted after
# interrupting the long-running search -- confirm, otherwise it is redundant.
cv_df = pandas.DataFrame(cv_df)
cv_df["layer0_size"] = [x[0] for x in cv_df.layer_sizes]
cv_df

In [None]:
# Persist the cross-validation results; the path is relative to the working directory.
cv_df.to_csv("cv4.csv")

In [12]:
# Columns that identify one experimental condition: allele, its dataset size,
# whether imputation was used, and every explored hyperparameter.
group_columns = ["allele", "allele_size", "impute"]
group_columns.extend(models_params_explored)
# Group on the scalar layer0_size column instead of layer_sizes, whose values
# are lists (presumably because lists cannot serve as groupby keys).
group_columns.append("layer0_size")
group_columns.remove("layer_sizes")
# Mean and standard deviation of test AUC across folds, per condition.
print(mean_with_std(cv_df.groupby(group_columns).test_auc)) #.sort(inplace=False, ascending=False)



allele     allele_size  impute  embedding_output_dim  activation  layer0_size
HLA-A0201  32876        False   5                     tanh        4              0.863 +/ 0.042
                                                                  8              0.855 +/ 0.032
                                                                  16             0.834 +/ 0.035
                                                                  64             0.849 +/ 0.043
                                                                  128            0.835 +/ 0.038
                                10                    tanh        4              0.847 +/ 0.030
                                                                  8              0.866 +/ 0.025
                                                                  16             0.860 +/ 0.030
                                                                  64             0.808 +/ 0.040
                                                          

In [13]:
def best_by(score, results=None, by=None):
    """
    Return, for each allele, the condition with the highest mean `score`.

    Parameters
    ----------
    score : name of the score column to rank by (e.g. 'test_auc')
    results : DataFrame of per-fold results; defaults to the notebook's cv_df
    by : list of columns identifying one condition (must include 'allele');
        defaults to the notebook's group_columns

    Returns
    -------
    DataFrame with one row per allele, in order of first appearance, holding
    the grouping columns and the mean score of the winning condition.
    """
    if results is None:
        results = cv_df
    if by is None:
        by = group_columns
    means = results.groupby(by)[score].mean().reset_index()
    # idxmax yields row labels, which .loc consumes directly. This replaces
    # the removed .ix indexer and the old label-returning Series.argmax().
    best_rows = means.groupby("allele", sort=False)[score].idxmax()
    return means.loc[best_rows.tolist()]

In [14]:
# Best condition per allele, ranked by mean test AUC across folds.
best_by('test_auc')


Unnamed: 0,allele,allele_size,impute,embedding_output_dim,activation,layer0_size,test_auc
31,HLA-A0201,32876,True,10,tanh,8,0.875208
76,HLA-A0203,19879,True,5,tanh,8,0.828529
125,HLA-A0301,19970,True,5,tanh,4,0.827136
165,HLA-A2602,202,False,64,tanh,4,0.96065
248,HLA-A2603,205,True,128,tanh,64,0.919985
286,HLA-B7301,115,True,32,tanh,8,0.854248


In [15]:
# Best condition per allele, ranked by mean test Kendall's tau across folds.
best_by('test_tau')

Unnamed: 0,allele,allele_size,impute,embedding_output_dim,activation,layer0_size,test_tau
31,HLA-A0201,32876,True,10,tanh,8,0.508779
75,HLA-A0203,19879,True,5,tanh,4,0.489796
125,HLA-A0301,19970,True,5,tanh,4,0.470644
161,HLA-A2602,202,False,32,tanh,8,0.630461
241,HLA-A2603,205,True,64,tanh,8,0.391268
286,HLA-B7301,115,True,32,tanh,8,0.452451


In [16]:
# Best condition per allele, ranked by mean test F1 across folds.
best_by('test_f1')

Unnamed: 0,allele,allele_size,impute,embedding_output_dim,activation,layer0_size,test_f1
0,HLA-A0201,32876,False,5,tanh,4,0.702868
76,HLA-A0203,19879,True,5,tanh,8,0.691566
125,HLA-A0301,19970,True,5,tanh,4,0.645124
161,HLA-A2602,202,False,32,tanh,8,0.868255
248,HLA-A2603,205,True,128,tanh,64,0.62381
270,HLA-B7301,115,False,128,tanh,4,0.321429
