Commit f34918a: Merge e0c8075 into e93299b
timodonnell committed Mar 19, 2017
2 parents e93299b + e0c8075
Showing 43 changed files with 1,593 additions and 200 deletions.
1 change: 1 addition & 0 deletions .travis.yml
@@ -45,6 +45,7 @@ env:
script:
# download data and models, then run tests
- mhcflurry-downloads fetch
- mhcflurry-downloads fetch models_class1_allele_specific_ensemble
- mhcflurry-downloads info # just to test this command works
- nosetests test --with-coverage --cover-package=mhcflurry && ./lint.sh
after_success:
8 changes: 3 additions & 5 deletions Dockerfile
@@ -42,14 +42,11 @@ WORKDIR /home/user
# Setup virtual envs and install convenience packages. Note: installing
# cherrypy as part of the mhcflurry installation weirdly fails on a unicode
# issue in python2, but installing it separately seems to work.
# We also install bokeh so that dask distributed will have an admin web interface.
RUN virtualenv venv-py3 --python=python3 && \
venv-py3/bin/pip install --upgrade pip && \
venv-py3/bin/pip install --upgrade \
numpy \
bokeh \
cherrypy \
git+https://github.com/dask/distributed.git \
jupyter \
lxml \
scipy \
@@ -59,9 +56,10 @@ RUN virtualenv venv-py3 --python=python3 && \
ENV KERAS_BACKEND theano
# RUN venv-py3/bin/pip install --upgrade https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.10.0-cp35-cp35m-linux_x86_64.whl
# Install mhcflurry and download data and models.
# Install mhcflurry and latest kubeface and download data and models.
COPY . ./mhcflurry
RUN venv-py3/bin/pip install ./mhcflurry && venv-py3/bin/mhcflurry-downloads fetch
RUN venv-py3/bin/pip install --upgrade ./mhcflurry git+https://github.com/hammerlab/kubeface.git \
&& venv-py3/bin/mhcflurry-downloads fetch
EXPOSE 8888
CMD venv-py3/bin/jupyter notebook --no-browser
@@ -17,7 +17,7 @@
"""
Combine 2013 Kim/Peters NetMHCpan dataset[*] with more recent IEDB entries
* = "Dataset size and composition impact the reliability..."
* = "AffinityMeasurementDataset size and composition impact the reliability..."
"""

from __future__ import (
@@ -0,0 +1,53 @@
#!/bin/bash

if [[ $# -eq 0 ]] ; then
echo 'WARNING: This script is intended to be called with additional arguments to pass to mhcflurry-class1-allele-specific-ensemble-train'
echo 'See README.md'
fi

set -e
set -x

DOWNLOAD_NAME=models_class1_allele_specific_ensemble
SCRATCH_DIR=/tmp/mhcflurry-downloads-generation
SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH")
export PYTHONUNBUFFERED=1

mkdir -p "$SCRATCH_DIR"
rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"

# Send stdout and stderr to a logfile included with the archive.
exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)

# Log some environment info
date
pip freeze
git rev-parse HEAD
git status

cd "$SCRATCH_DIR/$DOWNLOAD_NAME"

mkdir models

cp "$SCRIPT_DIR/models.py" .
python models.py > models.json

time mhcflurry-class1-allele-specific-ensemble-train \
--ensemble-size 16 \
--model-architectures models.json \
--train-data "$(mhcflurry-downloads path data_combined_iedb_kim2014)/combined_human_class1_dataset.csv" \
--min-samples-per-allele 20 \
--out-manifest selected_models.csv \
--out-model-selection-manifest all_models.csv \
--out-models models \
--verbose \
"$@"

bzip2 all_models.csv
cp "$SCRIPT_ABSOLUTE_PATH" .
tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" *

echo "Created archive: $SCRATCH_DIR/$DOWNLOAD_NAME.tar.bz2"
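The `exec > >(tee ...)` lines near the top of GENERATE.sh duplicate the script's stdout and stderr into a LOG.txt shipped inside the archive while still printing to the terminal. A rough Python equivalent of that tee idiom (illustrative sketch only; the `Tee` class and filename are hypothetical, not part of mhcflurry):

```python
import sys

class Tee:
    """Write-through stream that duplicates everything to a logfile."""
    def __init__(self, stream, logfile):
        self.stream = stream
        self.logfile = logfile

    def write(self, text):
        self.stream.write(text)
        self.logfile.write(text)

    def flush(self):
        self.stream.flush()
        self.logfile.flush()

with open("LOG.txt", "w") as log:
    sys.stdout = Tee(sys.__stdout__, log)
    print("environment info would be logged here")
    sys.stdout = sys.__stdout__  # restore the real stdout
```

The shell version has the advantage that child processes (pip, git, the training command) inherit the redirected descriptors automatically, which is why the script uses `exec` rather than piping each command individually.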
@@ -0,0 +1,29 @@
# Class I allele-specific models (ensemble)

This download contains trained MHC Class I allele-specific MHCflurry models. For each allele, an ensemble of predictors is trained on random halves of the training data. Model architectures are selected based on performance on the other half of the dataset, so in general each ensemble contains predictors of different architectures. At prediction time the geometric mean IC50 is taken over the trained models. The training data used is in the [data_combined_iedb_kim2014](../data_combined_iedb_kim2014) MHCflurry download.

The training script supports multi-node parallel execution using the [kubeface](https://github.com/hammerlab/kubeface) library.

To use kubeface, create a Google Storage bucket and pass it via the `--kubeface-storage` argument, as shown below.

To generate this download, run:

```
./GENERATE.sh \
--parallel-backend kubeface \
--target-tasks 200 \
--kubeface-backend kubernetes \
--kubeface-storage gs://kubeface-tim \
--kubeface-worker-image hammerlab/mhcflurry-misc:latest \
--kubeface-kubernetes-task-resources-memory-mb 10000 \
--kubeface-worker-path-prefix venv-py3/bin \
--kubeface-max-simultaneous-tasks 200 \
  --kubeface-speculation-max-reruns 3
```

To debug locally:
```
./GENERATE.sh \
--parallel-backend local-threads \
--target-tasks 1
```
@@ -0,0 +1,24 @@
import sys
from mhcflurry.class1_allele_specific_ensemble import HYPERPARAMETER_DEFAULTS
import json

models = HYPERPARAMETER_DEFAULTS.models_grid(
impute=[False, True],
activation=["tanh"],
layer_sizes=[[12], [64], [128]],
embedding_output_dim=[8, 32, 64],
dropout_probability=[0, .1, .25],
fraction_negative=[0, .1, .2],
n_training_epochs=[250],

# Imputation arguments
impute_method=["mice"],
imputer_args=[
# Arguments specific to imputation method (mice)
{"n_burn_in": 5, "n_imputations": 50, "n_nearest_columns": 25}
],
impute_min_observations_per_peptide=[3],
impute_min_observations_per_allele=[3])

sys.stderr.write("Models: %d\n" % len(models))
print(json.dumps(models, indent=4))
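The `models_grid` call in models.py expands keyword lists into the cross-product of model configurations. A rough stand-alone sketch of that expansion (a hypothetical helper for illustration, not the mhcflurry API — the real `HYPERPARAMETER_DEFAULTS.models_grid` also merges in defaults):

```python
import itertools

def models_grid(**param_lists):
    """Expand lists of hyperparameter values into one dict per
    combination (cross-product), illustrative only."""
    keys = sorted(param_lists)
    return [dict(zip(keys, values))
            for values in itertools.product(*(param_lists[k] for k in keys))]

# A cut-down version of the grid above:
grid = models_grid(
    impute=[False, True],
    layer_sizes=[[12], [64], [128]],
    dropout_probability=[0, .1, .25])
print(len(grid))  # 2 * 3 * 3 = 18 configurations
```

Each resulting dict fully specifies one architecture, which is why the script reports the model count to stderr before serializing the grid as JSON for the training command.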
14 changes: 2 additions & 12 deletions examples/class1_allele_specific_models.ipynb
@@ -2,21 +2,11 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using Theano backend.\n",
"/Users/tim/miniconda3/envs/py3k/lib/python3.5/site-packages/matplotlib/__init__.py:872: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.\n",
" warnings.warn(self.msg_depr % (key, alt_key))\n"
]
}
],
"outputs": [],
"source": [
"import mhcflurry\n",
"import numpy\n",