# Deepvariant

Notebook for running DeepVariant training on K8s

This is a work in progress

This notebook assumes you have followed the README to set up your cluster.

In [7]:
# Set some variables
# The bucket on GCS where data is stored.
import datetime
import os
import yaml
# GCS bucket where the input data and the experiment outputs are stored.
BUCKET = "cloud-ml-dev_jlewi_deep_variant"
# Where the NFS share is mounted
NFS_MOUNT_POINT = "/home/jovyan/deepvariant-pd/"
# The directory within the NFS share where data is stored
DATA_DIR = "deepvariant/data"
# The local directory where the data can be found.
LOCAL_DATA_DIR = os.path.join(NFS_MOUNT_POINT, DATA_DIR)

# GCP project, GKE cluster and zone passed to gcloud when fetching cluster credentials.
PROJECT = "cloud-ml-dev"
CLUSTER = "gke-tf-example"
ZONE = "us-east1-d"

# GCS directory to use for this run. Derived from BUCKET so the two cannot
# drift apart when the bucket is changed.
GCS_DIR = "gs://{0}/experiments/2017_1210".format(BUCKET)
# GCS location of the dataset config that the training job reads.
GCS_DATASET_CONFIG = os.path.join(GCS_DIR, "examples", "dataset_config.pbtxt")

# The configs dumped in this notebook are passed to helm as values files, so we
# want plain YAML without anchor/alias references.
# See https://stackoverflow.com/questions/21016220/is-it-possible-to-emit-valid-yaml-with-anchors-references-disabled-using-ruby
class ExplicitDumper(yaml.SafeDumper):
  """A SafeDumper that will never emit aliases.

  Pass it as ``Dumper=ExplicitDumper`` to ``yaml.dump``.
  """

  def ignore_aliases(self, data):
    """Return True for every ``data`` so no object is ever emitted as an alias."""
    return True

## Setup

### Setup Helm

In [57]:
# Set up helm: download the v2.7.2 linux-amd64 release tarball, unpack it under
# /tmp and move the helm binary into the home directory. Later cells invoke it
# as ~/helm.
# TODO(jlewi): I should build a Docker image with everything we need.
!wget -O /tmp/helm-v2.7.2-linux-amd64.tar.gz https://storage.googleapis.com/kubernetes-helm/helm-v2.7.2-linux-amd64.tar.gz
!tar -C /tmp -xvf /tmp/helm-v2.7.2-linux-amd64.tar.gz
!mv /tmp/linux-amd64/helm ~/

--2017-12-10 01:24:13--  https://storage.googleapis.com/kubernetes-helm/helm-v2.7.2-linux-amd64.tar.gz
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.141.128, 2607:f8b0:400c:c06::80
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.141.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12166338 (12M) [application/x-tar]
Saving to: ‘/tmp/helm-v2.7.2-linux-amd64.tar.gz’


2017-12-10 01:24:13 (88.3 MB/s) - ‘/tmp/helm-v2.7.2-linux-amd64.tar.gz’ saved [12166338/12166338]

linux-amd64/
linux-amd64/README.md
linux-amd64/LICENSE
linux-amd64/helm


### Configure Kubectl


In [60]:
# Fetch credentials for the GKE cluster; this writes a kubeconfig entry for
# CLUSTER using the PROJECT and ZONE set in the configuration cell.
!gcloud --project={PROJECT} container clusters get-credentials --zone={ZONE} {CLUSTER}

Fetching cluster endpoint and auth data.
kubeconfig entry generated for gke-tf-example.


## Copy Data onto NFS share

* The data is most likely stored on GCS
* We need to copy it to the NFS share because make_examples can't read from or write to GCS

In [82]:
# Copy the reference genome from gcs to our NFS share
!mkdir -p {NFS_DIR}/reference
!gsutil cp gs://{BUCKET}/reference/GRCh38_Verily_v1.genome.fa {LOCAL_DATA_DIR}/reference
!gsutil cp gs://{BUCKET}/reference/GRCh38_Verily_v1.genome.fa.fai {LOCAL_DATA_DIR}/reference

Copying gs://cloud-ml-dev_jlewi_deep_variant/reference/GRCh38_Verily_v1.genome.fa.fai...
/ [0 files][    0.0 B/120.3 KiB]                                                / [1 files][120.3 KiB/120.3 KiB]                                                
Operation completed over 1 objects/120.3 KiB.                                    


In [84]:
!mkdir -p {LOCAL_DATA_DIR}/FAKE_FLOWCELL/GRCh38/HG001-NA12878-pFDA/chr20/
#!gsutil cp gs://verily-analysis-precision-fda/FAKE_FLOWCELL/GRCh38/HG001-NA12878-pFDA/chr20/realigned.bam {LOCAL_DATA_DIR}/FAKE_FLOWCELL/GRCh38/HG001-NA12878-pFDA/chr20/
!gsutil cp gs://verily-analysis-precision-fda/FAKE_FLOWCELL/GRCh38/HG001-NA12878-pFDA/chr20/realigned.bai {LOCAL_DATA_DIR}/FAKE_FLOWCELL/GRCh38/HG001-NA12878-pFDA/chr20/

Copying gs://verily-analysis-precision-fda/FAKE_FLOWCELL/GRCh38/HG001-NA12878-pFDA/chr20/realigned.bai...
/ [0 files][    0.0 B/211.8 KiB]                                                / [1 files][211.8 KiB/211.8 KiB]                                                
Operation completed over 1 objects/211.8 KiB.                                    


In [22]:
# List the objects available for the chr20 sample in the source bucket.
!gsutil ls gs://verily-analysis-precision-fda/FAKE_FLOWCELL/GRCh38/HG001-NA12878-pFDA/chr20/

gs://verily-analysis-precision-fda/FAKE_FLOWCELL/GRCh38/HG001-NA12878-pFDA/chr20/base_recalibration.table
gs://verily-analysis-precision-fda/FAKE_FLOWCELL/GRCh38/HG001-NA12878-pFDA/chr20/final.g.vcf
gs://verily-analysis-precision-fda/FAKE_FLOWCELL/GRCh38/HG001-NA12878-pFDA/chr20/final.g.vcf.idx
gs://verily-analysis-precision-fda/FAKE_FLOWCELL/GRCh38/HG001-NA12878-pFDA/chr20/realigned.bai
gs://verily-analysis-precision-fda/FAKE_FLOWCELL/GRCh38/HG001-NA12878-pFDA/chr20/realigned.bam
gs://verily-analysis-precision-fda/FAKE_FLOWCELL/GRCh38/HG001-NA12878-pFDA/chr20/realignment-targets.interval_list
gs://verily-analysis-precision-fda/FAKE_FLOWCELL/GRCh38/HG001-NA12878-pFDA/chr20/recalibrated.bai
gs://verily-analysis-precision-fda/FAKE_FLOWCELL/GRCh38/HG001-NA12878-pFDA/chr20/recalibrated.bam


In [85]:
# List what is under LOCAL_DATA_DIR to verify that the reference and reads
# were copied onto the NFS share.
!find {LOCAL_DATA_DIR} -name "*"

/home/jovyan/deepvariant-pd/deepvariant/data
/home/jovyan/deepvariant-pd/deepvariant/data/reference
/home/jovyan/deepvariant-pd/deepvariant/data/reference/GRCh38_Verily_v1.genome.fa.fai
/home/jovyan/deepvariant-pd/deepvariant/data/reference/GRCh38_Verily_v1.genome.fa
/home/jovyan/deepvariant-pd/deepvariant/data/FAKE_FLOWCELL
/home/jovyan/deepvariant-pd/deepvariant/data/FAKE_FLOWCELL/GRCh38
/home/jovyan/deepvariant-pd/deepvariant/data/FAKE_FLOWCELL/GRCh38/HG001-NA12878-pFDA
/home/jovyan/deepvariant-pd/deepvariant/data/FAKE_FLOWCELL/GRCh38/HG001-NA12878-pFDA/chr20
/home/jovyan/deepvariant-pd/deepvariant/data/FAKE_FLOWCELL/GRCh38/HG001-NA12878-pFDA/chr20/realigned.bai
/home/jovyan/deepvariant-pd/deepvariant/data/FAKE_FLOWCELL/GRCh38/HG001-NA12878-pFDA/chr20/realigned.bam


## Run make examples

* We use the helm chart to run make examples on the data.


In [97]:
import os
import yaml

# Create a config file that specifies the shards.
# MOUNT_DIR is the directory in the pods where the NFS share is mounted.
# It is set in the chart.
MOUNT_DIR = "/mnt/biotensorflow"
config = {
    "reference": os.path.join(MOUNT_DIR, DATA_DIR, "reference/GRCh38_Verily_v1.genome.fa"),
    "shards": [
        {
            "reads": os.path.join(MOUNT_DIR, DATA_DIR, "FAKE_FLOWCELL/GRCh38/HG001-NA12878-pFDA/chr20/realigned.bam"),
            "examples": os.path.join(MOUNT_DIR, DATA_DIR, "test_output/chr20.tfrecord.20170815.gz"),
        },
    ],
}

# Make sure the output directory of every shard exists on the NFS share.
# NOTE: local_output_dir is reused by later cells; after the loop it refers to
# the output directory of the last shard.
for shard in config["shards"]:
    # Translate the in-pod path into a path relative to the root of the share.
    # os.path.relpath removes MOUNT_DIR as a path prefix. str.lstrip(MOUNT_DIR)
    # would instead strip any leading characters that occur in MOUNT_DIR and can
    # eat into the first path component (e.g. "models/..." -> "dels/...").
    shard_rpath = os.path.relpath(shard["examples"], MOUNT_DIR)
    local_output_dir = os.path.join(NFS_MOUNT_POINT, os.path.dirname(shard_rpath))
    if not os.path.exists(local_output_dir):
        print("Creating %s" % local_output_dir)
        os.makedirs(local_output_dir)

# Values file passed to `helm install` for the make-examples chart.
CONFIG_FILE = "/tmp/make_examples_config.yaml"

# ExplicitDumper keeps aliases out of the emitted YAML.
config_yaml = yaml.dump(config, Dumper=ExplicitDumper, default_flow_style=False)

with open(CONFIG_FILE, "w") as hf:
    hf.write(config_yaml)

Creating /home/jovyan/deepvariant-pd/deepvariant/data/test_output


In [79]:
# Install the make-examples chart as release "make-test", using the shard config
# written to CONFIG_FILE by the previous cell.
# NOTE: CONFIG_FILE is reassigned by the training config cell further down, so
# re-run the previous cell first if cells have been executed out of order.
!~/helm install --name=make-test ./charts/make-examples -f {CONFIG_FILE}

NAME:   make-test
LAST DEPLOYED: Sun Dec 10 01:43:32 2017
NAMESPACE: default
STATUS: DEPLOYED

RESOURCES:
==> v1/Job
NAME                    DESIRED  SUCCESSFUL  AGE
make-examples-akw4fg-0  1        0           0s

==> v1/Pod(related)
NAME                          READY  STATUS             RESTARTS  AGE
make-examples-akw4fg-0-k26mz  0/1    ContainerCreating  0         0s




## Run Training

### Prepare the dataset config

In [102]:
# Decompress the examples produced by make-examples; -k keeps the original .gz,
# which is what gets uploaded to GCS later. The uncompressed copy is only used
# to count the records. Relies on local_output_dir from the make-examples config cell.
!gunzip -k {local_output_dir}/chr20.tfrecord.20170815.gz

In [105]:
# Check that both the compressed and the uncompressed examples files are present.
!ls -la {local_output_dir}

total 27327112
drwxr-xr-x 2 jovyan     users             4096 Dec 10 23:14 .
drwxr-xr-x 6 jovyan     users             4096 Dec 10 02:03 ..
-rw-r--r-- 1 jovyan     users      26441264520 Dec 10 07:16 chr20.tfrecord.20170815
-rw-r--r-- 1 4294967294 4294967294  1541678261 Dec 10 07:16 chr20.tfrecord.20170815.gz


In [None]:
import tensorflow as tf

# Uncompressed TFRecord file in the make-examples output directory.
examples_file = os.path.join(local_output_dir, "chr20.tfrecord.20170815")

# Stream through the file once and count the records; the total is written into
# the dataset config as num_examples.
num_examples = sum(1 for _ in tf.python_io.tf_record_iterator(examples_file))

print("num_examples=%s" % num_examples)

In [122]:
# GCS location the gzipped examples are uploaded to; it is also recorded as
# tfrecord_path in the dataset config.
GCS_EXAMPLES_PATH = os.path.join(GCS_DIR, "examples", "chr20.tfrecord.20170815.gz")

In [125]:
# Copy the gzipped examples from the NFS share to GCS_EXAMPLES_PATH, the path the
# dataset config points at.
!gsutil cp {local_output_dir}/chr20.tfrecord.20170815.gz {GCS_EXAMPLES_PATH}

Copying file:///home/jovyan/deepvariant-pd/deepvariant/data/test_output/chr20.tfrecord.20170815.gz [Content-Type=application/octet-stream]...
/ [0 files][    0.0 B/  1.4 GiB]                                                ==> NOTE: You are uploading one or more large file(s), which would run
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

-- [0 files][ 82.2 MiB/  1.4 GiB]                                                \||

In [128]:
# Create the dataset config file.
import datetime
now = datetime.datetime.now()

# TODO(jlewi): We could put this on GCS 
dataset_config_rpath = os.path.join(DATA_DIR, "experiments", now.strftime("%Y%m%d_%H%M%S"), "dataset_config.pbtxt")

# TODO(jlewi): It would be better if we loaded up the protocol buffer definition from the DeepVariant soure repo as 
# opposed to manually writing the ASCII version directly to a file.
local_dataset_file = os.path.join(NFS_MOUNT_POINT, dataset_config_rpath)

local_dir = os.path.dirname(local_dataset_file)
if not os.path.exists(local_dir):
    os.makedirs(local_dir)
    
# If we install the appropriate GCS client libraries we can write directly to GCS    
with open(local_dataset_file, "w") as hf:
    # TODO(jlewi): What name should we use
    hf.write('name: "some-name"\n')    
    hf.write('tfrecord_path: "{0}"\n'.format(GCS_EXAMPLES_PATH))
    hf.write('num_examples: {0}\n'.format(num_examples))


In [129]:
# Upload the dataset config to GCS_DATASET_CONFIG, the location the training
# config passes to the job as dataset_config.
!gsutil cp {local_dataset_file} {GCS_DATASET_CONFIG}

Copying file:///home/jovyan/deepvariant-pd/deepvariant/data/experiments/20171210_235851/dataset_config.pbtxt [Content-Type=application/octet-stream]...
/ [0 files][    0.0 B/  151.0 B]                                                / [1 files][  151.0 B/  151.0 B]                                                
Operation completed over 1 objects/151.0 B.                                      


In [130]:
# Print the uploaded dataset config to verify its contents.
!gsutil cat {GCS_DATASET_CONFIG}

name: "some-name"
tfrecord_path: "gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/chr20.tfrecord.20170815.gz"
num_examples: 170538


In [9]:
import datetime

# NOTE(review): `now` is not used by anything else in this cell - confirm whether
# it is still needed.
now = datetime.datetime.now()

# Values file passed to `helm install` for the training chart.
CONFIG_FILE = "/tmp/model_train_config.yaml"

# Settings for the training chart: the images to run, where the model is
# written, the dataset config to read, and the number of replicas of each kind.
model_dir = os.path.join(GCS_DIR, "model")
config = {
    "cpu_image": "gcr.io/deepvariant-docker/deepvariant:0.4.0",
    "gpu_image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0",
    "train_dir": model_dir,
    "dataset_config": GCS_DATASET_CONFIG,
    "num_ps": 1,
    "num_workers": 1,
}

# Serialize in block style; ExplicitDumper keeps aliases out of the output.
config_yaml = yaml.dump(config, Dumper=ExplicitDumper, default_flow_style=False)

with open(CONFIG_FILE, "w") as hf:
    hf.write(config_yaml)

In [12]:
# Install the training chart as release "train-dv", using the values written to
# CONFIG_FILE by the previous cell. CONFIG_FILE must point at the training
# config here, not at the make-examples config written earlier in the notebook.
!~/helm install --name=train-dv ./charts/dv2-train/ -f {CONFIG_FILE}

NAME:   train-dv
LAST DEPLOYED: Mon Dec 11 01:00:31 2017
NAMESPACE: default
STATUS: DEPLOYED

RESOURCES:
==> v1alpha1/TfJob
NAME      AGE
train-dv  0s


NOTES:
1. TODO(jlewi): Explain how to get the URl of TensorBoard.



In [11]:
# To clean up the job after it finishes: delete the "train-dv" release.
# --purge also removes the release record (Helm v2), so the name can be reused
# by a later `helm install --name=train-dv`.
!~/helm delete --purge train-dv

release "train-dv" deleted
