# Deploy a GPU-enabled AKS cluster

Prerequisites:
1. NVIDIA Docker installed for local testing: https://github.com/NVIDIA/nvidia-docker
2. Azure CLI (`az`) installed and logged in (`az login`)
3. `docker` installed and logged in (`docker login`)

Steps involved:
1. Test that you can take items from the queue locally
2. Test that you can take items from the queue from a locally running Docker container
3. Test that you can take items from the queue from AKS

In [3]:
%load_ext blackcellmagic

In [5]:
from dotenv import set_key, get_key, find_dotenv
from pathlib import Path
import json

In [13]:
# Locate the .env file that stores this notebook's settings; with
# raise_error_if_not_found=True the lookup fails immediately if no file is found.
env_path = find_dotenv(raise_error_if_not_found=True)

## Set up Service Bus and add items to queue

In [7]:
from azure.servicebus import ServiceBusService, Message, Queue
from azure.storage.blob import BlockBlobService
import os

In [None]:
from getpass import getpass


def _read_secret(name, prompt):
    """Return the secret called `name` without ever hardcoding it in the notebook.

    Lookup order: process environment, then the existing .env file, then an
    interactive (non-echoing) prompt. Raises ValueError if nothing is provided.
    """
    secret = os.environ.get(name) or get_key(env_path, name) or getpass(prompt)
    if not secret:
        raise ValueError("No value provided for {}".format(name))
    return secret


# Non-secret resource names; edit these to match your own Azure resources.
sb_service_namespace = "jiatanamespace01"
sb_queue = "batchscoringtest02"
sb_shared_access_key_name = "RootManageSharedAccessKey"
storage_account_name = "jiataakstest"

# Secrets are never written into the notebook source.
# NOTE(review): keys that were previously committed in this cell should be
# treated as leaked and rotated.
sb_shared_access_key_value = _read_secret(
    "SB_SHARED_ACCESS_KEY_VALUE", "Service Bus shared access key value: "
)
storage_account_key = _read_secret("STORAGE_ACCOUNT_KEY", "Storage account key: ")

# Persist everything to .env. A loop (rather than a trailing set_key expression)
# keeps the cell from echoing the last secret into the notebook output.
for _name, _value in [
    ("SB_QUEUE", sb_queue),
    ("SB_SERVICE_NAMESPACE", sb_service_namespace),
    ("SB_SHARED_ACCESS_KEY_NAME", sb_shared_access_key_name),
    ("SB_SHARED_ACCESS_KEY_VALUE", sb_shared_access_key_value),
    ("STORAGE_ACCOUNT_NAME", storage_account_name),
    ("STORAGE_ACCOUNT_KEY", storage_account_key),
]:
    set_key(env_path, _name, _value)


In [21]:
%%writefile add_images_to_queue.py

# service bus creds
bus_service = ServiceBusService(
    service_namespace=os.getenv("SB_SERVICE_NAMESPACE"),
    shared_access_key_name=os.getenv("SB_SHARED_ACCESS_KEY_NAME"),
    shared_access_key_value=os.getenv("SB_SHARED_ACCESS_KEY_VALUE"),
)

# blob creds
block_blob_service = BlockBlobService(
    account_name=os.getenv("STORAGE_ACCOUNT_NAME"),
    account_key=os.getenv("STORAGE_ACCOUNT_KEY"),
)

# list all images in specified blob under directory "/input"
blob_iterator = block_blob_service.list_blobs("aks", prefix="input")

# for all images found, add to queue
for blob in blob_iterator:
    print("adding {} to queue...".format(blob.name.split("/")[-1]))
    msg = Message(blob.name.encode())
    bus_service.send_queue_message(os.getenv("SB_QUEUE"), msg)


Writing add_images_to_queue.py


## Build docker image

In [15]:
# Registry account and repository name for the receiver image; together they
# form the "<docker_login>/<image_repo>" name used when tagging and deploying.
docker_login = "jiata"
image_repo = "batchscoringdl_receiver"
# Persist both to .env so later cells can read them back with get_key.
set_key(env_path, "DOCKER_LOGIN", docker_login)
set_key(env_path, "IMAGE_REPO", image_repo)

(True, 'IMAGE_REPO', 'batchscoringdl_receiver')

In [None]:
%%writefile Dockerfile

# Receiver image: CUDA 9.0 / cuDNN 7 base with a conda Python environment that
# runs process_images_from_queue.py.
FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04

# Register NVIDIA's machine-learning apt repository.
RUN echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list

# System packages; the apt lists are removed in the same layer to keep the image small.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        cmake \
        curl \
        git \
        nginx \
        supervisor \
        wget && \
        rm -rf /var/lib/apt/lists/*

ENV PYTHON_VERSION=3.6
# Install Miniconda and create the py$PYTHON_VERSION environment.
# -f fails the build on an HTTP error instead of saving an error page,
# -L follows redirects from the download host, and a single -o names the output
# (the original passed both -o and -O).
RUN curl -fsSL -o ~/miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
    chmod +x ~/miniconda.sh && \
    ~/miniconda.sh -b -p /opt/conda && \
    rm ~/miniconda.sh && \
    /opt/conda/bin/conda create -y --name py$PYTHON_VERSION python=$PYTHON_VERSION && \
    /opt/conda/bin/conda clean -ya
ENV PATH /opt/conda/envs/py$PYTHON_VERSION/bin:$PATH
ENV LD_LIBRARY_PATH /opt/conda/envs/py$PYTHON_VERSION/lib:/usr/local/cuda/lib64/:$LD_LIBRARY_PATH
ENV PYTHONPATH /code/:$PYTHONPATH

RUN mkdir /app
WORKDIR /app

# Install dependencies before copying the application code so this layer stays
# cached when only the code changes. COPY is preferred over ADD for plain local files.
COPY receiver_app/requirements.txt /app/
RUN pip install --no-cache-dir -r requirements.txt

COPY receiver_app/process_images_from_queue.py receiver_app/style_transfer.py /app/

CMD ["python", "process_images_from_queue.py"]

In [None]:
# Build the image from the Dockerfile in the current directory, named after image_repo.
!sudo docker build -t $image_repo .

In [None]:
# Tag the local image as <docker_login>/<image_repo> so it can be pushed under that account.
!sudo docker tag $image_repo $docker_login/$image_repo

In [None]:
# Push the tagged image to the registry (requires a prior `docker login`).
!sudo docker push $docker_login/$image_repo

## Create AKS Deployment

In [16]:
# Azure resource group and AKS cluster names used for this deployment.
resource_group = "jiataakstest"
aks_cluster = "jiataakstest-gpu"
# Persist both to .env alongside the other settings.
set_key(env_path, "RESOURCE_GROUP", resource_group)
set_key(env_path, "AKS_CLUSTER", aks_cluster)

(True, 'AKS_CLUSTER', 'jiataakstest-gpu')

In [17]:
def _require_env_value(name):
    """Return the value stored under `name` in the .env file.

    Raises KeyError when the key is missing or empty, so a half-configured .env
    is caught here instead of producing a manifest with null values.
    """
    value = get_key(env_path, name)
    if not value:
        raise KeyError("{} is not set in {}".format(name, env_path))
    return value


# Settings copied verbatim from .env into the container's environment.
# NOTE(review): this embeds the Service Bus and storage keys in plain text in
# the manifest; consider a Kubernetes Secret instead.
_forwarded_settings = [
    "SB_SERVICE_NAMESPACE",
    "SB_QUEUE",
    "SB_SHARED_ACCESS_KEY_NAME",
    "SB_SHARED_ACCESS_KEY_VALUE",
    "STORAGE_ACCOUNT_NAME",
    "STORAGE_ACCOUNT_KEY",
]

# Kubernetes Deployment manifest for the queue receiver: three replicas, one GPU each.
receiver_json = {
    # NOTE(review): apps/v1beta1 is no longer served by newer Kubernetes releases;
    # moving to apps/v1 also requires adding spec.selector. Confirm the cluster version.
    "apiVersion": "apps/v1beta1",
    "kind": "Deployment",
    "metadata": {
        "name": "receiver",
        "labels": {
            "purpose": "receive_queue_messages"
        }
    },
    "spec": {
        "replicas": 3,
        "template": {
            "metadata": {
                "labels": {
                    "app": "receiver"
                }
            },
            "spec": {
                "containers": [
                    {
                        "name": "receiver",
                        "image": "{}/{}:latest".format(
                            _require_env_value("DOCKER_LOGIN"),
                            _require_env_value("IMAGE_REPO"),
                        ),
                        # Mount the host's NVIDIA driver directory into the container.
                        "volumeMounts": [
                            {
                                "mountPath": "/usr/local/nvidia",
                                "name": "nvidia"
                            }
                        ],
                        # NOTE(review): alpha.kubernetes.io/nvidia-gpu is the legacy
                        # resource name; newer clusters expose nvidia.com/gpu. Confirm.
                        "resources": {
                            "requests": {
                                "alpha.kubernetes.io/nvidia-gpu": 1
                            },
                            "limits": {
                                "alpha.kubernetes.io/nvidia-gpu": 1
                            },
                        },
                        # NOTE(review): 433 may be a typo for 443; confirm whether the
                        # receiver needs to expose a port at all.
                        "ports": [{
                            "containerPort": 433
                        }],
                        "env": [
                            {
                                # Was misspelled "LB_LIBRARY_PATH", so it never reached
                                # the dynamic linker. Kubernetes does not expand
                                # "$LD_LIBRARY_PATH" in env values, so the directories
                                # are listed explicitly: the conda and CUDA entries set
                                # in the Dockerfile plus the mounted driver libraries.
                                "name": "LD_LIBRARY_PATH",
                                "value": "/opt/conda/envs/py3.6/lib:/usr/local/cuda/lib64:/usr/local/nvidia/lib64",
                            },
                            {
                                "name": "DP_DISABLE_HEALTHCHECKS",
                                "value": "xids"
                            },
                        ] + [
                            {"name": setting, "value": _require_env_value(setting)}
                            for setting in _forwarded_settings
                        ],
                    }
                ],
                "volumes": [
                    {
                        "name": "nvidia",
                        "hostPath": {
                            "path": "/usr/local/nvidia"
                        }
                    }
                ],
            },
        },
    },
}

In [18]:
# Serialize the Deployment manifest (4-space indent, keys sorted) followed by
# two trailing newlines, then write it to receiver.json for kubectl.
manifest_text = json.dumps(receiver_json, indent=4, sort_keys=True) + '\n\n'
with open("receiver.json", "w") as manifest_file:
    manifest_file.write(manifest_text)

In [19]:
# Create the receiver Deployment from the receiver.json manifest.
!kubectl create -f receiver.json

deployment.apps/receiver created


In [20]:
# List the pods to check that the receiver replicas are starting.
!kubectl get pods

NAME                          READY   STATUS              RESTARTS   AGE
receiver-77dd86c478-6kqhn     0/1     ContainerCreating   0          2s
receiver-77dd86c478-skd4v     0/1     ContainerCreating   0          2s
receiver-77dd86c478-vlzfs     0/1     ContainerCreating   0          2s
samples-tf-mnist-demo-6p4bv   0/1     Pending             0          3h26m


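Run `az aks browse -n <aks_cluster> -g <resource_group>` in your terminal, substituting the cluster and resource group names set above (for example `az aks browse -n jiataakstest-gpu -g jiataakstest`), so that you can use the Kubernetes Dashboard. If you're not able to access the dashboard, follow the instructions here: https://blog.tekspace.io/kubernetes-dashboard-remote-access/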