In [None]:
%%javascript
require(['base/js/utils'], function(utils) {
    utils.load_extensions('usability/ruler/main');
    utils.load_extensions('usability/toc2/main');
});

# Notebook Installations 3: Zeppelin

## Usage Notes

Now, we'll install Zeppelin onto one of our application servers so that we can actually submit jobs to the Ambari cluster. Note that you do not need to use this script if you installed everything via EMR.

https://zeppelin.incubator.apache.org

## Notebook Imports

In [None]:
from aws_request import *
from aws_util import *
import subprocess

## Check Spot Instance Request

The instances for the application were generated by the previous notebook.

In [None]:
# Look up the fulfilled instances behind the 'app' instance request.
app_request = InstanceRequest('app')
app_instances = app_request.get_fulfilled()

# Collect the public DNS name of every fulfilled instance, in order.
app_host_names = []
for instance in app_instances:
    app_host_names.append(instance['PublicDnsName'])

# Bare expression so the notebook displays the resulting list.
app_host_names

## Specify SSH User

If you created your cluster through EMR, the user is `hadoop`. If you created your cluster as a standard EC2 instance using this notebook series, the user is `ubuntu`.

In [None]:
# SSH login used for every remote call below:
# 'hadoop' for EMR clusters, 'ubuntu' for EC2 instances created by this notebook series.
user_name = 'hadoop'

## Specify S3 Bucket

Specify the S3 bucket that contains all of our installation files. Your current user will need read access to this bucket.

In [None]:
# Name of the S3 bucket holding the installation files; change this to your own bucket.
bucket_name = 'mdang.lesa'

Here, we'll make sure that our bucket is registered to our application servers.

In [None]:
# Register bucket_name on every application host, connecting as user_name.
# NOTE(review): set_bucket comes from the star imports; presumably it makes the
# bucket available remotely as $S3_BUCKET (the install script reads that) -- confirm.
set_bucket(user_name, app_host_names, bucket_name)

## Identify Server Tasks

The following decides which host to use for Zeppelin.

In [None]:
# Zeppelin runs on the last application host in the list.
zeppelin_host_name = app_host_names[-1]

# Single-argument print: same output as the two-item print statement.
print('Zeppelin ' + zeppelin_host_name)

## ZeppelinHub Token

You may want to integrate with ZeppelinHub if you want to preview how your notebook will look using the ZeppelinHub viewer. You can do that by following the instructions below.

http://help.zeppelinhub.com/zeppelin_integration/

If you're not interested in this integration, leave `zeppelinhub_api_token` set to `None`. Otherwise, update the JAR version to whatever version is linked at the above URL and provide the API token corresponding to your ZeppelinHub instance.

In [None]:
# File name of the ZeppelinHub integration JAR; exported as ZEPPELINHUB_JAR in zeppelin_hub.sh.
zeppelinhub_jar = 'zeppelinhub-integration-v0.4.0-all.jar'
# ZeppelinHub API token; leave as None to skip exporting ZEPPELINHUB_API_TOKEN.
zeppelinhub_api_token = None

In [None]:
# Assemble the export lines for zeppelin_hub.sh; the token line is only
# included when a token has been configured.
hub_export_lines = ['export ZEPPELINHUB_JAR=' + zeppelinhub_jar]
if zeppelinhub_api_token is not None:
    hub_export_lines.append('export ZEPPELINHUB_API_TOKEN=' + zeppelinhub_api_token)

# Write one export per line, each terminated by a newline.
with open('zeppelin_hub.sh', 'w') as zeppelin_hub:
    for hub_export_line in hub_export_lines:
        zeppelin_hub.write(hub_export_line + '\n')

# Send the generated file to the Zeppelin host only.
upload_file(user_name, [zeppelin_host_name], 'zeppelin_hub.sh')

## Install Zeppelin

Create a script which will install Zeppelin binaries.

In [None]:
%%writefile scripts/install_zeppelin.sh
#!/bin/bash
#
# Install and configure Zeppelin relative to the current directory, then
# restart its daemon.
#
# conf/zeppelin-env.sh is rewritten from scratch on every run (the first write
# truncates it, every later write appends). The detected HDP version, or an
# empty file when hdp-select is not installed, is left in hdp_version.txt.
#
# Expects S3_BUCKET, JAVA_HOME, SPARK_HOME and HADOOP_HOME in the environment
# (presumably provided by ~/.profile -- confirm), and ZEPPELINHUB_JAR plus an
# optional ZEPPELINHUB_API_TOKEN from ~/zeppelin_hub.sh.
source ~/.profile

ZEPPELIN_VERSION=0.5.6-incubating

# Paths derived from the version, so each is spelled out only once.
INSTALL_DIR=zeppelin-$ZEPPELIN_VERSION-bin-all
TARBALL=$INSTALL_DIR.tgz
ENV_FILE=$INSTALL_DIR/conf/zeppelin-env.sh

# Download Zeppelin

if [ ! -f $TARBALL ]; then
    aws s3 cp s3://$S3_BUCKET/zeppelin/$TARBALL .
    tar -zxf $TARBALL
fi

# Set JAVA_HOME (the single '>' truncates any env file left by a previous run)

echo "export JAVA_HOME=$JAVA_HOME" > $ENV_FILE

chmod u+x $ENV_FILE

# Set memory options
#
# Each redirect sits on the same line as its echo. Previously the redirect was
# on a separate line without a trailing backslash, so the export was printed to
# stdout and never reached the env file.

echo 'export ZEPPELIN_MEM="-Xms2g -Xmx2g"' >> $ENV_FILE

echo 'export ZEPPELIN_INTP_MEM="-Xms2g -Xmx2g"' >> $ENV_FILE

# Enable ZeppelinHub integration

source ~/zeppelin_hub.sh

# Fetch the integration JAR only when it is not already in Zeppelin's lib directory.
if [ ! -f $INSTALL_DIR/lib/$ZEPPELINHUB_JAR ]; then
    wget -qq https://s3-ap-northeast-1.amazonaws.com/zeppel.in/$ZEPPELINHUB_JAR
    mkdir -p $INSTALL_DIR/lib
    mv $ZEPPELINHUB_JAR $INSTALL_DIR/lib
fi

# Only configure the ZeppelinHub notebook repository when a token was supplied.
if [ "" != "$ZEPPELINHUB_API_TOKEN" ]; then
    DEFAULT_REPO=org.apache.zeppelin.notebook.repo.VFSNotebookRepo
    ZEPPELINHUB_REPO=com.nflabs.zeppelinhub.notebook.repo.ZeppelinHubRepo

    echo "export ZEPPELIN_NOTEBOOK_STORAGE=\"$DEFAULT_REPO, $ZEPPELINHUB_REPO\"" >> $ENV_FILE

    echo export ZEPPELINHUB_API_ADDRESS=https://www.zeppelinhub.com >> $ENV_FILE

    echo export ZEPPELINHUB_API_TOKEN="$ZEPPELINHUB_API_TOKEN" >> $ENV_FILE
fi

# Set SPARK_HOME

echo "export SPARK_HOME=$SPARK_HOME" >> $ENV_FILE

echo "export PYSPARK_PYTHON=$(which python)" >> $ENV_FILE

# Single quotes on the next two lines: $(find ...), $SPARK_HOME and $PY4J_ZIP
# are written literally and expanded later, when zeppelin-env.sh is sourced.

echo 'export PY4J_ZIP=$(find -L $SPARK_HOME -name py4j*.zip)' >> $ENV_FILE

echo 'export SPARK_YARN_USER_ENV="PYTHONPATH=$SPARK_HOME/python:$PY4J_ZIP"' >> $ENV_FILE

# Set extra options

if [ "" != "$(which hdp-select)" ]; then
    # hdp-select is present: run against YARN and pin the HDP version.
    export HDP_VERSION=$(hdp-select status hadoop-client | cut -d" " -f 3)

    echo $HDP_VERSION > hdp_version.txt

    echo "export MASTER=yarn-client" >> $ENV_FILE

    echo "export ZEPPELIN_JAVA_OPTS=-Dhdp.version=$HDP_VERSION" >> $ENV_FILE

    echo "export HADOOP_CONF_DIR=$HADOOP_HOME/conf" >> $ENV_FILE
else
    # No hdp-select: leave hdp_version.txt empty and run Spark locally.
    cat /dev/null > hdp_version.txt

    echo "export MASTER=local" >> $ENV_FILE

    echo "export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop" >> $ENV_FILE
fi

# Start the Zeppelin daemon (stop first so a running instance picks up the new env file)

$INSTALL_DIR/bin/zeppelin-daemon.sh stop
$INSTALL_DIR/bin/zeppelin-daemon.sh start

Run the script on the Zeppelin server.

In [None]:
# Run the install script on the Zeppelin host only (a one-element host list).
# NOTE(review): the script was written to scripts/install_zeppelin.sh; presumably
# run_script resolves script names relative to scripts/ -- confirm.
run_script(user_name, [zeppelin_host_name], 'install_zeppelin.sh')

## Access Notebook GUI

In [None]:
# Zeppelin's web UI address on the chosen host (port 8080).
zeppelin_url = 'http://' + zeppelin_host_name + ':8080/'

print('Zeppelin Server:')
print(zeppelin_url)

## Configure Spark Interpreter

If you're using an Ambari-based installation of Hadoop, there are some additional steps required where you must specify the Hortonworks Data Platform (HDP) version for `spark.driver.extraJavaOptions` and `spark.yarn.am.extraJavaOptions`.

The following is the applicable documentation for HDP 2.4 and Zeppelin 0.5.6, which were current as of this writing.

* http://hortonworks.com/hadoop-tutorial/apache-zeppelin-hdp-2-4/
* https://zeppelin.incubator.apache.org/docs/0.5.6-incubating/install/yarn_install.html

You will need the specific HDP version, which can be found by running the cell below.

In [None]:
# Read hdp_version.txt (written by the install script) from the Zeppelin host over SSH.
# NOTE(review): private_key_location is not defined in this notebook; presumably
# it is provided by one of the star imports -- confirm.
ssh_target = user_name + '@' + zeppelin_host_name
ssh_command = ['ssh', '-i', private_key_location, ssh_target, 'cat hdp_version.txt']

hdp_client_status = subprocess.check_output(ssh_command).strip()

if hdp_client_status:
    # Both Spark interpreter properties take the same -Dhdp.version value.
    for spark_property in ('spark.driver.extraJavaOptions', 'spark.yarn.am.extraJavaOptions'):
        print(spark_property + '\t-Dhdp.version=' + hdp_client_status)
else:
    # An empty file means the install script did not find hdp-select.
    print('Not using Hortonworks Data Platform')