In [None]:
%%javascript
require(['base/js/utils'], function(utils) {
    utils.load_extensions('usability/ruler/main');
    utils.load_extensions('usability/toc2/main');
});

# Hadoop Installations 2: Single Node

## Usage Notes

This notebook looks at installing a single-node Hadoop cluster on a non-EMR server. If you plan on adding more nodes in the future, you should not use this notebook; skip to the Ambari cluster notebook instead.

## Notebook Imports

In [None]:
from aws_request import *
from aws_util import *

## Check Spot Instance Request

The instances for the application were generated by the previous notebook.

In [None]:
# Look up the request labelled 'app' and fetch the instances that were fulfilled for it.
app_request = InstanceRequest('app')
app_instances = app_request.get_fulfilled()

# Keep only the public DNS name of each instance; later cells address hosts by name.
app_host_names = [
    fulfilled['PublicDnsName']
    for fulfilled in app_instances
]
app_host_names

## Install Hadoop

### Prepare Configuration Files

We'll need to create a directory to store all the configuration files we need.

In [None]:
!mkdir -p conf

Now we create the individual files.

In [None]:
%%writefile conf/core-site.xml
<!-- Core Hadoop settings: use the HDFS name node on this host as the default file system. -->
<configuration>
  <!-- fs.defaultFS is the Hadoop 2.x name for the deprecated fs.default.name key. -->
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://localhost:9000</value>
  </property>
</configuration>

In [None]:
%%writefile conf/yarn-site.xml
<!-- YARN node manager settings: enable the MapReduce shuffle auxiliary service. -->
<configuration>
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
  <!-- The key must embed the service name exactly as listed above (mapreduce_shuffle). -->
  <property>
    <name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
    <value>org.apache.hadoop.mapred.ShuffleHandler</value>
  </property>
</configuration>

In [None]:
%%writefile conf/mapred-site.xml
<!-- MapReduce settings for the single-node install. -->
<configuration>
  <!-- Select YARN as the MapReduce execution framework. -->
  <property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
  </property>
</configuration>

In [None]:
%%writefile conf/hdfs-site.xml
<!--
  HDFS settings for the single-node install.
  The two upper-case values below are placeholders: install_hadoop.sh rewrites them
  with sed to the comma-separated storage directories found on the server.
-->
<configuration>
  <!-- Replication factor of 1. -->
  <property>
    <name>dfs.replication</name>
    <value>1</value>
  </property>
  <!-- Name node storage directories (placeholder, filled in at install time). -->
  <property>
    <name>dfs.namenode.name.dir</name>
    <value>NAME_NODE_FOLDERS</value>
  </property>
  <!-- Data node storage directories (placeholder, filled in at install time). -->
  <property>
    <name>dfs.datanode.data.dir</name>
    <value>DATA_NODE_FOLDERS</value>
  </property>
</configuration>

In [None]:
# Push each generated *-site.xml file to every application host as the 'ubuntu' user.
for site_file in ('core-site.xml', 'yarn-site.xml', 'mapred-site.xml', 'hdfs-site.xml'):
    upload_file('ubuntu', app_host_names, 'conf/' + site_file)

### Install Hadoop

Create a script which will install Hadoop binaries.

In [None]:
%%writefile scripts/install_hadoop.sh
#!/bin/bash
#
# Install a single-node Hadoop (HDFS + YARN) instance on this server.
#
# Must be run from the directory that holds the uploaded *-site.xml files, by a
# user that can sudo. JAVA_HOME is read from the environment after sourcing
# ~/.profile (presumably exported there by an earlier setup step).
source ~/.profile

HADOOP_VERSION=2.7.3
HADOOP_TARBALL=hadoop-$HADOOP_VERSION.tar.gz

# JAVA_HOME is written into hduser's profile and into hadoop-env.sh further down;
# an empty value would produce an install whose daemons cannot start.
if [ -z "$JAVA_HOME" ]; then
    echo "JAVA_HOME is not set; cannot configure Hadoop" >&2
    exit 1
fi

# Add the Hadoop user

sudo addgroup hadoop
sudo adduser --ingroup hadoop --disabled-password --gecos "" hduser

# Download Hadoop (stop here if the download fails, every later step needs it)

if ! wget -qq https://archive.apache.org/dist/hadoop/common/hadoop-$HADOOP_VERSION/$HADOOP_TARBALL; then
    echo "Failed to download $HADOOP_TARBALL" >&2
    exit 1
fi
tar -zxf $HADOOP_TARBALL
sudo mv hadoop-$HADOOP_VERSION /usr/local/lib
sudo chown -R hduser:hadoop /usr/local/lib/hadoop-$HADOOP_VERSION

# Set HADOOP_HOME environment variable

HADOOP_HOME=/usr/local/lib/hadoop-$HADOOP_VERSION

echo >> $HOME/.profile
echo "# Added for MRJob" >> $HOME/.profile
echo export HADOOP_HOME="$HADOOP_HOME" >> $HOME/.profile
echo 'export PATH=$PATH:$HADOOP_HOME/bin' >> $HOME/.profile

# Create name node and data node folders on mount points

for folder in $(ls -1 /hadoop); do
    sudo mkdir -p /hadoop/$folder/hdfs/namenode
    sudo mkdir -p /hadoop/$folder/hdfs/datanode
    sudo chown -R hduser:hadoop /hadoop/$folder
done

# Update configuration to use data node folders.
# paste joins the lines with commas without leaving a trailing separator.

NAME_NODES=$(
    ls -1 /hadoop | awk '{ print "file:/hadoop/" $1 "/hdfs/namenode" }' | paste -sd, -
)

sed -i -e "s@NAME_NODE_FOLDERS@$NAME_NODES@g" hdfs-site.xml

DATA_NODES=$(
    ls -1 /hadoop | awk '{ print "file:/hadoop/" $1 "/hdfs/datanode" }' | paste -sd, -
)

sed -i -e "s@DATA_NODE_FOLDERS@$DATA_NODES@g" hdfs-site.xml

# Move all the -site.xml configuration files to the Hadoop folder

sudo mv *-site.xml $HADOOP_HOME/etc/hadoop
sudo chown hduser:hadoop $HADOOP_HOME/etc/hadoop/*-site.xml

# Enable SSH for the hduser.
# mkdir -p and the key-exists guard keep this section safe to re-run; the chmods
# keep the files within the permissions sshd accepts for key authentication.

sudo su -c 'mkdir -p /home/hduser/.ssh && chmod 700 /home/hduser/.ssh' - hduser
sudo su -c '[ -f /home/hduser/.ssh/id_rsa ] || ssh-keygen -t rsa -P "" -f /home/hduser/.ssh/id_rsa' - hduser
sudo su -c 'cp /home/hduser/.ssh/id_rsa.pub /home/hduser/.ssh/authorized_keys && chmod 600 /home/hduser/.ssh/authorized_keys' - hduser

# Connect once to each address so the host keys are recorded in known_hosts

sudo su -c 'ssh -o StrictHostKeyChecking=no localhost "echo"' - hduser
sudo su -c 'ssh -o StrictHostKeyChecking=no 0.0.0.0 "echo"' - hduser

# Update .profile for hduser

sudo su -c "echo export JAVA_HOME=$JAVA_HOME >> /home/hduser/.profile" - hduser

sudo su -c "echo export HADOOP_INSTALL=$HADOOP_HOME >> /home/hduser/.profile" - hduser
sudo su -c "echo export HADOOP_MAPRED_HOME=$HADOOP_HOME >> /home/hduser/.profile" - hduser
sudo su -c "echo export HADOOP_COMMON_HOME=$HADOOP_HOME >> /home/hduser/.profile" - hduser
sudo su -c "echo export HADOOP_HDFS_HOME=$HADOOP_HOME >> /home/hduser/.profile" - hduser
sudo su -c "echo export YARN_HOME=$HADOOP_HOME >> /home/hduser/.profile" - hduser
# \$PATH stays literal so hduser's own PATH is extended at login, instead of the
# calling user's PATH being frozen into the file; HADOOP_HOME is expanded now
# because hduser's profile does not define it.
sudo su -c "echo 'export PATH=$HADOOP_HOME/bin:\$PATH' >> /home/hduser/.profile" - hduser

# Replace the ${JAVA_HOME} reference in hadoop-env.sh with the concrete path

sudo sed -i -e "s@\${JAVA_HOME}@$JAVA_HOME@g" \
    /usr/local/lib/hadoop-$HADOOP_VERSION/etc/hadoop/hadoop-env.sh

# Format the name node and start DFS and Yarn

sudo su -c "$HADOOP_HOME/bin/hdfs namenode -format" - hduser
sudo su -c "$HADOOP_HOME/sbin/start-dfs.sh" - hduser
sudo su -c "$HADOOP_HOME/sbin/start-yarn.sh" - hduser

Run the script on all servers.

In [None]:
# Run install_hadoop.sh as the 'ubuntu' user on every host in app_host_names.
# NOTE(review): run_script comes from the star imports; presumably it resolves the
# name against scripts/ and executes it remotely - confirm.
run_script('ubuntu', app_host_names, 'install_hadoop.sh')

## Install Spark

In [None]:
%%writefile scripts/install_spark.sh
#!/bin/bash
#
# Install the "without Hadoop" Spark build and point it at the local Hadoop jars.
#
# Needs the `hadoop` command on the PATH after sourcing ~/.profile, so Hadoop
# must already be installed for this user.
source ~/.profile

SPARK_VERSION=1.6.3
SPARK_BUILD=spark-$SPARK_VERSION-bin-without-hadoop

# Download Spark over HTTPS (stop here if the download fails)

if ! wget -qq https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/$SPARK_BUILD.tgz; then
    echo "Failed to download $SPARK_BUILD.tgz" >&2
    exit 1
fi
tar -zxf $SPARK_BUILD.tgz
sudo mv $SPARK_BUILD /usr/local/lib/spark-$SPARK_VERSION

# Set SPARK_HOME environment variable

SPARK_HOME=/usr/local/lib/spark-$SPARK_VERSION

echo >> $HOME/.profile
echo "# Added for Spark" >> $HOME/.profile
echo export SPARK_HOME="$SPARK_HOME" >> $HOME/.profile
echo 'export PATH=$PATH:$SPARK_HOME/bin' >> $HOME/.profile

# Update spark-env.sh with the Hadoop classpath.
# The substitution is quoted so the classpath is written exactly as printed,
# with no word splitting or glob expansion by this shell.

echo "export SPARK_DIST_CLASSPATH=$(hadoop classpath)" >> $SPARK_HOME/conf/spark-env.sh

Run the script on all servers.

In [None]:
# Run install_spark.sh as the 'ubuntu' user on every host in app_host_names.
# NOTE(review): run_script comes from the star imports; presumably it resolves the
# name against scripts/ and executes it remotely - confirm.
run_script('ubuntu', app_host_names, 'install_spark.sh')

## Initialize HDFS

We'll want to make sure that the proper directories exist on HDFS. We need an HDFS home directory for the `ubuntu` user, which is the default location where data is stored in an MRJob, along with a `/tmp` directory that every user can write to.

In [None]:
%%writefile scripts/init_hdfs.sh
#!/bin/bash
#
# Create the HDFS directories used by later jobs: the home directory for the
# ubuntu user and a /tmp directory writable by everyone.
source ~/.profile

# Run one `hdfs dfs` subcommand as hduser; the arguments are joined with spaces
# into the command string handed to su.
hdfs_dfs_as_hduser() {
    sudo su -c "$HADOOP_HOME/bin/hdfs dfs $*" hduser
}

hdfs_dfs_as_hduser -mkdir -p /user/ubuntu
hdfs_dfs_as_hduser -chown ubuntu:ubuntu /user/ubuntu

hdfs_dfs_as_hduser -mkdir -p /tmp
hdfs_dfs_as_hduser -chmod a+rwx /tmp

And now we run the script on our designated Notebook server (the first host in `app_host_names`).

In [None]:
# Run init_hdfs.sh as the 'ubuntu' user on the first host only; the [:1] slice
# keeps the argument a list while selecting a single server.
# NOTE(review): run_script comes from the star imports; presumably it resolves the
# name against scripts/ and executes it remotely - confirm.
run_script('ubuntu', app_host_names[:1], 'init_hdfs.sh')