# AWS EMR Launcher
Welcome to the DSA Hadoop Launcher. Here you will add your key and a couple of parameters to deploy your own Hadoop stack on AWS.

Please set the parameters in the "SET THE FOLLOWING PARAMETERS" box below and run the first cell.

In [None]:
# Launch an EMR cluster with a Jupyter Notebook
# Every value in the box below is read by later cells; edit them here only.

################################### SET THE FOLLOWING PARAMETERS ###################################################
#Set the AWS Region (passed to both boto3 clients created below)
region = 'us-west-2'

#Set the AWS Access ID (Given to you by the DSA staff)
access_id = '_____________'

#Set the AWS Access Key (Given to you by the DSA staff)
access_key = '__________________________'

#Set the Size of the AWS EC2 Instances (used for both the master and the core instance group)
instance_size = 'm3.xlarge'

#Set the Number of Master Instances
master_instances = 1

#Set the Number of Slave Instances
slave_instances = 2

#Dataset Provided By Your Instructor
# dataset_location is fetched with curl on the cluster and stored in HDFS as Datasets/<dataset_file_name>
dataset_location = 'https://s3-us-west-2.amazonaws.com/dataset-store/amazon_reviews/reviews.json'
dataset_file_name = 'reviews.json'

#Folder Name for Notebooks Transferring to AWS (relative to the current working directory)
load_notebook_location = 'notebooks'
####################################################################################################################


#Import AWS Tools
import boto3

#Establish The EMR Session
# Client for the Elastic MapReduce API, authenticated with the keys set above.
emr = boto3.client('emr', region_name=region, aws_access_key_id=access_id, aws_secret_access_key=access_key)

#Establish The EC2 Session
# Client for the EC2 API (key pairs and security groups), same region and keys.
ec2 = boto3.client('ec2', region_name=region, aws_access_key_id=access_id, aws_secret_access_key=access_key)


In [None]:
import sys
sys.path.append(".") # add current folder for search path to find fabric

#Import System Tools
import json
import os
import time
import getpass
from subprocess import call
import fabric

In [None]:


#Set important Variables
# Login name of the local user; later cells use it to label the keypair, the cluster and the log bucket.
system_user_name=getpass.getuser()
# Current working directory when this cell runs; later cells build the .pem path and the results folder from it.
wk_dir=os.getcwd()

# Show the two client objects created in the first cell.
print(emr)
print(ec2)

#Price Calculator Development In Progress

# Create SSH Keypair
This will create a temporary keypair for you to access your cluster and save it to your current working directory. 

This is automatic so please run this cell as is.

In [None]:
# Create SSH Keypair File For This EMR Cluster
# Asks EC2 for a new key pair named after the current time and local user, then stores
# the private key as <wk_dir>/<KeyName>.pem, readable only by the owner.

# The user name is appended after formatting so a '%' in it is never read as a strftime directive.
emr_pem_file=time.strftime("EMR-%d%m%Y%H%M%S-")+system_user_name
emr_key=ec2.create_key_pair(KeyName=emr_pem_file)

#Don't do this unless you have a good reason
#print(emr_key['KeyMaterial'])

# Write the key directly from Python: it never appears on a shell command line, and the file
# is created owner-read-only from the start (O_EXCL refuses to overwrite an existing key file).
pem_path=os.path.join(wk_dir, emr_pem_file+".pem")
pem_fd=os.open(pem_path, os.O_WRONLY | os.O_CREAT | os.O_EXCL, 0o400)
with os.fdopen(pem_fd, "w") as pem_handle:
    pem_handle.write(emr_key['KeyMaterial']+"\n")
os.chmod(pem_path,0o400)

print("KeyName         : "+emr_key['KeyName']+"\nKey Fingerprint : "+emr_key['KeyFingerprint'])

# Launch EMR Cluster
This step will launch your Hadoop cluster. From this point on you will be charged money for every hour that this cluster is running. Please proceed with caution.

All arguments for the following cell have been set in the first cell. Please run the following cell as is.

In [None]:
#Wait for Bootstrap and Print Cluster Details
# Polls the cluster every 5 seconds until the master node has a public DNS name, then reads
# the EMR-managed security group ids used by the firewall cells.
# NOTE: this cell needs `cluster_id`, which is set by the "Launch an EMR cluster" cell.

# Fetch a fresh description so the first status line does not depend on a stale `response`.
response = emr.describe_cluster(
    ClusterId=cluster_id
)
print ("\n***Please Wait***\n\n"+response['Cluster']['Status']['State']+".",end="")
while True:
    response = emr.describe_cluster(
        ClusterId=cluster_id
    )
    cluster_state = response['Cluster']['Status']['State']
    # A cluster that is shutting down will never publish a DNS name; stop instead of polling forever.
    if cluster_state in ('TERMINATING', 'TERMINATED', 'TERMINATED_WITH_ERRORS'):
        raise RuntimeError("Cluster "+cluster_id+" entered state "+cluster_state+" before its master DNS name became available")
    # The key is absent until the master instance is provisioned.
    if response['Cluster'].get('MasterPublicDnsName'):
        print('...Cluster DNS Active',end="")
        break
    time.sleep(5)
    print(".", end="")

print("\n\nProceeding with Firewall Rules...")

#Get Cluster Security Group Info
master_security_group = response['Cluster']['Ec2InstanceAttributes']['EmrManagedMasterSecurityGroup']
slave_security_group = response['Cluster']['Ec2InstanceAttributes']['EmrManagedSlaveSecurityGroup']


In [None]:
# Launch an EMR cluster
# Starts the cluster with the parameters from the first cell and the keypair created above,
# then stores its id in `cluster_id` and prints its name and id.

response = emr.run_job_flow(
   Name='EMR Jupyter NB-'+system_user_name,
   LogUri='s3n://logs-'+system_user_name+'/elasticmapreduce/',
   ReleaseLabel='emr-5.28.0',
   Instances={

       'InstanceGroups': [
           {
               # Label each group with the configured count rather than a fixed number.
               'Name':'Master - '+str(master_instances),
               'InstanceRole':'MASTER',
               'InstanceType': instance_size,
               'InstanceCount': master_instances
           },
           {
               'Name':'Core - '+str(slave_instances),
               'InstanceRole':'CORE',
               'InstanceType': instance_size,
               'InstanceCount': slave_instances
           }
       ],
       # Keep the cluster running after launch; it is shut down explicitly in the terminate cell.
       'KeepJobFlowAliveWhenNoSteps': True,
       'TerminationProtected':True,
       'Ec2KeyName': emr_pem_file,
       'Placement': {
           # Derived from `region` so the zone always belongs to the region the clients use
           # (still 'us-west-2c' for the default region).
           # NOTE(review): assumes the chosen region offers a 'c' zone - confirm for other regions.
           'AvailabilityZone': region+'c'
       }
   },


#Insert Steps Here if Applicable

#Insert Bootstrapping Actions Here if Applicable


   AutoScalingRole="EMR_AutoScaling_DefaultRole",
   Applications=[
       {
           'Name': 'Hadoop'
       },
       {
           'Name': 'Hive'
       },
       {
           'Name': 'Spark'
       },
       {
           'Name': 'Pig'
       }
   ],
   Configurations=[
       {
           'Classification': 'spark',
           'Configurations': [],
           'Properties': {
               'maximizeResourceAllocation':'true'
           }
       },
   ],
   VisibleToAllUsers=False,
   EbsRootVolumeSize=10,
   JobFlowRole='EMR_EC2_DefaultRole',
   ServiceRole='EMR_DefaultRole',
   #ScaleDownBehavior='TERMINATE_AT_INSTANCE_HOUR', #For release 5.0.0+

)#End of Cluster Launch Command

#Define Cluster ID
cluster_id = response['JobFlowId']
#Get Cluster Info
response = emr.describe_cluster(
    ClusterId=cluster_id
)
print ("Cluster Name : "+response['Cluster']['Name']+"\nCluster ID   : "+response['Cluster']['Id'])

In [None]:
#Create Firewall Exceptions
# Allow TCP 22 (SSH) into the master security group from 128.206.0.0/16.
sec_rule="SSH"
try:
    data = ec2.authorize_security_group_ingress(
        GroupId=master_security_group,
        IpPermissions=[
            {'IpProtocol': 'tcp',
             'FromPort': 22,
             'ToPort': 22,
             'IpRanges': [{'CidrIp': '128.206.0.0/16'}]},
        ])
except ec2.exceptions.ClientError as err:
    # Only a duplicate-rule rejection means the rule exists; re-raise everything else
    # (bad credentials, unknown group, ...) instead of reporting it as "already added".
    if err.response['Error']['Code'] != 'InvalidPermission.Duplicate':
        raise
    print(sec_rule+" already added")
else:
    print("Ingress "+sec_rule+" added")


In [None]:
# Allow TCP 8088 (YARN) into the master security group from 128.206.0.0/16.
sec_rule="YARN"
try:
    data = ec2.authorize_security_group_ingress(
        GroupId=master_security_group,
        IpPermissions=[
            {'IpProtocol': 'tcp',
             'FromPort': 8088,
             'ToPort': 8088,
             'IpRanges': [{'CidrIp': '128.206.0.0/16'}]},
        ])
except ec2.exceptions.ClientError as err:
    # Only a duplicate-rule rejection means the rule exists; re-raise everything else
    # (bad credentials, unknown group, ...) instead of reporting it as "already added".
    if err.response['Error']['Code'] != 'InvalidPermission.Duplicate':
        raise
    print(sec_rule+" already added")
else:
    print("Ingress "+sec_rule+" added")

In [None]:
# Allow TCP 50070 (HDFS NameNode) into the master security group from 128.206.0.0/16.
sec_rule="HDFS NameNode"
try:
    data = ec2.authorize_security_group_ingress(
        GroupId=master_security_group,
        IpPermissions=[
            {'IpProtocol': 'tcp',
             'FromPort': 50070,
             'ToPort': 50070,
             'IpRanges': [{'CidrIp': '128.206.0.0/16'}]},
        ])
except ec2.exceptions.ClientError as err:
    # Only a duplicate-rule rejection means the rule exists; re-raise everything else
    # (bad credentials, unknown group, ...) instead of reporting it as "already added".
    if err.response['Error']['Code'] != 'InvalidPermission.Duplicate':
        raise
    print(sec_rule+" already added")
else:
    print("Ingress "+sec_rule+" added")

In [None]:
# Allow TCP 18080 (Spark History Server) into the master security group from 128.206.0.0/16.
sec_rule="Spark History Server"
try:
    data = ec2.authorize_security_group_ingress(
        GroupId=master_security_group,
        IpPermissions=[
            {'IpProtocol': 'tcp',
             'FromPort': 18080,
             'ToPort': 18080,
             'IpRanges': [{'CidrIp': '128.206.0.0/16'}]},
        ])
except ec2.exceptions.ClientError as err:
    # Only a duplicate-rule rejection means the rule exists; re-raise everything else
    # (bad credentials, unknown group, ...) instead of reporting it as "already added".
    if err.response['Error']['Code'] != 'InvalidPermission.Duplicate':
        raise
    print(sec_rule+" already added")
else:
    print("Ingress "+sec_rule+" added")

In [None]:
# Allow TCP 8888 (Hue) into the master security group from 128.206.0.0/16.
sec_rule="Hue"
try:
    data = ec2.authorize_security_group_ingress(
        GroupId=master_security_group,
        IpPermissions=[
            {'IpProtocol': 'tcp',
             'FromPort': 8888,
             'ToPort': 8888,
             'IpRanges': [{'CidrIp': '128.206.0.0/16'}]},
        ])
except ec2.exceptions.ClientError as err:
    # Only a duplicate-rule rejection means the rule exists; re-raise everything else
    # (bad credentials, unknown group, ...) instead of reporting it as "already added".
    if err.response['Error']['Code'] != 'InvalidPermission.Duplicate':
        raise
    print(sec_rule+" already added")
else:
    print("Ingress "+sec_rule+" added")

In [None]:
# Allow TCP 16010 (HBase) into the master security group from 128.206.0.0/16.
sec_rule="HBase"
try:
    data = ec2.authorize_security_group_ingress(
        GroupId=master_security_group,
        IpPermissions=[
            {'IpProtocol': 'tcp',
             'FromPort': 16010,
             'ToPort': 16010,
             'IpRanges': [{'CidrIp': '128.206.0.0/16'}]},
        ])
except ec2.exceptions.ClientError as err:
    # Only a duplicate-rule rejection means the rule exists; re-raise everything else
    # (bad credentials, unknown group, ...) instead of reporting it as "already added".
    if err.response['Error']['Code'] != 'InvalidPermission.Duplicate':
        raise
    print(sec_rule+" already added")
else:
    print("Ingress "+sec_rule+" added")

In [None]:
# Allow TCP 9090 (Jupyter Notebook) into the master security group from 128.206.0.0/16.
sec_rule="Jupyter Notebook"
try:
    data = ec2.authorize_security_group_ingress(
        GroupId=master_security_group,
        IpPermissions=[
            {'IpProtocol': 'tcp',
             'FromPort': 9090,
             'ToPort': 9090,
             'IpRanges': [{'CidrIp': '128.206.0.0/16'}]},
        ])
except ec2.exceptions.ClientError as err:
    # Only a duplicate-rule rejection means the rule exists; re-raise everything else
    # (bad credentials, unknown group, ...) instead of reporting it as "already added".
    if err.response['Error']['Code'] != 'InvalidPermission.Duplicate':
        raise
    print(sec_rule+" already added")
else:
    print("Ingress "+sec_rule+" added")


In [None]:
# Allow TCP 22 (SSH) into the slave security group from 128.206.0.0/16.
sec_rule="Slave SSH"
try:
    data = ec2.authorize_security_group_ingress(
        GroupId=slave_security_group,
        IpPermissions=[
            {'IpProtocol': 'tcp',
             'FromPort': 22,
             'ToPort': 22,
             'IpRanges': [{'CidrIp': '128.206.0.0/16'}]},
        ])
except ec2.exceptions.ClientError as err:
    # Only a duplicate-rule rejection means the rule exists; re-raise everything else
    # (bad credentials, unknown group, ...) instead of reporting it as "already added".
    if err.response['Error']['Code'] != 'InvalidPermission.Duplicate':
        raise
    print(sec_rule+" already added")
else:
    print("Ingress "+sec_rule+" added")

In [None]:
# Allow TCP 8042 (YARN NodeManager) into the slave security group from 128.206.0.0/16.
sec_rule="Slave YARN NodeManager"
try:
    data = ec2.authorize_security_group_ingress(
        GroupId=slave_security_group,
        IpPermissions=[
            {'IpProtocol': 'tcp',
             'FromPort': 8042,
             'ToPort': 8042,
             'IpRanges': [{'CidrIp': '128.206.0.0/16'}]},
        ])
except ec2.exceptions.ClientError as err:
    # Only a duplicate-rule rejection means the rule exists; re-raise everything else
    # (bad credentials, unknown group, ...) instead of reporting it as "already added".
    if err.response['Error']['Code'] != 'InvalidPermission.Duplicate':
        raise
    print(sec_rule+" already added")
else:
    print("Ingress "+sec_rule+" added")

In [None]:
# Allow TCP 50075 (HDFS DataNode) into the slave security group from 128.206.0.0/16.
sec_rule="Slave HDFS DataNode"
try:
    data = ec2.authorize_security_group_ingress(
        GroupId=slave_security_group,
        IpPermissions=[
            {'IpProtocol': 'tcp',
             'FromPort': 50075,
             'ToPort': 50075,
             'IpRanges': [{'CidrIp': '128.206.0.0/16'}]},
        ])
except ec2.exceptions.ClientError as err:
    # Only a duplicate-rule rejection means the rule exists; re-raise everything else
    # (bad credentials, unknown group, ...) instead of reporting it as "already added".
    if err.response['Error']['Code'] != 'InvalidPermission.Duplicate':
        raise
    print(sec_rule+" already added")
else:
    print("Ingress "+sec_rule+" added")

In [None]:
# Poll every 5 seconds, printing a dot per poll, until the cluster is no longer in the STARTING state.
print ("\n\nFinishing Startup.\nThis will take a few minutes...\n\n***Please Wait***\n\nStarting.",end="")

while True:
    if str(response['Cluster']['Status']['State']) != 'STARTING':
        break
    time.sleep(5)
    print(".", end="")
    # Refresh the description so the next check sees the current state.
    response = emr.describe_cluster(ClusterId=cluster_id)
print('...Done',end="")


In [None]:
# Poll every 5 seconds, printing a dot per poll, until the cluster is no longer in the BOOTSTRAPPING state.
print ("\n\nRunning Bootstrap Actions.\nThis will take a few minutes...\n\n***Please Wait***\n\nBootstrapping.",end="")

while True:
    if str(response['Cluster']['Status']['State']) != 'BOOTSTRAPPING':
        break
    time.sleep(5)
    print(".", end="")
    # Refresh the description so the next check sees the current state.
    response = emr.describe_cluster(ClusterId=cluster_id)
print('...Done',end="")
# Report whichever state the cluster ended up in.
print('\n\nCluster Status: '+response['Cluster']['Status']['State'])


In [None]:
# Prints the built-in help text for paramiko's SSHConfig class; nothing in the later cells uses the output.
import paramiko
help(paramiko.config.SSHConfig)

In [None]:
#Refresh Cluster Description
response = emr.describe_cluster(ClusterId=cluster_id)

# Public DNS name of the master node; this is the host the Fabric connection targets.
host_string = response['Cluster']['MasterPublicDnsName']
print(host_string)

#Bootstrap Cluster with Fabric
from fabric import tasks
from fabric import Connection

# The Connection below carries the host, user and key file that the earlier
# (now removed) `env.host_string` / `env.user` / `env.key_filename` settings described.
c = Connection(
    host=host_string,
    user="hadoop",
    connect_kwargs=dict(key_filename="/".join((wk_dir, emr_pem_file+".pem"))),
)


In [None]:
def install_jupyter(fab_conn):
    """Install Jupyter and Toree on the remote host and start a notebook server in tmux.

    Args:
        fab_conn: Object whose ``run(command)`` executes a shell command on the
            remote host (the ``fabric.Connection`` built earlier in this notebook).

    Every ``run()`` call is a separate remote command, so environment variables
    exported in one call are not visible to the next. The Spark environment is
    therefore exported inside the same shell command that launches the notebook
    server rather than in a ``run()`` of its own.
    """
    spark_env = ('export SPARK_HOME=/usr/lib/spark;'
                 'export PYTHONPATH=$PYTHONPATH:$SPARK_HOME/python:$SPARK_HOME/python/lib')
    fab_conn.run('sudo -u root pip install jupyter')
    fab_conn.run('sudo -u root pip install toree')
    fab_conn.run('sudo -u root /usr/local/bin/jupyter toree install --replace --spark_home=/usr/lib/spark --spark_opts="--master=local[*]" --interpreters=Scala,PySpark,SparkR,SQL')
    fab_conn.run('mkdir -p /home/hadoop/.jupyter/')
    fab_conn.run('curl -o /home/hadoop/.jupyter/jupyter_notebook_config.py https://s3-us-west-2.amazonaws.com/dsa-mizzou/scripts/jupyter_notebook_config.py')
    fab_conn.run('sudo -u root yum -y install tmux')
    # Single quotes keep $PYTHONPATH / $SPARK_HOME unexpanded until the shell started by tmux
    # has run the exports.
    fab_conn.run("tmux new-session -d '"+spark_env+";jupyter notebook --no-browser --config /home/hadoop/.jupyter/jupyter_notebook_config.py'")

def load_dataset(fab_conn, location=None, file_name=None):
    """Download a dataset on the remote host and store it in HDFS under ``Datasets/``.

    Args:
        fab_conn: Object whose ``run(command)`` executes a shell command on the
            remote host (the ``fabric.Connection`` built earlier in this notebook).
        location: URL to download. Defaults to the notebook-level ``dataset_location``.
        file_name: Name of the file created inside ``Datasets/``. Defaults to the
            notebook-level ``dataset_file_name``.

    The commands are safe to repeat: ``-mkdir -p`` accepts an existing directory
    and ``-put -f`` replaces an existing file instead of appending a second copy.
    """
    import shlex

    source_url = dataset_location if location is None else location
    target_name = dataset_file_name if file_name is None else file_name
    fab_conn.run('/usr/bin/hadoop fs -mkdir -p Datasets')
    # Quote both values so characters such as '&' or spaces are not interpreted by the remote shell.
    fab_conn.run('curl '+shlex.quote(source_url)+' | hadoop fs -put -f - '+shlex.quote('Datasets/'+target_name))
        

In [None]:

# Run the Jupyter installation commands over the Fabric connection `c` created above.
print('\nInstalling Jupyter...')        
install_jupyter(c)
print('\nDone\n')

In [None]:
# Copy the dataset configured in the first cell into HDFS over the Fabric connection `c`.
print('Loading Dataset...')
load_dataset(c)
print('\nDone\n')


In [None]:

#Upload Notebook Directory
# Copies the contents of <wk_dir>/<load_notebook_location> into /home/hadoop/ on the master node.
import subprocess

# An argument list (no shell) keeps paths that contain spaces intact, and check=True raises
# if scp fails so the success message below is only printed after a completed copy.
subprocess.run(
    [
        "scp", "-o", "StrictHostKeyChecking=no", "-r",
        "-i", wk_dir+"/"+emr_pem_file+".pem",
        wk_dir+"/"+load_notebook_location+"/.",
        "hadoop@"+response['Cluster']['MasterPublicDnsName']+":/home/hadoop/",
    ],
    check=True,
)

print('Please Proceed to the Next Step')

# Access your EMR Cluster's Interfaces

#### For Web Interfaces Run the Following Cell


We are interested in running the Jupyter notebooks on the cluster, so click the first link to open Jupyter on the EMR cluster. There you can find the notebooks you uploaded.

In [None]:
#Web Addresses to EMR
# One (label, URL suffix) pair per web interface on the master node; each entry is printed
# as the label, the URL, and a blank line.
print("\n".join(
    ui_label+"\nhttp://"+response['Cluster']['MasterPublicDnsName']+ui_suffix+"\n"
    for ui_label, ui_suffix in (
        ("Jupyter Notebooks", ":9090/"),
        ("YARN ResourceManager", ":8088/"),
        ("Hadoop HDFS NameNode", ":50070/"),
        ("Spark HistoryServer", ":18080/"),
        ("Hue", ":8888/"),
        ("Ganglia", "/ganglia/"),
        ("HBase UI", ":16010/"),
    )
))

We are not doing anything in the terminal, so you don't have to worry about connecting to the master over SSH. You can ignore the cell below.


#### For SSH, Run the Following Cell and See Instructions Below
 1. Run the Cell below
 1. Highlight the ssh line and press Ctrl+C to copy it to your local clipboard
 1. Click the link below to open a terminal
 1. Paste the SSH link in (Ctrl + V) 

In [None]:
#SSH to EMR
# First line: ready-to-paste ssh command using the cluster keypair file.
print("".join((
    "ssh -i ", wk_dir, "/", emr_pem_file, ".pem",
    " hadoop@", response['Cluster']['MasterPublicDnsName'],
)))
# Second line: address of the browser terminal for the current user.
print("".join(("https://europa.dsa.missouri.edu/user/", system_user_name, "/terminals/1")))


### Your pasted command and output should look similar to:

```
$ ssh -i /dsa/home/scottgs/jupyter/CloudComputingDataAnalytics/module5/labs/EMR-12022019234354-scottgs.pem hadoop@ec2-34-216-17-40.us-west-2.compute.amazonaws.com
Last login: Wed Feb 13 07:39:35 2019

       __|  __|_  )
       _|  (     /   Amazon Linux AMI
      ___|\___|___|

https://aws.amazon.com/amazon-linux-ami/2017.03-release-notes/
13 package(s) needed for security, out of 243 available
Run "sudo yum update" to apply all updates.
Amazon Linux version 2018.03 is available.

EEEEEEEEEEEEEEEEEEEE MMMMMMMM           MMMMMMMM RRRRRRRRRRRRRRR
E::::::::::::::::::E M:::::::M         M:::::::M R::::::::::::::R
EE:::::EEEEEEEEE:::E M::::::::M       M::::::::M R:::::RRRRRR:::::R
  E::::E       EEEEE M:::::::::M     M:::::::::M RR::::R      R::::R
  E::::E             M::::::M:::M   M:::M::::::M   R:::R      R::::R
  E:::::EEEEEEEEEE   M:::::M M:::M M:::M M:::::M   R:::RRRRRR:::::R
  E::::::::::::::E   M:::::M  M:::M:::M  M:::::M   R:::::::::::RR
  E:::::EEEEEEEEEE   M:::::M   M:::::M   M:::::M   R:::RRRRRR::::R
  E::::E             M:::::M    M:::M    M:::::M   R:::R      R::::R
  E::::E       EEEEE M:::::M     MMM     M:::::M   R:::R      R::::R
EE:::::EEEEEEEE::::E M:::::M             M:::::M   R:::R      R::::R
E::::::::::::::::::E M:::::M             M:::::M RR::::R      R::::R
EEEEEEEEEEEEEEEEEEEE MMMMMMM             MMMMMMM RRRRRRR      RRRRRR

[hadoop@ip-172-31-11-173 ~]$
```

---

# Download Your Results

Run the cell below to copy back the notebooks you ran on the EMR cluster. A new directory called 'results' is created in your current directory, containing everything in the hadoop user's home directory on the cluster, including your notebooks.

In [None]:
#Download all contents of hadoop user to local working directory
# Copies /home/hadoop/ from the master node into <wk_dir>/results/.
import subprocess

# exist_ok lets the cell be re-run without the directory-exists error a plain mkdir gives.
os.makedirs(wk_dir+"/results", exist_ok=True)
# An argument list (no shell) keeps paths that contain spaces intact; check=True raises if scp fails.
subprocess.run(
    [
        "scp", "-o", "StrictHostKeyChecking=no", "-r",
        "-i", wk_dir+"/"+emr_pem_file+".pem",
        "hadoop@"+response['Cluster']['MasterPublicDnsName']+":/home/hadoop/.",
        wk_dir+"/results/",
    ],
    check=True,
)

# Terminate Your Cluster
Once your work is complete please run the following cells to terminate your cluster and delete your cluster's keypair.

In [None]:
# Remove Termination Protection
# The cluster was launched with TerminationProtected=True, so protection is switched off first.
emr.set_termination_protection(JobFlowIds=[cluster_id], TerminationProtected=False)

# Terminate Cluster
response = emr.terminate_job_flows(JobFlowIds=[cluster_id])

# Echo the metadata AWS returned for the terminate request.
print('\nAWS Metadata: ')
print(f"http Status Code : {response['ResponseMetadata']['HTTPStatusCode']}")
print('Request ID       : '+response['ResponseMetadata']['RequestId'])
print(f"Retries          : {response['ResponseMetadata']['RetryAttempts']}")

# Delete SSH Keypair

In [None]:
# Delete SSH Keypair
# Removes the local .pem file, then deletes the key pair from EC2 and prints the response metadata.

try:
    # Same location the key file was given its permissions at when it was created.
    os.remove(os.path.join(wk_dir, emr_pem_file+'.pem'))
except FileNotFoundError:
    # Only a missing file is reported as "not found"; other errors (e.g. permissions) propagate.
    print('Local Key Not Found')
else:
    print('Local Key Deleted')

response = ec2.delete_key_pair(KeyName=emr_pem_file)
print('\nAWS Metadata: ')
print('http Status Code : '+str(response['ResponseMetadata']['HTTPStatusCode']))
print('Request ID       : '+response['ResponseMetadata']['RequestId'])
print('Retries          : '+str(response['ResponseMetadata']['RetryAttempts']))

# Save your notebook, then `File > Close and Halt`

# Cells Beyond this point are for Troubleshooting and Development Only
 #### Do not publish these cells

In [None]:
# List SSH Keypairs
# Prints the full describe_key_pairs response as indented JSON with sorted keys.
response = ec2.describe_key_pairs()
print(json.dumps(response, indent=4, sort_keys=True))

In [None]:
#Describe Cluster
# Fetches the current description of the cluster identified by `cluster_id` and prints the raw response.
response = emr.describe_cluster(
    ClusterId=cluster_id  
)
print(response)

In [None]:
# Manually Delete SSH Keypair
# Prompts for a key name, deletes that key pair from EC2, and prints the response as JSON.
# Nothing is removed from the local disk here.
user_input = input("Key Name: ")
response = ec2.delete_key_pair(KeyName=user_input)
print(json.dumps(response, indent=4, sort_keys=True))

In [None]:
# Get VPC ID
# Looks up the master security group and prints the VpcId of the first group returned.
security_response = ec2.describe_security_groups(GroupIds=[master_security_group])
print(security_response['SecurityGroups'][0]['VpcId'])

In [None]:
#AWS Steps Example  
# Keyword-argument fragment, presumably meant to be pasted at the "Insert Steps Here"
# marker inside the emr.run_job_flow(...) call. Run on its own, the trailing comma would
# bind `Steps` to a one-element tuple rather than a list.
Steps=[
  {'Name': 'My word count example',
   'HadoopJarStep': {
       'Jar': 'command-runner.jar',
       'Args': [
           'hadoop-streaming',
           '-files', 's3://dsabucket1/tweetSplitter.py',
           '-mapper', 'python3.4 tweetSplitter.py',
           '-input', 's3://dsabucket1/tweets_wc/input/',
           '-output', 's3://dsabucket1/tweets_wc/output/results',
           '-reducer', 'aggregate']}
   }
],
#Do not run this cell (Standalone) 

In [None]:
# AWS Jupyter Install Bootstrap Action 
# Fragment only: these are the key/value pairs of a single bootstrap-action entry without
# the enclosing braces, so it is not valid Python on its own.
# NOTE(review): presumably intended as one element of a BootstrapActions list - confirm before use.
"Name": "Install Jupyter notebook",
        "ScriptBootstrapAction": { 
        "Args": ["r",
                 "julia",
                 "toree",
                 "torch",
                 "ruby",
                 "Scala",
                 "PySpark",
                 "SparkR",
                 "SQL",
                 "ds-packages",
                 "ml-packages",
                 "python-packages={ggplot,nilearn}",
                 "port=8880",
                 "password=jupyter",
                 "jupyterhub",
                 "jupyterhub-port=8001",
                 "cached-install",
                 "notebook-dir=s3://aws-logs-714861692883-us-east-1/notebooks/copy-samples",
                 "copy-samples",
                 "ssh"
                ],
       "Path": "s3://aws-bigdata-blog/artifacts/aws-blog-emr-jupyter/install-jupyter-emr5.sh"
}
#Do not run this cell (Standalone)    

In [None]:
#Bootstrapping Actions Run on All Nodes
  BootstrapActions= [ 
     { 
        "Name": "Install Jupyter Notebook",
               "ScriptBootstrapAction": { 
               "Path": "s3://dsa-mizzou/scripts/inst-run_jupyter.sh"
        },
        "Name": "Load Reviews Dataset into HDFS",
               "ScriptBootstrapAction": { 
               "Path": "s3://dsa-mizzou/scripts/load_dataset.sh"
        },
        "Name": "Load Lession Data",
               "ScriptBootstrapAction": { 
               "Path": "s3://dsa-mizzou/courses/####/####NB_data.sh"
       }
     }
  ],
#Do not run this cell (Standalone)    

In [None]:
#Step Actions Run Only on Master Node  
# Keyword-argument fragment (note the leading indentation and trailing comma); it is not
# runnable on its own.
# NOTE(review): 'region' inside the Jar path looks like a placeholder for an actual AWS
# region name - confirm before use.
  Steps=[
      {'Name': 'Run Installer for Something',
       'ActionOnFailure': 'CONTINUE',
       'HadoopJarStep': {
           'Jar': 's3://region.elasticmapreduce/libs/script-runner/script-runner.jar',
           'Args': [
               's3://dsa-mizzou/scripts/my-script.sh']}
       }
  ],
#Do not run this cell (Standalone)    