In [17]:
import boto3
from time import sleep
import json

# Pin the region explicitly so that cluster creation does not depend on the
# ambient AWS configuration of whichever machine runs this notebook.
AWS_REGION = 'us-east-1'
client = boto3.client('emr', region_name=AWS_REGION)

In [46]:
# Launch the EMR cluster.

# Security group used for both the master and the core nodes. It must be the
# right group for this account and also be shared with the SageMaker notebook.
emr_security_group = 'sg-0483e5b63626a0889'

# One spot m5.xlarge master and one spot m5.xlarge core node.
instance_groups = [
    {
        'Name': group_name,
        'Market': 'SPOT',
        'InstanceRole': group_role,
        'InstanceType': 'm5.xlarge',
        'InstanceCount': 1,
    }
    for group_name, group_role in (('test_master', 'MASTER'), ('test_node', 'CORE'))
]

# Applications installed on the cluster.
applications = [
    {'Name': app_name}
    for app_name in ('Spark', 'Hadoop', 'Hue', 'Livy', 'Zeppelin', 'Ganglia')
]

# Per-classification configuration overrides.
configurations = [
    # Use the AWS Glue Data Catalog client factory for Spark's Hive metastore.
    {
        'Classification': 'spark-hive-site',
        'Properties': {
            'hive.metastore.client.factory.class':
                'com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory',
        },
    },
    # Let the livy proxy user act for any group, from any host.
    {
        'Classification': 'core-site',
        'Properties': {
            'hadoop.proxyuser.livy.groups': '*',
            'hadoop.proxyuser.livy.hosts': '*',
        },
    },
    # Enable Livy impersonation and set the Livy session timeout to 5h.
    {
        'Classification': 'livy-conf',
        'Properties': {
            'livy.impersonation.enabled': 'true',
            'livy.server.session.timeout': '5h',
        },
    },
]

cluster_infos = client.run_job_flow(
    Name='cluster_trigger_sm',
    ReleaseLabel='emr-5.31.0',  # EMR version to deploy
    Instances={
        'EmrManagedMasterSecurityGroup': emr_security_group,
        'EmrManagedSlaveSecurityGroup': emr_security_group,
        # Keep the cluster up even when it has no steps to run.
        'KeepJobFlowAliveWhenNoSteps': True,
        'InstanceGroups': instance_groups,
    },
    Applications=applications,
    JobFlowRole='EMR_EC2_DefaultRole',  # default role
    ServiceRole='EMR_DefaultRole',
    Configurations=configurations,
)

In [51]:
# Wait for the master node to come up, then collect its details.
#
# The wait is bounded: if the master never reaches RUNNING (for example the
# spot request is never fulfilled) or the master instance is terminated, the
# cell raises instead of hanging the notebook forever.
POLL_INTERVAL_SECONDS = 15
MAX_WAIT_SECONDS = 30 * 60  # generous upper bound; tune to your cluster's usual start-up time

for _ in range(MAX_WAIT_SECONDS // POLL_INTERVAL_SECONDS):
    # Reuse the client that launched the cluster rather than building a new
    # one on every iteration, so the lookup always targets the region the
    # cluster was created in.
    instances_cluster = client.list_instances(
        ClusterId=cluster_infos['JobFlowId'],
        InstanceGroupTypes=['MASTER'],
    )
    master_states = [
        instance['Status']['State'] for instance in instances_cluster['Instances']
    ]
    # Ready only when there is exactly one master instance and it is RUNNING.
    if master_states == ['RUNNING']:
        print(instances_cluster)
        break
    # Every known master instance is gone: the cluster cannot come up any more.
    if master_states and all(state == 'TERMINATED' for state in master_states):
        raise RuntimeError(
            "EMR master instance of cluster %s was terminated before reaching RUNNING"
            % cluster_infos['JobFlowId']
        )
    sleep(POLL_INTERVAL_SECONDS)
else:
    raise TimeoutError(
        "EMR master instance of cluster %s was not RUNNING after %d seconds"
        % (cluster_infos['JobFlowId'], MAX_WAIT_SECONDS)
    )

# Store the private IP of the master node in a JSON file
# (presumably the file read by connect_emr_cluster.sh; verify its path).
master_instance = instances_cluster['Instances'][0]
master_infos = {'ip': master_instance['PrivateIpAddress']}
with open('emr_config.json', 'w') as outfile:
    json.dump(master_infos, outfile)

{'Instances': [{'Id': 'ci-4POEA6JB8VFG', 'Ec2InstanceId': 'i-0c9d6ee467a2150d1', 'PublicDnsName': 'ec2-3-238-23-82.compute-1.amazonaws.com', 'PublicIpAddress': '3.238.23.82', 'PrivateDnsName': 'ip-172-31-95-169.ec2.internal', 'PrivateIpAddress': '172.31.95.169', 'Status': {'State': 'RUNNING', 'StateChangeReason': {}, 'Timeline': {'CreationDateTime': datetime.datetime(2021, 3, 29, 14, 23, 57, 358000, tzinfo=tzlocal()), 'ReadyDateTime': datetime.datetime(2021, 3, 29, 14, 30, 32, 477000, tzinfo=tzlocal())}}, 'InstanceGroupId': 'ig-2XXTB13G0WH1F', 'Market': 'SPOT', 'InstanceType': 'm5.xlarge', 'EbsVolumes': [{'Device': '/dev/sdc', 'VolumeId': 'vol-049ef5242fa57968b'}, {'Device': '/dev/sdb', 'VolumeId': 'vol-0f9eb12dfc93994ef'}]}], 'ResponseMetadata': {'RequestId': 'e7e1bb8e-fe04-4123-bbe6-3c647adb976f', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'e7e1bb8e-fe04-4123-bbe6-3c647adb976f', 'content-type': 'application/x-amz-json-1.1', 'content-length': '595', 'date': 'Mon, 29 Ma

In [56]:
# Run the bash script that connects the SageMaker notebook to the EMR cluster
# (see https://docs.aws.amazon.com/sagemaker/latest/dg/nbi-lifecycle-config-emr.html)
!/bin/bash /home/ec2-user/SageMaker/connect_emr_cluster.sh

/home/ec2-user/SageMaker/emr_config.json
"172.31.95.169"
172.31.95.169
Fetching Sparkmagic example config from GitHub...
--2021-03-29 14:40:51--  https://raw.githubusercontent.com/jupyter-incubator/sparkmagic/master/sparkmagic/example_config.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2081 (2.0K) [text/plain]
Saving to: ‘example_config.json’


2021-03-29 14:40:51 (47.4 MB/s) - ‘example_config.json’ saved [2081/2081]

Replacing EMR master node IP in Sparkmagic config...
Sending a sample request to Livy..
{"from":0,"total":0,"sessions":[]}