# Run EMR Job via Boto
Objective: Develop code to execute EMR job using Boto alone.  
https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/emr.html

In [3]:
import boto3

In [4]:
client = boto3.session.Session(profile_name='service_wp').client('emr')

In [4]:
client.list_clusters()

{'Clusters': [{'Id': 'j-LGO98Y54966M',
   'Name': 'emr-cluster-kp-0425-boto',
   'Status': {'State': 'TERMINATED',
    'StateChangeReason': {'Code': 'USER_REQUEST',
     'Message': 'Terminated by user request'},
    'Timeline': {'CreationDateTime': datetime.datetime(2023, 4, 25, 7, 25, 10, 172000, tzinfo=tzlocal()),
     'ReadyDateTime': datetime.datetime(2023, 4, 25, 7, 30, 19, 65000, tzinfo=tzlocal()),
     'EndDateTime': datetime.datetime(2023, 4, 25, 7, 35, 29, 295000, tzinfo=tzlocal())}},
   'NormalizedInstanceHours': 8,
   'ClusterArn': 'arn:aws:elasticmapreduce:us-east-1:655268872845:cluster/j-LGO98Y54966M'},
  {'Id': 'j-39PQN52H558BE',
   'Name': 'emr-cluster-kp-0425-boto',
   'Status': {'State': 'TERMINATED',
    'StateChangeReason': {'Code': 'USER_REQUEST',
     'Message': 'Terminated by user request'},
    'Timeline': {'CreationDateTime': datetime.datetime(2023, 4, 25, 7, 15, 46, 924000, tzinfo=tzlocal()),
     'ReadyDateTime': datetime.datetime(2023, 4, 25, 7, 19, 41, 16000

In [7]:
client.describe_cluster(ClusterId='j-3NO2D6LP8TVCS')

{'Cluster': {'Id': 'j-3NO2D6LP8TVCS',
  'Name': 'emr-kp',
  'Status': {'State': 'TERMINATED',
   'StateChangeReason': {'Code': 'USER_REQUEST',
    'Message': 'Terminated by user request'},
   'Timeline': {'CreationDateTime': datetime.datetime(2023, 4, 19, 6, 42, 28, 698000, tzinfo=tzlocal()),
    'ReadyDateTime': datetime.datetime(2023, 4, 19, 6, 46, 57, 88000, tzinfo=tzlocal()),
    'EndDateTime': datetime.datetime(2023, 4, 19, 6, 52, 36, 573000, tzinfo=tzlocal())}},
  'Ec2InstanceAttributes': {'Ec2SubnetId': 'subnet-0179f300743d7583c',
   'RequestedEc2SubnetIds': ['subnet-0179f300743d7583c'],
   'Ec2AvailabilityZone': 'us-east-1d',
   'RequestedEc2AvailabilityZones': [],
   'IamInstanceProfile': 'emr-ec2-role',
   'EmrManagedMasterSecurityGroup': 'sg-0b3081a1938c163a5',
   'EmrManagedSlaveSecurityGroup': 'sg-0abb5aa1d5fad5681',
   'AdditionalMasterSecurityGroups': [],
   'AdditionalSlaveSecurityGroups': []},
  'InstanceCollectionType': 'INSTANCE_GROUP',
  'ReleaseLabel': 'emr-6.10.0'

In [7]:
response = client.run_job_flow(
    Name='emr-cluster-kp-0425-boto-2',
    ReleaseLabel='emr-6.10.0',
    Instances={
        'InstanceGroups': [
            {
                'Name': 'inst1',
                'Market': 'ON_DEMAND',
                'InstanceRole': 'MASTER',
                'InstanceType': 'm5.xlarge',
                'InstanceCount': 1,
            },
        ],
        'KeepJobFlowAliveWhenNoSteps': False,
        'TerminationProtected': False,
        'Ec2SubnetId': 'subnet-0179f300743d7583c',
        'EmrManagedMasterSecurityGroup': 'sg-0b3081a1938c163a5',
        'EmrManagedSlaveSecurityGroup': 'sg-0b3081a1938c163a5',
        'AdditionalMasterSecurityGroups': [],
        'AdditionalSlaveSecurityGroups': []
    },
    Steps=[
    {
        "Name": "run1",
        "ActionOnFailure": "CANCEL_AND_WAIT",
        "HadoopJarStep": {
            "Jar": "command-runner.jar",
            "Args": ["spark-submit", 
                     "--deploy-mode", 
                     "cluster",
                     "s3://weather-data-kpde/code/02_process_s3_parquet.py"],
        },
    }
    ],
    Applications=[{'Name': 'Spark'}],
    VisibleToAllUsers=True,
    JobFlowRole='emr-ec2-role',
    ServiceRole='emr-role',
    Tags=[
        {
            'Key': 'for-use-with-amazon-emr-managed-policies',
            'Value': 'true'
        },
    ],
    AutoTerminationPolicy={
        'IdleTimeout': 1*60*60
    }
)

In [8]:
response

{'JobFlowId': 'j-3VNKAAJ3WMCNF',
 'ClusterArn': 'arn:aws:elasticmapreduce:us-east-1:655268872845:cluster/j-3VNKAAJ3WMCNF',
 'ResponseMetadata': {'RequestId': '02ce37e1-6515-49ab-91c2-d28e9a0b9399',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '02ce37e1-6515-49ab-91c2-d28e9a0b9399',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '118',
   'date': 'Mon, 01 May 2023 23:26:36 GMT'},
  'RetryAttempts': 0}}

In [9]:
cluster_id = response['JobFlowId']
cluster_id

'j-3VNKAAJ3WMCNF'

## Get Step ID

In [11]:
response = client.list_steps(
    ClusterId=cluster_id,
)

In [12]:
response

{'Steps': [{'Id': 's-20UO3AW9GU9LD',
   'Name': 'run1',
   'Config': {'Jar': 'command-runner.jar',
    'Properties': {},
    'Args': ['spark-submit',
     '--deploy-mode',
     'cluster',
     's3://weather-data-kpde/code/02_process_s3_parquet.py']},
   'ActionOnFailure': 'CANCEL_AND_WAIT',
   'Status': {'State': 'PENDING',
    'StateChangeReason': {},
    'Timeline': {'CreationDateTime': datetime.datetime(2023, 5, 1, 19, 26, 35, 988000, tzinfo=tzlocal())}}}],
 'ResponseMetadata': {'RequestId': 'ade74ede-e806-4a0e-ae31-1bda9db7039c',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'ade74ede-e806-4a0e-ae31-1bda9db7039c',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '346',
   'date': 'Mon, 01 May 2023 23:28:28 GMT'},
  'RetryAttempts': 0}}

In [15]:
step_id = response['Steps'][0]['Id']
step_id

's-20UO3AW9GU9LD'

## Wait

In [16]:
# is the cluster running yet?
waiter = client.get_waiter('cluster_running')
waiter.wait(
    ClusterId=cluster_id
)
print("Cluster is supposedly running")

In [18]:
response_st = client.describe_cluster(
    ClusterId=cluster_id
)

In [19]:
response_st

{'Cluster': {'Id': 'j-3VNKAAJ3WMCNF',
  'Name': 'emr-cluster-kp-0425-boto-2',
  'Status': {'State': 'RUNNING',
   'StateChangeReason': {'Message': 'Running step'},
   'Timeline': {'CreationDateTime': datetime.datetime(2023, 5, 1, 19, 26, 35, 897000, tzinfo=tzlocal()),
    'ReadyDateTime': datetime.datetime(2023, 5, 1, 19, 30, 58, 57000, tzinfo=tzlocal())}},
  'Ec2InstanceAttributes': {'Ec2SubnetId': 'subnet-0179f300743d7583c',
   'RequestedEc2SubnetIds': ['subnet-0179f300743d7583c'],
   'Ec2AvailabilityZone': 'us-east-1d',
   'RequestedEc2AvailabilityZones': [],
   'IamInstanceProfile': 'emr-ec2-role',
   'EmrManagedMasterSecurityGroup': 'sg-0b3081a1938c163a5',
   'EmrManagedSlaveSecurityGroup': 'sg-0b3081a1938c163a5',
   'AdditionalMasterSecurityGroups': [],
   'AdditionalSlaveSecurityGroups': []},
  'InstanceCollectionType': 'INSTANCE_GROUP',
  'ReleaseLabel': 'emr-6.10.0',
  'AutoTerminate': False,
  'TerminationProtected': False,
  'VisibleToAllUsers': True,
  'Applications': [{'Na

In [21]:
response_step = client.describe_step(
    ClusterId=cluster_id,
    StepId=step_id
)
response_step

{'Step': {'Id': 's-20UO3AW9GU9LD',
  'Name': 'run1',
  'Config': {'Jar': 'command-runner.jar',
   'Properties': {},
   'Args': ['spark-submit',
    '--deploy-mode',
    'cluster',
    's3://weather-data-kpde/code/02_process_s3_parquet.py']},
  'ActionOnFailure': 'CANCEL_AND_WAIT',
  'Status': {'State': 'FAILED',
   'StateChangeReason': {},
   'FailureDetails': {'Reason': 'S3 Service Error.',
    'Message': 'Caused by: com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.services.s3.model.AmazonS3Exception: The specified bucket does not exist (Service: Amazon S3; Status Code: 404; Error Code: NoSuchBucket; Request ID: ADGGG858759C6JW9; S3 Extended Request ID: R1Y4109jLtetNv2MDm4wjioWF0mkbq57ATR7wmFe1+50zw4DoivYQB9TmQawGJOXNL6w0wqBt1k=; Proxy: null), S3 Extended Request ID: R1Y4109jLtetNv2MDm4wjioWF0mkbq57ATR7wmFe1+50zw4DoivYQB9TmQawGJOXNL6w0wqBt1k=',
    'LogFile': 'Step log files on S3 are only available for clusters which have logging enabled.'},
   'Timeline': {'CreationDateTime': dateti

In [20]:
# is the step complete yet?
waiter = client.get_waiter('step_complete')
waiter.wait(
    ClusterId=cluster_id,
    StepId=step_id,
)
print("Step is complete")

WaiterError: Waiter StepComplete failed: Waiter encountered a terminal failure state: For expression "Step.Status.State" we matched expected path: "FAILED"

In [22]:
# is the cluster terminated yet?
waiter = client.get_waiter('cluster_terminated')
waiter.wait(
    ClusterId=cluster_id
)