In [20]:
import boto3
import json
from helper_functions import create_session

In [25]:
# Build the session with the project helper (presumably a boto3 Session - confirm in
# helper_functions) and the IAM client. Later cells use `iam` to create the Glue
# crawler role and attach its policies, and `session` to build the Glue and S3 clients.
session = create_session()
iam = session.client('iam')

In [26]:
# Create the role that Glue will assume. The trust policy below only allows the Glue
# service to assume the role; Glue and S3 permissions are attached in the subsequent step.
assume_role_policy_document = json.dumps({
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Principal": {"Service": "glue.amazonaws.com"},
            "Action": "sts:AssumeRole",
        }
    ],
})

try:
    res = iam.create_role(
        RoleName='covid-project-glue-s3',
        AssumeRolePolicyDocument=assume_role_policy_document,
    )
    print(f'Status Code: {res["ResponseMetadata"]["HTTPStatusCode"]}')
except iam.exceptions.EntityAlreadyExistsException:
    # Re-running the notebook is expected, so an existing role is not an error.
    # Any other failure (credentials, permissions, malformed policy) is left to
    # raise, because every later cell depends on this role existing.
    print('Role covid-project-glue-s3 already exists; reusing it.')

In [29]:
# Attach the AWS-managed policies to the crawler role.
# NOTE(review): AmazonS3FullAccess grants access to every bucket in the account;
# consider a policy scoped to the project bucket if least privilege matters here.
policies = [
    'arn:aws:iam::aws:policy/AmazonS3FullAccess',  #S3 Full Access
    'arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole',  #Glue Service Role
    'arn:aws:iam::aws:policy/AWSGlueConsoleFullAccess'  #Glue Console Full Access
]

for policy in policies:
    # Handle each attachment on its own so one rejected policy does not silently
    # skip the remaining ones (the original wrapped the whole loop in one try).
    try:
        res = iam.attach_role_policy(
            RoleName='covid-project-glue-s3',
            PolicyArn=policy
        )
        print(f'Policy: {policy}\nStatus Code: {res["ResponseMetadata"]["HTTPStatusCode"]}')
    except iam.exceptions.ClientError as e:
        # Service-side rejection (missing role, access denied, quota): report which
        # policy failed and keep going with the rest.
        print(f'Policy: {policy}\nFailed to attach: {e}')

In [31]:
# Instantiate the Glue client from the shared session; the cells below use it to
# create the catalog database and the crawlers.
glue = session.client('glue')

In [32]:
# Create glue database for crawlers to save glue catalog tables / schemas to
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/glue/client/create_database.html
# From documentation, default permissions typically not used in normal course of Glue operations.
try:
    res = glue.create_database(
        DatabaseInput={
            'Name': 'aws-covid-project',
            'Description': 'Database for housing crawled schema of datasets from AWS Covid-19 Data Lake'
        }
    )
    print(f'Status Code: {res["ResponseMetadata"]["HTTPStatusCode"]}')
except glue.exceptions.AlreadyExistsException:
    # Tolerate re-runs only. Other failures are left to raise so the crawler cell
    # does not run against a database that was never created.
    print('Database aws-covid-project already exists; reusing it.')

In [56]:
# Create one Glue crawler per object key in the project bucket. Each crawler runs
# under the covid-project-glue-s3 role and writes its tables to the Glue database.

# Get Role ARN
# Look the role up by name: list_roles() is paginated, so scanning only its first
# page can miss the role in accounts with many roles and leave the ARN unset.
role_arn = iam.get_role(RoleName='covid-project-glue-s3')['Role']['Arn']

# Store necessary parameters in variables for each target object in S3
s3 = session.client('s3')
bucket_name = 'kc-covid-project'
database_name = 'aws-covid-project'

# Paginate so every key is visited, not just the first page of results.
paginator = s3.get_paginator('list_objects_v2')
for page in paginator.paginate(Bucket=bucket_name):
    # An empty bucket returns a page without a 'Contents' entry.
    for obj in page.get('Contents', []):
        obj_key = obj['Key']
        s3_path = f's3://{bucket_name}/{obj_key}'

        # Create crawler for each
        try:
            res = glue.create_crawler(
                Name=obj_key,
                Role=role_arn,
                DatabaseName=database_name,
                Description=f'Crawler for {obj_key} dataset from AWS Covid-19 Data Lake',
                Targets={
                    'S3Targets': [
                        {'Path': s3_path}
                    ]
                },
                SchemaChangePolicy={
                    'UpdateBehavior': 'LOG',
                    'DeleteBehavior': 'LOG'
                },
                RecrawlPolicy={
                    'RecrawlBehavior': 'CRAWL_NEW_FOLDERS_ONLY'
                },
                LineageConfiguration={
                    'CrawlerLineageSettings': 'DISABLE'
                }
            )
        except glue.exceptions.AlreadyExistsException:
            # On a re-run, skip crawlers that already exist instead of aborting
            # the loop and leaving the remaining keys without a crawler.
            print(f'{obj_key}: crawler already exists; skipping.')
            continue

        print(obj_key)
        print(f'Status Code: {res["ResponseMetadata"]["HTTPStatusCode"]}')

enigma-jhu
Status Code: 200
enigma-nyt-usa-states
Status Code: 200
enigma-usa-counties
Status Code: 200
rearc-usa-daily-test
Status Code: 200
rearc-usa-daily-testing
Status Code: 200
rearc-usa-hospital-beds
Status Code: 200
rearc-usa-latest-total
Status Code: 200
static-country-codes
Status Code: 200
static-county-codes
Status Code: 200
static-state-codes
Status Code: 200
