# This notebook is used to create the relevant IAM roles, policies, RedShift cluster and secret in Secret Manager.

**Note:** Please set kernel to `Python 3 (Data Science)`

### Variables
Variable names for secret, RedShift, Athena and Glue.

In [1]:
secret_name='bankdm_redshift_login' 

# Random function to generate password.
import random
import string
def random_char(y):
       return ''.join(random.choice(string.ascii_letters) for x in range(y))
    
# The variables below are only required for notebook 01
# The RedShift, Athena and Glue information are stored in Secrets Manager
subnet_name = 'Private subnet' # Change this is the private subnet name is different

database_name_redshift = 'bankdm'
database_name_glue = 'bankdm'

schema_redshift = 'dm'
schema_athena = 'athena' # have to be athena

table_name_glue = 'bankdm_glue'
table_name_redshift = 'data'


# Redshift configuration parameters
redshift_cluster_identifier = 'bankdm'
database_name = 'bankdm'
cluster_type = 'single-node' # or multi-node

master_user_name = 'bankdm'
master_user_pw = random_char(16) + '1' # the password requires a number

# Note that only some Instance Types support Redshift Query Editor 
# (https://docs.aws.amazon.com/redshift/latest/mgmt/query-editor.html)
node_type = 'dc2.large'
# number_nodes = '1' # for multi-node. Also uncomment this line below: NumberOfNodes=int(number_nodes),

# Set the security group ID if not using the default one
security_group_id = None


### Import the necessary libraries and create client session


In [2]:
import json
import boto3
from botocore.exceptions import ClientError
from botocore.config import Config
import time
import sagemaker
import zipfile

iam = boto3.client('iam')
sts = boto3.client('sts')
accountID = sts.get_caller_identity()["Account"]  
redshift = boto3.client('redshift')
sm = boto3.client('sagemaker')
ec2 = boto3.client('ec2')
secretsmanager = boto3.client('secretsmanager')

s3 = boto3.client('s3')
lambda_client = boto3.client('lambda')

sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

bucket = sagemaker_session.default_bucket()

## IAM Roles and Policy
### Adding permissions to SageMaker Execution role

In [3]:
role_name = role.split("/")[-1]

print("Role name: {}".format(role_name))

Role name: AmazonSageMaker-ExecutionRole-20210914T142114


In [4]:
setup_iam_roles_passed = False
admin = False
post_policies = iam.list_attached_role_policies(RoleName=role_name)["AttachedPolicies"]
for post_policy in post_policies:
    if post_policy["PolicyName"] == "AdministratorAccess":
        admin = True
        break

setup_iam_roles_passed = True
print("[OK] You are all set up to continue with this workshop!")

[OK] You are all set up to continue with this workshop!


In [5]:
if not admin:
    pre_policies = iam.list_attached_role_policies(RoleName=role_name)["AttachedPolicies"]

    required_policies = ["IAMFullAccess"]

    for pre_policy in pre_policies:
        for role_req in required_policies:
            if pre_policy["PolicyName"] == role_req:
                print("Attached: {}".format(pre_policy["PolicyName"]))
                try:
                    required_policies.remove(pre_policy["PolicyName"])
                except:
                    pass

    if len(required_policies) > 0:
        print(
            "*************** [ERROR] You need to attach the following policies in order to continue with this workshop *****************\n"
        )
        for required_policy in required_policies:
            print("Not Attached: {}".format(required_policy))
    else:
        print("[OK] You are all set to continue with this notebook!")
else:
    print("[OK] You are all set to continue with this notebook!")

Attached: IAMFullAccess
[OK] You are all set to continue with this notebook!


#### Create a function to add policy to the role

In [6]:
def addPolicy(policy, role_name):
    try:
        response = iam.attach_role_policy(PolicyArn="arn:aws:iam::aws:policy/{}".format(policy), RoleName=role_name)
        print("Policy {} has been succesfully attached to role: {}".format(policy, role_name))
    except ClientError as e:
        if e.response["Error"]["Code"] == "EntityAlreadyExists":
            print("[OK] Policy is already attached.")
        elif e.response["Error"]["Code"] == "LimitExceeded":
            print("[OK]")
        else:
            print("*************** [ERROR] {} *****************".format(e))


#### Add the following policies to the role.

In [7]:
addPolicy("AmazonRedshiftFullAccess", role_name)
addPolicy("SecretsManagerReadWrite", role_name)
addPolicy("AmazonAthenaFullAccess", role_name)
# The Lambda role is needed to create the lambda function below
addPolicy("AWSLambda_FullAccess", role_name)


Policy AmazonRedshiftFullAccess has been succesfully attached to role: AmazonSageMaker-ExecutionRole-20210914T142114
Policy SecretsManagerReadWrite has been succesfully attached to role: AmazonSageMaker-ExecutionRole-20210914T142114
Policy AmazonAthenaFullAccess has been succesfully attached to role: AmazonSageMaker-ExecutionRole-20210914T142114
Policy AWSLambda_FullAccess has been succesfully attached to role: AmazonSageMaker-ExecutionRole-20210914T142114


### Add the following policies to SageMaker ServiceCatalog role

In [8]:
servicerole = 'AmazonSageMakerServiceCatalogProductsUseRole'
addPolicy("AmazonSageMakerPipelinesIntegrations", servicerole)
# The Lambda role is required to create lambda function in the SageMaker Pipeline. 
# However, this portion of the code is commented out.
addPolicy("AWSLambda_FullAccess", servicerole)

Policy AmazonSageMakerPipelinesIntegrations has been succesfully attached to role: AmazonSageMakerServiceCatalogProductsUseRole
Policy AWSLambda_FullAccess has been succesfully attached to role: AmazonSageMakerServiceCatalogProductsUseRole


### Add permissions to BankDM role
#### Create AssumeRolePolicyDocument

In [9]:
role = f"arn:aws:iam::{accountID}:role/service-role/AmazonSageMakerServiceCatalogProductsUseRole"
assume_role_policy_doc = {
  "Version": "2012-10-17",
  "Statement": [
    {
      "Effect": "Allow",
      "Principal": {
        "AWS": role,
        "Service": ["sagemaker.amazonaws.com", "redshift.amazonaws.com"]
      },
      "Action": "sts:AssumeRole"
    }
  ]
}

assume_role_policy_doc

{'Version': '2012-10-17',
 'Statement': [{'Effect': 'Allow',
   'Principal': {'AWS': 'arn:aws:iam::138604873012:role/service-role/AmazonSageMakerServiceCatalogProductsUseRole',
    'Service': ['sagemaker.amazonaws.com', 'redshift.amazonaws.com']},
   'Action': 'sts:AssumeRole'}]}

#### Create Role

In [10]:
iam_redshift_role_name = 'BankDM-RedShift'

In [11]:
try:
    iam_role_redshift = iam.create_role(
        RoleName=iam_redshift_role_name,
        AssumeRolePolicyDocument=json.dumps(assume_role_policy_doc),
        Description='BankDM Redshift Role'
    )
except ClientError as e:
    if e.response['Error']['Code'] == 'EntityAlreadyExists':
        print("Role already exists")
    else:
        print("Unexpected error: %s" % e)

#### Get the Role ARN

In [12]:
role = iam.get_role(RoleName=iam_redshift_role_name)
iam_role_redshift_arn = role['Role']['Arn']
print(iam_role_redshift_arn)

arn:aws:iam::138604873012:role/BankDM-RedShift


### Attach AWS built-in policy to role


In [13]:
addPolicy("SecretsManagerReadWrite", iam_redshift_role_name)
addPolicy("AmazonRedshiftFullAccess", iam_redshift_role_name)
addPolicy("AmazonSageMakerFullAccess", iam_redshift_role_name)
addPolicy("AmazonS3FullAccess", iam_redshift_role_name)
addPolicy("AmazonAthenaFullAccess", iam_redshift_role_name)

Policy SecretsManagerReadWrite has been succesfully attached to role: BankDM-RedShift
Policy AmazonRedshiftFullAccess has been succesfully attached to role: BankDM-RedShift
Policy AmazonSageMakerFullAccess has been succesfully attached to role: BankDM-RedShift
Policy AmazonS3FullAccess has been succesfully attached to role: BankDM-RedShift
Policy AmazonAthenaFullAccess has been succesfully attached to role: BankDM-RedShift


## RedShift cluster
### Get Security Group ID 

* Make sure the Redshift VPC is the same this notebook is running within
* Make sure the VPC has the following 2 properties enabled
 *     DNS resolution = Enabled
 *     DNS hostnames = Enabled
* This allows private, internal access to Redshift from this SageMaker notebook using the fully qualified endpoint name.

In [14]:
if security_group_id is None:
    try:
        domain_id = sm.list_domains()['Domains'][0]['DomainId'] #['NotebookInstances'][0]['NotebookInstanceName']
        describe_domain_response = sm.describe_domain(DomainId=domain_id)
        vpc_id = describe_domain_response['VpcId']
        security_groups = ec2.describe_security_groups(Filters=[{"Name": "vpc-id", "Values": [vpc_id]}])['SecurityGroups']
        security_group_id = ''

        for sg in security_groups:
            if(sg['GroupName'] == 'default'):
                security_group_id = sg['GroupId']

        print(security_group_id)    
    except:
        pass
else:
    pass

sg-0ed435283f51cab2b


### Subnet for RedShift

Get the subnet ID for the private subnet. 

In [15]:
sn_all = ec2.describe_subnets(Filters=[{"Name": "vpc-id", "Values": [vpc_id]}])
subnetId = ''
for sn in sn_all['Subnets'] :
    if(sn['Tags'][0]['Value'] == subnet_name):
       subnetId = sn['SubnetId']
subnetId

'subnet-0642e85803c212573'

### Create Redshift Cluster
Create the RedShift subnet group and create the RedShift cluster.

In [16]:
try:
    response = redshift.create_cluster_subnet_group(
        ClusterSubnetGroupName='bankdm-subnet',
        Description='string',
        SubnetIds=[
            subnetId,
        ]
    )
    
except ClientError as e:
    if e.response['Error']['Code'] == 'ClusterSubnetGroupAlreadyExists':
        print("Cluster subnet group already exists. This is ok.")
    else:
        print("Unexpected error: %s" % e)

Cluster subnet group already exists. This is ok.


In [17]:
try:
    response = redshift.create_cluster(
            DBName=database_name,
            ClusterIdentifier=redshift_cluster_identifier,
            ClusterType=cluster_type,
            NodeType=node_type,
    #         NumberOfNodes=int(number_nodes),       # This is required if multi-node is specified
            ClusterSubnetGroupName='bankdm-subnet',
            MasterUsername=master_user_name,
            MasterUserPassword=master_user_pw,
            IamRoles=[iam_role_redshift_arn],
            VpcSecurityGroupIds=[security_group_id],
            Port=5439,
            PubliclyAccessible=False
    )
    
except ClientError as e:
    if e.response['Error']['Code'] == 'ClusterAlreadyExists':
        print("Cluster already exists. This is ok.")
    else:
        print("Unexpected error: %s" % e)

#### Please Wait for Cluster Status to change to `Available`

In [18]:
response = redshift.describe_clusters(ClusterIdentifier=redshift_cluster_identifier)
cluster_status = response['Clusters'][0]['ClusterStatus']
print(cluster_status)

while cluster_status != 'available':
    time.sleep(10)
    response = redshift.describe_clusters(ClusterIdentifier=redshift_cluster_identifier)
    cluster_status = response['Clusters'][0]['ClusterStatus']
    print(cluster_status)

creating
creating
creating
creating
creating
creating
creating
creating
creating
creating
creating
creating
creating
creating
available


In [19]:
response = redshift.describe_clusters(ClusterIdentifier=redshift_cluster_identifier)
host = response['Clusters'][0]['Endpoint']['Address']
port = response['Clusters'][0]['Endpoint']['Port']
print(host)

bankdm.cszyoc0ofzdt.ap-southeast-1.redshift.amazonaws.com


## Create Secret in Secrets Manager

Add RedShift, Athena and Glue information to the secret. 

Note: If the secret already exists and you are creating the RedShift cluster again, the secret will not be updated to the new password. Please update the password manually in Secrets Manager.
This is to prevent accidential update to the secret.

In [20]:
secretstring = f'"username":"{master_user_name}","password":"{master_user_pw}","engine":"redshift", \
"host":"{host}","port": "{port}","dbClusterIdentifier":"{redshift_cluster_identifier}", "db":"{database_name}", \
"database_name_redshift":"{database_name_redshift}","database_name_glue": "{database_name_glue}", \
"schema_redshift":"{schema_redshift}", "schema_athena":"{schema_athena}", \
"table_name_glue":"{table_name_glue}", "table_name_redshift":"{table_name_redshift}"'

secretstring 

'"username":"bankdm","password":"cXVgQMCfFHPpyAUN1","engine":"redshift", "host":"bankdm.cszyoc0ofzdt.ap-southeast-1.redshift.amazonaws.com","port": "5439","dbClusterIdentifier":"bankdm", "db":"bankdm", "database_name_redshift":"bankdm","database_name_glue": "bankdm", "schema_redshift":"dm", "schema_athena":"athena", "table_name_glue":"bankdm_glue", "table_name_redshift":"data"'

In [21]:
try:
    response = secretsmanager.create_secret(
        Name=secret_name,
        Description='BankDM Redshift Login',
        SecretString= '{' + secretstring + '}',
    )
except ClientError as e:
    if e.response['Error']['Code'] == 'ResourceExistsException':
        print("Secret already exists. If you are recreating the RedShift cluster, please update the password manually ")
    else:
        print("Unexpected error: %s" % e)

### Create Lambda IAM role and policy

In [22]:
def create_lambda_role(role_name):
    try:
        response = iam.create_role(
            RoleName = role_name,
            AssumeRolePolicyDocument = json.dumps({
                "Version": "2012-10-17",
                "Statement": [
                    {
                        "Effect": "Allow",
                        "Principal": {
                            "Service": "lambda.amazonaws.com"
                        },
                        "Action": "sts:AssumeRole"
                    }
                ]
            }),
            Description='Role for Lambda'
        )

        role_arn = response['Role']['Arn']

        response = iam.attach_role_policy(
            RoleName=role_name,
            PolicyArn='arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole'
        )

        
        addPolicy("SecretsManagerReadWrite", role_name)
        addPolicy("AmazonRedshiftFullAccess", role_name)
        addPolicy("AmazonSageMakerFullAccess", role_name)
        addPolicy("AmazonS3FullAccess", role_name)
        
        return role_arn

    except iam.exceptions.EntityAlreadyExistsException:
        print(f'Using ARN from existing role: {role_name}')
        response = iam.get_role(RoleName=role_name)
        return response['Role']['Arn']

lambda_role = create_lambda_role("BankDM-Lambda")

Policy SecretsManagerReadWrite has been succesfully attached to role: BankDM-Lambda
Policy AmazonRedshiftFullAccess has been succesfully attached to role: BankDM-Lambda
Policy AmazonSageMakerFullAccess has been succesfully attached to role: BankDM-Lambda
Policy AmazonS3FullAccess has been succesfully attached to role: BankDM-Lambda


In [25]:
archive = zipfile.ZipFile('lambda.zip', 'w')
archive.write('lambda_redshift_dl.py', 'lambda_redshift_dl.py')
archive.close()

s3.upload_file('lambda.zip', bucket, 'bankdm/lambda.zip')

try:
    response = lambda_client.delete_function(
            FunctionName='bankdm-redshift-dl',
        )
except ClientError as e:
    if e.response['Error']['Code'] == 'ResourceNotFoundException':
        print("Lambda function not found. Creating it...")
    else:
        print("Unexpected error: %s" % e) 

try:
    response = lambda_client.create_function(
                Code={
                    'S3Bucket': bucket,
                    'S3Key': 'bankdm/lambda.zip', 
                },
                FunctionName='bankdm-redshift-dl',
                Handler='lambda_redshift_dl.lambda_handler',
                Publish=True,
                Role=lambda_role,
                Runtime='python3.8',
                Timeout=600, # Set to 10 minutes
                MemorySize=512,
            )
except ClientError as e:
    print("Unexpected error: %s" % e) 

Lambda function not found. Creating it...
