# Exercise 2: Creating a Redshift Cluster using the AWS Python SDK
## An example of Infrastructure-as-code

In [1]:
import pandas as pd
import boto3
import json

# STEP 0: Make sure you have an AWS secret and access key

- Create a new IAM user in your AWS account
- Give it `AdministratorAccess` from the `Attach existing policies directly` tab
- Take note of the access key and secret 
- Edit the file `dwh.cfg` in the same folder as this notebook and fill in the following values:
<font color='red'>
<BR>
[AWS]<BR>
KEY= YOUR_AWS_KEY<BR>
SECRET= YOUR_AWS_SECRET<BR>
VPC_SECURITY_GROUP_ID= YOUR_SECURITY_GROUP_ID<BR>
</font>


# Load DWH Params from a file

In [2]:
import configparser

# Load every setting this notebook needs from dwh.cfg (same folder as the notebook).
config = configparser.ConfigParser()
# A context manager guarantees the file handle is closed once parsing is done.
with open('dwh.cfg') as config_file:
    config.read_file(config_file)

# [AWS] section: credentials and the security group attached to the cluster.
KEY                    = config.get('AWS', 'KEY')
SECRET                 = config.get('AWS', 'SECRET')
VPC_SECURITY_GROUP_ID  = config.get('AWS', 'VPC_SECURITY_GROUP_ID')

# [DWH] section: cluster hardware.
DWH_CLUSTER_TYPE       = config.get('DWH', 'DWH_CLUSTER_TYPE')
DWH_NUM_NODES          = config.get('DWH', 'DWH_NUM_NODES')
DWH_NODE_TYPE          = config.get('DWH', 'DWH_NODE_TYPE')

# [DWH] section: cluster identity, database and credentials.
DWH_CLUSTER_IDENTIFIER = config.get('DWH', 'DWH_CLUSTER_IDENTIFIER')
DWH_DB                 = config.get('DWH', 'DWH_DB')
DWH_DB_USER            = config.get('DWH', 'DWH_DB_USER')
DWH_DB_PASSWORD        = config.get('DWH', 'DWH_DB_PASSWORD')
DWH_PORT               = config.get('DWH', 'DWH_PORT')

DWH_IAM_ROLE_NAME      = config.get('DWH', 'DWH_IAM_ROLE_NAME')

# Show the DWH settings as a table (last expression of the cell, so it is displayed).
# NOTE: DWH_DB_PASSWORD appears in clear text in this table; clear the cell
# output before sharing or committing the notebook.
pd.DataFrame({'Param':
                  ['DWH_CLUSTER_TYPE',
                   'DWH_NUM_NODES',
                   'DWH_NODE_TYPE',
                   'DWH_CLUSTER_IDENTIFIER',
                   'DWH_DB',
                   'DWH_DB_USER',
                   'DWH_DB_PASSWORD',
                   'DWH_PORT',
                   'DWH_IAM_ROLE_NAME'],
              'Value':
                  [DWH_CLUSTER_TYPE,
                   DWH_NUM_NODES,
                   DWH_NODE_TYPE,
                   DWH_CLUSTER_IDENTIFIER,
                   DWH_DB,
                   DWH_DB_USER,
                   DWH_DB_PASSWORD,
                   DWH_PORT,
                   DWH_IAM_ROLE_NAME]
             })

Unnamed: 0,Param,Value
0,DWH_CLUSTER_TYPE,multi-node
1,DWH_NUM_NODES,2
2,DWH_NODE_TYPE,dc2.large
3,DWH_CLUSTER_IDENTIFIER,dwhCluster
4,DWH_DB,dwh
5,DWH_DB_USER,dwhuser
6,DWH_DB_PASSWORD,Passw0rd
7,DWH_PORT,5439
8,DWH_IAM_ROLE_NAME,dwhRole


## Create clients for EC2, S3, IAM, and Redshift

In [3]:
# Every client/resource below uses the same region and credentials, so they
# are defined once here instead of being repeated per call.
AWS_REGION = 'us-east-1'
aws_session_kwargs = dict(region_name=AWS_REGION,
                          aws_access_key_id=KEY,
                          aws_secret_access_key=SECRET)

# Resource-style handles for EC2 and S3.
ec2 = boto3.resource('ec2', **aws_session_kwargs)
s3 = boto3.resource('s3', **aws_session_kwargs)

# Client-style handles for IAM and Redshift.
iam = boto3.client('iam', **aws_session_kwargs)
redshift = boto3.client('redshift', **aws_session_kwargs)

## Check out the sample data sources on S3

In [4]:
# Handle to the bucket that holds the sample data sources.
sampleDbBucket = s3.Bucket('awssampledbuswest2')

# Restrict the listing to keys that start with "ssbgz" and print each entry.
ssbgz_objects = sampleDbBucket.objects.filter(Prefix='ssbgz')
for object_summary in ssbgz_objects:
    print(object_summary)

s3.ObjectSummary(bucket_name='awssampledbuswest2', key='ssbgz/')
s3.ObjectSummary(bucket_name='awssampledbuswest2', key='ssbgz/customer0002_part_00.gz')
s3.ObjectSummary(bucket_name='awssampledbuswest2', key='ssbgz/dwdate.tbl.gz')
s3.ObjectSummary(bucket_name='awssampledbuswest2', key='ssbgz/lineorder0000_part_00.gz')
s3.ObjectSummary(bucket_name='awssampledbuswest2', key='ssbgz/lineorder0001_part_00.gz')
s3.ObjectSummary(bucket_name='awssampledbuswest2', key='ssbgz/lineorder0002_part_00.gz')
s3.ObjectSummary(bucket_name='awssampledbuswest2', key='ssbgz/lineorder0003_part_00.gz')
s3.ObjectSummary(bucket_name='awssampledbuswest2', key='ssbgz/lineorder0004_part_00.gz')
s3.ObjectSummary(bucket_name='awssampledbuswest2', key='ssbgz/lineorder0005_part_00.gz')
s3.ObjectSummary(bucket_name='awssampledbuswest2', key='ssbgz/lineorder0006_part_00.gz')
s3.ObjectSummary(bucket_name='awssampledbuswest2', key='ssbgz/lineorder0007_part_00.gz')
s3.ObjectSummary(bucket_name='awssampledbuswest2', key='s

## STEP 1: IAM ROLE
- Create an IAM role that allows Redshift to access the S3 bucket (read-only)

In [5]:
# Walk through every page of the IAM list_users result and print each raw page.
user_pages = iam.get_paginator('list_users').paginate()
for page in user_pages:
    print(page)

{'Users': [{'Path': '/', 'UserName': 'airflow_redshift_user', 'UserId': 'AIDASMUDD4C2GAG6L4GCG', 'Arn': 'arn:aws:iam::164557480116:user/airflow_redshift_user', 'CreateDate': datetime.datetime(2020, 4, 3, 0, 51, 38, tzinfo=tzutc())}, {'Path': '/', 'UserName': 'dwhadmin', 'UserId': 'AIDASMUDD4C2PXALN6S5X', 'Arn': 'arn:aws:iam::164557480116:user/dwhadmin', 'CreateDate': datetime.datetime(2020, 4, 5, 16, 54, 56, tzinfo=tzutc())}, {'Path': '/', 'UserName': 's3user', 'UserId': 'AIDASMUDD4C2EMVZ4JKHW', 'Arn': 'arn:aws:iam::164557480116:user/s3user', 'CreateDate': datetime.datetime(2020, 4, 9, 13, 31, 42, tzinfo=tzutc())}], 'IsTruncated': False, 'ResponseMetadata': {'RequestId': 'f443292f-6f85-4e49-be4e-934f23191f1e', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'f443292f-6f85-4e49-be4e-934f23191f1e', 'content-type': 'text/xml', 'content-length': '1079', 'date': 'Sun, 12 Apr 2020 15:18:50 GMT'}, 'RetryAttempts': 0}}


In [6]:
# Trust policy that allows the Redshift service to assume the role.
redshift_trust_policy = {
    'Statement': [{'Action': 'sts:AssumeRole',
                   'Effect': 'Allow',
                   'Principal': {'Service': 'redshift.amazonaws.com'}}],
    'Version': '2012-10-17',
}

# Create the role named in dwh.cfg; any failure (for example the role already
# existing) is printed rather than raised so the notebook keeps running.
try:
    print('1.1 Creating a new IAM Role')
    dwhRole = iam.create_role(
        Path='/',
        RoleName=DWH_IAM_ROLE_NAME,
        Description='Allows Redshift clusters to call AWS services on your behalf.',
        AssumeRolePolicyDocument=json.dumps(redshift_trust_policy),
    )
except Exception as error:
    print(error)

1.1 Creating a new IAM Role
An error occurred (EntityAlreadyExists) when calling the CreateRole operation: Role with name dwhRole already exists.


In [7]:
# Attach the AWS-managed S3 read-only policy to the role.
print('1.2 Attaching Policy')
attach_response = iam.attach_role_policy(
    RoleName=DWH_IAM_ROLE_NAME,
    PolicyArn='arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess',
)
# Keep the HTTP status code as the cell's last expression so it is displayed.
attach_response['ResponseMetadata']['HTTPStatusCode']

1.2 Attaching Policy


200

In [None]:
# Look the role up by name and keep its ARN for the create_cluster call.
print('1.3 Get the IAM role ARN')
role_description = iam.get_role(RoleName=DWH_IAM_ROLE_NAME)
roleArn = role_description['Role']['Arn']

print(roleArn)

## STEP 2:  Redshift Cluster

- Create a RedShift Cluster
- For complete arguments to `create_cluster`, see [docs](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/redshift.html#Redshift.Client.create_cluster)

In [None]:
# Request a new Redshift cluster using the settings loaded from dwh.cfg.
# Errors (for example the cluster already existing) are printed, not raised.
try:
    response = redshift.create_cluster(
        # Hardware
        ClusterType=DWH_CLUSTER_TYPE,
        NodeType=DWH_NODE_TYPE,
        NumberOfNodes=int(DWH_NUM_NODES),
        # Identifiers & credentials
        DBName=DWH_DB,
        ClusterIdentifier=DWH_CLUSTER_IDENTIFIER,
        MasterUsername=DWH_DB_USER,
        MasterUserPassword=DWH_DB_PASSWORD,
        # Listen on the configured port. The security-group rule and the
        # connection string later in this notebook both use DWH_PORT, so the
        # cluster must be created on that same port.
        Port=int(DWH_PORT),
        # Role that allows the cluster to read from S3
        IamRoles=[roleArn],
        # Security group (from dwh.cfg) to attach to the cluster
        VpcSecurityGroupIds=[VPC_SECURITY_GROUP_ID]
    )
except Exception as e:
    print(e)

## 2.1 *Describe* the cluster to see its status
- run this block several times until the cluster status becomes `Available`

In [None]:
def prettyRedshiftProps(props):
    """Summarise a cluster description as a two-column Key/Value table.

    Args:
        props: Mapping of cluster property names to values, such as one
            entry of the 'Clusters' list returned by describe_clusters.

    Returns:
        A pandas DataFrame with columns 'Key' and 'Value' holding only the
        selected properties, in the order they appear in ``props``.

    Side effects:
        Sets the pandas option 'display.max_colwidth' to None.
    """
    pd.set_option('display.max_colwidth', None)

    # Only these properties are kept in the summary.
    wanted_keys = ('ClusterIdentifier', 'NodeType', 'ClusterStatus',
                   'MasterUsername', 'DBName', 'Endpoint',
                   'NumberOfNodes', 'VpcId')

    rows = []
    for name, value in props.items():
        if name in wanted_keys:
            rows.append((name, value))

    return pd.DataFrame(data=rows, columns=['Key', 'Value'])

# Fetch the current description of our cluster and show the summary table.
cluster_descriptions = redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)
myClusterProps = cluster_descriptions['Clusters'][0]
prettyRedshiftProps(myClusterProps)

In [None]:
# Display the complete, unfiltered describe_clusters response for the cluster.
redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)

<h2> 2.2 Take note of the cluster <font color='red'> endpoint and role ARN </font> </h2>

<font color='red'>DO NOT RUN THIS unless the cluster status becomes "Available" </font>

In [None]:
# Pull the endpoint address and the first attached role ARN out of the
# cluster description fetched earlier.
endpoint_info = myClusterProps['Endpoint']
DWH_ENDPOINT = endpoint_info['Address']

attached_roles = myClusterProps['IamRoles']
DWH_ROLE_ARN = attached_roles[0]['IamRoleArn']

print('DWH_ENDPOINT :: ', DWH_ENDPOINT)
print('DWH_ROLE_ARN :: ', DWH_ROLE_ARN)

## STEP 3: Open an incoming TCP port to access the cluster endpoint

In [None]:
# Allow inbound TCP traffic on the cluster port. Errors (for example the rule
# already existing) are printed rather than raised.
try:
    vpc = ec2.Vpc(id=myClusterProps['VpcId'])
    # Find the security group configured in dwh.cfg inside the cluster's VPC.
    redshift_sg = list(vpc.security_groups.filter(GroupIds=[VPC_SECURITY_GROUP_ID]))[0]
    print(redshift_sg)

    # GroupName is intentionally not passed: the SecurityGroup resource
    # already targets itself by ID, and per the EC2 documentation a group
    # name is only accepted for groups in a default VPC.
    # WARNING: 0.0.0.0/0 exposes the port to every IP address; narrow the
    # CIDR range for anything beyond a short-lived exercise.
    redshift_sg.authorize_ingress(
        CidrIp='0.0.0.0/0',
        IpProtocol='TCP',
        FromPort=int(DWH_PORT),
        ToPort=int(DWH_PORT)
    )
except Exception as e:
    print(e)

## STEP 4: Make sure you can connect to the cluster

In [None]:
%load_ext sql

In [None]:
conn_string='postgresql://{}:{}@{}:{}/{}'.format(DWH_DB_USER, DWH_DB_PASSWORD, DWH_ENDPOINT, DWH_PORT, DWH_DB)
print(conn_string)
%sql $conn_string

## STEP 5: Clean up your resources

<b><font color='red'>DO NOT RUN THIS UNLESS YOU ARE SURE <br/> 
    We will be using these resources in the next exercises</font></b>

In [None]:
#### CAREFUL!!
# Destructive: deletes the cluster without taking a final snapshot.
# The call is guarded by an explicit flag so that "Restart & Run All" cannot
# delete the cluster by accident. Set the flag to True and re-run to delete.
CONFIRM_DELETE_CLUSTER = False

if CONFIRM_DELETE_CLUSTER:
    redshift.delete_cluster(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER,
                            SkipFinalClusterSnapshot=True)
else:
    print('Cluster deletion skipped; set CONFIRM_DELETE_CLUSTER = True to delete.')
#### CAREFUL!!

- Run this block several times until the cluster is really deleted

In [None]:
# Poll the cluster status. Once the cluster no longer exists, describe_clusters
# raises ClusterNotFoundFault; report that instead of failing the cell.
try:
    myClusterProps = redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)['Clusters'][0]
    cluster_summary = prettyRedshiftProps(myClusterProps)
except redshift.exceptions.ClusterNotFoundFault:
    cluster_summary = None
    print('Cluster {} not found (it has been deleted or was never created).'.format(DWH_CLUSTER_IDENTIFIER))

# Last expression of the cell: shows the table while the cluster still exists.
cluster_summary

In [None]:
#### CAREFUL!!
# Destructive: detaches the S3 read-only policy and deletes the IAM role.
# The calls are guarded by an explicit flag so that "Restart & Run All" cannot
# remove the role by accident. Set the flag to True and re-run to delete.
CONFIRM_DELETE_ROLE = False

if CONFIRM_DELETE_ROLE:
    iam.detach_role_policy(RoleName=DWH_IAM_ROLE_NAME, PolicyArn='arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess')
    iam.delete_role(RoleName=DWH_IAM_ROLE_NAME)
else:
    print('IAM role deletion skipped; set CONFIRM_DELETE_ROLE = True to delete.')
#### CAREFUL!!