In [26]:
# Load in all required libraries
import pandas as pd 
import boto3
import botocore.exceptions
import json
import configparser
import time

In [2]:
# Open and read the contents of the config file
ioc_config = configparser.ConfigParser()
ioc_config.read_file(open('./dwh-ioc.cfg'))

In [3]:
# Load all the keys needed to create AWS services
KEY                    = ioc_config.get('AWS','KEY')
SECRET                 = ioc_config.get('AWS','SECRET')

DWH_REGION             = ioc_config.get("DWH","DWH_REGION")
DWH_CLUSTER_TYPE       = ioc_config.get("DWH","DWH_CLUSTER_TYPE")
DWH_NUM_NODES          = ioc_config.get("DWH","DWH_NUM_NODES")
DWH_NODE_TYPE          = ioc_config.get("DWH","DWH_NODE_TYPE")

DWH_CLUSTER_IDENTIFIER = ioc_config.get("DWH","DWH_CLUSTER_IDENTIFIER")
DWH_DB                 = ioc_config.get("DWH","DWH_DB")
DWH_DB_USER            = ioc_config.get("DWH","DWH_DB_USER")
DWH_DB_PASSWORD        = ioc_config.get("DWH","DWH_DB_PASSWORD")
DWH_PORT               = ioc_config.get("DWH","DWH_PORT")

DWH_IAM_ROLE_NAME      = ioc_config.get("DWH", "DWH_IAM_ROLE_NAME")

In [5]:
# Display the parameters for creating the DWH cluster
df = pd.DataFrame({
        "Param":["DWH_REGION", "DWH_CLUSTER_TYPE", "DWH_NUM_NODES", "DWH_NODE_TYPE", "DWH_CLUSTER_IDENTIFIER", "DWH_DB", "DWH_PORT", "DWH_IAM_ROLE_NAME"],
        "Value":[DWH_REGION, DWH_CLUSTER_TYPE, DWH_NUM_NODES, DWH_NODE_TYPE, DWH_CLUSTER_IDENTIFIER, DWH_DB, DWH_PORT, DWH_IAM_ROLE_NAME]
    })

print(df)

                    Param       Value
0              DWH_REGION   us-east-1
1        DWH_CLUSTER_TYPE  multi-node
2           DWH_NUM_NODES           2
3           DWH_NODE_TYPE   dc2.large
4  DWH_CLUSTER_IDENTIFIER  dwhCluster
5                  DWH_DB         dwh
6                DWH_PORT        5439
7       DWH_IAM_ROLE_NAME     dwhRole


In [6]:
# Creating resources/clients for all needed infrastructure: EC2, S3, IAM, Redshift
def create_client(name, func):
    print("Creating client for", name)
    return func(name,
                region_name=DWH_REGION,
                aws_access_key_id=KEY,
                aws_secret_access_key=SECRET)


ec2 = create_client('ec2', boto3.resource)
s3 = create_client('s3', boto3.resource)
iam = create_client('iam', boto3.client)
redshift = create_client('redshift', boto3.client)

Creating client for ec2
Creating client for s3
Creating client for iam
Creating client for redshift


In [17]:
# Creating IAM role for Redshift, allowing it to use AWS services
print("Creating a new IAM Role") 
try:
    resp = iam.create_role(Path='/',
                           RoleName=DWH_IAM_ROLE_NAME,
                           Description = "Allows Redshift clusters to call AWS services on your behalf.",
                           AssumeRolePolicyDocument=json.dumps({'Statement': [{'Action': 'sts:AssumeRole',
                                                                               'Effect': 'Allow',
                                                                               'Principal': {'Service': 'redshift.amazonaws.com'}}],
                                                                'Version': '2012-10-17'}
                                                              )
                          )
    print("IAM Role created")
except iam.exceptions.EntityAlreadyExistsException:
    print("IAM Role already created")
except Exception as e:
    print("Error creating IAM Role:", e)

Creating a new IAM Role
IAM Role already created


In [18]:
# Attaching policy to role, and return the ARN role 
print("Attaching policy to IAM role")
iam.attach_role_policy(RoleName=DWH_IAM_ROLE_NAME,
                       PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess")['ResponseMetadata']['HTTPStatusCode']
roleArn = iam.get_role(RoleName=DWH_IAM_ROLE_NAME)['Role']['Arn']
#print("ARN role:", roleArn)
# TODO Save to dwh.cfg

Attaching policy to IAM role


In [20]:
# Creates Redshift cluster (Warning, this costs money - make sure to use it or delete it again!)
cluster = redshift.create_cluster(
    #Hardware provisioned
    ClusterType=DWH_CLUSTER_TYPE,
    NodeType=DWH_NODE_TYPE,
    NumberOfNodes=int(DWH_NUM_NODES),

    #Identifiers & Credentials
    DBName=DWH_DB,
    ClusterIdentifier=DWH_CLUSTER_IDENTIFIER,
    MasterUsername=DWH_DB_USER,
    MasterUserPassword=DWH_DB_PASSWORD,
            
    #Roles (for s3 access)
    IamRoles=[roleArn]  
)

In [None]:
#print(cluster['Cluster']['NodeType'])
# TODO: Pretty print only needed (and public) information

In [21]:
# Query status of the cluster
def prettyRedshiftProps(props, limited = True):
    #pd.set_option('display.max_colwidth', -1)
    if limited:
        keysToShow = ["ClusterStatus"]
    else:
        keysToShow = ["ClusterIdentifier", "NodeType", "ClusterStatus", "MasterUsername", "DBName", "Endpoint", "NumberOfNodes", 'VpcId']
    x = [(k, v) for k,v in props.items() if k in keysToShow]
    return pd.DataFrame(data=x, columns=["Key", "Value"])

# Print status, sleep if not available, try again
while True:
    myClusterProps = redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)['Clusters'][0]
    df = prettyRedshiftProps(myClusterProps, limited=True)
    print(df)
    if myClusterProps['ClusterStatus'] == 'available':
        break
    time.sleep(20) # Sleep 20 seconds, and look again, untill cluster becomes available

# Print full details once cluster is available
df = prettyRedshiftProps(myClusterProps, limited=False)
print(df)

             Key     Value
0  ClusterStatus  creating
             Key     Value
0  ClusterStatus  creating
             Key     Value
0  ClusterStatus  creating
             Key     Value
0  ClusterStatus  creating
             Key     Value
0  ClusterStatus  creating
             Key     Value
0  ClusterStatus  creating
             Key      Value
0  ClusterStatus  available
                 Key                                              Value
0  ClusterIdentifier                                         dwhcluster
1           NodeType                                          dc2.large
2      ClusterStatus                                          available
3     MasterUsername                                            awsuser
4             DBName                                                dwh
5           Endpoint  {'Address': 'dwhcluster.csvcg7v4wmvs.us-east-1...
6              VpcId                                       vpc-bf3e0fc5
7      NumberOfNodes                        

In [23]:
# Get endpoint and ARN role for cluster
cluster_properties = redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)['Clusters'][0]

DWH_ENDPOINT = myClusterProps['Endpoint']['Address']
DWH_ROLE_ARN = myClusterProps['IamRoles'][0]['IamRoleArn']
#print("DWH_ENDPOINT:", DWH_ENDPOINT)
#print("DWH_ROLE_ARN:", DWH_ROLE_ARN)
# TODO: Add these to dwg.cfg

In [27]:
# Update cluster security group to allow access through redshift port
vpc = ec2.Vpc(id=myClusterProps['VpcId'])

# The first Security group should be the default one
defaultSg = list(vpc.security_groups.all())[0]
print("Default Security group:", defaultSg)

# Authorize access
try:
    defaultSg.authorize_ingress(GroupName=defaultSg.group_name,
                                CidrIp='0.0.0.0/0',
                                IpProtocol='TCP',
                                FromPort=int(DWH_PORT),
                                ToPort=int(DWH_PORT)
                               )
    print("Access authorized")
except botocore.exceptions.ClientError as e:
    print("ClientError:", e)
except Exception as e:
    print("Error:", e)

Default Security group: ec2.SecurityGroup(id='sg-73b39458')
ClientError: An error occurred (InvalidPermission.Duplicate) when calling the AuthorizeSecurityGroupIngress operation: the specified rule "peer: 0.0.0.0/0, TCP, from port: 5439, to port: 5439, ALLOW" already exists


In [28]:
# Test connection
import psycopg2

dwh_config = configparser.ConfigParser()
dwh_config.read_file(open('./dwh.cfg'))

conn = psycopg2.connect("host={} dbname={} user={} password={} port={}".format(*dwh_config['CLUSTER'].values()))

cur = conn.cursor()
print('Connected to AWS Redshift cluster')
conn.close()

Connected to AWS Redshift cluster


In [None]:
# TODO: Add credentials created to dwh.cfg file automatically, so we know they are up to date

In [None]:
# Created and connection tested

In [None]:
# Teardown of cluster to save money

In [35]:
# Delete cluster (will take time)
resp = redshift.delete_cluster(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER,   
                               SkipFinalClusterSnapshot=True)

In [36]:
# Query the status - I have no idea what the status will become after deletion, so no loop here
myClusterProps = redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)['Clusters'][0]
prettyRedshiftProps(myClusterProps, limited=False)

Unnamed: 0,Key,Value
0,ClusterIdentifier,dwhcluster
1,NodeType,dc2.large
2,ClusterStatus,deleting
3,MasterUsername,awsuser
4,DBName,dwh
5,Endpoint,{'Address': 'dwhcluster.csvcg7v4wmvs.us-east-1...
6,VpcId,vpc-bf3e0fc5
7,NumberOfNodes,2


In [34]:
# Detach and delete role, since there are no cluster to use this on
detach_resp = iam.detach_role_policy(RoleName=DWH_IAM_ROLE_NAME, 
                       PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess")
delete_resp = iam.delete_role(RoleName=DWH_IAM_ROLE_NAME)

# TODO: Print status of these

In [None]:
# Everything SHOULD(?) be deleted