In [24]:
import pandas as pd
import boto3
from botocore.exceptions import ClientError
import json
import configparser
import psycopg2



In [9]:
config = configparser.ConfigParser()
config.read_file(open('dwh.cfg'))

KEY                    = config.get('AWS','KEY')
SECRET                 = config.get('AWS','SECRET')

DWH_CLUSTER_TYPE       = config.get("DWH","DWH_CLUSTER_TYPE")
DWH_NUM_NODES          = config.get("DWH","DWH_NUM_NODES")
DWH_NODE_TYPE          = config.get("DWH","DWH_NODE_TYPE")

DWH_CLUSTER_IDENTIFIER = config.get("DWH","DWH_CLUSTER_IDENTIFIER")
DWH_DB                 = config.get("DWH","DWH_DB")
DWH_DB_USER            = config.get("DWH","DWH_DB_USER")
DWH_DB_PASSWORD        = config.get("DWH","DWH_DB_PASSWORD")
DWH_PORT               = config.get("DWH","DWH_PORT")

DWH_IAM_ROLE_NAME      = config.get("DWH", "DWH_IAM_ROLE_NAME")

(DWH_DB_USER, DWH_DB_PASSWORD, DWH_DB)

pd.DataFrame({"Param":
                  ["DWH_CLUSTER_TYPE", "DWH_NUM_NODES", "DWH_NODE_TYPE", "DWH_CLUSTER_IDENTIFIER", "DWH_DB", "DWH_DB_USER", "DWH_DB_PASSWORD", "DWH_PORT", "DWH_IAM_ROLE_NAME"],
              "Value":
                  [DWH_CLUSTER_TYPE, DWH_NUM_NODES, DWH_NODE_TYPE, DWH_CLUSTER_IDENTIFIER, DWH_DB, DWH_DB_USER, DWH_DB_PASSWORD, DWH_PORT, DWH_IAM_ROLE_NAME]
             })

Unnamed: 0,Param,Value
0,DWH_CLUSTER_TYPE,multi-node
1,DWH_NUM_NODES,4
2,DWH_NODE_TYPE,dc2.large
3,DWH_CLUSTER_IDENTIFIER,dwhCluster
4,DWH_DB,dwh
5,DWH_DB_USER,dwhuser
6,DWH_DB_PASSWORD,Passw0rd
7,DWH_PORT,5439
8,DWH_IAM_ROLE_NAME,myRedshiftRole


In [3]:
#Create clients for each service
ec2 = boto3.resource('ec2', region_name='us-west-2', aws_access_key_id=KEY, aws_secret_access_key=SECRET)

s3 = boto3.resource('s3', region_name='us-west-2', aws_access_key_id=KEY, aws_secret_access_key=SECRET)

iam = boto3.client('iam', region_name='us-west-2', aws_access_key_id=KEY, aws_secret_access_key=SECRET)

redshift = boto3.client('redshift', region_name='us-west-2', aws_access_key_id=KEY, aws_secret_access_key=SECRET)

In [4]:
#Run over the sample data bucket
sampleDbBucket =  s3.Bucket("awssampledbuswest2")

for file in sampleDbBucket.objects.all():
    print(file)

s3.ObjectSummary(bucket_name='awssampledbuswest2', key='load/')
s3.ObjectSummary(bucket_name='awssampledbuswest2', key='load/customer-fw-manifest')
s3.ObjectSummary(bucket_name='awssampledbuswest2', key='load/customer-fw.tbl-000')
s3.ObjectSummary(bucket_name='awssampledbuswest2', key='load/customer-fw.tbl-000.bak')
s3.ObjectSummary(bucket_name='awssampledbuswest2', key='load/customer-fw.tbl-001')
s3.ObjectSummary(bucket_name='awssampledbuswest2', key='load/customer-fw.tbl-002')
s3.ObjectSummary(bucket_name='awssampledbuswest2', key='load/customer-fw.tbl-003')
s3.ObjectSummary(bucket_name='awssampledbuswest2', key='load/customer-fw.tbl-004')
s3.ObjectSummary(bucket_name='awssampledbuswest2', key='load/customer-fw.tbl-005')
s3.ObjectSummary(bucket_name='awssampledbuswest2', key='load/customer-fw.tbl-006')
s3.ObjectSummary(bucket_name='awssampledbuswest2', key='load/customer-fw.tbl-007')
s3.ObjectSummary(bucket_name='awssampledbuswest2', key='load/customer-fw.tbl.log')
s3.ObjectSummary(b

In [10]:
#Create a IAM role that makes Redshift able to access S3
try:
    print("1.1 Creating a new IAM Role")
    dwhRole = iam.create_role(
        RoleName=DWH_IAM_ROLE_NAME,
        Description = "Allows Redshift clusters to call AWS services on your behalf.",
        AssumeRolePolicyDocument=json.dumps({
            'Statement': [{'Action': 'sts:AssumeRole',
                           'Principal': {'Service': 'redshift.amazonaws.com'}}],
            'Version': '2012-10-17'
        })
    )
except Exception as e:
    print(e)

1.1 Creating a new IAM Role
An error occurred (EntityAlreadyExists) when calling the CreateRole operation: Role with name myRedshiftRole already exists.


In [11]:
#Attach POlicy
try:
    print("1.2 Attaching Policy")
    iam.attach_role_policy(RoleName=DWH_IAM_ROLE_NAME, PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess")['ResponseMetadata']['HTTPStatusCode']
except Exception as e:
    print(e)
    

1.2 Attaching Policy


In [12]:
#get and print the role ARN
try:
    print("1.3 Get the role ARN")
    roleArn = iam.get_role(RoleName=DWH_IAM_ROLE_NAME)['Role']['Arn']
    print(roleArn)
except Exception as e:
    print(e)
    
    

1.3 Get the role ARN
arn:aws:iam::827672602588:role/myRedshiftRole


In [13]:
#Create a redshift cluster
try:
    print("1.4 Creating a redshift cluster")
    response = redshift.create_cluster(
        #Parameters for the hardware configuration of the cluster
        ClusterType=DWH_CLUSTER_TYPE,
        NodeType=DWH_NODE_TYPE,
        NumberOfNodes=int(DWH_NUM_NODES),
        #add
        DBName=DWH_DB,
        ClusterIdentifier=DWH_CLUSTER_IDENTIFIER,
        MasterUsername=DWH_DB_USER,
        MasterUserPassword=DWH_DB_PASSWORD,
        Port=int(DWH_PORT),
        #Parameters for the IAM roles and other security settings
        IamRoles=[roleArn]
    )
    print(response)
except Exception as e:
    print(e)

1.4 Creating a redshift cluster
{'Cluster': {'ClusterIdentifier': 'dwhcluster', 'NodeType': 'dc2.large', 'ClusterStatus': 'creating', 'ClusterAvailabilityStatus': 'Modifying', 'MasterUsername': 'dwhuser', 'DBName': 'dwh', 'AutomatedSnapshotRetentionPeriod': 1, 'ManualSnapshotRetentionPeriod': -1, 'ClusterSecurityGroups': [], 'VpcSecurityGroups': [{'VpcSecurityGroupId': 'sg-0926c046fe470934b', 'Status': 'active'}], 'ClusterParameterGroups': [{'ParameterGroupName': 'default.redshift-1.0', 'ParameterApplyStatus': 'in-sync'}], 'ClusterSubnetGroupName': 'default', 'VpcId': 'vpc-036eade898b91a4d6', 'PreferredMaintenanceWindow': 'sat:08:00-sat:08:30', 'PendingModifiedValues': {'MasterUserPassword': '****'}, 'ClusterVersion': '1.0', 'AllowVersionUpgrade': True, 'NumberOfNodes': 4, 'PubliclyAccessible': True, 'Encrypted': False, 'Tags': [], 'EnhancedVpcRouting': False, 'IamRoles': [{'IamRoleArn': 'arn:aws:iam::827672602588:role/myRedshiftRole', 'ApplyStatus': 'adding'}], 'MaintenanceTrackName':

In [18]:
#describe the cluster to see its status
def prettyRedshiftProps(props):
    pd.set_option('display.max_colwidth', 1000)
    keysToShow = ["ClusterIdentifier", "NodeType", "ClusterStatus", "MasterUsername", "DBName", "Endpoint", "NumberOfNodes", 'VpcId']
    x = [(k, v) for k,v in props.items() if k in keysToShow]
    return pd.DataFrame(data=x, columns=["Key", "Value"])

myClusterProps = redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)['Clusters'][0]
prettyRedshiftProps(myClusterProps)

Unnamed: 0,Key,Value
0,ClusterIdentifier,dwhcluster
1,NodeType,dc2.large
2,ClusterStatus,available
3,MasterUsername,dwhuser
4,DBName,dwh
5,Endpoint,"{'Address': 'dwhcluster.ccvo2wutk8vc.us-west-2.redshift.amazonaws.com', 'Port': 5439}"
6,VpcId,vpc-036eade898b91a4d6
7,NumberOfNodes,4


In [19]:
#take note of the endpoint and role arn
DWH_ENDPOINT = myClusterProps['Endpoint']['Address']
DWH_ROLE_ARN = myClusterProps['IamRoles'][0]['IamRoleArn']
print("DWH_ENDPOINT :: ", DWH_ENDPOINT)
print("DWH_ROLE_ARN :: ", DWH_ROLE_ARN)

DWH_ENDPOINT ::  dwhcluster.ccvo2wutk8vc.us-west-2.redshift.amazonaws.com
DWH_ROLE_ARN ::  arn:aws:iam::827672602588:role/myRedshiftRole


In [20]:
#Open an incoming TCP port to access the cluster
try:
    print("1.5 Open an incoming TCP port to access the cluster")
    vpc = ec2.Vpc(id=myClusterProps['VpcId'])
    defaultSg = list(vpc.security_groups.all())[0]
    print(defaultSg)
    defaultSg.authorize_ingress(
        CidrIp='0.0.0.0/0',
        IpProtocol='TCP',
        GroupName=defaultSg.group_name,
        FromPort=int(DWH_PORT),
        ToPort=int(DWH_PORT)
    )
except Exception as e:
    print(e)


1.5 Open an incoming TCP port to access the cluster
ec2.SecurityGroup(id='sg-0926c046fe470934b')


In [25]:
#make sure you can connect to the cluster
try:
    print("1.6 Make sure you can connect to the cluster")
    conn = psycopg2.connect("host={} dbname={} user={} password={} port={}".format(DWH_ENDPOINT, DWH_DB, DWH_DB_USER, DWH_DB_PASSWORD, DWH_PORT))
    cur = conn.cursor()
    cur.execute("SELECT 1")
    conn.close()
except Exception as e:
    print(e)


1.6 Make sure you can connect to the cluster


In [None]:
#### CAREFUL!!
#-- Uncomment & run to delete the created resources
redshift.delete_cluster( ClusterIdentifier=DWH_CLUSTER_IDENTIFIER,  SkipFinalClusterSnapshot=True)
#### CAREFUL!!