# Creating a Redshift Cluster using AWS python SDK 

In [43]:
import pandas as pd
import boto3
import json

# Step 0: Make sure you have AWS access and secret key 

1. Create a new IAM user in your AWS account
2. Give it AdministratorAccess, From Attach existing policies directly Tab
3. Take note of the access key and secret
4. Edit the file dwh.cfg in the same folder as this notebook and fill

## Load the DWH params for a file 

In [44]:
import configparser

config = configparser.ConfigParser()
config.read_file(open("dwh.cfg"))


KEY =                           config.get('AWS', 'KEY')
SECRET=                          config.get('AWS', 'SECRET')

DWH_CLUSTER_TYPE =              config.get('DWH', 'DWH_CLUSTER_TYPE')
DWH_NUM_NODES =                 config.get('DWH', 'DWH_NUM_NODES')
DWH_NODE_TYPE =                 config.get('DWH', 'DWH_NODE_TYPE')

DWH_CLUSTER_IDENTIFIER =        config.get('DWH', 'DWH_CLUSTER_IDENTIFIER')
DWH_DB =                        config.get('DWH', 'DWH_DB')
DWH_DB_USER =                   config.get('DWH', 'DWH_DB_USER')
DWH_DB_PASSWORD =               config.get('DWH', 'DWH_DB_PASSWORD')
DWH_PORT =                      config.get('DWH', 'DWH_PORT')

DWH_IAM_ROLE_NAME =             config.get('DWH', 'DWH_IAM_ROLE_NAME')


(DWH_DB_USER, DWH_DB_PASSWORD, DWH_DB)

pd.DataFrame({"Param":
                  ["KEY","DWH_CLUSTER_TYPE", "DWH_NUM_NODES", "DWH_NODE_TYPE", "DWH_CLUSTER_IDENTIFIER", "DWH_DB", "DWH_DB_USER", "DWH_DB_PASSWORD", "DWH_PORT", "DWH_IAM_ROLE_NAME"],
              "Value":
                  [KEY, DWH_CLUSTER_TYPE, DWH_NUM_NODES, DWH_NODE_TYPE, DWH_CLUSTER_IDENTIFIER, DWH_DB, DWH_DB_USER, DWH_DB_PASSWORD, DWH_PORT, DWH_IAM_ROLE_NAME]
             })

Unnamed: 0,Param,Value
0,KEY,AKIA5OERWIN6SNNHWLFA
1,DWH_CLUSTER_TYPE,multi-node
2,DWH_NUM_NODES,4
3,DWH_NODE_TYPE,dc2.large
4,DWH_CLUSTER_IDENTIFIER,dwhCluster
5,DWH_DB,dwh
6,DWH_DB_USER,dwhadmin
7,DWH_DB_PASSWORD,Passw0rd
8,DWH_PORT,5439
9,DWH_IAM_ROLE_NAME,dwhadmin


## Create clients for EC2, S3, IAM and Redshift

In [45]:
import boto3 

ec2 = boto3.resource('ec2',
                     region_name ='us-west-2',
                     aws_access_key_id = KEY,
                     aws_secret_access_key= SECRET
                     )


s3 = boto3.resource('s3',
                    region_name= 'us-west-2',
                    aws_access_key_id = KEY,
                    aws_secret_access_key = SECRET
                    )

iam = boto3.client('iam',
                    region_name= 'us-west-2',
                    aws_access_key_id = KEY,
                    aws_secret_access_key = SECRET
                    )

redshift = boto3.client('redshift',
                    region_name= 'us-west-2',
                    aws_access_key_id = KEY,
                    aws_secret_access_key = SECRET
                    )


## Check out sample data sources on s3 

In [46]:
sampleDbBucket = s3.Bucket('awssampledbuswest2')
for obj in sampleDbBucket.objects.filter(Prefix="ssbgz"):
    print (obj)

s3.ObjectSummary(bucket_name='awssampledbuswest2', key='ssbgz/')
s3.ObjectSummary(bucket_name='awssampledbuswest2', key='ssbgz/customer0002_part_00.gz')
s3.ObjectSummary(bucket_name='awssampledbuswest2', key='ssbgz/dwdate.tbl.gz')
s3.ObjectSummary(bucket_name='awssampledbuswest2', key='ssbgz/lineorder0000_part_00.gz')
s3.ObjectSummary(bucket_name='awssampledbuswest2', key='ssbgz/lineorder0001_part_00.gz')
s3.ObjectSummary(bucket_name='awssampledbuswest2', key='ssbgz/lineorder0002_part_00.gz')
s3.ObjectSummary(bucket_name='awssampledbuswest2', key='ssbgz/lineorder0003_part_00.gz')
s3.ObjectSummary(bucket_name='awssampledbuswest2', key='ssbgz/lineorder0004_part_00.gz')
s3.ObjectSummary(bucket_name='awssampledbuswest2', key='ssbgz/lineorder0005_part_00.gz')
s3.ObjectSummary(bucket_name='awssampledbuswest2', key='ssbgz/lineorder0006_part_00.gz')
s3.ObjectSummary(bucket_name='awssampledbuswest2', key='ssbgz/lineorder0007_part_00.gz')
s3.ObjectSummary(bucket_name='awssampledbuswest2', key='s

### Step 1: Setup IAM Role

In [47]:
# Create the IAM role
try:
    print('1.1 Creating a new IAM Role')
    dwhRole = iam.create_role(
        Path='/',
        RoleName=DWH_IAM_ROLE_NAME,
        Description='Allows Redshift clusters to call AWS services on your behalf.',
        AssumeRolePolicyDocument=json.dumps({
              "Version": "2012-10-17",
              "Statement": [
                {
                  "Effect": "Allow",
                  "Principal": {
                    "Service": "redshift.amazonaws.com"
                  },
                  "Action": "sts:AssumeRole"
                }
              ]
            }),
    )
    
except Exception as e:
    print(e)

1.1 Creating a new IAM Role
An error occurred (EntityAlreadyExists) when calling the CreateRole operation: Role with name dwhadmin already exists.


In [48]:
# Attach Policy
print('1.2 Attaching Policy')
iam.attach_role_policy(RoleName=DWH_IAM_ROLE_NAME,
                       PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"
                      )['ResponseMetadata']['HTTPStatusCode']

1.2 Attaching Policy


200

In [49]:
#Get and print the IAM role ARN
print('1.3 Get the IAM role ARN')
roleArn = iam.get_role(RoleName=DWH_IAM_ROLE_NAME)['Role']['Arn']

print(roleArn)

1.3 Get the IAM role ARN
arn:aws:iam::923723514749:role/dwhadmin


### Step 2: Redshift cluster 


 - Create a RedShift Cluster
 - For complete arguments to create_cluster, see docs

In [50]:
try:
    response = redshift.create_cluster(        
        #HW
        ClusterType=DWH_CLUSTER_TYPE,
        NodeType=DWH_NODE_TYPE,
        NumberOfNodes=int(DWH_NUM_NODES),

        #Identifiers & Credentials
        DBName=DWH_DB,
        ClusterIdentifier=DWH_CLUSTER_IDENTIFIER,
        MasterUsername=DWH_DB_USER,
        MasterUserPassword=DWH_DB_PASSWORD,
        
        #Roles (for s3 access)
        IamRoles=[roleArn]  
    )
except Exception as e:
    print(e)

### 2.1 Describe the cluster to see its status

 Run this block several times until the cluster status becomes Available

In [62]:
def prettyRedshiftProps(props):
    pd.set_option('display.max_colwidth', None)
    keysToShow = ["ClusterIdentifier", "NodeType", "ClusterStatus", "MasterUsername", "DBName", "Endpoint", "NumberOfNodes", 'VpcId']
    x = [(k, v) for k,v in props.items() if k in keysToShow]
    return pd.DataFrame(data=x, columns=["Key", "Value"])

myClusterProps = redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)['Clusters'][0]
prettyRedshiftProps(myClusterProps)

Unnamed: 0,Key,Value
0,ClusterIdentifier,dwhcluster
1,NodeType,dc2.large
2,ClusterStatus,available
3,MasterUsername,dwhadmin
4,DBName,dwh
5,Endpoint,"{'Address': 'dwhcluster.czhpm2w80wz5.us-west-2.redshift.amazonaws.com', 'Port': 5439}"
6,VpcId,vpc-5081c428
7,NumberOfNodes,4


### 2.2 Take note of the cluster

In [63]:
DWH_ENDPOINT = myClusterProps['Endpoint']['Address']
DWH_ROLE_ARN = myClusterProps['IamRoles'][0]['IamRoleArn']
print("DWH_ENDPOINT :: ", DWH_ENDPOINT)
print("DWH_ROLE_ARN :: ", DWH_ROLE_ARN)

DWH_ENDPOINT ::  dwhcluster.czhpm2w80wz5.us-west-2.redshift.amazonaws.com
DWH_ROLE_ARN ::  arn:aws:iam::923723514749:role/dwhadmin


## Step 3: Open an incoming TCP port to access the cluster endpoint

In [64]:
try:
    vpc = ec2.Vpc(id=myClusterProps['VpcId'])
    defaultSg = list(vpc.security_groups.all())[0]
    print(defaultSg)
    defaultSg.authorize_ingress(
        GroupName=defaultSg.group_name,
        CidrIp='0.0.0.0/0',
        IpProtocol='TCP',
        FromPort=int(DWH_PORT),
        ToPort=int(DWH_PORT)
    )
except Exception as e:
    print(e)

ec2.SecurityGroup(id='sg-b4af509c')
An error occurred (InvalidPermission.Duplicate) when calling the AuthorizeSecurityGroupIngress operation: the specified rule "peer: 0.0.0.0/0, TCP, from port: 5439, to port: 5439, ALLOW" already exists



## Step 4: Make sure you can connect to the clusterConnect to the cluster

In [65]:
%load_ext sql

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [67]:
conn_string="postgresql://{}:{}@{}:{}/{}".format(DWH_DB_USER, DWH_DB_PASSWORD, DWH_ENDPOINT, DWH_PORT,DWH_DB)
print(conn_string)
%sql $conn_string

postgresql://dwhadmin:Passw0rd@dwhcluster.czhpm2w80wz5.us-west-2.redshift.amazonaws.com:5439/dwh


## STEP 5: Clean up your resources

In [68]:
#### CAREFUL!!
#-- Uncomment & run to delete the created resources
redshift.delete_cluster( ClusterIdentifier=DWH_CLUSTER_IDENTIFIER,  SkipFinalClusterSnapshot=True)
#### CAREFUL!!

{'Cluster': {'ClusterIdentifier': 'dwhcluster',
  'NodeType': 'dc2.large',
  'ClusterStatus': 'deleting',
  'ClusterAvailabilityStatus': 'Modifying',
  'MasterUsername': 'dwhadmin',
  'DBName': 'dwh',
  'Endpoint': {'Address': 'dwhcluster.czhpm2w80wz5.us-west-2.redshift.amazonaws.com',
   'Port': 5439},
  'ClusterCreateTime': datetime.datetime(2020, 8, 24, 13, 23, 51, 15000, tzinfo=tzutc()),
  'AutomatedSnapshotRetentionPeriod': 1,
  'ManualSnapshotRetentionPeriod': -1,
  'ClusterSecurityGroups': [],
  'VpcSecurityGroups': [{'VpcSecurityGroupId': 'sg-b4af509c',
    'Status': 'active'}],
  'ClusterParameterGroups': [{'ParameterGroupName': 'default.redshift-1.0',
    'ParameterApplyStatus': 'in-sync'}],
  'ClusterSubnetGroupName': 'default',
  'VpcId': 'vpc-5081c428',
  'AvailabilityZone': 'us-west-2c',
  'PreferredMaintenanceWindow': 'mon:07:00-mon:07:30',
  'PendingModifiedValues': {},
  'ClusterVersion': '1.0',
  'AllowVersionUpgrade': True,
  'NumberOfNodes': 4,
  'PubliclyAccessible

In [69]:
myClusterProps = redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)['Clusters'][0]
prettyRedshiftProps(myClusterProps)

Unnamed: 0,Key,Value
0,ClusterIdentifier,dwhcluster
1,NodeType,dc2.large
2,ClusterStatus,deleting
3,MasterUsername,dwhadmin
4,DBName,dwh
5,Endpoint,"{'Address': 'dwhcluster.czhpm2w80wz5.us-west-2.redshift.amazonaws.com', 'Port': 5439}"
6,VpcId,vpc-5081c428
7,NumberOfNodes,4
