# Create Sparkify Redshift Cluster

In this notebook, I create the IAM Role and Redshift Cluster for this project. The values for HOST and ARN can then be copied to 'dwh.cfg' for use in the rest of the project.

In [1]:
import pandas as pd
import boto3
import json
import psycopg2

## Read information from 'dwh.cfg'

In [2]:
import configparser
config = configparser.ConfigParser()
config.read_file(open('dwh.cfg'))

KEY                    = config.get('AWS','KEY')
SECRET                 = config.get('AWS','SECRET')

CLUSTER_IDENTIFIER     = config.get("DWH","CLUSTER_IDENTIFIER")
CLUSTER_TYPE           = config.get("DWH","CLUSTER_TYPE")
NUM_NODES              = config.get("DWH","NUM_NODES")
NODE_TYPE              = config.get("DWH","NODE_TYPE")

HOST                   = config.get("CLUSTER","HOST")
DB_NAME                = config.get("CLUSTER","DB_NAME")
DB_USER                = config.get("CLUSTER","DB_USER")
DB_PASSWORD            = config.get("CLUSTER","DB_PASSWORD")
DB_PORT                = config.get("CLUSTER","DB_PORT")

IAM_ROLE_NAME          = config.get("IAM_ROLE", "IAM_ROLE_NAME")

pd.DataFrame({"Param":
                  ["CLUSTER_TYPE", "NUM_NODES", "NODE_TYPE", "CLUSTER_IDENTIFIER", "DB_NAME", "DB_USER", "DB_PASSWORD", "DB_PORT", "IAM_ROLE_NAME"],
              "Value":
                  [CLUSTER_TYPE, NUM_NODES, NODE_TYPE, CLUSTER_IDENTIFIER, DB_NAME, DB_USER, DB_PASSWORD, DB_PORT, IAM_ROLE_NAME]
             })

Unnamed: 0,Param,Value
0,CLUSTER_TYPE,multi-node
1,NUM_NODES,4
2,NODE_TYPE,dc2.large
3,CLUSTER_IDENTIFIER,sparkifydwh
4,DB_NAME,spqrkify_dwh
5,DB_USER,sparkify_dwh_user
6,DB_PASSWORD,Passw0rd
7,DB_PORT,5439
8,IAM_ROLE_NAME,redshift_role


## Create IAM Role

In [3]:
iam = boto3.client('iam', region_name='us-west-2', aws_access_key_id = KEY, aws_secret_access_key = SECRET)

In [4]:
try:
    print('1.1 Creating a new IAM Role')
    dwhRole = iam.create_role(
        Path='/',
        RoleName=IAM_ROLE_NAME,
        Description = "Allows Redshift clusters to call AWS services on your behalf.",
        AssumeRolePolicyDocument=json.dumps(
            {'Statement': [{'Action': 'sts:AssumeRole',
               'Effect': 'Allow',
               'Principal': {'Service': 'redshift.amazonaws.com'}}],
             'Version': '2012-10-17'})
    )
except Exception as e:
    print(e)

1.1 Creating a new IAM Role


Attach the 'AmazonS3ReadOnlyAccess' policy

In [5]:
print('1.2 Attaching Policy')
iam.attach_role_policy(RoleName=IAM_ROLE_NAME,
                       PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"
                      )['ResponseMetadata']['HTTPStatusCode']

1.2 Attaching Policy


200

In [6]:
print('1.3 Get the IAM role ARN')
roleArn = iam.get_role(RoleName=IAM_ROLE_NAME)['Role']['Arn']

print(roleArn)

1.3 Get the IAM role ARN
arn:aws:iam::650743096901:role/redshift_role


## Make Redshift Client

In [7]:
redshift = boto3.client('redshift', region_name='us-west-2', aws_access_key_id = KEY, aws_secret_access_key = SECRET)

## Create redshift Cluster

In [10]:
try:
    response = redshift.create_cluster(        
        # hardware
        ClusterType=CLUSTER_TYPE,
        NodeType=NODE_TYPE,
        NumberOfNodes=int(NUM_NODES),

        # identifiers & credentials
        DBName=DB_NAME,
        ClusterIdentifier=CLUSTER_IDENTIFIER,
        MasterUsername=DB_USER,
        MasterUserPassword=DB_PASSWORD,
        
        # role (to allow s3 access)
        IamRoles=[roleArn] 
    )
except Exception as e:
    print(e)

An error occurred (ClusterAlreadyExists) when calling the CreateCluster operation: Cluster already exists


In [12]:
def prettyRedshiftProps(props):
    pd.set_option('display.max_colwidth', -1)
    keysToShow = ["ClusterIdentifier", "NodeType", "ClusterStatus", "MasterUsername", "DBName", "Endpoint", "NumberOfNodes", 'VpcId']
    x = [(k, v) for k,v in props.items() if k in keysToShow]
    return pd.DataFrame(data=x, columns=["Key", "Value"])

In [16]:
myClusterProps = redshift.describe_clusters(ClusterIdentifier=CLUSTER_IDENTIFIER)['Clusters'][0]
prettyRedshiftProps(myClusterProps)

Unnamed: 0,Key,Value
0,ClusterIdentifier,sparkifydwh
1,NodeType,dc2.large
2,ClusterStatus,available
3,MasterUsername,sparkify_dwh_user
4,DBName,spqrkify_dwh
5,Endpoint,"{'Address': 'sparkifydwh.cmn7ajvbmkem.us-west-2.redshift.amazonaws.com', 'Port': 5439}"
6,VpcId,vpc-fe371986
7,NumberOfNodes,4


Once the cluster is 'available', this gives us the HOST and ARN to copy into 'dwh.cfg'

In [14]:
HOST = myClusterProps['Endpoint']['Address']
ARN = myClusterProps['IamRoles'][0]['IamRoleArn']
print("HOST=", HOST)
print("ARN=", ARN)

HOST= sparkifydwh.cmn7ajvbmkem.us-west-2.redshift.amazonaws.com
ARN= arn:aws:iam::650743096901:role/redshift_role


## Cleanup

Delete the Cluster, detach policy and delete the role

In [17]:
#### CAREFUL!! Delete the Cluster
redshift.delete_cluster( ClusterIdentifier=CLUSTER_IDENTIFIER,  SkipFinalClusterSnapshot=True)
#### CAREFUL!!

{'Cluster': {'ClusterIdentifier': 'sparkifydwh',
  'NodeType': 'dc2.large',
  'ClusterStatus': 'deleting',
  'MasterUsername': 'sparkify_dwh_user',
  'DBName': 'spqrkify_dwh',
  'Endpoint': {'Address': 'sparkifydwh.cmn7ajvbmkem.us-west-2.redshift.amazonaws.com',
   'Port': 5439},
  'ClusterCreateTime': datetime.datetime(2021, 4, 22, 21, 1, 22, 54000, tzinfo=tzlocal()),
  'AutomatedSnapshotRetentionPeriod': 1,
  'ClusterSecurityGroups': [],
  'VpcSecurityGroups': [{'VpcSecurityGroupId': 'sg-82a5cdbe',
    'Status': 'active'}],
  'ClusterParameterGroups': [{'ParameterGroupName': 'default.redshift-1.0',
    'ParameterApplyStatus': 'in-sync'}],
  'ClusterSubnetGroupName': 'default',
  'VpcId': 'vpc-fe371986',
  'AvailabilityZone': 'us-west-2b',
  'PreferredMaintenanceWindow': 'thu:11:30-thu:12:00',
  'PendingModifiedValues': {},
  'ClusterVersion': '1.0',
  'AllowVersionUpgrade': True,
  'NumberOfNodes': 4,
  'PubliclyAccessible': True,
  'Encrypted': False,
  'Tags': [],
  'EnhancedVpcRou

In [21]:
myClusterProps = redshift.describe_clusters(ClusterIdentifier=CLUSTER_IDENTIFIER)['Clusters'][0]
prettyRedshiftProps(myClusterProps)

ClusterNotFoundFault: An error occurred (ClusterNotFound) when calling the DescribeClusters operation: Cluster sparkifydwh not found.

In [22]:
#### CAREFUL!! Delete the Role
iam.detach_role_policy(RoleName=IAM_ROLE_NAME, PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess")
iam.delete_role(RoleName=IAM_ROLE_NAME)
#### CAREFUL!!

{'ResponseMetadata': {'RequestId': 'f653349d-cba9-477b-9530-081b5ea3dabf',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'f653349d-cba9-477b-9530-081b5ea3dabf',
   'content-type': 'text/xml',
   'content-length': '200',
   'date': 'Thu, 22 Apr 2021 21:25:58 GMT'},
  'RetryAttempts': 0}}