# Step 1: Preparation of workspace
In this section we import the libs and define a function. Also, we get the configuration from dwh.cfg and check the files in the se paths.

## Definitions and imports

In [1]:
from time import time, sleep
import configparser
import matplotlib.pyplot as plt
import pandas as pd
import json

In [2]:
def prettyRedshiftProps(props):
    """
    Create a Pandas DataFrame to display selected Amazon Redshift cluster properties in a more readable format.

    Args:
        props (dict): A dictionary containing Amazon Redshift cluster properties.

    Returns:
        pandas.DataFrame: A DataFrame containing selected cluster properties with two columns:
            - "Key": The property name.
            - "Value": The corresponding property value.
    """
    pd.set_option('display.max_colwidth', -1)
    keysToShow = ["ClusterIdentifier", "NodeType", "ClusterStatus", "MasterUsername", "DBName", "Endpoint", "NumberOfNodes", 'VpcId']
    x = [(k, v) for k,v in props.items() if k in keysToShow]
    return pd.DataFrame(data=x, columns=["Key", "Value"])

## Get config from dwh.cfg
Retrieve the configs found in dwh.cfg.

In [3]:
config = configparser.ConfigParser()
config.read_file(open('configs/dwh.cfg'))

# Reading AWS access keys
KEY=config.get('AWS','key')
SECRET= config.get('AWS','secret')
REGION= config.get('AWS', 'region')

# Reading info to create the Cluster
CLUSTER_TYPE = config.get('DWH_CREATION', 'CLUSTER_TYPE')
NUM_NODES = config.get('DWH_CREATION', 'NUM_NODES')
NODE_TYPE = config.get('DWH_CREATION', 'NODE_TYPE')
IAM_ROLE_NAME = config.get('DWH_CREATION', 'IAM_ROLE_NAME')
CLUSTER_IDENTIFIER = config.get('DWH_CREATION', 'CLUSTER_IDENTIFIER')
DB_NAME = config.get('DWH_CREATION', 'DB_NAME')
DB_USER = config.get('DWH_CREATION', 'DB_USER')
DB_PASSWORD = config.get('DWH_CREATION', 'DB_PASSWORD')
PORT = config.get('DWH_CREATION', 'PORT')

# Reading info about the location of files
LOG_DATA=config.get('S3', 'LOG_DATA')
LOG_JSONPATH=config.get('S3', 'LOG_JSONPATH')
SONG_DATA=config.get('S3', 'SONG_DATA')



pd.DataFrame({"Param":
                  ["REGION", "CLUSTER_TYPE", "NUM_NODES", "NODE_TYPE", "CLUSTER_IDENTIFIER", "DB_NAME", "DB_USER", "DB_PASSWORD", "PORT", "IAM_ROLE_NAME", "LOG_DATA", "LOG_JSONPATH", "SONG_DATA"],
              "Value":
                  [REGION, CLUSTER_TYPE, NUM_NODES, NODE_TYPE, CLUSTER_IDENTIFIER, DB_NAME, DB_USER, DB_PASSWORD, PORT, IAM_ROLE_NAME, LOG_DATA, LOG_JSONPATH, SONG_DATA]
             })

Unnamed: 0,Param,Value
0,REGION,us-east-1
1,CLUSTER_TYPE,multi-node
2,NUM_NODES,2
3,NODE_TYPE,dc2.large
4,CLUSTER_IDENTIFIER,dwhCluster
5,DB_NAME,dwh
6,DB_USER,dwhuser
7,DB_PASSWORD,Passw0rd
8,PORT,5439
9,IAM_ROLE_NAME,dwhRole


## Create clients for IAM, EC2, S3 and Redshift

In [4]:
import boto3

ec2 = boto3.resource('ec2',
                     region_name=REGION,
                     aws_access_key_id=KEY,
                     aws_secret_access_key=SECRET
                    )

s3 = boto3.resource('s3',
                    region_name=REGION,
                    aws_access_key_id=KEY,
                    aws_secret_access_key=SECRET
                   )

iam = boto3.client('iam',
                   aws_access_key_id=KEY,
                   aws_secret_access_key=SECRET,
                   region_name=REGION
                  )

redshift = boto3.client('redshift',
                        region_name=REGION,
                        aws_access_key_id=KEY,
                        aws_secret_access_key=SECRET
                       )

# Check out the sample data sources on S3 

In [5]:
# Download the sample files from the routes

sampleDbBucket =  s3.Bucket("udacity-dend")
sampleDbBucket.objects.filter(Prefix="log_json_path")

s3.Bucket("udacity-dend").download_file('log_json_path.json', 
                                        'sample_files/sample_log_json_path.json')
s3.Bucket("udacity-dend").download_file('log_data/2018/11/2018-11-01-events.json', 
                                        'sample_files/sample_log.json')
s3.Bucket("udacity-dend").download_file('song_data/A/H/T/TRAHTVI128F935A9A2.json', 
                                        'sample_files/sample_song.json')

In [6]:
# Check the paths in S3
sampleDbBucket =  s3.Bucket("udacity-dend")
print('\n ___________Log Files___________')
for obj in sampleDbBucket.objects.filter(Prefix="log_data"):
    print(obj)

print('\n ___________Log Json___________')
for obj in sampleDbBucket.objects.filter(Prefix="log_json_path"):
    print(obj)
    
print('\n ___________Songs Data___________')
for obj in sampleDbBucket.objects.filter(Prefix="song_data"):
    print(obj)
    break


 ___________Log Files___________
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-01-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-02-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-03-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-04-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-05-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-06-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-07-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-08-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-09-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-1

# STEP 2: IAM ROLE

We create an IAM Role that makes Redshift able to access S3 bucket (Read Only).

In [7]:
from botocore.exceptions import ClientError

#2.1 Create the role, 
try:
    print("2.1 Creating a new IAM Role") 
    dwhRole = iam.create_role(
        Path='/',
        RoleName=IAM_ROLE_NAME,
        Description = "Allows Redshift clusters to call AWS services on your behalf.",
        AssumeRolePolicyDocument=json.dumps(
            {'Statement': [{'Action': 'sts:AssumeRole',
               'Effect': 'Allow',
               'Principal': {'Service': 'redshift.amazonaws.com'}}],
             'Version': '2012-10-17'})
    )    
except Exception as e:
    print(e)
    
    
print("2.2 Attaching Policy")

iam.attach_role_policy(RoleName=IAM_ROLE_NAME,
                       PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"
                      )['ResponseMetadata']['HTTPStatusCode']

print("2.3 Get the IAM role ARN")
roleArn = iam.get_role(RoleName=IAM_ROLE_NAME)['Role']['Arn']

print(roleArn)

2.1 Creating a new IAM Role
2.2 Attaching Policy
2.3 Get the IAM role ARN
arn:aws:iam::276425435005:role/dwhRole


# STEP 3: Redshift Cluster
We create the Redshift Cluster we will be using.

In [8]:
# Bool checks if the cluster is available
created = False

try:
    response = redshift.create_cluster(        
        #HW
        ClusterType=CLUSTER_TYPE,
        NodeType=NODE_TYPE,
        NumberOfNodes=int(NUM_NODES),

        #Identifiers & Credentials
        DBName=DB_NAME,
        ClusterIdentifier=CLUSTER_IDENTIFIER,
        MasterUsername=DB_USER,
        MasterUserPassword=DB_PASSWORD,
        
        #Roles (for s3 access)
        IamRoles=[roleArn]  
    )
except Exception as e:
    print(e)
    
# Check if the cluster is available in a loop each 10 seconds
while not created:
    print('Checking conection to the cluster.')
    status = redshift.describe_clusters(ClusterIdentifier=CLUSTER_IDENTIFIER)['Clusters'][0]['ClusterStatus']
    
    created = status == 'available'
    sleep(30)
    
print('Conection successfull!')

Checking conection to the cluster.
Checking conection to the cluster.
Checking conection to the cluster.
Checking conection to the cluster.
Checking conection to the cluster.
Checking conection to the cluster.
Conection successfull!


##  3.1 Describe the cluster to see its status

In [9]:
myClusterProps = redshift.describe_clusters(ClusterIdentifier=CLUSTER_IDENTIFIER)['Clusters'][0]
prettyRedshiftProps(myClusterProps)

Unnamed: 0,Key,Value
0,ClusterIdentifier,dwhcluster
1,NodeType,dc2.large
2,ClusterStatus,available
3,MasterUsername,dwhuser
4,DBName,dwh
5,Endpoint,"{'Address': 'dwhcluster.chqyrpi4uuia.us-east-1.redshift.amazonaws.com', 'Port': 5439}"
6,VpcId,vpc-0c12260f3a31dfe30
7,NumberOfNodes,2


## 3.2 Take note of the cluster endpoint and role ARN

In [10]:
ENDPOINT = myClusterProps['Endpoint']['Address']
ROLE_ARN = myClusterProps['IamRoles'][0]['IamRoleArn']
print("ENDPOINT :: ", ENDPOINT)
print("ROLE_ARN :: ", ROLE_ARN)

ENDPOINT ::  dwhcluster.chqyrpi4uuia.us-east-1.redshift.amazonaws.com
ROLE_ARN ::  arn:aws:iam::276425435005:role/dwhRole


# STEP 4: Open an incoming TCP port to access the cluster endpoint

In [11]:
try:
    vpc = ec2.Vpc(id=myClusterProps['VpcId'])
    defaultSg = list(vpc.security_groups.all())[0]
    print(defaultSg)
    defaultSg.authorize_ingress(
        GroupName=defaultSg.group_name,
        CidrIp='0.0.0.0/0',
        IpProtocol='TCP',
        FromPort=int(PORT),
        ToPort=int(PORT)
    )
except Exception as e:
    print(e)

ec2.SecurityGroup(id='sg-02180f7438e0e3133')
An error occurred (InvalidPermission.Duplicate) when calling the AuthorizeSecurityGroupIngress operation: the specified rule "peer: 0.0.0.0/0, TCP, from port: 5439, to port: 5439, ALLOW" already exists


# STEP 5: Lets make sure wou can connect to the cluster

In [12]:
%load_ext sql

In [13]:
conn_string="postgresql://{}:{}@{}:{}/{}".format(DB_USER, DB_PASSWORD, ENDPOINT, PORT,DB_NAME)
print(conn_string)
%sql $conn_string

postgresql://dwhuser:Passw0rd@dwhcluster.chqyrpi4uuia.us-east-1.redshift.amazonaws.com:5439/dwh


'Connected: dwhuser@dwh'

# STEP 6: Save the info in the config file
We save the configuration in a new file called dwh-2.cfg. This file will only have info about the conection to the database.

In [14]:
config2 = configparser.ConfigParser()
config2.read_file(open('configs/dwh-2.cfg'))

In [15]:
config2['IAM_ROLE'] = {'ARN':ROLE_ARN}


config2['CLUSTER'] = {'HOST':ENDPOINT,
                    'DB_NAME':DB_NAME,
                    'DB_USER':DB_USER,
                    'DB_PASSWORD':DB_PASSWORD,
                    'DB_PORT':PORT,
                    }

In [16]:
with open('configs/dwh-2.cfg', 'w') as configfile:
    config2.write(configfile)

# STEP 7: Clean up resources
Just in case we have ended working with the cluster.

In [17]:
###### Execute just if you are sure that u ended working with the cluster ######
redshift.delete_cluster( ClusterIdentifier=CLUSTER_IDENTIFIER,  SkipFinalClusterSnapshot=True)

{'Cluster': {'ClusterIdentifier': 'dwhcluster',
  'NodeType': 'dc2.large',
  'ClusterStatus': 'deleting',
  'MasterUsername': 'dwhuser',
  'DBName': 'dwh',
  'Endpoint': {'Address': 'dwhcluster.chqyrpi4uuia.us-east-1.redshift.amazonaws.com',
   'Port': 5439},
  'ClusterCreateTime': datetime.datetime(2023, 10, 2, 20, 13, 38, 45000, tzinfo=tzlocal()),
  'AutomatedSnapshotRetentionPeriod': 1,
  'ClusterSecurityGroups': [],
  'VpcSecurityGroups': [{'VpcSecurityGroupId': 'sg-02180f7438e0e3133',
    'Status': 'active'}],
  'ClusterParameterGroups': [{'ParameterGroupName': 'default.redshift-1.0',
    'ParameterApplyStatus': 'in-sync'}],
  'ClusterSubnetGroupName': 'default',
  'VpcId': 'vpc-0c12260f3a31dfe30',
  'AvailabilityZone': 'us-east-1e',
  'PreferredMaintenanceWindow': 'wed:04:30-wed:05:00',
  'PendingModifiedValues': {},
  'ClusterVersion': '1.0',
  'AllowVersionUpgrade': True,
  'NumberOfNodes': 2,
  'PubliclyAccessible': True,
  'Encrypted': False,
  'Tags': [],
  'EnhancedVpcRouti

In [18]:
myClusterProps = redshift.describe_clusters(ClusterIdentifier=CLUSTER_IDENTIFIER)['Clusters'][0]
prettyRedshiftProps(myClusterProps)

Unnamed: 0,Key,Value
0,ClusterIdentifier,dwhcluster
1,NodeType,dc2.large
2,ClusterStatus,deleting
3,MasterUsername,dwhuser
4,DBName,dwh
5,Endpoint,"{'Address': 'dwhcluster.chqyrpi4uuia.us-east-1.redshift.amazonaws.com', 'Port': 5439}"
6,VpcId,vpc-0c12260f3a31dfe30
7,NumberOfNodes,2


In [19]:
# Detach the policy from the role and delete the role.

iam.detach_role_policy(RoleName=IAM_ROLE_NAME, PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess")
iam.delete_role(RoleName=IAM_ROLE_NAME)


{'ResponseMetadata': {'RequestId': '84efb7e0-8f9b-4bf6-8ea8-6398469b7442',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '84efb7e0-8f9b-4bf6-8ea8-6398469b7442',
   'content-type': 'text/xml',
   'content-length': '200',
   'date': 'Mon, 02 Oct 2023 20:15:51 GMT'},
  'RetryAttempts': 0}}