## Redshift Setup with Python SDK (boto3)
This notebook will show how to set up some AWS resources using the Python SDK for AWS, boto3.

Boto3 Documentation: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/redshift.html

---

#### Package Import

---

In [142]:
import boto3
import configparser

---

#### Loading Config files

---

In [143]:
#AWS Credentials
aws_path = "/home/rambino/.aws/credentials"
aws_cred = configparser.ConfigParser()
aws_cred.read(aws_path)

#Redshift Credentials
redshift_path = "/home/rambino/dev/DataEngineering_Udacity/04_AWS_DataWarehousing/redshift_credentials.cfg"
redshift_cred = configparser.ConfigParser()
redshift_cred.read(redshift_path)

#ETL Config
cfg_path = "/home/rambino/dev/DataEngineering_Udacity/Projects/DataWarehouseWithRedshift/dwh.cfg"
cfg = configparser.ConfigParser()
cfg.read(cfg_path)


['/home/rambino/dev/DataEngineering_Udacity/Projects/DataWarehouseWithRedshift/dwh.cfg']

---

#### Creating IAM role for Redshift

---

In [None]:
iam = boto3.client('iam',
    region_name             = "us-west-2",
    aws_access_key_id       = aws_cred['udacity_course']['aws_access_key_id'],
    aws_secret_access_key   = aws_cred['udacity_course']['aws_secret_access_key']
)

In [None]:
#Create IAM role:

#This policy is something about allowing Redshift to impersonate a user, but I don't really understand it.
#Look more into what "sts:AssumeRole" really means.

import json

dwhRole = iam.create_role(
    Path = "/",
    RoleName =  "RedShift_Impersonation",
    Description = "Allows redshift to access S3",
    AssumeRolePolicyDocument=json.dumps(
        {
            "Version": "2012-10-17",
            "Statement": [
                {
                    "Effect": "Allow",
                    "Action": 'sts:AssumeRole',
                    "Principal":{"Service": "redshift.amazonaws.com"}
                }
            ]
        }
    )
)

dwhRole

In [None]:
role = iam.get_role(RoleName = "Redshift_Impersonation")
role_arn = role['Role']['Arn']
role_arn

#Loading IAM ARN into config file
cfg['IAM_ROLE']['ARN'] = role_arn

In [None]:
#Attaching IAM policy to the role (which actually gives permissions):

attach_response = iam.attach_role_policy(
    RoleName = "RedShift_Impersonation",
    PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"
)

attach_response

---

#### Apply VPC Security Group rules to Redshift

---

In [None]:
#Defining PORT for Redshift + VPC security group
redshift_port = 5439

In [None]:
ec2 = boto3.client('ec2',
    region_name             = "us-west-2",
    aws_access_key_id       = aws_cred['udacity_course']['aws_access_key_id'],
    aws_secret_access_key   = aws_cred['udacity_course']['aws_secret_access_key']
)

In [None]:
sec_groups = ec2.describe_security_groups(
    GroupNames = [
        'Redshift_secGroup'
    ]
)

sec_groups
redshift_sg_id = sec_groups['SecurityGroups'][0]['GroupId']

In [None]:
response = ec2.create_security_group(
    Description = "Security Group for allowing all access to Redshift cluster",
    GroupName = "Redshift_secGroup"
)
response

In [None]:
vpc = ec2.authorize_security_group_ingress(
    CidrIp = '0.0.0.0/0', #Allowing permission to access from any IP
    FromPort = redshift_port, #Default port for Redshift
    ToPort = redshift_port,
    IpProtocol = 'TCP',
    GroupId = redshift_sg_id
)

---

#### Creating Redshift cluster

---

In [None]:
redshift = boto3.client('redshift',
    region_name             = "us-west-2",
    aws_access_key_id       = aws_cred['udacity_course']['aws_access_key_id'],
    aws_secret_access_key   = aws_cred['udacity_course']['aws_secret_access_key']
)

In [None]:
#Documentation: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/redshift.html#Redshift.Client.create_cluster
redshift_response = redshift.create_cluster(
    ClusterType = "multi-node",
    NodeType = 'dc2.large',
    NumberOfNodes = 4,
    DBName = "my_redshift_db",
    ClusterIdentifier = 'redshift-cluster-2',
    MasterUsername = redshift_cred['redshift_credentials']['un'],
    MasterUserPassword = redshift_cred['redshift_credentials']['pw'],
    IamRoles = [role_arn],
    PubliclyAccessible = True,
    VpcSecurityGroupIds = [
        redshift_sg_id
    ],
    Port = redshift_port
)

'''
WARNING! After running this code, you WILL create a Redshift cluster. Be sure to delete it to not incur costs!!
'''

redshift_response

In [144]:
from time import sleep

#Cluster takes time to create. This loop iterates until redshift is finished and returns details:
for i in range(20):
    clusters = redshift.describe_clusters()
    if(clusters['Clusters'] == []):
        print("cluster still forming...")
        sleep(10)
        continue
    else:
        cfg['CLUSTER']['DB_HOST'] = clusters['Clusters'][0]['Endpoint']['Address']
        cfg['CLUSTER']['DB_PORT'] = str(clusters['Clusters'][0]['Endpoint']['Port'])
        cfg['CLUSTER']['DB_NAME'] = clusters['Clusters'][0]['DBName']
        cluster_id = clusters['Clusters'][0]['ClusterIdentifier']

        cfg['CLUSTER']['DB_USER'] = redshift_cred['redshift_credentials']['UN']
        cfg['CLUSTER']['DB_PASSWORD'] = redshift_cred['redshift_credentials']['PW']
        print("---Variables Loaded Successfully---")
        print(clusters)
        break

    

    #if(clusters['Clusters'] == []):
    #   print("No clusters")

---Variables Loaded Successfully---
{'Clusters': [{'ClusterIdentifier': 'redshift-cluster-2', 'NodeType': 'dc2.large', 'ClusterStatus': 'available', 'ClusterAvailabilityStatus': 'Available', 'MasterUsername': 'dev', 'DBName': 'my_redshift_db', 'Endpoint': {'Address': 'redshift-cluster-2.cakcgemszurv.us-west-2.redshift.amazonaws.com', 'Port': 5439}, 'ClusterCreateTime': datetime.datetime(2022, 8, 23, 10, 47, 29, 422000, tzinfo=tzutc()), 'AutomatedSnapshotRetentionPeriod': 1, 'ManualSnapshotRetentionPeriod': -1, 'ClusterSecurityGroups': [], 'VpcSecurityGroups': [{'VpcSecurityGroupId': 'sg-04b0e51a76a67b667', 'Status': 'active'}], 'ClusterParameterGroups': [{'ParameterGroupName': 'default.redshift-1.0', 'ParameterApplyStatus': 'in-sync'}], 'ClusterSubnetGroupName': 'default', 'VpcId': 'vpc-0055627b0d43048a7', 'AvailabilityZone': 'us-west-2d', 'PreferredMaintenanceWindow': 'sun:06:00-sun:06:30', 'PendingModifiedValues': {}, 'ClusterVersion': '1.0', 'AllowVersionUpgrade': True, 'NumberOfNod

In [None]:
#Saving Config file:
with open(cfg_path,"w") as file:
    cfg.write(file)

In [145]:
#DELETE CLUSTER
response = redshift.delete_cluster(
    ClusterIdentifier = cluster_id,
    SkipFinalClusterSnapshot=True
)

---

#### Attempt to connect to Redshift cluster:

---

Run some queries to get songplay out of Redshift

In [131]:
%load_ext sql

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [None]:
conn_string = f'''
    postgresql://{cfg['CLUSTER']['DB_USER']}:{cfg['CLUSTER']['DB_PASSWORD']}@{cfg['CLUSTER']['DB_HOST']}:{cfg['CLUSTER']['DB_PORT']}/{cfg['CLUSTER']['DB_NAME']}'''

%sql $conn_string

In [132]:
%sql SELECT current_database();

 * postgresql://dev:***@redshift-cluster-2.cakcgemszurv.us-west-2.redshift.amazonaws.com:5439/my_redshift_db
1 rows affected.


current_database
my_redshift_db


In [None]:
#Query load errors:
%sql select * from stl_load_errors ORDER BY starttime desc limit 3

Where were users located during their Sparkify sessions on November 30, 2018?

In [137]:
%%sql 

SELECT COUNT(*) AS freq, location
FROM songplays
JOIN time ON songplays.start_time = time.start_time
WHERE time.year = 2018 
AND time.month = 11  
AND time.day = 30 
GROUP BY songplays.location 
ORDER BY freq DESC

 * postgresql://dev:***@redshift-cluster-2.cakcgemszurv.us-west-2.redshift.amazonaws.com:5439/my_redshift_db
6 rows affected.


freq,location
9,"San Francisco-Oakland-Hayward, CA"
2,"Red Bluff, CA"
2,"Janesville-Beloit, WI"
1,"Houston-The Woodlands-Sugar Land, TX"
1,"Eugene, OR"
1,"Birmingham-Hoover, AL"


What were the most popular songs (i.e., most played) in Q4, 2018?

In [141]:

%%sql

SELECT COUNT(*) freq, songplays.song_id, songs.title 
FROM songplays JOIN time ON songplays.start_time = time.start_time 
LEFT JOIN songs on songplays.song_id = songs.song_id 
WHERE time.year = 2018
AND time.month BETWEEN 10 AND 12
GROUP BY songplays.song_id, songs.title 
ORDER BY freq DESC

 * postgresql://dev:***@redshift-cluster-2.cakcgemszurv.us-west-2.redshift.amazonaws.com:5439/my_redshift_db
217 rows affected.


freq,song_id,title
37,SOBONKR12A58A7A7E0,You're The One
9,SOHTKMO12AB01843B0,Catch You Baby (Steve Pitron & Max Sanna Radio Edit)
9,SOUNZHU12A8AE47481,I CAN'T GET STARTED
8,SOULTKQ12AB018A183,Nothin' On You [feat. Bruno Mars] (Album Version)
6,SOLZOBD12AB0185720,Hey Daddy (Daddy's Home)
5,SOARUPP12AB01842E0,Up Up & Away
5,SOTNHIP12AB0183131,Make Her Say
4,SOIZLKI12A6D4F7B61,Supermassive Black Hole (Album Version)
4,SONQEYS12AF72AABC9,Mr. Jones
4,SOIOESO12A6D4F621D,Unwell (Album Version)
