## Redshift Setup with Python SDK (boto3)
This notebook will show how to set up some AWS resources using the Python SDK for AWS, boto3.

Boto3 Documentation: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/redshift.html

---

#### Package Import

---

In [2]:
import configparser
import json
from time import sleep
from urllib.parse import quote_plus

import boto3

---

#### Loading Config files

---

In [3]:
# ConfigParser.read() returns the list of files it managed to parse and silently
# skips any file it cannot open. Check that result and stop here, instead of
# hitting a confusing KeyError in a later cell when a section turns out to be missing.
# NOTE(review): the absolute paths below are machine-specific; adjust them for your environment.

# AWS Credentials
aws_path = "/home/rambino/.aws/credentials"
aws_cred = configparser.ConfigParser()
if not aws_cred.read(aws_path):
    raise FileNotFoundError(f"Could not read AWS credentials file: {aws_path}")

# Redshift Credentials
redshift_path = "/home/rambino/dev/DataEngineering_Udacity/04_AWS_DataWarehousing/redshift_credentials.cfg"
redshift_cred = configparser.ConfigParser()
if not redshift_cred.read(redshift_path):
    raise FileNotFoundError(f"Could not read Redshift credentials file: {redshift_path}")

# ETL Config
cfg_path = "/home/rambino/dev/DataEngineering_Udacity/Projects/DataWarehouseWithRedshift/dwh.cfg"
cfg = configparser.ConfigParser()
cfg_files_read = cfg.read(cfg_path)
if not cfg_files_read:
    raise FileNotFoundError(f"Could not read ETL config file: {cfg_path}")

# Last expression of the cell: display the ETL config file(s) that were actually loaded.
cfg_files_read

['/home/rambino/dev/DataEngineering_Udacity/Projects/DataWarehouseWithRedshift/dwh.cfg']

---

#### Creating IAM role for Redshift

---

In [4]:
# IAM client for us-west-2, authenticated with the keys stored under the
# "default" profile of the AWS credentials file.
aws_default_profile = aws_cred["default"]
iam = boto3.client(
    "iam",
    aws_access_key_id=aws_default_profile["aws_access_key_id"],
    aws_secret_access_key=aws_default_profile["aws_secret_access_key"],
    region_name="us-west-2",
)

In [5]:
# Create IAM role:

# Trust policy: allows the Redshift service (redshift.amazonaws.com) to call
# sts:AssumeRole on this role, i.e. to obtain temporary credentials that carry the
# role's permissions. The role grants nothing by itself; the permissions come from
# the policy attached to it in the attach_role_policy cell.
assume_role_policy = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Action": "sts:AssumeRole",
            "Principal": {"Service": "redshift.amazonaws.com"},
        }
    ],
}

try:
    dwhRole = iam.create_role(
        Path="/",
        RoleName="RedShift_Impersonation",
        Description="Allows redshift to access S3",
        AssumeRolePolicyDocument=json.dumps(assume_role_policy),
    )
except iam.exceptions.EntityAlreadyExistsException:
    # The role survives kernel restarts, so a re-run of this cell would otherwise
    # fail. Reuse the existing role; its response has the same {"Role": {...}} shape.
    # NOTE: an existing role keeps whatever trust policy it was created with.
    dwhRole = iam.get_role(RoleName="RedShift_Impersonation")

dwhRole

{'Role': {'Path': '/',
  'RoleName': 'RedShift_Impersonation',
  'RoleId': 'AROA44VBDA5Q5NIXNESYX',
  'Arn': 'arn:aws:iam::886174844769:role/RedShift_Impersonation',
  'CreateDate': datetime.datetime(2023, 8, 26, 21, 55, 37, tzinfo=tzutc()),
  'AssumeRolePolicyDocument': {'Version': '2012-10-17',
   'Statement': [{'Effect': 'Allow',
     'Action': 'sts:AssumeRole',
     'Principal': {'Service': 'redshift.amazonaws.com'}}]}},
 'ResponseMetadata': {'RequestId': '27d35638-3a5b-4aeb-8c83-61978245ac88',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '27d35638-3a5b-4aeb-8c83-61978245ac88',
   'content-type': 'text/xml',
   'content-length': '800',
   'date': 'Sat, 26 Aug 2023 21:55:36 GMT'},
  'RetryAttempts': 0}}

In [6]:
# Look the role up by name. The name is spelled exactly as in the create_role and
# attach_role_policy cells ("RedShift_Impersonation") so all three cells agree.
role = iam.get_role(RoleName="RedShift_Impersonation")
role_arn = role["Role"]["Arn"]

# Loading IAM ARN into config file (in memory; it is written to disk by the
# "Saving Config file" cell). Create the section first if the file lacks it,
# so the assignment cannot fail with a KeyError.
if not cfg.has_section("IAM_ROLE"):
    cfg.add_section("IAM_ROLE")
cfg["IAM_ROLE"]["ARN"] = role_arn

In [7]:
# Attaching IAM policy to the role (which actually gives permissions):
# the policy referenced here is arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess.
policy_attachment = {
    "RoleName": "RedShift_Impersonation",
    "PolicyArn": "arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess",
}
attach_response = iam.attach_role_policy(**policy_attachment)

attach_response

{'ResponseMetadata': {'RequestId': 'fcf198c6-6b3a-4f21-b28f-a107861cad27',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'fcf198c6-6b3a-4f21-b28f-a107861cad27',
   'content-type': 'text/xml',
   'content-length': '212',
   'date': 'Sat, 26 Aug 2023 21:55:41 GMT'},
  'RetryAttempts': 0}}

---

#### Apply VPC Security Group rules to Redshift

---

In [8]:
# Defining PORT for Redshift + VPC security group.
# The same value is used for the security-group ingress rule (FromPort/ToPort)
# and for the Port argument of create_cluster, so the two always match.
redshift_port = 5439

In [9]:
# EC2 client for us-west-2 (used below for the security-group calls),
# authenticated with the "default" profile keys from the AWS credentials file.
ec2_client_kwargs = {
    "region_name": "us-west-2",
    "aws_access_key_id": aws_cred["default"]["aws_access_key_id"],
    "aws_secret_access_key": aws_cred["default"]["aws_secret_access_key"],
}
ec2 = boto3.client("ec2", **ec2_client_kwargs)

In [10]:
# Create the security group that the Redshift cluster will be placed in.
# The response (displayed below) carries the new group's GroupId.
security_group_spec = {
    "GroupName": "Redshift_secGroup",
    "Description": "Security Group for allowing all access to Redshift cluster",
}
response = ec2.create_security_group(**security_group_spec)
response

{'GroupId': 'sg-01aae4158e325fc92',
 'ResponseMetadata': {'RequestId': '8e8207f9-7aa9-4427-8943-302553ff4226',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '8e8207f9-7aa9-4427-8943-302553ff4226',
   'cache-control': 'no-cache, no-store',
   'strict-transport-security': 'max-age=31536000; includeSubDomains',
   'content-type': 'text/xml;charset=UTF-8',
   'content-length': '283',
   'date': 'Sat, 26 Aug 2023 21:55:50 GMT',
   'server': 'AmazonEC2'},
  'RetryAttempts': 0}}

In [11]:
# Look the security group up by name and keep the ID of the first match;
# the ID is what the ingress rule and create_cluster need.
sec_groups = ec2.describe_security_groups(GroupNames=["Redshift_secGroup"])
matching_group = sec_groups["SecurityGroups"][0]
redshift_sg_id = matching_group["GroupId"]

In [12]:
# Add an inbound rule for the Redshift port to the security group.
# NOTE: CidrIp 0.0.0.0/0 allows access from any IP; narrow it to your own
# address range for anything beyond a short-lived demo cluster.
ingress_rule = dict(
    GroupId=redshift_sg_id,
    IpProtocol="TCP",
    FromPort=redshift_port,  # Default port for Redshift
    ToPort=redshift_port,
    CidrIp="0.0.0.0/0",  # Allowing permission to access from any IP
)
vpc = ec2.authorize_security_group_ingress(**ingress_rule)

---

#### Creating Redshift cluster

---

In [13]:
# Redshift client for us-west-2, authenticated with the "default" profile keys
# from the AWS credentials file.
redshift_client_kwargs = dict(
    aws_access_key_id=aws_cred["default"]["aws_access_key_id"],
    aws_secret_access_key=aws_cred["default"]["aws_secret_access_key"],
    region_name="us-west-2",
)
redshift = boto3.client("redshift", **redshift_client_kwargs)

In [14]:
# WARNING! Running this cell WILL create a Redshift cluster (4 x dc2.large nodes).
# Be sure to delete it afterwards (see the DELETE CLUSTER cell) to not incur costs!!
#
# Documentation: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/redshift.html#Redshift.Client.create_cluster
redshift_response = redshift.create_cluster(
    # Cluster sizing
    ClusterType="multi-node",
    NodeType="dc2.large",
    NumberOfNodes=4,
    # Identity and credentials (master user comes from the Redshift credentials file)
    DBName="my_redshift_db",
    ClusterIdentifier="redshift-cluster-2",
    MasterUsername=redshift_cred["redshift_credentials"]["un"],
    MasterUserPassword=redshift_cred["redshift_credentials"]["pw"],
    # Role the cluster is associated with (S3 read access is attached to it above)
    IamRoles=[role_arn],
    # Networking: publicly reachable, guarded by the security group created above
    PubliclyAccessible=True,
    VpcSecurityGroupIds=[redshift_sg_id],
    Port=redshift_port,
)

redshift_response

{'Cluster': {'ClusterIdentifier': 'redshift-cluster-2',
  'NodeType': 'dc2.large',
  'ClusterStatus': 'creating',
  'ClusterAvailabilityStatus': 'Modifying',
  'MasterUsername': 'dev',
  'DBName': 'my_redshift_db',
  'AutomatedSnapshotRetentionPeriod': 1,
  'ManualSnapshotRetentionPeriod': -1,
  'ClusterSecurityGroups': [],
  'VpcSecurityGroups': [{'VpcSecurityGroupId': 'sg-01aae4158e325fc92',
    'Status': 'active'}],
  'ClusterParameterGroups': [{'ParameterGroupName': 'default.redshift-1.0',
    'ParameterApplyStatus': 'in-sync'}],
  'ClusterSubnetGroupName': 'default',
  'VpcId': 'vpc-0debf274df5b429d4',
  'PreferredMaintenanceWindow': 'thu:08:30-thu:09:00',
  'PendingModifiedValues': {'MasterUserPassword': '****'},
  'ClusterVersion': '1.0',
  'AllowVersionUpgrade': True,
  'NumberOfNodes': 4,
  'PubliclyAccessible': True,
  'Encrypted': False,
  'Tags': [],
  'EnhancedVpcRouting': False,
  'IamRoles': [{'IamRoleArn': 'arn:aws:iam::886174844769:role/RedShift_Impersonation',
    'Ap

In [18]:
# The cluster takes several minutes to create. Block until it is available, then
# copy its connection details into the ETL config.
#
# A boto3 waiter replaces the hand-rolled polling loop: it polls describe_clusters
# for this specific cluster and raises a WaiterError if the cluster does not become
# available in time, so this cell can no longer finish silently with the config
# (and cluster_id) left unset.
cluster_id = "redshift-cluster-2"  # must match ClusterIdentifier passed to create_cluster

redshift.get_waiter("cluster_available").wait(
    ClusterIdentifier=cluster_id,
    WaiterConfig={"Delay": 30, "MaxAttempts": 40},  # poll every 30 s, up to 20 minutes
)

# Filter by identifier so an unrelated cluster in the account is never picked up.
clusters = redshift.describe_clusters(ClusterIdentifier=cluster_id)
cluster = clusters["Clusters"][0]

if not cfg.has_section("CLUSTER"):
    cfg.add_section("CLUSTER")

cfg["CLUSTER"]["DB_HOST"] = cluster["Endpoint"]["Address"]
cfg["CLUSTER"]["DB_PORT"] = str(cluster["Endpoint"]["Port"])  # configparser values must be strings
cfg["CLUSTER"]["DB_NAME"] = cluster["DBName"]

# Same keys the create_cluster cell reads (configparser option names are case-insensitive).
cfg["CLUSTER"]["DB_USER"] = redshift_cred["redshift_credentials"]["un"]
cfg["CLUSTER"]["DB_PASSWORD"] = redshift_cred["redshift_credentials"]["pw"]

print("---Variables Loaded Successfully---")
print(clusters)

---Variables Loaded Successfully---
{'Clusters': [{'ClusterIdentifier': 'redshift-cluster-2', 'NodeType': 'dc2.large', 'ClusterStatus': 'available', 'ClusterAvailabilityStatus': 'Available', 'MasterUsername': 'dev', 'DBName': 'my_redshift_db', 'Endpoint': {'Address': 'redshift-cluster-2.clprvgacwxt3.us-west-2.redshift.amazonaws.com', 'Port': 5439}, 'ClusterCreateTime': datetime.datetime(2023, 8, 26, 21, 59, 28, 676000, tzinfo=tzutc()), 'AutomatedSnapshotRetentionPeriod': 1, 'ManualSnapshotRetentionPeriod': -1, 'ClusterSecurityGroups': [], 'VpcSecurityGroups': [{'VpcSecurityGroupId': 'sg-01aae4158e325fc92', 'Status': 'active'}], 'ClusterParameterGroups': [{'ParameterGroupName': 'default.redshift-1.0', 'ParameterApplyStatus': 'in-sync'}], 'ClusterSubnetGroupName': 'default', 'VpcId': 'vpc-0debf274df5b429d4', 'AvailabilityZone': 'us-west-2d', 'PreferredMaintenanceWindow': 'thu:08:30-thu:09:00', 'PendingModifiedValues': {}, 'ClusterVersion': '1.0', 'AllowVersionUpgrade': True, 'NumberOfNod

In [19]:
# Saving Config file:
# writes the in-memory cfg back to cfg_path, overwriting the file's previous contents.
with open(cfg_path, mode="w") as cfg_file:
    cfg.write(cfg_file)

---

#### Attempt to connect to Redshift cluster:

---

In [20]:
# Load the `sql` IPython extension, which provides the %sql / %%sql magics used in the cells below.
%load_ext sql

In [21]:
conn_string = f'''
    postgresql://{cfg['CLUSTER']['DB_USER']}:{cfg['CLUSTER']['DB_PASSWORD']}@{cfg['CLUSTER']['DB_HOST']}:{cfg['CLUSTER']['DB_PORT']}/{cfg['CLUSTER']['DB_NAME']}'''

%sql $conn_string

#### Troubleshooting issues with data transfer:

In [22]:
# Sanity check: ask the server which database this session is connected to.
%sql SELECT current_database();

 * postgresql://dev:***@redshift-cluster-2.clprvgacwxt3.us-west-2.redshift.amazonaws.com:5439/my_redshift_db
1 rows affected.


current_database
my_redshift_db


In [23]:
# Query load errors: show the three most recent rows of stl_load_errors (newest first).
%sql select * from stl_load_errors ORDER BY starttime desc limit 3

 * postgresql://dev:***@redshift-cluster-2.clprvgacwxt3.us-west-2.redshift.amazonaws.com:5439/my_redshift_db
0 rows affected.


userid,slice,tbl,starttime,session,query,filename,line_number,colname,type,col_length,position,raw_line,raw_field_value,err_code,err_reason,is_partial,start_offset,copy_job_id


#### Data Checking

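Double-checking that the 'songplays' table only has events with a valid song, artist and duration. The query below counts rows whose `song_id` or `artist_id` is NULL or empty (duration is not checked directly here); the expected result is 0.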

In [25]:
%%sql

SELECT COUNT(*)
FROM songplays
WHERE COALESCE(artist_id, '') = ''
   OR COALESCE(song_id, '') = ''

 * postgresql://dev:***@redshift-cluster-2.clprvgacwxt3.us-west-2.redshift.amazonaws.com:5439/my_redshift_db
1 rows affected.


count
0


Double-checking we only have unique users

In [26]:
%%sql

SELECT
    user_id,
    COUNT(user_id) AS count
FROM users
GROUP BY user_id
ORDER BY count DESC
LIMIT 10

 * postgresql://dev:***@redshift-cluster-2.clprvgacwxt3.us-west-2.redshift.amazonaws.com:5439/my_redshift_db
10 rows affected.


user_id,count
22,1
30,1
23,1
41,1
24,1
45,1
27,1
80,1
28,1
88,1


Double-checking we only have unique songs

In [27]:
%%sql

SELECT
    song_id,
    COUNT(song_id) AS count
FROM songs
GROUP BY song_id
ORDER BY count DESC
LIMIT 10

 * postgresql://dev:***@redshift-cluster-2.clprvgacwxt3.us-west-2.redshift.amazonaws.com:5439/my_redshift_db
10 rows affected.


song_id,count
SOAADJH12AB018BD30,1
SOAAOLZ12A6D4FB403,1
SOAAFUV12AB018831D,1
SOAAUGN12AB01830B6,1
SOAASHY12A58A7C439,1
SOAAVYM12A8C13C43C,1
SOABBVH12AF72A5B57,1
SOABYIT12AB0183026,1
SOABIXP12A8C135F75,1
SOACCRN12AB01855AD,1


Double-checking we only have unique artists

In [28]:
%%sql

SELECT
    artist_id,
    COUNT(artist_id) AS count
FROM artists
GROUP BY artist_id
ORDER BY count DESC
LIMIT 10

 * postgresql://dev:***@redshift-cluster-2.clprvgacwxt3.us-west-2.redshift.amazonaws.com:5439/my_redshift_db
10 rows affected.


artist_id,count
AR00B1I1187FB433EB,1
AR00DG71187B9B7FCB,1
AR00FVC1187FB5BE3E,1
AR00JIO1187B9A5A15,1
AR00LNI1187FB444A5,1
AR00MQ31187B9ACD8F,1
AR00TGQ1187B994F29,1
AR00Y9I1187B999412,1
AR00YYQ1187FB504DC,1
AR016P51187B98E398,1


#### Analytics

Where were users located during their Sparkify sessions on November 30, 2018?

In [29]:
%%sql

SELECT
    COUNT(*) AS freq,
    location
FROM songplays AS sp
INNER JOIN time AS t
    ON sp.start_time = t.start_time
WHERE t.year = 2018
  AND t.month = 11
  AND t.day = 30
GROUP BY sp.location
ORDER BY freq DESC

 * postgresql://dev:***@redshift-cluster-2.clprvgacwxt3.us-west-2.redshift.amazonaws.com:5439/my_redshift_db
6 rows affected.


freq,location
9,"San Francisco-Oakland-Hayward, CA"
2,"Red Bluff, CA"
2,"Janesville-Beloit, WI"
1,"Houston-The Woodlands-Sugar Land, TX"
1,"Eugene, OR"
1,"Birmingham-Hoover, AL"


What were the most popular songs (i.e., most played) in Q4, 2018?

In [30]:

%%sql

SELECT
    COUNT(*) AS freq,
    sp.song_id,
    s.title
FROM songplays AS sp
INNER JOIN time AS t
    ON sp.start_time = t.start_time
LEFT JOIN songs AS s
    ON sp.song_id = s.song_id
WHERE t.year = 2018
  AND t.month BETWEEN 10 AND 12
GROUP BY sp.song_id, s.title
ORDER BY freq DESC
LIMIT 20;

 * postgresql://dev:***@redshift-cluster-2.clprvgacwxt3.us-west-2.redshift.amazonaws.com:5439/my_redshift_db
20 rows affected.


freq,song_id,title
37,SOBONKR12A58A7A7E0,You're The One
9,SOHTKMO12AB01843B0,Catch You Baby (Steve Pitron & Max Sanna Radio Edit)
9,SOUNZHU12A8AE47481,I CAN'T GET STARTED
8,SOULTKQ12AB018A183,Nothin' On You [feat. Bruno Mars] (Album Version)
6,SOLZOBD12AB0185720,Hey Daddy (Daddy's Home)
5,SOARUPP12AB01842E0,Up Up & Away
5,SOTNHIP12AB0183131,Make Her Say
4,SOIZLKI12A6D4F7B61,Supermassive Black Hole (Album Version)
4,SONQEYS12AF72AABC9,Mr. Jones
4,SOIOESO12A6D4F621D,Unwell (Album Version)


In [31]:
# DELETE CLUSTER
# Tears down the cluster identified by cluster_id; SkipFinalClusterSnapshot=True
# means no final snapshot is requested before deletion.
deletion_request = {
    "ClusterIdentifier": cluster_id,
    "SkipFinalClusterSnapshot": True,
}
response = redshift.delete_cluster(**deletion_request)