## Redshift Setup with Python SDK (boto3)
This notebook will show how to set up some AWS resources using the Python SDK for AWS, boto3.

Boto3 Documentation: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/redshift.html

---

#### Package Import

---

In [None]:
import configparser
import json
from time import sleep
from urllib.parse import quote

import boto3

---

#### Loading Config files

---

In [None]:
def load_config(path):
    """Parse the INI file at ``path`` and return the populated ConfigParser.

    ``ConfigParser.read`` silently skips files it cannot open, which would
    leave an empty parser and only surface as a ``KeyError`` in a later cell.
    Opening the file explicitly makes a wrong path fail right here.

    Raises:
        FileNotFoundError: if ``path`` does not exist.
    """
    parser = configparser.ConfigParser()
    with open(path) as handle:
        parser.read_file(handle)
    return parser


# NOTE: the paths below are absolute and machine-specific; adjust them when
# running this notebook on another machine.

# AWS credentials
aws_path = "/home/rambino/.aws/credentials"
aws_cred = load_config(aws_path)

# Redshift credentials (master username / password for the cluster)
redshift_path = "/home/rambino/dev/DataEngineering_Udacity/04_AWS_DataWarehousing/redshift_credentials.cfg"
redshift_cred = load_config(redshift_path)

# ETL config (updated by later cells and written back to cfg_path)
cfg_path = "/home/rambino/dev/DataEngineering_Udacity/Projects/DataWarehouseWithRedshift/dwh.cfg"
cfg = load_config(cfg_path)


---

#### Creating IAM role for Redshift

---

In [None]:
# S3 client for us-west-2, authenticated with the 'udacity_course' profile
# from the AWS credentials file.
s3_auth = {
    key: aws_cred['udacity_course'][key]
    for key in ('aws_access_key_id', 'aws_secret_access_key')
}
s3 = boto3.client('s3', region_name="us-west-2", **s3_auth)

In [None]:
# Fetch s3://udacity-dend/log_json_path.json into the working directory
# under a local file name.
s3.download_file(
    Bucket="udacity-dend",
    Key="log_json_path.json",
    Filename="redshift_project_json_format.json",
)

In [None]:
# IAM client, using the same 'udacity_course' credentials as the other clients.
udacity_profile = aws_cred['udacity_course']
iam = boto3.client(
    'iam',
    aws_access_key_id=udacity_profile['aws_access_key_id'],
    aws_secret_access_key=udacity_profile['aws_secret_access_key'],
    region_name="us-west-2",
)

In [None]:
# Create the IAM role that the Redshift cluster will use.
#
# AssumeRolePolicyDocument is the role's trust policy: it states who is
# allowed to call sts:AssumeRole on this role. Here the only allowed principal
# is the Redshift service (redshift.amazonaws.com). The trust policy grants no
# permissions by itself; those come from the policy attached to the role in a
# later cell.
trust_policy = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Action": 'sts:AssumeRole',
            "Principal": {"Service": "redshift.amazonaws.com"}
        }
    ]
}

try:
    dwhRole = iam.create_role(
        Path = "/",
        RoleName = "RedShift_Impersonation",
        Description = "Allows redshift to access S3",
        AssumeRolePolicyDocument = json.dumps(trust_policy)
    )
except iam.exceptions.EntityAlreadyExistsException:
    # The role is left over from an earlier run: reuse it so the notebook can
    # be re-run top to bottom. NOTE: its existing trust policy is kept as-is.
    dwhRole = iam.get_role(RoleName = "RedShift_Impersonation")

dwhRole

In [None]:
# Look up the role's ARN. The role name is spelled exactly as in the cells
# that create the role and attach its policy ("RedShift_Impersonation").
role = iam.get_role(RoleName = "RedShift_Impersonation")
role_arn = role['Role']['Arn']

# Store the ARN in the ETL config; it reaches disk when cfg is written back.
cfg['IAM_ROLE']['ARN'] = role_arn

# Display the ARN as the cell output.
role_arn

In [None]:
# The role carries no permissions until a policy is attached. Attach the
# AWS-managed AmazonS3ReadOnlyAccess policy to it.
policy_attachment = {
    "RoleName": "RedShift_Impersonation",
    "PolicyArn": "arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess",
}
attach_response = iam.attach_role_policy(**policy_attachment)

attach_response

---

#### Apply VPC Security Group rules to Redshift

---

In [None]:
# TCP port shared by the security-group ingress rule and the Redshift cluster.
redshift_port: int = 5439

In [None]:
# EC2 client (used for the security group), same credentials and region as
# the other clients.
ec2_client_options = dict(
    region_name="us-west-2",
    aws_access_key_id=aws_cred['udacity_course']['aws_access_key_id'],
    aws_secret_access_key=aws_cred['udacity_course']['aws_secret_access_key'],
)
ec2 = boto3.client('ec2', **ec2_client_options)

In [None]:
# Create the security group that will be attached to the Redshift cluster.
security_group_spec = {
    "GroupName": "Redshift_secGroup",
    "Description": "Security Group for allowing all access to Redshift cluster",
}
response = ec2.create_security_group(**security_group_spec)
response

In [None]:
# Look the group up by name and keep its ID for the ingress rule and the
# cluster definition.
sec_groups = ec2.describe_security_groups(GroupNames=['Redshift_secGroup'])
matching_groups = sec_groups['SecurityGroups']
redshift_sg_id = matching_groups[0]['GroupId']

In [None]:
# Open the Redshift port to inbound TCP traffic.
# CAUTION: 0.0.0.0/0 admits connections from any IP address.
ingress_rule = dict(
    GroupId=redshift_sg_id,
    IpProtocol='TCP',
    FromPort=redshift_port,
    ToPort=redshift_port,
    CidrIp='0.0.0.0/0',
)
vpc = ec2.authorize_security_group_ingress(**ingress_rule)

---

#### Creating Redshift cluster

---

In [None]:
# Redshift client used to create, inspect and delete the cluster.
redshift_auth = aws_cred['udacity_course']
redshift = boto3.client(
    'redshift',
    aws_secret_access_key=redshift_auth['aws_secret_access_key'],
    aws_access_key_id=redshift_auth['aws_access_key_id'],
    region_name="us-west-2",
)

In [None]:
# WARNING! Running this cell WILL create a Redshift cluster.
# Be sure to delete it afterwards so it does not incur costs!!
#
# API reference:
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/redshift.html#Redshift.Client.create_cluster
master_credentials = redshift_cred['redshift_credentials']
cluster_spec = {
    # Sizing: 4-node dc2.large cluster
    "ClusterType": "multi-node",
    "NodeType": 'dc2.large',
    "NumberOfNodes": 4,
    # Identity and initial database
    "ClusterIdentifier": 'redshift-cluster-2',
    "DBName": "my_redshift_db",
    "MasterUsername": master_credentials['un'],
    "MasterUserPassword": master_credentials['pw'],
    # Access: the IAM role for S3 reads, plus network reachability
    "IamRoles": [role_arn],
    "PubliclyAccessible": True,
    "VpcSecurityGroupIds": [redshift_sg_id],
    "Port": redshift_port,
}
redshift_response = redshift.create_cluster(**cluster_spec)

redshift_response

In [None]:
# The cluster takes time to create. Poll its description until an endpoint is
# published, then copy the connection details into the ETL config.

# Must match the ClusterIdentifier passed to create_cluster. It is set before
# polling so the clean-up cell can still delete the cluster if polling fails.
cluster_id = 'redshift-cluster-2'
max_polls = 20
poll_seconds = 10

for attempt in range(max_polls):
    # Filter by identifier so another cluster in the account is never picked up.
    clusters = redshift.describe_clusters(ClusterIdentifier=cluster_id)
    found = clusters['Clusters']
    endpoint = (found[0].get('Endpoint') or {}) if found else {}

    # While the cluster is still being created there is no endpoint address yet.
    if 'Address' not in endpoint:
        print("cluster still forming...")
        sleep(poll_seconds)
        continue

    # Any failure from here on (e.g. a missing config section) is a real error
    # and is allowed to propagate instead of being retried silently.
    cfg['CLUSTER']['DB_HOST'] = endpoint['Address']
    cfg['CLUSTER']['DB_PORT'] = str(endpoint['Port'])
    cfg['CLUSTER']['DB_NAME'] = found[0]['DBName']

    cfg['CLUSTER']['DB_USER'] = redshift_cred['redshift_credentials']['UN']
    cfg['CLUSTER']['DB_PASSWORD'] = redshift_cred['redshift_credentials']['PW']
    print("---Variables Loaded Successfully---")
    print(clusters)
    break
else:
    # Loop ended without a break: the endpoint never appeared.
    raise TimeoutError(
        f"Cluster '{cluster_id}' published no endpoint after "
        f"{max_polls * poll_seconds} seconds; re-run this cell to keep waiting."
    )

In [None]:
# Write the updated settings (role ARN, cluster connection details) back to
# the ETL config file, overwriting its previous contents.
with open(cfg_path, mode="w") as cfg_file:
    cfg.write(cfg_file)

---

#### Attempt to connect to Redshift cluster:

---

In [None]:
# Load the `sql` IPython extension, which provides the %sql / %%sql magics
# used in the cells below.
%load_ext sql

In [None]:
conn_string = f'''
    postgresql://{cfg['CLUSTER']['DB_USER']}:{cfg['CLUSTER']['DB_PASSWORD']}@{cfg['CLUSTER']['DB_HOST']}:{cfg['CLUSTER']['DB_PORT']}/{cfg['CLUSTER']['DB_NAME']}'''

%sql $conn_string

#### Troubleshooting issues with data transfer:

In [None]:
# Sanity check: ask the server which database the current connection is using.
%sql SELECT current_database();

In [None]:
#Query load errors:
%sql select * from stl_load_errors ORDER BY starttime desc limit 3

#### Data Checking

Double-checking that the 'songplays' table only has events with a valid song, artist and duration

In [None]:
%%sql

-- Count songplays rows whose song or artist reference is NULL or empty.
SELECT COUNT(*)
FROM songplays AS sp
WHERE sp.song_id IS NULL
   OR sp.song_id = ''
   OR sp.artist_id IS NULL
   OR sp.artist_id = ''

Double-checking we only have unique users

In [None]:
%%sql

-- Ten user_ids with the most rows; a count above 1 means a duplicate.
SELECT u.user_id,
       COUNT(u.user_id) AS count
FROM users AS u
GROUP BY u.user_id
ORDER BY 2 DESC
LIMIT 10

Double-checking we only have unique songs

In [None]:
%%sql

-- Ten song_ids with the most rows; a count above 1 means a duplicate.
SELECT s.song_id,
       COUNT(s.song_id) AS count
FROM songs AS s
GROUP BY s.song_id
ORDER BY 2 DESC
LIMIT 10

Double-checking we only have unique artists

In [None]:
%%sql

-- Ten artist_ids with the most rows; a count above 1 means a duplicate.
SELECT a.artist_id,
       COUNT(a.artist_id) AS count
FROM artists AS a
GROUP BY a.artist_id
ORDER BY 2 DESC
LIMIT 10

#### Analytics

Where were users located during their Sparkify sessions on November 30, 2018?

In [None]:
%%sql

-- Songplays per user location on 2018-11-30, most frequent first.
SELECT COUNT(*) AS freq,
       sp.location
FROM songplays AS sp
INNER JOIN time AS t
        ON sp.start_time = t.start_time
WHERE t.day = 30
  AND t.month = 11
  AND t.year = 2018
GROUP BY sp.location
ORDER BY freq DESC

What were the most popular songs (i.e., most played) in Q4, 2018?

In [None]:

%%sql

-- Twenty most-played songs between October and December 2018.
-- The LEFT JOIN keeps plays whose song_id has no row in songs (title is NULL).
SELECT COUNT(*) AS freq,
       sp.song_id,
       s.title
FROM songplays AS sp
INNER JOIN time AS t
        ON sp.start_time = t.start_time
LEFT JOIN songs AS s
       ON sp.song_id = s.song_id
WHERE t.year = 2018
  AND t.month >= 10
  AND t.month <= 12
GROUP BY sp.song_id, s.title
ORDER BY freq DESC
LIMIT 20;

In [165]:
# CLEAN-UP: delete the cluster identified by cluster_id, skipping the final
# snapshot.
deletion_request = {
    "ClusterIdentifier": cluster_id,
    "SkipFinalClusterSnapshot": True,
}
response = redshift.delete_cluster(**deletion_request)