In [1]:
import pandas as pd # for creating dataframes - dataframe(df) is like a way to use tables in python
import boto3 # aws sdk - for using aws services 
import json 
import configparser # for getting values of a config file, config file is where you store values just like an env 

# accesing env file 
import os 
from dotenv import load_dotenv, find_dotenv # to access the secret keys we've hidden in a separate file 
load_dotenv(find_dotenv()) # grab values inside env file

True

#### Grabbing the values in a config file and creating a dataframe out of it just for visualization purposes

In [2]:
config = configparser.ConfigParser()
config.read_file(open('dwh.cfg')) # read the content of this config file

# grabbing the values in a config file
KEY = os.getenv("AWS_KEY")
SECRET = os.getenv("AWS_SECRET")

DWH_CLUSTER_TYPE = config.get("DWH","DWH_CLUSTER_TYPE") # grab the value of key under [DWH]
DWH_NUM_NODES = config.get("DWH","DWH_NUM_NODES")
DWH_NODE_TYPE = config.get("DWH","DWH_NODE_TYPE")

DWH_CLUSTER_IDENTIFIER = config.get("DWH","DWH_CLUSTER_IDENTIFIER")
DWH_DB = config.get("DWH","DWH_DB")
DWH_DB_USER = config.get("DWH","DWH_DB_USER")
DWH_DB_PASSWORD = config.get("DWH","DWH_DB_PASSWORD")
DWH_PORT = config.get("DWH","DWH_PORT")

DWH_IAM_ROLE_NAME = config.get("DWH", "DWH_IAM_ROLE_NAME")

# creating a dataframe out of the grabbed values 
pd.DataFrame( # key(column):array(rows)
    {"Param": 
        ["DWH_CLUSTER_TYPE", "DWH_NUM_NODES", "DWH_NODE_TYPE", "DWH_CLUSTER_IDENTIFIER", "DWH_DB", "DWH_DB_USER", "DWH_DB_PASSWORD", "DWH_PORT", "DWH_IAM_ROLE_NAME"],
    "Value": 
        [DWH_CLUSTER_TYPE, DWH_NUM_NODES, DWH_NODE_TYPE, DWH_CLUSTER_IDENTIFIER, DWH_DB, DWH_DB_USER, DWH_DB_PASSWORD, DWH_PORT, DWH_IAM_ROLE_NAME]
    }
)

Unnamed: 0,Param,Value
0,DWH_CLUSTER_TYPE,multi-node
1,DWH_NUM_NODES,4
2,DWH_NODE_TYPE,dc2.large
3,DWH_CLUSTER_IDENTIFIER,dwhCluster
4,DWH_DB,dwh
5,DWH_DB_USER,dwhuser
6,DWH_DB_PASSWORD,Passw0rd
7,DWH_PORT,5439
8,DWH_IAM_ROLE_NAME,dwhRole


#### Connect to aws services 

In [None]:
# ec2 - computing services 
ec2 = boto3.resource(
   'ec2', # what aws service you wanted in
   region_name="us-west-2", # region you use in your aws account
   aws_access_key_id=KEY, # this is given to a created user at aws iam console (gui)
   aws_secret_access_key=SECRET # same with access_key_id
)

# s3 - like a folder which can contain folder/s and file/s | can be used as a data lake
s3 = boto3.resource(
   's3',
   region_name="us-west-2",
   aws_access_key_id=KEY,
   aws_secret_access_key=SECRET
)

# iam - access management for users
iam = boto3.client(
   'iam',
   aws_access_key_id=KEY,
   aws_secret_access_key=SECRET,
   region_name='us-west-2'
)

# redshift - data warehouse/database like postgresql
redshift = boto3.client(
   'redshift',
   region_name="us-west-2",
   aws_access_key_id=KEY,
   aws_secret_access_key=SECRET
)

#### using an s3 bucket - bucket is like a folder but in cloud, object is like a file

In [None]:
# creating a bucket/folder 
bucket1 = s3.create_bucket(
    Bucket = 'bucket-name' # bucket name
)

# uploading an object/file to a bucket
s3.upload_file(
    'logdata.csv', # file you wanted to upload 
    'bucket_name', # name of the bucket 
    'newfilename.csv' # file name of the file you're uploading as they arrive into s3
)

#### creating a role, role is a collection of policies/permissions a user can have 
- Create an IAM Role 
- Create an additional policy for the role that makes Redshift able to access S3 bucket (ReadOnly)



In [None]:
# create an iam role 
try:
    dwhRole = iam.create_role(
        Path='/',
        RoleName=DWH_IAM_ROLE_NAME,
        Description = "Allows Redshift clusters to call AWS services on your behalf.",
        AssumeRolePolicyDocument=json.dumps({ # .dumps converts a python object into string, this is needed for assumeRolePolicyDocument
            'Statement': [{
                'Action': 'sts:AssumeRole',
                'Effect': 'Allow',
                'Principal': {
                    'Service': 'redshift.amazonaws.com'
                }
            }],
            'Version': '2012-10-17'}
        )
    )    
except Exception as e:
    print(e)

# create an additional policy for the role that makes Redshift able to access S3 bucket (ReadOnly)
iam.attach_role_policy(
    RoleName=DWH_IAM_ROLE_NAME,
    PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"
)['ResponseMetadata']['HTTPStatusCode']



#### Attach the recently created role to a redshift cluster - meaning iam user who'll be using that specific redshift cluster can only do things we specified in its policy

In [None]:
roleArn = iam.get_role(RoleName=DWH_IAM_ROLE_NAME)['Role']['Arn']

# we attach a policy/role(collection of policy) to a cluster by creating it 
try:
    response = redshift.create_cluster(        
        #HW
        ClusterType=DWH_CLUSTER_TYPE,
        NodeType=DWH_NODE_TYPE,
        NumberOfNodes=int(DWH_NUM_NODES),

        #Identifiers & Credentials
        DBName=DWH_DB,
        ClusterIdentifier=DWH_CLUSTER_IDENTIFIER,
        MasterUsername=DWH_DB_USER,
        MasterUserPassword=DWH_DB_PASSWORD,
        
        #Roles (for s3 access)
        IamRoles=[roleArn]  
    )
except Exception as e:
    print(e)

#### open an incoming tcp port to access the cluster endpoint - idk what this does

In [None]:
try:
    vpc = ec2.Vpc(id=myClusterProps['VpcId'])
    defaultSg = list(vpc.security_groups.all())[0]
    print(defaultSg)
    
    defaultSg.authorize_ingress(
        GroupName=defaultSg.group_name,
        CidrIp='0.0.0.0/0',
        IpProtocol='TCP',
        FromPort=int(DWH_PORT),
        ToPort=int(DWH_PORT)
    )
except Exception as e:
    print(e)