# This notebook is to load data into RedShift

In [1]:
# !pip install -q SQLAlchemy==1.3.13
# !pip install psycopg2-binary pyathena
# !pip install -U pip
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from pyathena import connect
from botocore.exceptions import ClientError
import pandas as pd
import json
import boto3
import sagemaker

In [2]:
# Get region 
session = boto3.session.Session()
region_name = session.region_name

# Get SageMaker session & default S3 bucket
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

s3 = boto3.client('s3')
redshift = boto3.client('redshift')
secretsmanager = boto3.client('secretsmanager')

### Variables

In [3]:
secret_name='bankdemo_redshift_login' ## replace the secret name with yours

database_name_redshift = 'bankdemo'
database_name_athena = 'bankdemo'

schema_redshift = 'dm'
schema_athena = 'athena'

table_name_glue = 'bankdemo_glue'
table_name_redshift = 'data'

### Get credentials

In [4]:
# # Old code
# import json

# secret = secretsmanager.get_secret_value(SecretId='bankdemo_redshift_login')
# cred = json.loads(secret['SecretString'])

# master_user_name = cred[0]['username']
# master_user_pw = cred[1]['password']
# print(master_user_name)

In [5]:
session = boto3.session.Session()
region = session.region_name

client = session.client(
        service_name='secretsmanager',
        region_name=region
    )

try:
    get_secret_value_response = client.get_secret_value(
            SecretId=secret_name
        )
    secret_arn=get_secret_value_response['ARN']

except ClientError as e:
    print("Error retrieving secret. Error: " + e.response['Error']['Message'])
    
else:
    # Depending on whether the secret is a string or binary, one of these fields will be populated.
    if 'SecretString' in get_secret_value_response:
        secret = get_secret_value_response['SecretString']
    else:
        secret = base64.b64decode(get_secret_value_response['SecretBinary'])
            
secret_json = json.loads(secret)
master_user_name = secret_json['username']
master_user_pw = secret_json['password']
redshift_port = secret_json['port']
redshift_cluster_identifier = secret_json['dbClusterIdentifier']
redshift_endpoint_address = secret_json['host']
print(master_user_name)

bankdemo


### Copy bank-additional data to S3

In [6]:
# s3.upload_file('bank-additional/bank-additional-full.csv', bucket, 'bankdemo/bank-additional.csv')


# Athena

## Create Athena database

In [7]:
# Set S3 staging directory -- this is a temporary directory used for Athena queries
s3_staging_dir = "s3://{0}/athena/staging".format(bucket)
conn = connect(region_name=region_name, s3_staging_dir=s3_staging_dir)
statement = "CREATE DATABASE IF NOT EXISTS {}".format(database_name_athena)
print(statement)

CREATE DATABASE IF NOT EXISTS bankdemo


In [8]:
import pandas as pd
pd.read_sql(statement, conn)
statement = "SHOW DATABASES"

df_show = pd.read_sql(statement, conn)
df_show.head(5)

Unnamed: 0,database_name
0,bankdemo
1,cagpoc
2,default
3,kianlai-a1s1-bucket-ap-southeast-1-database


## Load CSV to Athena

### The 'default' column name causes an error and I changed the name to 'defaulted' instead.

In [9]:
s3_bankdemo_path = "s3://{}/bankdemo/".format(bucket)
# SQL statement to execute
statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
         age int,
         job string,
         marital string,
         education string,
         defaulted string,
         housing string,
         loan string,
         contact string,
         month string,
         day_of_week string,
         duration int,
         campaign int,
         pdays int,
         previous int,
         poutcome string,
         emp_var_rate float,
         cons_price_idx float,
         cons_conf_idx float,
         euribor3m float,
         nr_employed int,
         y string
) 
ROW FORMAT DELIMITED 
FIELDS TERMINATED BY ',' 
LINES TERMINATED BY '\n' 
LOCATION '{}'
TBLPROPERTIES ('compressionType'='gzip', 'skip.header.line.count'='1')""".format(database_name_athena, table_name_glue, s3_bankdemo_path)

print(statement)

CREATE EXTERNAL TABLE IF NOT EXISTS bankdemo.bankdemo_glue(
         age int,
         job string,
         marital string,
         education string,
         defaulted string,
         housing string,
         loan string,
         contact string,
         month string,
         day_of_week string,
         duration int,
         campaign int,
         pdays int,
         previous int,
         poutcome string,
         emp_var_rate float,
         cons_price_idx float,
         cons_conf_idx float,
         euribor3m float,
         nr_employed int,
         y string
) 
ROW FORMAT DELIMITED 
FIELDS TERMINATED BY ',' 
LINES TERMINATED BY '
' 
LOCATION 's3://sagemaker-ap-southeast-1-138604873012/bankdemo/'
TBLPROPERTIES ('compressionType'='gzip', 'skip.header.line.count'='1')


In [10]:
import pandas as pd

pd.read_sql(statement, conn)
statement = "SHOW TABLES in {}".format(database_name_athena)

df_show = pd.read_sql(statement, conn)
df_show.head(5)

Unnamed: 0,tab_name
0,bankdemo_glue


### Test

In [11]:
statement = """SELECT * FROM {}.{}
""".format(
    database_name_athena, table_name_glue
)

print(statement)

SELECT * FROM bankdemo.bankdemo_glue



In [12]:
df = pd.read_sql(statement, conn)
df.head(5)

Unnamed: 0,age,job,marital,education,defaulted,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191,no


# RedShift

## Connect to RedShift

In [13]:
# Ensure that the cluster is available

import time

response = redshift.describe_clusters(ClusterIdentifier=redshift_cluster_identifier)
cluster_status = response['Clusters'][0]['ClusterStatus']
print(cluster_status)

while cluster_status != 'available':
    time.sleep(10)
    response = redshift.describe_clusters(ClusterIdentifier=redshift_cluster_identifier)
    cluster_status = response['Clusters'][0]['ClusterStatus']
    print(cluster_status)

available


In [14]:
# Ensure the ApplyStatus is in-sync

# redshift_endpoint_address = response['Clusters'][0]['Endpoint']['Address']
iam_role = response['Clusters'][0]['IamRoles'][0]['IamRoleArn']

response['Clusters'][0]['IamRoles']

[{'IamRoleArn': 'arn:aws:iam::138604873012:role/BankDemo',
  'ApplyStatus': 'in-sync'}]

In [15]:
print('Redshift endpoint: {}'.format(redshift_endpoint_address))
print('IAM Role: {}'.format(iam_role))

Redshift endpoint: bankdemo.cszyoc0ofzdt.ap-southeast-1.redshift.amazonaws.com
IAM Role: arn:aws:iam::138604873012:role/BankDemo


In [16]:
engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(master_user_name, master_user_pw, redshift_endpoint_address, redshift_port, database_name_redshift))
session = sessionmaker()
session.configure(bind=engine)


## Create RedShift schema

In [17]:
statement = """CREATE SCHEMA IF NOT EXISTS {}""".format(schema_redshift)

s = session()
s.execute(statement)
s.commit()

## Register Athena Database bankdemo with Redshift Spectrum to Access the Data Directly in S3 using Glue Data Catalog

With just one command, you can query the S3 data lake from Amazon Redshift without moving any data into our data warehouse. This is the power of Redshift Spectrum. 

Note the `FROM DATA CATALOG` below.  This is pulling the table and schema information from the Glue Data Catalog (ie. Hive Metastore).

In [18]:
statement = """
CREATE EXTERNAL SCHEMA IF NOT EXISTS {} FROM DATA CATALOG 
    DATABASE '{}' 
    IAM_ROLE '{}'
    REGION '{}'
    CREATE EXTERNAL DATABASE IF NOT EXISTS
""".format(schema_athena, database_name_athena, iam_role, region_name)

print(statement)


CREATE EXTERNAL SCHEMA IF NOT EXISTS athena FROM DATA CATALOG 
    DATABASE 'bankdemo' 
    IAM_ROLE 'arn:aws:iam::138604873012:role/BankDemo'
    REGION 'ap-southeast-1'
    CREATE EXTERNAL DATABASE IF NOT EXISTS



In [19]:
s = session()
s.execute(statement)
s.commit()

### Run Sample Query on S3 Data through Redshift Spectrum

In [20]:
statement = """
SELECT *
    FROM {}.{}
""".format(schema_athena, table_name_glue)

print(statement)


SELECT *
    FROM athena.bankdemo_glue



In [21]:
df = pd.read_sql_query(statement, engine)
df.head(5)

Unnamed: 0,age,job,marital,education,defaulted,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,,no


## Create table

In [22]:
statement = """
CREATE TABLE IF NOT EXISTS {}.{}( 
     age integer,
     job text,
     marital text,
     education text,
     defaulted text,
     housing text,
     loan text,
     contact text,
     month text,
     day_of_week text,
     duration integer,
     campaign integer,
     pdays integer,
     previous integer,
     poutcome text,
     emp_var_rate decimal,
     cons_price_idx decimal,
     cons_conf_idx decimal,
     euribor3m decimal,
     nr_employed integer,
     y text
     )
""".format(schema_redshift, table_name_redshift)

print(statement)
s = session()
s.execute(statement)
s.commit()

print("Done.")


CREATE TABLE IF NOT EXISTS dm.data( 
     age integer,
     job text,
     marital text,
     education text,
     defaulted text,
     housing text,
     loan text,
     contact text,
     month text,
     day_of_week text,
     duration integer,
     campaign integer,
     pdays integer,
     previous integer,
     poutcome text,
     emp_var_rate decimal,
     cons_price_idx decimal,
     cons_conf_idx decimal,
     euribor3m decimal,
     nr_employed integer,
     y text
     )

Done.


## Insert data from S3 to RedShift using Athena

In [23]:
statement = """
INSERT INTO {}.{}
    SELECT
        *
    FROM
        {}.{};             

""".format(schema_redshift, table_name_redshift, schema_athena, table_name_glue)
print(statement)
s = session()
s.execute(statement)
s.commit()        
print("Done.")


INSERT INTO dm.data
    SELECT
        *
    FROM
        athena.bankdemo_glue;             


Done.


In [24]:
statement = """
SELECT *
    FROM {}.{}
""".format(schema_redshift, table_name_redshift)

print(statement)


SELECT *
    FROM dm.data



### Test getting data from RedShift

In [25]:
df = pd.read_sql_query(statement, engine)
df.head(5)

Unnamed: 0,age,job,marital,education,defaulted,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.0,94.0,-36.0,5.0,,no
1,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.0,94.0,-36.0,5.0,,no
2,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.0,94.0,-36.0,5.0,,no
3,59,admin.,married,professional.course,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.0,94.0,-36.0,5.0,,no
4,24,technician,single,professional.course,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.0,94.0,-36.0,5.0,,no
