# This notebook is used to configure RedShift ML


## Variables

In [1]:
# Name of the Secrets Manager secret holding the Redshift login and connection details.
secret_name = 'bankdemo_redshift_login'

# Names given to the Redshift ML model and to the SQL prediction function it exposes.
model_name, function_name = 'dm01', 'predict_dm01'

## Install and import libraries

In [2]:
!pip install -q SQLAlchemy==1.3.13
!pip install psycopg2-binary pyathena
!pip install -U pip
import base64
import json
import time
from urllib.parse import quote

import boto3
import pandas as pd
import sagemaker
from botocore.exceptions import ClientError
from pyathena import connect
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes


In [3]:
# Region configured for the default boto3 session
session = boto3.session.Session()
region_name = session.region_name

# SageMaker session and the default S3 bucket associated with it
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

# Low-level service clients used by the rest of the notebook
s3, redshift, secretsmanager = (
    boto3.client(service) for service in ('s3', 'redshift', 'secretsmanager')
)

## Get credentials & connection information from AWS Secrets Manager

In [4]:
# Region configured for the default boto3 session.
session = boto3.session.Session()
region = session.region_name

# Fetch the secret. On failure, report the service's message and re-raise:
# everything below needs the secret, so continuing would only fail later
# with a misleading NameError on `secret`.
try:
    get_secret_value_response = secretsmanager.get_secret_value(
        SecretId=secret_name
    )
except ClientError as e:
    print("Error retrieving secret. Error: " + e.response['Error']['Message'])
    raise

secret_arn = get_secret_value_response['ARN']

# Depending on whether the secret is a string or binary, one of these fields will be populated.
if 'SecretString' in get_secret_value_response:
    secret = get_secret_value_response['SecretString']
else:
    secret = base64.b64decode(get_secret_value_response['SecretBinary'])

# The secret payload is a JSON document; unpack the fields used by later cells.
secret_json = json.loads(secret)

# Redshift login and endpoint
master_user_name = secret_json['username']
master_user_pw = secret_json['password']
redshift_port = secret_json['port']
redshift_cluster_identifier = secret_json['dbClusterIdentifier']
redshift_endpoint_address = secret_json['host']

# Database, schema and table names
database_name_redshift = secret_json['database_name_redshift']
database_name_glue = secret_json['database_name_glue']

schema_redshift = secret_json['schema_redshift']
schema_athena = secret_json['schema_athena']

table_name_glue = secret_json['table_name_glue']
table_name_redshift = secret_json['table_name_redshift']

# print(master_user_name)

# RedShift

## Connect to RedShift

In [5]:
# Look up the first IAM role attached to the cluster; it is passed to
# CREATE MODEL in a later cell.
response = redshift.describe_clusters(ClusterIdentifier=redshift_cluster_identifier)
iam_roles = response['Clusters'][0]['IamRoles']
if not iam_roles:
    raise RuntimeError(
        "Cluster '{}' has no IAM role attached; attach one before creating a model.".format(
            redshift_cluster_identifier
        )
    )
iam_role = iam_roles[0]['IamRoleArn']

# Percent-encode the credentials so characters such as '@', '/' or ':' in the
# password cannot corrupt the connection URL. quote(..., safe='') is used so
# that spaces become %20 and '+' becomes %2B, which decode unambiguously.
engine = create_engine(
    'postgresql://{}:{}@{}:{}/{}'.format(
        quote(str(master_user_name), safe=''),
        quote(str(master_user_pw), safe=''),
        redshift_endpoint_address,
        redshift_port,
        database_name_redshift,
    )
)

# Session factory bound to the Redshift engine.
session = sessionmaker(bind=engine)


## Create model using SageMaker AutoPilot

After training with SageMaker Autopilot, the model runs on the Redshift cluster itself, so there is no additional charge for hosting an inference endpoint (Autopilot training charges still apply). The target column is 'y'.

More information on configuring RedShift ML is at https://docs.aws.amazon.com/redshift/latest/dg/r_CREATE_MODEL.html

In [None]:
# CREATE MODEL statement: train on the Redshift table with column 'y' as the
# target and expose the result as the SQL function `function_name`.
statement = f"""
CREATE MODEL {model_name}
FROM {schema_redshift}.{table_name_redshift}
TARGET y
FUNCTION {function_name}
IAM_ROLE '{iam_role}'
SETTINGS (
  S3_BUCKET '{bucket}'
);
"""

# Other parameters you can set
# [ MODEL_TYPE { XGBOOST | MLP } ]              
# [ PROBLEM_TYPE ( REGRESSION | BINARY_CLASSIFICATION | MULTICLASS_CLASSIFICATION ) ]
# [ OBJECTIVE ( 'MSE' | 'Accuracy' | 'F1' | 'F1Macro' | 'AUC') ]

# print(statement)
s = session()
try:
    # psycopg2 isolation level 0 is autocommit.
    # NOTE(review): presumably needed because CREATE MODEL must run outside a
    # transaction block - confirm against the Redshift documentation.
    s.connection().connection.set_isolation_level(0)
    try:
        s.execute(statement)
        s.commit()
    finally:
        # Always put the pooled connection back to level 1 (read committed),
        # even when the statement fails, so later users of the pool are not
        # left in autocommit mode.
        s.connection().connection.set_isolation_level(1)
finally:
    # Return the connection to the pool instead of leaving it checked out.
    s.close()


### Check the status of the training. This takes one to two hours.
While it is running, you can also look at the 'processing jobs' and 'training jobs' in SageMaker.

In [6]:
statement = f"""
show model {model_name}
"""

# Seconds between status checks, and the longest we are willing to wait.
# The deadline stops the cell from polling forever if the model never becomes
# READY (for example when training fails); raise it if your training runs longer.
POLL_INTERVAL_SECONDS = 10
MAX_WAIT_SECONDS = 4 * 60 * 60


def _model_state(show_model_df):
    """Return the value of the 'Model State' row of a SHOW MODEL result.

    The result is read as key/value pairs: the first column holds the key and
    the second the value. Looking the row up by key avoids relying on the row
    being at a fixed position.

    Raises:
        KeyError: if no row has the key 'Model State'.
    """
    keys = show_model_df.iloc[:, 0].astype(str).str.strip()
    matches = show_model_df.loc[keys == 'Model State']
    if matches.empty:
        raise KeyError("'Model State' row not found in SHOW MODEL output")
    return matches.iloc[0, 1]


# print(statement)
# This could take an hour
deadline = time.monotonic() + MAX_WAIT_SECONDS
while True:
    df = pd.read_sql_query(statement, engine)
    model_state = _model_state(df)
    print(model_state)
    if model_state == 'READY':
        break
    if time.monotonic() >= deadline:
        raise TimeoutError(
            f"Model {model_name} did not reach READY within {MAX_WAIT_SECONDS} seconds "
            f"(last state: {model_state})"
        )
    time.sleep(POLL_INTERVAL_SECONDS)

READY


### Check the details of the model
Ensure the Model State is READY

In [7]:
# Query the model's metadata and display the first rows of the key/value result.
statement = f"""
show model {model_name}
"""

df = pd.read_sql_query(sql=statement, con=engine)
df.head()

Unnamed: 0,Key,Value
0,Model Name,dm01
1,Schema Name,public
2,Owner,bankdemo
3,Creation Time,"Thu, 16.09.2021 05:47:43"
4,Model State,READY


## Use the AutoPilot function with a SQL query.
### Check the accuracy for the first 10 predictions

In [8]:
# Run the prediction function over every row of the table and return the
# prediction next to the actual label `y`.
statement = f"""
SELECT {function_name}(
                   age, job, marital, education, defaulted, housing,
                   loan, contact, month, day_of_week, duration, campaign,
                   pdays, previous, poutcome, emp_var_rate, cons_price_idx,
                   cons_conf_idx, euribor3m, nr_employed), y
          FROM {schema_redshift}.{table_name_redshift}
"""

df = pd.read_sql_query(sql=statement, con=engine)

# Show the first ten prediction/label pairs.
df.head(10)

Unnamed: 0,predict_dm01,y
0,no,no
1,no,no
2,no,no
3,no,no
4,no,no
5,no,no
6,no,no
7,no,no
8,no,no
9,no,no


### Check the overall accuracy

In [9]:
# Count rows for each (predicted, actual) combination; the grouping is done in
# Redshift so only the aggregated counts come back.
statement = f"""
SELECT {function_name}, y, COUNT(*)
  FROM (SELECT {function_name}(
                   age, job, marital, education, defaulted, housing,
                   loan, contact, month, day_of_week, duration, campaign,
                   pdays, previous, poutcome, emp_var_rate, cons_price_idx,
                   cons_conf_idx, euribor3m, nr_employed), y
          FROM {schema_redshift}.{table_name_redshift})
 GROUP BY {function_name}, y;
"""

df = pd.read_sql_query(sql=statement, con=engine)
df.head(5)

Unnamed: 0,predict_dm01,y,count
0,no,no,33603
1,yes,no,2945
2,no,yes,379
3,yes,yes,4261
