# This notebook is to configure RedShift ML
More information on configuring RedShift ML is at https://docs.aws.amazon.com/redshift/latest/dg/r_CREATE_MODEL.html

In [1]:
# !pip install -q SQLAlchemy==1.3.13
# !pip install psycopg2-binary pyathena
# !pip install -U pip
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from pyathena import connect
from botocore.exceptions import ClientError
import pandas as pd
import time
import json

In [2]:
import boto3
import sagemaker

# Get region 
session = boto3.session.Session()
region_name = session.region_name

# Get SageMaker session & default S3 bucket
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

s3 = boto3.client('s3')
redshift = boto3.client('redshift')
secretsmanager = boto3.client('secretsmanager')

### Variables

In [3]:
secret_name='bankdemo_redshift_login' ## replace the secret name with yours

database_name_redshift = 'bankdemo'
database_name_athena = 'bankdemo'

schema_redshift = 'dm'
schema_athena = 'athena'

table_name_glue = 'bankdemo_glue'
table_name_redshift = 'data'

model_name = 'dm01'
function_name = 'predict_dm01'

### Get credentials

In [4]:

session = boto3.session.Session()
region = session.region_name

client = session.client(
        service_name='secretsmanager',
        region_name=region
    )

try:
    get_secret_value_response = client.get_secret_value(
            SecretId=secret_name
        )
    secret_arn=get_secret_value_response['ARN']

except ClientError as e:
    print("Error retrieving secret. Error: " + e.response['Error']['Message'])
    
else:
    # Depending on whether the secret is a string or binary, one of these fields will be populated.
    if 'SecretString' in get_secret_value_response:
        secret = get_secret_value_response['SecretString']
    else:
        secret = base64.b64decode(get_secret_value_response['SecretBinary'])
            
secret_json = json.loads(secret)
master_user_name = secret_json['username']
master_user_pw = secret_json['password']
redshift_port = secret_json['port']
redshift_cluster_identifier = secret_json['dbClusterIdentifier']
redshift_endpoint_address = secret_json['host']
print(master_user_name)

bankdemo


# RedShift

## Connect to RedShift

In [5]:
response = redshift.describe_clusters(ClusterIdentifier=redshift_cluster_identifier)
iam_role = response['Clusters'][0]['IamRoles'][0]['IamRoleArn']

engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(master_user_name, master_user_pw, redshift_endpoint_address, redshift_port, database_name_redshift))
session = sessionmaker()
session.configure(bind=engine)


## Create model using AutoPilot

In [6]:
statement = f"""
CREATE MODEL {model_name}
FROM {schema_redshift}.{table_name_redshift}
TARGET y
FUNCTION {function_name}
IAM_ROLE '{iam_role}'
SETTINGS (
  S3_BUCKET '{bucket}'
);
"""

# print(statement)
s = session()
s.connection().connection.set_isolation_level(0)
s.execute(statement)
s.commit()
s.connection().connection.set_isolation_level(1)



### Check the status of the training. This takes one to two hour
While running, you can also look at the 'processing jobs' and 'training jobs' in SageMaker

In [7]:
statement = f"""
show model {model_name}
"""

# print(statement)
df = pd.read_sql_query(statement, engine)
# df.head(50)
print(df.values[4][1])

# This could take an hour
while df.values[4][1] != 'READY':
    time.sleep(10)
    df = pd.read_sql_query(statement, engine)
    print(df.values[4][1])

TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
T

KeyboardInterrupt: 

## Use the AutoPilot function with a SQL query.
### Check the accuracy for the first 10 predictions

In [None]:
statement = f"""
SELECT {function_name}(
                   age, job, marital, education, defaulted, housing,
                   loan, contact, month, day_of_week, duration, campaign,
                   pdays, previous, poutcome, emp_var_rate, cons_price_idx,
                   cons_conf_idx, euribor3m, nr_employed), y
          FROM {schema_redshift}.{table_name_redshift}
"""

# print(statement)
df = pd.read_sql_query(statement, engine)
df.head(10)

### Check the overall accuracy

In [None]:
statement = f"""
SELECT {function_name}, y, COUNT(*)
  FROM (SELECT {function_name}(
                   age, job, marital, education, defaulted, housing,
                   loan, contact, month, day_of_week, duration, campaign,
                   pdays, previous, poutcome, emp_var_rate, cons_price_idx,
                   cons_conf_idx, euribor3m, nr_employed), y
          FROM {schema_redshift}.{table_name_redshift})
 GROUP BY {function_name}, y;
"""

# print(statement)
df = pd.read_sql_query(statement, engine)
df.head(5)