This notebook covers the following steps:
- Setup: install the required libraries
- Prepare data: configure the local feature store and build the training data
- Training: train the XGBoost model
- Register trained model: register the model in the Model Registry

# Setup
This notebook needs the `awswrangler` library.

In [None]:
# Install awswrangler into the kernel's environment; it is imported as `wr` in the next cell.
# NOTE(review): the version is unpinned, so the installed release may change over time — consider pinning.
%pip install awswrangler

In [None]:
# import libraries
import boto3, sagemaker
import numpy as np
import pandas as pd
import time
import os
import awswrangler as wr

# SageMaker session that the rest of the notebook operates through.
session = sagemaker.Session()

# IAM execution role assigned to this notebook.
role = sagemaker.get_execution_role()
print("Exectuion role :",role)

# Default S3 bucket of the session; used later for training data and model output.
bucket = session.default_bucket()
print("Bucket:", bucket)

# AWS region the session is bound to.
region = session.boto_region_name
print("Region:", region)

# Image URI of the XGBoost container for this region, used for training and registration.
# NOTE(review): "latest" is not a pinned version — confirm which XGBoost release it
# resolves to before changing any hyperparameter names.
xgboost_container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=region,
    version="latest",
)
xgboost_container

# Prepare data

**Prerequisite:** A local/shared feature store exists and the SageMaker execution role has permission to access it.

Configure the feature store details:
 - `glue_database`
 - `glue_table`

In [None]:
# --- Data configuration ---

# Local directory under which the train/validation/test CSV files are written.
base_dest = "./data/"

# Glue database and table that the Athena query reads the feature data from.
glue_database = "local_feature_store"
glue_table = "giri_fs_bucket"



In [None]:

# Bind boto3's default session to the SageMaker session's region.
boto3.setup_default_session(region_name=region)
print("prepare_data.py START")
print(f"Reading from Authena; database.table : {glue_database}.{glue_table}")

# Read every row and column of the configured Glue table through Athena.
sql = f'SELECT * FROM "{glue_database}"."{glue_table}"'
model_data = wr.athena.read_sql_query(
    sql=sql,
    database=glue_database,
    ctas_approach=False,
)

# "duration" is post-facto data (only known after the outcome), so it is
# removed before the frame is used for modelling.
model_data = model_data.drop(columns=["duration"])
model_data.head()


Verify data types

In [None]:
# Show each column's dtype: object columns are one-hot encoded and bool columns
# are cast to int in the cells that follow.
model_data.dtypes

One-hot encode the object-type (categorical) columns, then convert boolean columns to 1/0.

In [None]:
# One-hot encode every object-dtype (categorical) column.
#
# A single get_dummies call replaces a per-column concat/drop loop: it keeps
# the untouched columns first (in their original order) and appends the
# indicator columns as "<column>_<value>" for each encoded column in turn,
# without re-copying the whole frame once per column.
cat_cols = model_data.select_dtypes(include=["object"]).columns
model_data = pd.get_dummies(model_data, columns=cat_cols)

model_data.head()

In [None]:
# Collect the names of all boolean columns; they are converted to 1/0 in the next cell.
bool_cols = model_data.select_dtypes(include="bool").columns
bool_cols

In [None]:
# Cast every boolean column to int in one pass so True/False become 1/0.
model_data = model_data.astype({name: int for name in bool_cols})
model_data.head()

In [None]:
# XGBoost expects the predicted (label) column first, so put 'y_yes' at position 0.
# 'y_no' is dropped — presumably it is the complement of 'y_yes'; verify against the data.
model_data = model_data.drop(columns=['y_no'])
predict_col = model_data['y_yes']
feature_cols = [name for name in model_data.columns if name != 'y_yes']
model_data = model_data[['y_yes'] + feature_cols]
model_data.head()

Split the data for training, validation and testing

In [None]:
# Shuffle once with a fixed seed so the split is reproducible, then cut the
# shuffled frame at the 70% and 90% marks: 70% train, 20% validation, 10% test.
shuffled_data = model_data.sample(frac=1, random_state=1729)
train_end = int(0.7 * len(shuffled_data))
val_end = int(0.9 * len(shuffled_data))

# Positional slicing is used instead of np.split(DataFrame, ...): np.split goes
# through DataFrame.swapaxes, which pandas has deprecated. The resulting rows
# are the same.
train_data = shuffled_data.iloc[:train_end]
val_data = shuffled_data.iloc[train_end:val_end]
test_data = shuffled_data.iloc[val_end:]
print(train_data.shape,val_data.shape, test_data.shape)

Upload the data to S3 bucket for training

In [None]:
# Key prefix inside the default bucket for this module's data; the training
# cell also writes its model output under this prefix.
s3_path = "ml-workshop-module4"

# Local staging directories, one per split.
train_path = base_dest + "train"
val_path = base_dest + "validation"
test_path = base_dest + "test"

boto3.set_stream_logger("boto3.resources", boto3.logging.INFO)
boto3.setup_default_session(region_name=region)

# exist_ok=True keeps this cell re-runnable: a directory that already exists
# must not stop the remaining directories from being created.
for local_dir in (train_path, val_path, test_path):
    os.makedirs(local_dir, exist_ok=True)

# Write each split as CSV without the index and without a header row.
train_data.to_csv(train_path + "/train.csv", index=False, header=False)
val_data.to_csv(val_path + "/validation.csv", index=False, header=False)
test_data.to_csv(test_path + "/test.csv", index=False, header=False)

# Upload each split to s3://<bucket>/<s3_path>/<split>/<split>.csv.
# Upload errors are deliberately not caught: training reads these objects, so a
# failed upload has to stop the notebook here rather than surface later as a
# confusing training failure or a run on stale data.
data_bucket = boto3.Session().resource('s3').Bucket(bucket)
for split_name, local_dir in (
    ("train", train_path),
    ("validation", val_path),
    ("test", test_path),
):
    object_key = f"{s3_path}/{split_name}/{split_name}.csv"
    data_bucket.Object(object_key).upload_file(f"{local_dir}/{split_name}.csv")

# Training input channels pointing at the uploaded S3 prefixes.
s3_input_train = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/train'.format(bucket, s3_path), content_type='csv')
s3_input_val = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/validation'.format(bucket, s3_path), content_type='csv')
s3_input_test = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/test'.format(bucket, s3_path), content_type='csv')

# Model training

In [None]:

# S3 location that is handed to the estimator as its output path.
output_location = 's3://{}/{}/output'.format(bucket, s3_path)

# Estimator for a single ml.m4.xlarge instance running the XGBoost container.
xgb = sagemaker.estimator.Estimator(
    image_uri=xgboost_container,
    role=role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path=output_location,
    sagemaker_session=session,
)

# Binary classification with a logistic objective and 100 boosting rounds.
hyperparameters = dict(
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    silent=0,
    objective='binary:logistic',
    num_round=100,
)
xgb.set_hyperparameters(**hyperparameters)

# Train on the "train" channel and evaluate on the "validation" channel.
xgb.fit({'train': s3_input_train, "validation": s3_input_val})


In [None]:
# Fetch the description of the latest training job run by the `xgb` estimator
# and display it.
job_details = xgb.latest_training_job.describe()
job_details

In [None]:
# S3 location of the model artifacts as reported in the training job
# description; it is used as the model data URL when the model is registered.
model_s3 = job_details['ModelArtifacts']['S3ModelArtifacts']
model_s3

In [None]:
# Create the model package group that will hold the registered model versions.
# TODO - what are we calling this package
# The current epoch time (rounded to seconds) is appended to the group name.
model_package_group_name = "module4-" + str(round(time.time()))
model_package_group_input_dict = dict(
    ModelPackageGroupName=model_package_group_name,
    ModelPackageGroupDescription="Module 4 model package group",
)

# Low-level SageMaker client; also used by the model registration cell.
registry = boto3.client('sagemaker')

create_model_package_group_response = registry.create_model_package_group(
    **model_package_group_input_dict
)
model_package_group_arn = create_model_package_group_response['ModelPackageGroupArn']
print('ModelPackageGroup Arn : {}'.format(model_package_group_arn))


# Register trained model
Register the trained model in the Model Registry.

In [None]:
# Register the trained model as a new version in the model package group.
model_url = model_s3

# Inference specification: the XGBoost container image plus the trained model
# artifact, with CSV as both request and response content type.
inference_container = {
    "Image": xgboost_container,
    "ModelDataUrl": model_url,
}
modelpackage_inference_specification = {
    "InferenceSpecification": {
        "Containers": [inference_container],
        "SupportedContentTypes": ["text/csv"],
        "SupportedResponseMIMETypes": ["text/csv"],
    }
}

# The version is created with status "PendingManualApproval".
create_model_package_input_dict = {
    "ModelPackageGroupName": model_package_group_name,
    "ModelPackageDescription": "Bank loan default",
    "ModelApprovalStatus": "PendingManualApproval",
    **modelpackage_inference_specification,
}

create_model_package_response = registry.create_model_package(
    **create_model_package_input_dict
)
model_package_arn = create_model_package_response["ModelPackageArn"]
print('ModelPackage Version ARN : {}'.format(model_package_arn))