### Import the libraries

In [1]:
import boto3
import pandas as pd
import awswrangler as wr
from io import StringIO
from sagemaker import image_uris, TrainingInput
from sagemaker.estimator import Estimator
import datetime
import boto3
import awswrangler as wr
import pandas as pd
import sagemaker
from sagemaker.session import Session
from sagemaker.model_monitor import DataCaptureConfig
from sagemaker.model_monitor.dataset_format import DatasetFormat
from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner
from sagemaker.feature_store.feature_group import FeatureGroup
from io import StringIO
import datetime

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [2]:
%store

Stored variables and their in-db values:
bucket_name                            -> 'housing-dataset-2111'
set_up_dependencies_passed             -> True
set_up_s3_bucket_passed                -> True


In [3]:
# get bucket_name
%store -r bucket_name
print(bucket_name)

housing-dataset-2111


In [4]:
# save Amazon information
account_id = boto3.client("sts").get_caller_identity().get("Account")
region = boto3.Session().region_name
role = sagemaker.get_execution_role()
sagemaker_session = sagemaker.Session()
s3 = boto3.client('s3', region_name=sagemaker_session.boto_region_name)

In [5]:
boto_session = boto3.Session(region_name=region)
sagemaker_client = boto_session.client(service_name='sagemaker', region_name=region)
featurestore_runtime = boto_session.client(service_name='sagemaker-featurestore-runtime', region_name=region)

feature_store_session = Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime
)

### Construct Athena queries to read the data from each offline feature store

In [6]:
train_feature_group_name = "housing_train"
validation_feature_group_name = "housing_validation"
test_feature_group_name = "housing_test"

In [7]:
# set database name and table name
database_name = "housing"
table_name = "data"

In [8]:
sagemaker_client.list_feature_groups()

{'FeatureGroupSummaries': [{'FeatureGroupName': 'housing_validation',
   'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:453322325373:feature-group/housing_validation',
   'CreationTime': datetime.datetime(2024, 6, 20, 5, 3, 25, 47000, tzinfo=tzlocal()),
   'FeatureGroupStatus': 'Created',
   'OfflineStoreStatus': {'Status': 'Active'}},
  {'FeatureGroupName': 'housing_train',
   'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:453322325373:feature-group/housing_train',
   'CreationTime': datetime.datetime(2024, 6, 20, 5, 2, 37, 852000, tzinfo=tzlocal()),
   'FeatureGroupStatus': 'Created',
   'OfflineStoreStatus': {'Status': 'Active'}},
  {'FeatureGroupName': 'housing_test',
   'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:453322325373:feature-group/housing_test',
   'CreationTime': datetime.datetime(2024, 6, 20, 5, 3, 46, 580000, tzinfo=tzlocal()),
   'FeatureGroupStatus': 'Created',
   'OfflineStoreStatus': {'Status': 'Active'}}],
 'ResponseMetadata': {'RequestId': 'ae009726-3b6d-49

In [9]:
# Initialize FeatureGroup
train_feature_group = FeatureGroup(name=train_feature_group_name, sagemaker_session=sagemaker_session)

In [10]:
# Initialize FeatureGroup
validation_feature_group = FeatureGroup(name=validation_feature_group_name, sagemaker_session=sagemaker_session)

In [11]:
# Initialize FeatureGroup
test_feature_group = FeatureGroup(name=test_feature_group_name, sagemaker_session=sagemaker_session)

In [12]:
train_data_query = train_feature_group.athena_query()
validation_data_query = validation_feature_group.athena_query()
test_data_query = test_feature_group.athena_query()

In [13]:
train_data_table = train_data_query.table_name
validation_data_table = validation_data_query.table_name
test_data_table = test_data_query.table_name

In [14]:
train_query = f"""
SELECT * FROM "{train_data_table}"
"""

In [15]:
validation_query = f"""
SELECT * FROM "{validation_data_table}"
"""

In [16]:
test_query = f"""
SELECT * FROM "{test_data_table}"
"""

### Execute queries and put into dataframes

In [17]:
# run Athena query. The output is loaded to a Pandas dataframe.
df_train = pd.DataFrame()
train_data_query.run(query_string=train_query, output_location='s3://'+bucket_name+'/query_results/train/')
train_data_query.wait()
df_train = train_data_query.as_dataframe()

In [18]:
# run Athena query. The output is loaded to a Pandas dataframe.
df_validation = pd.DataFrame()
validation_data_query.run(query_string=validation_query, output_location='s3://'+bucket_name+'/query_results/validation/')
validation_data_query.wait()
df_validation = validation_data_query.as_dataframe()

In [19]:
# run Athena query. The output is loaded to a Pandas dataframe.
df_test = pd.DataFrame()
test_data_query.run(query_string=test_query, output_location='s3://'+bucket_name+'/query_results/test/')
test_data_query.wait()
df_test = test_data_query.as_dataframe()

### For df_train and df_validation the "id" column is removed and for df_test both the "id" column and "saleprice" column are removed

In [20]:
df_train = df_train.drop(['id', 'event_time', 'write_time','api_invocation_time','is_deleted'], axis=1)
df_validation = df_validation.drop(['id', 'event_time', 'write_time','api_invocation_time','is_deleted'], axis=1)
df_test = df_test.drop(['id', 'saleprice', 'event_time', 'write_time','api_invocation_time','is_deleted'], axis=1)

### Check the shape of the data

In [21]:
print(df_train.shape)
print(df_validation.shape)
print(df_test.shape)

(2043, 187)
(584, 187)
(292, 186)


In [22]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2043 entries, 0 to 2042
Columns: 187 entries, mssubclass to salecondition_partial
dtypes: float64(187)
memory usage: 2.9 MB


### Check the first few rows of the data

In [23]:
df_train.head(5)

Unnamed: 0,mssubclass,lotarea,yearbuilt,yearremodadd,masvnrarea,bsmtfinsf1,bsmtfinsf2,bsmtunfsf,totalbsmtsf,firstflrsf,...,saletype_conlw,saletype_new,saletype_oth,saletype_wd,salecondition_abnorml,salecondition_adjland,salecondition_alloca,salecondition_family,salecondition_normal,salecondition_partial
0,20.0,7180.0,1972.0,1972.0,0.0,390.0,0.0,474.0,864.0,864.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,20.0,8450.0,2000.0,2001.0,108.0,0.0,0.0,1349.0,1349.0,1349.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,60.0,13650.0,2002.0,2002.0,232.0,578.0,0.0,1093.0,1671.0,1687.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,20.0,14694.0,1977.0,2008.0,450.0,1252.0,136.0,306.0,1694.0,1694.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,60.0,9720.0,2001.0,2002.0,134.0,1194.0,0.0,163.0,1357.0,1366.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [24]:
df_validation.head(5)

Unnamed: 0,mssubclass,lotarea,yearbuilt,yearremodadd,masvnrarea,bsmtfinsf1,bsmtfinsf2,bsmtunfsf,totalbsmtsf,firstflrsf,...,saletype_conlw,saletype_new,saletype_oth,saletype_wd,salecondition_abnorml,salecondition_adjland,salecondition_alloca,salecondition_family,salecondition_normal,salecondition_partial
0,60.0,10261.0,2000.0,2000.0,318.0,0.0,0.0,936.0,936.0,962.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,60.0,9317.0,1993.0,1993.0,137.0,513.0,0.0,227.0,740.0,1006.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,20.0,13052.0,1965.0,1965.0,0.0,712.0,0.0,312.0,1024.0,1024.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,80.0,9350.0,1965.0,1999.0,0.0,633.0,0.0,586.0,1219.0,1265.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,20.0,10859.0,1994.0,1995.0,0.0,0.0,0.0,1097.0,1097.0,1097.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [25]:
df_test.head(5)

Unnamed: 0,mssubclass,lotarea,yearbuilt,yearremodadd,masvnrarea,bsmtfinsf1,bsmtfinsf2,bsmtunfsf,totalbsmtsf,firstflrsf,...,saletype_conlw,saletype_new,saletype_oth,saletype_wd,salecondition_abnorml,salecondition_adjland,salecondition_alloca,salecondition_family,salecondition_normal,salecondition_partial
0,60.0,9316.0,2005.0,2005.0,532.0,0.0,0.0,784.0,784.0,784.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,40.0,6854.0,1925.0,1994.0,0.0,317.0,227.0,212.0,756.0,916.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,90.0,10530.0,1977.0,1977.0,90.0,975.0,0.0,0.0,975.0,1004.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,20.0,20693.0,1971.0,1971.0,652.0,434.0,0.0,1262.0,1696.0,1696.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,120.0,8013.0,1995.0,1996.0,0.0,741.0,0.0,846.0,1587.0,1734.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [26]:
df_train['saleprice'].head(2)

0    127000.0
1    179000.0
Name: saleprice, dtype: float64

In [27]:
df_validation['saleprice'].head(2)

0    186500.0
1    213000.0
Name: saleprice, dtype: float64

### Save the data to the S3 bucket as CSV files

In [28]:
df_train.to_csv(f"s3://{bucket_name}/processed/housing/train.csv", header=False,  index=False)

In [29]:
df_validation.to_csv(f"s3://{bucket_name}/processed/housing/validation.csv", header=False, index=False)

In [30]:
df_test.to_csv(f"s3://{bucket_name}/processed/housing/test.csv", header=False, index=False)

### Train a model using SageMaker built-in XgBoost algorithm on the training data and validate it on the validation data
1. set the container image for the XGBoost algorithm
2. set the output path
3. set the hyperparameters
4. create an estimator
5. fit the model to the training data and validate on the validation data

In [31]:
# set the container image for the XGBoost algorithm
container = image_uris.retrieve(region=region, framework="xgboost", version="latest")

In [32]:
# set the output path
output_path = f"s3://{bucket_name}/housing/model"

In [33]:
# set the hyperparameters
hyperparameters = {
    "objective": "reg:linear",
    "num_round": "100",
    "max_depth": "5",
    "eta": "0.2",
    "gamma": "4",
    "min_child_weight": "6",
    "subsample": "0.7",
    "silent": "0",
}

In [34]:
# create an estimator
estimator = Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    output_path=output_path,
    hyperparameters=hyperparameters,
)

In [35]:
# fit the model to the training data and validate on the validation data
estimator.fit(
    {
        "train": TrainingInput(
            s3_data=f"s3://{bucket_name}/processed/housing/train.csv", content_type="text/csv"
        ),
        "validation": TrainingInput(
            s3_data=f"s3://{bucket_name}/processed/housing/validation.csv", content_type="text/csv"
        ),
    }
)

INFO:sagemaker:Creating training-job with name: xgboost-2024-06-20-05-22-57-019


2024-06-20 05:22:57 Starting - Starting the training job...
2024-06-20 05:23:11 Starting - Preparing the instances for training...
2024-06-20 05:23:38 Downloading - Downloading input data...
2024-06-20 05:24:03 Downloading - Downloading the training image...
2024-06-20 05:24:49 Training - Training image download completed. Training in progress.
2024-06-20 05:24:49 Uploading - Uploading generated training model[34mArguments: train[0m
[34m[2024-06-20:05:24:42:INFO] Running standalone xgboost training.[0m
[34m[2024-06-20:05:24:42:INFO] File size need to be processed in the node: 2.01mb. Available memory size in the node: 7985.26mb[0m
[34m[2024-06-20:05:24:42:INFO] Determined delimiter of CSV input is ','[0m
[34m[05:24:42] S3DistributionType set as FullyReplicated[0m
[34m[05:24:42] 2043x186 matrix with 379998 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2024-06-20:05:24:42:INFO] Determined delimiter of CSV input is ','[0m
[34m[05

### Upload the Sagemaker Model created during our training job to the Sagemaker Model Registry

In [36]:
# Get the current date and time
current_datetime = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
# Update the model_name variable
model_name = f"sagemaker-xgboost-{current_datetime}"
print(model_name)

sagemaker-xgboost-2024-06-20-05-25-43


In [37]:
# Saving training job information to be used in the ML lineage module
training_job_info = estimator.latest_training_job.describe()
if training_job_info != None:
    
    # Get the model data
    model_data = training_job_info["ModelArtifacts"]["S3ModelArtifacts"]
    # Create the primary container
    primary_container = {"Image": container, "ModelDataUrl": model_data}

    # Save our model to the Sagemaker Model Registry
    create_model_response = sagemaker_client.create_model(
        ModelName=model_name, ExecutionRoleArn=role, PrimaryContainer=primary_container
    )
    print(create_model_response["ModelArn"])

arn:aws:sagemaker:us-east-1:453322325373:model/sagemaker-xgboost-2024-06-20-05-25-43


In [38]:
# Inspect Training Job Details
training_job_info

{'TrainingJobName': 'xgboost-2024-06-20-05-22-57-019',
 'TrainingJobArn': 'arn:aws:sagemaker:us-east-1:453322325373:training-job/xgboost-2024-06-20-05-22-57-019',
 'ModelArtifacts': {'S3ModelArtifacts': 's3://housing-dataset-2111/housing/model/xgboost-2024-06-20-05-22-57-019/output/model.tar.gz'},
 'TrainingJobStatus': 'Completed',
 'SecondaryStatus': 'Completed',
 'HyperParameters': {'eta': '0.2',
  'gamma': '4',
  'max_depth': '5',
  'min_child_weight': '6',
  'num_round': '100',
  'objective': 'reg:linear',
  'silent': '0',
  'subsample': '0.7'},
 'AlgorithmSpecification': {'TrainingImage': '811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest',
  'TrainingInputMode': 'File',
  'MetricDefinitions': [{'Name': 'train:mae',
    'Regex': '.*\\[[0-9]+\\].*#011train-mae:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'},
   {'Name': 'train:merror',
    'Regex': '.*\\[[0-9]+\\].*#011train-merror:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'},
   {'Name': 'validation:mae',
    'Regex':

### Host the trained XGBoost model as a SageMaker Endpoint

In [39]:
# deploy the model to an endpoint
predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.xlarge",
    endpoint_name="housing-endpoint",
    wait=True,
)

INFO:sagemaker:Creating model with name: xgboost-2024-06-20-05-25-48-140
INFO:sagemaker:Creating endpoint-config with name housing-endpoint
INFO:sagemaker:Creating endpoint with name housing-endpoint


-----!

In [40]:
print(predictor.endpoint_name)

housing-endpoint


### Real time inference using the deployed endpoint

In [41]:
# make predictions using the endpoint
csv_data = df_test.to_csv(header=False, index=False)
predictions = predictor.predict(csv_data, initial_args={"ContentType": "text/csv"}).decode("utf-8")
predictions = pd.read_csv(StringIO(predictions), header=None)
predictions.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,282,283,284,285,286,287,288,289,290,291
0,60.520779,28.841291,94.256477,65.185905,112.786423,34.442169,158.89772,54.58456,72.164024,57.834579,...,27.607943,93.912361,43.226112,58.262253,56.814648,27.778515,26.963947,57.534649,50.460724,159.81073


### Delete the endpoint

In [42]:
# delete the endpoint
predictor.delete_endpoint()

INFO:sagemaker:Deleting endpoint configuration with name: housing-endpoint
INFO:sagemaker:Deleting endpoint with name: housing-endpoint


### Shut down notebook resources

In [43]:
%%javascript

try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}

<IPython.core.display.Javascript object>

In [44]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>