### Import the libraries

In [2]:
import boto3
import pandas as pd
import awswrangler as wr
from io import StringIO
from sagemaker import image_uris, TrainingInput
from sagemaker.estimator import Estimator
import datetime
import boto3
import awswrangler as wr
import pandas as pd
import sagemaker
from sagemaker.session import Session
from sagemaker.model_monitor import DataCaptureConfig
from sagemaker.model_monitor.dataset_format import DatasetFormat
from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner
from sagemaker.feature_store.feature_group import FeatureGroup
from io import StringIO
import datetime

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [3]:
%store

Stored variables and their in-db values:
bucket_name                            -> 'housing-dataset-5435xx'
set_up_dependencies_passed             -> True
set_up_s3_bucket_passed                -> True


In [4]:
# get bucket_name
%store -r bucket_name
print(bucket_name)

housing-dataset-5435xx


In [5]:
# save Amazon information
account_id = boto3.client("sts").get_caller_identity().get("Account")
region = boto3.Session().region_name
role = sagemaker.get_execution_role()
sagemaker_session = sagemaker.Session()
s3 = boto3.client('s3', region_name=sagemaker_session.boto_region_name)

In [6]:
boto_session = boto3.Session(region_name=region)
sagemaker_client = boto_session.client(service_name='sagemaker', region_name=region)
featurestore_runtime = boto_session.client(service_name='sagemaker-featurestore-runtime', region_name=region)

feature_store_session = Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime
)

### Construct Athena queries to read the data from each offline feature store

In [7]:
train_feature_group_name = "housing_train"
validation_feature_group_name = "housing_validation"
test_feature_group_name = "housing_test"

In [8]:
# set database name and table name
database_name = "housing"
table_name = "data"

In [9]:
sagemaker_client.list_feature_groups()

{'FeatureGroupSummaries': [{'FeatureGroupName': 'housing_validation',
   'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:952054755114:feature-group/housing_validation',
   'CreationTime': datetime.datetime(2024, 6, 19, 5, 43, 46, 915000, tzinfo=tzlocal()),
   'FeatureGroupStatus': 'Created',
   'OfflineStoreStatus': {'Status': 'Active'}},
  {'FeatureGroupName': 'housing_train',
   'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:952054755114:feature-group/housing_train',
   'CreationTime': datetime.datetime(2024, 6, 19, 5, 43, 25, 527000, tzinfo=tzlocal()),
   'FeatureGroupStatus': 'Created',
   'OfflineStoreStatus': {'Status': 'Active'}},
  {'FeatureGroupName': 'housing_test',
   'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:952054755114:feature-group/housing_test',
   'CreationTime': datetime.datetime(2024, 6, 19, 5, 44, 8, 358000, tzinfo=tzlocal()),
   'FeatureGroupStatus': 'Created',
   'OfflineStoreStatus': {'Status': 'Active'}}],
 'ResponseMetadata': {'RequestId': '93c0012d-eac1

In [10]:
# Initialize FeatureGroup
train_feature_group = FeatureGroup(name=train_feature_group_name, sagemaker_session=sagemaker_session)

In [11]:
# Initialize FeatureGroup
validation_feature_group = FeatureGroup(name=validation_feature_group_name, sagemaker_session=sagemaker_session)

In [12]:
# Initialize FeatureGroup
test_feature_group = FeatureGroup(name=test_feature_group_name, sagemaker_session=sagemaker_session)

In [13]:
train_data_query = train_feature_group.athena_query()
validation_data_query = validation_feature_group.athena_query()
test_data_query = test_feature_group.athena_query()

In [14]:
train_data_table = train_data_query.table_name
validation_data_table = validation_data_query.table_name
test_data_table = test_data_query.table_name

In [15]:
train_query = f"""
SELECT * FROM "{train_data_table}"
"""

In [16]:
validation_query = f"""
SELECT * FROM "{validation_data_table}"
"""

In [17]:
test_query = f"""
SELECT * FROM "{test_data_table}"
"""

### Execute queries and put into dataframes

In [18]:
# run Athena query. The output is loaded to a Pandas dataframe.
df_train = pd.DataFrame()
train_data_query.run(query_string=train_query, output_location='s3://'+bucket_name+'/query_results/train/')
train_data_query.wait()
df_train = train_data_query.as_dataframe()

In [19]:
# run Athena query. The output is loaded to a Pandas dataframe.
df_validation = pd.DataFrame()
validation_data_query.run(query_string=validation_query, output_location='s3://'+bucket_name+'/query_results/validation/')
validation_data_query.wait()
df_validation = validation_data_query.as_dataframe()

In [20]:
# run Athena query. The output is loaded to a Pandas dataframe.
df_test = pd.DataFrame()
test_data_query.run(query_string=test_query, output_location='s3://'+bucket_name+'/query_results/test/')
test_data_query.wait()
df_test = test_data_query.as_dataframe()

In [21]:
df_test.head(3)

Unnamed: 0,id,mssubclass,lotarea,yearbuilt,yearremodadd,masvnrarea,bsmtfinsf1,bsmtfinsf2,bsmtunfsf,totalbsmtsf,...,poolarea,miscval,mosold,yrsold,saleprice,logsaleprice,event_time,write_time,api_invocation_time,is_deleted
0,881.0,20.0,7024.0,2005.0,2006.0,0.0,980.0,0.0,110.0,1090.0,...,0.0,0.0,6.0,2007.0,157000.0,11.964001,1718776000.0,2024-06-19 05:50:59.640,2024-06-19 05:44:40.000,False
1,104.0,20.0,10402.0,2009.0,2009.0,0.0,0.0,0.0,1226.0,1226.0,...,0.0,0.0,5.0,2010.0,198900.0,12.200558,1718776000.0,2024-06-19 05:50:59.640,2024-06-19 05:44:40.000,False
2,2757.0,30.0,10914.0,1929.0,1950.0,0.0,0.0,0.0,715.0,715.0,...,0.0,0.0,8.0,2006.0,163000.0,12.001506,1718776000.0,2024-06-19 05:50:59.640,2024-06-19 05:44:40.000,False


In [22]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 584 entries, 0 to 583
Data columns (total 40 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   584 non-null    float64
 1   mssubclass           584 non-null    float64
 2   lotarea              584 non-null    float64
 3   yearbuilt            584 non-null    float64
 4   yearremodadd         584 non-null    float64
 5   masvnrarea           584 non-null    float64
 6   bsmtfinsf1           584 non-null    float64
 7   bsmtfinsf2           584 non-null    float64
 8   bsmtunfsf            584 non-null    float64
 9   totalbsmtsf          584 non-null    float64
 10  firstflrsf           584 non-null    float64
 11  secondflrsf          584 non-null    float64
 12  lowqualfinsf         584 non-null    float64
 13  grlivarea            584 non-null    float64
 14  bsmtfullbath         584 non-null    float64
 15  bsmthalfbath         584 non-null    flo

### For df_train and df_validation the "id" column is removed and for df_test both the "id" column and "saleprice" column are removed

In [23]:
# Check if 'saleprice' column is in df_train
is_saleprice_in_train = 'saleprice' in df_train.columns
print(f"'saleprice' in df_train: {is_saleprice_in_train}")

# Check if 'saleprice' column is in df_validation
is_saleprice_in_validation = 'saleprice' in df_validation.columns
print(f"'saleprice' in df_validation: {is_saleprice_in_validation}")

# Check if 'saleprice' column is in df_validation
is_saleprice_in_test = 'saleprice' in df_test.columns
print(f"'saleprice' in df_test: {is_saleprice_in_test}")

'saleprice' in df_train: True
'saleprice' in df_validation: True
'saleprice' in df_test: True


In [24]:
df_train = df_train.drop(['id', 'event_time', 'write_time','api_invocation_time','is_deleted'], axis=1)
df_validation = df_validation.drop(['id', 'event_time', 'write_time','api_invocation_time','is_deleted'], axis=1)
df_test = df_test.drop(['id', 'saleprice', 'event_time', 'write_time','api_invocation_time','is_deleted'], axis=1)

### Check the shape of the data

In [25]:
print(df_train.shape)
print(df_validation.shape)
print(df_test.shape)

(4086, 35)
(1168, 35)
(584, 34)


In [26]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4086 entries, 0 to 4085
Data columns (total 35 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   mssubclass     4086 non-null   float64
 1   lotarea        4086 non-null   float64
 2   yearbuilt      4086 non-null   float64
 3   yearremodadd   4086 non-null   float64
 4   masvnrarea     4086 non-null   float64
 5   bsmtfinsf1     4086 non-null   float64
 6   bsmtfinsf2     4086 non-null   float64
 7   bsmtunfsf      4086 non-null   float64
 8   totalbsmtsf    4086 non-null   float64
 9   firstflrsf     4086 non-null   float64
 10  secondflrsf    4086 non-null   float64
 11  lowqualfinsf   4086 non-null   float64
 12  grlivarea      4086 non-null   float64
 13  bsmtfullbath   4086 non-null   float64
 14  bsmthalfbath   4086 non-null   float64
 15  fullbath       4086 non-null   float64
 16  halfbath       4086 non-null   float64
 17  bedroomabvgr   4086 non-null   float64
 18  kitchena

### Check the first few rows of the data

In [27]:
print(df_train.head(5))

   mssubclass  lotarea  yearbuilt  yearremodadd  masvnrarea  bsmtfinsf1  \
0        60.0   9600.0     1971.0        1971.0         0.0       329.0   
1        20.0  10682.0     1960.0        1971.0         0.0       399.0   
2        80.0  11512.0     1959.0        2006.0        84.0       719.0   
3        20.0  12537.0     1971.0        2008.0         0.0       734.0   
4        20.0  11027.0     1954.0        1954.0        28.0       468.0   

   bsmtfinsf2  bsmtunfsf  totalbsmtsf  firstflrsf  ...  openporchsf  \
0         0.0      386.0        715.0       930.0  ...         78.0   
1         0.0      615.0       1014.0      1149.0  ...        240.0   
2         0.0      300.0       1019.0      1357.0  ...          0.0   
3         0.0      344.0       1078.0      1078.0  ...          0.0   
4       539.0      171.0       1178.0      1293.0  ...          0.0   

   enclosedporch  threessnporch  screenporch  poolarea  miscval  mosold  \
0            0.0            0.0          0.0   

In [28]:
df_validation.head(5)

Unnamed: 0,mssubclass,lotarea,yearbuilt,yearremodadd,masvnrarea,bsmtfinsf1,bsmtfinsf2,bsmtunfsf,totalbsmtsf,firstflrsf,...,openporchsf,enclosedporch,threessnporch,screenporch,poolarea,miscval,mosold,yrsold,saleprice,logsaleprice
0,60.0,8396.0,2003.0,2003.0,196.0,0.0,0.0,847.0,847.0,847.0,...,48.0,0.0,0.0,0.0,0.0,0.0,5.0,2008.0,163000.0,12.001506
1,60.0,11287.0,1989.0,1989.0,340.0,421.0,0.0,386.0,807.0,1175.0,...,84.0,0.0,196.0,0.0,0.0,0.0,1.0,2007.0,228500.0,12.339292
2,60.0,53107.0,1992.0,1992.0,0.0,985.0,0.0,595.0,1580.0,1079.0,...,231.0,0.0,0.0,0.0,0.0,0.0,6.0,2007.0,240000.0,12.388394
3,20.0,10400.0,1988.0,1988.0,102.0,929.0,0.0,916.0,1845.0,1872.0,...,39.0,0.0,0.0,0.0,0.0,0.0,6.0,2006.0,241500.0,12.394625
4,50.0,12392.0,1950.0,2000.0,0.0,435.0,0.0,397.0,832.0,1218.0,...,96.0,0.0,0.0,0.0,0.0,0.0,4.0,2008.0,163000.0,12.001506


In [29]:
df_test.head(5)

Unnamed: 0,mssubclass,lotarea,yearbuilt,yearremodadd,masvnrarea,bsmtfinsf1,bsmtfinsf2,bsmtunfsf,totalbsmtsf,firstflrsf,...,wooddecksf,openporchsf,enclosedporch,threessnporch,screenporch,poolarea,miscval,mosold,yrsold,logsaleprice
0,20.0,7024.0,2005.0,2006.0,0.0,980.0,0.0,110.0,1090.0,1090.0,...,0.0,49.0,0.0,0.0,0.0,0.0,0.0,6.0,2007.0,11.964001
1,20.0,10402.0,2009.0,2009.0,0.0,0.0,0.0,1226.0,1226.0,1226.0,...,0.0,36.0,0.0,0.0,0.0,0.0,0.0,5.0,2010.0,12.200558
2,30.0,10914.0,1929.0,1950.0,0.0,0.0,0.0,715.0,715.0,715.0,...,0.0,0.0,75.0,0.0,112.0,0.0,0.0,8.0,2006.0,12.001506
3,50.0,6000.0,1929.0,1950.0,0.0,0.0,0.0,862.0,862.0,950.0,...,0.0,0.0,112.0,0.0,0.0,0.0,0.0,7.0,2008.0,11.695247
4,30.0,7392.0,1930.0,1995.0,0.0,0.0,0.0,520.0,520.0,912.0,...,0.0,90.0,0.0,0.0,0.0,0.0,0.0,5.0,2008.0,12.001506


### Save the data to the S3 bucket as CSV files

In [30]:
'''
!pip install botocore==1.34.129 s3transfer==0.3.0 boto3==1.34.129
!pip install s3fs
'''

'\n!pip install botocore==1.34.129 s3transfer==0.3.0 boto3==1.34.129\n!pip install s3fs\n'

In [31]:
df_train.to_csv(f"s3://{bucket_name}/processed/housing/train.csv", header=False,  index=False)

In [32]:
df_validation.to_csv(f"s3://{bucket_name}/processed/housing/validation.csv", header=False, index=False)

In [33]:
df_test.to_csv(f"s3://{bucket_name}/processed/housing/test.csv", header=False, index=False)

### Train a model using SageMaker built-in XgBoost algorithm on the training data and validate it on the validation data
1. set the container image for the XGBoost algorithm
2. set the output path
3. set the hyperparameters
4. create an estimator
5. fit the model to the training data and validate on the validation data

In [34]:
# set the container image for the XGBoost algorithm
container = image_uris.retrieve(region=region, framework="xgboost", version="latest")

In [35]:
# set the output path
output_path = f"s3://{bucket_name}/housing/model"

In [36]:
# set the hyperparameters
hyperparameters = {
    "objective": "reg:linear",
    "num_round": "100",
    "max_depth": "5",
    "eta": "0.2",
    "gamma": "4",
    "min_child_weight": "6",
    "subsample": "0.7",
    "silent": "0",
}

In [37]:
# create an estimator
estimator = Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    output_path=output_path,
    hyperparameters=hyperparameters,
)

In [38]:
# fit the model to the training data and validate on the validation data
estimator.fit(
    {
        "train": TrainingInput(
            s3_data=f"s3://{bucket_name}/processed/housing/train.csv", content_type="text/csv"
        ),
        "validation": TrainingInput(
            s3_data=f"s3://{bucket_name}/processed/housing/validation.csv", content_type="text/csv"
        ),
    }
)

INFO:sagemaker:Creating training-job with name: xgboost-2024-06-20-02-20-59-393


2024-06-20 02:20:59 Starting - Starting the training job...
2024-06-20 02:21:15 Starting - Preparing the instances for training...
2024-06-20 02:21:42 Downloading - Downloading input data...
2024-06-20 02:22:07 Downloading - Downloading the training image...
2024-06-20 02:22:53 Training - Training image download completed. Training in progress.
2024-06-20 02:22:53 Uploading - Uploading generated training model[34mArguments: train[0m
[34m[2024-06-20:02:22:46:INFO] Running standalone xgboost training.[0m
[34m[2024-06-20:02:22:46:INFO] File size need to be processed in the node: 0.97mb. Available memory size in the node: 7988.06mb[0m
[34m[2024-06-20:02:22:46:INFO] Determined delimiter of CSV input is ','[0m
[34m[02:22:46] S3DistributionType set as FullyReplicated[0m
[34m[02:22:46] 4086x34 matrix with 138924 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2024-06-20:02:22:46:INFO] Determined delimiter of CSV input is ','[0m
[34m[02:

### Upload the Sagemaker Model created during our training job to the Sagemaker Model Registry

In [39]:
# Get the current date and time
current_datetime = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
# Update the model_name variable
model_name = f"sagemaker-xgboost-{current_datetime}"
print(model_name)

sagemaker-xgboost-2024-06-20-02-23-41


In [40]:
# Saving training job information to be used in the ML lineage module
training_job_info = estimator.latest_training_job.describe()
if training_job_info != None:
    
    # Get the model data
    model_data = training_job_info["ModelArtifacts"]["S3ModelArtifacts"]
    # Create the primary container
    primary_container = {"Image": container, "ModelDataUrl": model_data}

    # Save our model to the Sagemaker Model Registry
    create_model_response = sagemaker_client.create_model(
        ModelName=model_name, ExecutionRoleArn=role, PrimaryContainer=primary_container
    )
    print(create_model_response["ModelArn"])

arn:aws:sagemaker:us-east-1:952054755114:model/sagemaker-xgboost-2024-06-20-02-23-41


In [41]:
# Inspect Training Job Details
training_job_info

{'TrainingJobName': 'xgboost-2024-06-20-02-20-59-393',
 'TrainingJobArn': 'arn:aws:sagemaker:us-east-1:952054755114:training-job/xgboost-2024-06-20-02-20-59-393',
 'ModelArtifacts': {'S3ModelArtifacts': 's3://housing-dataset-5435xx/housing/model/xgboost-2024-06-20-02-20-59-393/output/model.tar.gz'},
 'TrainingJobStatus': 'Completed',
 'SecondaryStatus': 'Completed',
 'HyperParameters': {'eta': '0.2',
  'gamma': '4',
  'max_depth': '5',
  'min_child_weight': '6',
  'num_round': '100',
  'objective': 'reg:linear',
  'silent': '0',
  'subsample': '0.7'},
 'AlgorithmSpecification': {'TrainingImage': '811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest',
  'TrainingInputMode': 'File',
  'MetricDefinitions': [{'Name': 'train:mae',
    'Regex': '.*\\[[0-9]+\\].*#011train-mae:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'},
   {'Name': 'train:merror',
    'Regex': '.*\\[[0-9]+\\].*#011train-merror:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'},
   {'Name': 'validation:mae',
    'Regex

### Host the trained XGBoost model as a SageMaker Endpoint

In [42]:
# deploy the model to an endpoint
predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.xlarge",
    endpoint_name="housing-endpoint",
    wait=True,
)

INFO:sagemaker:Creating model with name: xgboost-2024-06-20-02-23-42-062
INFO:sagemaker:Creating endpoint-config with name housing-endpoint
INFO:sagemaker:Creating endpoint with name housing-endpoint


-----!

In [43]:
print(predictor.endpoint_name)

housing-endpoint


In [None]:
# Extracting information from training_job_info
model_data = training_job_info['ModelArtifacts']['S3ModelArtifacts']
image_uri = training_job_info['AlgorithmSpecification']['TrainingImage']
baseline_dataset = training_job_info['InputDataConfig'][0]['DataSource']['S3DataSource']['S3Uri']

# Store the variables
%store model_data
%store image_uri
%store baseline_dataset

### Real time inference using the deployed endpoint

In [44]:
# make predictions using the endpoint
csv_data = df_test.to_csv(header=False, index=False)
predictions = predictor.predict(csv_data, initial_args={"ContentType": "text/csv"}).decode("utf-8")
predictions = pd.read_csv(StringIO(predictions), header=None)
predictions.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,574,575,576,577,578,579,580,581,582,583
0,168.810226,142.669815,181.753235,136.428436,173.605499,169.296417,151.233002,152.473083,174.711182,171.090561,...,155.016632,156.93428,87.375931,132.90947,163.423004,141.263245,147.741699,175.709076,138.951355,154.498581


### Delete the endpoint

In [45]:
# delete the endpoint
predictor.delete_endpoint()

INFO:sagemaker:Deleting endpoint configuration with name: housing-endpoint
INFO:sagemaker:Deleting endpoint with name: housing-endpoint


### Shut down notebook resources

In [46]:
%%javascript

try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}

<IPython.core.display.Javascript object>

In [47]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>