https://sagemaker-examples.readthedocs.io/en/latest/sagemaker-featurestore/sagemaker_featurestore_fraud_detection_python_sdk.html

# Setup SageMaker FeatureStore

In [18]:
import boto3
import sagemaker

original_boto3_version = boto3.__version__
%pip install 'boto3>1.17.21

/bin/sh: 1: Syntax error: Unterminated quoted string
Note: you may need to restart the kernel to use updated packages.


In [19]:
from sagemaker.session import Session

# Region configured for the default boto3 session; every client below is pinned to it.
region = boto3.Session().region_name
boto_session = boto3.Session(region_name=region)

# Control-plane client (create/describe/list feature groups).
sagemaker_client = boto_session.client("sagemaker", region_name=region)
# Runtime client used for online-store record access (get_record is called on it later).
featurestore_runtime = boto_session.client(
    "sagemaker-featurestore-runtime",
    region_name=region,
)

# SageMaker SDK session that bundles the boto session with both clients.
feature_store_session = Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime,
)

In [20]:
# You can modify the following to use a bucket of your choosing
default_s3_bucket_name = feature_store_session.default_bucket()
# Key prefix under which this notebook writes the offline store, query results,
# training input and training output.
prefix = "sagemaker-featurestore-demo"

# NOTE(review): the printed bucket name embeds the AWS account ID and ends up in the saved output.
print(default_s3_bucket_name)

sagemaker-us-east-1-229319431800


In [21]:
from sagemaker import get_execution_role

# You can modify the following to use a role of your choosing. See the documentation for how to create this.
# The same role ARN is later passed to the feature group creation calls and to the training estimator.
role = get_execution_role()
# NOTE(review): printing the ARN stores the AWS account ID in the saved notebook output.
print(role)

arn:aws:iam::229319431800:role/service-role/AmazonSageMaker-ExecutionRole-20210804T123219


# Inspect Dataset

The provided dataset is a synthetic dataset with two tables: identity and transactions. They can be joined on the TransactionID column. The transaction table contains information about a particular transaction, such as the amount and whether a credit or debit card was used, while the identity table contains information about the user, such as device type and browser. Every transaction exists in the transaction table, but it might not have a matching row in the identity table.

The objective of the model is to predict if a transaction is fraudulent or not, given the transaction record.

In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import io

s3_client = boto3.client("s3", region_name=region)

# Bucket and object keys of the two sample CSV tables (identity and transactions).
fraud_detection_bucket_name = "sagemaker-sample-files"
identity_file_key = (
    "datasets/tabular/fraud_detection/synthethic_fraud_detection_SA/sampled_identity.csv"
)
transaction_file_key = (
    "datasets/tabular/fraud_detection/synthethic_fraud_detection_SA/sampled_transactions.csv"
)

identity_data_object = s3_client.get_object(
    Bucket=fraud_detection_bucket_name, Key=identity_file_key
)
transaction_data_object = s3_client.get_object(
    Bucket=fraud_detection_bucket_name, Key=transaction_file_key
)

# Read each object fully into memory, round numeric values to 5 decimals and
# replace missing values with 0 (for text columns this produces the value 0 as well,
# which is why the one-hot encoding below yields "card_type_0" / "card_bank_0" columns).
identity_data = (
    pd.read_csv(io.BytesIO(identity_data_object["Body"].read())).round(5).fillna(0)
)
transaction_data = (
    pd.read_csv(io.BytesIO(transaction_data_object["Body"].read())).round(5).fillna(0)
)

# Feature transformations for this dataset are applied before ingestion into FeatureStore.
# One hot encode card4, card6.
# dtype="int64" keeps the indicator columns integer on every pandas version: recent pandas
# releases return boolean dummies by default, and the feature definitions are inferred from
# the column dtypes later on (the recorded schema shows these columns as integers).
encoded_card_bank = pd.get_dummies(transaction_data["card4"], prefix="card_bank", dtype="int64")
encoded_card_type = pd.get_dummies(transaction_data["card6"], prefix="card_type", dtype="int64")

transformed_transaction_data = pd.concat(
    [transaction_data, encoded_card_type, encoded_card_bank], axis=1
)
# blank space is not allowed in feature name
transformed_transaction_data = transformed_transaction_data.rename(
    columns={"card_bank_american express": "card_bank_american_express"}
)

In [23]:
# Preview the first rows of the identity table (after rounding and filling missing values).
identity_data.head()


Unnamed: 0,TransactionID,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,...,id_11,id_12,id_13,id_14,id_15,id_16,id_17,id_18,id_19,id_20
0,2990130,-5,38780.0,0.0,0.0,0.0,-70,0,1,100.0,...,32,80,253,241,260,125,T,F,F,T
1,2990266,-10,69246.0,0.0,0.0,0.0,-67,0,2,100.0,...,47,47,122,33,38,60,T,F,T,F
2,2992553,-45,348819.0,0.0,0.0,0.0,-73,0,0,100.0,...,21,143,268,111,2,135,F,F,T,F
3,2994568,-15,337170.0,0.0,0.0,0.0,-10,1,2,100.0,...,55,127,253,202,135,49,F,F,T,T
4,2994749,-5,680670.0,0.0,0.0,8.0,-1,2,2,100.0,...,52,43,257,7,19,254,F,F,T,T


In [24]:
# Preview the transaction table including the appended one-hot card_type_* / card_bank_* columns.
transformed_transaction_data.head()


Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,card1,card2,card3,card4,card5,card6,...,N8,N9,card_type_0,card_type_credit,card_type_debit,card_bank_0,card_bank_american_express,card_bank_discover,card_bank_mastercard,card_bank_visa
0,3343087,0,8810855,29.0,12469,360.0,150.0,mastercard,126.0,debit,...,F,T,0,0,1,0,0,0,1,0
1,3307318,0,7955295,107.95,16188,178.0,150.0,mastercard,224.0,debit,...,F,T,0,0,1,0,0,0,1,0
2,3555327,0,15084339,159.95,1825,555.0,150.0,visa,226.0,debit,...,T,F,0,0,1,0,0,0,0,1
3,3310736,0,8017157,159.95,10057,225.0,150.0,mastercard,224.0,debit,...,F,F,0,0,1,0,0,0,1,0
4,3034711,0,1127470,117.0,11444,555.0,150.0,visa,226.0,debit,...,F,F,0,0,1,0,0,0,0,1


# Ingest Data into FeatureStore


In [25]:
from time import gmtime, strftime, sleep

# Read the clock once so both feature groups always get the same day-hour-minute-second suffix,
# even if this cell happens to run across a second boundary.
feature_group_suffix = strftime("%d-%H-%M-%S", gmtime())
identity_feature_group_name = f"identity-feature-group-{feature_group_suffix}"
transaction_feature_group_name = f"transaction-feature-group-{feature_group_suffix}"


In [26]:
from sagemaker.feature_store.feature_group import FeatureGroup

# Python-side handles for the two feature groups, bound to the generated names and to the
# Feature Store session; the groups themselves are created later through .create().
identity_feature_group = FeatureGroup(
    name=identity_feature_group_name, sagemaker_session=feature_store_session
)
transaction_feature_group = FeatureGroup(
    name=transaction_feature_group_name, sagemaker_session=feature_store_session
)

In [27]:
import time

# Current Unix time rounded to whole seconds; used below as the EventTime value of every record.
current_time_sec = int(round(time.time()))


def cast_object_to_string(data_frame):
    """Convert every object-dtype column of ``data_frame`` to the pandas ``string`` dtype.

    The frame is modified in place and nothing is returned. Each affected column is
    first cast to ``str`` and then to ``string``; columns of any other dtype are left
    untouched.

    Args:
        data_frame: pandas DataFrame whose object columns should be converted.
    """
    object_columns = [
        column for column, dtype in data_frame.dtypes.items() if dtype == "object"
    ]
    for column in object_columns:
        as_text = data_frame[column].astype("str")
        data_frame[column] = as_text.astype("string")


# cast object dtype to string. The SageMaker FeatureStore Python SDK will then map the string dtype to String feature type.
cast_object_to_string(identity_data)
cast_object_to_string(transformed_transaction_data)

# record identifier and event time feature names
record_identifier_feature_name = "TransactionID"
event_time_feature_name = "EventTime"

# Append the EventTime feature. Broadcasting a float scalar gives every row the same float64
# timestamp without depending on the frame's index or on the length of a different frame.
identity_data[event_time_feature_name] = float(current_time_sec)
transformed_transaction_data[event_time_feature_name] = float(current_time_sec)

# load feature definitions to the feature group. SageMaker FeatureStore Python SDK will auto-detect the data schema based on input data.
# The first call's return value is not displayed because it is not the cell's last expression.
identity_feature_group.load_feature_definitions(data_frame=identity_data)
# Last expression of the cell: the detected transaction feature definitions are displayed.
transaction_feature_group.load_feature_definitions(data_frame=transformed_transaction_data)

[FeatureDefinition(feature_name='TransactionID', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='isFraud', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='TransactionDT', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='TransactionAmt', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>),
 FeatureDefinition(feature_name='card1', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='card2', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>),
 FeatureDefinition(feature_name='card3', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>),
 FeatureDefinition(feature_name='card4', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='card5', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>),
 FeatureDefinition(feature_name='card6', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinit

In [28]:
def wait_for_feature_group_creation_complete(feature_group, poll_interval_sec=5):
    """Block until ``feature_group`` leaves the "Creating" status.

    Args:
        feature_group: object exposing ``describe()`` (returning a dict with a
            "FeatureGroupStatus" entry) and a ``name`` attribute.
        poll_interval_sec: seconds to sleep between two ``describe()`` calls.

    Raises:
        RuntimeError: if the final status is anything other than "Created". The message
            carries the reported status and, when present, the "FailureReason" entry
            of the last describe response.
    """
    description = feature_group.describe()
    status = description.get("FeatureGroupStatus")
    while status == "Creating":
        print("Waiting for Feature Group Creation")
        time.sleep(poll_interval_sec)
        description = feature_group.describe()
        status = description.get("FeatureGroupStatus")
    if status != "Created":
        failure_reason = description.get("FailureReason", "no failure reason reported")
        raise RuntimeError(
            f"Failed to create feature group {feature_group.name}: "
            f"status={status!r}, reason={failure_reason}"
        )
    print(f"FeatureGroup {feature_group.name} successfully created.")


# Both groups use the same offline-store location, key columns, role and online-store setting.
offline_store_s3_uri = f"s3://{default_s3_bucket_name}/{prefix}"
feature_groups = (identity_feature_group, transaction_feature_group)

# Submit both creation requests first, then wait for each group in turn.
for feature_group in feature_groups:
    feature_group.create(
        s3_uri=offline_store_s3_uri,
        record_identifier_name=record_identifier_feature_name,
        event_time_feature_name=event_time_feature_name,
        role_arn=role,
        enable_online_store=True,
    )

for feature_group in feature_groups:
    wait_for_feature_group_creation_complete(feature_group=feature_group)

Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
FeatureGroup identity-feature-group-05-04-01-13 successfully created.
FeatureGroup transaction-feature-group-05-04-01-13 successfully created.


Confirm the FeatureGroup has been created by using the DescribeFeatureGroup and ListFeatureGroups APIs.

In [29]:
# Show the DescribeFeatureGroup response for the identity feature group.
identity_feature_group.describe()


{'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:229319431800:feature-group/identity-feature-group-05-04-01-13',
 'FeatureGroupName': 'identity-feature-group-05-04-01-13',
 'RecordIdentifierFeatureName': 'TransactionID',
 'EventTimeFeatureName': 'EventTime',
 'FeatureDefinitions': [{'FeatureName': 'TransactionID',
   'FeatureType': 'Integral'},
  {'FeatureName': 'id_01', 'FeatureType': 'Integral'},
  {'FeatureName': 'id_02', 'FeatureType': 'Fractional'},
  {'FeatureName': 'id_03', 'FeatureType': 'Fractional'},
  {'FeatureName': 'id_04', 'FeatureType': 'Fractional'},
  {'FeatureName': 'id_05', 'FeatureType': 'Fractional'},
  {'FeatureName': 'id_06', 'FeatureType': 'Integral'},
  {'FeatureName': 'id_07', 'FeatureType': 'Integral'},
  {'FeatureName': 'id_08', 'FeatureType': 'Integral'},
  {'FeatureName': 'id_09', 'FeatureType': 'Fractional'},
  {'FeatureName': 'id_10', 'FeatureType': 'Integral'},
  {'FeatureName': 'id_11', 'FeatureType': 'Integral'},
  {'FeatureName': 'id_12', 'FeatureTyp

In [30]:
# Show the DescribeFeatureGroup response for the transaction feature group.
transaction_feature_group.describe()


{'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:229319431800:feature-group/transaction-feature-group-05-04-01-13',
 'FeatureGroupName': 'transaction-feature-group-05-04-01-13',
 'RecordIdentifierFeatureName': 'TransactionID',
 'EventTimeFeatureName': 'EventTime',
 'FeatureDefinitions': [{'FeatureName': 'TransactionID',
   'FeatureType': 'Integral'},
  {'FeatureName': 'isFraud', 'FeatureType': 'Integral'},
  {'FeatureName': 'TransactionDT', 'FeatureType': 'Integral'},
  {'FeatureName': 'TransactionAmt', 'FeatureType': 'Fractional'},
  {'FeatureName': 'card1', 'FeatureType': 'Integral'},
  {'FeatureName': 'card2', 'FeatureType': 'Fractional'},
  {'FeatureName': 'card3', 'FeatureType': 'Fractional'},
  {'FeatureName': 'card4', 'FeatureType': 'String'},
  {'FeatureName': 'card5', 'FeatureType': 'Fractional'},
  {'FeatureName': 'card6', 'FeatureType': 'String'},
  {'FeatureName': 'B1', 'FeatureType': 'Integral'},
  {'FeatureName': 'B2', 'FeatureType': 'Integral'},
  {'FeatureName': 'B3', '

In [31]:
# Goes through the low-level boto3 SageMaker client instead of the SDK FeatureGroup objects.
# NOTE(review): presumably returns a single page of results — confirm whether pagination matters here.
sagemaker_client.list_feature_groups()  # use boto client to list FeatureGroups


{'FeatureGroupSummaries': [{'FeatureGroupName': 'transaction-feature-group-05-04-01-13',
   'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:229319431800:feature-group/transaction-feature-group-05-04-01-13',
   'CreationTime': datetime.datetime(2021, 8, 5, 4, 1, 39, 408000, tzinfo=tzlocal()),
   'FeatureGroupStatus': 'Created'},
  {'FeatureGroupName': 'identity-feature-group-05-04-01-13',
   'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:229319431800:feature-group/identity-feature-group-05-04-01-13',
   'CreationTime': datetime.datetime(2021, 8, 5, 4, 1, 37, 148000, tzinfo=tzlocal()),
   'FeatureGroupStatus': 'Created'},
  {'FeatureGroupName': 'FG-sample-e55bb4fe',
   'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:229319431800:feature-group/fg-sample-e55bb4fe',
   'CreationTime': datetime.datetime(2021, 8, 5, 3, 34, 18, 947000, tzinfo=tzlocal()),
   'FeatureGroupStatus': 'Created'}],
 'ResponseMetadata': {'RequestId': 'c7b24463-ece0-4538-a3d4-635c05bb8ea8',
  'HTTPStatusCode': 200,
  

After the FeatureGroups have been created, we can put data into the FeatureGroups by using the PutRecord API. This API can handle high TPS and is designed to be called by different streams. The data from all of these Put requests is buffered and written to S3 in chunks. The files will be written to the offline store within a few minutes of ingestion. For this example, to accelerate the ingestion process, we are specifying multiple workers to do the job simultaneously. It will take ~1min to ingest data to the 2 FeatureGroups, respectively.



In [32]:
# Ingest the identity rows using 3 workers.
# NOTE(review): wait=True presumably blocks until ingestion has finished — confirm in the SDK docs.
identity_feature_group.ingest(data_frame=identity_data, max_workers=3, wait=True)


IngestionManagerPandas(feature_group_name='identity-feature-group-05-04-01-13', sagemaker_fs_runtime_client_config=<botocore.config.Config object at 0x7f6f87b86790>, max_workers=3, max_processes=1, _async_result=<multiprocess.pool.MapResult object at 0x7f6f86770b50>, _processing_pool=<pool ProcessPool(ncpus=1)>, _failed_indices=[])

In [33]:
# Ingest the transformed transaction rows using 5 workers.
# NOTE(review): wait=True presumably blocks until ingestion has finished — confirm in the SDK docs.
transaction_feature_group.ingest(data_frame=transformed_transaction_data, max_workers=5, wait=True)


IngestionManagerPandas(feature_group_name='transaction-feature-group-05-04-01-13', sagemaker_fs_runtime_client_config=<botocore.config.Config object at 0x7f6f87b86790>, max_workers=5, max_processes=1, _async_result=<multiprocess.pool.MapResult object at 0x7f6f86747dd0>, _processing_pool=<pool ProcessPool(ncpus=1)>, _failed_indices=[])

To confirm that data has been ingested, we can quickly retrieve a record from the online store:



In [34]:
# The runtime API takes the record identifier as a string.
record_identifier_value = "2990130"

# Fetch that transaction from the online store of the transaction feature group.
featurestore_runtime.get_record(
    RecordIdentifierValueAsString=record_identifier_value,
    FeatureGroupName=transaction_feature_group_name,
)


{'ResponseMetadata': {'RequestId': 'a907bae1-9ff3-420c-99a8-c6f7fce01279',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'a907bae1-9ff3-420c-99a8-c6f7fce01279',
   'content-type': 'application/json',
   'content-length': '2636',
   'date': 'Thu, 05 Aug 2021 04:02:24 GMT'},
  'RetryAttempts': 0},
 'Record': [{'FeatureName': 'TransactionID', 'ValueAsString': '2990130'},
  {'FeatureName': 'isFraud', 'ValueAsString': '0'},
  {'FeatureName': 'TransactionDT', 'ValueAsString': '152647'},
  {'FeatureName': 'TransactionAmt', 'ValueAsString': '75.0'},
  {'FeatureName': 'card1', 'ValueAsString': '4577'},
  {'FeatureName': 'card2', 'ValueAsString': '583.0'},
  {'FeatureName': 'card3', 'ValueAsString': '150.0'},
  {'FeatureName': 'card4', 'ValueAsString': 'mastercard'},
  {'FeatureName': 'card5', 'ValueAsString': '219.0'},
  {'FeatureName': 'card6', 'ValueAsString': 'credit'},
  {'FeatureName': 'B1', 'ValueAsString': '69'},
  {'FeatureName': 'B2', 'ValueAsString': '80'},
  {'Featur

In [35]:
# Print the Hive DDL (CREATE EXTERNAL TABLE statement) generated for the identity feature group.
print(identity_feature_group.as_hive_ddl())


CREATE EXTERNAL TABLE IF NOT EXISTS sagemaker_featurestore.identity-feature-group-05-04-01-13 (
  TransactionID INT
  id_01 INT
  id_02 FLOAT
  id_03 FLOAT
  id_04 FLOAT
  id_05 FLOAT
  id_06 INT
  id_07 INT
  id_08 INT
  id_09 FLOAT
  id_10 INT
  id_11 INT
  id_12 INT
  id_13 INT
  id_14 INT
  id_15 INT
  id_16 INT
  id_17 STRING
  id_18 STRING
  id_19 STRING
  id_20 STRING
  EventTime FLOAT
  write_time TIMESTAMP
  event_time TIMESTAMP
  is_deleted BOOLEAN
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
  STORED AS
  INPUTFORMAT 'parquet.hive.DeprecatedParquetInputFormat'
  OUTPUTFORMAT 'parquet.hive.DeprecatedParquetOutputFormat'
LOCATION 's3://sagemaker-us-east-1-229319431800/sagemaker-featurestore-demo/229319431800/sagemaker/us-east-1/offline-store/identity-feature-group-05-04-01-13-1628136097/data'


In [36]:
# Print the Hive DDL (CREATE EXTERNAL TABLE statement) generated for the transaction feature group.
print(transaction_feature_group.as_hive_ddl())


CREATE EXTERNAL TABLE IF NOT EXISTS sagemaker_featurestore.transaction-feature-group-05-04-01-13 (
  TransactionID INT
  isFraud INT
  TransactionDT INT
  TransactionAmt FLOAT
  card1 INT
  card2 FLOAT
  card3 FLOAT
  card4 STRING
  card5 FLOAT
  card6 STRING
  B1 INT
  B2 INT
  B3 INT
  B4 INT
  B5 INT
  B6 INT
  B7 INT
  B8 INT
  B9 INT
  B10 INT
  B11 INT
  B12 INT
  F1 INT
  F2 INT
  F3 INT
  F4 INT
  F5 INT
  F6 INT
  F7 INT
  F8 INT
  F9 INT
  F10 INT
  F11 INT
  F12 INT
  F13 INT
  F14 INT
  F15 INT
  F16 INT
  F17 INT
  N1 STRING
  N2 STRING
  N3 STRING
  N4 STRING
  N5 STRING
  N6 STRING
  N7 STRING
  N8 STRING
  N9 STRING
  card_type_0 INT
  card_type_credit INT
  card_type_debit INT
  card_bank_0 INT
  card_bank_american_express INT
  card_bank_discover INT
  card_bank_mastercard INT
  card_bank_visa INT
  EventTime FLOAT
  write_time TIMESTAMP
  event_time TIMESTAMP
  is_deleted BOOLEAN
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
  STORE

Now let’s wait for the data to appear in our offline store before moving forward to creating a dataset. This will take approximately 5 minutes.



In [37]:
account_id = boto3.client("sts").get_caller_identity()["Account"]
print(account_id)

# Resolved S3 locations of the two offline stores, as reported by DescribeFeatureGroup.
identity_feature_group_resolved_output_s3_uri = (
    identity_feature_group.describe()
    .get("OfflineStoreConfig")
    .get("S3StorageConfig")
    .get("ResolvedOutputS3Uri")
)
transaction_feature_group_resolved_output_s3_uri = (
    transaction_feature_group.describe()
    .get("OfflineStoreConfig")
    .get("S3StorageConfig")
    .get("ResolvedOutputS3Uri")
)

# Strip the "s3://<bucket>/" part so the remainder can be used as an S3 key prefix.
identity_feature_group_s3_prefix = identity_feature_group_resolved_output_s3_uri.replace(
    f"s3://{default_s3_bucket_name}/", ""
)
transaction_feature_group_s3_prefix = transaction_feature_group_resolved_output_s3_uri.replace(
    f"s3://{default_s3_bucket_name}/", ""
)


def _offline_store_objects(s3_prefix):
    """Return the objects listed under ``s3_prefix``, or None while at most one is listed."""
    objects_in_bucket = s3_client.list_objects(Bucket=default_s3_bucket_name, Prefix=s3_prefix)
    contents = objects_in_bucket.get("Contents", [])
    return contents if len(contents) > 1 else None


# Poll once a minute until BOTH offline stores contain data: the training query joins the
# identity table to the transaction table, so waiting for the transaction data alone is not enough.
offline_store_contents = None
while offline_store_contents is None:
    identity_contents = _offline_store_objects(identity_feature_group_s3_prefix)
    transaction_contents = _offline_store_objects(transaction_feature_group_s3_prefix)
    if identity_contents is not None and transaction_contents is not None:
        offline_store_contents = transaction_contents
    else:
        print("Waiting for data in offline store...\n")
        sleep(60)

print("Data available.")

229319431800
Waiting for data in offline store...

Waiting for data in offline store...

Waiting for data in offline store...

Waiting for data in offline store...

Waiting for data in offline store...

Waiting for data in offline store...

Data available.


# Build Training Dataset


SageMaker FeatureStore automatically builds the Glue Data Catalog for FeatureGroups (you can optionally turn it on/off while creating the FeatureGroup). In this example, we want to create one training dataset with FeatureValues from both identity and transaction FeatureGroups. This is done by utilizing the auto-built Catalog. We run an Athena query that joins the data stored in the offline store in S3 from the 2 FeatureGroups.



In [38]:
identity_query = identity_feature_group.athena_query()
transaction_query = transaction_feature_group.athena_query()

identity_table = identity_query.table_name
transaction_table = transaction_query.table_name

# LEFT JOIN keeps every transaction row; identity columns stay empty where no match exists.
query_string = (
    f'SELECT * FROM "{transaction_table}" '
    f'LEFT JOIN "{identity_table}" '
    f'ON "{transaction_table}".transactionid = "{identity_table}".transactionid'
)
print(f"Running {query_string}")

# Run the Athena query; the result is then loaded into a pandas DataFrame.
identity_query.run(
    query_string=query_string,
    output_location=f"s3://{default_s3_bucket_name}/{prefix}/query_results/",
)
identity_query.wait()
dataset = identity_query.as_dataframe()

dataset


Running SELECT * FROM "transaction-feature-group-05-04-01-13-1628136099" LEFT JOIN "identity-feature-group-05-04-01-13-1628136097" ON "transaction-feature-group-05-04-01-13-1628136099".transactionid = "identity-feature-group-05-04-01-13-1628136097".transactionid


Unnamed: 0,transactionid,isfraud,transactiondt,transactionamt,card1,card2,card3,card4,card5,card6,...,id_15,id_16,id_17,id_18,id_19,id_20,eventtime.1,write_time.1,api_invocation_time.1,is_deleted.1
0,2990386,0,155041,29.00,16070,111.0,150.0,visa,166.0,debit,...,,,,,,,,,,
1,3049607,0,1434808,34.00,16691,170.0,150.0,mastercard,102.0,credit,...,,,,,,,,,,
2,3340591,0,8723392,59.00,9500,321.0,150.0,visa,226.0,debit,...,,,,,,,,,,
3,3284525,0,7343650,226.00,13234,298.0,150.0,visa,226.0,debit,...,,,,,,,,,,
4,3424663,0,11062176,117.00,7676,512.0,150.0,visa,226.0,debit,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,3342333,0,8797001,82.95,10112,360.0,150.0,visa,166.0,debit,...,,,,,,,,,,
146,3181470,0,4382823,68.95,9316,0.0,150.0,visa,226.0,credit,...,,,,,,,,,,
147,3115346,0,2561079,247.57,8695,170.0,150.0,visa,226.0,credit,...,,,,,,,,,,
148,2994429,0,239133,100.00,4141,404.0,150.0,mastercard,102.0,credit,...,,,,,,,,,,


In [39]:
# Prepare query results for training.
# Location of the CSV that Athena wrote for the query above.
query_execution = identity_query.get_query_execution()
query_execution_id = query_execution["QueryExecution"]["QueryExecutionId"]
query_result = f"s3://{default_s3_bucket_name}/{prefix}/query_results/{query_execution_id}.csv"
print(query_result)

# Columns kept for training, with the target column first.
training_columns = [
    "isfraud",
    "transactiondt",
    "transactionamt",
    "card1",
    "card2",
    "card3",
    "card5",
    "card_type_credit",
    "card_type_debit",
    "card_bank_american_express",
    "card_bank_discover",
    "card_bank_mastercard",
    "card_bank_visa",
    "id_01",
    "id_02",
    "id_03",
    "id_04",
    "id_05",
]
dataset = dataset[training_columns]

# Write a local CSV without header and index column, then upload it to the training input prefix.
dataset.to_csv("dataset.csv", header=False, index=False)
s3_client.upload_file("dataset.csv", default_s3_bucket_name, f"{prefix}/training_input/dataset.csv")
dataset_uri_prefix = f"s3://{default_s3_bucket_name}/{prefix}/training_input/"

dataset


s3://sagemaker-us-east-1-229319431800/sagemaker-featurestore-demo/query_results/37ed4ac3-0cfd-47f1-b2c6-593489cb5a83.csv


Unnamed: 0,isfraud,transactiondt,transactionamt,card1,card2,card3,card5,card_type_credit,card_type_debit,card_bank_american_express,card_bank_discover,card_bank_mastercard,card_bank_visa,id_01,id_02,id_03,id_04,id_05
0,0,155041,29.00,16070,111.0,150.0,166.0,0,1,0,0,0,1,,,,,
1,0,1434808,34.00,16691,170.0,150.0,102.0,1,0,0,0,1,0,,,,,
2,0,8723392,59.00,9500,321.0,150.0,226.0,0,1,0,0,0,1,,,,,
3,0,7343650,226.00,13234,298.0,150.0,226.0,0,1,0,0,0,1,,,,,
4,0,11062176,117.00,7676,512.0,150.0,226.0,0,1,0,0,0,1,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,0,8797001,82.95,10112,360.0,150.0,166.0,0,1,0,0,0,1,,,,,
146,0,4382823,68.95,9316,0.0,150.0,226.0,1,0,0,0,0,1,,,,,
147,0,2561079,247.57,8695,170.0,150.0,226.0,1,0,0,0,0,1,,,,,
148,0,239133,100.00,4141,404.0,150.0,102.0,1,0,0,0,1,0,,,,,


# Train and deploy

Now it’s time to launch a training job to fit our model. We use the gradient boosting algorithm provided by the XGBoost library to fit our data. Call the SageMaker XGBoost container and construct a generic SageMaker estimator.



In [40]:
# Container image URI of the built-in XGBoost algorithm, version "1.0-1", for the current region.
training_image = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=region,
    version="1.0-1",
)


In [41]:
# S3 location passed to the estimator as its output path.
training_output_path = f"s3://{default_s3_bucket_name}/{prefix}/training_output"

from sagemaker.estimator import Estimator

# Generic estimator around the XGBoost image: one ml.m5.2xlarge instance, volume_size=5,
# max_run=3600, File input mode.
training_model = Estimator(
    image_uri=training_image,
    role=role,
    instance_count=1,
    instance_type="ml.m5.2xlarge",
    volume_size=5,
    max_run=3600,
    input_mode="File",
    output_path=training_output_path,
    sagemaker_session=feature_store_session,
)

In [42]:
# Binary classification objective, matching the fraud / not-fraud target.
# NOTE(review): num_round=50 is presumably the number of boosting rounds — confirm in the XGBoost docs.
training_model.set_hyperparameters(objective="binary:logistic", num_round=50)


In [43]:
# Training input: the headerless CSV uploaded under the training_input/ prefix.
train_data = sagemaker.inputs.TrainingInput(
    dataset_uri_prefix,
    distribution="FullyReplicated",
    content_type="text/csv",
    s3_data_type="S3Prefix",
)
# Only a "train" channel is supplied; no validation channel is configured.
data_channels = {"train": train_data}

In [44]:
# Launch the training job on the "train" channel and stream its logs into the cell output.
training_model.fit(inputs=data_channels, logs=True)


2021-08-05 04:10:36 Starting - Starting the training job...
2021-08-05 04:10:38 Starting - Launching requested ML instancesProfilerReport-1628136635: InProgress
...
2021-08-05 04:11:34 Starting - Preparing the instances for training.........
2021-08-05 04:13:06 Downloading - Downloading input data
2021-08-05 04:13:06 Training - Downloading the training image...
2021-08-05 04:13:35 Uploading - Uploading generated training model
2021-08-05 04:13:35 Completed - Training job completed
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV 

# Setup model hosting

In [46]:
# Deploy the trained model on one ml.m5.xlarge instance; the returned predictor is used for inference below.
# NOTE(review): no cleanup of the endpoint or feature groups is visible in this part of the notebook.
predictor = training_model.deploy(initial_instance_count=1, instance_type="ml.m5.xlarge")


-------------!

# Feature store during inference
SageMaker FeatureStore can be useful in supplementing data for inference requests because of the low-latency GetRecord functionality. For this demo, we will be given a TransactionID and query our online FeatureGroups for data on the transaction to build our inference request.



In [57]:
# Incoming inference request.
# Kept as a string because get_record takes the identifier as RecordIdentifierValueAsString.
transaction_id = str(3450774)

def get_feature_value(record, feature_name):
    """Return the value of ``feature_name`` from an online-store record, as a string.

    Args:
        record: list of dicts, each with a "FeatureName" and a "ValueAsString" entry
            (the "Record" part of a get_record response).
        feature_name: name of the feature to look up.

    Returns:
        The "ValueAsString" entry of the first matching feature, converted with ``str``.

    Raises:
        IndexError: if no entry in ``record`` has that feature name. The message names
            the missing feature instead of the bare "list index out of range".
    """
    for entry in record:
        if entry["FeatureName"] == feature_name:
            return str(entry["ValueAsString"])
    raise IndexError(f"Feature {feature_name!r} is not present in the record")


# Feature names read from each online store, in the order they appear in the request.
transaction_feature_names = [
    "TransactionDT",
    "TransactionAmt",
    "card1",
    "card2",
    "card3",
    "card5",
    "card_type_credit",
    "card_type_debit",
    "card_bank_american_express",
    "card_bank_discover",
    "card_bank_mastercard",
    "card_bank_visa",
]
identity_feature_names = [
    "id_01",
    "id_02",
    "id_03",
    "id_04",
    # NOTE(review): "id_05" is left out here although the training CSV includes it — confirm.
]

transaction_response = featurestore_runtime.get_record(
    FeatureGroupName=transaction_feature_group_name,
    RecordIdentifierValueAsString=transaction_id,
)
transaction_record = transaction_response["Record"]
transaction_test_data = [
    get_feature_value(transaction_record, name) for name in transaction_feature_names
]

identity_response = featurestore_runtime.get_record(
    FeatureGroupName=identity_feature_group_name,
    RecordIdentifierValueAsString=transaction_id,
)
identity_record = identity_response["Record"]
id_test_data = [get_feature_value(identity_record, name) for name in identity_feature_names]

# Join all pieces for inference request: transaction features first, identity features after.
inference_request = [*transaction_test_data, *id_test_data]

inference_request

['11923451',
 '50.0',
 '12501',
 '490.0',
 '150.0',
 '226.0',
 '0',
 '1',
 '0',
 '0',
 '0',
 '1',
 '-40',
 '20130.0',
 '0.0',
 '0.0']

In [58]:
import json

# Send the feature values as one comma-separated line with a text/csv content type.
# NOTE(review): the training CSV also contained id_05, which this request omits — confirm intended.
results = predictor.predict(",".join(inference_request), initial_args={"ContentType": "text/csv"})
# The response body is parsed as JSON; the recorded run yielded a single float score.
prediction = json.loads(results)
print(prediction)

0.0015984359197318554
