## Amazon SageMaker Feature Store


In [None]:
import pandas as pd
import time
import sagemaker
# Load the prepared dataset from S3. Point `source` at the S3 location of any
# processed Results file.
source = 's3://sagemaker-us-east-1-367858208265/Results/DataWrangler/output_1631272206/part-00000-d3369d58-6799-4d9f-91bd-0f0159be50b4-c000.csv'
df = pd.read_csv(filepath_or_buffer=source)

When creating a feature group, you can also provide metadata for it, such as a short description, the storage configuration, the features that identify each record and its event time, and tags that store information such as the author, data source, and version. Because the dataset has no record-identifier or event-time column, we add two extra columns: `Fraud_ID` and `Fraud_time`.

In [2]:
# Feature Store needs a record identifier and an event time for every row;
# the dataset has neither, so add both columns here.
df['Fraud_ID'] = df.index + 1000  # record IDs are the row index offset by 1000

# Stamp every record with the same time (Unix epoch seconds, stored as float64).
# Building the Series on df.index keeps the values aligned with the frame's rows
# instead of relying on df having a default 0..n-1 index.
current_time_sec = int(round(time.time()))
df['Fraud_time'] = pd.Series(float(current_time_sec), index=df.index, dtype="float64")

# Drop the '_c0' column (presumably a leftover row-number column from the
# upstream export — confirm). errors='ignore' keeps this cell re-runnable: on a
# second run the column is already gone and a plain drop would raise KeyError.
df = df.drop(columns=['_c0'], errors='ignore')
df.head()

Unnamed: 0,age,policy_number,policy_state,policy_deductable,policy_annual_premium,umbrella_limit,insured_sex,insured_education_level,insured_occupation,insured_relationship,...,witnesses,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,fraud_reported,Fraud_ID,Fraud_time
0,48,521585,2,1000,1406.91,0,1,4,2,0,...,2,1,71610,6510,13020,52080,10,1.0,1000,1631615000.0
1,29,687698,2,2000,1413.14,5000000,0,6,11,3,...,3,0,34650,7700,3850,23100,4,0.0,1001,1631615000.0
2,41,227811,0,2000,1415.74,6000000,0,6,1,4,...,2,0,63400,6340,6340,50720,3,1.0,1002,1631615000.0
3,44,367455,0,1000,1583.91,6000000,1,0,11,4,...,1,0,6500,1300,650,4550,0,0.0,1003,1631615000.0
4,39,104594,2,1000,1351.1,0,0,6,12,4,...,2,0,64100,6410,6410,51280,10,1.0,1004,1631615000.0


In [3]:
# initialize necessary variables
import boto3

# One shared SageMaker session. Later cells pass `sess` to the FeatureGroup and
# build S3 locations from `bucket`, so both names must be defined here for the
# notebook to survive Restart & Run All.
sess = sagemaker.Session()
region = sess.boto_region_name
bucket = sess.default_bucket()  # NOTE(review): assumes the session's default bucket is the intended one — confirm

# Use the same region for the default boto3 session and the S3 client.
boto3.setup_default_session(region_name=region)
s3_client = boto3.client("s3", region_name=region)

NameError: name 'sagemaker' is not defined

### Configure the feature groups
The data type of each feature is inferred from the dataframe that is passed in. Feature data types can also be set via a config variable, but each one must match the corresponding Python data type in the Pandas dataframe when it’s ingested into the Feature Group.

In [None]:
# Declare the feature group and let it infer its feature definitions from df.
from sagemaker.feature_store.feature_group import FeatureGroup

fraud_fg_name = "auto-fraud"
fraud_feature_group = FeatureGroup(
    name=fraud_fg_name,
    sagemaker_session=sess,
)
fraud_feature_group.load_feature_definitions(data_frame=df)

### Create the feature groups
You must tell the Feature Group which columns in the dataframe correspond to the required record identifier and event time features.

In [None]:
record_identifier_feature_name = "Fraud_ID"
event_time_feature_name = "Fraud_time"
sagemaker_role = sagemaker.get_execution_role()

# Build the offline-store location once so the value that is logged is exactly
# the value passed to create().
# NOTE(review): this URI ends in a .csv object key yet is used as the offline
# store location — confirm that this is the intended prefix.
offline_store_s3_uri = f"s3://{bucket}/DataSet/insurance_claims.csv"

try:
    print(f"\n Using {offline_store_s3_uri}")
    fraud_feature_group.create(
        s3_uri=offline_store_s3_uri,
        record_identifier_name=record_identifier_feature_name,
        event_time_feature_name=event_time_feature_name,
        role_arn=sagemaker_role,
        enable_online_store=True,
    )
    print('Create "fraud" feature group: SUCCESS')
except Exception as e:
    # Only service errors carry a `response` dict with an error code. Any other
    # exception is re-raised untouched instead of being masked by an
    # AttributeError from reading `e.response`.
    error_response = getattr(e, "response", None)
    code = (
        error_response.get("Error", {}).get("Code")
        if isinstance(error_response, dict)
        else None
    )
    if code != "ResourceInUse":
        raise
    # The feature group already exists, so carry on with it.
    print(f"Using existing feature group: {fraud_fg_name}")

### Wait until feature group creation has fully completed

In [None]:
def wait_for_feature_group_creation_complete(feature_group, poll_seconds=5, timeout_seconds=None):
    """Block until ``feature_group`` leaves the "Creating" status.

    Args:
        feature_group: Object exposing ``describe()`` (returning a dict with a
            "FeatureGroupStatus" key) and a ``name`` attribute.
        poll_seconds: Seconds to sleep between two ``describe()`` calls.
        timeout_seconds: Maximum number of seconds to keep waiting while the
            status is "Creating". ``None`` (the default) waits indefinitely.

    Raises:
        RuntimeError: The final status is anything other than "Created". The
            message includes the status and, when the description carries one,
            the "FailureReason".
        TimeoutError: ``timeout_seconds`` elapsed while still "Creating".
    """
    deadline = None if timeout_seconds is None else time.monotonic() + timeout_seconds

    description = feature_group.describe()
    status = description.get("FeatureGroupStatus")
    while status == "Creating":
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f"Timed out after {timeout_seconds}s waiting for feature group "
                f"{feature_group.name} to be created"
            )
        print("Waiting for Feature Group Creation")
        time.sleep(poll_seconds)
        description = feature_group.describe()
        status = description.get("FeatureGroupStatus")

    if status != "Created":
        # Surface what the service reported so the failure can be diagnosed
        # without a second describe() call.
        detail = f"status: {status}"
        failure_reason = description.get("FailureReason")
        if failure_reason:
            detail += f", reason: {failure_reason}"
        raise RuntimeError(
            f"Failed to create feature group {feature_group.name} ({detail})"
        )
    print(f"FeatureGroup {feature_group.name} successfully created.")


# Block here until the feature group has finished being created.
wait_for_feature_group_creation_complete(fraud_feature_group)

### Ingest records into the Feature Groups
After the Feature Group has been created, we can put data into its store by using the PutRecord API. This API can handle a high number of transactions per second (TPS) and is designed to be called by different streams. The data from all of these Put requests is buffered and written to Amazon S3 in chunks. The files will be written to the offline store within a few minutes of ingestion.

In [None]:
# Ingest every row of df into the feature group using up to three workers;
# wait=True asks the call to wait for ingestion before returning.
fraud_feature_group.ingest(
    data_frame=df,
    max_workers=3,
    wait=True,
)

### Wait for offline store data to become available
This usually takes 5–8 minutes.

In [None]:
from urllib.parse import urlparse

OFFLINE_STORE_POLL_SECONDS = 60
OFFLINE_STORE_MAX_WAIT_SECONDS = 30 * 60  # generous upper bound; adjust as needed

# Ask the service where the offline store is written instead of hard-coding a
# prefix. S3 keys do not start with "/", so a Prefix such as
# "/AutoInsuranceFraudDetection/..." can never match and the loop would spin
# forever.
resolved_offline_store_uri = fraud_feature_group.describe()["OfflineStoreConfig"][
    "S3StorageConfig"
]["ResolvedOutputS3Uri"]
parsed_offline_store_uri = urlparse(resolved_offline_store_uri)
offline_store_bucket = parsed_offline_store_uri.netloc
fraud_feature_group_s3_prefix = parsed_offline_store_uri.path.lstrip("/")

offline_store_contents = None
wait_deadline = time.monotonic() + OFFLINE_STORE_MAX_WAIT_SECONDS
while offline_store_contents is None:
    objects_in_bucket = s3_client.list_objects(
        Bucket=offline_store_bucket, Prefix=fraud_feature_group_s3_prefix
    )
    # Same readiness rule as before: more than one object under the prefix.
    if len(objects_in_bucket.get("Contents", [])) > 1:
        offline_store_contents = objects_in_bucket["Contents"]
    elif time.monotonic() >= wait_deadline:
        raise TimeoutError(
            f"No offline store data under {resolved_offline_store_uri} after "
            f"{OFFLINE_STORE_MAX_WAIT_SECONDS} seconds"
        )
    else:
        print("Waiting for data in offline store...")
        time.sleep(OFFLINE_STORE_POLL_SECONDS)

print("\nData available.")