# Store Processed Data into Feature Store

In [1]:
import glob
import os
import re
import shutil
import time

import boto3
import numpy as np
import pandas as pd
import sagemaker
from sagemaker.feature_store.feature_group import FeatureGroup
from sagemaker.session import Session

In [2]:
# Set the AWS_PROFILE environment variable for this kernel.
# NOTE(review): presumably boto3 and the AWS CLI pick the named profile up from it — confirm.
%env AWS_PROFILE=aeroxye-sagemaker

env: AWS_PROFILE=aeroxye-sagemaker


In [3]:
# Sanity check: show which AWS identity (user id, account, ARN) the current credentials resolve to.
!aws sts get-caller-identity

{
    "UserId": "AROAWC4YSIQL5OBFCNGEX:botocore-session-1687359268",
    "Account": "418542404631",
    "Arn": "arn:aws:sts::418542404631:assumed-role/SageMaker-UserRole/botocore-session-1687359268"
}


In [4]:
# Resolve the IAM role ARN that is later passed to feature_group.create().
# sagemaker.get_execution_role() is tried first; when it raises ValueError the
# ARN is looked up through IAM by role name instead.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role_description = iam.get_role(RoleName='SageMaker-UserRole')
    role = role_description['Role']['Arn']

# Read the region from a default boto3 session, then build the session that the
# clients below are created from, pinned to that region.
region = boto3.Session().region_name
boto_session = boto3.Session(region_name=region)

## Initialise Feature Store

In [5]:
# S3 location for the offline feature store: s3://<bucket>/<object_path>/<prefix>
bucket = "petfinder6000"
object_path = "auxiliary"
prefix = "sagemaker-featurestore"
offline_feature_store_bucket = 's3://' + '/'.join([bucket, object_path, prefix])
print(f'Offline bucket: {offline_feature_store_bucket}')

Offline bucket: s3://petfinder6000/auxiliary/sagemaker-featurestore


In [6]:
# Two clients built from the shared boto3 session:
#   - sagemaker_client: used further down to list feature groups and search feature metadata
#   - featurestore_runtime: used further down to read individual records
sagemaker_client = boto_session.client('sagemaker', region_name=region)
featurestore_runtime = boto_session.client('sagemaker-featurestore-runtime', region_name=region)

# SageMaker session that bundles the boto3 session with both clients.
feature_store_session = Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime,
)

## Create Feature Group

In [40]:
# Dataset selector — edit these three values to point the notebook at another dataset.
data_name = 'interactions'  # file stem; also the prefix of the feature group name
data_type = 'auxiliary'     # sub-folder under the processed-data directory
file_type = 'csv'           # file extension; 'csv' is read with read_csv, anything else as a pickle

In [41]:
# Handle for the feature group named after the selected dataset ("<data_name>-feature-group").
feature_group_name = '{}-feature-group'.format(data_name)
feature_group = FeatureGroup(
    name=feature_group_name,
    sagemaker_session=feature_store_session,
)

In [14]:
# Show the kernel's working directory; the relative data paths below are resolved against it.
# os.getcwd() is used instead of `!echo %cd%`, which only expands under Windows cmd.
print(os.getcwd())

C:\Users\yongr\OneDrive\Documents\MITB\SEM 3 Apr 23\CS608 Recommender Systems\Project\recommender\pre-processing\aws


In [42]:
# Load the selected dataset from <processed_dir>/<data_type>/<data_name>.<file_type>.
processed_dir = 'data/processed'
data_path = f'{processed_dir}/{data_type}/{data_name}.{file_type}'

# CSV files are parsed with the first row as the header; every other extension
# is handed to pandas.read_pickle.
# NOTE(review): unpickling can execute arbitrary code — only load files produced by this project.
data = pd.read_csv(data_path, header=0) if file_type == 'csv' else pd.read_pickle(data_path)

In [43]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4841 entries, 0 to 4840
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Unnamed: 0     4841 non-null   int64 
 1   id             4841 non-null   object
 2   catID          4841 non-null   object
 3   userID         4841 non-null   object
 4   updated_at     4841 non-null   object
 5   like           4841 non-null   int64 
 6   created_at     4841 non-null   object
 7   dwell_time_ms  4841 non-null   int64 
 8   click          4841 non-null   int64 
dtypes: int64(4), object(5)
memory usage: 340.5+ KB


In [12]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,id,has_other_cats,personality,gender,good_with_other_dogs,employment,created_at,agree_to_fee,is_first_cat,good_with_kids,...,age_no_preference,age_adult,age_senior,primary_color_no_preference,primary_color_black,primary_color_calico_tortie,primary_color_tabby,primary_color_others,primary_color_ginger,primary_color_white
0,15c83d3c-196c-4f57-bee2-361ac6fcc21e,0,anything is nice,no preference,1,working full time,2023-05-19T02:35:02Z,0,1,1,...,0,0,0,1,0,0,0,0,0,0
1,8fe87a5a-a6cf-49db-8b7a-fca9772a717c,0,some spice,no preference,0,student,2023-05-17T04:32:31Z,1,1,0,...,0,0,0,1,0,0,0,0,0,0
2,1528304d-e070-469a-b476-31f3b1022681,0,anything is nice,no preference,0,working full time,2023-05-18T05:52:03Z,1,1,0,...,1,0,0,1,0,0,0,0,0,0
3,bb6d45b5-f368-4877-9a5f-b3cbb8761bd7,0,anything is nice,no preference,0,working full time,2023-06-02T06:45:38Z,1,1,0,...,0,0,0,1,0,0,0,0,0,0
4,9e936f8b-4d16-4d19-b864-6262b350f44f,0,anything is nice,no preference,0,working full time,2023-05-20T03:33:25Z,1,1,0,...,0,1,0,0,1,1,1,0,0,0


In [37]:
# Inspect one full record (the row at position 72); head() elides columns on wide frames.
data.iloc[72]

id                             9b586a59-2705-4d92-95b1-ac2f210e2368
has_other_cats                                                    0
personality                                        anything is nice
gender                                                no preference
good_with_other_dogs                                              1
employment                                        working full time
created_at                                     2023-05-18T06:44:23Z
agree_to_fee                                                      1
is_first_cat                                                      1
good_with_kids                                                    1
attention_need                                          independent
updated_at                                     2023-05-18T06:44:23Z
username                                                 hraesvelgr
home_ownership                                 staying with parents
has_allergies                                   

In [44]:
# Drop the 'Unnamed: 0' column (presumably the index written out when the CSV was saved).
# errors='ignore' makes the cell safe to re-run and a no-op for frames that
# never had the column, instead of raising KeyError.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

In [34]:
# Drop img_vector (max length of 358,400) before registering feature definitions.
# Only some datasets carry this column; errors='ignore' turns the cell into a
# no-op for the others (and on re-runs) instead of raising KeyError.
data = data.drop(columns=['img_vector'], errors='ignore')

In [46]:
# Register the feature group: derive the feature definitions from the DataFrame,
# then create the group with an S3 offline store and the online store enabled.
record_identifier_name = "id"            # column used as the record identifier
event_time_feature_name = "updated_at"   # column used as the event time
feature_group.load_feature_definitions(data_frame=data)

create_kwargs = {
    's3_uri': offline_feature_store_bucket,
    'record_identifier_name': record_identifier_name,
    'event_time_feature_name': event_time_feature_name,
    'role_arn': role,
    'enable_online_store': True,
}
feature_group.create(**create_kwargs)

{'FeatureGroupArn': 'arn:aws:sagemaker:ap-southeast-1:418542404631:feature-group/interactions-feature-group',
 'ResponseMetadata': {'RequestId': '078d8d82-680a-438e-abcc-38f5a0c28a5d',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '078d8d82-680a-438e-abcc-38f5a0c28a5d',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '108',
   'date': 'Mon, 19 Jun 2023 10:55:21 GMT'},
  'RetryAttempts': 0}}

In [27]:
# Check the status of the feature group creation.
# A single describe() right after create() can still report "Creating", and
# ingesting at that point fails. Poll until the group has left that state so
# the notebook also works when run top to bottom.
status = feature_group.describe().get("FeatureGroupStatus")
while status == "Creating":
    time.sleep(5)
    status = feature_group.describe().get("FeatureGroupStatus")
print(status)

Created


## Load Data to Feature Group

In [38]:
# Push every row of the frame into the feature group and block until ingestion finishes.
feature_group.ingest(
    data_frame=data,
    max_workers=3,
    wait=True,
)

IngestionManagerPandas(feature_group_name='users-feature-group', sagemaker_fs_runtime_client_config=<botocore.config.Config object at 0x000001CDA1A6B910>, sagemaker_session=<sagemaker.session.Session object at 0x000001CDA075CC10>, max_workers=3, max_processes=1, profile_name='aeroxye-sagemaker', _async_result=<multiprocess.pool.MapResult object at 0x000001CDA27EA8B0>, _processing_pool=<pool ProcessPool(ncpus=1)>, _failed_indices=[])

## Miscellaneous (Delete, List etc.)

In [39]:
# List the feature groups visible to this account/region, with their ARNs and status.
sagemaker_client.list_feature_groups()

{'FeatureGroupSummaries': [{'FeatureGroupName': 'users-feature-group',
   'FeatureGroupArn': 'arn:aws:sagemaker:ap-southeast-1:418542404631:feature-group/users-feature-group',
   'CreationTime': datetime.datetime(2023, 6, 19, 18, 52, 18, 285000, tzinfo=tzlocal()),
   'FeatureGroupStatus': 'Created'},
  {'FeatureGroupName': 'interactions-feature-group',
   'FeatureGroupArn': 'arn:aws:sagemaker:ap-southeast-1:418542404631:feature-group/interactions-feature-group',
   'CreationTime': datetime.datetime(2023, 6, 19, 0, 18, 7, 798000, tzinfo=tzlocal()),
   'FeatureGroupStatus': 'Created',
   'OfflineStoreStatus': {'Status': 'Active'}},
  {'FeatureGroupName': 'cats-feature-group',
   'FeatureGroupArn': 'arn:aws:sagemaker:ap-southeast-1:418542404631:feature-group/cats-feature-group',
   'CreationTime': datetime.datetime(2023, 6, 19, 18, 53, 28, 829000, tzinfo=tzlocal()),
   'FeatureGroupStatus': 'Created'},
  {'FeatureGroupName': 'cat-images-feature-group',
   'FeatureGroupArn': 'arn:aws:sagem

In [30]:
# Look up the per-feature metadata registered for a single feature group.
feature_group_filter = {
    'Name': 'FeatureGroupName',
    'Operator': 'Equals',
    'Value': 'users-feature-group',
}
sagemaker_client.search(
    Resource="FeatureMetadata",
    SearchExpression={'Filters': [feature_group_filter]},
)

{'Results': [{'FeatureMetadata': {'FeatureGroupArn': 'arn:aws:sagemaker:ap-southeast-1:418542404631:feature-group/users-feature-group',
    'FeatureGroupName': 'users-feature-group',
    'FeatureName': 'personality',
    'FeatureType': 'String',
    'CreationTime': datetime.datetime(2023, 6, 19, 12, 6, 28, tzinfo=tzlocal()),
    'LastModifiedTime': datetime.datetime(2023, 6, 19, 12, 6, 28, tzinfo=tzlocal()),
    'Parameters': []}},
  {'FeatureMetadata': {'FeatureGroupArn': 'arn:aws:sagemaker:ap-southeast-1:418542404631:feature-group/users-feature-group',
    'FeatureGroupName': 'users-feature-group',
    'FeatureName': 'employment',
    'FeatureType': 'String',
    'CreationTime': datetime.datetime(2023, 6, 19, 12, 6, 28, tzinfo=tzlocal()),
    'LastModifiedTime': datetime.datetime(2023, 6, 19, 12, 6, 28, tzinfo=tzlocal()),
    'Parameters': []}},
  {'FeatureMetadata': {'FeatureGroupArn': 'arn:aws:sagemaker:ap-southeast-1:418542404631:feature-group/users-feature-group',
    'FeatureGro

In [45]:
# WARNING: destructive. Deletes the feature group named after the current `data_name`.
# Run this cell only when the group should really be removed; skip it otherwise.
sagemaker_session = sagemaker.Session()
feature_group_name = '{}-feature-group'.format(data_name)
feature_group = FeatureGroup(
    name=feature_group_name,
    sagemaker_session=sagemaker_session,
)
feature_group.delete()

### Test Feature Retrieval

In [7]:
# Fetch a single record from a feature group by its record identifier.
record_identifier_value = "8b1aa3c3-162f-4717-a9f5-d895ec52f2a4"  # a user id
# A cat id for the same kind of check: "4f5637ed-1bb9-4e51-979f-94213f0fbd63"

featurestore_runtime.get_record(
    FeatureGroupName="users-feature-group",
    RecordIdentifierValueAsString=record_identifier_value,
)

{'ResponseMetadata': {'RequestId': '8849a7b5-879e-44c2-a67c-080e75f3821f',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '8849a7b5-879e-44c2-a67c-080e75f3821f',
   'content-type': 'application/json',
   'content-length': '1669',
   'date': 'Wed, 21 Jun 2023 14:54:33 GMT'},
  'RetryAttempts': 0},
 'Record': [{'FeatureName': 'id',
   'ValueAsString': '8b1aa3c3-162f-4717-a9f5-d895ec52f2a4'},
  {'FeatureName': 'has_other_cats', 'ValueAsString': '0'},
  {'FeatureName': 'personality', 'ValueAsString': 'anything is nice'},
  {'FeatureName': 'gender', 'ValueAsString': 'no preference'},
  {'FeatureName': 'good_with_other_dogs', 'ValueAsString': '1'},
  {'FeatureName': 'employment', 'ValueAsString': 'working full time'},
  {'FeatureName': 'created_at', 'ValueAsString': '2023-05-17T09:04:07Z'},
  {'FeatureName': 'agree_to_fee', 'ValueAsString': '1'},
  {'FeatureName': 'is_first_cat', 'ValueAsString': '1'},
  {'FeatureName': 'good_with_kids', 'ValueAsString': '1'},
  {'FeatureName

### Retrieve Feature Group as DataFrame

In [48]:
# Point `feature_group` at the cats feature group for the retrieval cells below.
# Note that this overwrites `data_name`, `feature_group_name` and `feature_group`
# set earlier in the notebook.
data_name = 'cats'
feature_group_name = '{}-feature-group'.format(data_name)
sagemaker_session = sagemaker.Session()
feature_group = FeatureGroup(
    name=feature_group_name,
    sagemaker_session=sagemaker_session,
)

In [49]:
from sagemaker.feature_store.feature_store import FeatureStore

# Build a dataset builder on top of the selected feature group; output_path is
# the S3 location handed to create_dataset.
dataset_output_path = f's3://petfinder6000/auxiliary/{data_name}'
feature_store = FeatureStore(feature_store_session)
builder = feature_store.create_dataset(base=feature_group, output_path=dataset_output_path)

In [50]:
# Materialise the dataset; the call returns a pair that is unpacked into the
# DataFrame (`df`) and a second value kept as `query`.
df, query = builder.to_dataframe()

In [14]:
# Display the retrieved frame (pandas elides the middle rows and columns in the rendered output).
df

Unnamed: 0,id,cws_id,good_with_other_cats,good_with_kids,primary_color,name,preferred_employment,sweet,preferred_home_ownership,active,...,quiet,good_with_other_dogs,attention_need,loving,updated_at,require_fee,personality,description,spicy,energy_level
0,6f270016-5dae-49fc-b317-d85cce6b6304,1681491111-56,neutral,neutral,white,Carefree,not working,1,owned,0,...,1,neutral,loves attention,1,2023-05-15T02:51:23Z,yes,sweet,"quiet, loving, sweet, likes to be held, loves ...",0,chill
1,6faa772a-67df-4ce3-bc36-5c7954999f44,1681491601-275,no,neutral,tabby,Tiger,working full time,0,owned,1,...,0,neutral,neutral,0,2023-05-15T02:51:16Z,yes,neutral,"playful, active",0,high energy
2,9e7639a6-4ea3-4227-8233-e9b9a64a7569,1681491369-171,neutral,neutral,tabby,Adopt Nutella!,working full time,1,owned,0,...,0,neutral,neutral,0,2023-05-15T02:51:33Z,yes,sweet,"playful, sweet, friendly",0,high energy
3,268b7e59-3613-4b36-a473-a510fcea8291,1681491792-359,yes,neutral,ginger,GIN & TONIC,working full time,1,no preference,1,...,0,neutral,neutral,1,2023-05-15T02:51:45Z,no preference,sweet,"playful, active, loving, sweet, likes to be held",0,high energy
4,2a14ef19-eb16-44c9-8948-9cbb4583d1a8,1681491138-68,neutral,neutral,tabby,Ah Boy,student,1,parents,0,...,0,neutral,independent,1,2023-05-15T02:51:24Z,yes,sweet,"loving, sweet, shy",0,chill
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,f617a6d1-885a-48e8-ab56-0f7955d2076b,1681491011-11,neutral,neutral,black,Sweet Loving Gecko,student,1,parents,0,...,1,neutral,independent,1,2023-05-15T02:51:07Z,yes,sweet,"quiet, loving, sweet",0,chill
400,94b87e7e-6fb7-4f66-86a7-e6f6b9ee9bea,1681491623-285,yes,neutral,tabby,Gorgeous Belle needs a home!,working full time,1,no preference,0,...,0,neutral,independent,1,2023-05-15T02:51:37Z,no preference,sweet,"loving, sweet, shy",0,chill
401,2c538d63-1a96-4dfb-b443-aa120ebdbf63,1681491556-255,yes,neutral,others,W Family,working full time,0,no preference,1,...,0,neutral,loves attention,1,2023-05-15T02:51:08Z,no preference,sweet,"active, curious, loving, loves attention",0,high energy
402,6ff98fc7-516f-4e06-8cfc-1da42d6ca5d4,1681491125-62,neutral,neutral,black,Friendly Toothless needs a home,student,0,parents,0,...,0,neutral,neutral,0,2023-05-15T02:51:24Z,yes,neutral,no description available,0,chill
