In [2]:
!pip install -qU transformers pip

[0m

In [23]:
import boto3
import pandas as pd

from sagemaker import get_execution_role
import io

from transformers import BertTokenizer
from tensorflow.data import Dataset
from sagemaker.feature_store.feature_group import FeatureGroup

from time import gmtime, strftime, sleep
import time
from sagemaker.session import Session

from sklearn.model_selection import train_test_split

In [8]:
# AWS region plus the S3 bucket/key of the pre-processed dataset
region_name = "us-west-2"
bucket_name = "aai-540-final-data"
s3_path = "data/pre_processed_data.tsv"

# boto3 session pinned to the region above, and the S3 client created from it
session = boto3.Session(region_name=region_name)
s3 = session.client("s3")

In [9]:
# SageMaker API client and Feature Store runtime client, both created from the shared boto3 session
sagemaker_client = session.client("sagemaker", region_name=region_name)
featurestore_runtime = session.client(
    "sagemaker-featurestore-runtime", region_name=region_name
)

# SageMaker SDK session that bundles the boto3 session with the two clients above
feature_store_session = Session(
    sagemaker_featurestore_runtime_client=featurestore_runtime,
    sagemaker_client=sagemaker_client,
    boto_session=session,
)

In [10]:
# Key prefix inside the bucket; used to build the s3_uri passed when the feature group is created
prefix = "sagemaker-featurestore-GOEmotions"

In [11]:
# Execution role of this notebook; later passed as role_arn when the feature group is created
role = get_execution_role()
print(role)

arn:aws:iam::542526735114:role/LabRole


In [12]:
# Download the pre-processed TSV object from S3 and parse it into a DataFrame
data_obj = s3.get_object(Bucket=bucket_name, Key=s3_path)
raw_tsv_bytes = data_obj['Body'].read()

df = pd.read_csv(io.BytesIO(raw_tsv_bytes), sep='\t')
df.head()

Unnamed: 0,text,emotions,id
0,"He isn't as big, but he's still quite popular....",0,eczuekb
1,that's adorable asf,0,ef961hv
2,"I have, and now that you mention it, I think t...",27,ed9w1hm
3,"I wanted to downvote this, but it's not your f...",27,ee52cjs
4,Build a wall? /jk,27,edsqvyx


In [13]:
# Fetch the emotion label file from S3 and decode it into one label name per line
emotion_labels_obj = s3.get_object(Bucket=bucket_name, Key='data/emotions.txt')
emotion_labels_data = emotion_labels_obj['Body'].read()
emotion_labels = emotion_labels_data.decode('utf-8').splitlines()

# Lookup table: position of a label in the file -> label name
emotion_index_to_label = dict(enumerate(emotion_labels))

# Break comma-separated label ids apart so every id sits on its own row
split_emotions = df['emotions'].astype(str).str.split(',')
all_emotions = split_emotions.explode().astype(int)

# Translate ids to label names, then tally how often each label occurs
labeled_emotions = all_emotions.map(emotion_index_to_label)
labeled_emotion_counts = labeled_emotions.value_counts()

# Coarse emotion categories and the fine-grained labels folded into each of them
emotion_categories = {
    "anger": ["anger", "annoyance", "disapproval"],
    "disgust": ["disgust"],
    "fear": ["fear", "nervousness"],
    "happy": ["joy", "amusement", "approval", "gratitude"],
    "optimistic": ["optimism", "relief", "pride", "excitement"],
    "affectionate": ["love", "caring", "admiration", "desire"],
    "sadness": ["sadness", "disappointment", "embarrassment", "grief", "remorse"],
    "surprise": ["surprise", "realization", "confusion", "curiosity"],
    "neutral": ["neutral"],
}

# Reverse lookup: fine-grained label -> coarse category
emotion_to_category = {
    emotion: category
    for category, emotions in emotion_categories.items()
    for emotion in emotions
}

# Zero-initialised tally with one slot per category, in the order declared above
category_counts = pd.Series(0, index=list(emotion_categories), dtype="int64")

for emotion, count in labeled_emotion_counts.items():
    # .get() yields None for a label that has no category, so the guard below
    # actually skips it; plain [] indexing raised KeyError before the guard ran.
    category = emotion_to_category.get(emotion)
    if category is not None:
        category_counts[category] += count

category_counts

anger            6724
disgust           738
fear              790
happy            9993
optimistic       2274
affectionate     8904
sadness          3235
surprise         5584
neutral         16021
dtype: int64

In [14]:
# Convert the labels given the emotion_categories mapping.
# The 'emotions' column is overwritten in place, so values that are already category
# names (the cell was run before) are passed through instead of raising KeyError.
def _label_id_to_category(label):
    """Return the category name for a label id; category names are returned unchanged."""
    if label in emotion_categories:
        return label
    return emotion_to_category[emotion_index_to_label[label]]


df['emotions'] = df['emotions'].apply(_label_id_to_category)

In [15]:
# Number of target classes: one per entry of category_counts (i.e. per emotion category)
num_labels = len(category_counts)
num_labels

9

In [16]:
# Load the pretrained BERT tokenizer for the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [17]:
# Run every text through the tokenizer with truncation and padding enabled (max_length of 50)
texts = df['text'].tolist()
df_encodings = tokenizer(
    texts,
    max_length=50,
    truncation=True,
    padding=True,
)

In [18]:
# Pull the token ids and attention masks out of the tokenizer output
input_ids, attention_masks = df_encodings['input_ids'], df_encodings['attention_mask']

# Assemble them into a two-column DataFrame, one row per input text
df_encoded = pd.DataFrame(dict(input_ids=input_ids, attention_mask=attention_masks))

# Show the resulting frame
df_encoded

Unnamed: 0,input_ids,attention_mask
0,"[101, 2002, 3475, 1005, 1056, 2004, 2502, 1010...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,"[101, 2008, 1005, 1055, 23677, 2004, 2546, 102...","[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[101, 1045, 2031, 1010, 1998, 2085, 2008, 2017...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,"[101, 1045, 2359, 2000, 2091, 22994, 2063, 202...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,"[101, 3857, 1037, 2813, 1029, 1013, 1046, 2243...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ..."
...,...,...
54258,"[101, 1031, 2171, 1033, 1012, 2054, 1037, 2051...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ..."
54259,"[101, 2085, 2057, 1521, 2128, 10261, 2331, 200...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
54260,"[101, 11082, 2175, 13664, 3336, 102, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
54261,"[101, 2002, 2170, 1031, 2171, 1033, 2000, 1996...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [41]:
# Copy the label, raw text and record id columns from df onto the encoded frame
for column_name in ('emotions', 'text', 'id'):
    df_encoded[column_name] = df[column_name]

In [42]:
# Preview the first rows of the encoded frame
df_encoded.head()

Unnamed: 0,input_ids,attention_mask,emotions,text,EventTime,id
0,"[101, 2002, 3475, 1005, 1056, 2004, 2502, 1010...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",affectionate,"He isn't as big, but he's still quite popular....",1707786000.0,eczuekb
1,"[101, 2008, 1005, 1055, 23677, 2004, 2546, 102...","[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...",affectionate,that's adorable asf,1707786000.0,ef961hv
2,"[101, 1045, 2031, 1010, 1998, 2085, 2008, 2017...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",neutral,"I have, and now that you mention it, I think t...",1707786000.0,ed9w1hm
3,"[101, 1045, 2359, 2000, 2091, 22994, 2063, 202...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",neutral,"I wanted to downvote this, but it's not your f...",1707786000.0,ee52cjs
4,"[101, 3857, 1037, 2813, 1029, 1013, 1046, 2243...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...",neutral,Build a wall? /jk,1707786000.0,edsqvyx


In [43]:
# Hold out 10% of the rows as the test set
df_train_val, df_test = train_test_split(df_encoded, test_size=0.1, random_state=42)

# Split the remaining 90% into training and validation sets
# (0.125 of the remainder goes to validation, i.e. 11.25% of all rows)
df_train, df_val = train_test_split(df_train_val, test_size=0.125, random_state=42)

# Tag each split. .assign() returns a new frame, which avoids the SettingWithCopyWarning
# that assigning a column on train_test_split's output can trigger.
df_train = df_train.assign(data_type='train')
df_val = df_val.assign(data_type='val')
df_test = df_test.assign(data_type='test')

# Recombine the tagged splits, shuffle reproducibly and reset to a clean 0..n-1 index
df_combined = pd.concat([df_train, df_val, df_test])
df_combined = df_combined.sample(frac=1, random_state=42).reset_index(drop=True)

df_combined.head()

Unnamed: 0,input_ids,attention_mask,emotions,text,EventTime,id,data_type
0,"[101, 2464, 2004, 2775, 6905, 1999, 2070, 3182...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",anger,Seen as Child abuse in some places. Not illega...,1707786000.0,ee3gbt3,train
1,"[101, 8840, 2140, 2012, 2023, 3160, 999, 2065,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",happy,LOL at this question! If he follows the sub......,1707786000.0,ed4qqzj,train
2,"[101, 2104, 11657, 1024, 2104, 4355, 21499, 19...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",neutral,"Underrate: underestimate the extent, value, or...",1707786000.0,eewq2do,train
3,"[101, 2197, 2724, 2253, 2092, 1010, 2871, 1009...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",surprise,"Last event went well, 40+ signups. Looking for...",1707786000.0,edutjm3,train
4,"[101, 1045, 3246, 2017, 1998, 2115, 2128, 7559...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",optimistic,I hope you and your retarded phone have a nice...,1707786000.0,ee7xsxe,train


In [44]:
# Build the feature group name from a fixed prefix plus the current UTC day-of-month and time
creation_stamp = strftime("%d-%H-%M-%S", gmtime())
emotion_feature_group_name = f"emotion-feature-group-{creation_stamp}"

In [45]:
# Feature group handle bound to the generated name and the Feature Store session;
# the group itself is created by the later emotion_feature_group.create(...) call
emotion_feature_group = FeatureGroup(
    name=emotion_feature_group_name, sagemaker_session=feature_store_session
)

In [46]:
# Current Unix time rounded to whole seconds (round() on a float already returns an int)
current_time_sec = round(time.time())


def cast_object_to_string(data_frame):
    """Convert every object-dtype column of ``data_frame`` to pandas ``string`` dtype.

    The frame is modified in place and nothing is returned. Each value is first
    stringified with ``astype("str")`` and the column is then cast to ``"string"``.
    Columns of any other dtype are left untouched.
    """
    object_columns = [
        column for column in data_frame.columns if data_frame.dtypes[column] == "object"
    ]
    for column in object_columns:
        as_text = data_frame[column].astype("str")
        data_frame[column] = as_text.astype("string")


# Cast object dtype to string. The SageMaker FeatureStore Python SDK will then map the string dtype to String feature type.
cast_object_to_string(df_combined)

# Record identifier and event time feature names
record_identifier_feature_name = "id"
event_time_feature_name = "EventTime"

# Append the EventTime feature. Assigning a scalar broadcasts it to every row as float64
# whatever df_combined's index is; a freshly built Series would be aligned on the index
# and silently produce NaN for any row whose label is not in 0..n-1.
df_combined[event_time_feature_name] = float(current_time_sec)

# Load feature definitions to the feature group. SageMaker FeatureStore Python SDK will auto-detect the data schema based on input data.
emotion_feature_group.load_feature_definitions(data_frame=df_combined)

[FeatureDefinition(feature_name='input_ids', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='attention_mask', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='emotions', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='text', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='EventTime', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>),
 FeatureDefinition(feature_name='id', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='data_type', feature_type=<FeatureTypeEnum.STRING: 'String'>)]

In [47]:
def wait_for_feature_group_creation_complete(feature_group, poll_interval_sec=5, timeout_sec=600):
    """Block until ``feature_group`` is no longer in the "Creating" status.

    Args:
        feature_group: object exposing ``describe()`` (returning a mapping that
            contains "FeatureGroupStatus") and a ``name`` attribute.
        poll_interval_sec: seconds to sleep between two status checks.
        timeout_sec: maximum number of seconds to keep polling; ``None`` polls
            without a time limit.

    Raises:
        TimeoutError: the status is still "Creating" once ``timeout_sec`` elapsed.
        RuntimeError: polling ended with any status other than "Created".
    """
    deadline = None if timeout_sec is None else time.monotonic() + timeout_sec

    description = feature_group.describe()
    status = description.get("FeatureGroupStatus")
    while status == "Creating":
        # Bail out instead of polling forever if creation never leaves "Creating"
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f"Timed out after {timeout_sec}s waiting for feature group {feature_group.name}"
            )
        print("Waiting for Feature Group Creation")
        time.sleep(poll_interval_sec)
        description = feature_group.describe()
        status = description.get("FeatureGroupStatus")

    if status != "Created":
        # Surface the final status and, when present, the reported failure reason
        failure_reason = description.get("FailureReason")
        raise RuntimeError(
            f"Failed to create feature group {feature_group.name}"
            f" (status: {status}, reason: {failure_reason})"
        )
    print(f"FeatureGroup {feature_group.name} successfully created.")


# Create the feature group: s3_uri points at s3://<bucket_name>/<prefix>, "id" is the record
# identifier, "EventTime" the event-time feature, and the online store is enabled.
emotion_feature_group.create(
    s3_uri=f"s3://{bucket_name}/{prefix}",
    record_identifier_name=record_identifier_feature_name,
    event_time_feature_name=event_time_feature_name,
    role_arn=role,
    enable_online_store=True,
)

# Poll until the group leaves the "Creating" status (raises if it does not end up "Created")
wait_for_feature_group_creation_complete(feature_group=emotion_feature_group)

Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
FeatureGroup emotion-feature-group-13-03-13-24 successfully created.


In [48]:
# Show the feature group's description (name, feature definitions, store configuration)
emotion_feature_group.describe()

{'FeatureGroupArn': 'arn:aws:sagemaker:us-west-2:542526735114:feature-group/emotion-feature-group-13-03-13-24',
 'FeatureGroupName': 'emotion-feature-group-13-03-13-24',
 'RecordIdentifierFeatureName': 'id',
 'EventTimeFeatureName': 'EventTime',
 'FeatureDefinitions': [{'FeatureName': 'input_ids', 'FeatureType': 'String'},
  {'FeatureName': 'attention_mask', 'FeatureType': 'String'},
  {'FeatureName': 'emotions', 'FeatureType': 'String'},
  {'FeatureName': 'text', 'FeatureType': 'String'},
  {'FeatureName': 'EventTime', 'FeatureType': 'Fractional'},
  {'FeatureName': 'id', 'FeatureType': 'String'},
  {'FeatureName': 'data_type', 'FeatureType': 'String'}],
 'CreationTime': datetime.datetime(2024, 2, 13, 3, 13, 48, 411000, tzinfo=tzlocal()),
 'OnlineStoreConfig': {'EnableOnlineStore': True},
 'OfflineStoreConfig': {'S3StorageConfig': {'S3Uri': 's3://aai-540-final-data/sagemaker-featurestore-GOEmotions',
   'ResolvedOutputS3Uri': 's3://aai-540-final-data/sagemaker-featurestore-GOEmotions/

In [49]:
# List the feature groups returned by the SageMaker client
sagemaker_client.list_feature_groups()

{'FeatureGroupSummaries': [{'FeatureGroupName': 'neighborhood-feature-group',
   'FeatureGroupArn': 'arn:aws:sagemaker:us-west-2:542526735114:feature-group/neighborhood-feature-group',
   'CreationTime': datetime.datetime(2024, 1, 30, 3, 43, 51, 828000, tzinfo=tzlocal()),
   'FeatureGroupStatus': 'Created',
   'OfflineStoreStatus': {'Status': 'Active'}},
  {'FeatureGroupName': 'emotion-feature-group-13-03-13-24',
   'FeatureGroupArn': 'arn:aws:sagemaker:us-west-2:542526735114:feature-group/emotion-feature-group-13-03-13-24',
   'CreationTime': datetime.datetime(2024, 2, 13, 3, 13, 48, 411000, tzinfo=tzlocal()),
   'FeatureGroupStatus': 'Created'},
  {'FeatureGroupName': 'emotion-feature-group-13-00-59-24',
   'FeatureGroupArn': 'arn:aws:sagemaker:us-west-2:542526735114:feature-group/emotion-feature-group-13-00-59-24',
   'CreationTime': datetime.datetime(2024, 2, 13, 1, 2, 14, 652000, tzinfo=tzlocal()),
   'FeatureGroupStatus': 'Created',
   'OfflineStoreStatus': {'Status': 'Active'}},

In [50]:
# Ingest the rows of df_combined into the feature group with max_workers=3;
# wait=True asks the SDK to block until ingestion has finished
emotion_feature_group.ingest(data_frame=df_combined, max_workers=3, wait=True)

IngestionManagerPandas(feature_group_name='emotion-feature-group-13-03-13-24', sagemaker_fs_runtime_client_config=<botocore.config.Config object at 0x7fc1b306b8b0>, max_workers=3, max_processes=1, profile_name=None, _async_result=<multiprocess.pool.MapResult object at 0x7fc1a64bf220>, _processing_pool=<pool ProcessPool(ncpus=1)>, _failed_indices=[])

In [51]:
# Account id of the caller, printed for reference. The STS client is created from the
# notebook's shared boto3 session so it uses the same region as the other clients.
account_id = session.client("sts").get_caller_identity()["Account"]
print(account_id)

# S3 URI the offline store resolves to, as reported by the feature group description
emotion_feature_group_resolved_output_s3_uri = (
    emotion_feature_group.describe()
    .get("OfflineStoreConfig")
    .get("S3StorageConfig")
    .get("ResolvedOutputS3Uri")
)

# Strip the "s3://<bucket>/" part to obtain the key prefix to list under
emotion_feature_group_s3_prefix = emotion_feature_group_resolved_output_s3_uri.replace(
    f"s3://{bucket_name}/", ""
)

# Poll once a minute until more than one object exists under the prefix, but give up
# after OFFLINE_STORE_TIMEOUT_SEC instead of looping forever if nothing ever shows up.
OFFLINE_STORE_POLL_INTERVAL_SEC = 60
OFFLINE_STORE_TIMEOUT_SEC = 30 * 60
offline_store_deadline = time.monotonic() + OFFLINE_STORE_TIMEOUT_SEC

offline_store_contents = None
while offline_store_contents is None:
    # list_objects_v2 is the current listing API; its response uses the same "Contents" key
    objects_in_bucket = s3.list_objects_v2(
        Bucket=bucket_name, Prefix=emotion_feature_group_s3_prefix
    )
    if len(objects_in_bucket.get("Contents", [])) > 1:
        offline_store_contents = objects_in_bucket["Contents"]
    elif time.monotonic() >= offline_store_deadline:
        raise TimeoutError(
            f"No data under s3://{bucket_name}/{emotion_feature_group_s3_prefix} "
            f"after {OFFLINE_STORE_TIMEOUT_SEC}s"
        )
    else:
        print("Waiting for data in offline store...\n")
        sleep(OFFLINE_STORE_POLL_INTERVAL_SEC)

print("Data available.")

542526735114
Data available.
