# Feature Transformation with Hugging Face Transformers In This Notebook
## Saving Features into the SageMaker Feature Store

In this notebook, we convert raw text into BERT embeddings.  This will allow us to perform natural language processing tasks such as text classification. We save the features into the SageMaker Feature Store.


![](https://github.com/data-science-on-aws/data-science-on-aws/blob/quickstart/00_quickstart/img/prepare_dataset_bert.png?raw=true)

# BERT Mania!

![BERT Mania](https://github.com/data-science-on-aws/data-science-on-aws/blob/quickstart/00_quickstart/img/bert_mania.png?raw=true)

# Understand BERT Embeddings

* Bidirectional Encoder Representations from Transformers [BERT](https://arxiv.org/abs/1810.04805)
* For more details on Transformers Architecture, see [Attention Is All You Need](https://arxiv.org/abs/1706.03762).

<img src="https://github.com/data-science-on-aws/data-science-on-aws/blob/quickstart/00_quickstart/img/bert_embeddings.png?raw=true" width="60%" align="left">

<img src="https://github.com/data-science-on-aws/data-science-on-aws/blob/quickstart/00_quickstart/img/bert_input_features.png?raw=true" width="80%" align="left">

* **input_ids**: 
The id from the pre-trained BERT vocabulary that represents the token. (Padding of 0 will be used if the # of tokens is less than max_seq_length)

* **input_mask**: 
Specifies which tokens BERT should pay attention to (0 or 1). Padded input_ids will have 0 in each of these vector elements.

* **segment_ids**: 
Segment ids are always 0 for single-sequence tasks such as text classification. 1 is used for two-sequence tasks such as question/answer and next sentence prediction.
  
* **label_id**: 
Label for each training row (star_rating 1 through 5)

In [2]:
import sagemaker
import boto3

# SageMaker session; passed to the FeatureGroup constructor further down.
sess = sagemaker.Session()
# NOTE(review): the bucket is hard-coded instead of using the session's default
# bucket (kept below, commented out). The saved output of the
# `feature_group.create(...)` cell in this notebook shows an S3 AccessDenied error
# for the offline-store URI built from this bucket -- confirm that the execution
# role can access it, or switch back to the default bucket.
# bucket = sess.default_bucket()
bucket = 'data-science-on-aws-applied'
# Region name of the default boto3 session.
# NOTE(review): `region` is not referenced anywhere else in the visible notebook.
region = boto3.Session().region_name

import botocore.config

# botocore client configuration that adds 'dsoaws/1.0' as extra user-agent text.
# NOTE(review): `config` is not referenced anywhere else in the visible notebook.
config = botocore.config.Config(
    user_agent_extra='dsoaws/1.0'
)

In [3]:
%store -r role

# Define Maximum Sequence Length for BERT
Maximum sequence length is chosen based on the number-of-word distribution for the review text.
![](img/max_seq_length_viz.png)

In [4]:
# Fixed length of every BERT input vector: handed to the tokenizer as `max_length`
# (with padding and truncation enabled) and used to size `segment_ids` in `convert_input`.
max_seq_length = 64

# Convert Raw Text to BERT Features using Hugging Face and TensorFlow

In [5]:
import tensorflow as tf
import collections
import json
import os
import pandas as pd
import csv
from transformers import DistilBertTokenizer

# Load the tokenizer of the pre-trained "distilbert-base-uncased" checkpoint.
# `convert_input` reads this module-level `tokenizer` directly.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# DataFrame columns holding the record identifier and the raw review text.
REVIEW_ID_COLUMN = "review_id"
REVIEW_BODY_COLUMN = "review_body"

# DataFrame column holding the star rating, and the ratings it may contain.
LABEL_COLUMN = "star_rating"
LABEL_VALUES = [1, 2, 3, 4, 5]

# Star rating -> zero-based class index (1 -> 0, 2 -> 1, ..., 5 -> 4).
label_map = {rating: index for index, rating in enumerate(LABEL_VALUES)}


class InputFeatures:
    """Plain value holder for the BERT feature vectors of one review.

    Every constructor argument is stored unchanged under an attribute of the
    same name.

    Args:
      input_ids: token ids of the encoded text.
      input_mask: attention mask matching `input_ids`.
      segment_ids: segment ids matching `input_ids`.
      label_id: class index of the label.
      review_id: identifier of the review the features belong to.
      date: event time associated with the review.
      label: original (un-mapped) label value.
    """

    def __init__(self, input_ids, input_mask, segment_ids, label_id, review_id, date, label):
        # The three vectors that are written to the TFRecord file.
        self.input_ids, self.input_mask, self.segment_ids = input_ids, input_mask, segment_ids
        # Label information: mapped class index and the raw label.
        self.label_id, self.label = label_id, label
        # Record metadata used for the Feature Store.
        self.review_id, self.date = review_id, date


class Input:
    """One raw example (text plus metadata) for sequence classification."""

    def __init__(self, text, review_id, date, label=None):
        """Store the raw fields of a single example.

        Args:
          text: string. The untokenized text of the sequence to classify.
          review_id: identifier of the review the text comes from.
          date: event time attached to the example.
          label: (Optional) label of the example; defaults to None when no
            label is supplied.
        """
        self.label = label
        self.date = date
        self.review_id = review_id
        self.text = text


def convert_input(the_input, max_seq_length):
    """Encode one input into fixed-length BERT feature vectors.

    The module-level `tokenizer` produces `input_ids` and the attention mask
    (padded / truncated to `max_seq_length`), `segment_ids` is all zeros, and
    the label is mapped to a class index through the module-level `label_map`.
    Everything that is produced is also printed for inspection.

    Args:
      the_input: object exposing `text`, `label`, `review_id` and `date`.
      max_seq_length: length of the generated feature vectors.

    Returns:
      An `InputFeatures` instance holding the vectors, the mapped label id and
      the input's `review_id`, `date` and `label`.

    Raises:
      KeyError: if `the_input.label` is not a key of `label_map`.
    """
    # Display only: show the word pieces (with the special tokens added by hand).
    # This list is not truncated and is not used to build the features below.
    word_pieces = ['[CLS]'] + tokenizer.tokenize(the_input.text) + ['[SEP]']
    print("**{} tokens**\n{}\n".format(len(word_pieces), word_pieces))

    # The tokenizer lowercases, tokenizes, splits into word pieces, and
    # pads / truncates to exactly `max_seq_length` entries.
    encoded = tokenizer.encode_plus(
        the_input.text,
        max_length=max_seq_length,
        padding='max_length',
        truncation=True,
    )

    features = InputFeatures(
        # Vocabulary ids of the tokens.
        input_ids=encoded["input_ids"],
        # Attention mask returned by the tokenizer alongside the ids.
        input_mask=encoded["attention_mask"],
        # Single-sequence task: every segment id is 0.
        segment_ids=[0] * max_seq_length,
        # Class index of the star rating.
        label_id=label_map[the_input.label],
        review_id=the_input.review_id,
        date=the_input.date,
        label=the_input.label,
    )

    # Echo every generated feature so the demo output can be inspected.
    print("**{} input_ids**\n{}\n".format(len(features.input_ids), features.input_ids))
    print("**{} input_mask**\n{}\n".format(len(features.input_mask), features.input_mask))
    print("**{} segment_ids**\n{}\n".format(len(features.segment_ids), features.segment_ids))
    print("**label_id**\n{}\n".format(features.label_id))
    print("**review_id**\n{}\n".format(features.review_id))
    print("**date**\n{}\n".format(features.date))
    print("**label**\n{}\n".format(features.label))

    return features


# We'll need to transform our data into a format that BERT understands.
# - `text` is the text we want to classify, which in this case is the `review_body` column of our DataFrame.
# - `label` is the star_rating label (1, 2, 3, 4, 5) for our training input data
def transform_inputs_to_tfrecord(inputs, output_file, max_seq_length):
    """Convert inputs to BERT features, write them to a TFRecord file, and return them.

    Each input is passed through `convert_input`. Its `input_ids`,
    `input_mask`, `segment_ids` and label id are serialized as one
    `tf.train.Example` into `output_file`, and a dict with all features is
    collected for later ingestion into the Feature Store.

    Args:
      inputs: sized iterable of objects accepted by `convert_input`.
      output_file: path of the TFRecord file to write. Missing parent
        directories are created first.
      max_seq_length: forwarded to `convert_input`.

    Returns:
      A list with one dict per input, keyed by `input_ids`, `input_mask`,
      `segment_ids`, `label_id`, `review_id`, `date` and `label`.
    """
    # The writer cannot open a file inside a directory that does not exist yet.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        tf.io.gfile.makedirs(output_dir)

    records = []

    # The context manager closes (and flushes) the file even if a conversion fails.
    with tf.io.TFRecordWriter(output_file) as tf_record_writer:
        for (input_idx, the_input) in enumerate(inputs):
            if input_idx % 10000 == 0:
                print("Writing input {} of {}\n".format(input_idx, len(inputs)))

            features = convert_input(the_input, max_seq_length)

            all_features = collections.OrderedDict()

            # Create TFRecord With input_ids, input_mask, segment_ids, and label_ids
            all_features["input_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_ids))
            all_features["input_mask"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_mask))
            all_features["segment_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.segment_ids))
            all_features["label_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=[features.label_id]))

            tf_record = tf.train.Example(features=tf.train.Features(feature=all_features))
            tf_record_writer.write(tf_record.SerializeToString())

            # Create Record For Feature Store With All Features
            records.append(
                {
                    "input_ids": features.input_ids,
                    "input_mask": features.input_mask,
                    "segment_ids": features.segment_ids,
                    "label_id": features.label_id,
                    "review_id": the_input.review_id,
                    "date": the_input.date,
                    "label": features.label,
                }
            )

    return records

2023-01-12 21:17:56.009328: W tensorflow/stream_executor/platform/default/dso_loader.cc:59] Could not load dynamic library 'libcudart.so.10.1'; dlerror: libcudart.so.10.1: cannot open shared object file: No such file or directory
2023-01-12 21:17:56.009946: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Three(3) feature vectors are created from each raw review (`review_body`) during the feature engineering phase to prepare for BERT processing:

* **`input_ids`**:  The id from the pre-trained BERT vocabulary that represents the token.  (Padding of 0 will be used if the # of tokens is less than `max_seq_length`)
    
* **`input_mask`**:  Specifies which tokens BERT should pay attention to (0 or 1).  Padded `input_ids` will have 0 in each of these vector elements.

* **`segment_ids`**:  Segment ids are always 0 for single-sequence tasks such as text classification.  1 is used for two-sequence tasks such as question/answer and next sentence prediction.

And one(1) label is created from each raw review (`star_rating`)  :

* **`label_id`**:  Label for each training row (`star_rating` 1 through 5)

# Demonstrate the BERT-specific Feature Engineering Step
While we are demonstrating this code with a small amount of data here in the notebook, we will soon scale this to much more data on a powerful SageMaker cluster.

## Feature Store requires an Event Time feature

We need a record identifier name and an event time feature name. This will match the column of the corresponding features in our data. 

Note: The event time feature must be of type Fractional (Unix timestamp in seconds) or String (ISO-8601 format). An Integral event time feature is rejected.

In [6]:
from datetime import datetime
from time import strftime

from datetime import timezone

# Event time for the sample records as an ISO-8601 string. The trailing "Z"
# denotes UTC, so the clock must be read in UTC rather than in local time.
timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
print(timestamp)

2023-01-12T21:20:25Z


In [7]:
import pandas as pd

# Three hand-written sample reviews; each row is [star_rating, review_id, review_body].
data = [
    [
        5,
        "ABCD12345",
        """I needed an "antivirus" application and know the quality of Norton products.  This was a no brainer for me and I am glad it was so simple to get.""",
    ],
    [
        3,
        "EFGH12345",
        """The problem with ElephantDrive is that it requires the use of Java. Since Java is notorious for security problems I haveit removed from all of my computers. What files I do have stored are photos.""",
    ],
    [
        1,
        "IJKL2345",
        """Terrible, none of my codes worked, and I can't uninstall it.  I think this product IS malware and viruses""",
    ],
]

df = pd.DataFrame(data, columns=["star_rating", "review_id", "review_body"])

# Wrap every DataFrame row in the `Input` class defined earlier in this notebook.
# All rows receive the same `timestamp` as their `date` (the event time).
inputs = df.apply(
    lambda x: Input(label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=timestamp),
    axis=1,
)

In [8]:
# Spot-check the `date` of the first input: it must be an ISO-8601 string, because it
# is used as the String-typed event time feature of the Feature Store.
print(inputs[0].date)

2023-01-12T21:20:25Z


## Save TFRecords

The three (3) feature vectors and one (1) label are converted into a list of `TFRecord` instances (one per row of training data):
* **`tf_records`**:  Binary representation of each row of training data (3 features + 1 label)

These `TFRecord`s are the engineered features that we will use throughout the rest of the pipeline.

In [9]:
# Local path of the TFRecord file; passed to `transform_inputs_to_tfrecord` further down.
output_file = "./data-tfrecord-featurestore/data.tfrecord"

# Add Features to SageMaker Feature Store

## Create FeatureGroup

A feature group is a logical grouping of features, defined in the Feature Store, to describe records. A feature group definition is composed of a list of feature definitions, a record identifier name, and configurations for its online and offline store.

Feature groups are managed through the create, describe, update, delete, and list feature group APIs.


In [10]:
from time import gmtime, strftime, sleep

# Suffix the name with the current UTC day-hour-minute-second so repeated runs get
# distinct names.
# NOTE(review): the suffix has no month or year, so names can repeat across months.
feature_group_name = "reviews-feature-group-" + strftime("%d-%H-%M-%S", gmtime())
print(feature_group_name)

reviews-feature-group-12-21-20-56


In [11]:
from sagemaker.feature_store.feature_definition import (
    FeatureDefinition,
    FeatureTypeEnum,
)

# Schema of the feature group. The three vector features are declared as STRING:
# they are Python lists in the records and are cast to strings by
# `cast_object_to_string` before ingestion. `split_type` is not produced by
# `transform_inputs_to_tfrecord`; it is added to the DataFrame just before ingestion.
feature_definitions = [
    FeatureDefinition(feature_name="input_ids", feature_type=FeatureTypeEnum.STRING),
    FeatureDefinition(feature_name="input_mask", feature_type=FeatureTypeEnum.STRING),
    FeatureDefinition(feature_name="segment_ids", feature_type=FeatureTypeEnum.STRING),
    FeatureDefinition(feature_name="label_id", feature_type=FeatureTypeEnum.INTEGRAL),
    FeatureDefinition(feature_name="review_id", feature_type=FeatureTypeEnum.STRING),
    FeatureDefinition(feature_name="date", feature_type=FeatureTypeEnum.STRING),
    FeatureDefinition(feature_name="label", feature_type=FeatureTypeEnum.INTEGRAL),
    FeatureDefinition(feature_name="split_type", feature_type=FeatureTypeEnum.STRING),
]

In [12]:
from sagemaker.feature_store.feature_group import FeatureGroup

# Client-side FeatureGroup object built from the name, schema and session defined above.
# The `create(...)` call that actually creates the group is made in a later cell.
feature_group = FeatureGroup(name=feature_group_name, feature_definitions=feature_definitions, sagemaker_session=sess)
print(feature_group)

FeatureGroup(name='reviews-feature-group-12-21-20-56', sagemaker_session=<sagemaker.session.Session object at 0x7f1ce041f460>, feature_definitions=[FeatureDefinition(feature_name='input_ids', feature_type=<FeatureTypeEnum.STRING: 'String'>), FeatureDefinition(feature_name='input_mask', feature_type=<FeatureTypeEnum.STRING: 'String'>), FeatureDefinition(feature_name='segment_ids', feature_type=<FeatureTypeEnum.STRING: 'String'>), FeatureDefinition(feature_name='label_id', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>), FeatureDefinition(feature_name='review_id', feature_type=<FeatureTypeEnum.STRING: 'String'>), FeatureDefinition(feature_name='date', feature_type=<FeatureTypeEnum.STRING: 'String'>), FeatureDefinition(feature_name='label', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>), FeatureDefinition(feature_name='split_type', feature_type=<FeatureTypeEnum.STRING: 'String'>)])


## Specify `record identifier` and `event time` features

In [13]:
# Both names must match entries of `feature_definitions`: `review_id` identifies a
# record and `date` (a String feature holding an ISO-8601 timestamp) is its event time.
record_identifier_feature_name = "review_id"
event_time_feature_name = "date"

## Set S3 Prefix for Offline Feature Store

In [14]:
# S3 key prefix for the offline store and the Athena query results.
# NOTE(review): `timestamp` contains ':' characters, which therefore end up in the S3 key.
prefix = "reviews-feature-store-" + timestamp
print(prefix)

reviews-feature-store-2023-01-12T21:20:25Z


## Create Feature Group

The last step for creating the feature group is to use the `create` function. The online store is not created by default, so we must set `enable_online_store` to `True` if we want to enable it; this notebook leaves it `False` and only uses the offline store. The `s3_uri` is the location of our offline store.

# Barrier

Need Data Wrangler permissions if you want to run this

In [15]:
# Create the feature group with an offline store under s3://{bucket}/{prefix}.
# The online store is explicitly disabled. `role` is restored with `%store -r role`
# at the top of the notebook.
feature_group.create(
    s3_uri=f"s3://{bucket}/{prefix}",
    record_identifier_name=record_identifier_feature_name,
    event_time_feature_name=event_time_feature_name,
    role_arn=role,
    enable_online_store=False,
)

ClientError: An error occurred (ValidationException) when calling the CreateFeatureGroup operation: Invalid S3Uri provided. Exception from S3: {Access Denied (Service: Amazon S3; Status Code: 403; Error Code: AccessDenied; Request ID: DV4NXH92YB3PMFWN; S3 Extended Request ID: 1dFL6N3oIzZvFNOZqjfBNkKcctyF7PMMlxfiIsHAdClbvfNIyDWYrLDJpfuaqcXWA8S0xmhLuvg=; Proxy: null)}. Please ensure that the OfflineStore S3 bucket exists and that the given RoleArn has the 'AmazonSageMakerFeatureStoreAccess' managed policy attached, with access to the bucket and objects in the bucket, with the principal 'sagemaker.amazonaws.com' as a trusted entity. If a KMS Key is provided for the offline store, please ensure that the RoleArn has 'kms:GenerateDataKey' permission and has access to the KMS Key.

## Describe the Feature Group

In [None]:
# Display the description returned for the feature group (includes `FeatureGroupStatus`).
feature_group.describe()

## Review The Records To Ingest Into Feature Store

In [None]:
# Convert the sample inputs: writes the TFRecord file and returns one dict per input
# for ingestion into the Feature Store.
records = transform_inputs_to_tfrecord(inputs, output_file, max_seq_length)

# _IGNORE THE WARNING ^^ ABOVE ^^_

## Wait For The Feature Group Creation To Complete

## _Note:  This may take a few minutes.  Please be patient._

Creating a feature group takes time as the data is loaded. We need to wait until it is created before we can use it. We can check the status using the following function.

In [None]:
import time


def wait_for_feature_group_creation_complete(feature_group):
    """Block until the feature group has left the "Creating" state.

    The status is polled every 5 seconds through `feature_group.describe()`.

    Args:
      feature_group: object exposing `describe()` (returning a dict with a
        "FeatureGroupStatus" entry) and a `name` attribute.

    Raises:
      RuntimeError: if the final status is anything other than "Created".
    """

    def current_status():
        # Single place that reads the status field from the describe response.
        return feature_group.describe().get("FeatureGroupStatus")

    state = current_status()
    while state == "Creating":
        print("Waiting for Feature Group Creation")
        time.sleep(5)
        state = current_status()

    if state == "Created":
        print(f"FeatureGroup {feature_group.name} successfully created.")
        return

    raise RuntimeError(f"Failed to create feature group {feature_group.name}")

In [None]:
# Blocks while the status is "Creating"; raises RuntimeError unless it ends as "Created".
wait_for_feature_group_creation_complete(feature_group=feature_group)

# Ingest Records into Feature Store

After the FeatureGroups have been created, we can put data into the FeatureGroups by using the `PutRecord` API. 

This API can handle high TPS and is designed to be called by different streams. The data from all of these Put requests is buffered and written to S3 in chunks. 

The files will be written to the offline store within a few minutes of ingestion. To accelerate the ingestion process, we can specify multiple workers to do the job simultaneously. 

Use `put_record(...)` to put a single record in the FeatureGroup.

Use `ingest(...)` to ingest the content of a pandas DataFrame into the Feature Store. You can set `max_workers` to the number of threads to be created to work on different partitions of the `data_frame` in parallel.

In [None]:
import pandas as pd

# One row per record; every row is tagged with the constant split type "train"
# so that it matches the `split_type` feature definition.
df_records = pd.DataFrame.from_dict(records).assign(split_type="train")
df_records

# Cast DataFrame `Object` to Supported Feature Store Data Type `String`

In [None]:
def cast_object_to_string(data_frame):
    """Convert every `object` column of `data_frame` to the pandas `string` dtype.

    The frame is modified in place and nothing is returned. Values are first
    turned into their `str` form, then the column is cast to `string`.
    Columns of any other dtype are left untouched.

    Args:
      data_frame: pandas DataFrame to convert in place.
    """
    for column_name in data_frame.columns:
        is_object_column = data_frame.dtypes[column_name] == "object"
        if not is_object_column:
            continue
        as_text = data_frame[column_name].astype("str")
        data_frame[column_name] = as_text.astype("string")

In [None]:
%%time

# Convert the object columns (the list-valued vectors and the plain strings) to the
# `string` dtype in place before ingestion.
cast_object_to_string(df_records)

# Ingest the DataFrame using 3 workers and wait for the ingestion to finish.
feature_group.ingest(data_frame=df_records, max_workers=3, wait=True)

# Wait For Feature Store To Become Active
## _Note:  This may take a few minutes.  Please be patient._

In [None]:
# Poll until the describe response reports an `OfflineStoreStatus` entry.
feature_store_describe_response = feature_group.describe()

while "OfflineStoreStatus" not in feature_store_describe_response:
    print("[INFO] Waiting for OfflineStore to be created.")
    # Sleep *before* asking again, so that the loop exits right after the first
    # response that contains the status instead of waiting one more interval.
    sleep(120)
    feature_store_describe_response = feature_group.describe()

print("Offline store created.")

In [None]:
# Poll the offline store until it reports 'Active'.
offline_store_status = None

while offline_store_status != 'Active':
    # `OfflineStoreStatus` can be missing from the response while the store is
    # still being set up; a missing entry is treated as "not ready yet". Any other
    # error (and a kernel interrupt) propagates instead of being silently swallowed.
    offline_store_status = feature_group.describe().get('OfflineStoreStatus', {}).get('Status')

    # These statuses never turn into 'Active' by waiting, so stop instead of looping forever.
    if offline_store_status in ('Blocked', 'Disabled'):
        raise RuntimeError('Offline store cannot become active, status: {}'.format(offline_store_status))

    if offline_store_status != 'Active':
        # Pause between polls rather than calling the API in a tight loop.
        sleep(30)
print('Offline store status: {}'.format(offline_store_status))

# Query the Feature Store

In [None]:
# Athena query helper for this feature group; it exposes the name of the table
# that backs the offline store.
feature_store_query = feature_group.athena_query()

feature_store_table = feature_store_query.table_name

# Select every feature of up to three records of the 'train' split. The table name is
# substituted into the quoted "{}" placeholder.
query_string = """
    SELECT 
        input_ids,
        input_mask,
        segment_ids, 
        label_id,
        review_id,
        date,
        label,
        split_type
    FROM "{}" 
    WHERE split_type='train' 
    LIMIT 3
""".format(feature_store_table)

print('Glue Catalog table name: {}'.format(feature_store_table))
print('Running query: {}'.format(query_string))

In [None]:
# S3 location for the query results: same bucket and prefix as the offline store,
# under the `query_results/` folder.
output_s3_uri = 's3://{}/query_results/{}/'.format(bucket, prefix)
print(output_s3_uri)

In [None]:
# Start the query, writing its results to `output_s3_uri` ...
feature_store_query.run(
    query_string=query_string, 
    output_location=output_s3_uri
)

# ... and block until it has finished.
feature_store_query.wait()

In [None]:
import pandas as pd
# Cap displayed column contents at 100 characters so the long stringified vectors stay readable.
pd.set_option("max_colwidth", 100)

# Load the query result into a DataFrame and display it.
df_feature_store = feature_store_query.as_dataframe()
df_feature_store

# Review the Feature Store

![Feature Store](img/feature_store_sm_extension.png)

# Release Resources

In [None]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>

<script>
// Click the hidden command button above. Its data-commandlinker-command attribute
// names the "kernelmenu:shutdown" command (presumably handled by the notebook front end).
try {
    // Block-scoped declaration: does not leak an implicit global into the page.
    const shutdownButtons = document.getElementsByClassName("sm-command-button");
    if (shutdownButtons.length > 0) {
        shutdownButtons[0].click();
    }
}
catch(err) {
    // NoOp
}
</script>