title: "Create training dataset from online feature store enabled feature groups"
date: 2021-04-25
type: technical_note
draft: false
---

### Establish a connection with your Hopsworks feature store.

In [1]:
import hsfs

# Open the connection; no arguments are passed, so hsfs defaults are used.
connection = hsfs.connection()

# Handle to the project's feature store. A shared feature store can be
# accessed instead by passing its name to get_feature_store().
fs = connection.get_feature_store()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log
6,application_1620032599856_0011,pyspark,idle,Link,Link


SparkSession available as 'spark'.
Connected. Call `.close()` to terminate connection gracefully.

## Get feature groups

In [2]:
# Look up version 1 of every feature group that feeds the training dataset:
# the three card-transaction aggregate groups (10m, 1h, 12h) and the labels.
card_transactions_10m_agg = fs.get_feature_group("card_transactions_10m_agg", version=1)
card_transactions_1h_agg = fs.get_feature_group("card_transactions_1h_agg", version=1)
card_transactions_12h_agg = fs.get_feature_group("card_transactions_12h_agg", version=1)
fraud_labels = fs.get_feature_group("fraud_labels", version=1)

## Create training dataset

In [3]:
# Join the selected features of the three aggregate feature groups with the
# label feature group. No join keys are given explicitly, so key selection is
# left to hsfs.
query = (
    card_transactions_10m_agg.select(["avg_amt_per_10m", "num_trans_per_10m"])
    .join(card_transactions_1h_agg.select(["avg_amt_per_1h", "num_trans_per_1h"]))
    .join(card_transactions_12h_agg.select(["avg_amt_per_12h", "num_trans_per_12h"]))
    .join(fraud_labels.select(["fraud_label"]))  # Label
)

In [4]:
# Preview the first 4 rows produced by the joined feature query.
query.show(4)

+---------------+-----------------+--------------+----------------+------------------+-----------------+-----------+
|avg_amt_per_10m|num_trans_per_10m|avg_amt_per_1h|num_trans_per_1h|   avg_amt_per_12h|num_trans_per_12h|fraud_label|
+---------------+-----------------+--------------+----------------+------------------+-----------------+-----------+
|          85.13|                1|          6.77|               1| 751.5399999999998|                9|          0|
|          88.89|                1|        42.875|               2|156.58642857142854|               14|          0|
|          87.44|                1|        634.81|               1|342.55333333333334|                6|          0|
|          56.75|                1|         71.31|               1| 60.95428571428571|                7|          0|
+---------------+-----------------+--------------+----------------+------------------+-----------------+-----------+
only showing top 4 rows

In [5]:
# Describe the training dataset: TFRecord files split 80/10/10 into
# train/test/validate, with `fraud_label` as the label column and statistics
# (histograms and correlations) enabled.
td_meta = fs.create_training_dataset(
    name="card_fraud_dataset",
    version=1,
    description="Training dataset to train card fraud model",
    data_format="tfrecords",
    splits={'train': 0.8, 'test': 0.1, 'validate': 0.1},
    # Fixed seed so the random split is reproducible when the notebook is re-run.
    seed=42,
    statistics_config={"enabled": True, "histograms": True, "correlations": True},
    label=["fraud_label"],
)
# Materialise the training dataset from the joined feature query.
td_meta.save(query)

<hsfs.training_dataset.TrainingDataset object at 0x7f321b06fe90>

In [6]:
# Preview 4 rows of the saved training dataset.
td_meta.show(4)

+---------------+---------------+----------------+--------------+-----------------+-----------+-----------------+
|avg_amt_per_10m|avg_amt_per_12h|num_trans_per_1h|avg_amt_per_1h|num_trans_per_12h|fraud_label|num_trans_per_10m|
+---------------+---------------+----------------+--------------+-----------------+-----------+-----------------+
|         440.27|      320.84332|               2|        515.22|               15|          0|                1|
|          676.7|        1313.98|               2|       1206.69|                7|          0|                1|
|         8866.1|        144.285|               1|          25.4|                6|          0|                1|
|          26.94|       967.8611|               3|        173.15|                9|          0|                1|
+---------------+---------------+----------------+--------------+-----------------+-----------+-----------------+
only showing top 4 rows

## Train a fraud detector

In [9]:
from hops import tensorboard    
from hops import model as hops_model

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Number of examples per training batch.
batch_size = 32
# Number of training epochs passed to model.fit().
num_epochs = 1

In [8]:
def build_model(num_features=6):
    """Build and compile the binary fraud classifier.

    The network has two 64-unit ReLU hidden layers followed by a single
    sigmoid output unit. It is compiled with binary cross-entropy loss, the
    Adam optimizer and an accuracy metric.

    Args:
        num_features: Number of input features per example. Defaults to 6,
            the number of feature columns selected for the training dataset
            in this notebook.

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    model = keras.Sequential([
        layers.Dense(64, activation='relu', input_shape=[num_features]),
        layers.Dense(64, activation='relu'),
        layers.Dense(1, activation='sigmoid'),
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

class PrintCallback(keras.callbacks.Callback):
    """Minimal progress indicator that prints one dot per finished epoch."""

    def on_epoch_end(self, epoch, logs=None):
        """Print a dot for the epoch, starting a new line every 100 epochs.

        Args:
            epoch: Index of the epoch that just ended, as passed by Keras.
            logs: Metrics dictionary supplied by Keras; unused here. Defaults
                to None to match the base ``Callback.on_epoch_end`` signature.
        """
        if epoch % 100 == 0:
            print('')
        print('.', end='')

In [10]:
# Build a feeder for the 'train' split with 'fraud_label' as the target.
train_input = td_meta.tf_data(target_name='fraud_label', split='train', is_training=True)
# Reuse the shared `batch_size` setting rather than repeating the literal 32,
# so the batch size is configured in exactly one place.
# NOTE(review): num_epochs is kept at 1 here; presumably it controls how often
# the dataset itself repeats, while the epoch count for training is given to
# model.fit() — confirm before tying it to the `num_epochs` setting.
train_input_processed = train_input.tf_record_dataset(process=True, batch_size=batch_size, num_epochs=1)

In [11]:
# Train model.
# The input is a dataset whose batching was configured when it was built, so
# `batch_size` is not passed to fit(): Keras documents that it must not be
# specified for dataset inputs.
model = build_model()
history = model.fit(
    train_input_processed,
    epochs=num_epochs,
    callbacks=[PrintCallback()],
    verbose=1,
)


.

In [12]:
# Serialize the trained model to a SavedModel directory; the export step
# below then copies it into the project's model registry.
export_path = 'Resources/CardFraudDetection/SavedModel'
tf.keras.models.save_model(
    model,
    export_path,
    overwrite=True,
    include_optimizer=True,
    save_format=None,
    signatures=None,
    options=None,
)

# Export the model, attaching the accuracy recorded for the last epoch.
final_accuracy = history.history['accuracy'][-1]
hops_model.export(export_path, 'card_fraud_detector', metrics={'accuracy': final_accuracy})

Started copying local path Resources/CardFraudDetection/SavedModel/saved_model.pb to hdfs path hdfs://rpc.namenode.service.consul:8020/Projects/card_fraud/Models/card_fraud_detector/1

Finished copying

Started copying local path Resources/CardFraudDetection/SavedModel/variables to hdfs path hdfs://rpc.namenode.service.consul:8020/Projects/card_fraud/Models/card_fraud_detector/1

Finished copying

Started copying local path Resources/CardFraudDetection/SavedModel/assets to hdfs path hdfs://rpc.namenode.service.consul:8020/Projects/card_fraud/Models/card_fraud_detector/1

Finished copying

Exported model card_fraud_detector as version 1 successfully.
Polling card_fraud_detector version 1 for model availability.
get model:/hopsworks-api/api/project/120/models/card_fraud_detector_1?filter_by=endpoint_id:120
Polling card_fraud_detector version 1 for model availability.
get model:/hopsworks-api/api/project/120/models/card_fraud_detector_1?filter_by=endpoint_id:120
Polling card_fraud_detecto