Version: 0.0.2  Updated date: 07/05/2024
Conda Environment : py-snowpark_df_ml_fs-1.15.0_v1

# Getting Started with Snowflake Feature Store
We will use this use case to show how the Snowflake Feature Store (and Model Registry) can be used to maintain and store features, retrieve them for training, and perform micro-batch inference.

In the development (TRAINING) environment we will
- create FeatureViews in the Feature Store that maintain the required customer-behaviour features.
- use these Features to train a model, and save the model in the Snowflake model-registry.
- plot the clusters for the trained model to verify them visually.

In the production (SERVING) environment we will
- re-create the FeatureViews on production data
- generate an Inference FeatureView that uses the saved model to perform incremental inference

# Feature Engineering & Model Training

In [None]:
%load_ext autoreload
%autoreload 2

#### Notebook Packages

In [None]:
# Python packages
import os
import json
import timeit

# SNOWFLAKE
# Snowpark
from snowflake.snowpark import Session, DataFrame, Window, WindowSpec

import snowflake.snowpark.functions as F
import snowflake.snowpark.types as T

# Snowflake Feature Store
from snowflake.ml.feature_store import (
    FeatureView,
    Entity)

# COMMON FUNCTIONS
from useful_fns import check_and_update, get_latest, create_ModelRegistry, create_FeatureStore, create_SF_Session, get_spine_df
from useful_fns import run_sql

### Setup Snowflake connection and database parameters

In [None]:
# Schemas
# Schema name passed to create_SF_Session in the next cell.
# NOTE(review): the variable is named "training" but holds 'SERVING' — confirm
# this is the environment this notebook is meant to run against.
tpcxai_training_schema     = 'SERVING'

In [None]:
# Open the Snowflake session through the shared helper from useful_fns.
# The helper's five return values are unpacked into the role, database, schema,
# session and warehouse variables used by the rest of the notebook; note that
# tpcxai_training_schema is rebound to whatever the helper returns.
fs_qs_role, tpcxai_database, tpcxai_training_schema, session, warehouse_env = create_SF_Session(tpcxai_training_schema)

## MODEL DEVELOPMENT
* Create Snowflake Model-Registry
* Create Snowflake Feature-Store
* Establish and Create CUSTOMER Entity in the development Snowflake FeatureStore
* Create Source Data references and perform basic data-cleansing
* Create & Run Preprocessing Function to create features
* Create FeatureView_Preprocess from Preprocess Dataframe SQL
* Create training data from FeatureView_Preprocess (asof join)
* Create & Fit Snowpark-ml pipeline 
* Save model in Model Registry
* 'Verify' and approve model
* Create new FeatureView_Model_Inference with Transforms UDF + KMeans model

In [None]:
# Schema used by the remaining cells of this notebook
tpcxai_schema = tpcxai_training_schema

# Model Registry handle, created/referenced via the shared helper.
# The third argument ('MODEL_1') is presumably the registry schema — verify in useful_fns.
mr = create_ModelRegistry(session, tpcxai_database, 'MODEL_1')

# Feature Store handle; the name passed to the helper is derived from the active schema
feature_store_schema = f"_{tpcxai_schema}_FEATURE_STORE"
fs = create_FeatureStore(session, tpcxai_database, feature_store_schema, warehouse_env)


 ### Create & Load Source Data

### Create Preprocessing FeatureView from Preprocess Dataframe (SQL)

In [None]:
# Name and version identifying the preprocessing FeatureView
ppd_fv_name, ppd_fv_version = "FV_UC01_PREPROCESS", "V_1"

# Fetch the already-registered FeatureView from the Feature Store
fv_uc01_preprocess = fs.get_feature_view(name=ppd_fv_name, version=ppd_fv_version)

# `spine` is a second name for the same FeatureView object
spine = fv_uc01_preprocess

In [None]:
# Retrieve a FeatureView instance from the Feature Store for use within Python
FV_UC01_PREPROCESS_V_1 = fs.get_feature_view(name=ppd_fv_name, version='V_1')

# Preview the first 10 rows of the dataframe backing the FeatureView
preprocess_feature_df = FV_UC01_PREPROCESS_V_1.feature_df
preprocess_feature_df.show(10)

### Create training data Dataset from FeatureView_Preprocess

In [None]:
from snowflake.ml.dataset import Dataset

# Fully qualified name of the training Dataset.
# NOTE(review): database and schema are hard-coded here rather than taken from
# tpcxai_database / tpcxai_schema — confirm this is intended.
training_dataset_name = 'TPCXAI_SF0001_QUICKSTART_INC._TRAINING_FEATURE_STORE.UC01_TRAINING'
ds = Dataset.load(session=session, name=training_dataset_name)

# Cell output: the Dataset's fully qualified name and its list of versions
ds.fully_qualified_name, ds.list_versions()

In [None]:
from snowflake.ml.dataset import load_dataset

# Load version 'V_1' of the training Dataset by its fully qualified name
training_dataset = load_dataset(
    session,
    'TPCXAI_SF0001_QUICKSTART_INC._TRAINING_FEATURE_STORE.UC01_TRAINING',
    'V_1',
)

# Expose the Dataset contents as a Snowpark DataFrame for the cells below
dataset_reader = training_dataset.read
training_dataset_sdf = dataset_reader.to_snowpark_dataframe()

### Fit Snowpark-ML Transforms & Model using Fileset training data

We need to fit the transformer over the training Fileset to ensure we are using the same input global values for transforming and training, and later inference with the model.

The transforms here are model-specific and persisted within the model-pipeline, and not stored in the Feature Store.

In [None]:
# Round each model column to 3 decimal places, in the same order as before
for _col_name in ("FREQUENCY", "RETURN_RATIO", "RETURN_ROW_PRICE"):
    training_dataset_sdf = training_dataset_sdf.with_column(_col_name, F.round(F.col(_col_name), 3))

# Keep only the three model columns
training_dataset_sdf = training_dataset_sdf.select(['RETURN_RATIO', 'FREQUENCY', 'RETURN_ROW_PRICE'])

# 60/40 random split; the fixed seed keeps the split reproducible
weights = [0.6, 0.4]
train_df, test_df = training_dataset_sdf.random_split(weights, seed=42)


In [None]:
# Look up the registered model and make its last-listed version the default
model_name = "MODEL_1.UC01_SNOWFLAKEML_RF_REGRESSOR_MODELSKLEARN"
m = mr.get_model(model_name)

# The 'name' of the final row of show_versions() is treated as the latest version.
# NOTE(review): this relies on the row order returned by show_versions() — confirm.
model_versions = m.show_versions()
latest_version = model_versions.iloc[-1]['name']

mv = m.version(latest_version)
m.default = latest_version

In [None]:
# Run the model version's "explain" function over the training split, give the
# two explanation columns upper-case names, and cap the result at 5000 rows.
explanations = (
    mv.run(train_df, function_name="explain")
    .with_column_renamed(F.col('"NUM__RETURN_RATIO_explanation"'), "RETURN_RATIO_EXPLANATION")
    .with_column_renamed(F.col('"NUM__FREQUENCY_explanation"'), "FREQUENCY_EXPLANATION")
    .limit(5000)
)

# Preview two rows of the result
explanations.show(2)

In [None]:
from snowflake.ml.monitoring.explain_visualize import plot_violin

# Explanation values for the two renamed explanation columns
explanation_values_sdf = explanations.select(['RETURN_RATIO_EXPLANATION', 'FREQUENCY_EXPLANATION'])
# The corresponding feature values
feature_values_sdf = explanations.select(["RETURN_RATIO", "FREQUENCY"])

# Third argument is presumably the figure size — see the plot_violin documentation
plot_violin(explanation_values_sdf, feature_values_sdf, (600, 150))

# Observability 

In [None]:
## Look up the registered Inference FeatureView and read its contents
inf_fvname, inf_fv_version = "FV_UC01_INFERENCE_RESULT", "V_1"

# Retrieve the FeatureView handle from the Feature Store
fv_uc01_inference_result = fs.get_feature_view(name=inf_fvname, version=inf_fv_version)

# Read the FeatureView as a dataframe; this is the input to the monitoring cells below
inference_input_sdf = fs.read_feature_view(fv_uc01_inference_result)

In [None]:
# Preview the rows read from the inference FeatureView
inference_input_sdf.show()

In [None]:
## Create & register the monitoring FeatureView over the inference results
# Feature Store handle used for monitoring objects (name 'MODEL_1' passed to the helper)
monitoring_fs = create_FeatureStore(session, tpcxai_database, f'''MODEL_1''', warehouse_env)

### ORDER Entity
# Collect the distinct names of the entities already registered in this Feature Store
registered_entity_names = json.loads(
    monitoring_fs.list_entities().select(F.to_json(F.array_agg("NAME", True))).collect()[0][0]
)
if "ORDER" not in registered_entity_names:
    # First run: define and register the entity keyed on O_CUSTOMER_SK
    customer_entity = Entity(name="ORDER", join_keys=["O_CUSTOMER_SK"], desc="Primary Key for CUSTOMER ORDER")
    monitoring_fs.register_entity(customer_entity)
else:
    customer_entity = monitoring_fs.get_entity("ORDER")

monitoring_fvname = "FV_UC01_MONITORING"
monitoring_fv_version = "V_1"
try:
    # Reuse the monitoring FeatureView when it has already been registered
    fv_uc01_monitoring_result = monitoring_fs.get_feature_view(
        name=monitoring_fvname, version=monitoring_fv_version
    )
except Exception:
    # The lookup failed, presumably because the FeatureView does not exist yet, so build
    # and register it. `except Exception` (not a bare `except`) lets KeyboardInterrupt
    # and SystemExit propagate.
    # NOTE(review): narrow this to the library's "not found" exception once confirmed.
    monitoring_fv_draft = FeatureView(
        name=monitoring_fvname,
        entities=[customer_entity],
        feature_df=inference_input_sdf,
        refresh_freq="60 minute",
        refresh_mode="INCREMENTAL",
        desc="Inference Result for monitoring")

    # Bind the registered view to the monitoring name, so the result is defined on both
    # paths and the inference FeatureView handle (fv_uc01_inference_result) is untouched.
    fv_uc01_monitoring_result = monitoring_fs.register_feature_view(
        feature_view=monitoring_fv_draft,
        version=monitoring_fv_version,
        block=True
    )

In [None]:
# List the models in the registry, then re-resolve the handle for the last-listed
# version of model `m` (no new version is created here).
mr_df = mr.show_models()
latest_version = m.show_versions()["name"].iloc[-1]
mv = m.version(latest_version)


In [None]:
# run_sql(f'''CREATE OR REPLACE TABLE {tpcxai_database}.{tpcxai_training_schema}.RETURN_PRICE_PRED
#     (ASOF_DATE DATE,
#      OR_PRODUCT_ID INTEGER,
#      OR_RETURN_QUANTITY INTEGER)
#      CLUSTER BY (OR_PRODUCT_ID, OR_ORDER_ID);
#     ''', session)

In [None]:
from snowflake.ml.monitoring.entities.model_monitor_config import ModelMonitorConfig, ModelMonitorSourceConfig

# Source object the monitor reads from: the monitoring FeatureView, version V_1
monitor_source = "TPCXAI_SF0001_QUICKSTART_INC.MODEL_1.FV_UC01_MONITORING$V_1"

# Describe which columns of the source carry the timestamp, row id,
# predicted score and actual score.
source_config = ModelMonitorSourceConfig(
    source=monitor_source,
    timestamp_column="LATEST_ORDER_DATE",
    id_columns=["O_CUSTOMER_SK"],
    prediction_score_columns=["OUTPUT_RETURN_ROW_PRICE"],
    actual_score_columns=["RETURN_ROW_PRICE"],
)

# Monitor settings: model version and function to monitor, the warehouse used
# for background compute, and the refresh / aggregation cadence.
monitor_warehouse = "COMPUTE_WH"
model_monitor_config = ModelMonitorConfig(
    model_version=mv,
    model_function_name="predict",
    background_compute_warehouse_name=monitor_warehouse,
    refresh_interval="1 hour",
    aggregation_window="1 day",
)

# Register the monitor in the Model Registry under the model's name
model_monitor = mr.add_monitor(
    name=model_name,
    source_config=source_config,
    model_monitor_config=model_monitor_config,
)

# Cell output: the created monitor object
model_monitor

In [None]:
# SQL route for creating a model monitor, issued as three statements via run_sql.
# NOTE(review): this statement targets UC01_SNOWFLAKEML_XGB_REGRESSOR_MODEL and names the
# monitor CHURN_MODEL_MONITOR, whereas the Python cells use an RF_REGRESSOR model name —
# confirm which model is intended.

# 1) Switch the session to the MODEL_1 schema
use_schema_sql = """
USE SCHEMA MODEL_1
"""
run_sql(use_schema_sql, session)

# 2) Grant the role usage on the warehouse named in the monitor definition
grant_warehouse_sql = """
GRANT USAGE ON WAREHOUSE COMPUTE_WH TO ROLE FS_QS_ROLE;
"""
run_sql(grant_warehouse_sql, session)

# 3) Create (or replace) the monitor over the monitoring FeatureView
query = """
CREATE OR REPLACE MODEL MONITOR CHURN_MODEL_MONITOR
WITH
    MODEL=UC01_SNOWFLAKEML_XGB_REGRESSOR_MODEL
    VERSION=V_1
    FUNCTION=predict
    SOURCE=FV_UC01_MONITORING$V_1
    TIMESTAMP_COLUMN=LATEST_ORDER_DATE
    PREDICTION_SCORE_COLUMNS=(OUTPUT_RETURN_ROW_PRICE)  
    ACTUAL_SCORE_COLUMNS=(RETURN_ROW_PRICE)
    ID_COLUMNS=(O_CUSTOMER_SK)
    WAREHOUSE=COMPUTE_WH
    REFRESH_INTERVAL='1 hour'
    AGGREGATION_WINDOW='1 day';
"""
run_sql(query, session)

## CLEAN UP

In [None]:
# session.close()

In [None]:
from datetime import datetime
from zoneinfo import ZoneInfo

# Record when the notebook last ran, as timezone-aware Melbourne local time
melbourne_tz = ZoneInfo("Australia/Melbourne")
run_timestamp = datetime.now(melbourne_tz)
formatted_time = run_timestamp.strftime("%A, %B %d, %Y %I:%M:%S %p %Z")

print(f"The last run time in Melbourne is: {formatted_time}")