## Inspect data

In [1]:
!ls

data		 feast_infrastructure.ipynb  features.py
data_sources.py  feature_services.py	     feature_store.yaml


In [2]:
!ls data/

data_df1.parquet  data_df3.parquet  target_df.parquet
data_df2.parquet  data_df4.parquet


## Create feast registry

In [3]:
!cat feature_store.yaml

project: breast_cancer
registry: data/registry.db
provider: local
online_store:
    path: data/online_store.db
entity_key_serialization_version: 2

In [9]:
!cat data_sources.py

from feast import FileSource


f_source1 = FileSource(
    name="df1_file_source",
    path="data/data_df1.parquet",
    timestamp_field="event_timestamp",
    description="A table describing the source of the first set of features",
    owner="test1@gmail.com"
)

f_source2 = FileSource(
    name="df2_file_source",
    path="data/data_df2.parquet",
    timestamp_field="event_timestamp",
    description="A table describing the source of the second set of features",
    owner="test2@gmail.com"
)

f_source3 = FileSource(
    name="df3_file_source",
    path="data/data_df3.parquet",
    timestamp_field="event_timestamp",
    description="A table describing the source of the third set of features",
    owner="test3@gmail.com"
)

f_source4 = FileSource(
    name="df4_file_source",
    path="data/data_df4.parquet",
    timestamp_field="event_timestamp",
    description="A table describing the source of the fourth set of features",
    owner="test4@gmail.com"
)

In [10]:
!cat features.py

# Importing dependencies
from datetime import timedelta
from feast import Field, FeatureView
from feast.types import Float32, Int32

from data_sources import *


df1_fv = FeatureView(
    name="df1_feature_view",
    ttl=timedelta(seconds=86400 * 30),
    schema=[
        Field(name="mean radius", dtype=Float32),
        Field(name="mean texture", dtype=Float32),
        Field(name="mean perimeter", dtype=Float32),
        Field(name="mean area", dtype=Float32),
        Field(name="mean smoothness", dtype=Float32)
        ],    
    source=f_source1
)

df2_fv = FeatureView(
    name="df2_feature_view",
    ttl=timedelta(seconds=86400 * 30),
    schema=[
        Field(name="mean compactness", dtype=Float32),
        Field(name="mean concavity", dtype=Float32),
        Field(name="mean concave points", dtype=Float32),
        Field(name="mean symmetry", dtype=Float32),
        Field(name="mean fractal dimension", dtype=Float32)
        ],    
    source=f_s

In [11]:
!cat feature_services.py

from feast import FeatureService

from features import *

feature_service_v1 = FeatureService(
    name="feature_v1",
    features=[df1_fv]
)

feature_service_v2 = FeatureService(
    name="feature_v2",
    features=[df1_fv, df2_fv]
)

feature_service_v3 = FeatureService(
    name="feature_v3",
    features=[df1_fv, df2_fv, df3_fv]
)

feature_service_v4 = FeatureService(
    name="feature_v4",
    features=[df1_fv, df2_fv, df3_fv, df4_fv]
)


In [12]:
!../../feast_env/bin/feast apply

  collections.MutableMapping.register(ParseResults)
Created feature view [1m[32mdf3_feature_view[0m
Created feature view [1m[32mdf2_feature_view[0m
Created feature view [1m[32mtarget_feature_view[0m
Created feature view [1m[32mdf4_feature_view[0m
Created feature view [1m[32mdf1_feature_view[0m
Created feature service [1m[32mfeature_v4[0m
Created feature service [1m[32mfeature_v1[0m
Created feature service [1m[32mfeature_v3[0m
Created feature service [1m[32mfeature_v2[0m

Created sqlite table [1m[32mbreast_cancer_df1_feature_view[0m
Created sqlite table [1m[32mbreast_cancer_df2_feature_view[0m
Created sqlite table [1m[32mbreast_cancer_df3_feature_view[0m
Created sqlite table [1m[32mbreast_cancer_df4_feature_view[0m
Created sqlite table [1m[32mbreast_cancer_target_feature_view[0m



## Listing entities, featureView, featureService

In [13]:
!ls data/

data_df1.parquet  data_df3.parquet  online_store.db  target_df.parquet
data_df2.parquet  data_df4.parquet  registry.db


In [14]:
!../../feast_env/bin/feast entities list

  collections.MutableMapping.register(ParseResults)
NAME    DESCRIPTION    TYPE


In [15]:
!../../feast_env/bin/feast feature-views list

  collections.MutableMapping.register(ParseResults)
NAME                 ENTITIES    TYPE
df3_feature_view     n/a         FeatureView
df2_feature_view     n/a         FeatureView
target_feature_view  n/a         FeatureView
df4_feature_view     n/a         FeatureView
df1_feature_view     n/a         FeatureView


In [16]:
!../../feast_env/bin/feast feature-services list

  collections.MutableMapping.register(ParseResults)
NAME        FEATURES
feature_v4  df1_feature_view:mean radius, df1_feature_view:mean texture, df1_feature_view:mean perimeter, df1_feature_view:mean area, df1_feature_view:mean smoothness, df2_feature_view:mean compactness, df2_feature_view:mean concavity, df2_feature_view:mean concave points, df2_feature_view:mean symmetry, df2_feature_view:mean fractal dimension, df3_feature_view:radius error, df3_feature_view:texture error, df3_feature_view:perimeter error, df3_feature_view:area error, df3_feature_view:smoothness error, df3_feature_view:compactness error, df3_feature_view:concavity error, df4_feature_view:concave points error, df4_feature_view:symmetry error, df4_feature_view:fractal dimension error, df4_feature_view:worst radius, df4_feature_view:worst texture, df4_feature_view:worst perimeter, df4_feature_view:worst area, df4_feature_view:worst smoothness, df4_feature_view:worst compactness, df4_feature_view:worst concavity, df4_

In [17]:
!../../feast_env/bin/feast data-sources list

  collections.MutableMapping.register(ParseResults)
NAME                CLASS
target_file_source  <class 'feast.infra.offline_stores.file_source.FileSource'>
df2_file_source     <class 'feast.infra.offline_stores.file_source.FileSource'>
df3_file_source     <class 'feast.infra.offline_stores.file_source.FileSource'>
df4_file_source     <class 'feast.infra.offline_stores.file_source.FileSource'>
df1_file_source     <class 'feast.infra.offline_stores.file_source.FileSource'>


## Retrieving features

In [1]:
import pandas as pd
from feast import FeatureStore
from feast.infra.offline_stores.file_source import SavedDatasetFileStorage

In [2]:
store = FeatureStore(repo_path=".")

In [3]:
entity_df = pd.read_parquet(path="./data/target_df.parquet")
entity_df

Unnamed: 0,target,event_timestamp
0,0,2021-05-18 22:56:14.195829
1,0,2021-05-19 22:56:14.195829
2,0,2021-05-20 22:56:14.195829
3,0,2021-05-21 22:56:14.195829
4,0,2021-05-22 22:56:14.195829
...,...,...
564,0,2022-12-03 22:56:14.195829
565,0,2022-12-04 22:56:14.195829
566,0,2022-12-05 22:56:14.195829
567,0,2022-12-06 22:56:14.195829


In [4]:
# Retrieving from the offline store with a feature service v1
feature_v1 = store.get_feature_service("feature_v1")
training_data = store.get_historical_features(features=feature_v1, entity_df=entity_df)
training_data.to_df()

Unnamed: 0,target,event_timestamp,mean radius,mean texture,mean perimeter,mean area,mean smoothness
0,0,2021-05-18 22:56:14.195829+00:00,17.99,10.38,122.80,1001.0,0.11840
1,0,2021-05-19 22:56:14.195829+00:00,20.57,17.77,132.90,1326.0,0.08474
2,0,2021-05-20 22:56:14.195829+00:00,19.69,21.25,130.00,1203.0,0.10960
3,0,2021-05-21 22:56:14.195829+00:00,11.42,20.38,77.58,386.1,0.14250
4,0,2021-05-22 22:56:14.195829+00:00,20.29,14.34,135.10,1297.0,0.10030
...,...,...,...,...,...,...,...
564,0,2022-12-03 22:56:14.195829+00:00,21.56,22.39,142.00,1479.0,0.11100
565,0,2022-12-04 22:56:14.195829+00:00,20.13,28.25,131.20,1261.0,0.09780
566,0,2022-12-05 22:56:14.195829+00:00,16.60,28.08,108.30,858.1,0.08455
567,0,2022-12-06 22:56:14.195829+00:00,20.60,29.33,140.10,1265.0,0.11780


In [5]:
# Retrieving from the offline store with a feature service v4
feature_v4 = store.get_feature_service("feature_v4")
training_data = store.get_historical_features(features=feature_v4, entity_df=entity_df)
training_data.to_df()

Unnamed: 0,target,event_timestamp,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,0,2021-05-18 22:56:14.195829+00:00,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,0,2021-05-19 22:56:14.195829+00:00,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,0,2021-05-20 22:56:14.195829+00:00,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,0,2021-05-21 22:56:14.195829+00:00,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,0,2021-05-22 22:56:14.195829+00:00,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,0,2022-12-03 22:56:14.195829+00:00,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,0,2022-12-04 22:56:14.195829+00:00,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,0,2022-12-05 22:56:14.195829+00:00,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,0,2022-12-06 22:56:14.195829+00:00,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


## Save local dataset

In [23]:
dataset = store.create_saved_dataset(
    from_=training_data,
    name="breast_cancer_dataset",
    storage=SavedDatasetFileStorage("data/breast_cancer_dataset.parquet")
)



In [24]:
!ls data/

breast_cancer_dataset.parquet  data_df3.parquet  registry.db
data_df1.parquet	       data_df4.parquet  target_df.parquet
data_df2.parquet	       online_store.db


In [26]:
# Retrieving the saved dataset
training_df = store.get_saved_dataset(name="breast_cancer_dataset").to_df()
training_df



Unnamed: 0,mean concavity,mean compactness,mean smoothness,mean symmetry,concave points error,worst area,mean area,event_timestamp,smoothness error,mean texture,...,worst fractal dimension,texture error,perimeter error,worst radius,mean perimeter,concavity error,worst symmetry,target,worst perimeter,symmetry error
0,0.30010,0.27760,0.11840,0.2419,0.01587,2019.0,1001.0,2021-05-18 22:56:14.195829+00:00,0.006399,10.38,...,0.11890,0.9053,8.589,25.380,122.80,0.05373,0.4601,0,184.60,0.03003
1,0.08690,0.07864,0.08474,0.1812,0.01340,1956.0,1326.0,2021-05-19 22:56:14.195829+00:00,0.005225,17.77,...,0.08902,0.7339,3.398,24.990,132.90,0.01860,0.2750,0,158.80,0.01389
2,0.19740,0.15990,0.10960,0.2069,0.02058,1709.0,1203.0,2021-05-20 22:56:14.195829+00:00,0.006150,21.25,...,0.08758,0.7869,4.585,23.570,130.00,0.03832,0.3613,0,152.50,0.02250
3,0.24140,0.28390,0.14250,0.2597,0.01867,567.7,386.1,2021-05-21 22:56:14.195829+00:00,0.009110,20.38,...,0.17300,1.1560,3.445,14.910,77.58,0.05661,0.6638,0,98.87,0.05963
4,0.19800,0.13280,0.10030,0.1809,0.01885,1575.0,1297.0,2021-05-22 22:56:14.195829+00:00,0.011490,14.34,...,0.07678,0.7813,5.438,22.540,135.10,0.05688,0.2364,0,152.20,0.01756
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,0.24390,0.11590,0.11100,0.1726,0.02454,2027.0,1479.0,2022-12-03 22:56:14.195829+00:00,0.010300,22.39,...,0.07115,1.2560,7.673,25.450,142.00,0.05198,0.2060,0,166.10,0.01114
565,0.14400,0.10340,0.09780,0.1752,0.01678,1731.0,1261.0,2022-12-04 22:56:14.195829+00:00,0.005769,28.25,...,0.06637,2.4630,5.203,23.690,131.20,0.03950,0.2572,0,155.00,0.01898
566,0.09251,0.10230,0.08455,0.1590,0.01557,1124.0,858.1,2022-12-05 22:56:14.195829+00:00,0.005903,28.08,...,0.07820,1.0750,3.425,18.980,108.30,0.04730,0.2218,0,126.70,0.01318
567,0.35140,0.27700,0.11780,0.2397,0.01664,1821.0,1265.0,2022-12-06 22:56:14.195829+00:00,0.006522,29.33,...,0.12400,1.5950,5.772,25.740,140.10,0.07117,0.4087,0,184.60,0.02324


## Load dataset and training

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

labels = training_df['target']
features = training_df.drop(labels=['target', 'event_timestamp'],
                            axis=1)

# sorted(df) to keep the order of feature fields
feature_fields = sorted(features)
features = features[feature_fields]

# split dataset
X_train, X_test, y_train, y_test = train_test_split(features, labels, shuffle=False)

In [28]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((426, 30), (426,), (143, 30), (143,))

In [29]:
model = LogisticRegression(max_iter=10000, C=1e6)
model.fit(X=X_train, y=y_train)

In [30]:
from sklearn.metrics import classification_report
print(classification_report(y_pred=model.predict(X_train), y_true=y_train))
print(classification_report(y_pred=model.predict(X_test), y_true=y_test))

              precision    recall  f1-score   support

           0       0.98      0.97      0.97       177
           1       0.98      0.98      0.98       249

    accuracy                           0.98       426
   macro avg       0.98      0.98      0.98       426
weighted avg       0.98      0.98      0.98       426

              precision    recall  f1-score   support

           0       0.92      0.97      0.94        35
           1       0.99      0.97      0.98       108

    accuracy                           0.97       143
   macro avg       0.95      0.97      0.96       143
weighted avg       0.97      0.97      0.97       143



In [31]:
import joblib
joblib.dump(value=model, filename='../model.joblib')

['../model.joblib']

## Make online features

1. materialize: loads the latest features between two dates

`feast materialize 2021-01-01T00:00:00 2022-01-01T00:00:00`

2. materialize-incremental: loads features up to the provided end date:

`feast materialize-incremental 2022-01-01T00:00:00`

With `feast materialize-incremental`, the start time either `now - ttl` (the `ttl` that we defined in our feature views) or the time of the most recent materialization. If you've materialized features at least once, then subsequent materializations will only fetch features that weren't present in the store at the time of the previous materializations.

If you have several feature rows per entity, Feast will only load the latest values per entity key. As an example, if you have two entries on seperate days for the patient ID 100, only the latest entry will get materialized.

In [32]:
from datetime import datetime, timedelta

# Code for loading features to online store between two dates
"""store.materialize(
    end_date=datetime.now(),
    start_date=datetime.now() - timedelta(days=700))"""

# Loading the latest features after a previous materialize call or from the beginning of time
store.materialize_incremental(end_date=datetime.now())

Materializing [1m[32m5[0m feature views to [1m[32m2022-12-07 23:00:36+07:00[0m into the [1m[32msqlite[0m online store.

[1m[32mdf3_feature_view[0m from [1m[32m2022-11-07 16:00:36+07:00[0m to [1m[32m2022-12-07 23:00:36+07:00[0m:


100%|█████████████████████████████████████████████████████████████| 31/31 [00:00<00:00, 2246.66it/s]


[1m[32mdf2_feature_view[0m from [1m[32m2022-11-07 16:00:36+07:00[0m to [1m[32m2022-12-08 06:00:36+07:00[0m:


100%|█████████████████████████████████████████████████████████████| 31/31 [00:00<00:00, 3631.53it/s]

[1m[32mtarget_feature_view[0m from [1m[32m2022-11-07 16:00:36+07:00[0m to [1m[32m2022-12-08 06:00:36+07:00[0m:



100%|█████████████████████████████████████████████████████████████| 31/31 [00:00<00:00, 3433.14it/s]


[1m[32mdf4_feature_view[0m from [1m[32m2022-11-07 16:00:36+07:00[0m to [1m[32m2022-12-08 06:00:36+07:00[0m:


100%|█████████████████████████████████████████████████████████████| 31/31 [00:00<00:00, 1850.76it/s]


[1m[32mdf1_feature_view[0m from [1m[32m2022-11-07 16:00:36+07:00[0m to [1m[32m2022-12-08 06:00:36+07:00[0m:


100%|█████████████████████████████████████████████████████████████| 31/31 [00:00<00:00, 2965.25it/s]


Because the `ttl = 86400 * 30` (30 days), Feast will only load the features from `2022-11-07 16:00:36+07:00` to `2022-12-08 06:00:36+07:00` (now) and we only have `31` data points in online store.

## Inference online

Since we haven't specified the entity, feast will create the dummy entity with a `__dummy` entity key.

Let's get online features by `__dummy` entity key.

In [16]:
features = store.get_online_features(features=feature_v4,
                                     entity_rows=[{"__dummy": i} for i in range(1000)]).to_dict()

features_df = pd.DataFrame.from_dict(data=features)
features_df.dropna(inplace=True)
features_df.drop_duplicates(inplace=True)
features_df



Unnamed: 0,mean area,mean perimeter,mean texture,mean smoothness,mean radius,mean compactness,mean fractal dimension,mean concave points,mean concavity,mean symmetry,...,fractal dimension error,worst radius,concave points error,worst concave points,worst texture,worst perimeter,symmetry error,worst compactness,worst concavity,worst fractal dimension
0,181.0,47.919998,24.540001,0.05263,7.76,0.04362,0.05884,0.0,0.0,0.1587,...,0.002783,9.456,0.0,0.0,30.370001,59.16,0.02676,0.06444,0.0,0.07039


Dive into the database:

In [17]:
from IPython.display import display
import sqlalchemy

conn = sqlalchemy.create_engine("sqlite:///data/online_store.db")
result = pd.read_sql("SELECT * FROM sqlite_master", conn)
display(result)

result = pd.read_sql("SELECT * FROM breast_cancer_target_feature_view", conn)
display(result)

Unnamed: 0,type,name,tbl_name,rootpage,sql
0,table,breast_cancer_df1_feature_view,breast_cancer_df1_feature_view,2,CREATE TABLE breast_cancer_df1_feature_view (e...
1,index,sqlite_autoindex_breast_cancer_df1_feature_view_1,breast_cancer_df1_feature_view,3,
2,index,breast_cancer_df1_feature_view_ek,breast_cancer_df1_feature_view,4,CREATE INDEX breast_cancer_df1_feature_view_ek...
3,table,breast_cancer_df2_feature_view,breast_cancer_df2_feature_view,5,CREATE TABLE breast_cancer_df2_feature_view (e...
4,index,sqlite_autoindex_breast_cancer_df2_feature_view_1,breast_cancer_df2_feature_view,6,
5,index,breast_cancer_df2_feature_view_ek,breast_cancer_df2_feature_view,7,CREATE INDEX breast_cancer_df2_feature_view_ek...
6,table,breast_cancer_df3_feature_view,breast_cancer_df3_feature_view,8,CREATE TABLE breast_cancer_df3_feature_view (e...
7,index,sqlite_autoindex_breast_cancer_df3_feature_view_1,breast_cancer_df3_feature_view,9,
8,index,breast_cancer_df3_feature_view_ek,breast_cancer_df3_feature_view,10,CREATE INDEX breast_cancer_df3_feature_view_ek...
9,table,breast_cancer_df4_feature_view,breast_cancer_df4_feature_view,11,CREATE TABLE breast_cancer_df4_feature_view (e...


Unnamed: 0,entity_key,feature_name,value,event_ts,created_ts
0,b'\x02\x00\x00\x00__dummy_id\x02\x00\x00\x00\x...,target,b'\x18\x01',2022-12-07 22:56:14.195829,


Not as expected, we have a dataframe with only `1` rows (after removing duplicates). Because when we materialize features without entity, all data points will have same dummy key, so Feast only fetch the latest feature values.

**Note**: To retrieve online features correctly, we must define featureView with the specific entity.