## Deploy

```
a6ef4eef9950   feast-push-server      "feast serve -h 0.0.…"   3 minutes ago   Up About a minute          0.0.0.0:6567->6566/tcp, :::6567->6566/tcp   feast_push_server
d1ab668e9ff1   feast-feature-server   "feast serve -h 0.0.…"   3 minutes ago   Up 3 minutes (unhealthy)   0.0.0.0:6566->6566/tcp, :::6566->6566/tcp   feast_feature_server
f5cb32fd7e41   postgres:14.0          "docker-entrypoint.s…"   3 minutes ago   Up 3 minutes (healthy)     0.0.0.0:5432->5432/tcp, :::5432->5432/tcp   feast_registry
9f79f7f6c66d   redis:7.0.4            "docker-entrypoint.s…"   3 minutes ago   Up 3 minutes               0.0.0.0:6379->6379/tcp, :::6379->6379/tcp   feast_redis
```

## Inspect data

In [4]:
!ls

data		 entities.py		     features.py	 util.py
data_sources.py  feast_infrastructure.ipynb  feature_store.yaml
Dockerfile	 feature_services.py	     test_workflow.py


In [5]:
!ls data/

mnist.npz  mnist.parquet


## Create feast registry

In [23]:
!../../feast_env/bin/feast apply

  collections.MutableMapping.register(ParseResults)
  schema = ParquetDataset(path).schema.to_arrow_schema()
  mnist_fresh_feature_view__feature         feature  norm_value
0                               NaN  b'hello world'           1
Traceback (most recent call last):
  File "../../feast_env/bin/feast", line 8, in <module>
    sys.exit(cli())
  File "/home/hoang/Documents/mlops-labs/feature-store/feast_env/lib/python3.8/site-packages/click/core.py", line 1130, in __call__
    return self.main(*args, **kwargs)
  File "/home/hoang/Documents/mlops-labs/feature-store/feast_env/lib/python3.8/site-packages/click/core.py", line 1055, in main
    rv = self.invoke(ctx)
  File "/home/hoang/Documents/mlops-labs/feature-store/feast_env/lib/python3.8/site-packages/click/core.py", line 1657, in invoke
    return _process_result(sub_ctx.command.invoke(sub_ctx))
  File "/home/hoang/Documents/mlops-labs/feature-store/feast_env/lib/python3.8/site-packages/click/core.py", line 1404, in invoke
    retu

## Listing entities, featureView, featureService

In [12]:
!ls data/

mnist.npz  mnist.parquet


In [41]:
!../../feast_env/bin/feast entities list

NAME    DESCRIPTION          TYPE
id      The ID of the event  ValueType.INT32


In [42]:
!../../feast_env/bin/feast feature-views list

NAME                ENTITIES    TYPE
mnist_feature_view  {'id'}      FeatureView


In [43]:
!../../feast_env/bin/feast feature-services list

NAME              FEATURES
mnist_feature_v1  mnist_feature_view:array, mnist_feature_view:class


In [44]:
!../../feast_env/bin/feast data-sources list

NAME               CLASS
mnist_push_source  <class 'feast.data_source.PushSource'>
mnist_file_source  <class 'feast.infra.offline_stores.file_source.FileSource'>


## Retrieving features

In [45]:
import pandas as pd
from feast import FeatureStore
from feast.infra.offline_stores.file_source import SavedDatasetFileStorage

In [46]:
store = FeatureStore(repo_path=".")

In [62]:
from datetime import datetime
import numpy as np

entity_df = pd.read_parquet(path="./data/target_df.parquet")
# entity_df['event_timestamp'] = datetime.now()
entity_df[entity_df['event_timestamp'] > np.datetime64('2021-12-07 08:21:09.345777+00:00')]

  entity_df[entity_df['event_timestamp'] > np.datetime64('2021-12-07 08:21:09.345777+00:00')]


Unnamed: 0,event_timestamp,id
51253,2021-12-07 09:16:01.289600,51253
51254,2021-12-07 10:16:01.289600,51254
51255,2021-12-07 11:16:01.289600,51255
51256,2021-12-07 12:16:01.289600,51256
51257,2021-12-07 13:16:01.289600,51257
...,...,...
59995,2022-12-06 15:16:01.289600,59995
59996,2022-12-06 16:16:01.289600,59996
59997,2022-12-06 17:16:01.289600,59997
59998,2022-12-06 18:16:01.289600,59998


In [53]:
# Retrieving from the offline store with a feature service v1
feature_v1 = store.get_feature_service("mnist_feature_v1")
training_data = store.get_historical_features(features=feature_v1, entity_df=entity_df)
training_data.to_df()

Unnamed: 0,event_timestamp,id,array,class
0,2022-12-07 08:21:09.345777+00:00,51253,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",9
1,2022-12-07 08:21:09.345777+00:00,51254,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",5
2,2022-12-07 08:21:09.345777+00:00,51255,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",4
3,2022-12-07 08:21:09.345777+00:00,51256,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",2
4,2022-12-07 08:21:09.345777+00:00,51257,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",3
...,...,...,...,...
8742,2022-12-07 08:21:09.345777+00:00,59995,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",8
8743,2022-12-07 08:21:09.345777+00:00,59996,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",3
8744,2022-12-07 08:21:09.345777+00:00,59997,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",5
8745,2022-12-07 08:21:09.345777+00:00,59998,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",6


## Save local dataset

In [336]:
dataset = store.create_saved_dataset(
    from_=training_data,
    name="breast_cancer_dataset",
    storage=SavedDatasetFileStorage("data/breast_cancer_dataset.parquet")
)



In [337]:
!ls data/

breast_cancer_dataset.parquet  data_df3.parquet  registry.db
data_df1.parquet	       data_df4.parquet  target_df.parquet
data_df2.parquet	       online_store.db


In [338]:
# Retrieving the saved dataset
training_df = store.get_saved_dataset(name="breast_cancer_dataset").to_df()
training_df



Unnamed: 0,fractal dimension error,event_timestamp,worst fractal dimension,worst compactness,worst perimeter,worst texture,smoothness error,worst symmetry,symmetry error,worst concave points,worst concavity,mean perimeter,concavity error,concave points error,mean symmetry,mean texture,mean area,patient_id,mean smoothness,radius error,target,perimeter error,mean concavity,mean fractal dimension,area error,worst area,mean concave points,mean radius,texture error,compactness error,worst radius,worst smoothness,mean compactness
0,0.006193,2021-05-17 13:58:49.110423+00:00,0.11890,0.66560,184.60,17.33,0.006399,0.4601,0.03003,0.2654,0.7119,122.80,0.05373,0.01587,0.2419,10.38,1001.0,0,0.11840,1.0950,0,8.589,0.30010,0.07871,153.40,2019.0,0.14710,17.99,0.9053,0.04904,25.380,0.16220,0.27760
1,0.003532,2021-05-18 13:58:49.110423+00:00,0.08902,0.18660,158.80,23.41,0.005225,0.2750,0.01389,0.1860,0.2416,132.90,0.01860,0.01340,0.1812,17.77,1326.0,1,0.08474,0.5435,0,3.398,0.08690,0.05667,74.08,1956.0,0.07017,20.57,0.7339,0.01308,24.990,0.12380,0.07864
2,0.004571,2021-05-19 13:58:49.110423+00:00,0.08758,0.42450,152.50,25.53,0.006150,0.3613,0.02250,0.2430,0.4504,130.00,0.03832,0.02058,0.2069,21.25,1203.0,2,0.10960,0.7456,0,4.585,0.19740,0.05999,94.03,1709.0,0.12790,19.69,0.7869,0.04006,23.570,0.14440,0.15990
3,0.009208,2021-05-20 13:58:49.110423+00:00,0.17300,0.86630,98.87,26.50,0.009110,0.6638,0.05963,0.2575,0.6869,77.58,0.05661,0.01867,0.2597,20.38,386.1,3,0.14250,0.4956,0,3.445,0.24140,0.09744,27.23,567.7,0.10520,11.42,1.1560,0.07458,14.910,0.20980,0.28390
4,0.005115,2021-05-21 13:58:49.110423+00:00,0.07678,0.20500,152.20,16.67,0.011490,0.2364,0.01756,0.1625,0.4000,135.10,0.05688,0.01885,0.1809,14.34,1297.0,4,0.10030,0.7572,0,5.438,0.19800,0.05883,94.44,1575.0,0.10430,20.29,0.7813,0.02461,22.540,0.13740,0.13280
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,0.004239,2022-12-02 13:58:49.110423+00:00,0.07115,0.21130,166.10,26.40,0.010300,0.2060,0.01114,0.2216,0.4107,142.00,0.05198,0.02454,0.1726,22.39,1479.0,564,0.11100,1.1760,0,7.673,0.24390,0.05623,158.70,2027.0,0.13890,21.56,1.2560,0.02891,25.450,0.14100,0.11590
565,0.002498,2022-12-03 13:58:49.110423+00:00,0.06637,0.19220,155.00,38.25,0.005769,0.2572,0.01898,0.1628,0.3215,131.20,0.03950,0.01678,0.1752,28.25,1261.0,565,0.09780,0.7655,0,5.203,0.14400,0.05533,99.04,1731.0,0.09791,20.13,2.4630,0.02423,23.690,0.11660,0.10340
566,0.003892,2022-12-04 13:58:49.110423+00:00,0.07820,0.30940,126.70,34.12,0.005903,0.2218,0.01318,0.1418,0.3403,108.30,0.04730,0.01557,0.1590,28.08,858.1,566,0.08455,0.4564,0,3.425,0.09251,0.05648,48.55,1124.0,0.05302,16.60,1.0750,0.03731,18.980,0.11390,0.10230
567,0.006185,2022-12-05 13:58:49.110423+00:00,0.12400,0.86810,184.60,39.42,0.006522,0.4087,0.02324,0.2650,0.9387,140.10,0.07117,0.01664,0.2397,29.33,1265.0,567,0.11780,0.7260,0,5.772,0.35140,0.07016,86.22,1821.0,0.15200,20.60,1.5950,0.06158,25.740,0.16500,0.27700


## Load dataset and training

In [361]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

labels = training_df['target']
features = training_df.drop(labels=['target', 'event_timestamp', 'patient_id'],
                            axis=1)

# sorted(df) to keep the order of feature fields
feature_fields = sorted(features)
features = features[feature_fields]

# split dataset
X_train, X_test, y_train, y_test = train_test_split(features, labels, shuffle=False)

In [362]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((426, 30), (426,), (143, 30), (143,))

In [363]:
model = LogisticRegression(max_iter=1000, C=1e6)
model.fit(X=X_train, y=y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [364]:
from sklearn.metrics import classification_report
print(classification_report(y_pred=model.predict(X_train), y_true=y_train))
print(classification_report(y_pred=model.predict(X_test), y_true=y_test))

              precision    recall  f1-score   support

           0       0.98      0.95      0.96       177
           1       0.96      0.98      0.97       249

    accuracy                           0.97       426
   macro avg       0.97      0.97      0.97       426
weighted avg       0.97      0.97      0.97       426

              precision    recall  f1-score   support

           0       0.87      0.97      0.92        35
           1       0.99      0.95      0.97       108

    accuracy                           0.96       143
   macro avg       0.93      0.96      0.95       143
weighted avg       0.96      0.96      0.96       143



In [365]:
import joblib
joblib.dump(value=model, filename='../model.joblib')

['../model.joblib']

## Make online features

1. materialize: loads the latest features between two dates

`feast materialize 2021-01-01T00:00:00 2022-01-01T00:00:00`

2. materialize-incremental: loads features up to the provided end date:

`feast materialize-incremental 2022-01-01T00:00:00`

With `feast materialize-incremental`, the start time either `now - ttl` (the `ttl` that we defined in our feature views) or the time of the most recent materialization. If you've materialized features at least once, then subsequent materializations will only fetch features that weren't present in the store at the time of the previous materializations.

If you have several feature rows per entity, Feast will only load the latest values per entity key. As an example, if you have two entries on seperate days for the patient ID 100, only the latest entry will get materialized.

In [344]:
from datetime import datetime, timedelta

# Code for loading features to online store between two dates
"""store.materialize(
    end_date=datetime.now(),
    start_date=datetime.now() - timedelta(days=700))"""

# Loading the latest features after a previous materialize call or from the beginning of time
store.materialize_incremental(end_date=datetime.now())

Materializing [1m[32m5[0m feature views to [1m[32m2022-12-06 14:02:01+07:00[0m into the [1m[32msqlite[0m online store.

[1m[32mdf3_feature_view[0m from [1m[32m2022-11-29 07:02:01+07:00[0m to [1m[32m2022-12-06 14:02:01+07:00[0m:


100%|████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 529.32it/s]


[1m[32mdf1_feature_view[0m from [1m[32m2022-11-29 07:02:01+07:00[0m to [1m[32m2022-12-06 21:02:01+07:00[0m:


100%|████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 793.89it/s]


[1m[32mdf4_feature_view[0m from [1m[32m2022-11-29 07:02:01+07:00[0m to [1m[32m2022-12-06 21:02:01+07:00[0m:


100%|████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 516.20it/s]


[1m[32mtarget_feature_view[0m from [1m[32m2022-11-29 07:02:01+07:00[0m to [1m[32m2022-12-06 21:02:01+07:00[0m:


100%|████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 829.22it/s]


[1m[32mdf2_feature_view[0m from [1m[32m2022-11-29 07:02:01+07:00[0m to [1m[32m2022-12-06 21:02:01+07:00[0m:


100%|████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 797.17it/s]


Because the `ttl = 86400 * 7` (7 days), Feast will only load the features from `2022-11-29 07:02:01+07:00` to `2022-12-06 14:02:01+07:00` (now)

# Inference

In [366]:
features = store.get_online_features(features=feature_v4,
                                     entity_rows=[{"patient_id": 568}, {"patient_id": 567}]).to_dict()

features_df = pd.DataFrame.from_dict(data=features)
features_df

Unnamed: 0,patient_id,mean area,mean smoothness,mean radius,mean perimeter,mean texture,mean fractal dimension,mean concave points,mean compactness,mean symmetry,mean concavity,area error,texture error,radius error,concavity error,compactness error,perimeter error,smoothness error,symmetry error,fractal dimension error,worst area,worst fractal dimension,worst compactness,worst perimeter,worst concave points,worst concavity,worst texture,worst radius,worst smoothness,concave points error,worst symmetry
0,568,181.0,0.05263,7.76,47.919998,24.540001,0.05884,0.0,0.04362,0.1587,0.0,19.15,1.428,0.3857,0.0,0.00466,2.548,0.007189,0.02676,0.002783,268.600006,0.07039,0.06444,59.16,0.0,0.0,30.370001,9.456,0.08996,0.0,0.2871
1,567,1265.0,0.1178,20.6,140.100006,29.33,0.07016,0.152,0.277,0.2397,0.3514,86.220001,1.595,0.726,0.07117,0.06158,5.772,0.006522,0.02324,0.006185,1821.0,0.124,0.8681,184.600006,0.265,0.9387,39.419998,25.74,0.165,0.01664,0.4087


In [367]:
features_df = features_df.drop("patient_id", axis=1)
features_df = features_df[feature_fields]
features_df

Unnamed: 0,area error,compactness error,concave points error,concavity error,fractal dimension error,mean area,mean compactness,mean concave points,mean concavity,mean fractal dimension,mean perimeter,mean radius,mean smoothness,mean symmetry,mean texture,perimeter error,radius error,smoothness error,symmetry error,texture error,worst area,worst compactness,worst concave points,worst concavity,worst fractal dimension,worst perimeter,worst radius,worst smoothness,worst symmetry,worst texture
0,19.15,0.00466,0.0,0.0,0.002783,181.0,0.04362,0.0,0.0,0.05884,47.919998,7.76,0.05263,0.1587,24.540001,2.548,0.3857,0.007189,0.02676,1.428,268.600006,0.06444,0.0,0.0,0.07039,59.16,9.456,0.08996,0.2871,30.370001
1,86.220001,0.06158,0.01664,0.07117,0.006185,1265.0,0.277,0.152,0.3514,0.07016,140.100006,20.6,0.1178,0.2397,29.33,5.772,0.726,0.006522,0.02324,1.595,1821.0,0.8681,0.265,0.9387,0.124,184.600006,25.74,0.165,0.4087,39.419998


In [368]:
model = joblib.load('../model.joblib')
predictions = model.predict(features_df)
predictions

array([1, 0])