### 1. Prepare data set and store in parquet format

In [1]:
from pathlib import Path

import pandas as pd

In [2]:
# Load the raw diabetes dataset; the relative path resolves against the current working directory.
data = pd.read_csv('diabetes.csv')

In [3]:
# Preview the first rows of the raw data to check the columns and the Outcome label.
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
# Split the raw frame: 'Outcome' is the label, every other column is a predictor.
target_df = data['Outcome']
predictors_df = data.drop(columns='Outcome')

In [5]:
# Confirm that the Outcome column is no longer among the predictors.
predictors_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


**Create one timestamp per row, to be added as the `event_timestamp` column of the data set.**

In [6]:
# One daily timestamp per row of `data`, ending at the moment this cell runs.
# NOTE(review): pd.Timestamp.now() is timezone-naive and changes on every run,
# so the generated timestamps are not reproducible between executions.
daily_range = pd.date_range(end=pd.Timestamp.now(), periods=len(data), freq='D')

# Wrap the range in a single-column frame with a fresh 0..n-1 index.
timestamps = daily_range.to_frame(name='event_timestamp', index=False)

In [7]:
# Inspect the generated event timestamps (one per day).
timestamps

Unnamed: 0,event_timestamp
0,2021-04-27 16:47:58.116752
1,2021-04-28 16:47:58.116752
2,2021-04-29 16:47:58.116752
3,2021-04-30 16:47:58.116752
4,2021-05-01 16:47:58.116752
...,...
763,2023-05-30 16:47:58.116752
764,2023-05-31 16:47:58.116752
765,2023-06-01 16:47:58.116752
766,2023-06-02 16:47:58.116752


**Add the `event_timestamp` column to the predictors and target dataframes.**

In [8]:
# Attach the event timestamps to both frames, aligned on the shared row index.
# `assign` overwrites an existing `event_timestamp` column instead of appending
# a duplicate, so re-running this cell is safe (pd.concat would add a second
# column with the same name on every re-run).
event_timestamp = timestamps['event_timestamp']
predictors_df = predictors_df.assign(event_timestamp=event_timestamp)

# `target_df` starts out as a Series; pd.DataFrame(...) turns it into a
# one-column frame and is a no-op wrapper if it is already a DataFrame.
target_df = pd.DataFrame(target_df).assign(event_timestamp=event_timestamp)

In [9]:
# Inspect the predictors with the new event_timestamp column.
predictors_df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,event_timestamp
0,6,148,72,35,0,33.6,0.627,50,2021-04-27 16:47:58.116752
1,1,85,66,29,0,26.6,0.351,31,2021-04-28 16:47:58.116752
2,8,183,64,0,0,23.3,0.672,32,2021-04-29 16:47:58.116752
3,1,89,66,23,94,28.1,0.167,21,2021-04-30 16:47:58.116752
4,0,137,40,35,168,43.1,2.288,33,2021-05-01 16:47:58.116752
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,2023-05-30 16:47:58.116752
764,2,122,70,27,0,36.8,0.340,27,2023-05-31 16:47:58.116752
765,5,121,72,23,112,26.2,0.245,30,2023-06-01 16:47:58.116752
766,1,126,60,0,0,30.1,0.349,47,2023-06-02 16:47:58.116752


In [10]:
# The target is now a two-column frame: Outcome plus event_timestamp.
target_df.head()

Unnamed: 0,Outcome,event_timestamp
0,1,2021-04-27 16:47:58.116752
1,0,2021-04-28 16:47:58.116752
2,1,2021-04-29 16:47:58.116752
3,0,2021-04-30 16:47:58.116752
4,1,2021-05-01 16:47:58.116752


**Create a `patient_id` column so that each record is uniquely identified by `patient_id` together with `event_timestamp`.**

In [11]:
# Give every row a sequential id (0..n-1); the same ids go on both frames so
# that predictors and targets can be matched on patient_id.
patient_ids = list(range(len(data)))
for frame in (predictors_df, target_df):
    frame['patient_id'] = patient_ids

In [12]:
# Inspect the predictors, now carrying patient_id as the last column.
predictors_df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,event_timestamp,patient_id
0,6,148,72,35,0,33.6,0.627,50,2021-04-27 16:47:58.116752,0
1,1,85,66,29,0,26.6,0.351,31,2021-04-28 16:47:58.116752,1
2,8,183,64,0,0,23.3,0.672,32,2021-04-29 16:47:58.116752,2
3,1,89,66,23,94,28.1,0.167,21,2021-04-30 16:47:58.116752,3
4,0,137,40,35,168,43.1,2.288,33,2021-05-01 16:47:58.116752,4
...,...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,2023-05-30 16:47:58.116752,763
764,2,122,70,27,0,36.8,0.340,27,2023-05-31 16:47:58.116752,764
765,5,121,72,23,112,26.2,0.245,30,2023-06-01 16:47:58.116752,765
766,1,126,60,0,0,30.1,0.349,47,2023-06-02 16:47:58.116752,766


In [13]:
# Inspect the target frame: Outcome, event_timestamp and patient_id.
target_df

Unnamed: 0,Outcome,event_timestamp,patient_id
0,1,2021-04-27 16:47:58.116752,0
1,0,2021-04-28 16:47:58.116752,1
2,1,2021-04-29 16:47:58.116752,2
3,0,2021-04-30 16:47:58.116752,3
4,1,2021-05-01 16:47:58.116752,4
...,...,...,...
763,0,2023-05-30 16:47:58.116752,763
764,0,2023-05-31 16:47:58.116752,764
765,0,2023-06-01 16:47:58.116752,765
766,1,2023-06-02 16:47:58.116752,766


In [14]:
# Check dtypes and non-null counts before writing the frame to parquet.
predictors_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Pregnancies               768 non-null    int64         
 1   Glucose                   768 non-null    int64         
 2   BloodPressure             768 non-null    int64         
 3   SkinThickness             768 non-null    int64         
 4   Insulin                   768 non-null    int64         
 5   BMI                       768 non-null    float64       
 6   DiabetesPedigreeFunction  768 non-null    float64       
 7   Age                       768 non-null    int64         
 8   event_timestamp           768 non-null    datetime64[ns]
 9   patient_id                768 non-null    int64         
dtypes: datetime64[ns](1), float64(2), int64(7)
memory usage: 60.1 KB


In [15]:
# Write both frames into the data directory of the Feast repository.
# The directory is created here if missing: in notebook order this cell runs
# before `feast init feature_repo`, so on a fresh run the path would not exist
# yet and to_parquet would fail.
data_dir = Path('feature_repo/feature_repo/data')
data_dir.mkdir(parents=True, exist_ok=True)

predictors_df.to_parquet(path=data_dir / 'predictors_df.parquet')
target_df.to_parquet(path=data_dir / 'target_df.parquet')

In [16]:
# Prerequisite (run once if Feast is not installed in this kernel's environment):
#   %pip install feast==0.31.1

In [17]:
# Confirm that the Feast CLI is available and show the installed SDK version.
!feast version

Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Feast SDK Version: "feast 0.31.1"


### 2. feast init

This step is optional: `feast init` creates the Feast repo directory structure. Alternatively, you can create a directory using `mkdir` and, inside it, create a feature_store.yaml file and a feature_definitions.py file, but it is better to use `feast init` and then modify the respective files.

In [17]:
# Scaffold the Feast repository directory structure under ./feature_repo.
!feast init feature_repo

Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
06/03/2023 02:58:52 PM root INFO: creating /media/masoud

### 3. Update feature store yaml file if needed

You can update the online store and local store paths in the feature_store.yaml file if needed.

### 4. Define Feature definitions in a python file inside feature repo directory (created using feast init)
This step is known as registering and deploying the features.
Go inside the feature_repo folder in the GitHub repository; you will see the feature_definition.py file with the updated code. Modify it according to your dataset's features.

### 5. Do feast apply

Run `feast apply` from inside the feature_repo directory.

In [18]:
cd feature_repo/feature_repo

/media/masoud/F60C689F0C685C9D/GIT_REPOS/Feature Stores/feast-tutorial/notebook/feature_repo/feature_repo


In [19]:
# Verify that the working directory is now the inner feature_repo folder.
!pwd

/media/masoud/F60C689F0C685C9D/GIT_REPOS/Feature Stores/feast-tutorial/notebook/feature_repo/feature_repo


In [20]:
# Register the entity and feature definitions of this repo with the feature store.
!feast apply

Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Created entity [1m[32mpatient_id[0m
Created feature v

### 6. Generate Training Data Set

In [21]:
# Load the target frame written earlier; the relative path resolves against the
# feature repo directory entered by the `cd` cell above.
entity_df = pd.read_parquet(path='data/target_df.parquet')

In [22]:
# Inspect the entity frame: Outcome, event_timestamp and patient_id per row.
entity_df

Unnamed: 0,Outcome,event_timestamp,patient_id
0,1,2021-04-27 16:47:58.116752,0
1,0,2021-04-28 16:47:58.116752,1
2,1,2021-04-29 16:47:58.116752,2
3,0,2021-04-30 16:47:58.116752,3
4,1,2021-05-01 16:47:58.116752,4
...,...,...,...
763,0,2023-05-30 16:47:58.116752,763
764,0,2023-05-31 16:47:58.116752,764
765,0,2023-06-01 16:47:58.116752,765
766,1,2023-06-02 16:47:58.116752,766


In [23]:
from feast import FeatureStore
from feast.infra.offline_stores.file_source import SavedDatasetFileStorage

# Open the feature repository located in the current working directory.
store = FeatureStore(repo_path='.')

# "<feature view>:<feature>" references for every predictor column.
historical_feature_refs = [
    "predictors_df_feature_view:Pregnancies",
    "predictors_df_feature_view:Glucose",
    "predictors_df_feature_view:BloodPressure",
    "predictors_df_feature_view:SkinThickness",
    "predictors_df_feature_view:Insulin",
    "predictors_df_feature_view:BMI",
    "predictors_df_feature_view:DiabetesPedigreeFunction",
    "predictors_df_feature_view:Age",
]

# Retrieve the requested features for each row of entity_df
# (patient_id + event_timestamp, with Outcome carried along).
training_data = store.get_historical_features(
    entity_df=entity_df,
    features=historical_feature_refs,
)

# Persist the retrieval result as a named dataset so that the training step
# can load it back by name.
dataset_storage = SavedDatasetFileStorage('data/diabetes_dataset.parquet')
dataset = store.create_saved_dataset(
    from_=training_data,
    name="diabetes_dataset",
    storage=dataset_storage,
)



In [24]:
# Convert the retrieval job to a DataFrame: entity columns plus the requested features.
training_data.to_df()

Unnamed: 0,Outcome,event_timestamp,patient_id,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,1,2021-04-27 16:47:58.116752+00:00,0,6,148,72,35,0,33.6,0.627,50
1,0,2021-04-28 16:47:58.116752+00:00,1,1,85,66,29,0,26.6,0.351,31
2,1,2021-04-29 16:47:58.116752+00:00,2,8,183,64,0,0,23.3,0.672,32
3,0,2021-04-30 16:47:58.116752+00:00,3,1,89,66,23,94,28.1,0.167,21
4,1,2021-05-01 16:47:58.116752+00:00,4,0,137,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...,...,...,...
763,0,2023-05-30 16:47:58.116752+00:00,763,10,101,76,48,180,32.9,0.171,63
764,0,2023-05-31 16:47:58.116752+00:00,764,2,122,70,27,0,36.8,0.340,27
765,0,2023-06-01 16:47:58.116752+00:00,765,5,121,72,23,112,26.2,0.245,30
766,1,2023-06-02 16:47:58.116752+00:00,766,1,126,60,0,0,30.1,0.349,47


### 7. Model Training

In [25]:
# Importing dependencies
from feast import FeatureStore
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from joblib import dump

# Getting our FeatureStore (repo_path="." relies on the working directory
# being the feature repo)
store = FeatureStore(repo_path=".")

# Retrieving the saved dataset and converting it to a DataFrame
training_df = store.get_saved_dataset(name="diabetes_dataset").to_df()

# Separating the features and labels; the entity key and the timestamp are
# bookkeeping columns, not predictors
y = training_df['Outcome']
X = training_df.drop(
    labels=['Outcome', 'event_timestamp', "patient_id"],
    axis=1)

# Splitting the dataset into train and test sets.
# A fixed random_state (arbitrary value) makes the split, and therefore the
# trained model, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    stratify=y,
                                                    random_state=42)

# Creating and training LogisticRegression.
# max_iter is raised above the scikit-learn default of 100 because the solver
# previously stopped at the iteration limit on these unscaled features.
# NOTE(review): if the convergence warning persists, scale the features too.
reg = LogisticRegression(max_iter=1000)

# Columns are sorted by name so that inference can rebuild the same order.
reg.fit(X=X_train[sorted(X_train)], y=y_train)

# Saving the model
dump(value=reg, filename="model.joblib")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


['model.joblib']

### 8. Prepare online feature store
(Loading the features to online store)

There are two ways to load features into your online store:
- materialize

materialize loads the latest features between two dates.

`feast materialize 2020-01-01T00:00:00 2022-01-01T00:00:00`

- materialize-incremental

materialize-incremental loads features up to the provided end date:

`feast materialize-incremental 2022-01-01T00:00:00`

In [26]:
# Importing dependencies
from feast import FeatureStore
from datetime import datetime, timedelta

# Open the feature repository located in the current working directory.
store = FeatureStore(repo_path=".")

# Load feature values up to the current moment into the online store.
# NOTE(review): datetime.now() is a timezone-naive local time -- confirm how
# Feast interprets it relative to the (also naive) event_timestamp values.
materialize_until = datetime.now()
store.materialize_incremental(end_date=materialize_until)

Materializing [1m[32m2[0m feature views to [1m[32m2023-06-03 16:48:36+03:30[0m into the [1m[32msqlite[0m online store.

[1m[32mtarget_df_feature_view[0m from [1m[32m2023-05-22 13:18:36+03:30[0m to [1m[32m2023-06-03 16:48:36+03:30[0m:


100%|███████████████████████████████████████████████████████████████| 13/13 [00:00<00:00, 43.41it/s]


[1m[32mpredictors_df_feature_view[0m from [1m[32m2023-05-24 13:18:36+03:30[0m to [1m[32m2023-06-03 20:18:36+03:30[0m:


100%|███████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 64.94it/s]


In [27]:
# List the entities registered in the feature store.
store.list_entities()

[<feast.entity.Entity at 0x7f9c0e7bf280>]

### 9. Get online features for prediction

In [28]:
# Importing dependencies
from feast import FeatureStore
import pandas as pd
from joblib import load

# Open the feature repository located in the current working directory.
store = FeatureStore(repo_path=".")

# "<feature view>:<feature>" references to fetch from the online store.
feast_features = [
    "predictors_df_feature_view:Pregnancies",
    "predictors_df_feature_view:Glucose",
    "predictors_df_feature_view:BloodPressure",
    "predictors_df_feature_view:SkinThickness",
    "predictors_df_feature_view:Insulin",
    "predictors_df_feature_view:BMI",
    "predictors_df_feature_view:DiabetesPedigreeFunction",
    "predictors_df_feature_view:Age",
]

# One entity row per patient whose features we want.
lookup_ids = (767, 766)
entity_rows = [{"patient_id": pid} for pid in lookup_ids]

# Fetch the features for those patients as a plain dict of column -> values.
features = store.get_online_features(
    features=feast_features,
    entity_rows=entity_rows,
).to_dict()

# Tabulate the dict: one row per requested patient.
features_df = pd.DataFrame.from_dict(data=features)

In [29]:
# Show the online feature rows returned for the two requested patients.
features_df.head()

Unnamed: 0,patient_id,SkinThickness,Age,Pregnancies,DiabetesPedigreeFunction,BloodPressure,Insulin,Glucose,BMI
0,767,31,23,1,0.315,70,0,93,30.4
1,766,0,47,1,0.349,60,0,126,30.1


### 10. Call the predict function and see the output

In [30]:
# Load the trained model written by the training step.
reg = load("model.joblib")

# Drop the entity key and order the remaining columns by name, matching the
# sorted column order used when the model was fitted.
feature_columns = sorted(features_df.columns.drop("patient_id"))

# Run inference on the online features.
predictions = reg.predict(features_df[feature_columns])
print(predictions)

[0 0]


### References:

https://docs.feast.dev/getting-started/quickstart