In [21]:
import pandas as pd
from feast import FeatureStore
from feast.infra.offline_stores.file_source import SavedDatasetFileStorage

store = FeatureStore(repo_path=".")

# normally, this is where we'd zoom in on a timeframe or subset of interest, but this is a small dataset for training, grab it all
entity_df = pd.DataFrame.from_dict({ "PassengerId": list(range(1, 891)) })
entity_df["event_timestamp"] = pd.to_datetime("now", utc=True)
train_df = store.get_historical_features(
    entity_df=entity_df, features=store.get_feature_service('omniscient_service')
).to_df()

# and grab the test data as well
entity_df = pd.DataFrame.from_dict({ "PassengerId": list(range(892, 1309)) })
entity_df["event_timestamp"] = pd.to_datetime("now", utc=True)
test_df = store.get_historical_features(
    entity_df=entity_df, features=store.get_feature_service('test_service')
).to_df()

print(train_df.info())
print('-'*30)
print (test_df.info())




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 890 entries, 0 to 889
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   PassengerId      890 non-null    int64              
 1   event_timestamp  890 non-null    datetime64[ns, UTC]
 2   Survived         890 non-null    int64              
 3   Pclass           890 non-null    int64              
 4   Name             890 non-null    object             
 5   Sex              890 non-null    object             
 6   Age              713 non-null    float64            
 7   SibSp            890 non-null    int64              
 8   Parch            890 non-null    int64              
 9   Ticket           890 non-null    object             
 10  Fare             890 non-null    float64            
 11  Cabin            204 non-null    object             
 12  Embarked         888 non-null    object             
dtypes: datetime64[ns, UT

In [22]:
#   we already checked for duplicate records, but could do so again from omniscient_service
print('train_df # dup. records:', train_df.duplicated().sum())


train_df # dup. records: 0


In [23]:
# check for null values
print ("cols in train_df w/nulls: ")
print(train_df.isnull().sum())
print('-'*40)
print("cols in test_df w/nulls: ")
print(test_df.isnull().sum())


cols in train_df w/nulls: 
PassengerId          0
event_timestamp      0
Survived             0
Pclass               0
Name                 0
Sex                  0
Age                177
SibSp                0
Parch                0
Ticket               0
Fare                 0
Cabin              686
Embarked             2
dtype: int64
----------------------------------------
cols in test_df w/nulls: 
PassengerId          0
event_timestamp      0
Pclass               0
Name                 0
Sex                  0
Age                 85
SibSp                0
Parch                0
Ticket               0
Fare                 1
Cabin              326
Embarked             0
dtype: int64


In [None]:
# possible / needed sanitization tasks:
# TODO: complete & abstract for Age
# TODO: complete for Embarked & normalize
# TODO: normalize Sex
# TODO: complete & abstract for Fare
# TODO: title abstraction
# TODO: complete & abstract for Cabin (to Deck)
# TODO: abstract family count / alone from Parch & SibSp

# combine datasets for when we want to sanitize both at once
# working toward on_demand example, but it keeps expecting wrong field types
# combined = [train_df, test_df]
#
# features = store.get_online_features(
#     features=store.get_feature_service("sanitized_service"),
#     entity_rows=list({ "PassengerId": 1 }, { "PassengerId": 2 }, { "PassengerId": 3 } )
#     # entity_rows=list({ "PassengerId": list(range(1, 891)) }.items())
# ).to_dict()
# for key, value in sorted(features.items()):
#     print(key, " : ", value)


#TODO: add feature server (makes feature available online w/UX)
#TODO: use point-in-time correct feature set (ensures no data leakage around manual models during training)
#TODO: look into this:
# reproducible model training / model backtesting / experiment management: Feast captures feature and model metadata, but does not version-control datasets / labels or manage train / test splits. Other tools like DVC, MLflow and Kubeflow are better suited for this.



