In [1]:
from feast import FeatureStore
from feast.infra.offline_stores.file_source import SavedDatasetFileStorage
import pandas as pd
from sklearn.model_selection import train_test_split

# Open the Feast feature repository; every later cell queries it through `store`.
store = FeatureStore(repo_path=r"../feature_store/feature_repo")

## Feature Exploration

In [2]:
# List the fields registered on the "df_features" feature view (name and dtype).
store.get_feature_view(name="df_features").features

[continent_Africa-Int64,
 continent_Asia-Int64,
 continent_Europe-Int64,
 continent_North America-Int64,
 continent_Oceania-Int64,
 continent_South America-Int64,
 unit_of_wage_Hour-Int64,
 unit_of_wage_Month-Int64,
 unit_of_wage_Week-Int64,
 unit_of_wage_Year-Int64,
 region_of_employment_Island-Int64,
 region_of_employment_Midwest-Int64,
 region_of_employment_Northeast-Int64,
 region_of_employment_South-Int64,
 region_of_employment_West-Int64,
 has_job_experience-Int64,
 requires_job_training-Int64,
 full_time_position-Int64,
 education_of_employee-Int64,
 no_of_employees-Float32,
 company_age-Float32,
 prevailing_wage-Float32]

In [3]:
# List the fields registered on the "df_target" feature view, which carries the label.
store.get_feature_view(name="df_target").features

[case_status-String]

In [4]:
# Confirm the "case_id" entity is registered; the bare repr only shows the object address.
store.get_entity("case_id")

<feast.entity.Entity at 0x1e917b9b430>

In [5]:
# Load the target table from the feature repo's data folder. It is passed
# further down as the entity dataframe for the historical feature retrieval.
entity_df = pd.read_parquet(r"../feature_store/feature_repo/data/target.parquet")
entity_df.head()

Unnamed: 0,case_id,event_timestamp,case_status
0,EZYV01,1953-02-16 17:08:52.440507,Denied
1,EZYV02,1953-02-17 17:08:52.440507,Certified
2,EZYV03,1953-02-18 17:08:52.440507,Denied
3,EZYV04,1953-02-19 17:08:52.440507,Denied
4,EZYV05,1953-02-20 17:08:52.440507,Certified


In [6]:
# Build "<feature_view>:<feature>" references for every field of the
# "df_features" view, in the order the view declares them.
features_names = [
    f"df_features:{feature.name}"
    for feature in store.get_feature_view(name="df_features").features
]

features_names

['df_features:continent_Africa',
 'df_features:continent_Asia',
 'df_features:continent_Europe',
 'df_features:continent_North America',
 'df_features:continent_Oceania',
 'df_features:continent_South America',
 'df_features:unit_of_wage_Hour',
 'df_features:unit_of_wage_Month',
 'df_features:unit_of_wage_Week',
 'df_features:unit_of_wage_Year',
 'df_features:region_of_employment_Island',
 'df_features:region_of_employment_Midwest',
 'df_features:region_of_employment_Northeast',
 'df_features:region_of_employment_South',
 'df_features:region_of_employment_West',
 'df_features:has_job_experience',
 'df_features:requires_job_training',
 'df_features:full_time_position',
 'df_features:education_of_employee',
 'df_features:no_of_employees',
 'df_features:company_age',
 'df_features:prevailing_wage']

In [7]:
# Request the historical values of the selected features for the rows of
# entity_df. The result is a retrieval job, not yet a DataFrame.
training_data = store.get_historical_features(
    entity_df=entity_df, features=features_names
)

# Materialise the job as a pandas DataFrame for inspection.
training_data.to_df()

Unnamed: 0,case_id,event_timestamp,case_status,continent_Africa,continent_Asia,continent_Europe,continent_North America,continent_Oceania,continent_South America,unit_of_wage_Hour,...,region_of_employment_Northeast,region_of_employment_South,region_of_employment_West,has_job_experience,requires_job_training,full_time_position,education_of_employee,no_of_employees,company_age,prevailing_wage
0,EZYV01,1953-02-16 17:08:52.440507+00:00,Denied,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,2.0,1.517300,-0.943712,-1.398537
1,EZYV02,1953-02-17 17:08:52.440507+00:00,Certified,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,3.0,0.030912,-0.454005,0.169835
2,EZYV03,1953-02-18 17:08:52.440507+00:00,Denied,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,2.885076,-1.069026,0.919079
3,EZYV04,1953-02-19 17:08:52.440507+00:00,Denied,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,-1.343550,1.625983,0.169994
4,EZYV05,1953-02-20 17:08:52.440507+00:00,Certified,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,3.0,-0.432287,-0.724210,1.428604
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25475,EZYV25476,2022-11-16 17:08:52.440507+00:00,Certified,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.079917,-1.069026,0.049924
25476,EZYV25477,2022-11-17 17:08:52.440507+00:00,Certified,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,2.0,0.235747,-0.829275,3.876159
25477,EZYV25478,2022-11-18 17:08:52.440507+00:00,Certified,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,3.0,-0.413885,1.541805,1.360280
25478,EZYV25479,2022-11-19 17:08:52.440507+00:00,Certified,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,1.0,1.0,3.0,-0.111949,1.682601,0.221509


In [68]:
# Persist the retrieval job as a saved dataset named "final_df", stored as a
# parquet file inside the feature repo's data folder.
# NOTE(review): re-running this cell once the file/dataset already exists may
# raise; newer Feast releases expose an `allow_overwrite` flag on
# create_saved_dataset -- TODO confirm against the installed version.
dataset = store.create_saved_dataset(
    from_=training_data,
    name="final_df",
    storage=SavedDatasetFileStorage(r"../feature_store/feature_repo/data/final_df.parquet")
)

In [69]:
# Read the saved dataset back by name and preview its first rows.
training_df = store.get_saved_dataset(name="final_df").to_df()
training_df.head()



Unnamed: 0,prevailing_wage,continent_North America,unit_of_wage_Year,continent_Oceania,requires_job_training,region_of_employment_South,education_of_employee,continent_South America,region_of_employment_Midwest,continent_Europe,event_timestamp,case_status,continent_Asia,region_of_employment_Northeast,company_age,has_job_experience,unit_of_wage_Month,unit_of_wage_Week,region_of_employment_West,case_id,continent_Africa,unit_of_wage_Hour,region_of_employment_Island,no_of_employees,full_time_position
0,-1.398537,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,1953-02-16 17:08:52.440507+00:00,Denied,1.0,0.0,-0.943712,0.0,0.0,0.0,1.0,EZYV01,0.0,1.0,0.0,1.5173,1.0
1,0.169835,0.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,1953-02-17 17:08:52.440507+00:00,Certified,1.0,1.0,-0.454005,1.0,0.0,0.0,0.0,EZYV02,0.0,0.0,0.0,0.030912,1.0
2,0.919079,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1953-02-18 17:08:52.440507+00:00,Denied,1.0,0.0,-1.069026,0.0,0.0,0.0,1.0,EZYV03,0.0,0.0,0.0,2.885076,1.0
3,0.169994,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1953-02-19 17:08:52.440507+00:00,Denied,1.0,0.0,1.625983,0.0,0.0,0.0,1.0,EZYV04,0.0,0.0,0.0,-1.34355,1.0
4,1.428604,0.0,1.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,1953-02-20 17:08:52.440507+00:00,Certified,0.0,0.0,-0.72421,1.0,0.0,0.0,0.0,EZYV05,1.0,0.0,0.0,-0.432287,1.0


In [70]:
# Split the frame into the prediction target and the model inputs. The entity
# key and the timestamp are dropped together with the target column.
labels = training_df["case_status"]
features = training_df.drop(columns=["case_status", "event_timestamp", "case_id"])

In [71]:
# Hold out a test set, stratifying on the label. test_size is left at the
# scikit-learn default. random_state pins the shuffle so that
# Restart Kernel -> Run All reproduces exactly the same split.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    stratify=labels,
    random_state=42,
)