In [None]:
import os
import json
import pandas as pd
from pathlib import Path
import calendar

# Raise the column display limit to 200 so wide frames are shown without elided columns.
pd.set_option("display.max_columns", 200)

In [None]:
# Show the kernel's working directory; the CSV in the next cell is read relative to it.
%pwd

In [None]:
# Load the raw ride records from a CSV located in the current working directory.
df = pd.read_csv("rideshare_kaggle.csv")
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Parse the raw timestamp strings, replace missing prices with 0, and expose the
# timestamp column under the name `event_timestamp`.
parsed_ts = pd.to_datetime(df["datetime"], format="%Y-%m-%d %H:%M:%S")
zero_filled_price = df["price"].fillna(0)

df = df.assign(datetime=parsed_ts, price=zero_filled_price).rename(
    columns={"datetime": "event_timestamp"}
)

In [None]:
# data_len = len(df)
# ids = [x for x in range(0, data_len)]

In [None]:
# df["Driver_Id"] = ids

# df.drop('id', axis=1, inplace=True)

In [None]:
# Quick look at the first two rows after the cleaning steps above.
df.head(2)

In [None]:
# Columns removed from the frame before feature engineering (weather detail and
# assorted time-of-event fields). Order is kept as originally written.
extra_cols = [
    'apparentTemperature',
    'precipIntensity',
    'precipProbability',
    'humidity',
    'windSpeed',
    'windGust',
    'windGustTime',
    'visibility',
    'temperatureHigh',
    'timezone',
    'temperatureHighTime',
    'temperatureLow',
    'temperatureLowTime',
    'apparentTemperatureHigh',
    'apparentTemperatureHighTime',
    'apparentTemperatureLowTime',
    'icon',
    'dewPoint',
    'pressure',
    'uvIndex',
    'visibility.1',
    'ozone',
    'sunriseTime',
    'sunsetTime',
    'moonPhase',
    'precipIntensityMax',
    'uvIndexTime',
    'temperatureMinTime',
    'temperatureMaxTime',
    'apparentTemperatureMinTime',
    'apparentTemperatureMaxTime',
]

In [None]:
# Discard the columns listed in `extra_cols`.
df = df.drop(columns=extra_cols)

In [None]:
# Weekday name for every event timestamp (index 0 is Monday in both pandas'
# `dayofweek` and `calendar.day_name`).
weekday_names = list(calendar.day_name)
day_week = [weekday_names[idx] for idx in df["event_timestamp"].dt.dayofweek]

In [None]:
from sklearn import preprocessing
# Encoder that maps categorical labels to integer codes.
label_encoder = preprocessing.LabelEncoder()

In [None]:
# Integer-encode every categorical column. Each column gets its own fitted
# encoder, stored in `label_encoders`, so codes can later be mapped back to the
# original labels with `label_encoders[col].inverse_transform(...)`.
# Re-fitting one shared encoder column after column keeps only the last
# mapping, and `name` was previously being encoded twice.
categorical_cols = [
    'destination',
    'product_id',
    'short_summary',
    'long_summary',
    'name',
    'source',
    'cab_type',
]

label_encoders = {}
for col in categorical_cols:
    label_encoders[col] = preprocessing.LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col])

In [None]:
# Feature table: every column except the label.
predictor_df = df.drop(columns="price")

# Label table: entity key, event timestamp and the price label.
target_columns = ["id", "event_timestamp", "price"]
target_df = df.loc[:, target_columns]

In [None]:
# First two rows of the feature table (all columns except `price`).
predictor_df.head(2)

In [None]:
# First three rows of the full, label-encoded frame.
df.head(3)

#### Initialize the Feast feature repository

In [None]:
# Scaffold a Feast project named `feast_demo`; the parquet files below are written into it.
!feast init feast_demo

In [None]:
# Write both tables as parquet files into the feature repository's data folder.
for frame, parquet_path in (
    (predictor_df, "feast_demo/feature_repo/data/predictor_df.parquet"),
    (target_df, "feast_demo/feature_repo/data/target_df.parquet"),
):
    frame.to_parquet(path=parquet_path)

In [None]:
import os

In [None]:
# Move into the feature repository so later cells can use repo-relative paths.
# The guard keeps the cell idempotent: when it is re-run from inside the
# repository, the relative path no longer exists and an unconditional chdir
# would raise FileNotFoundError.
if Path.cwd().parts[-2:] != ("feast_demo", "feature_repo"):
    os.chdir("feast_demo/feature_repo")

In [None]:
# Confirm the working directory after the chdir above.
%pwd

In [None]:
# Apply the feature repository in the current directory via the Feast CLI.
!feast apply

In [None]:
from feast import FeatureStore
from feast.infra.offline_stores.file_source import SavedDatasetFileStorage

In [None]:
store = FeatureStore(repo_path=".")
# Entity frame written earlier: `id`, `event_timestamp` and the `price` label.
entity_df = pd.read_parquet(path = "data/target_df.parquet")


# Join the requested `driver_stats` features onto the entity rows.
training_data = store.get_historical_features(
    entity_df= entity_df,
    features = [
        "driver_stats:hour",
        "driver_stats:day",
        "driver_stats:month",
        "driver_stats:source",
        "driver_stats:destination",
        "driver_stats:cab_type",
        "driver_stats:product_id",
        "driver_stats:name",
        "driver_stats:distance",
        "driver_stats:surge_multiplier",
        "driver_stats:latitude",
        "driver_stats:longitude",
        "driver_stats:temperature",
        "driver_stats:short_summary",
        "driver_stats:apparentTemperatureLow",
        "driver_stats:windBearing",
        "driver_stats:cloudCover",
        "driver_stats:temperatureMin",
        "driver_stats:temperatureMax",
        "driver_stats:apparentTemperatureMin",
        "driver_stats:apparentTemperatureMax"
    ]
)

# Persist the retrieval result as a named dataset. `allow_overwrite=True` lets
# this cell be re-run (Restart & Run All) after the dataset and its parquet
# file already exist; without it a second run is rejected.
dataset = store.create_saved_dataset(
    from_=training_data, 
    name="Uber-lyft-Dataset",
    storage= SavedDatasetFileStorage("data/Uber-Lyft-Dataset.parquet"),
    allow_overwrite=True,
)

In [None]:
# Preview the first rows of the retrieved training data.
training_data.to_df().head()

### Model Training

In [None]:
from feast import FeatureStore
from joblib import dump
from sklearn.base import clone
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

In [None]:
## getting feature store

store = FeatureStore(repo_path = ".")

# Retrieving the saved dataset and converting to pandas Dataframe

training_df = store.get_saved_dataset(name="Uber-lyft-Dataset").to_df()

# Separating the features and labels: `price` is the target; the entity key and
# the timestamp are identifiers rather than model inputs.

y = training_df["price"]
X = training_df.drop(["id", "event_timestamp", "price"], axis=1)

# Splitting the dataset into train and test sets. The split is seeded so the
# reported scores are reproducible across kernel restarts.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)


In [None]:
# Preview the training frame loaded from the saved dataset.
training_df.head()

In [None]:
def train_test_models(X_train, X_test, y_train, y_test, random_state=None):
    """Fit four baseline regressors and print their hold-out scores.

    Every estimator is fitted on the training split and the value returned by
    its ``score`` method on the test split is printed.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training targets.
        y_test: Test targets.
        random_state: Optional seed forwarded to the randomised estimators
            (random forest, decision tree, gradient boosting). The default
            ``None`` leaves them unseeded, which matches the previous
            behaviour.

    Returns:
        Tuple of fitted estimators in the order
        ``(linear regression, random forest, decision tree, gradient boosting)``.
    """
    # (heading printed before fitting, label printed with the score, estimator)
    candidates = (
        ("Linear Regression", "Linear Regression Score: ",
         LinearRegression()),
        ("Random Forest Regressor", "Random Forest Score: ",
         RandomForestRegressor(random_state=random_state)),
        ("Decision Tree Regressor", "Decision Tree Regressor: ",
         DecisionTreeRegressor(random_state=random_state)),
        ("Gradient Boosting Regressor", "Gradient Boosting Regressor: ",
         GradientBoostingRegressor(random_state=random_state)),
    )

    fitted = []
    for heading, score_label, estimator in candidates:
        print(heading)
        estimator = estimator.fit(X_train, y_train)
        print(score_label, estimator.score(X_test, y_test))
        fitted.append(estimator)

    return tuple(fitted)
    

In [None]:
# Fit and score all four baselines; `models` holds them in the order
# (linear regression, random forest, decision tree, gradient boosting).
models = train_test_models(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

In [None]:
# Saving the gradient boosting model. `train_test_models` returns
# (linear regression, random forest, decision tree, gradient boosting), so the
# GBM sits at index 3; index 1 would store the random forest under a "gbm"
# file name.
dump(value=models[3], filename="gbm.joblib")

### 8. Prepare online feature store
(Loading the features to online store)

There are two ways to load features into the online store:<br>

materialize
materialize loads the latest features between two dates.

`feast materialize 2020-01-01T00:00:00 2022-01-01T00:00:00`<br>

materialize-incremental
materialize-incremental loads features up to the provided end date:

`feast materialize-incremental 2022-01-01T00:00:00`

In [None]:
from feast import FeatureStore
from datetime import datetime, timedelta
from joblib import load

In [None]:
# Latest event timestamp present in the data.
df["event_timestamp"].max()

In [None]:
# Time elapsed between now and 2018-12-18 19:15:10.432493
# (presumably the latest timestamp shown above — TODO confirm).
datetime.now() - datetime(2018, 12, 18, 19, 15, 10, 432493)

In [None]:
# The moment that lies 1867 days before now.
datetime.now() -timedelta(days=1867)

In [None]:
# The fixed reference timestamp used above, shown as a datetime object.
datetime(2018, 12, 18, 19, 15, 10, 432493)

In [None]:
# Rides whose timestamp lies between 2018-12-17 00:00 and 2018-12-18 00:00,
# both bounds inclusive.
in_window = df["event_timestamp"].between("2018-12-17", "2018-12-18")
df[in_window]

In [None]:
store = FeatureStore(repo_path=".")

# Materialize the final day of data, with the window derived from the data
# itself. A window written as "utcnow() minus a hard-coded number of days"
# shifts by one day every day, so it only covers the data on the date the
# offset was worked out.
# One second is added to the end bound in case the upper bound is treated as
# exclusive — NOTE(review): confirm against the offline store in use.
materialize_end = df["event_timestamp"].max().to_pydatetime() + timedelta(seconds=1)
materialize_start = materialize_end - timedelta(days=1)

store.materialize(start_date=materialize_start, end_date=materialize_end)

In [None]:
from feast import FeatureStore
from pprint import pprint


store= FeatureStore(repo_path=".")


# Same feature references that were used to build the training dataset.
feast_features = [
        "driver_stats:hour",
        "driver_stats:day",
        "driver_stats:month",
        "driver_stats:source",
        "driver_stats:destination",
        "driver_stats:cab_type",
        "driver_stats:product_id",
        "driver_stats:name",
        "driver_stats:distance",
        "driver_stats:surge_multiplier",
        "driver_stats:latitude",
        "driver_stats:longitude",
        "driver_stats:temperature",
        "driver_stats:short_summary",
        "driver_stats:apparentTemperatureLow",
        "driver_stats:windBearing",
        "driver_stats:cloudCover",
        "driver_stats:temperatureMin",
        "driver_stats:temperatureMax",
        "driver_stats:apparentTemperatureMin",
        "driver_stats:apparentTemperatureMax"
]

# Entity rows are keyed by `id`, the key column carried by the parquet sources,
# the historical entity frame and the inference cell. No table written by this
# notebook has a `Driver_Id` column.
# NOTE(review): assumes the entity's join key in the feature repo is `id` — confirm.
feature_vector = store.get_online_features(
    features=feast_features, 
    entity_rows=[
        {"id": "2effa2c2-6728-4274-b904-199a9fc830c4"},
        {"id": "1d451059-895c-4179-8cec-40adfbc4f6d3"}
                ]
).to_dict()

# Converting the features to a DataFrame
features_df = pd.DataFrame.from_dict(data=feature_vector)

pprint(feature_vector)

In [None]:
# Preview the online feature rows as a DataFrame.
features_df.head()

### Call the predict function to see the output

In [None]:
# Loading our model and doing inference
reg = load("gbm.joblib")

# Feed the columns in the order the model saw at fit time. Sorting them
# alphabetically does not reproduce the training column order, and
# scikit-learn validates column names (and their order) at predict time when
# the model was fitted on a DataFrame.
expected_cols = getattr(reg, "feature_names_in_", None)
if expected_cols is None:
    # Model fitted without column names: keep the retrieval order, minus the
    # entity key.
    expected_cols = features_df.drop("id", axis=1).columns
model_input = features_df[list(expected_cols)]

predictions = reg.predict(model_input)
print(predictions)

# A regressor has no class probabilities. The old name is kept as an alias of
# the predictions instead of running (and printing) the same predict call twice.
prediction_probabilities = predictions

In [None]:
def feature_eliminate(trained_model, X, y, n_features: int = 40):
    """Score a model on the features kept by recursive feature elimination.

    The data is split first (80/20, ``random_state=0``); features are then
    selected with RFE on the training rows only, so the held-out rows do not
    influence which features are kept. A fresh clone of ``trained_model`` is
    fitted on the selected training columns, leaving the caller's fitted model
    untouched.

    Args:
        trained_model: Estimator used both as the RFE ranking estimator and as
            the template for the model that is scored.
        X: Feature DataFrame.
        y: Target values aligned with ``X``.
        n_features: Number of features RFE should keep.

    Returns:
        The hold-out score of the refitted clone; the same value is printed.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    rfe = RFE(trained_model, n_features_to_select=n_features)
    rfe = rfe.fit(X_train, y_train)
    selected = X.columns[rfe.support_]

    # Fit a clone so the model passed in by the caller keeps its fitted state.
    new_fit = clone(trained_model).fit(X_train[selected], y_train)
    score = new_fit.score(X_test[selected], y_test)
    print(score)
    return score

In [None]:
# Feature counts to try for every trained model.
n_features = [5, 10, 15, 20]

# Evaluate each (model, feature count) pair, models in the outer position.
for model, nf in ((m, k) for m in models for k in n_features):
    print(f'{model}->{nf} features')
    feature_eliminate(model, X, y, nf)