In [40]:
# Turn on IPython's autoreload so that edits to imported modules (such as the
# project's `src` package) take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [41]:
import sys
import os

# Add the parent directory of the current working directory to the Python path
# so that the `src` package can be imported from this notebook.
# NOTE(review): this assumes the kernel's working directory sits directly
# under the project root -- confirm if the notebook is launched elsewhere.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
import src.config as config

In [42]:
import hopsworks
import pandas as pd
from datetime import timedelta
from src.inference import get_feature_store, fetch_predictions

def fetch_hourly_rides(hours, version=1):
    """Read hourly ride rows from the feature store for a recent time window.

    The lower bound of the window is the current UTC time minus ``hours``,
    floored to the start of the hour. Only rows whose ``pickup_hour`` is at or
    after that bound are requested; no upper bound is applied.

    Args:
        hours: Size of the look-back window in hours (passed to ``timedelta``).
        version: Version of the feature group to read. Defaults to ``1``,
            the value that used to be hard-coded, so existing calls behave
            exactly as before.

    Returns:
        The result of ``query.read()`` for the filtered feature group.
    """
    # This is the window's lower bound, not the current hour, hence the name.
    window_start = (pd.Timestamp.now(tz="Etc/UTC") - timedelta(hours=hours)).floor('h')

    feature_store = get_feature_store()
    feature_group = feature_store.get_feature_group(
        name=config.FEATURE_GROUP_NAME,
        version=version,
    )

    # Select every feature, then keep only rows at or after the window start.
    query = feature_group.select_all()
    query = query.filter(feature_group.pickup_hour >= window_start)

    return query.read()

In [43]:
import os
import hopsworks

# Both the API key and the project name are taken from environment variables,
# so neither value is written into the notebook itself.
project_name = os.environ.get('HOPSWORKS_PROJECT_NAME')
api_key = os.environ.get('HOPSWORKS_API_KEY')

# Only the project name is echoed; the API key is deliberately never printed.
print("Project Name:", project_name)

# Log in to Hopsworks and keep the returned handle in `project`.
project = hopsworks.login(project=project_name, api_key_value=api_key)
print(f"Successfully connected to Hopsworks project: {project_name}")

Project Name: new_york_taxi
2025-03-03 14:21:03,118 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-03-03 14:21:03,123 INFO: Initializing external client
2025-03-03 14:21:03,124 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-03-03 14:21:04,163 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1214677
Successfully connected to Hopsworks project: new_york_taxi


In [44]:
# Actual ride counts for a 12-hour look-back window.
df = fetch_hourly_rides(hours=12)

2025-03-03 14:21:05,363 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-03-03 14:21:05,366 INFO: Initializing external client
2025-03-03 14:21:05,367 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-03-03 14:21:06,229 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1214677
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (4.46s) 


In [45]:
# Show the fetched rides frame (columns: pickup_hour, pickup_location_id, rides).
df

Unnamed: 0,pickup_hour,pickup_location_id,rides
0,2025-03-03 10:00:00+00:00,72,2
1,2025-03-03 03:00:00+00:00,50,1
2,2025-03-03 13:00:00+00:00,212,0
3,2025-03-03 13:00:00+00:00,215,2
4,2025-03-03 11:00:00+00:00,220,0
...,...,...,...
4513,2025-03-03 19:00:00+00:00,214,0
4514,2025-03-03 19:00:00+00:00,76,1
4515,2025-03-03 19:00:00+00:00,70,23
4516,2025-03-03 19:00:00+00:00,137,56


In [46]:
# Load stored predictions via the project's `fetch_predictions` helper.
# NOTE(review): 240 is presumably a look-back window in hours, mirroring
# `fetch_hourly_rides` -- confirm against src.inference.fetch_predictions.
df_pred = fetch_predictions(240)

2025-03-03 14:21:16,617 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-03-03 14:21:16,622 INFO: Initializing external client
2025-03-03 14:21:16,623 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-03-03 14:21:17,315 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1214677
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (54.27s) 


In [47]:
# Show the predictions frame (columns: pickup_location_id, predicted_demand, pickup_hour).
df_pred

Unnamed: 0,pickup_location_id,predicted_demand,pickup_hour
0,177,1.0,2025-03-03 15:00:00+00:00
1,72,2.0,2025-03-03 15:00:00+00:00
2,16,0.0,2025-03-03 15:00:00+00:00
3,209,4.0,2025-03-03 15:00:00+00:00
4,68,123.0,2025-03-03 15:00:00+00:00
...,...,...,...
748,163,188.0,2025-03-03 20:00:00+00:00
749,208,0.0,2025-03-03 20:00:00+00:00
750,256,2.0,2025-03-03 20:00:00+00:00
751,115,0.0,2025-03-03 20:00:00+00:00


In [48]:
# Pair each actual ride count with its prediction. The join is an inner join
# (the pandas default, spelled out here), so only (location, hour) pairs that
# appear in both frames survive.
merged_df = df.merge(
    df_pred,
    how='inner',
    on=['pickup_location_id', 'pickup_hour'],
)

In [49]:
# Show the joined actuals/predictions frame.
merged_df

Unnamed: 0,pickup_hour,pickup_location_id,rides,predicted_demand
0,2025-03-03 15:00:00+00:00,224,5,3.0
1,2025-03-03 15:00:00+00:00,40,1,1.0
2,2025-03-03 15:00:00+00:00,37,2,1.0
3,2025-03-03 15:00:00+00:00,215,1,1.0
4,2025-03-03 15:00:00+00:00,233,76,55.0
...,...,...,...,...
497,2025-03-03 19:00:00+00:00,214,0,0.0
498,2025-03-03 19:00:00+00:00,76,1,1.0
499,2025-03-03 19:00:00+00:00,70,23,26.0
500,2025-03-03 19:00:00+00:00,137,56,24.0


In [50]:
# Signed error, predicted minus actual: positive values are over-predictions,
# negative values are under-predictions.
merged_df['difference'] = merged_df['predicted_demand'].sub(merged_df['rides'])

In [51]:
# Display a sorted copy (by location, then hour); `merged_df` itself is not reordered.
merged_df.sort_values(by=["pickup_location_id", "pickup_hour"])

Unnamed: 0,pickup_hour,pickup_location_id,rides,predicted_demand,difference
138,2025-03-03 15:00:00+00:00,2,0,0.0,0.0
473,2025-03-03 19:00:00+00:00,2,0,0.0,0.0
205,2025-03-03 15:00:00+00:00,3,0,0.0,0.0
469,2025-03-03 19:00:00+00:00,3,0,0.0,0.0
51,2025-03-03 15:00:00+00:00,4,3,6.0,3.0
...,...,...,...,...,...
322,2025-03-03 19:00:00+00:00,261,24,24.0,0.0
167,2025-03-03 15:00:00+00:00,262,97,20.0,-77.0
437,2025-03-03 19:00:00+00:00,262,65,71.0,6.0
141,2025-03-03 15:00:00+00:00,263,119,99.0,-20.0


In [52]:
# Show the joined frame again, now including the signed `difference` column.
merged_df

Unnamed: 0,pickup_hour,pickup_location_id,rides,predicted_demand,difference
0,2025-03-03 15:00:00+00:00,224,5,3.0,-2.0
1,2025-03-03 15:00:00+00:00,40,1,1.0,0.0
2,2025-03-03 15:00:00+00:00,37,2,1.0,-1.0
3,2025-03-03 15:00:00+00:00,215,1,1.0,0.0
4,2025-03-03 15:00:00+00:00,233,76,55.0,-21.0
...,...,...,...,...,...
497,2025-03-03 19:00:00+00:00,214,0,0.0,0.0
498,2025-03-03 19:00:00+00:00,76,1,1.0,0.0
499,2025-03-03 19:00:00+00:00,70,23,26.0,3.0
500,2025-03-03 19:00:00+00:00,137,56,24.0,-32.0


In [53]:
import pandas as pd  
import plotly.express as px
# Aliases for the actuals (df1) and the predictions (df2).
df1, df2 = df, df_pred

# Join actuals and predictions on (location, hour) and attach the per-row
# absolute error in the same expression.
merged_df = pd.merge(df1, df2, on=['pickup_location_id', 'pickup_hour']).assign(
    absolute_error=lambda frame: (frame['predicted_demand'] - frame['rides']).abs()
)

# Mean absolute error per pickup hour, one row per hour, column named 'MAE'.
mae_by_hour = (
    merged_df.groupby('pickup_hour')['absolute_error']
    .mean()
    .reset_index()
    .rename(columns={'absolute_error': 'MAE'})
)

# Line chart of the hourly MAE, with a marker on each hour.
fig = px.line(
    mae_by_hour,
    x='pickup_hour',
    y='MAE',
    markers=True,
    labels={'pickup_hour': 'Pickup Hour', 'MAE': 'Mean Absolute Error'},
    title='Mean Absolute Error (MAE) by Pickup Hour',
)

fig.show()

In [54]:
# Unweighted mean of the per-hour MAE values: every hour counts equally,
# regardless of how many rows it contributed.
mae_by_hour.loc[:, "MAE"].mean()

9.330677290836654