In [2]:
import sys
# Display the path of the Python interpreter running this kernel, to confirm
# which environment the notebook is executing in.
sys.executable


'c:\\Users\\Mon pc\\Documents\\citybike-rebalancing-agent\\.venv\\Scripts\\python.exe'

In [3]:
# Shared imports for the notebook.
import datetime
import requests
import pandas as pd
import hopsworks
import datetime  # NOTE(review): duplicate of the import above; harmless but redundant
from pathlib import Path
import json
import re
import os
import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used
# below; consider narrowing the filter to specific categories or modules.
warnings.filterwarnings("ignore")

In [4]:
import os

import requests
from dotenv import load_dotenv

# Smoke test: confirm the CityBikes API answers for the configured network.
#
# load_dotenv() is called here so the cell works on a fresh kernel; without it
# CITYBIKES_NETWORK_ID is only defined if the shell exported it or a later cell
# was executed first. Variables already present in the environment are kept.
load_dotenv()

network_id = os.getenv("CITYBIKES_NETWORK_ID")
if not network_id:
    # Otherwise the request would silently target ".../networks/None".
    raise RuntimeError(
        "CITYBIKES_NETWORK_ID is not set; define it in .env or in the environment."
    )

url = f"https://api.citybik.es/v2/networks/{network_id}"
# A timeout keeps the cell from hanging indefinitely if the API is unreachable.
response = requests.get(url, timeout=30)

print(f"Status Code: {response.status_code}")
# Surface HTTP failures as a clear HTTPError rather than a KeyError/JSON error
# raised while indexing an error payload.
response.raise_for_status()
print(f"Station Count: {len(response.json()['network']['stations'])}")

Status Code: 200
Station Count: 1498


In [5]:
import os
from dotenv import load_dotenv
import hopsworks

# Populate the process environment from the local .env file.
load_dotenv()

# Connection settings for Hopsworks, all read from environment variables.
login_settings = {
    "host": os.getenv("HOPSWORKS_HOST"),
    "project": os.getenv("HOPSWORKS_PROJECT"),
    "api_key_value": os.getenv("HOPSWORKS_API_KEY"),
}
project = hopsworks.login(**login_settings)
print(f"Connected to MJ Project: {project.name}")

# Handles reused by later cells: the project's feature store and the
# CityBikes network identifier.
fs = project.get_feature_store()
network_id = os.getenv("CITYBIKES_NETWORK_ID")

2025-12-20 21:35:08,526 INFO: Initializing external client
2025-12-20 21:35:08,527 INFO: Base URL: https://c.app.hopsworks.ai:443






2025-12-20 21:35:10,206 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1272008
Connected to MJ Project: meriemka


In [7]:
import pyarrow.parquet as pq

# Open the Parquet file and list its schema's column names, to learn what the
# columns are called before loading any data.
parquet_file = pq.ParquetFile("data/202510-world-stats.parquet")
column_names = parquet_file.schema.names
print(f"Available columns: {column_names}")

# If no 'network_id' column is listed, the network identifier is stored under
# another name (or is missing). Whichever column holds it, the load filter must
# match its values exactly.

Available columns: ['tag', 'id', 'nuid', 'name', 'latitude', 'longitude', 'bikes', 'free', 'extra', 'timestamp']


In [8]:
# --- STEP 2: Load the rows of one network ---
# The value of CITYBIKES_NETWORK_ID has to equal an entry of the file's 'tag'
# column (e.g., 'velib'); that column is what identifies the network.
network_id = os.getenv("CITYBIKES_NETWORK_ID")

wanted_columns = ['tag', 'id', 'bikes', 'free', 'timestamp']
tag_filter = [('tag', '==', network_id)]

# Read only the needed columns for the selected network, then rename 'id' to
# 'station_id' so the column matches the primary key used downstream.
df = pd.read_parquet(
    "data/202510-world-stats.parquet",
    columns=wanted_columns,
    filters=tag_filter,
).rename(columns={'id': 'station_id'})

print(f"Loaded {len(df)} rows for {network_id}")

Loaded 4970559 rows for velib


In [9]:
# --- STEP 3: Feature Engineering ---
# Order each station's snapshots chronologically first. net_flow below is a
# row-to-row difference within a station, so it is only meaningful when the
# rows of that station are in time order; the file's row order is not relied on.
df = df.sort_values(['station_id', 'timestamp'], ignore_index=True)

# In this dataset, capacity = bikes + free slots
df['capacity'] = df['bikes'] + df['free']

# Share of docks that hold a bike. A station reporting 0 bikes and 0 free docks
# gives 0/0 = NaN, which is recorded as 0.
df['occupancy_pct'] = (df['bikes'] / df['capacity'] * 100).fillna(0).astype('float32')

# Change in available bikes since the station's previous snapshot
# (0 for each station's first snapshot).
df['net_flow'] = df.groupby('station_id')['bikes'].diff().fillna(0).astype('float32')

# Convert timestamp to integer Unix epoch milliseconds.
# Dividing the elapsed time by a 1 ms Timedelta is correct whatever resolution
# the datetime column carries (ns, us, ...); reinterpreting the raw integers and
# dividing by 10**6 is only right for nanoseconds, and Series.view is deprecated.
# The dtype guard makes the cell safe to re-run: values that are already integers
# are left alone instead of being re-parsed as nanoseconds.
if not pd.api.types.is_integer_dtype(df['timestamp']):
    # utc=True treats naive values as UTC and converts tz-aware ones to UTC.
    event_time_utc = pd.to_datetime(df['timestamp'], utc=True)
    unix_epoch = pd.Timestamp("1970-01-01", tz="UTC")
    df['timestamp'] = (event_time_utc - unix_epoch) // pd.Timedelta(milliseconds=1)

In [11]:
import great_expectations as ge

# --- STEP 4: Data-quality rules ---
# occupancy_pct is a percentage, so every value must lie between 0 and 100.
occupancy_in_range = ge.core.ExpectationConfiguration(
    kwargs={"column": "occupancy_pct", "min_value": 0, "max_value": 100},
    expectation_type="expect_column_values_to_be_between",
)

expectation_suite = ge.core.ExpectationSuite(expectation_suite_name="bike_quality_rules")
# Last expression of the cell, so the notebook displays what add_expectation returns.
expectation_suite.add_expectation(occupancy_in_range)

{"expectation_type": "expect_column_values_to_be_between", "kwargs": {"column": "occupancy_pct", "min_value": 0, "max_value": 100}, "meta": {}}

In [None]:
# --- STEP 5: Hopsworks Materialization ---
# Create (or fetch) the feature group and attach the data-quality suite so the
# data is validated on insert.
bike_fg = fs.get_or_create_feature_group(
    name="station_dynamics",
    version=1,
    primary_key=['station_id'],
    event_time='timestamp',
    online_enabled=True,
    expectation_suite=expectation_suite 
)

bike_fg.insert(df)

# Semantic layer for LLM Agent Tools: one description per feature.
# net_flow is computed as the difference between a station's consecutive
# snapshots, and the loaded row count (about 4.97M rows for one network-month)
# indicates several snapshots per station per hour, so it is described as a
# per-snapshot change rather than an hourly one.
feature_descriptions = {
    "net_flow": "Change in available bikes since the station's previous snapshot; negative values signal depletion.",
    "occupancy_pct": "How full the station is as a percentage of total docks.",
    "tag": "The unique network identifier for the city (e.g., 'velib').",
    "station_id": "The unique identifier for each physical bike station.",
    "bikes": "The number of available bikes currently at the station.",
    "free": "The number of empty docks available for bike returns.",
    "capacity": "The total number of docks (bikes + free) at the station.",
}
for feature_name, description in feature_descriptions.items():
    bike_fg.update_feature_description(feature_name, description)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1272008/fs/1258608/fg/1875274
2025-12-20 21:38:23,138 INFO: 	1 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1272008/fs/1258608/fg/1875274


Uploading Dataframe: 100.00% |██████████| Rows 4970559/4970559 | Elapsed Time: 01:24 | Remaining Time: 00:00


Launching job: station_dynamics_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1272008/jobs/named/station_dynamics_1_offline_fg_materialization/executions


<hsfs.feature_group.FeatureGroup at 0x20563b27010>

In [13]:
# --- STEP 6: Create the Feature View ---
# The feature view is the logical read interface the ReAct agent uses to reach
# the data store.

# Selection query: every feature of the feature group. Features from several
# groups could be combined here if needed.
query = bike_fg.select_all()

# Register the view, or fetch it if this name/version already exists.
feature_view = fs.get_or_create_feature_view(
    query=query,
    name="station_dynamics_view",
    version=1,
    description="Interface for Rebalancing Agent to get station state and history",
)

ready_message = f"Feature View '{feature_view.name}' version {feature_view.version} is ready!"
print(ready_message)

Feature view created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1272008/fs/1258608/fv/station_dynamics_view/version/1
Feature View 'station_dynamics_view' version 1 is ready!
