# <span style='color:#ff5f27'> Initialization </span>

### Hopsworks Settings

In [None]:
import sys
from pathlib import Path
import warnings
warnings.filterwarnings("ignore", module="IPython")

def is_google_colab() -> bool:
    """Report whether this notebook is executing inside Google Colab.

    The check looks for the substring ``google.colab`` in the string form of
    the object returned by IPython's ``get_ipython()``.

    Returns:
        True when the substring is present, False otherwise.
    """
    return "google.colab" in str(get_ipython())

def clone_repository() -> None:
    """Clone the mlfs-book repository and change into its directory.

    Relies on the IPython ``!`` shell escape and the ``%cd`` magic, so it only
    works when executed inside an IPython/Jupyter session.
    """
    !git clone https://github.com/featurestorebook/mlfs-book.git
    %cd mlfs-book

def install_dependencies() -> None:
    """Install the project's Python dependencies with ``uv``.

    First upgrades ``uv`` through pip, then installs the requirements read
    from ``pyproject.toml`` (all extras) into the system environment.
    Uses the IPython ``!`` shell escape, so it must run inside IPython/Jupyter.
    """
    !pip install --upgrade uv
    !uv pip install --all-extras --system --requirement pyproject.toml

if not is_google_colab():
    local_root = Path().absolute()
    # When the notebook was started from a nested folder, drop a trailing
    # 'airquality' component and then a trailing 'notebooks' component so
    # that root_dir points at the directory above them.
    for nested_dir in ("airquality", "notebooks"):
        if local_root.name == nested_dir:
            local_root = local_root.parent
    root_dir = str(local_root)
    print("Local environment")
else:
    # On Colab the repository is fetched first; clone_repository() also
    # changes into it, so the current directory becomes the root.
    clone_repository()
    install_dependencies()
    root_dir = str(Path().absolute())
    print("Google Colab environment")

print(f"Root dir: {root_dir}")

# Make packages under the root directory (e.g. `mlfs`) importable
if root_dir not in sys.path:
    sys.path.append(root_dir)
    print(f"Added the following directory to the PYTHONPATH: {root_dir}")

# Build the Hopsworks settings from the file <root_dir>/.env
from mlfs import config
settings = config.HopsworksSettings(_env_file=f"{root_dir}/.env")

### Imports

In [None]:
import datetime
import requests
import pandas as pd
import hopsworks
from mlfs import util
import datetime
from pathlib import Path
import json
import re
import os
import warnings
warnings.filterwarnings("ignore")

### Hopsworks Login

In [None]:
# Log in to Hopsworks; the returned project handle is used later to obtain the feature store.
project = hopsworks.login()

# <span style='color:#ff5f27'> Check CityBikes API </span>

### Set City Data

In [None]:
# Reference dates computed from the local clock
today = datetime.date.today()
yesterday = today - datetime.timedelta(days=1)
# Location settings: passed to the historical weather download and stored
# as the BIKES_LOCATION_JSON secret further down in this notebook.
city = 'Trento'
country = 'Italy'
latitude = 46.07
longitude = 11.12

### Perform a Request

In [None]:
# No API-key is required for this API

network_id = "e-motion-trento"
url = f"https://api.citybik.es/v2/networks/{network_id}"

try:
    # A timeout keeps the cell from hanging forever on an unresponsive server.
    http_response = requests.get(url, timeout=30)
    # Turn HTTP error statuses (4xx/5xx) into exceptions instead of parsing an error body.
    http_response.raise_for_status()
    resp = http_response.json()
except (requests.exceptions.RequestException, ValueError) as err:
    # ValueError covers JSON decoding failures on older `requests` releases.
    print("Something went wrong, please check the URL.")
    print("Network Answer:")
    print(err)
    # Re-raise: the next cell needs `resp`, so continuing would only fail
    # later with a less helpful NameError (or silently reuse a stale value).
    raise
else:
    print("Request Successful!")


### Visualize Answer

This information should match the variables set in the "Set City Data" cell.

In [None]:
# Print the network metadata returned by the API so it can be compared
# with the values configured in the "Set City Data" cell.
network_info = resp['network']
network_location = network_info['location']

print(f"Network ID: {network_info['id']}")

print(f"City: {network_location['city']}")
print(f"Country: {network_location['country']}")
print(f"Latitude: {network_location['latitude']}")
print(f"Longitude: {network_location['longitude']}")

# Print first 5 stations
stations = network_info['stations']
print("\nFirst five stations:")
for s in stations[:5]:
    print(f"{s['name']} -> Bikes: {s['free_bikes']}, Empty slots: {s['empty_slots']}")


# <span style='color:#ff5f27'> Load Historical Bike Data </span>

### Load File List

In [None]:
# CSV index of the historical bike files; the "Read Files" cell reads its
# 'file-name', 'country' and 'city' columns.
list_file = f"{root_dir}/bike-historical-data/all-files.csv"
# skipinitialspace=True skips the spaces that follow each delimiter
list_df = pd.read_csv(list_file, skipinitialspace=True)
# Bare expression: displayed as the cell output in a notebook
list_df

### Read Files

In [None]:
# Read every parquet file listed in `list_df` and stack them into `bikes_df`.
bike_frames = []

for _, file_info in list_df.iterrows():
    # Read file
    file_name = file_info['file-name']
    df_i = pd.read_parquet(f"{root_dir}/bike-historical-data/{file_name}", engine='pyarrow')

    # Drop rows containing missing values, then tag the rows with their origin
    df_i.dropna(inplace=True)
    df_i['country'] = file_info['country']
    df_i['city'] = file_info['city']

    bike_frames.append(df_i)

# Concatenate once at the end: growing a DataFrame inside the loop copies all
# previous rows on every iteration, and concatenating onto an empty DataFrame
# relies on deprecated pandas dtype handling.
# pd.concat raises on an empty list, so keep the empty-DataFrame result when
# the index lists no files.
bikes_df = pd.concat(bike_frames, ignore_index=True) if bike_frames else pd.DataFrame()

bikes_df.head()

### Print Info

In [None]:
# DataFrame.info() prints the summary itself and returns None, so wrapping it
# in print() would emit a stray "None" line after the summary.
bikes_df.info()

# <span style='color:#ff5f27'> Load Historical Weather Data </span>

Features Downloaded:

 * `weather_code`: Weather condition as a numeric code (WMO).
 * `apparent_temperature_mean`: Apparent temperature is the perceived feels-like temperature combining wind chill factor, relative humidity and solar radiation.
 * `daylight_duration`: Number of seconds of daylight per day.
 * `precipitation_sum`: Sum of daily precipitation (including rain, showers and snowfall).
 * `wind_speed_10m_max`: Maximum wind speed on the day.


### Download the Data

In [None]:
# Oldest bike measurement, formatted as YYYY-MM-DD; it is the start date of
# the historical weather download.
earliest_date = bikes_df['timestamp'].min().strftime('%Y-%m-%d')
earliest_date

In [None]:
# Download weather for the configured city/coordinates, from the earliest bike
# timestamp up to today. str(today) renders the date as YYYY-MM-DD, the same
# format as earliest_date.
# NOTE(review): the returned columns depend on mlfs.util.get_historical_weather — confirm against that helper.
weather_df = util.get_historical_weather(city, earliest_date, str(today), latitude, longitude)

### Print Info

In [None]:
# Preview the first rows of the downloaded weather data
weather_df.head()

In [None]:
# Print the column/dtype summary of the weather data
weather_df.info()

# <span style='color:#ff5f27'> Define Data Validation Rules </span>

### Bike Expectations

In [None]:
import great_expectations as ge

# Validation suite attached to the bikes feature group when it is created
bikes_expectation_suite = ge.core.ExpectationSuite(
    expectation_suite_name="bikes_expectation_suite"
)

# Both count columns get the same rule on the column minimum: it must lie
# between -0.1 and 100.0, with the lower bound exclusive (strict_min).
for count_column in ("bikes", "free"):
    bikes_expectation_suite.add_expectation(
        ge.core.ExpectationConfiguration(
            expectation_type="expect_column_min_to_be_between",
            kwargs={
                "column": count_column,
                "min_value": -0.1,
                "max_value": 100.0,
                "strict_min": True,
            },
        )
    )

### Weather Expectations

In [None]:
import great_expectations as ge

# Validation suite attached to the weather feature group when it is created
weather_expectation_suite = ge.core.ExpectationSuite(
    expectation_suite_name="weather_expectation_suite"
)

# (column, min_value, max_value): allowed range for each column's minimum.
# The lower bound is exclusive because strict_min is set below.
weather_min_bounds = [
    ("weather_code", -0.1, 70.1),
    ("apparent_temperature_mean", -20.1, 50.1),
    ("daylight_duration", -0.1, 86400.1),  # 1 Day = 86400 seconds
    ("precipitation_sum", -0.1, 100.1),
    ("wind_speed_10m_max", -0.1, 100.1),
]

for column_name, lower_bound, upper_bound in weather_min_bounds:
    weather_expectation_suite.add_expectation(
        ge.core.ExpectationConfiguration(
            expectation_type="expect_column_min_to_be_between",
            kwargs={
                "column": column_name,
                "min_value": lower_bound,
                "max_value": upper_bound,
                "strict_min": True,
            },
        )
    )

# <span style='color:#ff5f27'> Load to Hopsworks </span>

### Load Metadata

In [None]:
# Handle to the project's feature store; used below to create the feature groups
fs = project.get_feature_store()

In [None]:
# Name of the Hopsworks secret holding the location as a JSON string
location_secret_name = "BIKES_LOCATION_JSON"

secrets = hopsworks.get_secrets_api()

# Gather the location settings and serialise them to JSON
dict_obj = dict(
    country=country,
    city=city,
    latitude=latitude,
    longitude=longitude,
)
str_dict = json.dumps(dict_obj)

# Remove a previously stored secret so it can be re-created with the new value
secret = secrets.get_secret(location_secret_name)
if secret is not None:
    secret.delete()
    print(f"Replacing existing {location_secret_name}")

secrets.create_secret(location_secret_name, str_dict)

### Load Bikes Data

In [None]:
# Get the 'bikes_trento' feature group (version 1), or create it if it does
# not exist yet, with the bike expectation suite attached.
# NOTE(review): primary_key is only 'id' while each id appears at many
# timestamps — confirm this is the intended key together with event_time.
bikes_fg = fs.get_or_create_feature_group(
    name='bikes_trento',
    description='Bikes Availability for Trento',
    version=1,
    primary_key=['id'],
    event_time='timestamp',
    expectation_suite=bikes_expectation_suite
)

In [None]:
# Write the historical bike data into the feature group.
# NOTE(review): unlike the weather insert below, no wait=True is passed here — confirm this is intentional.
bikes_fg.insert(bikes_df)

In [None]:
# Human-readable description for each feature of the bikes feature group,
# keyed by feature name (applied in this order).
bikes_feature_descriptions = {
    "tag": "Network Tag",
    "id": "Sensor Identifier",
    "nuid": "Sensor Numeric Identifier",
    "name": "Sensor Name",
    "latitude": "Sensor Latitude Coordinate",
    "longitude": "Sensor Longitude Coordinate",
    "bikes": "Number of Bikes Taken (Empty Slots)",
    "free": "Number of Free Bikes",
    "extra": "Sensor Metadata",
    "timestamp": "Datetime of Measurement",
    "country": "Sensor Country",
    "city": "Sensor City",
}

for feature_name, feature_description in bikes_feature_descriptions.items():
    bikes_fg.update_feature_description(feature_name, feature_description)

### Load Weather Data

In [None]:
# Get the 'weather_trento' feature group (version 1), or create it if it does
# not exist yet, with the weather expectation suite attached.
weather_fg = fs.get_or_create_feature_group(
    name='weather_trento',
    description='Weather for Trento',
    version=1,
    primary_key=['city'],
    event_time='date',
    expectation_suite=weather_expectation_suite
)

In [None]:
# Write the historical weather data into the feature group.
# NOTE(review): wait=True presumably blocks until the ingestion job finishes — confirm against the Hopsworks docs.
weather_fg.insert(weather_df, wait=True)

In [None]:
# Human-readable description for each feature of the weather feature group,
# keyed by feature name (applied in this order).
weather_feature_descriptions = {
    "date": "Measurement Day",
    "weather_code": "Weather Condition Numeric Code (WMO)",
    "apparent_temperature_mean": "Apparent Temperature, Mean of the Day",
    "daylight_duration": "Seconds of Sun for the Day",
    "precipitation_sum": "Total Precipitation for the Day",
    "wind_speed_10m_max": "Wind Speed",
    "city": "City of Measurement",
}

for feature_name, feature_description in weather_feature_descriptions.items():
    weather_fg.update_feature_description(feature_name, feature_description)