In [9]:
import sys
from pathlib import Path

# Make the project importable. This has to run BEFORE any project-local import,
# because an import is resolved against sys.path as it stands at that moment
# (on a fresh kernel the original ordering imported first and patched the path afterwards).
# Path().resolve() is the current working directory; its parent is taken to be the
# project root (assumes the notebook runs from a first-level subfolder — TODO confirm).
_project_root = Path().resolve().parent
for _import_dir in (_project_root, _project_root / 'src'):
    # <root> serves `src.`-prefixed imports; <root>/src serves flat imports of modules inside src/.
    # Appending (not prepending) keeps anything already importable at higher priority.
    if str(_import_dir) not in sys.path:  # no duplicate entries when the cell is re-run
        sys.path.append(str(_import_dir))

from src.pipelines.feature_pipeline import run_feature_pipeline  # noqa: E402

In [10]:
# (Re)load IPython's autoreload extension and set it to mode 2.
# NOTE(review): mode 2 presumably re-imports every edited module before each cell runs — confirm in the IPython docs.
%reload_ext autoreload
%autoreload 2

In [None]:
from src.data_utils import load_raw_data, transformar_a_series_temporales

# User parameters: product family and the date window to process.
familia = "BOLLERIA"
fecha_inicio = "2023-01-02"
fecha_fin = "2025-08-03"

# The same date window is handed to both pipeline steps.
rango_fechas = {"fecha_inicio": fecha_inicio, "fecha_fin": fecha_fin}

# Step 1: load all raw data for the window.
# NOTE(review): descargar_bq=True presumably triggers a fresh download first — confirm in load_raw_data.
df_raw = load_raw_data(descargar_bq=True, **rango_fechas)

# Step 2: turn the raw data into time series for the selected family only.
# NOTE(review): guardar_interim=True presumably also persists the result to the interim data area — confirm.
ts_data = transformar_a_series_temporales(
    df_raw,
    familia=familia,
    guardar_interim=True,
    **rango_fechas,
)

# Basic overview of the resulting DataFrame: column summary, then the first rows.
ts_data.info()
ts_data.head()

In [19]:
# Print the column summary of ts_data (dtypes and non-null counts) to check its schema.
ts_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131 entries, 0 to 130
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   year            131 non-null    UInt32        
 1   week            131 non-null    UInt32        
 2   familia         131 non-null    object        
 3   base_imponible  131 non-null    float64       
 4   is_summer_peak  131 non-null    int32         
 5   is_easter       131 non-null    int64         
 6   dias_semana     131 non-null    int64         
 7   week_start      131 non-null    datetime64[ns]
dtypes: UInt32(2), datetime64[ns](1), float64(1), int32(1), int64(2), object(1)
memory usage: 7.0+ KB


In [None]:
import os
from dotenv import load_dotenv
from paths import ROOT_DIR
import hopsworks

# Load the project-level .env file so the API key can be read from the environment below.
load_dotenv(dotenv_path=os.path.join(ROOT_DIR, '.env'))

# Connection settings; the API key is taken from the environment rather than hardcoded.
HOPSWORKS_PROJECT_NAME = 'fleca_mlops'
HOPSWORKS_API_KEY = os.environ.get('HOPSWORKS_API_KEY')  # None when the variable is not set

# Log in to the Hopsworks project, then fetch its feature store handle.
project = hopsworks.login(
    project=HOPSWORKS_PROJECT_NAME,
    api_key_value=HOPSWORKS_API_KEY,
)
feature_store = project.get_feature_store()

In [21]:
# Name and version that identify the feature group used in the rest of the notebook.
FEATURE_GROUP_VERSION = 1
FEATURE_GROUP_NAME = "times_series_bolleria_feature_group"

# Fetch the feature store handle of the logged-in project (this retrieves it; nothing is created here).
feature_store = project.get_feature_store()

In [22]:
# Define the feature group for the weekly series.
# Rows are keyed by (week_start, familia) and week_start is also the event-time column.
feature_group = feature_store.create_feature_group(
    FEATURE_GROUP_NAME,
    FEATURE_GROUP_VERSION,
    stream=False,  # per the original author's note: turns off the Kafka stream
    event_time="week_start",
    primary_key=["week_start", "familia"],
    description="Feature group for time series data of bolleria",
)


In [25]:
# Reuse the feature group when it already exists; create it only when the lookup finds nothing.
# "Finds nothing" covers both client behaviours: the lookup raising, and the lookup
# returning None (NOTE(review): newer Hopsworks clients appear to return None for a
# missing group — confirm for the installed version). Without the None check the cell
# would report "ya existe" and leave feature_group = None for the insert step.
feature_group = None
detalle = "get_feature_group devolvió None"
try:
    feature_group = feature_store.get_feature_group(
        name=FEATURE_GROUP_NAME,
        version=FEATURE_GROUP_VERSION,
    )
except Exception as e:
    # NOTE(review): every error (including auth/network failures) is treated as
    # "not found", as in the original cell; narrowing this to the client's REST
    # error type would avoid creating a group after an unrelated failure.
    detalle = e

if feature_group is None:
    print(f"No existe el feature group, se creará uno nuevo. Detalle: {detalle}")
    feature_group = feature_store.create_feature_group(
        name=FEATURE_GROUP_NAME,
        version=FEATURE_GROUP_VERSION,
        description="Feature group for time series data of bolleria",
        primary_key=["week_start", "familia"],
        event_time="week_start",
        stream=False,
    )
else:
    print(f"Feature group '{FEATURE_GROUP_NAME}' (v{FEATURE_GROUP_VERSION}) ya existe. Usando el existente.")

Feature group 'times_series_bolleria_feature_group' (v1) ya existe. Usando el existente.


In [None]:
# Cast year and week to plain int64 before inserting; the original author notes
# that the insert was failing with the previous dtypes.
# NOTE(review): fillna(0) would turn a missing year/week into 0 instead of flagging it.
import numpy as np

for columna in ("year", "week"):
    sin_nulos = ts_data[columna].fillna(0)
    ts_data[columna] = sin_nulos.astype(np.int64)

ts_data.info()

In [None]:
# Sanity check before inserting: show both ends of the series, then fail if the
# rows are not sorted by week_start.
columnas_tiempo = ['year', 'week', 'week_start']
vista_tiempo = ts_data[columnas_tiempo]
print(vista_tiempo.head(10))
print(vista_tiempo.tail(10))

esta_ordenado = ts_data['week_start'].is_monotonic_increasing
assert esta_ordenado, 'El DataFrame NO está ordenado por week_start'

In [None]:
# Write the weekly series into the feature group.
# NOTE(review): 'wait_for_job': True presumably blocks until the ingestion job finishes — confirm in the Hopsworks docs.
# (An `online=False` argument was tried here earlier and left disabled.)
opciones_escritura = {'wait_for_job': True}
feature_group.insert(ts_data, write_options=opciones_escritura)