In [4]:
import sys
from pathlib import Path

# Put the project root (the parent of the current working directory) at the
# front of sys.path so that `src` can be imported as a package.
project_root = Path().resolve().parent
if not any(entry == str(project_root) for entry in sys.path):
    sys.path[:0] = [str(project_root)]

In [5]:
# (Re)load the autoreload extension and use mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [6]:
# Importar desde src como paquete
from src import config

DEBUG ENV HOPSWORKS_PROJECT_NAME: fleca_mlops
DEBUG ENV PATH: C:\Workspace\mlops_fleca_project\.env


In [7]:
# 1. Configuración inicial
from datetime import datetime
from src import config
import hopsworks
import pandas as pd
import logging

# Basic logging configuration.
# logging.basicConfig() does nothing when the root logger already has
# handlers, which is the case here once other libraries have set up logging
# (the captured output used a different format than the one requested).
# force=True replaces any pre-existing root handlers so this level and format
# actually take effect.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)
logger = logging.getLogger('feature_view_creation')

In [8]:
# 2. Connect to Hopsworks and to the Feature Store.
# Defines `project`, `feature_store` and `feature_group` for the cells below.
# Any failure is logged and re-raised so later cells do not run without them.
try:
    # Log in to the project; credentials and project name come from src.config
    project = hopsworks.login(
        api_key_value=config.HOPSWORKS_API_KEY, 
        project=config.HOPSWORKS_PROJECT_NAME)
    
    # Handle to the project's feature store
    feature_store = project.get_feature_store()
    
    # Feature group identified by the name/version configured in src.config
    feature_group = feature_store.get_feature_group(
        name=config.FEATURE_GROUP_NAME,
        version=config.FEATURE_GROUP_VERSION
    )
    
    logger.info(f"Conexión exitosa al Feature Group: {feature_group.name} (v{feature_group.version})")
    
except Exception as e:
    logger.error(f"Error en conexión: {e}")
    raise

2025-09-06 18:53:13,001 INFO: Initializing external client
2025-09-06 18:53:13,001 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-09-06 18:53:13,001 INFO: Base URL: https://c.app.hopsworks.ai:443




To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'


2025-09-06 18:53:14,437 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1242272

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1242272
2025-09-06 18:53:15,781 INFO: Conexión exitosa al Feature Group: times_series_bolleria_feature_group (v1)
2025-09-06 18:53:15,781 INFO: Conexión exitosa al Feature Group: times_series_bolleria_feature_group (v1)


In [9]:
# 3. Get the feature view, creating it from the selected features if needed.
# Defines `feature_view`; any unrecoverable error is logged and re-raised.
try:
    # Columns the feature view must expose
    selected_features = ['familia', 'base_imponible', 'is_summer_peak', 'is_easter', 'week_start']
    feature_view_name = config.FEATURE_VIEW_NAME
    feature_view_version = 1

    # Try to fetch an existing feature view first. Catch Exception rather than
    # using a bare `except:` so KeyboardInterrupt/SystemExit still propagate,
    # and log the reason instead of discarding it.
    try:
        feature_view = feature_store.get_feature_view(
            name=feature_view_name,
            version=feature_view_version
        )
    except Exception as lookup_error:
        logger.warning(f"No se pudo recuperar la feature view existente: {lookup_error}")
        feature_view = None

    # A None result is treated as "not found" as well
    # (defensive: some client versions may return None instead of raising).
    if feature_view is not None:
        logger.info(f"Feature view existente recuperada: {feature_view.name} (v{feature_view.version})")
    else:
        # Fail loudly if a requested feature is absent from the feature group;
        # otherwise the view would silently be created with fewer columns.
        available_names = {f.name for f in feature_group.features}
        missing_features = [name for name in selected_features if name not in available_names]
        if missing_features:
            raise ValueError(f"Características no encontradas en el feature group: {missing_features}")

        # Feature objects for the selected columns
        selected_feature_objects = [f for f in feature_group.features if f.name in selected_features]

        # Query restricted to the selected features
        specific_query = feature_group.select(selected_feature_objects)

        # Create the feature view
        feature_view = feature_store.create_feature_view(
            name=feature_view_name,
            version=feature_view_version,
            query=specific_query,
            description=f"Feature view con características: {', '.join(selected_features)}"
        )
        logger.info(f"Nueva feature view creada: {feature_view.name} (v{feature_view.version})")

except Exception as e:
    logger.error(f"Error al crear/obtener feature view: {e}")
    raise

2025-09-06 18:54:08,870 INFO: Feature view existente recuperada: times_series_bolleria_feature_view (v1)


In [10]:
# 4. Read the feature view contents as a batch and show a short summary.
# Failures are logged and re-raised.
try:
    df_ts = feature_view.get_batch_data()

    # Size, column names and the first three rows of what was read
    logger.info(f"Datos obtenidos: {df_ts.shape[0]} filas, {df_ts.shape[1]} columnas")
    logger.info(f"Columnas disponibles: {list(df_ts.columns)}")
    print("Muestra de datos:", df_ts.head(3), sep="\n")
except Exception as e:
    logger.error(f"Error al obtener datos: {e}")
    raise

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.61s) 
2025-09-06 18:54:20,622 INFO: Datos obtenidos: 135 filas, 5 columnas
2025-09-06 18:54:20,622 INFO: Columnas disponibles: ['familia', 'base_imponible', 'is_summer_peak', 'is_easter', 'week_start']
Muestra de datos:
    familia  base_imponible  is_summer_peak  is_easter  \
0  BOLLERIA          641.56               0          0   
1  BOLLERIA          725.72               0          0   
2  BOLLERIA          950.70               0          0   

                 week_start  
0 2023-02-06 00:00:00+00:00  
1 2025-02-24 00:00:00+00:00  
2 2023-09-18 00:00:00+00:00  
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.61s) 
2025-09-06 18:54:20,622 INFO: Datos obtenidos: 135 filas, 5 columnas
2025-09-06 18:54:20,622 INFO: Columnas disponibles: ['familia', 'base_imponible', 'is_summer_peak', 'is_easter', 'week_start']
Muestra de datos:
    familia  base_imponible  is_summer_peak  is

In [11]:
# 5. Fetch the training data from the feature view.
try:
    # NOTE: training_data() returns a 2-element tuple here (see the log output
    # of the next cell), not a DataFrame. The name `df_ts` is kept because the
    # processing cell reads it.
    df_ts = feature_view.training_data()

except Exception as e:
    # Log and re-raise like every other cell. Swallowing the error would leave
    # `df_ts` bound to the batch DataFrame from the previous cell, and the
    # notebook would continue on stale data.
    logger.error(f"Error al obtener datos de entrenamiento: {e}")
    raise

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.59s) 
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.59s) 




In [12]:
# 6. Procesar datos para entrenamiento
from src.data_utils import transformar_features_target

try:
    # Turn the training data into a single DataFrame of features plus target.
    # `df_ts` is passed as-is; per the original note the helper accepts the
    # tuple directly.
    features_and_target = transformar_features_target(
        df_ts,
        columna_target='base_imponible',
        lags_list=[1, 2, 3, 52],
        cols_exogenas=['is_easter', 'is_summer_peak'],
        periodos_adelante=1,
        eliminar_nulos=True,
        return_format='dataframe',  # one combined DataFrame (features + target)
    )

    # Summary of the processed data and a three-row preview
    logger.info(f"Datos procesados: {features_and_target.shape[0]} filas, {features_and_target.shape[1]} columnas")
    logger.info(f"Variables disponibles: {list(features_and_target.columns)}")
    print("\nMuestra de datos procesados:", features_and_target.head(3), sep="\n")
except Exception as e:
    logger.error(f"Error al procesar datos: {e}")
    raise

2025-09-06 18:54:38,948 INFO: Detectada entrada tipo tupla con 2 elementos
2025-09-06 18:54:38,949 INFO: Usando el primer elemento de la tupla como DataFrame: (135, 5)
2025-09-06 18:54:38,955 INFO: Retornando DataFrame combinado: (82, 8)
2025-09-06 18:54:38,956 INFO: Datos procesados: 82 filas, 8 columnas
2025-09-06 18:54:38,956 INFO: Variables disponibles: ['base_imponible_lag1', 'base_imponible_lag2', 'base_imponible_lag3', 'base_imponible_lag52', 'is_easter', 'is_summer_peak', 'week_start', 'target']
2025-09-06 18:54:38,949 INFO: Usando el primer elemento de la tupla como DataFrame: (135, 5)
2025-09-06 18:54:38,955 INFO: Retornando DataFrame combinado: (82, 8)
2025-09-06 18:54:38,956 INFO: Datos procesados: 82 filas, 8 columnas
2025-09-06 18:54:38,956 INFO: Variables disponibles: ['base_imponible_lag1', 'base_imponible_lag2', 'base_imponible_lag3', 'base_imponible_lag52', 'is_easter', 'is_summer_peak', 'week_start', 'target']

Muestra de datos procesados:
     base_imponible_lag1  b

In [13]:
# 7. Split temporal y entrenamiento con XGBoost
from src.data_split import train_test_split
from src.model import train_evaluate_xgboost

try:
    # Fraction of the rows (in chronological order) used for training
    TRAIN_FRACTION = 0.8

    # Pick the split date by POSITION on the chronologically sorted dates.
    # The previous `.loc[split_idx, 'week_start']` treated the offset as an
    # index label, which is only right when the frame has a 0..n-1 index that
    # is already sorted by date; otherwise it raises KeyError or picks an
    # arbitrary row.
    split_idx = int(len(features_and_target) * TRAIN_FRACTION)
    split_date = features_and_target['week_start'].sort_values().iloc[split_idx]

    X_train, y_train, X_test, y_test = train_test_split(
        features_and_target,
        split_date=split_date,
        target='target'  # or 'base_imponible', depending on the pipeline
    )

    # Train and evaluate; show every result except the fitted model object,
    # rounding float metrics to 4 decimals
    resultados = train_evaluate_xgboost(X_train, y_train, X_test, y_test)
    print({k: (round(v,4) if isinstance(v, float) else v) for k,v in resultados.items() if k != 'model'})

except Exception as e:
    logger.error(f"Entrenamiento XGBoost falló: {e}")
    raise

{'mae': 218.2798, 'rmse': 286.1179, 'mape': 0.2023, 'r2': 0.3658}


In [29]:
# 8. Save the trained model to MODELS_DIR as a joblib pickle.
# (Renumbered: the split/training cell above is already step 7.)
import joblib
from src.paths import MODELS_DIR

joblib.dump(resultados['model'], MODELS_DIR / 'xgboost_hopsworks.pkl')

['C:\\Workspace\\mlops_fleca_project\\models\\xgboost_hopsworks.pkl']

In [30]:
import joblib
from src.paths import MODELS_DIR

# Load the model back from the file written by the previous cell.
# NOTE(review): joblib.load unpickles the file, so only load artifacts
# produced by this project.
# NOTE(review): the registry cell below rebinds `model` and predicts with
# resultados['model'], so this loaded object is not used in the visible cells.
model = joblib.load(MODELS_DIR / 'xgboost_hopsworks.pkl')

In [31]:
# Model registry: register the trained model in Hopsworks.
from hsml.schema import Schema
from hsml.model_schema import ModelSchema
from sklearn.metrics import mean_absolute_error

# Drop the datetime column (not supported by XGBoost) from BOTH splits, so the
# registered schema and input example describe exactly the columns used for
# prediction. Previously only X_test was cleaned, while the schema and the
# example were built from the raw X_train.
X_train_model = X_train.drop(columns=['week_start'], errors='ignore')
X_test_model = X_test.drop(columns=['week_start'], errors='ignore')

# Input/output data schemas and the model schema that wraps them
input_schema = Schema(X_train_model)
output_schema = Schema(y_train)
model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)

# Predict on the test split with the trained model and compute the MAE
y_pred_xgb = resultados['model'].predict(X_test_model)
test_mae = mean_absolute_error(y_test, y_pred_xgb)

# Hopsworks model registry of the current project
model_registry = project.get_model_registry()

# Create the registry entry. The input example is sampled with a fixed seed so
# re-running the notebook registers the same example row.
model = model_registry.sklearn.create_model(
    name="fleca_bolleria_predictor_next_week",
    description="Modelo XGBoost para predecir la base imponible de bollería la próxima semana",
    input_example=X_train_model.sample(random_state=42),
    model_schema=model_schema,
    metrics={"mae": float(test_mae)}
    )

# Upload the locally saved artifact to the registry; this does not write a
# local copy. The path is converted to str to avoid an AttributeError.
model.save(str(MODELS_DIR / 'xgboost_hopsworks.pkl'))


  0%|          | 0/6 [00:00<?, ?it/s]

Uploading C:\Workspace\mlops_fleca_project\models\xgboost_hopsworks.pkl: 0.000%|          | 0/83595 elapsed<00…

Uploading c:\Workspace\mlops_fleca_project\notebooks\input_example.json: 0.000%|          | 0/70 elapsed<00:00…

Uploading c:\Workspace\mlops_fleca_project\notebooks\model_schema.json: 0.000%|          | 0/782 elapsed<00:00…

Model created, explore it at https://c.app.hopsworks.ai:443/p/1242272/models/fleca_bolleria_predictor_next_week/2


Model(name: 'fleca_bolleria_predictor_next_week', version: 2)