# Pipeline de Entrenamiento con Feature Views de Hopsworks

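Este notebook implementa un pipeline completo para:
1. Conectar al feature store de Hopsworks
2. Crear (o recuperar, si ya existe) una feature view con características seleccionadas
3. Procesar los datos para crear variables de rezago (lags) y target
4. Preparar los datos para entrenamiento de modelos
5. Entrenar y evaluar un modelo XGBoost con búsqueda de hiperparámetros
6. Guardar el modelo y registrarlo en el Model Registry de Hopsworks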

In [16]:
# 1. Configuración inicial

import sys
from pathlib import Path

# Make the project root (the parent of the current working directory) importable,
# so that `from src import ...` works when the notebook runs from its own folder.
project_root = Path.cwd().resolve().parent
if not any(entry == str(project_root) for entry in sys.path):
    sys.path.insert(0, str(project_root))

In [2]:
# Enable IPython's autoreload extension in mode 2, so edits to imported project
# modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [20]:

from datetime import datetime
from src import config
import hopsworks
import pandas as pd
import logging

# Basic logging configuration for the notebook.
#
# `logging.basicConfig` does nothing when the root logger already has handlers.
# The recorded cell outputs are rendered as "<time> INFO: <msg>" rather than with
# the format below, which indicates the root logger had already been configured
# (presumably by a library imported above -- TODO confirm). `force=True` replaces
# any existing root handlers so this format really takes effect and re-running
# the cell stays idempotent.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)
logger = logging.getLogger('feature_view_creation')



In [None]:
# 2. Conectar a Hopsworks y al Feature Store

from src import config
import hopsworks
try:
    # Log in and open the project named in the project configuration
    project = hopsworks.login(
        project=config.HOPSWORKS_PROJECT_NAME,
        api_key_value=config.HOPSWORKS_API_KEY,
    )

    # Handle to the project's feature store
    feature_store = project.get_feature_store()

    # Feature group identified by the configured name and version
    feature_group = feature_store.get_feature_group(
        version=config.FEATURE_GROUP_VERSION,
        name=config.FEATURE_GROUP_NAME,
    )

    # Logged inside the `try` on purpose: a failure while reading the feature
    # group attributes is reported through the same error path below.
    logger.info(f"Conexión exitosa al Feature Group: {feature_group.name} (v{feature_group.version})")

except Exception as exc:
    # Report the failure, then propagate it so the notebook run stops here
    logger.error(f"Error en conexión: {exc}")
    raise

2025-09-06 19:32:39,468 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-09-06 19:32:39,471 INFO: Initializing external client
2025-09-06 19:32:39,471 INFO: Base URL: https://c.app.hopsworks.ai:443




To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'


2025-09-06 19:32:40,610 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1242272
2025-09-06 19:32:41,889 INFO: Conexión exitosa al Feature Group: times_series_bolleria_feature_group (v1)


In [None]:
# 3. Create/retrieve the feature view with the selected features
try:
    # Features the feature view must expose
    selected_features = ['familia', 'base_imponible', 'is_summer_peak', 'is_easter', 'week_start']
    feature_view_name = config.FEATURE_VIEW_NAME
    feature_view_version = 1

    # Try to reuse an existing feature view first. Only `Exception` is caught:
    # a bare `except:` would also swallow KeyboardInterrupt / SystemExit and
    # treat them as "view not found". The reason for the failed lookup is
    # logged so real problems (auth, network) are not hidden.
    try:
        feature_view = feature_store.get_feature_view(
            name=feature_view_name,
            version=feature_view_version
        )
    except Exception as lookup_error:
        logger.info(
            f"No se pudo recuperar la feature view '{feature_view_name}' "
            f"(v{feature_view_version}): {lookup_error}"
        )
        feature_view = None

    if feature_view is not None:
        logger.info(f"Feature view existente recuperada: {feature_view.name} (v{feature_view.version})")
    else:
        # Not available: create it.
        # Fail fast if a requested feature is absent from the feature group;
        # otherwise a view with fewer columns would be persisted under a
        # description that still lists all of them.
        available_names = {f.name for f in feature_group.features}
        missing_features = [name for name in selected_features if name not in available_names]
        if missing_features:
            raise ValueError(
                f"Características no encontradas en el feature group: {missing_features}"
            )

        # Feature objects matching the selected names
        selected_feature_objects = [f for f in feature_group.features if f.name in selected_features]

        # Query restricted to the selected features
        specific_query = feature_group.select(selected_feature_objects)

        # Create the feature view
        feature_view = feature_store.create_feature_view(
            name=feature_view_name,
            version=feature_view_version,
            query=specific_query,
            description=f"Feature view con características: {', '.join(selected_features)}"
        )
        logger.info(f"Nueva feature view creada: {feature_view.name} (v{feature_view.version})")

except Exception as e:
    # Report the failure, then propagate it so the notebook run stops here
    logger.error(f"Error al crear/obtener feature view: {e}")
    raise

2025-09-06 19:32:49,392 INFO: Feature view existente recuperada: times_series_bolleria_feature_view (v1)


In [25]:
# 4. Read the data exposed by the feature view
try:
    # Plain batch read
    df_ts = feature_view.get_batch_data()

    # Short summary of what came back: size, column names and a 3-row preview
    logger.info(f"Datos obtenidos: {df_ts.shape[0]} filas, {df_ts.shape[1]} columnas")
    logger.info(f"Columnas disponibles: {list(df_ts.columns)}")
    print("Muestra de datos:", df_ts.head(3), sep="\n")

except Exception as exc:
    # Report the failure, then propagate it so the notebook run stops here
    logger.error(f"Error al obtener datos: {exc}")
    raise

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.01s) 
2025-09-06 19:34:39,211 INFO: Datos obtenidos: 135 filas, 5 columnas
2025-09-06 19:34:39,212 INFO: Columnas disponibles: ['familia', 'base_imponible', 'is_summer_peak', 'is_easter', 'week_start']
Muestra de datos:
    familia  base_imponible  is_summer_peak  is_easter  \
0  BOLLERIA          641.56               0          0   
1  BOLLERIA          725.72               0          0   
2  BOLLERIA          950.70               0          0   

                 week_start  
0 2023-02-06 00:00:00+00:00  
1 2025-02-24 00:00:00+00:00  
2 2023-09-18 00:00:00+00:00  


In [26]:
# 5. Fetch the training data from the feature view
try:
    # `training_data()` hands back a tuple rather than a single DataFrame (the
    # processing step logs it as a 2-element tuple). The name `df_ts` is kept
    # because the processing cell consumes it.
    df_ts = feature_view.training_data()

except Exception as e:
    # Log and re-raise, like every other step. Swallowing the error here would
    # let the following cells run silently on the stale `df_ts` left over from
    # the batch read.
    logger.error(f"Error al obtener datos de entrenamiento: {e}")
    raise

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.52s) 
2025-09-06 19:34:45,181 INFO: Provenance cached data - overwriting last accessed/created training dataset from 55 to 56.




In [27]:
# 6. Procesar datos para entrenamiento
from src.data_utils import transformar_features_target

try:
    # Build the modelling table in a single call. `df_ts` is passed as-is: the
    # helper accepts the tuple returned by the feature view directly.
    # Judging by the logged output columns, it yields the lagged target columns,
    # the exogenous flags, `week_start` and a `target` column.
    features_and_target = transformar_features_target(
        df_ts,
        columna_target='base_imponible',
        lags_list=[1, 52],
        cols_exogenas=['is_easter', 'is_summer_peak'],
        periodos_adelante=1,
        eliminar_nulos=True,
        return_format='dataframe',  # one DataFrame holding both features and target
    )

    # Summary of the processed data: size, column names and a 3-row preview
    logger.info(f"Datos procesados: {features_and_target.shape[0]} filas, {features_and_target.shape[1]} columnas")
    logger.info(f"Variables disponibles: {list(features_and_target.columns)}")
    print("\nMuestra de datos procesados:", features_and_target.head(3), sep="\n")

except Exception as exc:
    # Report the failure, then propagate it so the notebook run stops here
    logger.error(f"Error al procesar datos: {exc}")
    raise

2025-09-06 19:34:51,690 INFO: Detectada entrada tipo tupla con 2 elementos
2025-09-06 19:34:51,691 INFO: Usando el primer elemento de la tupla como DataFrame: (135, 5)
2025-09-06 19:34:51,695 INFO: Retornando DataFrame combinado: (82, 6)
2025-09-06 19:34:51,695 INFO: Datos procesados: 82 filas, 6 columnas
2025-09-06 19:34:51,696 INFO: Variables disponibles: ['base_imponible_lag1', 'base_imponible_lag52', 'is_easter', 'is_summer_peak', 'week_start', 'target']

Muestra de datos procesados:
     base_imponible_lag1  base_imponible_lag52  is_easter  is_summer_peak  \
41                572.51                825.11          0               0   
72                597.65                658.40          0               0   
114               680.30                741.40          0               0   

                   week_start  target  
41  2024-01-15 00:00:00+00:00  680.30  
72  2024-01-22 00:00:00+00:00  603.99  
114 2024-01-29 00:00:00+00:00  600.14  


In [28]:
# 7. Split temporal y entrenamiento con XGBoost
from src.data_split import train_test_split
from src.model import train_evaluate_xgboost


# Temporal split: the row at the 80% position provides the cut-off date.
# NOTE(review): this assumes `features_and_target` is ordered by `week_start`
# (the preview printed above looks sorted) -- confirm against the helper.
split_idx = int(len(features_and_target) * 0.8)
split_date = features_and_target['week_start'].iloc[split_idx]  # positional lookup, not label-based
X_train, y_train, X_test, y_test = train_test_split(
    features_and_target,
    target='target',  # or 'base_imponible', depending on the pipeline
    split_date=split_date,
)

 


In [41]:
# Entrenamiento y evaluación

from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error

# Remove the "week_start" column from both feature sets when it is present
X_train = X_train.drop(columns=['week_start'], errors='ignore')
X_test = X_test.drop(columns=['week_start'], errors='ignore')

# Baseline XGBoost regressor (seeded for reproducibility)
xgb_model = xgb.XGBRegressor(random_state=42, objective='reg:squarederror')

# Hyperparameter search space
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

# Time-series-aware cross-validation splitter
tscv = TimeSeriesSplit(n_splits=5)

# Exhaustive search over the grid, scored by (negated) mean absolute error
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=tscv,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    verbose=1,
)

print('⏳ Iniciando ajuste de hiperparámetros con GridSearchCV...')
# Fit every candidate on the training window
grid_search.fit(X_train, y_train)
print('✅ Ajuste de hiperparámetros finalizado.')

# Search results; the score is negated MAE, so the sign is flipped back to a plain MAE
best_params = grid_search.best_params_
best_score = -grid_search.best_score_

⏳ Iniciando ajuste de hiperparámetros con GridSearchCV...
Fitting 5 folds for each of 72 candidates, totalling 360 fits
✅ Ajuste de hiperparámetros finalizado.


In [42]:
# `best_score` was already sign-flipped from the negated MAE used for scoring,
# so it is the (positive) cross-validated MAE of the best candidate. The label
# says so instead of calling it a "negative MAE".
print(f"Mejores hiperparámetros: {best_params}")
print(f"Mejor puntuación (MAE medio en validación cruzada): {best_score}")

Mejores hiperparámetros: {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200, 'subsample': 1.0}
Mejor puntuación (MAE negativo): 284.2987670355903


In [43]:
# Evaluación en test y guardado del mejor modelo
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
import joblib


# Score the tuned estimator on the hold-out period
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)  # a fraction, not a percentage
rmse = mean_squared_error(y_test, y_pred) ** 0.5  # square root of the MSE
mae = mean_absolute_error(y_test, y_pred)

print(
    f"MAE en test: {mae:.2f}, RMSE en test: {rmse:.2f}, "
    f"MAPE en test: {mape:.2F}, R2 en test: {r2:.2f}"
)


MAE en test: 169.74, RMSE en test: 226.57, MAPE en test: 0.15, R2 en test: 0.60


In [44]:
# 7. Guardar el modelo entrenado
import joblib
from src.paths import MODELS_DIR

# Make sure the target directory exists before writing, so the cell also works
# on a fresh checkout where the models folder has not been created yet.
# (`MODELS_DIR` is used with the `/` operator, i.e. as a pathlib path.)
MODELS_DIR.mkdir(parents=True, exist_ok=True)
joblib.dump(best_model, MODELS_DIR / 'xgboost_hopsworks.pkl')


['C:\\Workspace\\mlops_fleca_project\\models\\xgboost_hopsworks.pkl']

In [45]:
import joblib
from src.paths import MODELS_DIR

# Load the persisted estimator back from disk.
# joblib relies on pickle, so only load files produced by this project.
# NOTE: the name `model` is rebound to the registry entry in the registration cell.
model = joblib.load(MODELS_DIR.joinpath('xgboost_hopsworks.pkl'))


In [46]:
from hsml.schema import Schema
from hsml.model_schema import ModelSchema
from sklearn.metrics import mean_absolute_error

# Input and output schemas for the model, derived from the training data
input_schema = Schema(X_train)
output_schema = Schema(y_train)
model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)

# Drop datetime columns, which XGBoost does not support
# (no-op when the column was already removed)
X_test_model = X_test.drop(columns=['week_start'], errors='ignore')

# Predict on the hold-out set with the trained model
y_pred_xgb = best_model.predict(X_test_model)

# Hold-out MAE, stored with the model as its metric
test_mae = mean_absolute_error(y_test, y_pred_xgb)

# Handle to the Hopsworks model registry
model_registry = project.get_model_registry()

# Declare the model in the registry
model = model_registry.sklearn.create_model(
    name="fleca_bolleria_predictor_next_week",
    description="Modelo XGBoost para predecir la base imponible de bollería la próxima semana",
    # Seeded so the stored example row is the same on every run
    input_example=X_train.sample(n=1, random_state=42),
    model_schema=model_schema,
    metrics={"mae": test_mae}
    )

# Upload the serialized model file (written earlier with joblib) to the registry;
# this does not save anything locally. The path is converted to str to avoid an
# AttributeError.
model.save(str(MODELS_DIR / 'xgboost_hopsworks.pkl'))


  0%|          | 0/6 [00:00<?, ?it/s]

Uploading C:\Workspace\mlops_fleca_project\models\xgboost_hopsworks.pkl: 0.000%|          | 0/224774 elapsed<0…

Uploading c:\Workspace\mlops_fleca_project\notebooks\input_example.json: 0.000%|          | 0/28 elapsed<00:00…

Uploading c:\Workspace\mlops_fleca_project\notebooks\model_schema.json: 0.000%|          | 0/521 elapsed<00:00…

Model created, explore it at https://c.app.hopsworks.ai:443/p/1242272/models/fleca_bolleria_predictor_next_week/4


Model(name: 'fleca_bolleria_predictor_next_week', version: 4)