In [55]:
from datetime import datetime
from src import config
import hopsworks
import pandas as pd
import logging
import mlflow
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import sys
from pathlib import Path

# Add the `src` directory that sits next to the notebook's working directory
# (i.e. `<cwd>/../src`) to the module search path.
# NOTE(review): this runs after `from src import config` above and appends
# `<parent>/src` itself rather than `<parent>`, so it presumably does not help
# resolve `from src import ...` — confirm whether the project root should be
# added instead, and before the imports.
sys.path.append(str(Path().resolve().parent / 'src'))

In [44]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [45]:
# Basic logging setup: INFO level, each record rendered as
# "<timestamp> - <level> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Named logger shared by the cells of this notebook.
logger = logging.getLogger('feature_view_creation')

In [46]:
# Connect to Hopsworks and resolve the feature group used as the data source.
# Any failure is logged and re-raised so the notebook stops here.
try:
    # Log in to the project named in the configuration
    project = hopsworks.login(
        api_key_value=config.HOPSWORKS_API_KEY,
        project=config.HOPSWORKS_PROJECT_NAME,
    )

    # Handle to the project's feature store
    feature_store = project.get_feature_store()

    # Feature group identified by the configured name and version
    feature_group = feature_store.get_feature_group(
        config.FEATURE_GROUP_NAME,
        version=config.FEATURE_GROUP_VERSION,
    )

    logger.info(f"Conexión exitosa al Feature Group: {feature_group.name} (v{feature_group.version})")
except Exception as exc:
    logger.error(f"Error en conexión: {exc}")
    raise

2025-08-23 12:04:58,827 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-08-23 12:04:58,831 INFO: Initializing external client
2025-08-23 12:04:58,831 INFO: Base URL: https://c.app.hopsworks.ai:443
Connection closed.
2025-08-23 12:04:58,831 INFO: Initializing external client
2025-08-23 12:04:58,831 INFO: Base URL: https://c.app.hopsworks.ai:443




To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'


2025-08-23 12:04:59,984 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1242272

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1242272
2025-08-23 12:05:01,258 INFO: Conexión exitosa al Feature Group: times_series_bolleria_feature_group (v1)
2025-08-23 12:05:01,258 INFO: Conexión exitosa al Feature Group: times_series_bolleria_feature_group (v1)


In [47]:
# Fetch the feature view if it already exists; otherwise create it from the
# selected features of the feature group. Failures are logged and re-raised.
try:
    # Features the view should expose
    selected_features = ['familia', 'base_imponible', 'is_summer_peak', 'is_easter', 'week_start']
    feature_view_name = config.FEATURE_VIEW_NAME
    feature_view_version = 1

    # Try to reuse an existing feature view first.
    try:
        feature_view = feature_store.get_feature_view(
            name=feature_view_name,
            version=feature_view_version
        )
    except Exception as lookup_error:
        # `except Exception` instead of a bare `except:` so that
        # KeyboardInterrupt/SystemExit propagate, and the reason for the
        # failed lookup is visible in the log instead of being discarded.
        logger.warning(
            f"No se pudo recuperar la feature view {feature_view_name} "
            f"(v{feature_view_version}): {lookup_error}. Se intentará crearla."
        )
        feature_view = None

    if feature_view is not None:
        logger.info(f"Feature view existente recuperada: {feature_view.name} (v{feature_view.version})")
    else:
        # Warn about requested features that the feature group does not have;
        # they would otherwise be dropped silently from the view.
        available_names = {f.name for f in feature_group.features}
        missing_features = [name for name in selected_features if name not in available_names]
        if missing_features:
            logger.warning(f"Características no encontradas en el feature group: {missing_features}")

        # Feature objects matching the selection
        selected_feature_objects = [f for f in feature_group.features if f.name in selected_features]

        # Query restricted to the selected features
        specific_query = feature_group.select(selected_feature_objects)

        # Create the feature view
        feature_view = feature_store.create_feature_view(
            name=feature_view_name,
            version=feature_view_version,
            query=specific_query,
            description=f"Feature view con características: {', '.join(selected_features)}"
        )
        logger.info(f"Nueva feature view creada: {feature_view.name} (v{feature_view.version})")

except Exception as e:
    logger.error(f"Error al crear/obtener feature view: {e}")
    raise

2025-08-23 12:05:02,520 INFO: Feature view existente recuperada: times_series_bolleria_feature_view (v1)


In [48]:
# Read the feature view contents as batch data and show a short summary.
try:
    df_ts = feature_view.get_batch_data()

    # Report size and column names, then preview the first rows
    n_rows, n_cols = df_ts.shape
    logger.info(f"Datos obtenidos: {n_rows} filas, {n_cols} columnas")
    logger.info(f"Columnas disponibles: {list(df_ts.columns)}")
    print("Muestra de datos:", df_ts.head(3), sep="\n")
except Exception as exc:
    logger.error(f"Error al obtener datos: {exc}")
    raise

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.65s) 
2025-08-23 12:05:06,321 INFO: Datos obtenidos: 133 filas, 5 columnas
2025-08-23 12:05:06,322 INFO: Columnas disponibles: ['familia', 'base_imponible', 'is_summer_peak', 'is_easter', 'week_start']
Muestra de datos:
    familia  base_imponible  is_summer_peak  is_easter  \
0  BOLLERIA          641.56               0          0   
1  BOLLERIA          725.72               0          0   
2  BOLLERIA          950.70               0          0   

                 week_start  
0 2023-02-06 00:00:00+00:00  
1 2025-02-24 00:00:00+00:00  
2 2023-09-18 00:00:00+00:00  
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.65s) 
2025-08-23 12:05:06,321 INFO: Datos obtenidos: 133 filas, 5 columnas
2025-08-23 12:05:06,322 INFO: Columnas disponibles: ['familia', 'base_imponible', 'is_summer_peak', 'is_easter', 'week_start']
Muestra de datos:
    familia  base_imponible  is_summer_peak  is

In [49]:
# Fetch the training data from the feature view.
try:
    # NOTE: the recorded output of the processing cell reports this value as a
    # 2-element tuple; the name `df_ts` is kept because later cells read it.
    df_ts = feature_view.training_data()

except Exception as e:
    # Log and re-raise (as the other cells do) instead of printing and
    # continuing: swallowing the error would let later cells silently run on
    # the stale `df_ts` left over from the batch-data cell.
    logger.error(f"Error al obtener datos de entrenamiento: {e}")
    raise

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.64s) 
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.64s) 




In [50]:
# Build the lagged features and the target used for training.
from src.data_utils import transformar_features_target

# Settings passed to the project's transformation helper
transform_kwargs = dict(
    lags_list=[1, 2, 3, 52],
    columna_target='base_imponible',
    cols_exogenas=['is_easter', 'is_summer_peak'],
    periodos_adelante=1,
    eliminar_nulos=True,
    return_format='dataframe',  # single DataFrame holding features and target
)

try:
    # `df_ts` is handed over as-is; per the original author's note the helper
    # accepts the tuple returned by the feature view directly.
    features_and_target = transformar_features_target(df_ts, **transform_kwargs)

    # Summary of the processed data plus a short preview
    n_rows, n_cols = features_and_target.shape
    logger.info(f"Datos procesados: {n_rows} filas, {n_cols} columnas")
    logger.info(f"Variables disponibles: {list(features_and_target.columns)}")
    print("\nMuestra de datos procesados:", features_and_target.head(3), sep="\n")
except Exception as exc:
    logger.error(f"Error al procesar datos: {exc}")
    raise

2025-08-23 12:05:10,561 INFO: Detectada entrada tipo tupla con 2 elementos
2025-08-23 12:05:10,561 INFO: Usando el primer elemento de la tupla como DataFrame: (133, 5)
2025-08-23 12:05:10,564 INFO: Retornando DataFrame combinado: (80, 8)
2025-08-23 12:05:10,566 INFO: Datos procesados: 80 filas, 8 columnas
2025-08-23 12:05:10,566 INFO: Variables disponibles: ['base_imponible_lag1', 'base_imponible_lag2', 'base_imponible_lag3', 'base_imponible_lag52', 'is_easter', 'is_summer_peak', 'week_start', 'target']

Muestra de datos procesados:
     base_imponible_lag1  base_imponible_lag2  base_imponible_lag3  \
41                572.51               534.79               563.18   
72                597.65               572.51               534.79   
114               680.30               597.65               572.51   

     base_imponible_lag52  is_easter  is_summer_peak  \
41                 825.11          0               0   
72                 658.40          0               0   
114         

In [51]:
# Temporal train/test split using the project's own `train_test_split`
# (imported from src.data_split, not scikit-learn's function of the same name).
# Per the original author's note the helper applies an 80/20 split by default
# and accepts a `split_ratio` argument to change it — verify against src.data_split.
from src.data_split import train_test_split

try:
    # target='target' here; the original note mentions 'base_imponible' as an
    # alternative depending on the pipeline.
    split_parts = train_test_split(features_and_target, target='target')
    X_train, y_train, X_test, y_test = split_parts
    print(f"Train: {X_train.shape}, Test: {X_test.shape}")
except Exception as exc:
    logger.error(f"Error en el split temporal: {exc}")
    raise


Train: (65, 7), Test: (15, 7)


In [57]:
# MLflow setup: point the client at the configured tracking server and select
# the experiment that the training runs below are recorded under.
mlflow.set_tracking_uri(uri=config.MLFLOW_TRACKING_URI)
mlflow.set_experiment(experiment_name="fleca_bolleria_xgboost")


<Experiment: artifact_location='file:///C:/Workspace/mlops_fleca_project/mlruns/1', creation_time=1755943087557, experiment_id='1', last_update_time=1755943087557, lifecycle_stage='active', name='fleca_bolleria_xgboost', tags={}>

In [58]:
# Drop the datetime column `week_start` before training.
# Rebinding to a new frame (instead of `inplace=True` inside a loop) avoids
# mutating objects handed back by the split helper, leaves no stray loop
# variable behind, and keeps the cell idempotent: `errors='ignore'` makes the
# drop a no-op when the column is already gone.
X_train = X_train.drop(columns=['week_start'], errors='ignore')
X_test = X_test.drop(columns=['week_start'], errors='ignore')

In [63]:
# Train a default XGBoost regressor, evaluate it on the test split and record
# parameters, metrics and the model in an MLflow run.
# XGBRegressor and the sklearn metrics are already imported in the notebook's
# first cell, so they are not re-imported here.
with mlflow.start_run(run_name="xgboost"):
    # Fit with the library's default hyperparameters
    model = XGBRegressor()
    model.fit(X_train, y_train)

    # Predict on the held-out split
    predictions = model.predict(X_test)

    # Evaluation metrics
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

    # Log the model tag and every estimator parameter in one batch call
    # instead of one tracking request per parameter.
    mlflow.log_params({"model": "xgboost", **model.get_params()})

    # Log the metrics in one batch call
    mlflow.log_metrics({"mae": mae, "mse": mse, "r2": r2})

    # Persist the fitted model as a run artifact
    mlflow.xgboost.log_model(model, "model")

    # Show the results in the cell output
    print(f"MAE: {mae}")
    print(f"MSE: {mse}")
    print(f"R2: {r2}")



MAE: 194.05842610677084
MSE: 54390.89484225258
R2: 0.4041277259384527
🏃 View run xgboost at: http://localhost:5000/#/experiments/1/runs/a40a756cc688464c835fd36d9fe98b9b
🧪 View experiment at: http://localhost:5000/#/experiments/1


# Este no funciona (hay que investigar por qué)

In [61]:
# Train and evaluate XGBoost through the project helper and record the run in
# MLflow.
# NOTE(review): the recorded metrics of this cell differ from the cell that
# trains XGBRegressor() directly; what `train_evaluate_xgboost` does internally
# is not visible here — check its hyperparameters and preprocessing.
from src.model import train_evaluate_xgboost

# Drop the datetime column before training. Non-mutating and idempotent:
# `errors='ignore'` makes this a no-op if the column was already removed.
X_train = X_train.drop(columns=['week_start'], errors='ignore')
X_test = X_test.drop(columns=['week_start'], errors='ignore')

with mlflow.start_run(run_name="xgboost"):
    # The helper returns a mapping; the keys read below are "model", "mae" and "r2".
    resultados = train_evaluate_xgboost(X_train, y_train, X_test, y_test)
    model = resultados["model"]

    # Log the model tag and every estimator parameter in one batch call
    mlflow.log_params({"model": "xgboost", **model.get_params()})

    # Log the metrics in one batch call
    mlflow.log_metrics({"mae": resultados["mae"], "r2": resultados["r2"]})

    # Persist the fitted model as a run artifact
    mlflow.xgboost.log_model(model, "model")

    # Show the results in the cell output
    print(f"MAE: {resultados['mae']}")
    print(f"R2: {resultados['r2']}")




MAE: 236.2600927734375
R2: 0.08752420359999014
🏃 View run xgboost at: http://localhost:5000/#/experiments/1/runs/65ddfe3dd1954b21a34ab0388987d4e4
🧪 View experiment at: http://localhost:5000/#/experiments/1
