In [43]:
# 1. Imports - Librerías necesarias
from datetime import datetime
from src import config
import hopsworks
import pandas as pd
import logging

# Configuración básica de logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger('feature_view_creation')

In [44]:
# 2. Conectar a Hopsworks y al Feature Group
try:
    # Login y conexión al proyecto
    project = hopsworks.login(
        api_key_value=config.HOPSWORKS_API_KEY, 
        project=config.HOPSWORKS_PROJECT_NAME)
    
    # Conexión al feature store
    feature_store = project.get_feature_store()
    
    # Conexión al feature group
    feature_group = feature_store.get_feature_group(
        name=config.FEATURE_GROUP_NAME,
        version=config.FEATURE_GROUP_VERSION
    )
    
    logger.info(f"Conectado a Feature Group: {feature_group.name} (v{feature_group.version})")
    
except Exception as e:
    logger.error(f"Error en conexión: {e}")

2025-08-15 19:43:09,828 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-08-15 19:43:09,832 INFO: Initializing external client
2025-08-15 19:43:09,832 INFO: Base URL: https://c.app.hopsworks.ai:443
Connection closed.
2025-08-15 19:43:09,832 INFO: Initializing external client
2025-08-15 19:43:09,832 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-08-15 19:43:11,118 INFO: Python Engine initialized.
2025-08-15 19:43:11,118 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1242272

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1242272
2025-08-15 19:43:12,783 INFO: Conectado a Feature Group: times_series_bolleria_feature_group (v1)
2025-08-15 19:43:12,783 INFO: Conectado a Feature Group: times_series_bolleria_feature_group (v1)


In [45]:
# 3. Verificar características disponibles
try:
    # Obtener lista de características
    features = feature_group.features
    
    # Mostrar nombres y tipos
    logger.info(f"Características en {feature_group.name}:")
    for feature in features:
        logger.info(f"  - {feature.name} ({feature.type})")
        
    # Mostrar primary keys y event time
    logger.info(f"Primary keys: {feature_group.primary_key}")
    logger.info(f"Event time: {feature_group.event_time}")
    
except Exception as e:
    logger.error(f"Error al obtener características: {e}")

2025-08-15 19:43:12,802 INFO: Características en times_series_bolleria_feature_group:
2025-08-15 19:43:12,802 INFO:   - year (bigint)
2025-08-15 19:43:12,803 INFO:   - week (bigint)
2025-08-15 19:43:12,803 INFO:   - familia (string)
2025-08-15 19:43:12,804 INFO:   - base_imponible (double)
2025-08-15 19:43:12,806 INFO:   - is_summer_peak (int)
2025-08-15 19:43:12,806 INFO:   - is_easter (bigint)
2025-08-15 19:43:12,806 INFO:   - dias_semana (bigint)
2025-08-15 19:43:12,806 INFO:   - week_start (timestamp)
2025-08-15 19:43:12,807 INFO: Primary keys: ['familia', 'week_start']
2025-08-15 19:43:12,807 INFO: Event time: week_start
2025-08-15 19:43:12,802 INFO:   - year (bigint)
2025-08-15 19:43:12,803 INFO:   - week (bigint)
2025-08-15 19:43:12,803 INFO:   - familia (string)
2025-08-15 19:43:12,804 INFO:   - base_imponible (double)
2025-08-15 19:43:12,806 INFO:   - is_summer_peak (int)
2025-08-15 19:43:12,806 INFO:   - is_easter (bigint)
2025-08-15 19:43:12,806 INFO:   - dias_semana (bigint

In [46]:
# 4. Borrar feature views existentes
try:
    # Intentar borrar versiones específicas que podrían existir
    versions_to_delete = [1, 2]
    
    for version in versions_to_delete:
        try:
            feature_store.delete_feature_view(name=config.FEATURE_VIEW_NAME, version=version)
            logger.info(f"Feature view '{config.FEATURE_VIEW_NAME}' (v{version}) eliminada")
        except Exception:
            pass  # Ignorar errores si la feature view no existe
            
    logger.info("Limpieza de feature views completada")
    
except Exception as e:
    logger.error(f"Error al borrar feature views: {e}")

2025-08-15 19:43:12,818 INFO: Limpieza de feature views completada


In [47]:
# 5. Crear feature view con características seleccionadas
try:
    # Características específicas a incluir
    selected_features = ['familia', 'base_imponible', 'is_summer_peak', 'is_easter', 'week_start']
    
    # Verificar existencia de las características
    available_features = [f.name for f in feature_group.features]
    missing_features = [f for f in selected_features if f not in available_features]
    
    if missing_features:
        logger.warning(f"Características no encontradas: {missing_features}")
        # Buscar posibles coincidencias por nombre
        selected_features = [f for f in selected_features if f in available_features]
    
    # Configurar feature view
    feature_view_name = config.FEATURE_VIEW_NAME
    feature_view_version = 1
    
    # Obtener objetos Feature para las características seleccionadas
    selected_feature_objects = [f for f in feature_group.features if f.name in selected_features]
    
    # Crear query con características seleccionadas
    specific_query = feature_group.select(selected_feature_objects)
    
    # Crear la feature view
    feature_view = feature_store.create_feature_view(
        name=feature_view_name,
        version=feature_view_version,
        query=specific_query,
        description=f"Feature view con características: {', '.join(selected_features)}"
    )
    
    logger.info(f"Feature view creada: {feature_view.name} (v{feature_view.version})")
    
except Exception as e:
    logger.error(f"Error al crear feature view: {e}")

2025-08-15 19:43:13,708 ERROR: Error al crear feature view: Metadata operation error: (url: https://c.app.hopsworks.ai/hopsworks-api/api/project/1242272/featurestores/1224799/featureview). Server response: 
HTTP code: 400, HTTP reason: Bad Request, body: b'{"errorCode":270179,"usrMsg":"Feature view: times_series_bolleria_feature_view, version: 1","errorMsg":"The provided feature view name and version already exists"}', error code: 270179, error msg: The provided feature view name and version already exists, user msg: Feature view: times_series_bolleria_feature_view, version: 1


In [48]:
# 6. Acceder a los datos de la feature view
try:
    # Obtener la feature view por nombre si no está disponible
    if 'feature_view' not in locals() or feature_view is None:
        feature_view = feature_store.get_feature_view(
            name=config.FEATURE_VIEW_NAME,
            version=1
        )
    
    # Obtener los datos
    data = feature_view.get_batch_data()
    
    # Verificar columnas obtenidas
    logger.info(f"Columnas en datos: {list(data.columns)}")
    logger.info(f"Dimensiones: {data.shape}")
    
    # Mostrar primeros registros
    print(data.head())
    
    # Verificar presencia de características solicitadas
    selected_features = ['familia', 'base_imponible', 'is_summer_peak', 'is_easter', 'week_start']
    missing_cols = [col for col in selected_features if col not in data.columns]
    
    if missing_cols:
        logger.warning(f"Columnas faltantes: {missing_cols}")
    else:
        logger.info("Todas las características solicitadas están presentes")
    
    # Guardar datos para uso posterior
    data_path = "data/processed/feature_view_data.parquet"
    data.to_parquet(data_path, index=False)
    logger.info(f"Datos guardados en {data_path}")
    
except Exception as e:
    logger.error(f"Error al acceder a datos: {e}")

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.53s) 
2025-08-15 19:43:17,652 INFO: Columnas en datos: ['familia', 'base_imponible', 'is_summer_peak', 'is_easter', 'week_start']
2025-08-15 19:43:17,653 INFO: Dimensiones: (132, 5)
    familia  base_imponible  is_summer_peak  is_easter  \
0  BOLLERIA          641.56               0          0   
1  BOLLERIA          725.72               0          0   
2  BOLLERIA          950.70               0          0   
3  BOLLERIA          785.98               0          0   
4  BOLLERIA          915.18               0          0   

                 week_start  
0 2023-02-06 00:00:00+00:00  
1 2025-02-24 00:00:00+00:00  
2 2023-09-18 00:00:00+00:00  
3 2023-02-27 00:00:00+00:00  
4 2024-04-29 00:00:00+00:00  
2025-08-15 19:43:17,654 INFO: Todas las características solicitadas están presentes
2025-08-15 19:43:17,657 INFO: Datos guardados en data/processed/feature_view_data.parquet


In [49]:
# 6b. Obtener datos de entrenamiento con training_data()
try:
    # Obtener datos de entrenamiento (devuelve una tupla con X e y)
    df_ts = feature_view.training_data()
    
    # Verificar el tipo de retorno
    logger.info(f"Tipo de datos retornado por training_data(): {type(df_ts)}")
    
    # Si es una tupla, extraer los componentes
    if isinstance(df_ts, tuple):
        X_train, y_train = df_ts
        logger.info(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
        
        # Crear un DataFrame combinado para visualización
        if isinstance(y_train, pd.Series):
            y_col_name = y_train.name if y_train.name else 'target'
            df_combined = X_train.copy()
            df_combined[y_col_name] = y_train
        else:
            # Si y_train es un array o DataFrame
            df_combined = pd.concat([X_train, 
                                    pd.DataFrame(y_train, columns=['target'])], axis=1)
            
        print("Primeras filas del conjunto de datos de entrenamiento:")
        print(df_combined.head())
    else:
        # Si no es una tupla, asumir que es un DataFrame directamente
        print("Primeras filas del conjunto de datos de entrenamiento:")
        print(df_ts.head())
        
except Exception as e:
    logger.error(f"Error al obtener datos de entrenamiento: {e}")

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.92s) 
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.92s) 
2025-08-15 19:43:37,715 INFO: Provenance cached data - overwriting last accessed/created training dataset from 3 to 4.
2025-08-15 19:43:37,716 INFO: Tipo de datos retornado por training_data(): <class 'tuple'>
2025-08-15 19:43:37,717 ERROR: Error al obtener datos de entrenamiento: 'NoneType' object has no attribute 'shape'
2025-08-15 19:43:37,715 INFO: Provenance cached data - overwriting last accessed/created training dataset from 3 to 4.
2025-08-15 19:43:37,716 INFO: Tipo de datos retornado por training_data(): <class 'tuple'>
2025-08-15 19:43:37,717 ERROR: Error al obtener datos de entrenamiento: 'NoneType' object has no attribute 'shape'




In [50]:
# 6c. Explorar los componentes de los datos de entrenamiento
try:
    # Verificar que df_ts esté definido
    if 'df_ts' not in locals():
        logger.warning("Variable df_ts no encontrada. Ejecute primero la celda anterior.")
    # Si df_ts es una tupla, mostrar información detallada de sus componentes
    elif isinstance(df_ts, tuple) and len(df_ts) >= 2:
        # Desempaquetar la tupla
        X_features, y_target = df_ts
        
        print("=== Características (X) ===")
        print(f"Forma: {X_features.shape}")
        print(f"Columnas: {list(X_features.columns)}")
        print("Primeras 5 filas:")
        print(X_features.head())
        
        print("\n=== Variable objetivo (y) ===")
        if isinstance(y_target, pd.Series):
            print(f"Nombre: {y_target.name}")
            print(f"Forma: {y_target.shape}")
            print("Primeros 5 valores:")
            print(y_target.head())
        else:
            print(f"Tipo: {type(y_target)}")
            print(f"Forma: {y_target.shape if hasattr(y_target, 'shape') else 'No disponible'}")
            try:
                print("Primeros 5 valores:")
                print(y_target[:5])
            except:
                print("No se pueden mostrar los valores")
    else:
        # Si df_ts no es una tupla
        print(f"Los datos no están en formato tupla (X, y). Tipo actual: {type(df_ts)}")
        try:
            if hasattr(df_ts, 'head'):
                print(df_ts.head())
            else:
                print(f"No se puede mostrar vista previa para tipo {type(df_ts)}")
        except Exception as inner_e:
            print(f"Error al mostrar datos: {inner_e}")
            
except Exception as e:
    logger.error(f"Error al explorar datos de entrenamiento: {e}")

=== Características (X) ===
Forma: (132, 5)
Columnas: ['familia', 'base_imponible', 'is_summer_peak', 'is_easter', 'week_start']
Primeras 5 filas:
    familia  base_imponible  is_summer_peak  is_easter  \
0  BOLLERIA          641.56               0          0   
1  BOLLERIA          725.72               0          0   
2  BOLLERIA          950.70               0          0   
3  BOLLERIA          785.98               0          0   
4  BOLLERIA          915.18               0          0   

                  week_start  
0  2023-02-06 00:00:00+00:00  
1  2025-02-24 00:00:00+00:00  
2  2023-09-18 00:00:00+00:00  
3  2023-02-27 00:00:00+00:00  
4  2024-04-29 00:00:00+00:00  

=== Variable objetivo (y) ===
Tipo: <class 'NoneType'>
Forma: No disponible
Primeros 5 valores:
No se pueden mostrar los valores


In [53]:
# 6d. Procesar datos para entrenamiento
from src.data_utils import transformar_features_target

# Extraer el DataFrame de características de la tupla df_ts
if isinstance(df_ts, tuple) and len(df_ts) >= 1:
    # Extraer solo el primer componente (DataFrame de características)
    df_features = df_ts[0]
    logger.info(f"Extrayendo DataFrame de características de df_ts: {df_features.shape}")
else:
    logger.error("df_ts no tiene el formato esperado (tupla)")
    raise TypeError("df_ts debe ser una tupla que contenga al menos un DataFrame")

# Ahora procesamos el DataFrame de características
# Capturar todos los valores retornados en una variable
result = transformar_features_target(
    df_features,
    lags_list=[1, 2, 3, 4], 
    columna_target='base_imponible',
    cols_exogenas=None,
    periodos_adelante=1,
    eliminar_nulos=True
)

# Verificar cuántos valores devuelve la función
logger.info(f"La función transformar_features_target devolvió {len(result) if isinstance(result, tuple) else 1} valores")

# Extraer los valores según la cantidad devuelta
if isinstance(result, tuple):
    if len(result) == 2:
        features, target = result
    elif len(result) >= 3:
        features, target = result[0], result[1]
        logger.info(f"Se ignoraron {len(result) - 2} valores adicionales devueltos")
    else:
        features = result[0]
        target = None
        logger.warning("No se obtuvo un valor para 'target'")
else:
    # Si solo devuelve un valor
    features = result
    target = None
    logger.warning("La función solo devolvió un valor (features)")

# Combinar características y objetivo en un solo DataFrame si ambos existen
if features is not None and target is not None:
    features_and_target = features.copy()
    features_and_target['target'] = target
    
    print(f'DataFrame procesado: {features_and_target.shape=}')
    print(f'Columnas disponibles: {list(features_and_target.columns)}')
    print(features_and_target.head())
elif features is not None:
    print(f'Solo se obtuvieron características: {features.shape=}')
    print(f'Columnas disponibles: {list(features.columns)}')
    print(features.head())
else:
    logger.error("No se obtuvieron características válidas")

2025-08-15 19:53:58,660 INFO: Extrayendo DataFrame de características de df_ts: (132, 5)
2025-08-15 19:53:58,663 INFO: La función transformar_features_target devolvió 3 valores
2025-08-15 19:53:58,664 INFO: Se ignoraron 1 valores adicionales devueltos
DataFrame procesado: features_and_target.shape=(127, 6)
Columnas disponibles: ['base_imponible_lag1', 'base_imponible_lag2', 'base_imponible_lag3', 'base_imponible_lag4', 'week_start', 'target']
    base_imponible_lag1  base_imponible_lag2  base_imponible_lag3  \
99               653.64               741.40               658.40   
0                680.46               653.64               741.40   
70               641.56               680.46               653.64   
89               649.83               641.56               680.46   
3                713.33               649.83               641.56   

    base_imponible_lag4                 week_start  target  
99               825.11  2023-01-30 00:00:00+00:00  641.56  
0               

In [None]:
# 7. Entrenar modelo básico con las características seleccionadas
try:
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import mean_squared_error, r2_score
    import numpy as np
    
    # Definir características y objetivo (ajustar según datos)
    X = data[['familia', 'base_imponible', 'is_summer_peak', 'is_easter', 'week_start']]
    y = data['ventas'] if 'ventas' in data.columns else data.iloc[:, -1]  # Última columna como fallback
    
    # División train/test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Entrenar modelo
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    
    # Evaluar
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    
    # Resultados
    logger.info(f"Resultados del modelo RandomForest:")
    logger.info(f"RMSE: {rmse:.4f}")
    logger.info(f"R²: {r2:.4f}")
    
    # Importancia de características
    importances = pd.DataFrame({
        'Característica': X.columns,
        'Importancia': model.feature_importances_
    }).sort_values('Importancia', ascending=False)
    
    print("Importancia de características:")
    print(importances)
    
except Exception as e:
    logger.error(f"Error en entrenamiento: {e}")