In [2]:
import duckdb
import pandas as pd
from datetime import datetime

In [8]:
# Conexión a la base de datos
con = duckdb.connect(database='../input/db/labo3.duckdb')

print("🚀 Iniciando Feature Engineering...")
print(f"⏰ Hora de inicio: {datetime.now()}")

# =============================================
# 1. CREAR TABLA DE FEATURES AVANZADAS
# =============================================

print("\n📊 Creando tabla de features avanzadas...")

query_features_advanced = """
CREATE OR REPLACE TABLE features_advanced AS

WITH features_base AS (
    SELECT *,
        -- Lags de toneladas vendidas
        LAG(tn, 1) OVER (PARTITION BY customer_id, product_id ORDER BY periodo) AS tn_lag_1m,
        LAG(tn, 3) OVER (PARTITION BY customer_id, product_id ORDER BY periodo) AS tn_lag_3m,
        LAG(tn, 6) OVER (PARTITION BY customer_id, product_id ORDER BY periodo) AS tn_lag_6m,
        LAG(tn, 12) OVER (PARTITION BY customer_id, product_id ORDER BY periodo) AS tn_lag_12m,
        
        -- Lags de cantidad requerida
        LAG(cust_request_tn, 1) OVER (PARTITION BY customer_id, product_id ORDER BY periodo) AS request_tn_lag_1m,
        LAG(cust_request_tn, 3) OVER (PARTITION BY customer_id, product_id ORDER BY periodo) AS request_tn_lag_3m,
        LAG(cust_request_tn, 6) OVER (PARTITION BY customer_id, product_id ORDER BY periodo) AS request_tn_lag_6m,
        
        -- Lags de stock
        LAG(stock_final, 1) OVER (PARTITION BY product_id ORDER BY periodo) AS stock_lag_1m,
        LAG(stock_final, 3) OVER (PARTITION BY product_id ORDER BY periodo) AS stock_lag_3m
    FROM tmp_cross_join_filtrada
),

features_deltas AS (
    SELECT *,
        -- Deltas de ventas
        tn - tn_lag_1m AS delta_tn_1m,
        tn - tn_lag_3m AS delta_tn_3m,
        tn - tn_lag_6m AS delta_tn_6m,
        tn - tn_lag_12m AS delta_tn_12m,
        
        -- Deltas de demanda
        cust_request_tn - request_tn_lag_1m AS delta_request_1m,
        cust_request_tn - request_tn_lag_3m AS delta_request_3m,
        cust_request_tn - request_tn_lag_6m AS delta_request_6m,
        
        -- Delta de stock
        stock_final - stock_lag_1m AS delta_stock_1m,
        stock_final - stock_lag_3m AS delta_stock_3m,
        
        -- Ratios de cambio (porcentual)
        CASE 
            WHEN tn_lag_1m > 0 THEN (tn - tn_lag_1m) / tn_lag_1m * 100
            ELSE NULL 
        END AS pct_change_tn_1m,
        
        CASE 
            WHEN tn_lag_3m > 0 THEN (tn - tn_lag_3m) / tn_lag_3m * 100
            ELSE NULL 
        END AS pct_change_tn_3m
    FROM features_base
),

features_moving_avg AS (
    SELECT *,
        -- Medias móviles de ventas
        AVG(tn) OVER (
            PARTITION BY customer_id, product_id 
            ORDER BY periodo 
            ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
        ) AS ma_tn_3m,
        
        AVG(tn) OVER (
            PARTITION BY customer_id, product_id 
            ORDER BY periodo 
            ROWS BETWEEN 5 PRECEDING AND CURRENT ROW
        ) AS ma_tn_6m,
        
        AVG(tn) OVER (
            PARTITION BY customer_id, product_id 
            ORDER BY periodo 
            ROWS BETWEEN 11 PRECEDING AND CURRENT ROW
        ) AS ma_tn_12m,
        
        -- Medias móviles de demanda
        AVG(cust_request_tn) OVER (
            PARTITION BY customer_id, product_id 
            ORDER BY periodo 
            ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
        ) AS ma_request_3m,
        
        AVG(cust_request_tn) OVER (
            PARTITION BY customer_id, product_id 
            ORDER BY periodo 
            ROWS BETWEEN 5 PRECEDING AND CURRENT ROW
        ) AS ma_request_6m,
        
        -- Media móvil de stock por producto
        AVG(stock_final) OVER (
            PARTITION BY product_id 
            ORDER BY periodo 
            ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
        ) AS ma_stock_3m
    FROM features_deltas
),

features_min_max AS (
    SELECT *,
        -- Mínimos en ventanas deslizantes
        CASE WHEN tn = MIN(tn) OVER (
            PARTITION BY customer_id, product_id 
            ORDER BY periodo 
            ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
        ) THEN 1 ELSE 0 END AS is_min_tn_3m,
        
        CASE WHEN tn = MIN(tn) OVER (
            PARTITION BY customer_id, product_id 
            ORDER BY periodo 
            ROWS BETWEEN 5 PRECEDING AND CURRENT ROW
        ) THEN 1 ELSE 0 END AS is_min_tn_6m,
        
        CASE WHEN tn = MIN(tn) OVER (
            PARTITION BY customer_id, product_id 
            ORDER BY periodo 
            ROWS BETWEEN 11 PRECEDING AND CURRENT ROW
        ) THEN 1 ELSE 0 END AS is_min_tn_12m,
        
        -- Máximos en ventanas deslizantes
        CASE WHEN tn = MAX(tn) OVER (
            PARTITION BY customer_id, product_id 
            ORDER BY periodo 
            ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
        ) THEN 1 ELSE 0 END AS is_max_tn_3m,
        
        CASE WHEN tn = MAX(tn) OVER (
            PARTITION BY customer_id, product_id 
            ORDER BY periodo 
            ROWS BETWEEN 5 PRECEDING AND CURRENT ROW
        ) THEN 1 ELSE 0 END AS is_max_tn_6m,
        
        CASE WHEN tn = MAX(tn) OVER (
            PARTITION BY customer_id, product_id 
            ORDER BY periodo 
            ROWS BETWEEN 11 PRECEDING AND CURRENT ROW
        ) THEN 1 ELSE 0 END AS is_max_tn_12m
    FROM features_moving_avg
),

features_ratios AS (
    SELECT *,
        -- Ratio con períodos futuros
        CASE 
            WHEN LEAD(tn, 2) OVER (PARTITION BY customer_id, product_id ORDER BY periodo) > 0 
            THEN tn / LEAD(tn, 2) OVER (PARTITION BY customer_id, product_id ORDER BY periodo)
            ELSE NULL 
        END AS ratio_tn_vs_2m_ahead,
        
        -- Ratios demanda vs venta
        CASE 
            WHEN cust_request_tn > 0 THEN tn / cust_request_tn 
            ELSE NULL 
        END AS fill_rate,
        
        -- Ratio venta vs stock
        CASE 
            WHEN stock_final > 0 THEN tn / stock_final 
            ELSE NULL 
        END AS stock_turnover_ratio,
        
        -- Ratio vs media móvil
        CASE 
            WHEN ma_tn_6m > 0 THEN tn / ma_tn_6m 
            ELSE NULL 
        END AS ratio_vs_ma_6m
    FROM features_min_max
),

features_temporal AS (
    SELECT *,
        -- Extraer componentes temporales
        CAST(SUBSTR(CAST(periodo AS VARCHAR), 5, 2) AS INTEGER) AS mes,
        CAST(SUBSTR(CAST(periodo AS VARCHAR), 1, 4) AS INTEGER) AS anio,
        
        -- Indicadores estacionales (Argentina)
        CASE 
            WHEN CAST(SUBSTR(CAST(periodo AS VARCHAR), 5, 2) AS INTEGER) IN (12, 1, 2) THEN 1 
            ELSE 0 
        END AS es_verano,
        
        CASE 
            WHEN CAST(SUBSTR(CAST(periodo AS VARCHAR), 5, 2) AS INTEGER) IN (6, 7, 8) THEN 1 
            ELSE 0 
        END AS es_invierno,
        
        -- Trimestre
        CASE 
            WHEN CAST(SUBSTR(CAST(periodo AS VARCHAR), 5, 2) AS INTEGER) BETWEEN 1 AND 3 THEN 1
            WHEN CAST(SUBSTR(CAST(periodo AS VARCHAR), 5, 2) AS INTEGER) BETWEEN 4 AND 6 THEN 2
            WHEN CAST(SUBSTR(CAST(periodo AS VARCHAR), 5, 2) AS INTEGER) BETWEEN 7 AND 9 THEN 3
            ELSE 4
        END AS trimestre
    FROM features_ratios
)

SELECT *,
    -- Volatilidad (desviación estándar móvil)
    STDDEV(tn) OVER (
        PARTITION BY customer_id, product_id 
        ORDER BY periodo 
        ROWS BETWEEN 5 PRECEDING AND CURRENT ROW
    ) AS volatilidad_tn_6m,
    
    -- Coeficiente de variación
    CASE 
        WHEN ma_tn_6m > 0 THEN 
            STDDEV(tn) OVER (
                PARTITION BY customer_id, product_id 
                ORDER BY periodo 
                ROWS BETWEEN 5 PRECEDING AND CURRENT ROW
            ) / ma_tn_6m
        ELSE NULL 
    END AS coef_variacion_6m,
    
    -- Tendencia (pendiente aproximada)
    CASE 
        WHEN tn_lag_6m IS NOT NULL AND tn_lag_6m != tn THEN
            (tn - tn_lag_6m) / 6.0
        ELSE NULL 
    END AS tendencia_6m,
    
    -- Indicador de crecimiento consistente
    CASE 
        WHEN tn > COALESCE(tn_lag_1m, 0) 
         AND COALESCE(tn_lag_1m, 0) > COALESCE(tn_lag_3m, 0)
         AND COALESCE(tn_lag_3m, 0) > COALESCE(tn_lag_6m, 0)
        THEN 1 ELSE 0 
    END AS crecimiento_consistente,
    
    -- Días desde primera/última venta
    DATEDIFF('day', PVC, periodo_fecha) AS dias_desde_primera_venta_cliente,
    DATEDIFF('day', UVC, periodo_fecha) AS dias_desde_ultima_venta_cliente,
    
    -- Indicador de cliente recurrente
    CASE 
        WHEN antiguedad_cliente >= 12 THEN 1 
        ELSE 0 
    END AS cliente_recurrente,
    
    -- Indicador de producto maduro
    CASE 
        WHEN antiguedad_producto >= 24 THEN 1 
        ELSE 0 
    END AS producto_maduro,
    
    -- Comparación con mismo mes año anterior
    LAG(tn, 12) OVER (PARTITION BY customer_id, product_id ORDER BY periodo) AS tn_mismo_mes_anio_anterior,
    
    -- Variación vs mismo mes año anterior
    CASE 
        WHEN LAG(tn, 12) OVER (PARTITION BY customer_id, product_id ORDER BY periodo) > 0 THEN
            (tn - LAG(tn, 12) OVER (PARTITION BY customer_id, product_id ORDER BY periodo)) / 
            LAG(tn, 12) OVER (PARTITION BY customer_id, product_id ORDER BY periodo) * 100
        ELSE NULL 
    END AS pct_change_vs_mismo_mes_anio_anterior

FROM features_temporal
ORDER BY customer_id, product_id, periodo;
"""

con.execute(query_features_advanced)
print("✅ Tabla 'features_advanced' creada exitosamente!")

# Verificar cantidad de registros
result = con.execute("SELECT COUNT(*) FROM features_advanced").fetchone()
print(f"📈 Registros en features_advanced: {result[0]:,}")

con.close()


🚀 Iniciando Feature Engineering...
⏰ Hora de inicio: 2025-06-16 21:09:01.863867

📊 Creando tabla de features avanzadas...
✅ Tabla 'features_advanced' creada exitosamente!
📈 Registros en features_advanced: 17,021,654


In [None]:
# Feature Engineering dividido en pasos separados
import duckdb
import pandas as pd
from datetime import datetime

def crear_features_adicionales_por_pasos():
    """
    Crea las features adicionales dividiendo el proceso en pasos separados
    para evitar problemas de memoria en DuckDB
    """
    
    con = duckdb.connect(database='../input/db/labo3.duckdb')
    
    print("🚀 Iniciando Feature Engineering por pasos...")
    print(f"⏰ Hora de inicio: {datetime.now()}")
    
    try:
        # =============================================
        # PASO 1: MÉTRICAS BÁSICAS DE CLIENTE Y PRODUCTO
        # =============================================
        print("\n📊 Paso 1: Creando métricas básicas de cliente y producto...")
        """
        con.execute(
        CREATE OR REPLACE TABLE tmp_metricas_cliente_producto AS
        SELECT 
            *,
            -- Métricas del cliente en el período
            COUNT(*) OVER (PARTITION BY customer_id, periodo) AS productos_total_cliente_periodo,
            SUM(tn) OVER (PARTITION BY customer_id, periodo) AS tn_total_cliente_periodo,
            
            -- Métricas del producto en el período
            SUM(tn) OVER (PARTITION BY product_id, periodo) AS tn_total_producto_periodo,
            COUNT(*) OVER (PARTITION BY product_id, periodo) AS clientes_total_producto,
            
            -- Rankings
            ROW_NUMBER() OVER (PARTITION BY customer_id, periodo ORDER BY tn DESC) AS ranking_producto_en_cliente,
            ROW_NUMBER() OVER (PARTITION BY product_id, periodo ORDER BY tn DESC) AS ranking_cliente_en_producto
        FROM features_advanced
        ORDER BY customer_id, product_id, periodo
        )
        
        result = con.execute("SELECT COUNT(*) FROM tmp_metricas_cliente_producto").fetchone()
        print(f"✅ Tabla tmp_metricas_cliente_producto creada: {result[0]:,} registros")
        """
        # =============================================
        # PASO 2: FEATURES DE INTERACCIÓN
        # =============================================
        print("\n🔄 Paso 2: Creando features de interacción...")
        """
        con.execute(
        CREATE OR REPLACE TABLE tmp_features_interaccion AS
        SELECT 
            *,
            -- Participación del producto en las compras del cliente
            CASE 
                WHEN tn_total_cliente_periodo > 0 THEN
                    tn / tn_total_cliente_periodo
                ELSE NULL
            END AS participacion_producto_en_cliente,
            
            -- Ratio vs promedio de otros productos del cliente
            CASE 
                WHEN productos_total_cliente_periodo > 1 THEN
                    tn / ((tn_total_cliente_periodo - tn) / (productos_total_cliente_periodo - 1))
                ELSE NULL
            END AS ratio_vs_promedio_otros_productos_cliente,
            
            -- Es el producto principal del cliente?
            CASE 
                WHEN ranking_producto_en_cliente = 1 THEN 1
                ELSE 0
            END AS es_producto_principal_cliente
        FROM tmp_metricas_cliente_producto
        ORDER BY customer_id, product_id, periodo
        )
        
        result = con.execute("SELECT COUNT(*) FROM tmp_features_interaccion").fetchone()
        print(f"✅ Tabla tmp_features_interaccion creada: {result[0]:,} registros")
        """
        # =============================================
        # PASO 3: FEATURES DE MERCADO
        # =============================================
        print("\n🏪 Paso 3: Creando features de mercado...")
        """
        con.execute(
        CREATE OR REPLACE TABLE tmp_features_mercado AS
        SELECT 
            *,
            -- Participación del cliente en las ventas del producto
            CASE 
                WHEN tn_total_producto_periodo > 0 THEN
                    tn / tn_total_producto_periodo
                ELSE NULL
            END AS participacion_cliente_en_producto,
            
            -- Es el cliente principal del producto?
            CASE 
                WHEN ranking_cliente_en_producto = 1 THEN 1
                ELSE 0
            END AS es_cliente_principal_producto
        FROM tmp_features_interaccion
        ORDER BY customer_id, product_id, periodo
        )
        
        result = con.execute("SELECT COUNT(*) FROM tmp_features_mercado").fetchone()
        print(f"✅ Tabla tmp_features_mercado creada: {result[0]:,} registros")
        """
        # =============================================
        # PASO 4: CONCENTRACIÓN DE PRODUCTO
        # =============================================
        print("\n📈 Paso 4: Creando índices de concentración...")
        """
        con.execute(
        CREATE OR REPLACE TABLE tmp_concentracion_producto AS
        SELECT 
            product_id,
            periodo,
            SUM(POWER(participacion_cliente_en_producto, 2)) AS indice_concentracion_producto
        FROM tmp_features_mercado
        WHERE participacion_cliente_en_producto IS NOT NULL
        GROUP BY product_id, periodo
        )
        
        result = con.execute("SELECT COUNT(*) FROM tmp_concentracion_producto").fetchone()
        print(f"✅ Tabla tmp_concentracion_producto creada: {result[0]:,} registros")
        """
        # =============================================
        # PASO 5: PROMEDIOS ESTACIONALES
        # =============================================
        print("\n🗓️ Paso 5: Creando promedios estacionales...")
        """
        con.execute(
        CREATE OR REPLACE TABLE tmp_promedios_estacionales AS
        SELECT 
            customer_id,
            product_id,
            mes,
            AVG(tn) AS promedio_historico_mes,
            STDDEV(tn) AS stddev_historico_mes,
            COUNT(*) AS observaciones_mes
        FROM tmp_features_mercado
        GROUP BY customer_id, product_id, mes
        )
        
        result = con.execute("SELECT COUNT(*) FROM tmp_promedios_estacionales").fetchone()
        print(f"✅ Tabla tmp_promedios_estacionales creada: {result[0]:,} registros")
        """
        # =============================================
        # PASO 6: PROMEDIOS ANUALES
        # =============================================
        print("\n📅 Paso 6: Creando promedios anuales...")
        """
        con.execute(
        CREATE OR REPLACE TABLE tmp_promedios_anuales AS
        SELECT 
            customer_id,
            product_id,
            AVG(tn) AS promedio_anual
        FROM tmp_features_mercado
        GROUP BY customer_id, product_id
        )
        
        result = con.execute("SELECT COUNT(*) FROM tmp_promedios_anuales").fetchone()
        print(f"✅ Tabla tmp_promedios_anuales creada: {result[0]:,} registros")
        """
        # =============================================
        # PASO 7: FEATURES DE ESTACIONALIDAD
        # =============================================
        print("\n🌦️ Paso 7: Creando features de estacionalidad...")
        """
        con.execute(
        CREATE OR REPLACE TABLE tmp_features_estacionalidad AS
        SELECT 
            fm.*,
            pe.promedio_historico_mes,
            pe.stddev_historico_mes,
            pa.promedio_anual,
            
            -- Desviación respecto al promedio histórico del mes
            CASE 
                WHEN pe.promedio_historico_mes > 0 THEN
                    (fm.tn - pe.promedio_historico_mes) / pe.promedio_historico_mes * 100
                ELSE NULL
            END AS desviacion_vs_promedio_historico_mes,
            
            -- Índice de estacionalidad
            CASE 
                WHEN pa.promedio_anual > 0 THEN
                    pe.promedio_historico_mes / pa.promedio_anual
                ELSE NULL
            END AS indice_estacionalidad,
            
            -- Coeficiente de variación estacional
            CASE 
                WHEN pe.promedio_historico_mes > 0 THEN
                    pe.stddev_historico_mes / pe.promedio_historico_mes
                ELSE NULL
            END AS coef_variacion_estacional
            
        FROM tmp_features_mercado fm
        LEFT JOIN tmp_promedios_estacionales pe ON 
            fm.customer_id = pe.customer_id AND 
            fm.product_id = pe.product_id AND 
            fm.mes = pe.mes
        LEFT JOIN tmp_promedios_anuales pa ON 
            fm.customer_id = pa.customer_id AND 
            fm.product_id = pa.product_id
        ORDER BY fm.customer_id, fm.product_id, fm.periodo
        )
        
        result = con.execute("SELECT COUNT(*) FROM tmp_features_estacionalidad").fetchone()
        print(f"✅ Tabla tmp_features_estacionalidad creada: {result[0]:,} registros")
        """
        # =============================================
        # PASO 8: FEATURES DE CICLICIDAD
        # =============================================
        print("\n🔄 Paso 8: Creando features de ciclicidad...")
        """
        con.execute(
        CREATE OR REPLACE TABLE tmp_features_ciclicidad AS
        SELECT 
            fe.*,
            cp.indice_concentracion_producto,
            
            -- Detectar ciclos regulares (cada 3 meses)
            CASE 
                WHEN fe.tn > 0 
                 AND LAG(fe.tn, 3) OVER (PARTITION BY fe.customer_id, fe.product_id ORDER BY fe.periodo) > 0
                 AND LAG(fe.tn, 6) OVER (PARTITION BY fe.customer_id, fe.product_id ORDER BY fe.periodo) > 0
                THEN 1 ELSE 0
            END AS patron_ciclico_3m,
            
            -- Detectar ciclos semestrales
            CASE 
                WHEN fe.tn > 0 
                 AND LAG(fe.tn, 6) OVER (PARTITION BY fe.customer_id, fe.product_id ORDER BY fe.periodo) > 0
                 AND LAG(fe.tn, 12) OVER (PARTITION BY fe.customer_id, fe.product_id ORDER BY fe.periodo) > 0
                THEN 1 ELSE 0
            END AS patron_ciclico_6m
            
        FROM tmp_features_estacionalidad fe
        LEFT JOIN tmp_concentracion_producto cp ON 
            fe.product_id = cp.product_id AND 
            fe.periodo = cp.periodo
        ORDER BY fe.customer_id, fe.product_id, fe.periodo
        )
        
        result = con.execute("SELECT COUNT(*) FROM tmp_features_ciclicidad").fetchone()
        print(f"✅ Tabla tmp_features_ciclicidad creada: {result[0]:,} registros")
        """
        # =============================================
        # PASO 9: PERIODICIDAD DE COMPRAS
        # =============================================
        print("\n📊 Paso 9: Creando periodicidad de compras...")
        """
        con.execute(
        CREATE OR REPLACE TABLE tmp_periodicidad_compras AS
        SELECT 
            customer_id,
            product_id,
            AVG(antiguedad_cliente) / NULLIF(SUM(CASE WHEN tn > 0 THEN 1 ELSE 0 END), 0) AS periodicidad_promedio_compras
        FROM tmp_features_ciclicidad
        WHERE antiguedad_cliente > 0
        GROUP BY customer_id, product_id
        )
        
        result = con.execute("SELECT COUNT(*) FROM tmp_periodicidad_compras").fetchone()
        print(f"✅ Tabla tmp_periodicidad_compras creada: {result[0]:,} registros")
        """
        # =============================================
        # PASO 10: MESES DESDE ÚLTIMA COMPRA
        # =============================================
        print("\n🕒 Paso 10: Calculando meses desde última compra...")
        """
        con.execute(
        CREATE OR REPLACE TABLE tmp_ultima_compra AS
        SELECT 
            customer_id,
            product_id,
            periodo,
            tn,
            MAX(CASE WHEN tn > 0 THEN periodo ELSE NULL END) 
                OVER (PARTITION BY customer_id, product_id ORDER BY periodo 
                      ROWS UNBOUNDED PRECEDING) AS ultimo_periodo_con_compra
        FROM tmp_features_ciclicidad
        )
        
        result = con.execute("SELECT COUNT(*) FROM tmp_ultima_compra").fetchone()
        print(f"✅ Tabla tmp_ultima_compra creada: {result[0]:,} registros")
        """
        """
        con.execute(
        CREATE OR REPLACE TABLE tmp_meses_sin_compra AS
        SELECT 
            *,
            CASE 
                WHEN tn = 0 AND ultimo_periodo_con_compra IS NOT NULL THEN
                    -- Calcular diferencia aproximada en meses
                    (periodo - ultimo_periodo_con_compra) % 100 + 
                    ((periodo - ultimo_periodo_con_compra) / 100) * 12
                ELSE 0
            END AS meses_desde_ultima_compra
        FROM tmp_ultima_compra
        )
        
        result = con.execute("SELECT COUNT(*) FROM tmp_meses_sin_compra").fetchone()
        print(f"✅ Tabla tmp_meses_sin_compra creada: {result[0]:,} registros")
        """
        # =============================================
        # PASO 11: TABLA FINAL
        # =============================================
        print("\n🎯 Paso 11: Creando tabla final features_adicionales...")
        """
        con.execute(
        CREATE OR REPLACE TABLE features_adicionales AS
        SELECT 
            fc.*,
            pc.periodicidad_promedio_compras,
            msc.meses_desde_ultima_compra
        FROM tmp_features_ciclicidad fc
        LEFT JOIN tmp_periodicidad_compras pc ON 
            fc.customer_id = pc.customer_id AND 
            fc.product_id = pc.product_id
        LEFT JOIN tmp_meses_sin_compra msc ON 
            fc.customer_id = msc.customer_id AND 
            fc.product_id = msc.product_id AND 
            fc.periodo = msc.periodo
        ORDER BY fc.customer_id, fc.product_id, fc.periodo
        )
        
        result = con.execute("SELECT COUNT(*) FROM features_adicionales").fetchone()
        print(f"✅ Tabla features_adicionales final creada: {result[0]:,} registros")
        """
        # =============================================
        # LIMPIEZA DE TABLAS TEMPORALES
        # =============================================
        print("\n🧹 Limpiando tablas temporales...")
        
        tablas_temp = [
            'tmp_metricas_cliente_producto',
            'tmp_features_interaccion', 
            'tmp_features_mercado',
            'tmp_concentracion_producto',
            'tmp_promedios_estacionales',
            'tmp_promedios_anuales',
            'tmp_features_estacionalidad',
            'tmp_features_ciclicidad',
            'tmp_periodicidad_compras',
            'tmp_ultima_compra',
            'tmp_meses_sin_compra'
        ]
        
        for tabla in tablas_temp:
            try:
                con.execute(f"DROP TABLE IF EXISTS {tabla}")
                print(f"   • {tabla} eliminada")
            except Exception as e:
                print(f"   ⚠️ Error eliminando {tabla}: {e}")
        
        print(f"\n🎉 Feature Engineering completado exitosamente!")
        print(f"⏰ Hora de finalización: {datetime.now()}")
        
        # Verificación final
        result = con.execute("SELECT COUNT(*) FROM features_adicionales").fetchone()
        print(f"📈 Registros finales en features_adicionales: {result[0]:,}")
        
    except Exception as e:
        print(f"❌ Error durante el proceso: {e}")
        print("🔄 Intentando limpiar tablas temporales...")
        # Intentar limpiar en caso de error
        tablas_temp = [
            'tmp_metricas_cliente_producto', 'tmp_features_interaccion', 
            'tmp_features_mercado', 'tmp_concentracion_producto',
            'tmp_promedios_estacionales', 'tmp_promedios_anuales',
            'tmp_features_estacionalidad', 'tmp_features_ciclicidad',
            'tmp_periodicidad_compras', 'tmp_ultima_compra', 'tmp_meses_sin_compra'
        ]
        for tabla in tablas_temp:
            try:
                #con.execute(f"DROP TABLE IF EXISTS {tabla}")
                print(f"⚠️ {tabla} eliminada durante limpieza")
            except:
                print(f"⚠️ Error eliminando {tabla} durante limpieza: {e}")
        raise e
    
    finally:
        con.close()

# Ejecutar el proceso
if __name__ == "__main__":
    crear_features_adicionales_por_pasos()

🚀 Iniciando Feature Engineering por pasos...
⏰ Hora de inicio: 2025-06-17 08:58:52.044276

📊 Paso 1: Creando métricas básicas de cliente y producto...

🔄 Paso 2: Creando features de interacción...

🏪 Paso 3: Creando features de mercado...

📈 Paso 4: Creando índices de concentración...

🗓️ Paso 5: Creando promedios estacionales...

📅 Paso 6: Creando promedios anuales...

🌦️ Paso 7: Creando features de estacionalidad...

🔄 Paso 8: Creando features de ciclicidad...
✅ Tabla tmp_features_ciclicidad creada: 17,021,654 registros

📊 Paso 9: Creando periodicidad de compras...
✅ Tabla tmp_periodicidad_compras creada: 695,998 registros

🕒 Paso 10: Calculando meses desde última compra...

🎯 Paso 11: Creando tabla final features_adicionales...
✅ Tabla features_adicionales final creada: 17,021,654 registros

🧹 Limpiando tablas temporales...
   • tmp_metricas_cliente_producto eliminada
   • tmp_features_interaccion eliminada
   • tmp_features_mercado eliminada
   • tmp_concentracion_producto eliminad

In [4]:
con = duckdb.connect(database='../input/db/labo3.duckdb')

# genero la tabla final de features avanzadas
query_final_features = """
CREATE OR REPLACE TABLE ventas_features_final AS
SELECT
    concat(cjt.product_id ,'|' , cjt.customer_id) AS key_producto_cliente,
    cjt.antiguedad_cliente,
    cjt.antiguedad_producto,
    cjt.cust_request_qty,
    cjt.cust_request_tn,
    cjt.customer_id,
    cjt.key_customer_producto_periodo,
    cjt.key_periodo_customer_producto,
    cjt.key_periodo_producto,
    cjt.periodo,
    cjt.periodo_fecha,
    cjt.plan_precios_cuidados,
    cjt.product_id,
    cjt.PVC,
    cjt.PVP,
    cjt.registro_sintetico,
    cjt.stock_final,
    cjt.tn,
    cjt.UVC,
    cjt.UVP,
    fa.anio,
    fa.cliente_recurrente,
    fa.coef_variacion_6m,
    fa.crecimiento_consistente,
    fa.delta_request_1m,
    fa.delta_request_3m,
    fa.delta_request_6m,
    fa.delta_stock_1m,
    fa.delta_stock_3m,
    fa.delta_tn_12m,
    fa.delta_tn_1m,
    fa.delta_tn_3m,
    fa.delta_tn_6m,
    fa.dias_desde_primera_venta_cliente,
    fa.dias_desde_ultima_venta_cliente,
    fa.es_invierno,
    fa.es_verano,
    fa.fill_rate,
    fa.is_max_tn_12m,
    fa.is_max_tn_3m,
    fa.is_max_tn_6m,
    fa.is_min_tn_12m,
    fa.is_min_tn_3m,
    fa.is_min_tn_6m,
    fa.ma_request_3m,
    fa.ma_request_6m,
    fa.ma_stock_3m,
    fa.ma_tn_12m,
    fa.ma_tn_3m,
    fa.ma_tn_6m,
    fa.mes,
    fa.pct_change_tn_1m,
    fa.pct_change_tn_3m,
    fa.pct_change_vs_mismo_mes_anio_anterior,
    fa.producto_maduro,
    fa.ratio_tn_vs_2m_ahead,
    fa.ratio_vs_ma_6m,
    fa.request_tn_lag_1m,
    fa.request_tn_lag_3m,
    fa.request_tn_lag_6m,
    fa.stock_lag_1m,
    fa.stock_lag_3m,
    fa.stock_turnover_ratio,
    fa.tendencia_6m,
    fa.tn_lag_12m,
    fa.tn_lag_1m,
    fa.tn_lag_3m,
    fa.tn_lag_6m,
    fa.tn_mismo_mes_anio_anterior,
    fa.trimestre,
    fa.volatilidad_tn_6m,
    fadi.clientes_total_producto,
    fadi.coef_variacion_estacional,
    fadi.desviacion_vs_promedio_historico_mes,
    fadi.es_cliente_principal_producto,
    fadi.es_producto_principal_cliente,
    fadi.indice_concentracion_producto,
    fadi.indice_estacionalidad,
    fadi.meses_desde_ultima_compra,
    fadi.participacion_cliente_en_producto,
    fadi.participacion_producto_en_cliente,
    fadi.patron_ciclico_3m,
    fadi.patron_ciclico_6m,
    fadi.periodicidad_promedio_compras,
    fadi.productos_total_cliente_periodo,
    fadi.promedio_anual,
    fadi.promedio_historico_mes,
    fadi.ranking_cliente_en_producto,
    fadi.ranking_producto_en_cliente,
    fadi.ratio_vs_promedio_otros_productos_cliente,
    fadi.stddev_historico_mes,
    fadi.tn_total_cliente_periodo,
    fadi.tn_total_producto_periodo,
    pr.cat1,
    pr.cat2,
    pr.cat3,
    pr.brand,
    pr.sku_size
    -- pr.descripcion,
FROM tmp_cross_join_filtrada cjt
JOIN features_advanced fa ON cjt.key_periodo_customer_producto = fa.key_periodo_customer_producto
JOIN features_adicionales fadi ON cjt.key_periodo_customer_producto = fadi.key_periodo_customer_producto
left JOIN tb_productos pr ON cjt.product_id = pr.product_id
ORDER BY cjt.key_periodo_customer_producto
"""

con.execute(query_final_features)
print("✅ Tabla 'ventas_features_final' creada exitosamente!")
result = con.execute("SELECT COUNT(*) FROM ventas_features_final").fetchone()
print(f"📈 Registros en ventas_features_final: {result[0]:,}")

con.close()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

✅ Tabla 'ventas_features_final' creada exitosamente!
📈 Registros en ventas_features_final: 17,021,654


In [10]:
con = duckdb.connect(database='../input/db/labo3.duckdb')
# =============================================
# 3. VERIFICACIONES Y ESTADÍSTICAS
# =============================================

print("\n📊 Estadísticas de las tablas creadas:")

# Estadísticas features_advanced
print("\n🔹 FEATURES_ADVANCED:")
stats_advanced = con.execute("""
    SELECT 
        COUNT(*) as total_registros,
        COUNT(DISTINCT customer_id) as clientes_unicos,
        COUNT(DISTINCT product_id) as productos_unicos,
        MIN(periodo) as periodo_min,
        MAX(periodo) as periodo_max,
        SUM(CASE WHEN registro_sintetico = 0 THEN 1 ELSE 0 END) as registros_reales,
        SUM(CASE WHEN registro_sintetico = 1 THEN 1 ELSE 0 END) as registros_sinteticos
    FROM features_advanced
""").fetchdf()

print(stats_advanced.to_string(index=False))

# Estadísticas features_adicionales
print("\n🔹 FEATURES_ADICIONALES:")
stats_adicionales = con.execute("""
    SELECT 
        COUNT(*) as total_registros,
        COUNT(DISTINCT customer_id) as clientes_unicos,
        COUNT(DISTINCT product_id) as productos_unicos,
        AVG(productos_total_cliente_periodo) as promedio_productos_por_cliente,
        AVG(clientes_total_producto) as promedio_clientes_por_producto,
        COUNT(CASE WHEN es_producto_principal_cliente = 1 THEN 1 END) as registros_producto_principal,
        COUNT(CASE WHEN es_cliente_principal_producto = 1 THEN 1 END) as registros_cliente_principal
    FROM features_adicionales
""").fetchdf()

print(stats_adicionales.to_string(index=False))

# =============================================
# 4. EJEMPLOS DE FEATURES CREADAS
# =============================================

print("\n🔍 Ejemplos de las nuevas features:")

# Mostrar algunas columnas nuevas
sample_features = con.execute("""
    SELECT 
        customer_id, 
        product_id, 
        periodo,
        tn,
        tn_lag_1m,
        delta_tn_1m,
        ma_tn_6m,
        fill_rate,
        -- participacion_producto_en_cliente,
        -- participacion_cliente_en_producto,
        -- indice_estacionalidad,
        -- patron_ciclico_3m
    FROM features_advanced
    WHERE customer_id <= 3 AND product_id <= 3
    ORDER BY customer_id, product_id, periodo
    LIMIT 10
""").fetchdf()

print("\n📋 Muestra de features creadas:")
print(sample_features.to_string(index=False))

# Cerrar conexión
con.close()

print(f"\n🎉 Feature Engineering completado exitosamente!")
print(f"⏰ Hora de finalización: {datetime.now()}")
print("\n📝 Tablas creadas:")
print("   • features_advanced: Features básicas + lags + medias móviles + ratios")
print("   • features_adicionales: Todas las anteriores + interacción + mercado + ciclicidad")

print("\n💡 Próximos pasos sugeridos:")
print("   1. Analizar distribución de las nuevas features")
print("   2. Identificar features con alta correlación")
print("   3. Validar que los lags y medias móviles sean correctos")
print("   4. Explorar features de estacionalidad para patrones de demanda")


📊 Estadísticas de las tablas creadas:

🔹 FEATURES_ADVANCED:
 total_registros  clientes_unicos  productos_unicos  periodo_min  periodo_max  registros_reales  registros_sinteticos
        17021654              597              1188       201701       201912         2938370.0            14083284.0

🔹 FEATURES_ADICIONALES:
 total_registros  clientes_unicos  productos_unicos  promedio_productos_por_cliente  promedio_clientes_por_producto  registros_producto_principal  registros_cliente_principal
        17021654              597              1188                       873.55074                      546.963782                         19555                        31229

🔍 Ejemplos de las nuevas features:

📋 Muestra de features creadas:
Empty DataFrame
Columns: [customer_id, product_id, periodo, tn, tn_lag_1m, delta_tn_1m, ma_tn_6m, fill_rate]
Index: []

🎉 Feature Engineering completado exitosamente!
⏰ Hora de finalización: 2025-06-17 18:35:25.220965

📝 Tablas creadas:
   • features_advanced: