In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import lightgbm as lgb
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from tqdm import tqdm
from sklearn.exceptions import UndefinedMetricWarning
import warnings
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor, VotingRegressor

# Desactivar los warnings UndefinedMetricWarning para r2_score
warnings.filterwarnings(action='ignore', category=UndefinedMetricWarning)

In [2]:
def preparar_datos(df, tb_productos, lags=12):
    # Convertir el periodo a formato datetime
    df['periodo'] = pd.to_datetime(df['periodo'], format='%Y%m')

    # Agregar los datos por periodo y product_id para obtener la serie temporal
    ts = df.groupby(['periodo', 'product_id'])['tn'].sum().reset_index()

    # Unir las categorías de productos desde el archivo tb_productos
    ts = ts.merge(tb_productos[['product_id', 'cat1', 'cat2', 'cat3','brand','descripcion','sku_size']], on='product_id', how='left')

    # Convertir las columnas de categoría a tipo 'category'
    ts['cat1'] = ts['cat1'].astype('category')
    ts['cat2'] = ts['cat2'].astype('category')
    ts['cat3'] = ts['cat3'].astype('category')
    ts['brand'] = ts['brand'].astype('category')
    ts['descripcion'] = ts['descripcion'].astype('category')
    ts['sku_size'] = ts['sku_size'].astype('category')
    
    # Crear características adicionales
    ts['crisis'] = (ts['periodo'].dt.year == 2019) & (ts['periodo'].dt.month == 8)
    ts['quarter'] = ts['periodo'].dt.quarter
    ts['month'] = ts['periodo'].dt.month
    ts['year'] = ts['periodo'].dt.year
    ts['season'] = ts['periodo'].apply(lambda x: 1 if x.month in [6, 7, 8] else 0)
    ts['tn_diff'] = ts['tn'].diff()
    ts['rolling_mean'] = ts['tn'].rolling(window=3).mean()
    ts['interaction'] = ts['year'] * ts['month']

    # Normalización por producto
    ts['tn_norm'] = ts.groupby('product_id')['tn'].transform(lambda x: (x - x.mean()) / x.std())

    # Agregar lags a los datos
    for lag in range(1, lags + 1):
        ts[f'tn_lag_{lag}'] = ts.groupby('product_id')['tn'].shift(lag)

    return ts

In [3]:
def entrenar_modelo(ts, lags=12):
    # Calcular los pesos basados en ventas
    pesos_ventas = calcular_pesos(ts)

    # Crear conjunto de entrenamiento y objetivo
    X = ts[['product_id', 'cat1', 'cat2','cat3','brand','descripcion','sku_size','crisis', 'quarter', 'month','year','season','tn_diff','rolling_mean','interaction'] + [f'tn_lag_{lag}' for lag in range(1, lags + 1)] + ['tn_norm']]
    y = ts['tn'].shift(-2)

    # Eliminar las últimas 2 filas
    X = X.iloc[:-2]
    y = y.iloc[:-2]

    # Validación temporal en lugar de train_test_split
    tscv = TimeSeriesSplit(n_splits=5)
    X_train, X_test, y_train, y_test = None, None, None, None
    for train_index, test_index in tscv.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Codificar las características categóricas 'cat1', 'cat2', 'cat3'
    for col in ['cat1', 'cat2', 'cat3', 'brand','descripcion','sku_size']:
        X_train[col] = X_train[col].astype('category').cat.codes
        X_test[col] = X_test[col].astype('category').cat.codes

    # Obtener los pesos para el conjunto de entrenamiento
    pesos_entrenamiento = pesos_ventas.loc[X_train['product_id']].values

    # Definir el espacio de búsqueda de hiperparámetros para LightGBM
    param_dist = {
        'num_leaves': [31, 50, 70, 128],
        'max_depth': [-1, 10, 20, 30],
        'learning_rate': [0.01, 0.05, 0.1],
        'n_estimators': [100, 200, 500],
        'min_child_samples': [20, 30, 40],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }

    # Definir el modelo de LightGBM con RandomizedSearchCV
    lgb_model = lgb.LGBMRegressor(random_state=42)
    random_search = RandomizedSearchCV(lgb_model, param_distributions=param_dist, n_iter=100, cv=5, verbose=1, n_jobs=-1, random_state=42)
    random_search.fit(X_train, y_train, sample_weight=pesos_entrenamiento)

    print(f"Best parameters found: {random_search.best_params_}")

    # Crear y ajustar el modelo de Random Forest con pesos
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train, sample_weight=pesos_entrenamiento)

    # Crear y ajustar el modelo de XGBoost con pesos
    xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
    xgb_model.fit(X_train, y_train, sample_weight=pesos_entrenamiento)

    # Obtener el mejor modelo de LightGBM
    lgb_model = random_search.best_estimator_

    # Crear el modelo de ensemble con VotingRegressor
    ensemble_model = VotingRegressor(estimators=[
        ('lgb', lgb_model),
        ('rf', rf_model),
        ('xgb', xgb_model)
    ])

    # Ajustar el modelo de ensemble
    ensemble_model.fit(X_train, y_train, sample_weight=pesos_entrenamiento)

    # Predecir en el conjunto de prueba
    y_pred = ensemble_model.predict(X_test)

    # Calcular métricas de rendimiento
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Ensemble Model MSE: {mse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}")

    return ensemble_model

In [4]:
def calcular_pesos(ts):
    # Calcular el total de ventas por producto
    ventas_totales = ts.groupby('product_id')['tn'].sum()
    # Normalizar los pesos para que sumen 1
    pesos = ventas_totales / ventas_totales.sum()
    return pesos

In [5]:
def predecir_producto(ensemble_model, ts, product_ids, next_period='2020-02-01', lags=12):
    next_period = pd.Timestamp(next_period)
    results = []

    # Calcular los pesos basados en ventas
    pesos_ventas = calcular_pesos(ts)

    for product_id in tqdm(product_ids, desc="Predicting with ensemble model"):
        product_data = ts[ts['product_id'] == product_id].copy()
        if not product_data.empty:
            last_data = product_data.iloc[-1]

            # Convertir a categoría si es list-like
            try:
                cat1 = pd.Categorical(last_data['cat1'])
                cat2 = pd.Categorical(last_data['cat2'])
                cat3 = pd.Categorical(last_data['cat3'])
                brand = pd.Categorical(last_data['brand'])
                descripcion = pd.Categorical(last_data['descripcion'])
                sku_size = pd.Categorical(last_data['sku_size'])
            except TypeError:
                cat1, cat2, cat3, brand, descripcion, sku_size = None, None, None, None, None, None

            if cat1 is not None and cat2 is not None and cat3 is not None and brand is not None and descripcion is not None and sku_size is not None:
                # Construir datos para la predicción
                next_data = pd.DataFrame({
                    'product_id': [product_id],
                    'cat1': [cat1.codes[0] if len(cat1) > 0 else 0],
                    'cat2': [cat2.codes[0] if len(cat2) > 0 else 0],
                    'cat3': [cat3.codes[0] if len(cat3) > 0 else 0],
                    'brand': [brand.codes[0] if len(brand) > 0 else 0],
                    'descripcion': [descripcion.codes[0] if len(descripcion) > 0 else 0],
                    'sku_size': [sku_size.codes[0] if len(sku_size) > 0 else 0],
                    'crisis': [(next_period.year == 2019) & (next_period.month == 8)],
                    'quarter': [next_period.quarter],
                    'month': [next_period.month],
                    'year': [next_period.year],
                    'season': [1 if next_period.month in [6, 7, 8] else 0],
                    'tn_diff': [last_data['tn_diff']],
                    'rolling_mean': [last_data['rolling_mean']],
                    'interaction': [next_period.year * next_period.month],
                    **{f'tn_lag_{lag}': [last_data[f'tn_lag_{lag}']] if f'tn_lag_{lag}' in product_data.columns else [0] for lag in range(1, lags + 1)},
                    'tn_norm': [0]  # Ajustar tn_norm adecuadamente si es necesario
                })

                # Predecir usando el modelo de ensemble
                pred = ensemble_model.predict(next_data)
                # Obtener el peso basado en ventas para el producto
                peso_ventas = pesos_ventas.get(product_id, 0)
                results.append({'product_id': product_id, 'predicted_tn': pred[0] * peso_ventas})
            else:
                product_mean_tn = ts[ts['product_id'] == product_id]['tn'].mean()
                if not pd.isna(product_mean_tn):
                    results.append({'product_id': product_id, 'predicted_tn': product_mean_tn})
                else:
                    global_mean_tn = ts['tn'].mean()
                    results.append({'product_id': product_id, 'predicted_tn': global_mean_tn})
        else:
            product_mean_tn = ts[ts['product_id'] == product_id]['tn'].mean()
            if not pd.isna(product_mean_tn):
                results.append({'product_id': product_id, 'predicted_tn': product_mean_tn})
            else:
                global_mean_tn = ts['tn'].mean()
                results.append({'product_id': product_id, 'predicted_tn': global_mean_tn})

    return pd.DataFrame(results)

In [6]:
def evaluar_metricas(df, results_df, target_date='2019-12-01'):
    df['date'] = pd.to_datetime(df['periodo'], format='%Y%m')
    df_filtered = df[df['date'] == target_date]
    results_df_ajustado = results_df.groupby('product_id')['predicted_tn'].sum()

    metricas_por_producto = []
    numerador = 0
    denominator = 0

    for product_id in df_filtered['product_id'].unique():
        if product_id in results_df_ajustado.index:
            y_true = df_filtered.loc[df_filtered['product_id'] == product_id, 'tn'].values[0]
            y_pred = results_df_ajustado.loc[product_id]
            numerador += abs(y_true - y_pred)
            denominator += y_true
            rmse = np.sqrt(mean_squared_error([y_true], [y_pred]))
            mae = mean_absolute_error([y_true], [y_pred])
            r2 = r2_score([y_true], [y_pred]) if len([y_true]) > 1 and len([y_pred]) > 1 else float('nan')
            metricas_por_producto.append({
                'product_id': product_id,
                'rmse': rmse,
                'mae': mae,
                'r2': r2
            })

    if metricas_por_producto:
        avg_rmse = np.mean([m['rmse'] for m in metricas_por_producto])
        avg_mae = np.mean([m['mae'] for m in metricas_por_producto])
        avg_r2 = np.nanmean([m['r2'] for m in metricas_por_producto])
        metricas_por_producto.append({
            'product_id': 'average',
            'rmse': avg_rmse,
            'mae': avg_mae,
            'r2': avg_r2
        })

    metricas_df = pd.DataFrame(metricas_por_producto)
    metricaMultinacion = numerador / denominator
    print("Métrica multinacional", metricaMultinacion)
    print("metrica multinacional", metricaMultinacion)
    print("rmse: ", metricas_df['rmse'].mean())
    print("mae: ", metricas_df['mae'].mean())
    print("r2: ", metricas_df['r2'].mean())

    # Exportar métricas a un archivo CSV
    metricas_df.to_csv('metricas_por_producto.csv', index=False)
    print(f"Métricas por producto exportadas a 'metricas_por_producto.csv'")
    
    return metricas_df

Inicio codigo

In [7]:
# Cargar los datos
df = pd.read_csv('../../../sell-in.txt/sell-in.txt', sep='\t')
productosPredecir = pd.read_csv('C:/Users/Josvaldes/Documents/Maestria/Austral/2ano/Labo3/datasets/Proyecto/Labo3/Datasets/productos_a_predecir.txt', sep='\t')
tb_productos = pd.read_csv('c:/Users/Josvaldes/Documents/Maestria/Austral/2ano/Labo3/datasets/Proyecto/Labo3/Datasets/tb_productos_descripcion.txt', sep='\t')

In [8]:
ts = preparar_datos(df, tb_productos, lags=30)

Predicción sobre febrero 2020

In [9]:
productosPredecir = ts['product_id'].values
productosPredecir

array([20001, 20002, 20003, ..., 21267, 21271, 21276], dtype=int64)

In [10]:
ensemble_model = entrenar_modelo(ts, lags=3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[col] = X_train[col].astype('category').cat.codes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[col] = X_test[col].astype('category').cat.codes


Fitting 5 folds for each of 100 candidates, totalling 500 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004972 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2306
[LightGBM] [Info] Number of data points in the train set: 26035, number of used features: 18
[LightGBM] [Info] Start training from score 261.931920
Best parameters found: {'subsample': 1.0, 'num_leaves': 128, 'n_estimators': 500, 'min_child_samples': 20, 'max_depth': -1, 'learning_rate': 0.1, 'colsample_bytree': 0.8}
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002171 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2306
[LightGBM] [Info] Number of data points in the train set: 26035, number of used features: 18
[LightGBM] [Info] Start training from score 261.931920
Ensemble Model MSE: 3309.5480, MAE: 14.7492, R²: 0.6417


In [11]:
result_df = predecir_producto(ensemble_model, ts, productosPredecir, next_period='2020-02-01', lags=3)

Predicting with ensemble model: 100%|██████████| 31243/31243 [01:25<00:00, 363.45it/s]


In [12]:
print(result_df)

       product_id  predicted_tn
0           20001   1398.344322
1           20002   1009.368178
2           20003    889.004243
3           20004    671.615383
4           20005    644.200514
...           ...           ...
31238       21265      0.089541
31239       21266      0.094659
31240       21267      0.092835
31241       21271      0.026964
31242       21276      0.045447

[31243 rows x 2 columns]


In [13]:
productosPredecir = pd.read_csv('C:/Users/Josvaldes/Documents/Maestria/Austral/2ano/Labo3/datasets/Proyecto/Labo3/Datasets/productos_a_predecir.txt', sep='\t')

# Asegúrate de que la columna 'product_id' en ambos DataFrames sea del mismo tipo
result_df['product_id'] = result_df['product_id'].astype(int)
productosPredecir['product_id'] = productosPredecir['product_id'].astype(int)

# Realiza un merge para obtener solo los productos predichos que están en productosPredecir
predicted_products = pd.merge(productosPredecir, result_df, on='product_id', how='inner')

# Eliminar duplicados para asegurarse de tener un producto único
predicted_products = predicted_products.drop_duplicates(subset=['product_id'])

# Verifica el resultado
print(predicted_products)

       product_id  predicted_tn
0           20001   1398.344322
36          20002   1009.368178
72          20003    889.004243
108         20004    671.615383
144         20005    644.200514
...           ...           ...
22294       21263      0.089233
22309       21265      0.089541
22319       21266      0.094659
22329       21267      0.092835
22339       21276      0.045447

[780 rows x 2 columns]


In [14]:
predicted_products.to_csv('resultadosPredichos_8.csv', index=False)

Kaggle 0.317

Validación sobre diciembre 2019

In [15]:
ts = preparar_datos(df, tb_productos, lags=3)
ts = ts[ts['periodo'] < '2019-11-01']

In [16]:
# Convertir cada elemento a int64
#productoPrueba_int64 = [np.int64(x) for x in productosPredecir]

In [17]:
ensemble_model_dic23 = entrenar_modelo(ts, lags=3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[col] = X_train[col].astype('category').cat.codes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[col] = X_test[col].astype('category').cat.codes


Fitting 5 folds for each of 100 candidates, totalling 500 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002326 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2301
[LightGBM] [Info] Number of data points in the train set: 24480, number of used features: 18
[LightGBM] [Info] Start training from score 261.952202
Best parameters found: {'subsample': 1.0, 'num_leaves': 128, 'n_estimators': 500, 'min_child_samples': 20, 'max_depth': -1, 'learning_rate': 0.1, 'colsample_bytree': 0.8}
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002476 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2301
[LightGBM] [Info] Number of data points in the train set: 24480, number of used features: 18
[LightGBM] [Info] Start training from score 261.952202
Ensemble Model MSE: 3482.3428, MAE: 15.4231, R²: 0.6454


In [18]:
productosPredecir = ts['product_id'].values

In [19]:
result_df_dic23 = predecir_producto(ensemble_model_dic23, ts, productosPredecir, next_period='2019-12-01', lags=3)

Predicting with ensemble model: 100%|██████████| 29377/29377 [02:24<00:00, 203.96it/s]


In [20]:
result_df_dic23

Unnamed: 0,product_id,predicted_tn
0,20001,1395.245139
1,20002,994.893190
2,20003,887.157574
3,20004,671.067991
4,20005,646.795952
...,...,...
29372,21266,0.103531
29373,21267,0.109018
29374,21269,0.108780
29375,21271,0.028482


In [21]:
productosPredecir = pd.read_csv('C:/Users/Josvaldes/Documents/Maestria/Austral/2ano/Labo3/datasets/Proyecto/Labo3/Datasets/productos_a_predecir.txt', sep='\t')

In [22]:
productosPredecir

Unnamed: 0,product_id
0,20001
1,20002
2,20003
3,20004
4,20005
...,...
775,21263
776,21265
777,21266
778,21267


In [23]:
# Asegúrate de que la columna 'product_id' en ambos DataFrames sea del mismo tipo
result_df_dic23['product_id'] = result_df_dic23['product_id'].astype(int)
productosPredecir['product_id'] = productosPredecir['product_id'].astype(int)

# Realiza un merge para obtener solo los productos predichos que están en productosPredecir
predicted_products = pd.merge(productosPredecir, result_df_dic23, on='product_id', how='inner')

# Eliminar duplicados para asegurarse de tener un producto único
predicted_products = predicted_products.drop_duplicates(subset=['product_id'])

# Verifica el resultado
print(predicted_products)

       product_id  predicted_tn
0           20001   1395.245139
34          20002    994.893190
68          20003    887.157574
102         20004    671.067991
136         20005    646.795952
...           ...           ...
20744       21263      0.099487
20757       21265      0.097417
20765       21266      0.103531
20773       21267      0.109018
20781       21276      0.051518

[780 rows x 2 columns]


In [24]:
ts = preparar_datos(df, tb_productos, lags=3)

In [25]:
# Paso 1: Filtrar el DataFrame 'ts' para obtener los datos del período específico y los productos a predecir
filtered_df = ts[(ts['periodo'] == '2019-12-01') & (ts['product_id'].isin(productosPredecir['product_id']))]

# Paso 2: Agrupar los datos filtrados por 'product_id' y calcular la suma de 'tn' para cada producto
real_tn = filtered_df.groupby('product_id')['tn'].sum()

# Paso 3: Eliminar duplicados en 'result_df' para asegurar que cada producto aparezca una vez
result_df_unique = result_df.drop_duplicates(subset='product_id')

# Paso 4: Realizar un merge para asegurar que los 'product_id' coincidan en 'result_df' y 'real_tn'
result_df_unique = result_df_unique.merge(real_tn.rename('real_tn'), on='product_id', how='left')

# Paso 5: Calcular la métrica de la empresa por producto
result_df_unique['metricaempresa'] = abs(result_df_unique['real_tn'] - result_df_unique['predicted_tn']) / result_df_unique['real_tn']

# Paso 6: Filtrar 'result_df_unique' para obtener solo los productos que están en 'productosPredecir'
final_result_df = result_df_unique[result_df_unique['product_id'].isin(productosPredecir['product_id'])]

# Imprimir el resultado final
print(final_result_df)

      product_id  predicted_tn     real_tn  metricaempresa
0          20001   1398.344322  1504.68856        0.070675
1          20002   1009.368178  1087.30855        0.071682
2          20003    889.004243   892.50129        0.003918
3          20004    671.615383   637.90002        0.052854
4          20005    644.200514   593.24443        0.085894
...          ...           ...         ...             ...
1185       20962      3.915682     1.99182        0.965882
1186       20975      3.583990     1.69045        1.120140
1187       20995      3.365322     1.55285        1.167191
1188       21087      0.907423     1.02205        0.112154
1189       21214      0.411062     0.24428        0.682751

[780 rows x 4 columns]


In [26]:
metrica = evaluar_metricas(df, result_df, target_date='2019-12-01')

Métrica multinacional 5037.532761987415
metrica multinacional 5037.532761987415
rmse:  1291.3910324271842
mae:  1291.3910324271842
r2:  nan
Métricas por producto exportadas a 'metricas_por_producto.csv'


  avg_r2 = np.nanmean([m['r2'] for m in metricas_por_producto])


In [27]:
final_result_df.to_csv('validacionDic23_8.csv', index=False)

Segunda iteración con productos que tiene una metrica de la empresa superior al 10%

In [28]:
# Filtrar el DataFrame usando la función query
SegundaIteracion_result_df = final_result_df.query('metricaempresa > 0.20')

# Imprimir el resultado filtrado
print(SegundaIteracion_result_df)

      product_id  predicted_tn    real_tn  metricaempresa
5          20006    585.798891  417.23228        0.404011
6          20007    611.623676  390.43432        0.566521
7          20008    554.119264  195.36854        1.836277
9          20010    518.641088  359.59998        0.442272
11         20012    494.822204  173.13004        1.858096
...          ...           ...        ...             ...
1184       20703     12.166955    9.46570        0.285373
1185       20962      3.915682    1.99182        0.965882
1186       20975      3.583990    1.69045        1.120140
1187       20995      3.365322    1.55285        1.167191
1189       21214      0.411062    0.24428        0.682751

[601 rows x 4 columns]


In [29]:
SegundaIteracion_result_df.to_csv('productos2Iteracion.csv', index=False)

In [30]:
result_df

Unnamed: 0,product_id,predicted_tn
0,20001,1398.344322
1,20002,1009.368178
2,20003,889.004243
3,20004,671.615383
4,20005,644.200514
...,...,...
31238,21265,0.089541
31239,21266,0.094659
31240,21267,0.092835
31241,21271,0.026964


In [31]:
# Filtrar el DataFrame usando la función query
listadoProductosFinal_result_df_1 = final_result_df.query('metricaempresa < 0.20')

predicted_products = pd.read_csv('C:/Users/Josvaldes/Documents/Maestria/Austral/2ano/Labo3/datasets/Proyecto/Labo3/Predicciones/resultadosPredichos_8.csv', sep=',')

# Asegúrate de que la columna 'product_id' en ambos DataFrames sea del mismo tipo
predicted_products['product_id'] = predicted_products['product_id'].astype(int)
listadoProductosFinal_result_df_1['product_id'] = listadoProductosFinal_result_df_1['product_id'].astype(int)

# Realiza un merge para obtener solo los productos predichos que están en productosPredecir
listadoProductosFinal_result_df_1 = pd.merge(listadoProductosFinal_result_df_1, predicted_products, on='product_id', how='inner')

# Eliminar duplicados para asegurarse de tener un producto único
#listadoProductosFinal_result_df_1 = predicted_products.drop_duplicates(subset=['product_id'])

# Verifica el resultado
print(listadoProductosFinal_result_df_1)

     product_id  predicted_tn_x     real_tn  metricaempresa  predicted_tn_y
0         20001     1398.344322  1504.68856        0.070675     1398.344322
1         20002     1009.368178  1087.30855        0.071682     1009.368178
2         20003      889.004243   892.50129        0.003918      889.004243
3         20004      671.615383   637.90002        0.052854      671.615383
4         20005      644.200514   593.24443        0.085894      644.200514
..          ...             ...         ...             ...             ...
174       21058        1.643780     1.84115        0.107199        1.643780
175       21097        1.378714     1.34469        0.025302        1.378714
176       21110        1.391452     1.52502        0.087584        1.391452
177       21129        0.927806     0.78410        0.183275        0.927806
178       21087        0.907423     1.02205        0.112154        0.907423

[179 rows x 5 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listadoProductosFinal_result_df_1['product_id'] = listadoProductosFinal_result_df_1['product_id'].astype(int)


In [32]:
predicted_products

Unnamed: 0,product_id,predicted_tn
0,20001,1398.344322
1,20002,1009.368178
2,20003,889.004243
3,20004,671.615383
4,20005,644.200514
...,...,...
775,21263,0.089233
776,21265,0.089541
777,21266,0.094659
778,21267,0.092835


In [46]:
ts = preparar_datos(df, tb_productos, lags=3)

# Cargar los DataFrames desde los archivos CSV
SegundaIteracion_result_df = pd.read_csv('productos2Iteracion.csv')
resultados_predichos_df = pd.read_csv('resultadosPredichos_8.csv')

In [57]:
# Filtrar los productos que están en la segunda iteración
productos_ids_iteracion = SegundaIteracion_result_df['product_id'].unique()

# Crear una lista para almacenar los promedios de los últimos 12 meses
promedio_ultimos_12_meses = []

# Calcular el promedio de 'tn' para los últimos 12 meses para cada producto en la segunda iteración
for product_id in productos_ids_iteracion:
    # Filtrar los datos del producto específico para los últimos 12 meses
    filtered_data = ts[(ts['product_id'] == product_id) & 
                       (ts['periodo'] >= ts['periodo'].max() - pd.DateOffset(months=12))]
    
    # Calcular el promedio de 'tn' para los últimos 12 meses
    avg_tn_last_12_months = filtered_data['tn'].mean()
    
    # Añadir el resultado a la lista
    promedio_ultimos_12_meses.append((product_id, avg_tn_last_12_months))

# Crear un DataFrame con los promedios de los últimos 12 meses
promedios_df = pd.DataFrame(promedio_ultimos_12_meses, columns=['product_id', 'promedio_ultimos_12_meses'])

# Combinar los DataFrames basándose en 'product_id'
resultados_finales = pd.merge(resultados_predichos_df, promedios_df, on='product_id', how='inner')

# Calcular el promedio ponderado entre la predicción y el promedio de los últimos 12 meses
resultados_finales['promedio_ponderado'] = (resultados_finales['predicted_tn'] + resultados_finales['promedio_ultimos_12_meses']) / 2

# Mostrar los resultados finales
print(resultados_finales[['product_id', 'predicted_tn', 'promedio_ultimos_12_meses', 'promedio_ponderado']])

     product_id  predicted_tn  promedio_ultimos_12_meses  promedio_ponderado
0         20006    585.798891                 473.163365          529.481128
1         20007    611.623676                 428.575593          520.099635
2         20008    554.119264                 422.377476          488.248370
3         20010    518.641088                 418.455800          468.548444
4         20012    494.822204                 331.719096          413.270650
..          ...           ...                        ...                 ...
596       21263      0.089233                   0.032354            0.060794
597       21265      0.089541                   0.089541            0.089541
598       21266      0.094659                   0.094659            0.094659
599       21267      0.092835                   0.092835            0.092835
600       21276      0.045447                   0.045447            0.045447

[601 rows x 4 columns]


In [60]:
resultados_predichos_df.head()

Unnamed: 0,product_id,predicted_tn
0,20001,1398.344322
1,20002,1009.368178
2,20003,889.004243
3,20004,671.615383
4,20005,644.200514


In [62]:
# Crear una copia del DataFrame de resultados predichos
resultados_predichos_actualizados = resultados_predichos_df[['product_id', 'predicted_tn']].copy()

# Combinar el DataFrame de resultados predichos con los promedios ponderados basándose en 'product_id'
resultados_ajustados = pd.merge(resultados_predichos_actualizados, 
                                resultados_finales[['product_id', 'promedio_ponderado']], 
                                on='product_id', 
                                how='left')

# Rellenar los valores NaN en la columna 'promedio_ponderado' con los valores originales de 'predicted_tn'
resultados_ajustados['predicted_tn'] = resultados_ajustados['promedio_ponderado'].combine_first(resultados_ajustados['predicted_tn'])

# Seleccionar solo las columnas deseadas
resultados_finales_actualizados = resultados_ajustados[['product_id', 'predicted_tn']]

# Mostrar el DataFrame final en la estructura deseada
print(resultados_finales_actualizados)

     product_id  predicted_tn
0         20001   1398.344322
1         20002   1009.368178
2         20003    889.004243
3         20004    671.615383
4         20005    644.200514
..          ...           ...
775       21263      0.060794
776       21265      0.089541
777       21266      0.094659
778       21267      0.092835
779       21276      0.045447

[780 rows x 2 columns]


In [63]:
resultados_finales_actualizados.to_csv('resultadosPredichos_8_Ajustado.csv', index=False)

Kaggle 0.287