**Grupo AutoML2**


---


Integrantes:

*   Falcones, Johanna
*   Orduz, Monica
*   Rodriguez, José Antonio






In [None]:
!pip install pycaret

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools
from pycaret.regression import *
import gzip
import plotly.express as px

import warnings
warnings.filterwarnings('ignore')

# Load Dataset

In [3]:
def fill_moving_average(df, col, window=2):
    df = df.copy()
    for i in range(len(df)):
        if pd.isnull(df.loc[i, col]):
            # Calcular la media móvil de las dos ventas más cercanas
            if i == 0:
                df.loc[i, col] = 0
            elif i == 1:
                df.loc[i, col] = df.loc[:i-1, col].mean()
            else:
                df.loc[i, col] = df.loc[:i-1, col].rolling(window=window, min_periods=1).mean().iloc[-1]
    return df

In [4]:
# Especifica el nombre del archivo que has subido
filename = 'sell-in.txt.gz'


# Abre el archivo en modo lectura ('r')
with open('productos_a_predecir.txt', 'rt') as ff:
    df_productos = pd.read_csv(ff, delimiter='\t')

# Abre y lee el archivo gzip
with gzip.open(filename, 'rt') as f:
    df = pd.read_csv(f, delimiter='\t')  # Ajusta el delimitador según tu archivo

filtered_df = df[df['product_id'].isin(df_productos['product_id'])]
grouped_df = filtered_df.groupby(['periodo', 'product_id']).agg({
    'plan_precios_cuidados': 'sum',
    'cust_request_qty': 'sum',
    'cust_request_tn': 'sum',
    'tn': 'sum'
}).reset_index()


todos_periodos = pd.DataFrame({'periodo': grouped_df["periodo"].unique()})
productos = df_productos
ventas = grouped_df

# Crear todas las combinaciones posibles de periodos y productos
combinaciones = pd.DataFrame(list(itertools.product(todos_periodos['periodo'], productos['product_id'])), columns=['periodo', 'product_id'])

# Merge con el DataFrame de ventas
df_completo = combinaciones.merge(ventas, on=['periodo', 'product_id'], how='left')
df_completo = fill_moving_average(df_completo, 'tn')

df_completo['periodo']=pd.to_datetime(df_completo['periodo'].astype(str), format='%Y%m')

# Filtrar solo las columnas necesarias
data = df_completo[['periodo', 'product_id', 'tn']]

# Pycaret

In [5]:
# Crear un DataFrame con las fechas futuras y los productos
future_dates = pd.to_datetime(['2020-01-01', '2020-02-01'])
unique_product_ids = data['product_id'].unique()

# Generar las predicciones para cada producto
future_df = pd.DataFrame({
    'periodo': future_dates.repeat(len(unique_product_ids)),
    'product_id': list(unique_product_ids) * len(future_dates)
})


# Configurar el entorno
regressor = setup(data=data,
                  target='tn',
                  normalize = True,
                  normalize_method = 'zscore',
                  fold=3,
                  session_id=123)

# Entrenar el mejor modelo
best_model = compare_models()

# Realizar predicciones
predictions = predict_model(best_model, data=future_df)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,tn
2,Target type,Regression
3,Original data shape,"(28080, 3)"
4,Transformed data shape,"(28080, 5)"
5,Transformed train set shape,"(19656, 5)"
6,Transformed test set shape,"(8424, 5)"
7,Numeric features,1
8,Date features,1
9,Preprocess,True


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
rf,Random Forest Regressor,9.5301,879.1903,29.6455,0.925,0.3703,0.9709,2.94
gbr,Gradient Boosting Regressor,10.2673,959.4137,30.9723,0.9182,0.4666,2.0857,0.97
et,Extra Trees Regressor,9.8888,972.0674,31.1644,0.9171,0.39,0.9962,1.7433
lightgbm,Light Gradient Boosting Machine,10.6885,1318.0385,36.2426,0.8875,0.4129,1.4348,0.9833
ada,AdaBoost Regressor,17.5108,1511.7663,38.8795,0.871,1.222,16.7479,0.2967
dt,Decision Tree Regressor,12.3061,1513.5945,38.8591,0.8709,0.4793,1.0828,0.15
xgboost,Extreme Gradient Boosting,11.7077,1801.4821,42.3891,0.8464,0.4104,1.2919,0.3
knn,K Neighbors Regressor,12.0611,2079.9017,45.5893,0.8225,0.4288,1.136,0.11
lar,Least Angle Regression,47.3302,9253.1733,96.1925,0.2105,1.7818,57.7701,0.0633
br,Bayesian Ridge,47.3118,9253.1754,96.1925,0.2105,1.7811,57.6873,0.07


Processing:   0%|          | 0/81 [00:00<?, ?it/s]

Unnamed: 0,periodo,product_id,prediction_label
0,2020-01-01,20001,1240.887330
1,2020-01-01,20002,1146.660759
2,2020-01-01,20003,955.124370
3,2020-01-01,20004,499.190195
4,2020-01-01,20005,497.894102
...,...,...,...
1555,2020-02-01,21263,0.054950
1556,2020-02-01,21265,0.061799
1557,2020-02-01,21266,0.057610
1558,2020-02-01,21267,0.079779


In [6]:
predictions.rename(columns={'prediction_label': 'tn'}, inplace=True)
predictions = predictions[predictions['periodo'] == '2020-02-01']
predictions['tn'] = predictions['tn'].apply(lambda x: 0 if x < 0 else x)
predictions.drop(columns=['periodo'], inplace=True)

# Guardar el DataFrame en un archivo CSV con separador de coma y punto decimal
predictions.to_csv('kaggle.csv', sep=',', decimal='.', index=False)