In [1]:
import pandas as pd
import pmdarima as pm
from datetime import datetime
from pmdarima import model_selection
from pmdarima import auto_arima

In [2]:
# Lee el archivo como un DataFrame
df  = pd.read_csv('data/sell-in.csv', delimiter='\t')
df_pid_validos  = pd.read_csv('data/productos_a_predecir.txt')

# Ajustar el ancho máximo de las columnas
pd.set_option('display.max_colwidth', None)

# Ajustar el ancho máximo de la visualización
pd.set_option('display.width', 1000)

# Muestra las primeras filas del DataFrame
print(df.head())
print(df_pid_validos.head())

   periodo  customer_id  product_id  plan_precios_cuidados  cust_request_qty  cust_request_tn       tn
0   201701        10234       20524                      0                 2          0.05300  0.05300
1   201701        10032       20524                      0                 1          0.13628  0.13628
2   201701        10217       20524                      0                 1          0.03028  0.03028
3   201701        10125       20524                      0                 1          0.02271  0.02271
4   201701        10012       20524                      0                11          1.54452  1.54452
   product_id
0       20001
1       20002
2       20003
3       20004
4       20005


In [3]:
product_ids_validos = df_pid_validos['product_id'].tolist()

In [4]:
df.head()

Unnamed: 0,periodo,customer_id,product_id,plan_precios_cuidados,cust_request_qty,cust_request_tn,tn
0,201701,10234,20524,0,2,0.053,0.053
1,201701,10032,20524,0,1,0.13628,0.13628
2,201701,10217,20524,0,1,0.03028,0.03028
3,201701,10125,20524,0,1,0.02271,0.02271
4,201701,10012,20524,0,11,1.54452,1.54452


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2945818 entries, 0 to 2945817
Data columns (total 7 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   periodo                int64  
 1   customer_id            int64  
 2   product_id             int64  
 3   plan_precios_cuidados  int64  
 4   cust_request_qty       int64  
 5   cust_request_tn        float64
 6   tn                     float64
dtypes: float64(2), int64(5)
memory usage: 157.3 MB


In [6]:
# Filtrar el DataFrame original para quedarse solo con los product_id válidos
data = df[df['product_id'].isin(product_ids_validos)]

In [7]:
data.head()

Unnamed: 0,periodo,customer_id,product_id,plan_precios_cuidados,cust_request_qty,cust_request_tn,tn
0,201701,10234,20524,0,2,0.053,0.053
1,201701,10032,20524,0,1,0.13628,0.13628
2,201701,10217,20524,0,1,0.03028,0.03028
3,201701,10125,20524,0,1,0.02271,0.02271
4,201701,10012,20524,0,11,1.54452,1.54452


In [10]:
# Suponiendo que tu DataFrame se llama df
data.set_index('periodo', inplace=True)

KeyError: "None of ['periodo'] are in the columns"

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2293481 entries, 0 to 2945817
Data columns (total 7 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   periodo                int64  
 1   customer_id            int64  
 2   product_id             int64  
 3   plan_precios_cuidados  int64  
 4   cust_request_qty       int64  
 5   cust_request_tn        float64
 6   tn                     float64
dtypes: float64(2), int64(5)
memory usage: 140.0 MB


In [15]:
# Función para realizar la predicción con auto_arima
def predict_with_auto_arima(df):
    # Filtrar los datos para cada product_id
    results = []
    product_ids = df['product_id'].unique()
    
    for product_id in product_ids:
        product_data = df[df['product_id'] == product_id]
        product_data = product_data.sort_values(by='periodo')
        
        # Asegurarse de que los datos están en el formato correcto
        y = product_data['tn'].values
        
        # Manejar casos con datos insuficientes para la predicción
        if len(y) < 2:
            print(f"Datos insuficientes para el producto {product_id}, se omite la predicción.")
            results.append((product_id, np.nan))  # Asignar NaN si los datos son insuficientes
            continue
        
        # Ajustar el modelo
        model = auto_arima(y, seasonal=False, error_action='ignore', suppress_warnings=True)
        
        # Realizar la predicción
        forecast = model.predict(n_periods=1)
        results.append((product_id, forecast[0]))
    
    return results

In [16]:
# Convertir la columna 'periodo' a datetime utilizando .loc para evitar SettingWithCopyWarning
data.loc[:, 'periodo'] = pd.to_datetime(data['periodo'], format='%Y%m', errors='coerce')

# Realizar las predicciones
predictions = predict_with_auto_arima(data)

# Crear un DataFrame con los resultados
output_df = pd.DataFrame(predictions, columns=['product_id', 'tn'])

# Guardar los resultados en un archivo CSV
output_df.to_csv('salida_autoarima_v5.csv', index=False)
print("Predicciones guardadas en salida_autoarima_v5.csv")

['2017-01-01 00:00:00', '2017-01-01 00:00:00', '2017-01-01 00:00:00', '2017-01-01 00:00:00', '2017-01-01 00:00:00', '2017-01-01 00:00:00', '2017-01-01 00:00:00', '2017-01-01 00:00:00', '2017-01-01 00:00:00', '2017-01-01 00:00:00',
 ...
 '2019-12-01 00:00:00', '2019-12-01 00:00:00', '2019-12-01 00:00:00', '2019-12-01 00:00:00', '2019-12-01 00:00:00', '2019-12-01 00:00:00', '2019-12-01 00:00:00', '2019-12-01 00:00:00', '2019-12-01 00:00:00', '2019-12-01 00:00:00']
Length: 2293481, dtype: datetime64[ns]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  data.loc[:, 'periodo'] = pd.to_datetime(data['periodo'], format='%Y%m', errors='coerce')


Predicciones guardadas en salida_autoarima_v5.csv
