In [37]:
#Librerias 
import pandas as pd 
import sys
import yaml
import pickle
from sklearn.ensemble import RandomForestRegressor
pd.options.display.float_format = '{:.2f}'.format
import json
import yaml
import os
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np 

In [14]:
def leer_data(input_path):
    """Load the Excel workbook at ``input_path`` into a DataFrame.

    The 'Precio Unitario' column is renamed to 'Precio_Unitario' so the name
    contains no whitespace; every other column is returned as read.

    Args:
        input_path: Path (or any object accepted by ``pd.read_excel``) of the
            workbook to load.

    Returns:
        pandas.DataFrame with the renamed column.
    """
    column_aliases = {'Precio Unitario': 'Precio_Unitario'}
    raw = pd.read_excel(input_path)
    return raw.rename(columns=column_aliases)

In [26]:
def check_and_impute_dates(df, output_path, date_column='Fecha', params=None):
    """Make the daily series gap-free, add calendar features and save it as CSV.

    The frame is sorted by ``date_column`` and compared against a full daily
    calendar between its first and last date. Missing days are inserted as new
    rows. Afterwards -- whether or not any day was missing -- missing values
    are imputed, numeric columns are rounded, ``dia_semana`` / ``mes`` are
    derived from the date, and the result is written to ``output_path``.

    Args:
        df: Input frame. Must contain ``date_column`` plus the columns
            'Ventas', 'Promo' and 'Precio_Unitario'. It is not modified.
        output_path: Destination of the prepared CSV (written without index).
        date_column: Name of the column holding the dates.
        params: Optional mapping with an ``"imputation_strategy"`` entry that
            provides the keys ``"ventas"``, ``"promo"`` and ``"precio"``.
            When omitted, the ``"prepare"`` section of ``params.yaml`` in the
            current working directory is used.

    Returns:
        The prepared pandas.DataFrame (the same content that was written).

    Raises:
        KeyError: If a required config key or column is missing.
        ValueError: Propagated from pandas, e.g. when gaps have to be filled
            but ``date_column`` contains duplicated dates.
    """
    if params is None:
        # Context manager so the config file handle is always closed.
        with open("params.yaml", "r") as f:
            params = yaml.safe_load(f)["prepare"]
    impute = params["imputation_strategy"]

    # Work on a copy so the caller's frame is not mutated as a side effect.
    df = df.copy()
    df[date_column] = pd.to_datetime(df[date_column])
    df = df.sort_values(date_column)
    full_range = pd.date_range(start=df[date_column].min(),
                               end=df[date_column].max(),
                               freq='D')

    # The length comparison assumes at most one row per calendar day.
    if len(full_range) == len(df):
        print("Todas las fechas son consecutivas.")
    else:
        missing_days = len(full_range) - len(df)
        print(f" Se detectaron {missing_days} días faltantes. Iniciando imputación...")
        # rename_axis keeps the caller's column name after reset_index instead
        # of relying on the default 'index' label.
        df = (
            df.set_index(date_column)
            .reindex(full_range)
            .rename_axis(date_column)
            .reset_index()
        )

    # Everything below runs in both branches so the saved file always has the
    # same columns and cleaning, whether or not gaps were found.
    df['Ventas'] = df['Ventas'].fillna(impute["ventas"])
    df['Promo'] = df['Promo'].fillna(impute["promo"])

    # Price imputation is configurable: linear interpolation or zero fill.
    if impute["precio"] == "linear":
        df['Precio_Unitario'] = df['Precio_Unitario'].interpolate(method='linear')
    else:
        df['Precio_Unitario'] = df['Precio_Unitario'].fillna(0)

    df['Precio_Unitario'] = df['Precio_Unitario'].round(2)
    # Explicit 64-bit width: the platform default int can be 32-bit, which is
    # too narrow a margin for sales figures in the billions.
    df['Ventas'] = df['Ventas'].round(0).astype('int64')

    # Calendar features are derived from the configured date column.
    df['dia_semana'] = df[date_column].dt.dayofweek
    df['mes'] = df[date_column].dt.month

    df.to_csv(output_path, index=False)
    return df

In [27]:
# Load the raw sales workbook into `df`. Later cells use `df` unconditionally,
# so they rely on this guard being true (as it is when run as a notebook cell
# or as a script).
if __name__ == "__main__":
    df = leer_data('data/Data.xlsx')

### Analizaremos si las fechas presentan continuidad 


In [28]:
# Fill calendar gaps and write data/prepared_data.csv; `df` is rebound to the
# prepared frame, so re-running this cell feeds it its own output.
df = check_and_impute_dates(df,'data/prepared_data.csv','Fecha')

 Se detectaron 209 días faltantes. Iniciando imputación...


### Sí se presentaron faltantes en 209 días. Para imputarlos se tomaron los siguientes supuestos: si no hay registro de ventas, asumimos que fueron cero; si falta la promo, asumimos que no hubo promoción; para los precios se utiliza interpolación lineal.

In [29]:
# Preview the first rows of the prepared frame to sanity-check the imputation.
df.head()

Unnamed: 0,Fecha,Ventas,Precio_Unitario,Promo,dia_semana,mes
0,2023-01-03,1150338303,15136030.31,1.0,1,1
1,2023-01-04,572420892,15063707.67,1.0,2,1
2,2023-01-05,373414154,15558923.09,0.0,3,1
3,2023-01-06,1669568306,15177893.69,1.0,4,1
4,2023-01-07,0,15179816.58,0.0,5,1


In [31]:
def train():
    """Fit a random-forest regressor on the prepared data and pickle it.

    Inputs (all read from the current working directory):
        params.yaml            -- feature list under ``prepare.features``,
                                  hyper-parameters under ``train`` and the
                                  seed under ``base.random_state``.
        data/prepared_data.csv -- training table; 'Ventas' is the target.

    Output:
        model.pkl -- the fitted ``RandomForestRegressor``.

    Returns:
        None.
    """
    with open("params.yaml", "r") as cfg_file:
        settings = yaml.safe_load(cfg_file)

    prepared = pd.read_csv("data/prepared_data.csv")
    feature_cols = settings["prepare"]["features"]
    inputs = prepared[feature_cols]
    target = prepared['Ventas']

    hyper = settings["train"]
    forest = RandomForestRegressor(
        n_estimators=hyper["n_estimators"],
        max_depth=hyper["max_depth"],
        random_state=settings["base"]["random_state"],
    )
    # The model is fitted on every row of the prepared file (no hold-out here).
    forest.fit(inputs, target)

    with open("model.pkl", "wb") as model_file:
        pickle.dump(forest, model_file)


# Entry point: train when run as a script or notebook cell.
if __name__ == "__main__":
    train()

In [34]:
# Show a single prepared row as a quick reminder of the available columns.
df.head(1)

Unnamed: 0,Fecha,Ventas,Precio_Unitario,Promo,dia_semana,mes
0,2023-01-03,1150338303,15136030.31,1.0,1,1


In [38]:
def evaluate():
    """Score the pickled model on the prepared data and save the metrics.

    Reads ``params.yaml``, ``data/prepared_data.csv`` and ``model.pkl`` from
    the current working directory, predicts 'Ventas' from the columns listed
    under ``prepare.features`` and computes MAE, RMSE and R2. The scores are
    rounded to ``evaluate.round_decimals`` digits (4 when not configured),
    written to ``metrics/scores.json`` and printed.

    NOTE(review): train() in this notebook fits on the same CSV, so these
    look like in-sample scores -- confirm whether a hold-out split is intended.

    Returns:
        None.
    """
    with open("params.yaml", "r") as cfg_file:
        settings = yaml.safe_load(cfg_file)

    prepared = pd.read_csv("data/prepared_data.csv")
    # Unpickling can execute arbitrary code; model.pkl must be a trusted file.
    with open("model.pkl", "rb") as model_file:
        regressor = pickle.load(model_file)

    feature_cols = settings["prepare"]["features"]
    inputs = prepared[feature_cols]
    actual = prepared['Ventas']

    predicted = regressor.predict(inputs)

    ndigits = settings.get("evaluate", {}).get("round_decimals", 4)

    # Insertion order (mae, rmse, r2) is the key order of the JSON file.
    raw_scores = {
        "mae": mean_absolute_error(actual, predicted),
        "rmse": np.sqrt(mean_squared_error(actual, predicted)),
        "r2": r2_score(actual, predicted),
    }
    metrics = {
        label: round(float(score), ndigits)
        for label, score in raw_scores.items()
    }

    os.makedirs("metrics", exist_ok=True)
    with open("metrics/scores.json", "w") as out_file:
        json.dump(metrics, out_file, indent=4)

    print(f"Métricas calculadas con éxito: {metrics}")


# Entry point: evaluate when run as a script or notebook cell.
if __name__ == "__main__":
    evaluate()

Métricas calculadas con éxito: {'mae': 462038861.51, 'rmse': 685103496.259, 'r2': 0.7583}
