In [1]:
import os, sys

# Put the repository root (the parent of the current working directory) on the
# import path so that the `src.*` modules imported in this cell can be resolved.
repo_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.append(repo_root)

import pandas as pd
import numpy as np

from src.utils import log
from src.config import DATA_PATH, OUTPUT_PATH
from src.data_io import map_files, weekly_aggregate_2022_stream
from src.features import jan_2023_weeks
from src.baselines import naive4_forecast, ewma_forecast
from src.metrics import mae, rmse, mape, wmape

In [3]:
# Classify the files found under DATA_PATH; the transactions file is mandatory.
mapping = map_files(DATA_PATH)
assert 'transacoes' in mapping, "Não encontrei o parquet de transações em data/."

log("Agregando 2022 por semana (streaming)…")
# Run the streaming weekly aggregation on the transactions file, then order the
# rows by pdv, produto, ISO year and ISO week with a fresh 0..n-1 index.
wk = (
    weekly_aggregate_2022_stream(mapping['transacoes'])
    .sort_values(['pdv', 'produto', 'ano_iso', 'semana_iso'])
    .reset_index(drop=True)
)
wk.head(), wk.shape

[2025-09-10 17:48:42] Arquivo detectado: part-00000-tid-2779033056155408584-f6316110-4c9a-4061-ae48-69b77c7c8c36-4-1-c000.snappy.parquet  =>  dim_pdv
[2025-09-10 17:48:42] Arquivo detectado: part-00000-tid-5196563791502273604-c90d3a24-52f2-4955-b4ec-fb143aae74d8-4-1-c000.snappy.parquet  =>  transacoes
[2025-09-10 17:48:42] Arquivo detectado: part-00000-tid-6364321654468257203-dc13a5d6-36ae-48c6-a018-37d8cfe34cf6-263-1-c000.snappy.parquet  =>  desconhecido
[2025-09-10 17:48:42] Agregando 2022 por semana (streaming)…


(                   pdv              produto  ano_iso  semana_iso  quantidade  \
 0  1000237487041964405  1837429607327399565     2022           6         1.0   
 1  1000237487041964405  1837429607327399565     2022           7         2.0   
 2  1000237487041964405  1837429607327399565     2022          21         1.0   
 3  1000237487041964405  1837429607327399565     2022          25         2.0   
 4  1000237487041964405  1837429607327399565     2022          28         2.0   
 
    faturamento  
 0    32.920242  
 1    65.840485  
 2    36.120243  
 3    72.240486  
 4    72.240486  ,
 (6247301, 6))

In [5]:
def unique_weeks_2022(wk):
    """Return the distinct (ano_iso, semana_iso) pairs of ``wk`` in ascending order.

    No filtering by year is applied: ISO weeks at the start or end of the
    calendar year may belong to a neighbouring ISO year, and such pairs are kept
    in their natural (year, week) order.

    Parameters
    ----------
    wk : pandas.DataFrame
        Frame with at least the columns ``ano_iso`` and ``semana_iso``.

    Returns
    -------
    list of tuple
        One ``(ano_iso, semana_iso)`` tuple per distinct week, sorted by year
        and then by week.
    """
    week_cols = ['ano_iso', 'semana_iso']
    distinct = wk[week_cols].drop_duplicates()
    ordered = distinct.sort_values(week_cols)
    return [tuple(row) for row in ordered.itertuples(index=False, name=None)]

def make_calendar_from_weeks(test_weeks):
    """Build a forecast calendar that numbers the given weeks 1..n.

    Parameters
    ----------
    test_weeks : iterable of (ano_iso, semana_iso) pairs
        Weeks to forecast, in any order.

    Returns
    -------
    pandas.DataFrame
        Columns ``ano_iso``, ``semana_iso`` and ``semana``; rows are sorted by
        (year, week) and ``semana`` is the 1-based position within that order.
        The row index keeps the labels of the unsorted input.
    """
    week_cols = ['ano_iso', 'semana_iso']
    calendar = pd.DataFrame(test_weeks, columns=week_cols)
    calendar = calendar.sort_values(week_cols)
    # The array is assigned positionally, so the numbering follows the sorted order.
    calendar['semana'] = np.arange(1, len(calendar) + 1, dtype=int)
    return calendar[week_cols + ['semana']]

def filter_weeks(df, weeks_set):
    """Return the rows of ``df`` whose (ano_iso, semana_iso) pair is in ``weeks_set``.

    The membership test (``pair in weeks_set``) is evaluated once per distinct
    week present in ``df`` rather than once per row, and the row mask is then
    built with a vectorized ``MultiIndex.isin``. This avoids materialising a
    Python tuple for every row of a multi-million-row frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``ano_iso`` and ``semana_iso``.
    weeks_set : container of (ano_iso, semana_iso) tuples
        Any object supporting ``in`` (typically a ``set``).

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``df``, in their original order and with their
        original index labels. Empty when nothing matches.
    """
    week_cols = ['ano_iso', 'semana_iso']
    keys = df[week_cols]

    # Decide membership on the distinct weeks only.
    present = [
        pair
        for pair in keys.drop_duplicates().itertuples(index=False, name=None)
        if pair in weeks_set
    ]
    if not present:
        # Nothing to keep (empty frame, empty container or no overlap).
        return df.loc[np.zeros(len(df), dtype=bool)]

    # `present` was read from `df` itself, so its values match the key columns exactly.
    mask = pd.MultiIndex.from_frame(keys).isin(present)
    return df.loc[mask]

def rolling_origins(weeks_list, horizon=4, n_folds=6, min_train_weeks=16):
    """Build expanding-window (train, test) week splits for rolling-origin evaluation.

    Candidate origins are the positions ``min_train_weeks`` ..
    ``len(weeks_list) - horizon - 1``; up to ``n_folds`` of them are picked,
    evenly spaced. For an origin ``o`` the training weeks are
    ``weeks_list[:o + 1]`` and the test weeks are the (at most) ``horizon``
    weeks that follow.

    Parameters
    ----------
    weeks_list : list
        Chronologically ordered weeks (e.g. ``(ano_iso, semana_iso)`` tuples).
    horizon : int
        Number of weeks in each test window.
    n_folds : int
        Maximum number of folds to produce.
    min_train_weeks : int
        Position of the first candidate origin.

    Returns
    -------
    list of (set, list)
        ``(train_weeks, test_weeks)`` per fold. Folds whose test window would be
        empty (history too short for the requested origin) are omitted, so the
        result may be empty.
    """
    first_origin = min_train_weeks
    # The last origin must leave `horizon` weeks after it; always keep at least
    # one candidate so that linspace receives a valid range.
    stop = max(len(weeks_list) - horizon, first_origin + 1)
    n_origins = min(n_folds, max(1, stop - first_origin))
    origins = np.linspace(first_origin, stop - 1, num=n_origins, dtype=int)

    folds = []
    for origin in origins:
        cut = int(origin) + 1
        test_weeks = weeks_list[cut:cut + horizon]
        if len(test_weeks) == 0:
            # No week left after this origin: a fold without test data cannot be scored.
            continue
        folds.append((set(weeks_list[:cut]), test_weeks))
    return folds

def _score_fold(y_true, pred, fold, name):
    """Join one model's forecast onto a fold's observed demand and compute its error metrics."""
    dfm = y_true.merge(pred, on=['semana','pdv','produto'], how='left')
    # Observed rows without a matching forecast are scored as a zero prediction.
    # NOTE(review): astype(int) truncates fractional forecasts toward zero —
    # confirm that the forecasters already return integral quantities.
    dfm['quantidade'] = dfm['quantidade'].fillna(0).astype(int)
    return {
        'fold': fold, 'model': name,
        'wmape': wmape(dfm['y'], dfm['quantidade']),
        'mape':  mape(dfm['y'], dfm['quantidade']),
        'mae':   mae(dfm['y'], dfm['quantidade']),
        'rmse':  rmse(dfm['y'], dfm['quantidade']),
        'n_rows': len(dfm)
    }


def evaluate_naive_ewma(wk, horizon=4, n_folds=6, alpha=0.5, min_train_weeks=16):
    """Score the naive-4 and EWMA baselines with rolling-origin folds.

    For every fold produced by ``rolling_origins`` the weekly data is split into
    training and test weeks, both baselines forecast the test calendar from the
    training rows, and the forecasts are compared with the observed quantities.

    Only (semana, pdv, produto) rows that are present in the test data are
    scored; forecasts for combinations absent from the test rows do not enter
    the metrics.

    Parameters
    ----------
    wk : pandas.DataFrame
        Weekly data with the columns ``ano_iso``, ``semana_iso``, ``pdv``,
        ``produto`` and ``quantidade``.
    horizon : int
        Number of test weeks per fold.
    n_folds : int
        Maximum number of folds.
    alpha : float
        Smoothing factor forwarded to ``ewma_forecast``.
    min_train_weeks : int
        Position of the first candidate origin, forwarded to ``rolling_origins``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``res`` with one row per fold and model, and ``res_overall`` with the
        per-model mean of each metric (``n_rows`` summed), sorted by ``wmape``
        ascending.

    Raises
    ------
    ValueError
        If no fold can be built from the weeks available in ``wk``.
    """
    weeks = unique_weeks_2022(wk)
    folds = rolling_origins(weeks, horizon=horizon, n_folds=n_folds,
                            min_train_weeks=min_train_weeks)
    if not folds:
        # Without this guard the groupby below fails with an opaque KeyError.
        raise ValueError(
            f"No evaluation folds could be built from {len(weeks)} weeks "
            f"(horizon={horizon}, n_folds={n_folds}, min_train_weeks={min_train_weeks})."
        )

    rows = []
    for i, (train_set, test_list) in enumerate(folds, 1):
        log(f"Fold {i}/{len(folds)} | teste={test_list}")
        train_wk = filter_weeks(wk, train_set)
        test_wk  = filter_weeks(wk, set(test_list))
        cal = make_calendar_from_weeks(test_list)

        pred_naive = naive4_forecast(train_wk, cal)
        pred_ewma  = ewma_forecast(train_wk, cal, alpha=alpha)

        # Observed demand of the test weeks, keyed by the calendar's 1..n week number.
        y_true = (test_wk.merge(cal, on=['ano_iso','semana_iso'], how='inner')
                        [['semana','pdv','produto','quantidade']]
                        .rename(columns={'quantidade':'y'}))

        rows.append(_score_fold(y_true, pred_naive, i, "naive4"))
        rows.append(_score_fold(y_true, pred_ewma, i, f"ewma_a{alpha}"))

    res = pd.DataFrame(rows)
    res_overall = (res.groupby('model', as_index=False)
                     .agg({'wmape':'mean','mape':'mean','mae':'mean','rmse':'mean','n_rows':'sum'})
                     .sort_values('wmape'))
    return res, res_overall

# Evaluate both baselines on up to 6 rolling folds of 4 test weeks each, then
# display the per-model averages (already sorted by WMAPE by the function).
res_folds, res_overall = evaluate_naive_ewma(wk, horizon=4, n_folds=6, alpha=0.5)
res_overall

[2025-09-10 17:50:54] Fold 1/6 | teste=[(2022, 17), (2022, 18), (2022, 19), (2022, 20)]
[2025-09-10 18:00:46] Fold 2/6 | teste=[(2022, 23), (2022, 24), (2022, 25), (2022, 26)]
[2025-09-10 18:12:21] Fold 3/6 | teste=[(2022, 29), (2022, 30), (2022, 31), (2022, 32)]
[2025-09-10 18:23:43] Fold 4/6 | teste=[(2022, 36), (2022, 37), (2022, 38), (2022, 39)]
[2025-09-10 18:36:08] Fold 5/6 | teste=[(2022, 42), (2022, 43), (2022, 44), (2022, 45)]
[2025-09-10 18:54:40] Fold 6/6 | teste=[(2022, 49), (2022, 50), (2022, 51), (2022, 52)]


Unnamed: 0,model,wmape,mape,mae,rmse,n_rows
0,ewma_a0.5,1.410378,0.857108,12.521502,87.765507,3225397
1,naive4,1.440827,0.867919,12.679144,90.096507,3225397


In [6]:
# Forecast calendar from jan_2023_weeks: 4 weeks (switch to 5 if needed).
cal = jan_2023_weeks(n_weeks=4)
pred = naive4_forecast(wk, cal)

# Write the naive-4 forecast as a ';'-separated UTF-8 CSV under <repo_root>/output.
out_path = os.path.join(repo_root, "output", "predictions_naive4.csv")
os.makedirs(os.path.dirname(out_path), exist_ok=True)
pred.to_csv(out_path, sep=";", index=False, encoding="utf-8")
out_path, pred.shape, pred.head()

('C:\\Users\\kaiog\\OneDrive\\Área de Trabalho\\Datathon Big Data 2025\\output\\predictions_naive4.csv',
 (4177240, 4),
    semana                  pdv              produto  quantidade
 0       1  1000237487041964405  1837429607327399565           2
 1       2  1000237487041964405  1837429607327399565           2
 2       3  1000237487041964405  1837429607327399565           2
 3       4  1000237487041964405  1837429607327399565           2
 4       1  1000237487041964405  4038588102284338370           1)

In [7]:
# Forecast calendar from jan_2023_weeks: 4 weeks (switch to 5 if needed).
cal = jan_2023_weeks(n_weeks=4)
# EWMA baseline with alpha=0.5 (try 0.3 or 0.7).
pred = ewma_forecast(wk, cal, alpha=0.5)

# Write the EWMA forecast as a ';'-separated UTF-8 CSV under <repo_root>/output.
out_path = os.path.join(repo_root, "output", "predictions_ewma.csv")
os.makedirs(os.path.dirname(out_path), exist_ok=True)
pred.to_csv(out_path, sep=";", index=False, encoding="utf-8")
out_path, pred.shape, pred.head()

('C:\\Users\\kaiog\\OneDrive\\Área de Trabalho\\Datathon Big Data 2025\\output\\predictions_ewma.csv',
 (4177240, 4),
    semana                  pdv              produto  quantidade
 0       1  1000237487041964405  1837429607327399565           2
 1       2  1000237487041964405  1837429607327399565           2
 2       3  1000237487041964405  1837429607327399565           2
 3       4  1000237487041964405  1837429607327399565           2
 4       1  1000237487041964405  4038588102284338370           1)

In [8]:
from src.validate_submission import validate_submission

# Validate the submission file at `out_path`, i.e. whichever export cell ran
# last; a file written by an earlier cell is not checked here.
# NOTE(review): `ok` is never inspected — presumably a success flag; consider
# failing the notebook when it is falsy instead of only printing `msg`.
ok, msg = validate_submission(out_path, max_weeks=4, sep=";")
print(msg)

OK: 4177240 linhas | semanas 1..4 | PDVs 15086 | Produtos 7092.

Amostra:
         semana                  pdv              produto  quantidade
4154999       4   966615364041998304  7799275059591320830           8
4169475       4   990181515273433855   379454763001947803          48
3652751       4  8309229924396980786  7891860100939313524           4
1780055       4   459097202938988684  1441611406423199764           0
2423110       3  5890345310509258153   945870105710456853           1
