# 06_xgboost

## Importando bibliotecas

In [1]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import os
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, mean_absolute_error

import numpy as np


## Carregando os dados

In [2]:
# Dados
df_train = pd.read_csv('../data/raw/train.csv', parse_dates=['Date'])
# Feature Engineering Temporal no treino
df_train = df_train[df_train['Store'] == 1]
df_train = df_train[df_train['Sales'] > 0]
df_train.set_index('Date', inplace=True)
df_train = df_train.sort_index()

  df_train = pd.read_csv('../data/raw/train.csv', parse_dates=['Date'])


## Processamento

In [3]:
# Criar features
df_train['dayofweek'] = df_train.index.dayofweek
df_train['month'] = df_train.index.month
df_train['quarter'] = df_train.index.quarter
df_train['day'] = df_train.index.day
df_train['year'] = df_train.index.year
df_train['lag_1'] = df_train['Sales'].shift(1)
df_train['lag_7'] = df_train['Sales'].shift(7)
df_train['rolling_mean_7'] = df_train['Sales'].rolling(7).mean()
df_train['rolling_mean_30'] = df_train['Sales'].rolling(30).mean()
df_train.dropna(inplace=True)

In [4]:
# Mesmas features no teste
df_test = pd.read_csv('../data/raw/test.csv', parse_dates=['Date'])
df_test = df_test[df_test['Store'] == 1]
df_test.set_index('Date', inplace=True)
df_test = df_test.sort_index()
df_test['dayofweek'] = df_test.index.dayofweek
df_test['month'] = df_test.index.month
df_test['quarter'] = df_test.index.quarter
df_test['day'] = df_test.index.day
df_test['year'] = df_test.index.year
display(df_test.head(), df_test.shape)

Unnamed: 0_level_0,Id,Store,DayOfWeek,Open,Promo,StateHoliday,SchoolHoliday,dayofweek,month,quarter,day,year
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2015-08-01,40233,1,6,1.0,0,0,1,5,8,3,1,2015
2015-08-02,39377,1,7,0.0,0,0,1,6,8,3,2,2015
2015-08-03,38521,1,1,1.0,1,0,1,0,8,3,3,2015
2015-08-04,37665,1,2,1.0,1,0,1,1,8,3,4,2015
2015-08-05,36809,1,3,1.0,1,0,1,2,8,3,5,2015


(48, 12)

In [5]:
print(df_train['Sales'].dtype)
# print(df_test['Sales'].dtype)
#Force a conversão para numérico, ignorando erros:
df_train['Sales'] = pd.to_numeric(df_train['Sales'], errors='coerce')
# df_test['Sales'] = pd.to_numeric(df_test['Sales'], errors='coerce')

int64


In [6]:
# Criar coluna Sales no teste com NaN para permitir concatenação
df_test['Sales'] = pd.to_numeric(pd.NA, errors='coerce') 

# Concatenar séries mantendo índice temporal (supondo que índice seja a data)
combined = pd.concat([df_train[['Sales']], df_test[['Sales']]])

# # Criar lags e médias móveis na série concatenada
combined['lag_1'] = combined['Sales'].shift(1)
combined['lag_7'] = combined['Sales'].shift(7)
combined['rolling_mean_7'] = combined['Sales'].rolling(7).mean()
combined['rolling_mean_30'] = combined['Sales'].rolling(30).mean()
# combined.set_index('Date', inplace=True)
# display(df_test.head())
# display(combined.head())
# # "Puxar" as features para o df_test usando o índice (datas)
df_test = df_test.join(combined[['lag_1', 'lag_7', 'rolling_mean_7', 'rolling_mean_30']], rsuffix='_new')
# display(df_test.head())

# # Remover linhas do teste que ficaram com NaN por causa dos lags no início
# df_test.dropna(inplace=True)

# df_test.head()

## Modelagem

In [7]:
# Modelagem
features = ['dayofweek', 'month', 'quarter', 'day', 'year', 'lag_1', 'lag_7', 'rolling_mean_7', 'rolling_mean_30']
X_train = df_train[features]
y_train = df_train['Sales']
X_test = df_test[features]

model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100)
model.fit(X_train, y_train)

# Previsão
df_test['PredictedSales'] = model.predict(X_test)
display(df_test['PredictedSales'].head())

Date
2015-08-01    6590.632812
2015-08-02    7092.127441
2015-08-03    7758.769531
2015-08-04    7070.836426
2015-08-05    6946.858887
Name: PredictedSales, dtype: float32

In [8]:
# Previsão no conjunto de treino
y_pred_train = model.predict(X_train)
# Valores reais
y_true_train = y_train

# Avaliação do desempenho do modelo de previsão
rmse = np.sqrt(mean_squared_error(y_true_train, y_pred_train))
mae = mean_absolute_error(y_true_train, y_pred_train)
mape = mean_absolute_percentage_error(y_true_train, y_pred_train)

print(f"📉 RMSE (Root Mean Squared Error) - Treino: {rmse:.2f}")
print(f"📉 MAE  (Mean Absolute Error) - Treino: {mae:.2f}")
print(f"📉 MAPE (Mean Absolute Percentage Error) - Treino: {mape:.2%}")

📉 RMSE (Root Mean Squared Error) - Treino: 19.19
📉 MAE  (Mean Absolute Error) - Treino: 13.57
📉 MAPE (Mean Absolute Percentage Error) - Treino: 0.31%


In [9]:
# Cria a figura interativa
fig = go.Figure()

# Vendas reais no treino
fig.add_trace(go.Scatter(
    x=df_train.index,
    y=df_train['Sales'],
    mode='lines',
    name='Real (Treino)',
    line=dict(color='blue')
))

# Previsão no treino
fig.add_trace(go.Scatter(
    x=df_train.index,
    y=y_pred_train,
    mode='lines',
    name='Previsto (Treino)',
    line=dict(color='orange', dash='dot')
))

# Previsão no teste
fig.add_trace(go.Scatter(
    x=df_test.index,
    y=df_test['PredictedSales'],
    mode='lines',
    name='Previsto (Teste)',
    line=dict(color='green')
))

# Layout do gráfico
fig.update_layout(
    title='📈 Previsão de Vendas com XGBoost - Loja 1',
    xaxis_title='Data',
    yaxis_title='Vendas',
    legend=dict(x=0.01, y=0.99),
    template='plotly_white',
    width=1000,
    height=400
)

# Salva o gráfico
fig.write_image("../reports/figures/xgboost_forecast.png")

# Exibe no notebook
fig.show()