In [1]:
import pandas as pd

df = pd.read_csv("train.csv", parse_dates=["date"])
df = df.sort_values(["store_nbr", "family", "date"])  # molto importante!

In [2]:
# Feature temporali
df["dayofweek"] = df["date"].dt.dayofweek
df["month"] = df["date"].dt.month
df["year"] = df["date"].dt.year
df["day"] = df["date"].dt.day

# Codifica booleana
df["is_holiday"] = df["is_holiday"].astype(int)

In [3]:
from sklearn.preprocessing import LabelEncoder

le_store = LabelEncoder()
le_family = LabelEncoder()

df["store_nbr_enc"] = le_store.fit_transform(df["store_nbr"])
df["family_enc"] = le_family.fit_transform(df["family"])

In [5]:
# Per ogni serie (store + family) aggiungiamo un contatore temporale
df["series_id"] = df["store_nbr_enc"].astype(str) + "_" + df["family_enc"].astype(str)
df["time_idx"] = df.groupby("series_id").cumcount()

In [6]:
def add_lag_and_rolling(df):
    df = df.sort_values(["series_id", "date"])

    for lag in [1, 7]:
        df[f"lag_{lag}"] = df.groupby("series_id")["sales"].shift(lag)

    for window in [7, 14]:
        df[f"rolling_mean_{window}"] = (
            df.groupby("series_id")["sales"].shift(1).rolling(window=window).mean()
        )

    return df

df = add_lag_and_rolling(df)
df = df.fillna(0)

In [7]:
from sklearn.preprocessing import StandardScaler

df["sales_normalized"] = 0.0
scalers = {}

for sid, group in df.groupby("series_id"):
    scaler = StandardScaler()
    df.loc[group.index, "sales_normalized"] = scaler.fit_transform(group["sales"].values.reshape(-1, 1)).flatten()
    scalers[sid] = scaler

In [15]:
import numpy as np

def create_sequences(df, input_len=30, forecast_len=7):
    X, y, store_ids, family_ids = [], [], [], []

    feature_cols = [
        "sales_normalized", "onpromotion", "is_holiday", "dcoilwtico",
        "dayofweek", "month", "lag_1", "lag_7", "rolling_mean_7", "rolling_mean_14"
    ]

    for sid, group in df.groupby("series_id"):
        group = group.sort_values("time_idx")
        values = group[feature_cols].values
        targets = group["sales_normalized"].values
        store = group["store_nbr_enc"].values[0]
        family = group["family_enc"].values[0]

        for i in range(input_len, len(group) - forecast_len + 1):
            X.append(values[i - input_len:i])
            y.append(targets[i:i + forecast_len])
            store_ids.append(store)
            family_ids.append(family)

    return (
        np.array(X),
        np.array(y),
        np.array(store_ids),
        np.array(family_ids)
    )


X, y, store_ids, family_ids = create_sequences(df)
print("Shape X:", X.shape)  # (n_seq, 30, n_features)
print("Shape y:", y.shape)  # (n_seq, forecast_len)

Shape X: (2990196, 30, 10)
Shape y: (2990196, 7)


In [16]:
import torch
from torch.utils.data import Dataset, DataLoader

class SalesDataset(Dataset):
    def __init__(self, X, y, store_ids, family_ids):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)
        self.store_ids = torch.tensor(store_ids, dtype=torch.long)
        self.family_ids = torch.tensor(family_ids, dtype=torch.long)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx], self.store_ids[idx], self.family_ids[idx]

# Split in train/test
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val, store_ids_train, store_ids_val, family_ids_train, family_ids_val = train_test_split(
    X, y, store_ids, family_ids, test_size=0.1, shuffle=False
)

train_dataset = SalesDataset(X_train, y_train, store_ids_train, family_ids_train)
val_dataset = SalesDataset(X_val, y_val, store_ids_val, family_ids_val)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=128)

In [17]:
import torch.nn as nn

class SalesLSTM(nn.Module):
    def __init__(self, input_size, hidden_size=64, num_layers=2, forecast_len=7,
                 n_stores=54, n_families=33, emb_dim=8, dropout_rate=0.3):
        super().__init__()
        self.store_emb = nn.Embedding(n_stores, emb_dim)
        self.family_emb = nn.Embedding(n_families, emb_dim)
        
        self.lstm = nn.LSTM(input_size + 2 * emb_dim, hidden_size, num_layers,
                            batch_first=True, dropout=dropout_rate)
        
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hidden_size, forecast_len)

    def forward(self, x, store_id, family_id):
        # x: (batch, seq_len, input_size)
        store_e = self.store_emb(store_id).unsqueeze(1).expand(-1, x.size(1), -1)
        family_e = self.family_emb(family_id).unsqueeze(1).expand(-1, x.size(1), -1)
        x = torch.cat([x, store_e, family_e], dim=-1)  # concat embeddings

        out, _ = self.lstm(x)
        out = self.dropout(out[:, -1, :])
        out = self.fc(out)
        return out  # shape: (batch, forecast_len)

In [18]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = SalesLSTM(input_size=X.shape[2]).to(device)
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

def train(model, train_loader, val_loader, epochs=20):
    model.to(device)
    loss_fn = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        for X_batch, y_batch, store_ids, family_ids in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            store_ids, family_ids = store_ids.to(device), family_ids.to(device)

            optimizer.zero_grad()
            preds = model(X_batch, store_ids, family_ids)
            loss = loss_fn(preds, y_batch)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for X_batch, y_batch, store_ids, family_ids in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                store_ids, family_ids = store_ids.to(device), family_ids.to(device)
                preds = model(X_batch, store_ids, family_ids)
                loss = loss_fn(preds, y_batch)
                val_loss += loss.item()

        print(f"Epoch {epoch+1} - Train Loss: {train_loss/len(train_loader):.4f}, Val Loss: {val_loss/len(val_loader):.4f}")

train(model, train_loader, val_loader, epochs=10)

Epoch 1 - Train Loss: 0.6042, Val Loss: 0.5681
Epoch 2 - Train Loss: 0.5653, Val Loss: 0.5661
Epoch 3 - Train Loss: 0.5542, Val Loss: 0.5508
Epoch 4 - Train Loss: 0.5486, Val Loss: 0.5446
Epoch 5 - Train Loss: 0.5436, Val Loss: 0.5434
Epoch 6 - Train Loss: 0.5384, Val Loss: 0.5374
Epoch 7 - Train Loss: 0.5344, Val Loss: 0.5326
Epoch 8 - Train Loss: 0.5319, Val Loss: 0.5319
Epoch 9 - Train Loss: 0.5299, Val Loss: 0.5349
Epoch 10 - Train Loss: 0.5281, Val Loss: 0.5540


In [19]:
def create_test_sequences(df, input_len=30, forecast_len=7):
    X, store_ids, family_ids, series_ids = [], [], [], []

    feature_cols = [
        "sales_normalized", "onpromotion", "is_holiday", "dcoilwtico",
        "dayofweek", "month", "lag_1", "lag_7", "rolling_mean_7", "rolling_mean_14"
    ]

    for sid, group in df.groupby("series_id"):
        group = group.sort_values("time_idx")
        values = group[feature_cols].values
        store = group["store_nbr_enc"].values[0]
        family = group["family_enc"].values[0]

        for i in range(input_len, len(group) - forecast_len + 1):
            X.append(values[i - input_len:i])
            store_ids.append(store)
            family_ids.append(family)
            series_ids.append(sid)

    return (
        np.array(X),
        np.array(store_ids),
        np.array(family_ids),
        series_ids
    )

In [24]:
test_df = pd.read_csv("test.csv", parse_dates=["date"])
test_df = test_df.sort_values(["store_nbr", "family", "date"])

# 1. Unisci train + test per calcolare lag/rolling
test_df["sales"] = np.nan  # placeholder per le vendite future
df_all = pd.concat([df, test_df], sort=False)

# 2. Calcola time_idx e series_id (se non già fatti)
df_all["store_nbr_enc"] = le_store.transform(df_all["store_nbr"])
df_all["family_enc"] = le_family.transform(df_all["family"])
df_all["series_id"] = df_all["store_nbr_enc"].astype(str) + "_" + df_all["family_enc"].astype(str)
df_all["time_idx"] = df_all.groupby("series_id").cumcount()

# 3. Calcola lag e rolling
df_all = add_lag_and_rolling(df_all)

# 4. Normalizza le vendite dove sono presenti
df_all["sales_normalized"] = np.nan

for sid, group in df_all.groupby("series_id"):
    scaler = scalers[sid]
    sales = group["sales"].values.reshape(-1, 1)

    normed = np.full_like(sales, np.nan)
    mask = ~np.isnan(sales).flatten()

    if np.any(mask):
        normed[mask] = scaler.transform(sales[mask].reshape(-1, 1)).flatten()

    df_all.loc[group.index, "sales_normalized"] = normed.flatten()

# 5. Ricava solo il test_df aggiornato
test_df = df_all[df_all["date"] >= test_df["date"].min()].copy()
test_df = test_df.dropna()

ValueError: shape mismatch: value array of shape (1714,) could not be broadcast to indexing result of shape (1714,1)

In [None]:
model.eval()
all_preds = []

with torch.no_grad():
    for X_batch, _, store_ids, family_ids in test_loader:
        X_batch = X_batch.to(device)
        store_ids = store_ids.to(device)
        family_ids = family_ids.to(device)

        preds = model(X_batch, store_ids, family_ids)  # (batch, 7)
        all_preds.append(preds.cpu().numpy())

all_preds = np.concatenate(all_preds, axis=0)  # (n_samples, 7)

In [None]:
flat_preds = all_preds.reshape(-1, 1)
flat_preds_denorm = scaler.inverse_transform(flat_preds)
final_preds = flat_preds_denorm.reshape(all_preds.shape) 

In [None]:
output_rows = []
for sid, preds in zip(series_ids_test, final_preds):
    store, family = sid.split("_")
    for i, p in enumerate(preds):
        output_rows.append({
            "store_nbr": int(store),
            "family": family,
            "day": i,
            "predicted_sales": p
        })

submission_df = pd.DataFrame(output_rows)
submission_df.to_csv("submission.csv", index=False)