In [1]:
import pandas as pd

df = pd.read_csv('train.csv', parse_dates=['date'])

# Ordina per chiave temporale e identificativa della serie
df = df.sort_values(by=['store_nbr', 'family', 'date'])

In [2]:
# Esempio di lag di 1, 7, 14, 28 giorni
for lag in [1, 7, 14, 28]:
    df[f'sales_lag_{lag}'] = df.groupby(['store_nbr', 'family'])['sales'].shift(lag)

In [3]:
# Esempio: media su 7 e 14 giorni
for window in [7, 14]:
    df[f'sales_rollmean_{window}'] = df.groupby(['store_nbr', 'family'])['sales'].shift(1).rolling(window=window).mean()

In [4]:
# Rolling std, min, max su 14 giorni
for func in ['std', 'min', 'max']:
    df[f'sales_roll{func}_14'] = (
        df.groupby(['store_nbr', 'family'])['sales']
        .shift(1)
        .rolling(window=14)
        .agg(func)
    )

In [5]:
df['promo_7d'] = (
    df.groupby(['store_nbr', 'family'])['onpromotion']
    .shift(1)
    .rolling(7).sum()
)

In [6]:
# Dopo aver costruito le feature, puoi decidere se droppare o imputare i NaN
df = df.dropna()

In [7]:
# Assumendo che la colonna 'date' sia già in formato datetime
df['day'] = df['date'].dt.day
df['month'] = df['date'].dt.month
df['dayofweek'] = df['date'].dt.dayofweek
df['is_weekend'] = df['dayofweek'] >= 5

In [8]:
from sklearn.preprocessing import LabelEncoder

# Encoding per family e type (stringhe)
le = LabelEncoder()
df['family'] = le.fit_transform(df['family'])
label_encoders = {'family': le}

# store_nbr è già numerico, ma se vuoi assicurarti che sia trattato come categoria:
df['store_nbr'] = df['store_nbr'].astype(int)

In [9]:
# is_holiday: booleano (già True/False), possiamo trasformarlo in 0/1
df['is_holiday'] = df['is_holiday'].astype(int)

# onpromotion è già 0/1

# Riempi eventuali NaN nel prezzo del petrolio con interpolazione o mediana
df['dcoilwtico'] = df['dcoilwtico'].interpolate(method='linear')
df['dcoilwtico'] = df['dcoilwtico'].bfill()  # fallback

In [10]:
# Split on percentage
train_size = 0.8

train = df.sample(frac=train_size, random_state=42)
test = df.drop(train.index)
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

In [11]:
target_col = 'sales'

# Colonne da escludere
drop_cols = ['id', 'date', 'sales']

# Feature matrix
X_train = train.drop(columns=drop_cols)
X_test = test.drop(columns=drop_cols)

# Target
y_train = train[target_col]
y_test = test[target_col]

In [12]:
import optuna
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 4, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'tree_method': 'hist',
        'random_state': 42,
        'n_jobs': -1
    }

    model = XGBRegressor(**params)
    model.fit(X_train, y_train)  # Nessun early_stopping

    preds = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    return rmse

In [13]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)  # puoi aumentare il numero di trial

print("Migliori iperparametri:")
print(study.best_params)

[I 2025-05-15 12:57:26,314] A new study created in memory with name: no-name-31d2926d-5ff8-4834-b4e5-975ddf1cb3d2
[I 2025-05-15 12:57:34,679] Trial 0 finished with value: 237.97798225820753 and parameters: {'n_estimators': 491, 'max_depth': 6, 'learning_rate': 0.04124232088063815, 'subsample': 0.5583661004458964, 'colsample_bytree': 0.7548284955160666, 'min_child_weight': 6, 'gamma': 1.4297820447169203}. Best is trial 0 with value: 237.97798225820753.
[I 2025-05-15 12:57:38,660] Trial 1 finished with value: 227.6293672236135 and parameters: {'n_estimators': 185, 'max_depth': 8, 'learning_rate': 0.17185340280807737, 'subsample': 0.7025419120745093, 'colsample_bytree': 0.7244651814149816, 'min_child_weight': 4, 'gamma': 2.2876434510563692}. Best is trial 1 with value: 227.6293672236135.
[I 2025-05-15 12:57:51,234] Trial 2 finished with value: 226.99845030495104 and parameters: {'n_estimators': 336, 'max_depth': 12, 'learning_rate': 0.15575186677267144, 'subsample': 0.5342824309821195, 'c

Migliori iperparametri:
{'n_estimators': 416, 'max_depth': 12, 'learning_rate': 0.061262821041241845, 'subsample': 0.9173305659044061, 'colsample_bytree': 0.5620303828499797, 'min_child_weight': 1, 'gamma': 2.0104361823750687}


In [14]:
best_params = study.best_params
best_params['tree_method'] = 'hist'
best_params['random_state'] = 42
best_params['n_jobs'] = -1

final_model = XGBRegressor(**best_params)
final_model.fit(X_train, y_train)

y_pred = final_model.predict(X_test)
final_rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"RMSE dopo tuning: {final_rmse:.2f}")

RMSE dopo tuning: 211.68
