In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

# --- Load Data and Feature Engineering (same as notebook 4) ---
df = pd.read_csv('../data/processed/train_merged.csv', low_memory=False, parse_dates=['Date'])

# Calendar features derived from the parsed Date column.
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day
df['DayOfWeek'] = df['Date'].dt.dayofweek
df['WeekOfYear'] = df['Date'].dt.isocalendar().week.astype(int)  # ISO week number

# Fill missing values by assigning the result back to the column.
# `df[col].fillna(..., inplace=True)` is a chained in-place call on a column
# selection: it warns on recent pandas and does not modify `df` at all once
# Copy-on-Write is enabled, which would silently leave the NaNs in place.
df['CompetitionOpenSinceMonth'] = df['CompetitionOpenSinceMonth'].fillna(0)
df['CompetitionOpenSinceYear'] = df['CompetitionOpenSinceYear'].fillna(0)

# Months elapsed between the competitor's opening and the row's date; negative
# values (competitor opens in the future) are floored at 0.
# NOTE(review): rows whose "since" year/month were missing are filled with 0
# above, so this evaluates to Year * 12 + Month (a very large number) for them
# rather than 0. Kept as-is to stay consistent with notebook 4 -- confirm that
# this is intended.
df['CompetitionOpen'] = (df['Year'] - df['CompetitionOpenSinceYear']) * 12 + (df['Month'] - df['CompetitionOpenSinceMonth'])
df['CompetitionOpen'] = df['CompetitionOpen'].clip(lower=0)

# Missing competitor distance -> column median.
df['CompetitionDistance'] = df['CompetitionDistance'].fillna(df['CompetitionDistance'].median())

# Promo2 metadata: 0 / empty string where missing, so the int() casts and the
# substring test in is_promo2_active never see NaN.
df['Promo2SinceWeek'] = df['Promo2SinceWeek'].fillna(0)
df['Promo2SinceYear'] = df['Promo2SinceYear'].fillna(0)
df['PromoInterval'] = df['PromoInterval'].fillna('')
# English month abbreviations, fixed here so the lookup does not depend on the
# process locale (strftime('%b') does).
_MONTH_ABBREVIATIONS = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                        'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')


def is_promo2_active(row):
    """Return 1 if the recurring Promo2 is active for this row's date, else 0.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing:
            'Promo2'          -- 0 means the store does not take part;
            'Promo2SinceYear' -- year participation started (int-castable);
            'Promo2SinceWeek' -- week participation started (int-castable);
            'Date'            -- datetime-like with .isocalendar() and .month;
            'PromoInterval'   -- string of month abbreviations, only read
                                 when Promo2 is non-zero.

    Returns:
        int: 1 when the store participates, the date is not before the
        participation start, and the date's month abbreviation occurs in
        'PromoInterval'; 0 otherwise.
    """
    if row['Promo2'] == 0:
        return 0
    promo2_start_year = int(row['Promo2SinceYear'])
    promo2_start_week = int(row['Promo2SinceWeek'])

    # Use the ISO year together with the ISO week. Pairing the calendar year
    # with an ISO week number misclassifies dates around New Year (e.g.
    # 2016-01-01 is ISO week 53 of ISO year 2015, not week 53 of 2016).
    iso_calendar = row['Date'].isocalendar()
    current_year, current_week = iso_calendar[0], iso_calendar[1]
    if (current_year, current_week) < (promo2_start_year, promo2_start_week):
        return 0

    # Substring test on purpose: it also matches a four-letter 'Sept' entry.
    month_str = _MONTH_ABBREVIATIONS[row['Date'].month - 1]
    return 1 if month_str in row['PromoInterval'] else 0
# Row-wise evaluation of is_promo2_active; the result is stored as a 0/1 column.
df['IsPromo2'] = df.apply(is_promo2_active, axis=1)

# --- Feature Selection and Encoding ---
target = 'Sales'
features = [
    # calendar
    'Year', 'Month', 'Day', 'DayOfWeek', 'WeekOfYear',
    # store / competition
    'Store', 'StoreType', 'Assortment', 'CompetitionDistance', 'CompetitionOpen',
    # promotions
    'Promo', 'Promo2', 'IsPromo2',
    # holidays
    'StateHoliday', 'SchoolHoliday',
]
categorical_features = ['StoreType', 'Assortment', 'StateHoliday']
# Replace each categorical column by integer codes; a fresh encoder is fitted
# per column on the full frame.
for feature in categorical_features:
    le = LabelEncoder()
    df[feature] = le.fit_transform(df[feature])

# --- Train/Val Split ---
# Keep only rows where the store was open and sales were positive.
df_train = df.loc[df['Open'].eq(1) & df['Sales'].gt(0)]
# Time-based hold-out: the final six weeks of the filtered data are validation.
validation_date = df_train['Date'].max() - pd.DateOffset(weeks=6)
train_indices = df_train['Date'].lt(validation_date)
val_indices = df_train['Date'].ge(validation_date)
X_train = df_train.loc[train_indices, features]
y_train = df_train.loc[train_indices, target]
X_val = df_train.loc[val_indices, features]
y_val = df_train.loc[val_indices, target]

# --- Model Training ---
def rmsp_error_xgb(y_pred, y_true):
    """Custom XGBoost evaluation metric: root mean squared percentage error.

    Rows whose label is exactly 0 are excluded, because the percentage error
    is undefined for them. (Substituting a tiny denominator instead would let
    a single zero-label row dominate the whole metric.)

    Args:
        y_pred: Array-like of predictions, aligned with the labels.
        y_true: Object exposing ``get_label()`` that returns the true values
            (an ``xgb.DMatrix`` when called by ``xgb.train``).

    Returns:
        tuple[str, float]: ``('rmspe', value)``. ``value`` is NaN when there
        is no non-zero label to evaluate on.
    """
    # Work on float copies so the array returned by get_label() is never
    # modified in place.
    labels = np.asarray(y_true.get_label(), dtype=float)
    preds = np.asarray(y_pred, dtype=float)

    nonzero = labels != 0
    if not nonzero.any():
        return 'rmspe', float('nan')

    percentage_error = (labels[nonzero] - preds[nonzero]) / labels[nonzero]
    rmspe = float(np.sqrt(np.mean(np.square(percentage_error))))
    return 'rmspe', rmspe

# Booster configuration: squared-error regression with row and column
# subsampling; the seed keeps the subsampling reproducible.
params = dict(
    objective='reg:squarederror',
    eta=0.05,
    max_depth=8,
    subsample=0.7,
    colsample_bytree=0.7,
    eval_metric='rmse',
    seed=42,
)

# Wrap the train / validation splits in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dval = xgb.DMatrix(data=X_val, label=y_val)
watchlist = [(dtrain, 'train'), (dval, 'eval')]

# Up to 1000 boosting rounds; training stops early once the monitored metric
# has not improved (lower is better) for 50 consecutive rounds.
# NOTE(review): early stopping presumably monitors the last watchlist entry
# ('eval') and the last metric (the custom rmspe) -- confirm against the
# installed xgboost version.
model = xgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    evals=watchlist,
    custom_metric=rmsp_error_xgb,
    maximize=False,
    early_stopping_rounds=50,
    verbose_eval=False,  # per-round evaluation log is not needed here
)

print('Model retrained.')

### Feature Importance

In [None]:
# Switch matplotlib to the ggplot style (affects this and all later figures).
plt.style.use('ggplot')

# Draw the booster's feature-importance bar chart on an explicit Axes and
# title it through that Axes object.
fig, ax = plt.subplots(figsize=(12, 8))
xgb.plot_importance(model, ax=ax, height=0.8)
ax.set_title('Feature Importance')
plt.show()