In [None]:
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
import numpy as np

# Location of the engineered feature table consumed by the rest of the notebook.
ENGINEERED_FEATURES_CSV = '/content/drive/MyDrive/engineered_features_tree.csv'
main_df = pd.read_csv(ENGINEERED_FEATURES_CSV)

In [None]:
# Display the column index of the loaded table; the feature builder below
# refers to these column names directly.
main_df.columns

Index(['store_id', 'date_id', 'sales', 'promo', 'open', 'school_holiday',
       'state_holiday', 'store_type', 'assortment_type',
       'competition_distance', 'competition_open_month',
       'competition_open_year', 'promo2', 'promo2_since_week',
       'promo2_since_year', 'promo_interval', 'day', 'month', 'year',
       'weekday', 'is_weekend', 'weekofyear', 'quarter', 'is_month_start',
       'is_month_end', 'is_promo2_month', 'competition_duration_months',
       'promo2_duration_weeks', 'shopavg_open', 'shopavg_salespercustomer',
       'shopavg_schoolholiday', 'shopsales_holiday', 'shopsales_promo',
       'shopsales_schoolholiday'],
      dtype='object')

In [None]:
import pandas as pd
import numpy as np

# ---------- helpers ----------
def mode_by_group(df, group_cols, target):
    """Return the most frequent value of `target` within each group.

    Parameters
    ----------
    df : pd.DataFrame
        Source rows.
    group_cols : list of str
        Columns to group by; rows whose key is NaN form their own group
        (``dropna=False``).
    target : str
        Column whose per-group mode is computed. Values are cast with
        ``int()``, so they must be integer-convertible.

    Returns
    -------
    pd.DataFrame
        One row per group with the group columns and `target` holding the
        mode. A group whose `target` values are all missing gets NaN instead
        of raising, so callers can apply their own fallback via ``fillna``.
    """
    def _group_mode(values):
        counts = values.value_counts()  # NaN entries are excluded from the counts
        if counts.empty:
            # No observed value in this group: idxmax() would raise on an
            # empty Series, so report "unknown" instead.
            return np.nan
        return int(counts.idxmax())

    return (
        df.groupby(group_cols, dropna=False)[target]
          .agg(_group_mode)
          .reset_index()
    )

def _safe_competition_open(year, month):
    try:
        y = int(year); m = int(month)
        if y >= 1900 and 1 <= m <= 12:
            return pd.Timestamp(year=y, month=m, day=1)
    except Exception:
        pass
    return pd.NaT

def _safe_promo2_since(year, week):
    try:
        y = int(year); w = int(week)
        if y >= 1900 and 1 <= w <= 53:
            # ISO week date: Mon of ISO week
            return pd.to_datetime(f'{y}-W{w:02d}-1', format='%G-W%V-%u')
    except Exception:
        pass
    return pd.NaT

# ---------- main builder ----------
def build_future_features_next_month(
    df_hist: pd.DataFrame,
    feature_cols: list | None = None,
    store_col: str = 'store_id',
    date_col: str = 'date_id',
    horizon_days: int = 30
):
    """Build a (store x future day) feature grid from historical rows.

    The grid covers every store seen in `df_hist` for the `horizon_days`
    calendar days that follow the latest date in `df_hist`. Columns are
    filled as follows:

    * store-level columns: first non-null historical value per store;
    * calendar columns: derived from the future date;
    * ``open`` / ``promo``: historical mode per (store, weekday), falling back
      to the global per-weekday mode, then to 1 / 0;
    * ``is_promo2_month``: 1 when the month abbreviation appears in the
      store's comma-separated ``promo_interval`` string, else 0;
    * ``promo`` is additionally forced to 1 where ``promo2 == 1`` and
      ``is_promo2_month == 1``;
    * ``school_holiday`` / ``state_holiday``: historical mode per
      (month, day) across all stores and years;
    * ``competition_duration_months`` / ``promo2_duration_weeks``: elapsed
      time since the respective start date, floored at 0, 0 when unknown.

    Parameters
    ----------
    df_hist : pd.DataFrame
        Historical rows. Must contain `store_col`, `date_col`, ``open`` and
        ``promo``; every other column used here is optional.
    feature_cols : list of str, optional
        Columns to extract into the model-input frame.
    store_col, date_col : str
        Names of the store identifier and date columns.
    horizon_days : int
        Number of days to generate after the last historical date. A value
        below 1 yields an empty grid.

    Returns
    -------
    (future_full, future_X)
        `future_full` is the complete grid; `future_X` is
        ``future_full[feature_cols]`` or None when `feature_cols` is None.

    Raises
    ------
    ValueError
        If any name in `feature_cols` is absent from the generated grid.
    """
    df = df_hist.copy()
    df[date_col] = pd.to_datetime(df[date_col])
    df = df.sort_values([store_col, date_col])

    last_date = df[date_col].max()
    future_dates = pd.date_range(start=last_date + pd.Timedelta(days=1),
                                 end=last_date + pd.Timedelta(days=horizon_days),
                                 freq='D')

    # --- base grid (store × future date) ---
    stores = df[store_col].unique()
    future_full = (
        pd.MultiIndex.from_product([stores, future_dates], names=[store_col, date_col])
          .to_frame(index=False)
    )

    # ---------- STATIC store-level columns ----------
    static_cols = [
        'store_type', 'assortment_type',
        'competition_distance', 'competition_open_month', 'competition_open_year',
        'promo2', 'promo2_since_week', 'promo2_since_year',
        # engineered aggregates
        'shopavg_open', 'shopavg_salespercustomer', 'shopavg_schoolholiday',
        'shopsales_holiday', 'shopsales_promo', 'shopsales_schoolholiday'
    ]
    # promo_interval is needed to compute is_promo2_month
    if 'promo_interval' in df.columns:
        static_cols.append('promo_interval')

    have_static = [c for c in static_cols if c in df.columns]
    # first() takes the first non-null value per store (rows are date-sorted)
    static_df = df.groupby(store_col, as_index=False)[have_static].first()
    future_full = future_full.merge(static_df, on=store_col, how='left')

    # ---------- calendar features ----------
    future_full['day'] = future_full[date_col].dt.day
    future_full['month'] = future_full[date_col].dt.month
    future_full['year'] = future_full[date_col].dt.year
    future_full['weekday'] = future_full[date_col].dt.weekday  # 0=Mon
    future_full['is_weekend'] = (future_full['weekday'] >= 5).astype(int)
    future_full['weekofyear'] = future_full[date_col].dt.isocalendar().week.astype(int)
    future_full['quarter'] = future_full[date_col].dt.quarter
    future_full['is_month_start'] = future_full[date_col].dt.is_month_start.astype(int)
    future_full['is_month_end'] = future_full[date_col].dt.is_month_end.astype(int)

    # ---------- infer OPEN from history (mode by store & weekday, with weekday fallback) ----------
    # NOTE(review): an existing historical 'weekday' column is used as-is for
    # the lookups below; they only match the future grid if it is encoded
    # 0=Mon..6=Sun like dt.weekday — confirm against the feature pipeline.
    if 'weekday' not in df.columns:
        df['weekday'] = df[date_col].dt.weekday

    open_by_store_dow = mode_by_group(df, [store_col, 'weekday'], 'open')
    open_by_dow = mode_by_group(df, ['weekday'], 'open').rename(columns={'open': 'open_fallback'})

    future_full = future_full.merge(open_by_store_dow, on=[store_col, 'weekday'], how='left')
    future_full = future_full.merge(open_by_dow, on='weekday', how='left')
    future_full['open'] = future_full['open'].fillna(future_full['open_fallback']).fillna(1).astype(int)
    future_full = future_full.drop(columns=['open_fallback'])

    # ---------- compute is_promo2_month using promo_interval (if known) ----------
    # NOTE(review): September is spelled 'Sept' here; this must match the
    # spelling used inside promo_interval — confirm against the data.
    month_str = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sept','Oct','Nov','Dec']
    if 'promo_interval' in future_full.columns:
        # Column-wise comprehension instead of DataFrame.apply(axis=1): same
        # result per row, far fewer Python-level Series constructions, and it
        # also works when the grid is empty.
        future_full['is_promo2_month'] = pd.Series(
            [
                int(isinstance(interval, str)
                    and month_str[month_no - 1] in interval.split(','))
                for month_no, interval in zip(future_full['month'],
                                              future_full['promo_interval'])
            ],
            index=future_full.index,
            dtype=int,
        )
    else:
        # if unknown, assume not a promo2 month
        future_full['is_promo2_month'] = 0

    # ---------- infer PROMO from history (mode by store & weekday, with weekday fallback), then OR with promo2-month ----------
    promo_by_store_dow = mode_by_group(df, [store_col, 'weekday'], 'promo')
    promo_by_dow = mode_by_group(df, ['weekday'], 'promo').rename(columns={'promo': 'promo_fallback'})
    future_full = future_full.merge(promo_by_store_dow, on=[store_col, 'weekday'], how='left')
    future_full = future_full.merge(promo_by_dow, on='weekday', how='left')

    # base promo from historical patterns
    future_full['promo'] = future_full['promo'].fillna(future_full['promo_fallback']).fillna(0).astype(int)
    future_full = future_full.drop(columns=['promo_fallback'])

    # if store has promo2 and current month is within promo_interval => force promo=1
    if 'promo2' in future_full.columns:
        future_full['promo'] = np.where((future_full['promo2'] == 1) & (future_full['is_promo2_month'] == 1),
                                        1, future_full['promo']).astype(int)

    # ---------- school_holiday & state_holiday from historical calendar patterns ----------
    # Map by (month, day) from history (global mode across stores/years)
    # SCHOOL HOLIDAY
    if 'school_holiday' in df.columns:
        school_by_md = mode_by_group(df.assign(_month=df[date_col].dt.month,
                                               _day=df[date_col].dt.day),
                                     ['_month','_day'], 'school_holiday') \
                       .rename(columns={'_month':'month','_day':'day'})
        future_full = future_full.merge(school_by_md, on=['month','day'], how='left', suffixes=(None, '_schmd'))
        # if still NaN, fallback by weekday pattern globally
        school_by_dow = mode_by_group(df, ['weekday'], 'school_holiday') \
                            .rename(columns={'school_holiday':'school_holiday_fallback'})
        future_full = future_full.merge(school_by_dow, on='weekday', how='left')
        future_full['school_holiday'] = future_full['school_holiday'] \
                                            .fillna(future_full['school_holiday_fallback']) \
                                            .fillna(0).astype(int)
        future_full = future_full.drop(columns=['school_holiday_fallback'])

    # STATE HOLIDAY (categorical codes assumed, e.g., 0/1/2/3)
    if 'state_holiday' in df.columns:
        state_by_md = (
            df.assign(_month=df[date_col].dt.month, _day=df[date_col].dt.day)
              .groupby(['_month','_day'])['state_holiday']
              .agg(lambda s: s.value_counts().idxmax())  # mode (may be non-binary)
              .reset_index()
              .rename(columns={'_month':'month','_day':'day'})
        )
        future_full = future_full.merge(state_by_md, on=['month','day'], how='left', suffixes=(None, '_statemd'))
        # default to "no state holiday" if unknown
        future_full['state_holiday'] = future_full['state_holiday'].fillna(0).astype(int)

    # ---------- relative durations (competition & promo2) ----------
    # Stand-in for any start-date column that is absent: all-NaN, which the
    # _safe_* helpers turn into NaT.
    unknown = pd.Series(np.nan, index=future_full.index)

    # competition_duration_months ~ days/30 from competition_open_(year, month)
    comp_open = pd.Series(
        [
            _safe_competition_open(open_year, open_month)
            for open_year, open_month in zip(
                future_full.get('competition_open_year', unknown),
                future_full.get('competition_open_month', unknown))
        ],
        index=future_full.index,
        dtype='datetime64[ns]',
    )
    future_full['competition_duration_months'] = (
        (future_full[date_col] - comp_open).dt.days / 30
    ).clip(lower=0).fillna(0)

    # promo2_duration_weeks from promo2_since_(year, week)
    promo2_since = pd.Series(
        [
            _safe_promo2_since(since_year, since_week)
            for since_year, since_week in zip(
                future_full.get('promo2_since_year', unknown),
                future_full.get('promo2_since_week', unknown))
        ],
        index=future_full.index,
        dtype='datetime64[ns]',
    )
    future_full['promo2_duration_weeks'] = (
        (future_full[date_col] - promo2_since).dt.days / 7
    ).clip(lower=0).fillna(0)

    # The target is unknown for future rows; blank it if it ever reaches the grid.
    if 'sales' in future_full.columns:
        future_full['sales'] = np.nan

    # Drop any helper columns we created that aren't in training
    drop_helpers = [c for c in ['promo_interval'] if (c in future_full.columns and c not in (feature_cols or []))]
    if drop_helpers:
        future_full = future_full.drop(columns=drop_helpers)

    future_X = None
    if feature_cols is not None:
        missing = [c for c in feature_cols if c not in future_full.columns]
        if missing:
            raise ValueError(f"Missing columns for prediction: {missing}")
        future_X = future_full[feature_cols].copy()

    return future_full, future_X


In [None]:
# Every historical column except the target and promo_interval is requested
# as a model-input column.
feature_cols = main_df.columns.drop(['sales', 'promo_interval']).tolist()

# Generate the feature grid for the 30 days after the last historical date.
future_full, future_X = build_future_features_next_month(
    main_df,
    feature_cols=feature_cols,
    store_col='store_id',
    date_col='date_id',
    horizon_days=30,
)

# **Prediction**

In [None]:
import joblib

# Load the persisted LightGBM model.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load model files from a trusted source.
model_lgb = joblib.load("/content/drive/MyDrive/lgbm_sales_forecast.pkl")

In [None]:
import xgboost as xgb

# Create an empty Booster and populate it from the saved JSON model file.
model_xgb = xgb.Booster()
model_xgb.load_model("/content/drive/MyDrive/xgb_sales_forecast.json")

In [None]:
# Score the future grid with LightGBM; date_id is dropped before scoring and
# prediction is limited to the model's recorded best iteration.
lgb_inputs = future_X.drop(columns=['date_id'])
future_preds_lgb = model_lgb.predict(lgb_inputs, num_iteration=model_lgb.best_iteration_)

In [None]:
# Score the future grid with the XGBoost booster; date_id is dropped before
# scoring.
# iteration_range is half-open [begin, end) and best_iteration is the 0-based
# index of the best boosting round, so the end bound must be
# best_iteration + 1. Passing best_iteration alone leaves out the best round,
# and (0, 0) is interpreted as "use every tree".
future_preds_xgb = model_xgb.predict(xgb.DMatrix(future_X.drop(columns=['date_id'])),
                                     iteration_range=(0, model_xgb.best_iteration + 1))

In [None]:
# Assemble the output table from the identifier columns plus both models'
# predictions. `assign` returns a new frame, so:
#   * future_X keeps holding model inputs only, and the prediction cells can
#     be re-run without feeding earlier predictions back into the models;
#   * `prediction` is an independent frame rather than a slice of future_X,
#     so later column assignments on it are unambiguous.
feature_keep = ['store_id', 'date_id', 'preds_lgb', 'preds_xgb']
prediction = future_X[['store_id', 'date_id']].assign(
    preds_lgb=future_preds_lgb,
    preds_xgb=future_preds_xgb,
)
assert list(prediction.columns) == feature_keep

In [None]:
# Floor both prediction columns at 0. The clipped columns are attached with
# `assign`, which returns a new DataFrame, instead of being written into
# `prediction` in place: `prediction` was obtained by column-selecting another
# frame, and in-place assignment on such a selection triggers pandas'
# SettingWithCopyWarning.
cols = ["preds_lgb", "preds_xgb"]
prediction = prediction.assign(**{col: prediction[col].clip(lower=0) for col in cols})
prediction

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction[cols] = prediction[cols].clip(lower=0)


Unnamed: 0,store_id,date_id,preds_lgb,preds_xgb
0,0,2015-08-01,5878.227153,5308.719727
1,0,2015-08-02,0.000000,231.461716
2,0,2015-08-03,6644.103590,6210.636719
3,0,2015-08-04,5276.309025,4879.173828
4,0,2015-08-05,4624.027939,4371.063965
...,...,...,...,...
33445,1114,2015-08-26,6776.699682,6814.674316
33446,1114,2015-08-27,7419.717900,7239.202637
33447,1114,2015-08-28,8355.545520,8256.649414
33448,1114,2015-08-29,7269.811218,7759.067871


In [None]:
# Write the prediction table to the current working directory, without the
# DataFrame index.
prediction.to_csv("tree_predictions.csv", index=False)

In [None]:
# Copy the exported CSV into the Drive folder. The CSV was written with a
# relative path, so this assumes the notebook's working directory is /content.
!cp /content/tree_predictions.csv /content/drive/MyDrive/