Importing 

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder


loading data

In [2]:

# Load files
train_df = pd.read_csv("train.csv")
features_df = pd.read_csv("features.csv")
stores_df = pd.read_csv("stores.csv")


Merging

In [3]:

# Merge step by step
dataset= pd.merge(train_df, features_df, on=["Store", "Date", "IsHoliday"], how="left")
dataset = pd.merge(dataset, stores_df, on="Store", how="left")
print(dataset.head())


dataset=dataset.drop(["MarkDown1","MarkDown2","MarkDown3","MarkDown4","MarkDown5"],axis=1)
print(dataset)



   Store  Dept        Date  Weekly_Sales  IsHoliday  Temperature  Fuel_Price  \
0      1     1  2010-02-05      24924.50      False        42.31       2.572   
1      1     1  2010-02-12      46039.49       True        38.51       2.548   
2      1     1  2010-02-19      41595.55      False        39.93       2.514   
3      1     1  2010-02-26      19403.54      False        46.63       2.561   
4      1     1  2010-03-05      21827.90      False        46.50       2.625   

   MarkDown1  MarkDown2  MarkDown3  MarkDown4  MarkDown5         CPI  \
0        NaN        NaN        NaN        NaN        NaN  211.096358   
1        NaN        NaN        NaN        NaN        NaN  211.242170   
2        NaN        NaN        NaN        NaN        NaN  211.289143   
3        NaN        NaN        NaN        NaN        NaN  211.319643   
4        NaN        NaN        NaN        NaN        NaN  211.350143   

   Unemployment Type    Size  
0         8.106    A  151315  
1         8.106    A  15

encoding data

In [4]:



encoder = LabelEncoder()
dataset.Type = encoder.fit_transform(dataset.Type)

print(dataset)         # [0 1 2 0 1]
print(encoder.classes_) # ['StoreA' 'StoreB' 'StoreC']



        Store  Dept        Date  Weekly_Sales  IsHoliday  Temperature  \
0           1     1  2010-02-05      24924.50      False        42.31   
1           1     1  2010-02-12      46039.49       True        38.51   
2           1     1  2010-02-19      41595.55      False        39.93   
3           1     1  2010-02-26      19403.54      False        46.63   
4           1     1  2010-03-05      21827.90      False        46.50   
...       ...   ...         ...           ...        ...          ...   
421565     45    98  2012-09-28        508.37      False        64.88   
421566     45    98  2012-10-05        628.10      False        64.89   
421567     45    98  2012-10-12       1061.02      False        54.47   
421568     45    98  2012-10-19        760.01      False        56.47   
421569     45    98  2012-10-26       1076.80      False        58.85   

        Fuel_Price         CPI  Unemployment  Type    Size  
0            2.572  211.096358         8.106     0  151315  
1

date separtated to date month and year

In [5]:



def create_date_features(df, date_col):
    """
    Converts a date column into ML-friendly features for retail forecasting.

    Parameters:
    df (pd.DataFrame): Your dataset
    date_col (str): Column name containing the date

    Returns:
    pd.DataFrame: Dataset with new date features
    """
    # Convert to datetime
    df[date_col] = pd.to_datetime(df[date_col])

    # Time-based features
    df['year'] = df[date_col].dt.year
    df['month'] = df[date_col].dt.month
    df['day'] = df[date_col].dt.day
    df['dayofweek'] = df[date_col].dt.dayofweek  # 0=Mon, 6=Sun
    df['weekofyear'] = df[date_col].dt.isocalendar().week.astype(int)
    df['quarter'] = df[date_col].dt.quarter
    df['is_weekend'] = df['dayofweek'].isin([5,6]).astype(int)

    # Continuous time (trend)
    df['days_since'] = (df[date_col] - df[date_col].min()).dt.days

    # Cyclical encoding for seasonality
    df['month_sin'] = np.sin(2 * np.pi * df['month']/12)
    df['month_cos'] = np.cos(2 * np.pi * df['month']/12)
    df['dow_sin'] = np.sin(2 * np.pi * df['dayofweek']/7)
    df['dow_cos'] = np.cos(2 * np.pi * df['dayofweek']/7)

    return df


dropping unwanted data

In [6]:



dataset=create_date_features(dataset,"Date")
# X = all columns except Weekly_Sales
X = dataset.loc[:, dataset.columns != "Weekly_Sales"]

# y = Weekly_Sales column
y = dataset.loc[:, "Weekly_Sales"]

X=X.drop("Date",axis=1)



print(X)
X["IsHoliday"] = X["IsHoliday"].astype(int)
print(X)


        Store  Dept  IsHoliday  Temperature  Fuel_Price         CPI  \
0           1     1      False        42.31       2.572  211.096358   
1           1     1       True        38.51       2.548  211.242170   
2           1     1      False        39.93       2.514  211.289143   
3           1     1      False        46.63       2.561  211.319643   
4           1     1      False        46.50       2.625  211.350143   
...       ...   ...        ...          ...         ...         ...   
421565     45    98      False        64.88       3.997  192.013558   
421566     45    98      False        64.89       3.985  192.170412   
421567     45    98      False        54.47       4.000  192.327265   
421568     45    98      False        56.47       3.969  192.330854   
421569     45    98      False        58.85       3.882  192.308899   

        Unemployment  Type    Size  year  ...  day  dayofweek  weekofyear  \
0              8.106     0  151315  2010  ...    5          4         

split the data

In [7]:

# -------------------------------
# 1. Split the data
# -------------------------------
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)


random forest 

In [8]:

# -------------------------------
# 2. Train Random Forest
# -------------------------------
rf_model = RandomForestRegressor(random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)

rf_val_pred = rf_model.predict(X_val)
rf_test_pred = rf_model.predict(X_test)


xgboost

In [9]:

# -------------------------------
# 3. Train XGBoost
# -------------------------------
dtrain = xgb.DMatrix(X_train, label=y_train)
dval   = xgb.DMatrix(X_val, label=y_val)
dtest  = xgb.DMatrix(X_test, label=y_test)

params = {
    "objective": "reg:squarederror",
    "learning_rate": 0.03,
    "max_depth": 6,
    "min_child_weight": 3,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "reg_alpha": 0.01,
    "reg_lambda": 1.5,
    "eval_metric": "rmse",
    "seed": 42,
}

evals = [(dtrain, "train"), (dval, "val")]
xgb_model = xgb.train(
    params,
    dtrain,
    num_boost_round=5000,
    evals=evals,
    early_stopping_rounds=50,
    verbose_eval=False
)

xgb_val_pred = xgb_model.predict(dval)
xgb_test_pred = xgb_model.predict(dtest)


combine using linear regression 

In [10]:

# -------------------------------
# 4. Combine using Linear Regression
# -------------------------------
# Stack validation predictions for training meta-model
meta_X_train = np.column_stack((rf_val_pred, xgb_val_pred))
meta_X_test = np.column_stack((rf_test_pred, xgb_test_pred))



meta_model = LinearRegression()
meta_model.fit(meta_X_train, y_val)

final_pred = meta_model.predict(meta_X_test)

# -------------------------------
# 5. Evaluate all models
# -------------------------------
def evaluate_model(name, y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"\n✅ {name} Performance:")
    print(f"MSE : {mse:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"MAE : {mae:.4f}")
    print(f"R²  : {r2:.4f}")


evaluating performance

In [11]:

# Random Forest
evaluate_model("Random Forest", y_test, rf_test_pred)

# XGBoost
evaluate_model("XGBoost", y_test, xgb_test_pred)

# Combined (Linear Regression meta-model)
evaluate_model("Stacked Model (Linear Regression)", y_test, final_pred)



✅ Random Forest Performance:
MSE : 11775876.4346
RMSE: 3431.5997
MAE : 1343.9862
R²  : 0.9769

✅ XGBoost Performance:
MSE : 13274648.2036
RMSE: 3643.4391
MAE : 1984.4220
R²  : 0.9739

✅ Stacked Model (Linear Regression) Performance:
MSE : 10201513.1571
RMSE: 3193.9808
MAE : 1435.3660
R²  : 0.9800
