In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor

# Custom libraries/methods
from libs import split_data, evaluate_model

In [10]:
# Load dataset
df = pd.read_csv("data/preprocessed/main_ML_ready.csv")
df.head()

Unnamed: 0,Temperature,Fuel_Price,CPI,Unemployment,IsHoliday,Dept,Weekly_Sales,Store_Size,Month,Day,...,Store_36,Store_37,Store_38,Store_39,Store_40,Store_41,Store_42,Store_43,Store_44,Store_45
0,-0.963831,-1.72052,1.018422,0.078331,False,1,24924.5,0.238802,2,5,...,False,False,False,False,False,False,False,False,False,False
1,-0.963831,-1.72052,1.018422,0.078331,False,2,50605.27,0.238802,2,5,...,False,False,False,False,False,False,False,False,False,False
2,-0.963831,-1.72052,1.018422,0.078331,False,3,13740.12,0.238802,2,5,...,False,False,False,False,False,False,False,False,False,False
3,-0.963831,-1.72052,1.018422,0.078331,False,4,39954.04,0.238802,2,5,...,False,False,False,False,False,False,False,False,False,False
4,-0.963831,-1.72052,1.018422,0.078331,False,5,32229.38,0.238802,2,5,...,False,False,False,False,False,False,False,False,False,False


### Split Data

In [11]:
# Split the data
X_train, X_test, y_train, y_test = split_data(df, target_column="Weekly_Sales")
print("X_train shape: ", X_train.shape)
print("y_train shape: ", y_train.shape)
print("X_test shape: ", X_test.shape)
print("y_test shape: ", y_test.shape)

X_train shape:  (336169, 58)
y_train shape:  (336169,)
X_test shape:  (84043, 58)
y_test shape:  (84043,)


# 1. Decision Tree model

### 1.1 Train Decision Tree model

In [4]:
# Initialize the Decision Tree Regressor
dt_model = DecisionTreeRegressor(
    random_state=42,  # For reproducibility
    max_depth=None    # Default depth (unrestricted)
)

# Train the model
dt_model.fit(X_train, y_train)

In [5]:
# Make predictions on the test set
y_pred = dt_model.predict(X_test)
evaluate_model(y_test, y_pred)

Mean Absolute Error (MAE): 1712.99
Root Mean Squared Error (RMSE): 4123.78
R² Score: 0.97


### 1.2 Hyperparameter Tuning

In [6]:
# Define the hyperparameter grid
param_grid = {
    'max_depth': [4, 6, 8, 10, None],   # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],   # Minimum samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],     # Minimum samples required to be at a leaf node
}

# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=DecisionTreeRegressor(random_state=42),
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',  # Consistent with XGBoost scoring
    cv=3,                               # 3-fold cross-validation
    verbose=2,
    n_jobs=-1
)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score (Negative MAE):", grid_search.best_score_)


Fitting 3 folds for each of 45 candidates, totalling 135 fits
Best Parameters: {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 10}
Best Cross-Validation Score (Negative MAE): -1766.350121428666


### 1.3 Evaluate the Tuned Model

In [12]:
# # As a backup of best hyperparameter set
# best_dt_model = DecisionTreeRegressor(
#     random_state=42,
#     max_depth=None,
#     min_samples_leaf=4,
#     min_samples_split=10
# )
# 
# best_dt_model.fit(X_train, y_train)

In [8]:
# Retrieve the best model
best_dt_model = grid_search.best_estimator_

# Calculate evaluation metrics with the best model
y_pred = best_dt_model.predict(X_test)
is_holiday_test = df.loc[X_test.index, 'IsHoliday']
evaluate_model(y_test, y_pred, is_holiday_test)

Mean Absolute Error (MAE): 1599.52
Root Mean Squared Error (RMSE): 3916.31
R² Score: 0.97
Weighted Mean Absolute Error (WMAE): 1813.78
