In [14]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Custom libraries/methods
from libs import split_data, evaluate_model

In [15]:
# Read the preprocessed, ML-ready table from disk and preview its first five rows
df = pd.read_csv(filepath_or_buffer="data/preprocessed/main_ML_ready.csv")
df.head(n=5)

Unnamed: 0,Store,Temperature,Fuel_Price,CPI,Unemployment,IsHoliday,Dept,Weekly_Sales,Store_Size,Year,Month,Day,Week,Store_Type_A,Store_Type_B,Store_Type_C
0,1,-0.963831,-1.72052,1.018422,0.078331,False,1,24924.5,0.238802,2010,2,5,5,True,False,False
1,1,-0.963831,-1.72052,1.018422,0.078331,False,2,50605.27,0.238802,2010,2,5,5,True,False,False
2,1,-0.963831,-1.72052,1.018422,0.078331,False,3,13740.12,0.238802,2010,2,5,5,True,False,False
3,1,-0.963831,-1.72052,1.018422,0.078331,False,4,39954.04,0.238802,2010,2,5,5,True,False,False
4,1,-0.963831,-1.72052,1.018422,0.078331,False,5,32229.38,0.238802,2010,2,5,5,True,False,False


### Split Data

In [16]:
# Separate features from the "Weekly_Sales" target and hold out a test partition,
# using the project helper imported from libs
X_train, X_test, y_train, y_test = split_data(df, target_column="Weekly_Sales")

# Emit one "<caption> <shape>" line per partition (train pair first, then test pair)
print(
    *(
        f"{caption} {subset.shape}"
        for caption, subset in (
            ("X_train shape: ", X_train),
            ("y_train shape: ", y_train),
            ("X_test shape: ", X_test),
            ("y_test shape: ", y_test),
        )
    ),
    sep="\n",
)

X_train shape:  (336169, 15)
y_train shape:  (336169,)
X_test shape:  (84043, 15)
y_test shape:  (84043,)


# 2. Random Forest

### 2.1 Train Random Forest Base Model

In [22]:
# Baseline forest: 50 trees with a fixed seed, fitted on the training partition
# (fit() returns the estimator itself, so construction and fitting can be chained)
rf = RandomForestRegressor(n_estimators=50, random_state=42).fit(X_train, y_train)

# Score the held-out partition with mean absolute error
y_pred_rf = rf.predict(X_test)
mae_rf = mean_absolute_error(y_true=y_test, y_pred=y_pred_rf)
print(f"Random Forest MAE: {mae_rf}")

# Fit quality on the training partition, as a percentage rounded to 3 decimals.
# NOTE: for a regressor, .score() is the coefficient of determination (R²), so the
# "accuracy" reported below is training-set R² x 100, not a classification accuracy.
acc_rf = round(100 * rf.score(X_train, y_train), 3)
print(f"Training Accuracy: {acc_rf} %")

Random Forest MAE: 1304.7614635484215
Training Accuracy: 99.675 %


### 2.2 Hyperparameter Tuning using Grid Search + Cross-Validation

In [18]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Untuned estimator that the search clones for every candidate; seed fixed at 42
rf = RandomForestRegressor(random_state=42)

# Search space: 3 x 3 x 3 = 27 hyperparameter combinations
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 10],
)

# Exhaustive search with 3-fold cross-validation (27 candidates x 3 folds = 81 fits).
# Candidates are ranked by negated MAE, so scores are <= 0 and closer to 0 is better.
grid_search = GridSearchCV(
    rf,
    param_grid,
    scoring="neg_mean_absolute_error",
    cv=3,
    n_jobs=-1,   # run the fits on every available core
    verbose=2,   # log progress while the search runs
)

# Run the search on the training partition only
grid_search.fit(X_train, y_train)

# Keep the winning estimator for the evaluation cells that follow
best_rf_model = grid_search.best_estimator_

# Report the winning combination and its mean cross-validated score
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Score: {grid_search.best_score_}")

Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Best Cross-Validation Score: -1414.7980392785128


In [21]:
# Training-set fit of the tuned model, as a percentage rounded to 3 decimals.
# NOTE: .score() on a regressor is R², so this "accuracy" is training R² x 100.
train_r2_tuned = best_rf_model.score(X_train, y_train)
acc_rf = round(100 * train_r2_tuned, 3)
print(f"Training Accuracy: {acc_rf} %")

Training Accuracy: 99.709 %


### 2.3 Evaluate final model

In [19]:
# Predict the held-out partition with the tuned forest, then summarise the errors
# with the project's evaluate_model helper and print whatever it returns
final_predictions = best_rf_model.predict(X_test)
final_metrics = evaluate_model(y_test, final_predictions)
print(final_metrics)

{'MAE': 1290.624812325833, 'RMSE': 3195.258480409026, 'R²': 0.9801034875340918}


### 2.4 Feature Importance for Random Forest

In [20]:
# Pair each training column with the tuned forest's importance score for it
importances = best_rf_model.feature_importances_
feature_importance_df = pd.DataFrame(
    {"Feature": X_train.columns, "Importance": importances}
)

# Rank from most to least important, keeping the original positional index labels
feature_importance_df = feature_importance_df.sort_values(
    "Importance", ascending=False
)

print(feature_importance_df)


         Feature  Importance
6           Dept    0.628948
7     Store_Size    0.190512
0          Store    0.055963
11          Week    0.039441
3            CPI    0.027335
4   Unemployment    0.011608
13  Store_Type_B    0.010644
1    Temperature    0.009739
10           Day    0.008989
9          Month    0.007139
2     Fuel_Price    0.004234
12  Store_Type_A    0.003441
5      IsHoliday    0.001028
8           Year    0.000622
14  Store_Type_C    0.000357
