In [2]:
# Import required libraries

# Data handling
import pandas as pd
import numpy as np

# Data splitting
from sklearn.model_selection import train_test_split

# Preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Evaluation metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Model saving
import joblib

# Display settings
pd.set_option("display.max_columns", None)

In [3]:
# Load feature-engineered data

X = pd.read_csv("../data/processed_data/train_features.csv")

# Load target variable
y = pd.read_csv("../data/processed_data/train_target.csv").values.ravel()

# Check shapes
print("Feature matrix shape:", X.shape)
print("Target vector shape:", y.shape)

X.head()

Feature matrix shape: (844392, 24)
Target vector shape: (844392,)


Unnamed: 0,Store,DayOfWeek,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Year,Month,Week,Day,CompetitionDuration,Promo2Duration,WeekDay,IsWeekend
0,-1.732571,0.858414,-0.517732,0.0,1.113726,-0.029796,2.041038,0.582814,-0.942988,-0.535816,0.950941,0.682279,-0.997372,-0.757527,-0.997371,0.909585,1.502796,0.347258,0.858414,1.746347,0.633683,-0.622407,0.858414,-0.460344
1,-1.729462,0.858414,-0.343268,0.0,1.113726,-0.029796,2.041038,-0.884146,-0.942988,-0.625549,1.417831,0.68121,1.002635,0.09171,1.00089,-0.877596,1.502796,0.347258,0.858414,1.746347,0.814779,2.387273,0.858414,-0.460344
2,-1.726354,0.858414,0.145233,0.0,1.113726,-0.029796,2.041038,-0.884146,-0.942988,1.112703,1.651277,0.680142,1.002635,0.157036,1.001885,-0.877596,1.502796,0.347258,0.858414,1.746347,0.995875,1.785337,0.858414,-0.460344
3,-1.723246,0.858414,1.832556,0.0,1.113726,-0.029796,2.041038,0.582814,1.070916,-0.619139,0.950941,0.683348,-0.997372,-0.757527,-0.997371,0.909585,1.502796,0.347258,0.858414,1.746347,0.452588,-0.622407,0.858414,-0.460344
4,-1.720138,0.858414,-0.507763,0.0,1.113726,-0.029796,2.041038,-0.884146,-0.942988,3.135536,-0.216285,0.689761,-0.997372,-0.757527,-0.997371,0.909585,1.502796,0.347258,0.858414,1.746347,-0.633987,-0.622407,0.858414,-0.460344


In [4]:
# Split data into training and validation sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)

print("Training set size:", X_train.shape)
print("Test set size:", X_test.shape)

Training set size: (675513, 24)
Test set size: (168879, 24)


In [5]:
# Create baseline predictions
baseline_pred = np.full_like(y_test, y_train.mean())

# Evaluate baseline
baseline_mae = mean_absolute_error(y_test, baseline_pred)

baseline_mae

2292.797766448167

In [6]:
# Initialize standard scaler
standard_scaler = StandardScaler()

# Fit on training data, transform both train & test
X_train_std = standard_scaler.fit_transform(X_train)
X_test_std = standard_scaler.transform(X_test)

In [7]:
# Initialize min-max scaler
minmax_scaler = MinMaxScaler()

# Apply normalization
X_train_norm = minmax_scaler.fit_transform(X_train)
X_test_norm = minmax_scaler.transform(X_test)

In [8]:
# Initialize linear regression model
lr = LinearRegression()

# Train model
lr.fit(X_train_std, y_train)

# Predict on test set
y_pred_lr = lr.predict(X_test_std)

In [9]:
# MAE
lr_mae = mean_absolute_error(y_test, y_pred_lr)

# RMSE (manual calculation)
lr_mse = mean_squared_error(y_test, y_pred_lr)
lr_rmse = np.sqrt(lr_mse)

# RÂ² score
lr_r2 = r2_score(y_test, y_pred_lr)

lr_mae, lr_rmse, lr_r2

(1036.8490605311806, np.float64(1496.239659115665), 0.7678568840362366)

In [10]:
from sklearn.metrics import mean_squared_error
import numpy as np

baseline_mse = mean_squared_error(y_test, baseline_pred)
baseline_rmse = np.sqrt(baseline_mse)

In [11]:
# Initialize random forest model
rf = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
    n_jobs=-1
)

# Train model
rf.fit(X_train, y_train)

# Predict on test set
y_pred_rf = rf.predict(X_test)

In [12]:
# Evaluation of Random Forest model

rf_mae = mean_absolute_error(y_test, y_pred_rf)

# RMSE calculated manually 
rf_mse = mean_squared_error(y_test, y_pred_rf)
rf_rmse = np.sqrt(rf_mse)

rf_r2 = r2_score(y_test, y_pred_rf)

rf_mae, rf_rmse, rf_r2

(317.30891632470576, np.float64(476.8481769678215), 0.9764216321484067)

In [13]:
# MODEL COMPARISON 

print("Baseline MAE:", baseline_mae)
print("Linear Regression MAE:", lr_mae)
print("Random Forest MAE:", rf_mae)

Baseline MAE: 2292.797766448167
Linear Regression MAE: 1036.8490605311806
Random Forest MAE: 317.30891632470576


In [14]:
# overfiting Checking

# Predictions on training data
train_pred_rf = rf.predict(X_train)

train_mae_rf = mean_absolute_error(y_train, train_pred_rf)
test_mae_rf = mean_absolute_error(y_test, y_pred_rf)

train_mae_rf, test_mae_rf

(118.13887303427167, 317.30891632470576)

In [15]:
# Feature importance
feature_importance = pd.Series(
    rf.feature_importances_,
    index=X.columns
).sort_values(ascending=False)

feature_importance.head(10)

Customers                    0.737948
StoreType                    0.057111
CompetitionDistance          0.054619
Promo                        0.034045
Store                        0.031537
Promo2SinceWeek              0.010036
CompetitionOpenSinceMonth    0.009632
Assortment                   0.009498
CompetitionOpenSinceYear     0.008617
CompetitionDuration          0.007156
dtype: float64

In [16]:
# Save trained model
joblib.dump(rf, "../models/random_forest_model.pkl")

print("Random Forest model saved successfully.")

Random Forest model saved successfully.


In [17]:
ml_results = pd.DataFrame({
    "Model": ["Baseline", "Linear Regression", "Random Forest"],
    "MAE": [baseline_mae, lr_mae, rf_mae],
    "RMSE": [baseline_rmse, lr_rmse, rf_rmse]
})

ml_results.to_csv("../results/ml_results.csv", index=False)
print("ML results saved.")

ML results saved.


In [19]:
joblib.dump(X.columns.tolist(), "../models/feature_columns.pkl")

['../models/feature_columns.pkl']