# Modeling & Evaluation

In [1]:
# lets import needed libraries

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
# Load Dataset

product_monthly = pd.read_csv("../data/processed/feature_engineered_sales.csv", parse_dates=['Month'])
product_monthly.head()


Unnamed: 0,Product,Month,Sales,Quantity Ordered,Month_Num,Quarter,Lag_1,Lag_2,Rolling_3
0,20in Monitor,2019-04-01,43226.07,393.0,4,2,35856.74,27057.54,28854.043333
1,20in Monitor,2019-05-01,37506.59,341.0,5,2,43226.07,35856.74,35380.116667
2,20in Monitor,2019-06-01,35416.78,322.0,6,2,37506.59,43226.07,38863.133333
3,20in Monitor,2019-07-01,35966.73,327.0,7,3,35416.78,37506.59,38716.48
4,20in Monitor,2019-08-01,28707.39,261.0,8,3,35966.73,35416.78,36296.7


### Train-Test Split (Time-Aware, Across Products)

In [3]:
# Here lets define features and target variable

X = product_monthly.drop(columns=['Sales', 'Month'])
y = product_monthly['Sales']

In [4]:
# here we encode the product names

X = pd.get_dummies(X, columns=['Product'], drop_first=True)

In [5]:
# Time- based split

split_date = product_monthly['Month'].quantile(0.8)

train_idx = product_monthly['Month'] <= split_date
test_idx = product_monthly['Month'] > split_date

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]


In [6]:
# Let's train Models

lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)


In [7]:
# predictions

lr_preds = lr_model.predict(X_test)
rf_preds = rf_model.predict(X_test)

In [8]:
# Lets evaluate the models

lr_mae = mean_absolute_error(y_test, lr_preds)
lr_rmse = np.sqrt(mean_squared_error(y_test, lr_preds))
lr_r2 = r2_score(y_test, lr_preds)

rf_mae = mean_absolute_error(y_test, rf_preds)
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_preds))
rf_r2 = r2_score(y_test, rf_preds)


In [9]:
# Let's save predictions and metrics

results_df = pd.DataFrame({
    "Month": product_monthly.loc[test_idx, "Month"],
    "Actual Sales": y_test,
    "LR_Predicted": lr_preds,
    "RF_Predicted": rf_preds
})

results_df.to_csv("../data/processed/model_predictions.csv", index=False)

metrics_df = pd.DataFrame({
    "Model": ["Linear Regression", "Random Forest"],
    "MAE": [lr_mae, rf_mae],
    "RMSE": [lr_rmse, rf_rmse],
    "R²": [lr_r2, rf_r2]
})

metrics_df.to_csv("../data/processed/model_metrics.csv", index=False)

metrics_df

Unnamed: 0,Model,MAE,RMSE,R²
0,Linear Regression,99272.353488,137304.75677,0.741726
1,Random Forest,58033.208879,102825.751292,0.855152
