In [22]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Custom libraries/methods
from libs import split_data, evaluate_model

In [23]:
# Read the preprocessed, model-ready dataset from disk and preview its first rows
DATA_PATH = "data/preprocessed/main_ML_ready.csv"
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Store,Temperature,Fuel_Price,CPI,Unemployment,IsHoliday,Dept,Weekly_Sales,Store_Size,Month,Day,Week,Store_Type_A,Store_Type_B,Store_Type_C
0,1,-0.963831,-1.72052,1.018422,0.078331,False,1,24924.5,0.238802,2,5,5,True,False,False
1,1,-0.963831,-1.72052,1.018422,0.078331,False,2,50605.27,0.238802,2,5,5,True,False,False
2,1,-0.963831,-1.72052,1.018422,0.078331,False,3,13740.12,0.238802,2,5,5,True,False,False
3,1,-0.963831,-1.72052,1.018422,0.078331,False,4,39954.04,0.238802,2,5,5,True,False,False
4,1,-0.963831,-1.72052,1.018422,0.078331,False,5,32229.38,0.238802,2,5,5,True,False,False


In [24]:
# One-hot encode the categorical identifier columns so the linear model treats
# them as categories rather than as numeric magnitudes.
CATEGORICAL_COLS = ['Dept', 'Store']

# This cell rebinds `df`, and get_dummies replaces the listed columns with their
# indicator columns. Without a guard, running the cell a second time raises a
# KeyError because 'Dept' and 'Store' no longer exist. Skip the encoding only
# when *none* of them remain (assumed to mean the frame is already encoded);
# a partially missing set still fails loudly inside get_dummies.
if any(col in df.columns for col in CATEGORICAL_COLS):
    df = pd.get_dummies(df, columns=CATEGORICAL_COLS)

### Split Data

In [25]:
# Partition the encoded frame into train/test features and targets
X_train, X_test, y_train, y_test = split_data(df, target_column="Weekly_Sales")

# Sanity check: report the dimensions of every partition, in a fixed order
partitions = {
    "X_train shape: ": X_train,
    "y_train shape: ": y_train,
    "X_test shape: ": X_test,
    "y_test shape: ": y_test,
}
for caption, part in partitions.items():
    print(caption, part.shape)

X_train shape:  (336169, 138)
y_train shape:  (336169,)
X_test shape:  (84043, 138)
y_test shape:  (84043,)


# Linear Regression model (baseline)

### Train Linear Regression model and predict

In [26]:
# Fit the baseline linear model on the training split (fit returns the estimator itself)
lr = LinearRegression().fit(X_train, y_train)

# Predict weekly sales for the held-out rows
y_pred = lr.predict(X_test)

# Fetch the holiday flag for exactly the test rows by matching on their index labels.
# NOTE(review): presumably evaluate_model uses this flag to weight the WMAE it reports — confirm in libs.
is_holiday_test = df['IsHoliday'].loc[X_test.index]

# Report the evaluation metrics for the test predictions
evaluate_model(y_test, y_pred, is_holiday_test)

Mean Absolute Error (MAE): 8124.82
Root Mean Squared Error (RMSE): 13118.12
R² Score: 0.66
Weighted Mean Absolute Error (WMAE): 8318.84
