In [53]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
import statsmodels.api as sm

In [27]:
df = pd.read_csv("Walmart_sales.csv")
df.head(5)

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,1,05-02-2010,1643690.9,0,42.31,2.572,211.096358,8.106
1,1,12-02-2010,1641957.44,1,38.51,2.548,211.24217,8.106
2,1,19-02-2010,1611968.17,0,39.93,2.514,211.289143,8.106
3,1,26-02-2010,1409727.59,0,46.63,2.561,211.319643,8.106
4,1,05-03-2010,1554806.68,0,46.5,2.625,211.350143,8.106


In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6435 entries, 0 to 6434
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Store         6435 non-null   int64  
 1   Date          6435 non-null   object 
 2   Weekly_Sales  6435 non-null   float64
 3   Holiday_Flag  6435 non-null   int64  
 4   Temperature   6435 non-null   float64
 5   Fuel_Price    6435 non-null   float64
 6   CPI           6435 non-null   float64
 7   Unemployment  6435 non-null   float64
dtypes: float64(5), int64(2), object(1)
memory usage: 402.3+ KB


In [29]:
df.describe()

Unnamed: 0,Store,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
count,6435.0,6435.0,6435.0,6435.0,6435.0,6435.0,6435.0
mean,23.0,1046965.0,0.06993,60.663782,3.358607,171.578394,7.999151
std,12.988182,564366.6,0.255049,18.444933,0.45902,39.356712,1.875885
min,1.0,209986.2,0.0,-2.06,2.472,126.064,3.879
25%,12.0,553350.1,0.0,47.46,2.933,131.735,6.891
50%,23.0,960746.0,0.0,62.67,3.445,182.616521,7.874
75%,34.0,1420159.0,0.0,74.94,3.735,212.743293,8.622
max,45.0,3818686.0,1.0,100.14,4.468,227.232807,14.313


In [30]:
df['Date'] = pd.to_datetime(df['Date'],format="%d-%m-%Y")
df['Quarter'] = df['Date'].dt.quarter
df.drop('Date', axis=1, inplace=True)

df.head()


Unnamed: 0,Store,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Quarter
0,1,1643690.9,0,42.31,2.572,211.096358,8.106,1
1,1,1641957.44,1,38.51,2.548,211.24217,8.106,1
2,1,1611968.17,0,39.93,2.514,211.289143,8.106,1
3,1,1409727.59,0,46.63,2.561,211.319643,8.106,1
4,1,1554806.68,0,46.5,2.625,211.350143,8.106,1


In [31]:
df.isnull().sum()

Store           0
Weekly_Sales    0
Holiday_Flag    0
Temperature     0
Fuel_Price      0
CPI             0
Unemployment    0
Quarter         0
dtype: int64

In [34]:
X = df.drop(columns = "Weekly_Sales")
y = df['Weekly_Sales']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Linear Regression Model without Feature Scaling

In [35]:
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
n = len(y_test) 
p = X_test.shape[1]
adj_r2 = 1 - ((1 - r2) * (n - 1) / (n - p - 1))
print("Model Score:", model.score(X_test, y_test))
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R-squared:", r2)
print("Adjusted R-squared:", adj_r2)

Model Score: 0.15401253347450727
Mean Absolute Error (MAE): 432487.075150992
Mean Squared Error (MSE): 272538723964.11508
Root Mean Squared Error (RMSE): 522052.41495860845
R-squared: 0.15401253347450727
Adjusted R-squared: 0.14938242224254594


### Decision Tree Model without Feature SCALING

In [36]:
model = DecisionTreeRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mae,mse = mean_absolute_error(y_test, y_pred), mean_squared_error(y_test, y_pred)
rmse, r2 = np.sqrt(mse), r2_score(y_test, y_pred)
n,p = len(y_test), X_test.shape[1]
adj_r2 = 1 - ((1 - r2) * (n - 1) / (n - p - 1))
print("Model Score:", model.score(X_test, y_test))
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R-squared:", r2)
print("Adjusted R-squared:", adj_r2)

Model Score: 0.9058572310192494
Mean Absolute Error (MAE): 93155.58758352758
Mean Squared Error (MSE): 30328522754.407818
Root Mean Squared Error (RMSE): 174150.86205473638
R-squared: 0.9058572310192494
Adjusted R-squared: 0.9053419852156018


### Random Forest Regressor without Feature SCALING

In [33]:
model = RandomForestRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mae,mse = mean_absolute_error(y_test, y_pred), mean_squared_error(y_test, y_pred)
rmse, r2 = np.sqrt(mse), r2_score(y_test, y_pred)
n,p = len(y_test), X_test.shape[1]
adj_r2 = 1 - ((1 - r2) * (n - 1) / (n - p - 1))
print("Model Score:", model.score(X_test, y_test))
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R-squared:", r2)
print("Adjusted R-squared:", adj_r2)

Model Score: 0.933868175539245
Mean Absolute Error (MAE): 75976.83847832163
Mean Squared Error (MSE): 21304669117.589027
Root Mean Squared Error (RMSE): 145961.19045002692
R-squared: 0.933868175539245
Adjusted R-squared: 0.9335062343576771


### Feature SCALING

In [37]:
columns_to_scale = ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment']
scaler = StandardScaler()
df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])
df.head(5)

Unnamed: 0,Store,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Quarter
0,1,1643690.9,0,-0.995136,-1.7138,1.004175,0.056964,1
1,1,1641957.44,1,-1.20117,-1.766089,1.00788,0.056964,1
2,1,1611968.17,0,-1.124178,-1.840166,1.009074,0.056964,1
3,1,1409727.59,0,-0.760907,-1.737766,1.009849,0.056964,1
4,1,1554806.68,0,-0.767955,-1.598328,1.010624,0.056964,1


In [45]:
X = df.drop(columns = "Weekly_Sales")
y = df['Weekly_Sales']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Applying Models after feature SCALING and without OHE

In [46]:
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
n = len(y_test) 
p = X_test.shape[1]
adj_r2 = 1 - ((1 - r2) * (n - 1) / (n - p - 1))
print("Linear Regression Model Score:", model.score(X_test, y_test))
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R-squared:", r2)
print("Adjusted R-squared:", adj_r2)

Linear Regression Model Score: 0.15584124138436195
Mean Absolute Error (MAE): 431725.52148014476
Mean Squared Error (MSE): 271949597363.57358
Root Mean Squared Error (RMSE): 521487.86885561736
R-squared: 0.15584124138436195
Adjusted R-squared: 0.14922557713188833


In [47]:
model = DecisionTreeRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mae,mse = mean_absolute_error(y_test, y_pred), mean_squared_error(y_test, y_pred)
rmse, r2 = np.sqrt(mse), r2_score(y_test, y_pred)
n,p = len(y_test), X_test.shape[1]
adj_r2 = 1 - ((1 - r2) * (n - 1) / (n - p - 1))
print("Decision Tree Model Score:", model.score(X_test, y_test))
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R-squared:", r2)
print("Adjusted R-squared:", adj_r2)

Decision Tree Model Score: 0.9081169625647834
Mean Absolute Error (MAE): 92487.35836829837
Mean Squared Error (MSE): 29600539922.167233
Root Mean Squared Error (RMSE): 172048.07445062333
R-squared: 0.9081169625647834
Adjusted R-squared: 0.9073968760645074


In [48]:
model = RandomForestRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mae,mse = mean_absolute_error(y_test, y_pred), mean_squared_error(y_test, y_pred)
rmse, r2 = np.sqrt(mse), r2_score(y_test, y_pred)
n,p = len(y_test), X_test.shape[1]
adj_r2 = 1 - ((1 - r2) * (n - 1) / (n - p - 1))
print("Random Forest Model Score:", model.score(X_test, y_test))
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R-squared:", r2)
print("Adjusted R-squared:", adj_r2)

Random Forest Model Score: 0.9331255635017992
Mean Absolute Error (MAE): 75845.04616682202
Mean Squared Error (MSE): 21543904975.204113
Root Mean Squared Error (RMSE): 146778.42135410817
R-squared: 0.9331255635017992
Adjusted R-squared: 0.932601469171876


### One Hot Encoding

In [41]:
df['Quarter'] = pd.Categorical(df['Quarter'])
encoder = OneHotEncoder()
df_new = encoder.fit_transform(df[['Quarter']])



In [42]:
df1 = pd.DataFrame(df_new.toarray(), columns=encoder.get_feature_names_out(['Quarter']))
df = pd.concat([df, df1], axis=1)
df.drop('Quarter', axis=1, inplace=True)

df.head()

Unnamed: 0,Store,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Quarter_1,Quarter_2,Quarter_3,Quarter_4
0,1,1643690.9,0,-0.995136,-1.7138,1.004175,0.056964,1.0,0.0,0.0,0.0
1,1,1641957.44,1,-1.20117,-1.766089,1.00788,0.056964,1.0,0.0,0.0,0.0
2,1,1611968.17,0,-1.124178,-1.840166,1.009074,0.056964,1.0,0.0,0.0,0.0
3,1,1409727.59,0,-0.760907,-1.737766,1.009849,0.056964,1.0,0.0,0.0,0.0
4,1,1554806.68,0,-0.767955,-1.598328,1.010624,0.056964,1.0,0.0,0.0,0.0


In [49]:
X = df.drop(columns = "Weekly_Sales")
y = df['Weekly_Sales']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### With OHE as well

In [50]:
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
n = len(y_test) 
p = X_test.shape[1]
adj_r2 = 1 - ((1 - r2) * (n - 1) / (n - p - 1))
print("Linear Regression Model Score:", model.score(X_test, y_test))
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R-squared:", r2)
print("Adjusted R-squared:", adj_r2)

Linear Regression Model Score: 0.15584124138436195
Mean Absolute Error (MAE): 431725.52148014476
Mean Squared Error (MSE): 271949597363.57358
Root Mean Squared Error (RMSE): 521487.86885561736
R-squared: 0.15584124138436195
Adjusted R-squared: 0.14922557713188833


In [51]:
model = DecisionTreeRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mae,mse = mean_absolute_error(y_test, y_pred), mean_squared_error(y_test, y_pred)
rmse, r2 = np.sqrt(mse), r2_score(y_test, y_pred)
n,p = len(y_test), X_test.shape[1]
adj_r2 = 1 - ((1 - r2) * (n - 1) / (n - p - 1))
print("Decision Tree Model Score:", model.score(X_test, y_test))
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R-squared:", r2)
print("Adjusted R-squared:", adj_r2)

Decision Tree Model Score: 0.9068196917404261
Mean Absolute Error (MAE): 92834.128010878
Mean Squared Error (MSE): 30018461639.80009
Root Mean Squared Error (RMSE): 173258.36672380383
R-squared: 0.9068196917404261
Adjusted R-squared: 0.9060894385408996


In [52]:
model = RandomForestRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mae,mse = mean_absolute_error(y_test, y_pred), mean_squared_error(y_test, y_pred)
rmse, r2 = np.sqrt(mse), r2_score(y_test, y_pred)
n,p = len(y_test), X_test.shape[1]
adj_r2 = 1 - ((1 - r2) * (n - 1) / (n - p - 1))
print("Random Forest Model Score:", model.score(X_test, y_test))
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R-squared:", r2)
print("Adjusted R-squared:", adj_r2)

Random Forest Model Score: 0.9332096722463652
Mean Absolute Error (MAE): 75664.23195112661
Mean Squared Error (MSE): 21516808959.216553
Root Mean Squared Error (RMSE): 146686.0898627288
R-squared: 0.9332096722463652
Adjusted R-squared: 0.9326862370758822


### Backward Elimination

In [56]:
X = df.drop(columns = "Weekly_Sales")
y = df['Weekly_Sales']
def backward_elimination(X, y, significance_level=0.05):
    num_features = X.shape[1]
    for i in range(num_features):
        model = sm.OLS(y, X).fit()
        max_p_value = max(model.pvalues)
        if max_p_value > significance_level:
            max_p_value_index = model.pvalues.idxmax()
            print(f"Removing feature '{max_p_value_index}' with p-value {max_p_value}")
            X = X.drop(max_p_value_index, axis=1)
        else:
            break
    return model
final_model = backward_elimination(X_train, y_train)

print(final_model.summary())



Removing feature 'Fuel_Price' with p-value 0.8917690760412098
Removing feature 'Temperature' with p-value 0.07680684989096545
                            OLS Regression Results                            
Dep. Variable:           Weekly_Sales   R-squared:                       0.144
Model:                            OLS   Adj. R-squared:                  0.143
Method:                 Least Squares   F-statistic:                     123.9
Date:                Wed, 20 Mar 2024   Prob (F-statistic):          6.49e-169
Time:                        20:28:42   Log-Likelihood:                -75073.
No. Observations:                5148   AIC:                         1.502e+05
Df Residuals:                    5140   BIC:                         1.502e+05
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
---

In [57]:
X = df.drop(columns = "Weekly_Sales")
y = df['Weekly_Sales']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [58]:
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
n = len(y_test) 
p = X_test.shape[1]
adj_r2 = 1 - ((1 - r2) * (n - 1) / (n - p - 1))
print("Linear Regression Model Score:", model.score(X_test, y_test))
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R-squared:", r2)
print("Adjusted R-squared:", adj_r2)

Linear Regression Model Score: 0.15584124138436195
Mean Absolute Error (MAE): 431725.52148014476
Mean Squared Error (MSE): 271949597363.57358
Root Mean Squared Error (RMSE): 521487.86885561736
R-squared: 0.15584124138436195
Adjusted R-squared: 0.14922557713188833


In [59]:
model = DecisionTreeRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mae,mse = mean_absolute_error(y_test, y_pred), mean_squared_error(y_test, y_pred)
rmse, r2 = np.sqrt(mse), r2_score(y_test, y_pred)
n,p = len(y_test), X_test.shape[1]
adj_r2 = 1 - ((1 - r2) * (n - 1) / (n - p - 1))
print("Decision Tree Model Score:", model.score(X_test, y_test))
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R-squared:", r2)
print("Adjusted R-squared:", adj_r2)

Decision Tree Model Score: 0.9101774392666901
Mean Absolute Error (MAE): 91763.16846153846
Mean Squared Error (MSE): 28936747947.3265
Root Mean Squared Error (RMSE): 170108.04786172375
R-squared: 0.9101774392666901
Adjusted R-squared: 0.9094735007029495


In [60]:
model = RandomForestRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mae,mse = mean_absolute_error(y_test, y_pred), mean_squared_error(y_test, y_pred)
rmse, r2 = np.sqrt(mse), r2_score(y_test, y_pred)
n,p = len(y_test), X_test.shape[1]
adj_r2 = 1 - ((1 - r2) * (n - 1) / (n - p - 1))
print("Random Forest Model Score:", model.score(X_test, y_test))
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R-squared:", r2)
print("Adjusted R-squared:", adj_r2)

Random Forest Model Score: 0.9336442924332056
Mean Absolute Error (MAE): 75877.04310994556
Mean Squared Error (MSE): 21376794082.143993
Root Mean Squared Error (RMSE): 146208.05067486534
R-squared: 0.9336442924332056
Adjusted R-squared: 0.9331242633770395
