# **<font color=#8e44ad>Walmart Sales Prediction</font>**

Data is from a Kaggle dataset available here: https://www.kaggle.com/datasets/aslanahmedov/walmart-sales-forecast?select=features.csv

Credit goes to Aslan Ahmedov for collecting the data and making it publicly available.

The goal of this notebook is to predict the weekly sales of a given store on a given date.

## **<font color=#8e44ad>Loading Data</font>**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
sys.path.append('..')
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Load the raw training table from the repository's data directory.
# NOTE(review): the saved output of this cell echoes the source line, which is
# how Python renders a warning -- presumably a pandas DtypeWarning about
# mixed-type columns. Confirm, and if so pass explicit `dtype=` here.
data = pd.read_csv('../data/train.csv')

  data = pd.read_csv('../data/train.csv')


## **<font color=#8e44ad>Feature Engineering</font>**

In [3]:
def filling_na(df, columns):
    """Replace missing values with 0 in the selected columns of ``df``.

    The frame is modified in place and also returned, so the call can be
    used either for its side effect or as an expression.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are to be filled.
    columns : iterable of column labels
        Columns of ``df`` in which NaN/None values are replaced by 0.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    for col in columns:
        # Assign the filled column back instead of calling
        # `df[col].fillna(..., inplace=True)`: that form operates on an
        # intermediate Series, is deprecated in pandas 2.x and leaves `df`
        # untouched once Copy-on-Write is active.
        df[col] = df[col].fillna(value=0)
    return df

In [4]:
# Zero-fill missing values in every column of the raw table.
# NOTE(review): because all columns are passed, non-numeric ones such as
# 'Date' and 'Type' also get 0 for missing entries -- confirm this is intended.
data = filling_na(data, columns=data.columns)

In [5]:
def create_time_feature(df):
    """Parse ``df['Date']`` and add calendar features derived from it.

    The frame is modified in place and also returned. ``'Date'`` is
    converted with ``errors='coerce'``, so values that cannot be parsed
    become ``NaT`` and their derived features are missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``'Date'`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``'Date'`` as datetimes plus the columns
        ``dayofmonth``, ``dayofweek``, ``quarter``, ``month``, ``year``,
        ``dayofyear`` and ``weekofyear``.
    """
    # Parse the argument's own column. The previous version read the
    # notebook-level `data` frame here, which only worked when `df` happened
    # to be that same object.
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

    dates = df['Date'].dt
    df['dayofmonth'] = dates.day
    df['dayofweek'] = dates.dayofweek
    df['quarter'] = dates.quarter
    df['month'] = dates.month
    df['year'] = dates.year
    df['dayofyear'] = dates.dayofyear
    # ISO-8601 week number, so days around New Year may belong to the
    # neighbouring year's week 52/53 or week 1.
    df['weekofyear'] = dates.isocalendar().week
    return df

In [6]:
# Preview the first rows of the zero-filled table (head() shows 5 by default).
data.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday,Type,Size
0,1,1,2010-02-05,24924.5,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,False,A,151315.0
1,1,2,2010-02-05,50605.27,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,False,A,151315.0
2,1,3,2010-02-05,13740.12,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,False,A,151315.0
3,1,4,2010-02-05,39954.04,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,False,A,151315.0
4,1,5,2010-02-05,32229.38,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,False,A,151315.0


In [7]:
# create_time_feature adds the calendar columns to the frame it receives and
# returns that same frame, so `df` and `data` name one object from here on.
df = create_time_feature(data)
df.head(5)

Unnamed: 0,Store,Dept,Date,Weekly_Sales,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,...,IsHoliday,Type,Size,dayofmonth,dayofweek,quarter,month,year,dayofyear,weekofyear
0,1,1,2010-02-05,24924.5,42.31,2.572,0.0,0.0,0.0,0.0,...,False,A,151315.0,5.0,4.0,1.0,2.0,2010.0,36.0,5
1,1,2,2010-02-05,50605.27,42.31,2.572,0.0,0.0,0.0,0.0,...,False,A,151315.0,5.0,4.0,1.0,2.0,2010.0,36.0,5
2,1,3,2010-02-05,13740.12,42.31,2.572,0.0,0.0,0.0,0.0,...,False,A,151315.0,5.0,4.0,1.0,2.0,2010.0,36.0,5
3,1,4,2010-02-05,39954.04,42.31,2.572,0.0,0.0,0.0,0.0,...,False,A,151315.0,5.0,4.0,1.0,2.0,2010.0,36.0,5
4,1,5,2010-02-05,32229.38,42.31,2.572,0.0,0.0,0.0,0.0,...,False,A,151315.0,5.0,4.0,1.0,2.0,2010.0,36.0,5


In [8]:
# Order rows by store, then chronologically. cpi_difference below subtracts
# each row's CPI from the next row's, so the row order set here determines
# which values are compared. The sort is in place, so every other name bound
# to this frame sees the new order too.
df.sort_values(by=['Store', 'Date'], ascending=True, inplace=True)

In [9]:
def cpi_difference(df):
    """Add ``'CPI_Difference'``: the change in CPI from the previous row.

    Differences are positional (row ``i`` minus row ``i - 1`` in the frame's
    current order), so the caller is responsible for sorting beforehand. The
    first row has no predecessor and gets 0. The frame is modified in place
    and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``'CPI'`` column convertible to float.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the new ``'CPI_Difference'`` column.
    """
    # Vectorised replacement for the former per-row `iloc` loop.
    differences = df['CPI'].astype(float).diff()

    # diff() leaves the first entry missing; the original contract is 0 there.
    # Guarding on emptiness also lets an empty frame pass through instead of
    # failing with a length mismatch.
    if not differences.empty:
        differences.iloc[0] = 0.0

    # Assign by position so that a shuffled or non-unique index cannot
    # re-align the values.
    df['CPI_Difference'] = differences.to_numpy()
    return df

In [10]:
# Add the row-to-row CPI change; cpi_difference returns the frame it was
# given, so `final_data` is another name for `df`.
final_data = cpi_difference(df)

In [11]:
# Second zero-fill pass, now covering the engineered columns as well (the
# date-derived features are missing wherever 'Date' could not be parsed).
# `X` is the same object as `final_data`.
X = filling_na(final_data, columns=final_data.columns)

In [12]:
# List the available columns before choosing which ones to drop for modelling.
X.columns

Index(['Store', 'Dept', 'Date', 'Weekly_Sales', 'Temperature', 'Fuel_Price',
       'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI',
       'Unemployment', 'IsHoliday', 'Type', 'Size', 'dayofmonth', 'dayofweek',
       'quarter', 'month', 'year', 'dayofyear', 'weekofyear',
       'CPI_Difference'],
      dtype='object')

In [13]:
# Feature matrix: everything except the target and the raw columns this
# baseline leaves out.
feature_frame = X.drop(columns=['Store','Weekly_Sales', 'Date', 'Fuel_Price', 'CPI', 'Unemployment', 'Type'])
target = X['Weekly_Sales']

# Seed the split so that Restart & Run All reproduces the same partition and
# therefore the same scores; 0 matches the seed given to the random forest.
X_train, X_test, y_train, y_test = train_test_split(feature_frame, target, random_state=0)

# NOTE(review): every remaining column is one-hot encoded, including ones that
# look continuous (Size, MarkDown1-5, CPI_Difference). With
# handle_unknown='ignore', values not seen during fit are encoded as all
# zeros -- confirm this treatment is intended for the numeric columns.
encoder = OneHotEncoder(handle_unknown='ignore')
X_train_encoded = encoder.fit_transform(X_train)
X_test_encoded = encoder.transform(X_test)

In [14]:
# Depth-limited random forest; the fixed seed makes the fitted trees
# reproducible for a given training set.
model = RandomForestRegressor(
    random_state=0,
    max_depth=10,
)

model.fit(X=X_train_encoded, y=y_train)

In [15]:
# Training split: keep the predictions and report the estimator's score
# (for scikit-learn regressors, `score` is the R^2 coefficient).
y_pred_train_RF = model.predict(X_train_encoded)
print(f"This is the score for the train: {model.score(X_train_encoded, y_train)}")

# Held-out split: same two steps.
y_pred_test_RF = model.predict(X_test_encoded)
print(f"This is the score for the test: {model.score(X_test_encoded, y_test)}")

This is the score for the train: 0.6048127313292896
This is the score for the test: 0.6008618925060373


## **<font color=#8e44ad>Using Python files</font>**

In [16]:
import sys
sys.path.append('..')
# Import from the package itself, not from its `__init__` submodule: the
# latter makes Python load `__init__.py` a second time under the separate
# module name `sales_prediction.__init__`.
from sales_prediction import TRAIN_FEATURES, TEST_FEATURES, MODEL_BASE_PATH
from sales_prediction.training import build_model
# NOTE(review): the star import hides which preprocessing helpers this
# notebook relies on and may shadow the functions defined in earlier cells --
# replace with explicit names once the required ones are known.
from sales_prediction.preprocessing import *
from sales_prediction.inference import make_predictions

In [17]:
# Reload the untouched training table: the frame previously bound to `data`
# was modified in place by the feature-engineering cells above.
data = pd.read_csv('../data/train.csv')

  data = pd.read_csv('../data/train.csv')


In [18]:
# Train through the packaged pipeline and keep the metrics it returns.
# NOTE(review): despite the variable name, the saved output of the next cell
# shows 'Train accuracy score' / 'Test accuracy score' values around 0.60,
# close to the `model.score` results printed earlier -- presumably R^2, not
# an RMSE. Confirm against build_model and rename accordingly.
RMSE = build_model(data)

In [19]:
# Show the metrics dictionary returned by build_model.
print(RMSE)

{'Train accuracy score': 0.6055356170505113, 'Test accuracy score': 0.6019835590657951}


In [20]:
# Load the first part of the held-out data used for inference.
# NOTE(review): the saved output echoes this source line, which is how a
# Python warning is rendered -- presumably a pandas DtypeWarning; confirm.
test_set = pd.read_csv('../data/test_part_1.csv')

  test_set = pd.read_csv('../data/test_part_1.csv')


In [21]:
# Run the packaged inference step on the held-out rows.
predictions = make_predictions(test_set)

In [22]:
# Display the returned predictions.
# NOTE(review): in the saved output most of the visible entries are the same
# value (10849.86...) -- worth checking that inference applies the same
# preprocessing and fitted encoder as training.
predictions

{'Here are our predictions using Random Forest': array([10849.86309946, 46262.26402023, 10849.86309946, ...,
        10849.86309946, 10849.86309946, 10849.86309946])}