# Data Understanding

In [63]:
#importing relevant libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
%matplotlib inline
import statsmodels.api as sm
import scipy.stats as stats
import datetime
import re

In [64]:
# Load the Walmart weekly-sales data and preview the first two rows
df = pd.read_csv("walmart.csv")
df.head(n=2)


Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,1,05-02-2010,1643690.9,0,42.31,2.572,211.096358,8.106
1,1,12-02-2010,1641957.44,1,38.51,2.548,211.24217,8.106


In [65]:
# Show the last two rows of the dataset
df.tail(2)

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
6433,45,19-10-2012,718125.53,0,56.47,3.969,192.330854,8.667
6434,45,26-10-2012,760281.43,0,58.85,3.882,192.308899,8.667


In [66]:
# Dimensions of the dataset as a (rows, columns) tuple
df.shape

(6435, 8)

In [67]:
# Summarise each column (non-null count and dtype) plus overall memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6435 entries, 0 to 6434
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Store         6435 non-null   int64  
 1   Date          6435 non-null   object 
 2   Weekly_Sales  6435 non-null   float64
 3   Holiday_Flag  6435 non-null   int64  
 4   Temperature   6435 non-null   float64
 5   Fuel_Price    6435 non-null   float64
 6   CPI           6435 non-null   float64
 7   Unemployment  6435 non-null   float64
dtypes: float64(5), int64(2), object(1)
memory usage: 402.3+ KB


In [68]:
# Count the missing values in each column
df.isna().sum()

Store           0
Date            0
Weekly_Sales    0
Holiday_Flag    0
Temperature     0
Fuel_Price      0
CPI             0
Unemployment    0
dtype: int64

In [69]:
# Count rows flagged as duplicates of an earlier row (True) versus not (False)
df.duplicated().value_counts()

False    6435
Name: count, dtype: int64

In [70]:
# List the distinct values that occur in the Weekly_Sales column
df['Weekly_Sales'].unique()


array([1643690.9 , 1641957.44, 1611968.17, ...,  734464.36,  718125.53,
        760281.43])

In [61]:

# Matches any single character that is not a word character, whitespace, "." or "-"
_SPECIAL_CHARACTER_PATTERN = re.compile(r"[^\w\s\.\-]")


def contains_special_characters(value):
    """Report whether a string holds any character other than word characters, whitespace, "." and "-".

    Parameters
    ----------
    value : str
        Text to inspect.

    Returns
    -------
    bool
        True if at least one disallowed character is present, otherwise False.
    """
    return _SPECIAL_CHARACTER_PATTERN.search(value) is not None

# Create a temporary copy of the dataframe with every value as a string for cross-checking.
validity_check = df.astype("string")

# Build a boolean mask by applying the check to every cell, one column at a time.
# DataFrame.applymap is deprecated in recent pandas releases; mapping each column
# with Series.map performs the same element-wise call without the deprecation warning.
mask = validity_check.apply(lambda column: column.map(contains_special_characters))

# Keep only the records in which at least one cell contains a special character.
result = validity_check[mask.any(axis=1)]

# Display Results
result.info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Date          0 non-null      string
 1   Weekly_Sales  0 non-null      string
 2   Holiday_Flag  0 non-null      string
 3   Temperature   0 non-null      string
 4   Fuel_Price    0 non-null      string
 5   CPI           0 non-null      string
 6   Unemployment  0 non-null      string
dtypes: string(7)
memory usage: 0.0 bytes


In [71]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,Store,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
count,6435.0,6435.0,6435.0,6435.0,6435.0,6435.0,6435.0
mean,23.0,1046965.0,0.06993,60.663782,3.358607,171.578394,7.999151
std,12.988182,564366.6,0.255049,18.444933,0.45902,39.356712,1.875885
min,1.0,209986.2,0.0,-2.06,2.472,126.064,3.879
25%,12.0,553350.1,0.0,47.46,2.933,131.735,6.891
50%,23.0,960746.0,0.0,62.67,3.445,182.616521,7.874
75%,34.0,1420159.0,0.0,74.94,3.735,212.743293,8.622
max,45.0,3818686.0,1.0,100.14,4.468,227.232807,14.313


### Data Description

1. The dataset has 6435 rows and 8 columns: 7 numeric columns and 1 object (string) column, `Date`.
2. There are no missing values.
3. The dataset has no duplicate rows.
4. The lowest temperature is -2.06 °F and the highest is 100.14 °F.

# Data Preparation

In [72]:
# Drop the store identifier, which this analysis treats as irrelevant.
# Rebinding `df` (rather than mutating it with inplace=True) together with
# errors="ignore" keeps the cell safe to re-run: if "Store" has already been
# removed the call is a no-op instead of raising KeyError.
df = df.drop(columns=["Store"], errors="ignore")


In [73]:
# Inspect the column labels currently in the dataframe
df.columns

Index(['Date', 'Weekly_Sales', 'Holiday_Flag', 'Temperature', 'Fuel_Price',
       'CPI', 'Unemployment'],
      dtype='object')

In [54]:
# Preview the first rows of the dataframe
df.head()



Unnamed: 0,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,05-02-2010,1643690.9,0,42.31,2.572,211.096358,8.106
1,12-02-2010,1641957.44,1,38.51,2.548,211.24217,8.106
2,19-02-2010,1611968.17,0,39.93,2.514,211.289143,8.106
3,26-02-2010,1409727.59,0,46.63,2.561,211.319643,8.106
4,05-03-2010,1554806.68,0,46.5,2.625,211.350143,8.106


In [55]:
# Parse the day-month-year strings in "Date" into pandas datetime values
parsed_dates = pd.to_datetime(df["Date"], format="%d-%m-%Y")
df["Date"] = parsed_dates


In [74]:
# Summary statistics for the numeric columns that remain in the dataframe
df.describe()

Unnamed: 0,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
count,6435.0,6435.0,6435.0,6435.0,6435.0,6435.0
mean,1046965.0,0.06993,60.663782,3.358607,171.578394,7.999151
std,564366.6,0.255049,18.444933,0.45902,39.356712,1.875885
min,209986.2,0.0,-2.06,2.472,126.064,3.879
25%,553350.1,0.0,47.46,2.933,131.735,6.891
50%,960746.0,0.0,62.67,3.445,182.616521,7.874
75%,1420159.0,0.0,74.94,3.735,212.743293,8.622
max,3818686.0,1.0,100.14,4.468,227.232807,14.313


The precise degree Fahrenheit is less significant for studying consumer turnout and sales than the general climatic condition given by the temperature value, which influences customers' demands and likelihood to go shopping.

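Therefore, we will group the temperature data (in degrees Fahrenheit) into 4 temperature-based "seasons". Each band includes its upper bound:
1. Summer (above 60 degrees)
2. Autumn (above 40, up to 60 degrees)
3. Winter (20 degrees or below)
4. Spring (above 20, up to 40 degrees)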



In [76]:
# Season mapping: season name -> [numeric code, (lower bound, upper bound)].
# A temperature belongs to a season when lower < temperature <= upper.
# The outermost bounds are infinite so that the coldest and hottest readings are
# always classified ("winter" at 20 and below, "summer" above 60) rather than
# silently falling outside every range.
season_map = {
    "summer": [1, (60, float("inf"))],
    "autumn": [2, (40, 60)],
    "winter": [3, (float("-inf"), 20)],
    "spring": [4, (20, 40)],
}

In [78]:
# Function to Map Season Conditions
def map_season(value, mapping=None):
    """Return the name of the season whose temperature range contains ``value``.

    Parameters
    ----------
    value : float
        Temperature to classify.
    mapping : dict, optional
        Season lookup of the form ``{name: [code, (lower, upper)]}``. Defaults to
        the notebook-level ``season_map``.

    Returns
    -------
    str or None
        The first season name for which ``lower < value <= upper``; ``None`` when
        no range matches (for example when ``value`` is NaN).
    """
    if mapping is None:
        mapping = season_map
    for name, entry in mapping.items():
        lower, upper = entry[1][0], entry[1][1]
        # Plain chained comparison instead of a bitwise "&" between two booleans.
        if lower < value <= upper:
            return name
    return None

def map_season_numerically(value, mapping=None):
    """Return the numeric code of the season whose temperature range contains ``value``.

    Parameters
    ----------
    value : float
        Temperature to classify.
    mapping : dict, optional
        Season lookup of the form ``{name: [code, (lower, upper)]}``. Defaults to
        the notebook-level ``season_map``.

    Returns
    -------
    int or None
        The code of the first season for which ``lower < value <= upper``;
        ``None`` when no range matches (for example when ``value`` is NaN).
    """
    if mapping is None:
        mapping = season_map
    for entry in mapping.values():
        code = entry[0]
        lower, upper = entry[1][0], entry[1][1]
        # Plain chained comparison instead of a bitwise "&" between two booleans.
        if lower < value <= upper:
            return code
    return None

In [86]:
# Label each record with the season name matching its temperature
temperatures = df["Temperature"]
df["season"] = temperatures.apply(map_season)

# Apply Numerical Weather Mapping to Dataframe
df["season_num"] = temperatures.apply(map_season_numerically)

# Compare the raw temperature with the two derived columns
df.loc[:, ["Temperature", "season", "season_num"]].head()


Unnamed: 0,Temperature,season,season_num
0,42.31,autumn,2
1,38.51,spring,4
2,39.93,spring,4
3,46.63,autumn,2
4,46.5,autumn,2


In [87]:
# Show the last rows of the temperature column next to the derived season columns
df[["Temperature", "season", "season_num"]].tail()

Unnamed: 0,Temperature,season,season_num
6430,64.88,summer,1
6431,64.89,summer,1
6432,54.47,autumn,2
6433,56.47,autumn,2
6434,58.85,autumn,2


In [88]:
# Count how many records fall into each season label
df["season"].value_counts()

season
summer    3498
autumn    1948
spring     903
winter      86
Name: count, dtype: int64

In [90]:
# Preview the first rows, now including the season and season_num columns
df.head()

Unnamed: 0,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,season,season_num
0,05-02-2010,1643690.9,0,42.31,2.572,211.096358,8.106,autumn,2
1,12-02-2010,1641957.44,1,38.51,2.548,211.24217,8.106,spring,4
2,19-02-2010,1611968.17,0,39.93,2.514,211.289143,8.106,spring,4
3,26-02-2010,1409727.59,0,46.63,2.561,211.319643,8.106,autumn,2
4,05-03-2010,1554806.68,0,46.5,2.625,211.350143,8.106,autumn,2
