## Sales Performance Analysis with Walmart Data

In [10]:
from pathlib import Path

import pandas as pd

In [14]:
# Base folder that holds the three Walmart CSV files read below.
# NOTE: this is a machine-specific absolute path; point it at wherever the
# dataset lives locally before running the notebook.
path ='C:/Users/bbuser/Desktop/DataScience-Brain-Bytes/Team_members/from_alzahra/data/walmart-recruiting-store-sales-forecasting--'
data_dir = Path(path)

# Join folder and file name with pathlib so the separator is handled for us
# (the earlier f-strings produced a doubled "//" between folder and file).
train = pd.read_csv(data_dir / "train.csv")        # weekly sales rows
features = pd.read_csv(data_dir / "features.csv")  # per-store, per-date features
stores = pd.read_csv(data_dir / "stores.csv")      # one row of attributes per store


In [15]:
# Quick look at the first rows of every loaded table, each under its own label
previews = (
    ("Sales Data:\n", train),
    ("Features Data:\n", features),
    ("Stores Data:\n", stores),
)
for label, frame in previews:
    print(label, frame.head())

Sales Data:
    Store  Dept        Date  Weekly_Sales  IsHoliday
0      1     1  2010-02-05      24924.50      False
1      1     1  2010-02-12      46039.49       True
2      1     1  2010-02-19      41595.55      False
3      1     1  2010-02-26      19403.54      False
4      1     1  2010-03-05      21827.90      False
Features Data:
    Store        Date  Temperature  Fuel_Price  MarkDown1  MarkDown2  \
0      1  2010-02-05        42.31       2.572        NaN        NaN   
1      1  2010-02-12        38.51       2.548        NaN        NaN   
2      1  2010-02-19        39.93       2.514        NaN        NaN   
3      1  2010-02-26        46.63       2.561        NaN        NaN   
4      1  2010-03-05        46.50       2.625        NaN        NaN   

   MarkDown3  MarkDown4  MarkDown5         CPI  Unemployment  IsHoliday  
0        NaN        NaN        NaN  211.096358         8.106      False  
1        NaN        NaN        NaN  211.242170         8.106       True  
2        N

#### Aggregation with groupby():

In [19]:
# Sum of Weekly_Sales for each store
total_sales_per_store = train.groupby("Store")["Weekly_Sales"].agg("sum")
print("Total Sales:\n", total_sales_per_store)

# Mean of Weekly_Sales for each department
avg_sales_per_dept = train.groupby("Dept")["Weekly_Sales"].agg("mean")
print("Average Sales:\n", avg_sales_per_dept)

Total Sales:
 Store
1     2.224028e+08
2     2.753824e+08
3     5.758674e+07
4     2.995440e+08
5     4.547569e+07
6     2.237561e+08
7     8.159828e+07
8     1.299512e+08
9     7.778922e+07
10    2.716177e+08
11    1.939628e+08
12    1.442872e+08
13    2.865177e+08
14    2.889999e+08
15    8.913368e+07
16    7.425243e+07
17    1.277821e+08
18    1.551147e+08
19    2.066349e+08
20    3.013978e+08
21    1.081179e+08
22    1.470756e+08
23    1.987506e+08
24    1.940160e+08
25    1.010612e+08
26    1.434164e+08
27    2.538559e+08
28    1.892637e+08
29    7.714155e+07
30    6.271689e+07
31    1.996139e+08
32    1.668192e+08
33    3.716022e+07
34    1.382498e+08
35    1.315207e+08
36    5.341221e+07
37    7.420274e+07
38    5.515963e+07
39    2.074455e+08
40    1.378703e+08
41    1.813419e+08
42    7.956575e+07
43    9.056544e+07
44    4.329309e+07
45    1.123953e+08
Name: Weekly_Sales, dtype: float64
Average Sales:
 Dept
1     19213.485088
2     43607.020113
3     11793.698516
4     25974.

#### Merging DataFrames

In [21]:
# Attach the columns of `stores` to every sales row, matching on "Store".
# how="left" keeps every row of `train`, even if a store has no match.
# validate="many_to_one" makes pandas raise a MergeError if `stores` ever
# contains a repeated Store key, which would otherwise silently duplicate
# sales rows and inflate any later totals.
train_merged = pd.merge(
    train,
    stores,
    on="Store",
    how="left",
    validate="many_to_one",
)
print(train_merged.head())

   Store  Dept        Date  Weekly_Sales  IsHoliday Type    Size
0      1     1  2010-02-05      24924.50      False    A  151315
1      1     1  2010-02-12      46039.49       True    A  151315
2      1     1  2010-02-19      41595.55      False    A  151315
3      1     1  2010-02-26      19403.54      False    A  151315
4      1     1  2010-03-05      21827.90      False    A  151315


#### Concatenating Weekly Data

In [23]:
# Parse 'Date' into datetime values so it can be compared against a date
train["Date"] = pd.to_datetime(train["Date"])

# Rows dated before the cutoff form the first part; the cutoff onward, the second
cutoff = "2011-01-01"
first_half = train.loc[train["Date"] < cutoff]
second_half = train.loc[train["Date"] >= cutoff]

# Stack the two parts into one frame again (original index labels are kept)
combined_data = pd.concat([first_half, second_half])

# Sanity check: dimensions and the first few rows of the recombined frame
print("Combined shape:", combined_data.shape)
print(combined_data.head())

Combined shape: (421570, 5)
   Store  Dept       Date  Weekly_Sales  IsHoliday
0      1     1 2010-02-05      24924.50      False
1      1     1 2010-02-12      46039.49       True
2      1     1 2010-02-19      41595.55      False
3      1     1 2010-02-26      19403.54      False
4      1     1 2010-03-05      21827.90      False


#### Filtering and Frequency Counts (Using train)

In [24]:
# Keep only the rows whose weekly sales exceed $20,000
high_sales = train.loc[train["Weekly_Sales"].gt(20000)]

# Number of rows contributed by each store, most frequent first
store_counts = train["Store"].value_counts()

# Show the first few entries of both results
print("High Sales:\n", high_sales.head())
print("\nStore Frequencies:\n", store_counts.head())

High Sales:
    Store  Dept       Date  Weekly_Sales  IsHoliday
0      1     1 2010-02-05      24924.50      False
1      1     1 2010-02-12      46039.49       True
2      1     1 2010-02-19      41595.55      False
4      1     1 2010-03-05      21827.90      False
5      1     1 2010-03-12      21043.39      False

Store Frequencies:
 Store
13    10474
10    10315
4     10272
1     10244
2     10238
Name: count, dtype: int64
