# Sales Performance Analysis with Walmart Data

### Focus:
- Exploring the dataset using pandas functions
- Practicing groupby(), merge(), join(), and concat()
### Objective:
- Analyze sales performance across different stores and departments. Use groupby to find trends and combine data using merging and concatenation techniques.
### Skills Practiced:
- Aggregation with groupby() (e.g. total sales by store or department)
- Merging different datasets (e.g. sales with features)
- Concatenating data (e.g. appending data from different weeks)
- Basic EDA (describe(), value_counts(), filtering)

### import

In [90]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### read cvs file path

#### features.csv

In [91]:
path = r'C:\Users\bbuser\Desktop\data_walmart\features.csv\features.csv'
df_features = pd.read_csv(path)
df_features

Unnamed: 0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday
0,1,2010-02-05,42.31,2.572,,,,,,211.096358,8.106,False
1,1,2010-02-12,38.51,2.548,,,,,,211.242170,8.106,True
2,1,2010-02-19,39.93,2.514,,,,,,211.289143,8.106,False
3,1,2010-02-26,46.63,2.561,,,,,,211.319643,8.106,False
4,1,2010-03-05,46.50,2.625,,,,,,211.350143,8.106,False
...,...,...,...,...,...,...,...,...,...,...,...,...
8185,45,2013-06-28,76.05,3.639,4842.29,975.03,3.00,2449.97,3169.69,,,False
8186,45,2013-07-05,77.50,3.614,9090.48,2268.58,582.74,5797.47,1514.93,,,False
8187,45,2013-07-12,79.37,3.614,3789.94,1827.31,85.72,744.84,2150.36,,,False
8188,45,2013-07-19,82.84,3.737,2961.49,1047.07,204.19,363.00,1059.46,,,False


#### train.csv

In [97]:
path_train = r'C:\Users\bbuser\Desktop\data_walmart\train.csv\train.csv'
df_train = pd.read_csv(path_train)
df_train

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday
0,1,1,2010-02-05,24924.50,False
1,1,1,2010-02-12,46039.49,True
2,1,1,2010-02-19,41595.55,False
3,1,1,2010-02-26,19403.54,False
4,1,1,2010-03-05,21827.90,False
...,...,...,...,...,...
421565,45,98,2012-09-28,508.37,False
421566,45,98,2012-10-05,628.10,False
421567,45,98,2012-10-12,1061.02,False
421568,45,98,2012-10-19,760.01,False


#### stores.csv

In [101]:
path_stores =r'C:\Users\bbuser\Desktop\data_walmart\stores.csv'
df_stores = pd.read_csv(path_stores)
df_stores.head(10)

Unnamed: 0,Store,Type,Size
0,1,A,151315
1,2,A,202307
2,3,B,37392
3,4,A,205863
4,5,B,34875
5,6,A,202505
6,7,B,70713
7,8,A,155078
8,9,B,125833
9,10,B,126512


### Collect info

In [102]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 421570 entries, 0 to 421569
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Store         421570 non-null  int64  
 1   Dept          421570 non-null  int64  
 2   Date          421570 non-null  object 
 3   Weekly_Sales  421570 non-null  float64
 4   IsHoliday     421570 non-null  bool   
dtypes: bool(1), float64(1), int64(2), object(1)
memory usage: 13.3+ MB


In [103]:
df_train['Dept'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 16, 17, 18,
       19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
       36, 37, 38, 40, 41, 42, 44, 45, 46, 47, 48, 49, 51, 52, 54, 55, 56,
       58, 59, 60, 67, 71, 72, 74, 77, 78, 79, 80, 81, 82, 83, 85, 87, 90,
       91, 92, 93, 94, 95, 96, 97, 98, 99, 39, 50, 43, 65])

In [104]:
df_train['Dept'].value_counts() 

Dept
1     6435
2     6435
3     6435
4     6435
7     6435
      ... 
78     235
77     150
65     143
39      16
43      12
Name: count, Length: 81, dtype: int64

In [105]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Store   45 non-null     int64 
 1   Type    45 non-null     object
 2   Size    45 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 1.2+ KB


In [106]:
df_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8190 entries, 0 to 8189
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Store         8190 non-null   int64  
 1   Date          8190 non-null   object 
 2   Temperature   8190 non-null   float64
 3   Fuel_Price    8190 non-null   float64
 4   MarkDown1     4032 non-null   float64
 5   MarkDown2     2921 non-null   float64
 6   MarkDown3     3613 non-null   float64
 7   MarkDown4     3464 non-null   float64
 8   MarkDown5     4050 non-null   float64
 9   CPI           7605 non-null   float64
 10  Unemployment  7605 non-null   float64
 11  IsHoliday     8190 non-null   bool   
dtypes: bool(1), float64(9), int64(1), object(1)
memory usage: 712.0+ KB


In [107]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 421570 entries, 0 to 421569
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Store         421570 non-null  int64  
 1   Dept          421570 non-null  int64  
 2   Date          421570 non-null  object 
 3   Weekly_Sales  421570 non-null  float64
 4   IsHoliday     421570 non-null  bool   
dtypes: bool(1), float64(1), int64(2), object(1)
memory usage: 13.3+ MB


### Groupby() and agg()

#### Total & average weekly sales per store

In [108]:
average_sales_week_store = df_train.groupby('Store')['Weekly_Sales'].mean().sort_values(ascending=False).head(10)
average_sales_week_store

Store
20    29508.301592
4     29161.210415
14    28784.851727
13    27355.136891
2     26898.070031
10    26332.303819
27    24826.984536
6     21913.243624
1     21710.543621
39    21000.763562
Name: Weekly_Sales, dtype: float64

In [109]:
total_sales_week_store = df_train.groupby('Store')['Weekly_Sales'].sum().sort_values(ascending=False).head(10)
total_sales_week_store

Store
20    3.013978e+08
4     2.995440e+08
14    2.889999e+08
13    2.865177e+08
2     2.753824e+08
10    2.716177e+08
27    2.538559e+08
6     2.237561e+08
1     2.224028e+08
39    2.074455e+08
Name: Weekly_Sales, dtype: float64

#### Total & average weekly sales per Dept

In [110]:
average_sales_week_dept = df_train.groupby('Store')['Weekly_Sales'].mean().sort_values(ascending=False).head(10)
average_sales_week_dept

Store
20    29508.301592
4     29161.210415
14    28784.851727
13    27355.136891
2     26898.070031
10    26332.303819
27    24826.984536
6     21913.243624
1     21710.543621
39    21000.763562
Name: Weekly_Sales, dtype: float64

In [111]:
total_sales_week_dept = df_train.groupby('Dept')['Weekly_Sales'].sum().sort_values(ascending=False).head(10)
#total_sales_week_dept
df_train_df = pd.DataFrame(total_sales_week_dept)
df_train_df

Unnamed: 0_level_0,Weekly_Sales
Dept,Unnamed: 1_level_1
92,483943300.0
95,449320200.0
38,393118100.0
72,305725200.0
90,291068500.0
40,288936000.0
2,280611200.0
91,216781700.0
13,197321600.0
8,194280800.0


### Merge()

Merging train.csv and stores.csv

In [112]:
average_sales_merged = avg_sales_df.merge(df_stores,on='Store',how='left')
average_sales_merged

Unnamed: 0,Store,Avg_Weekly_Sales,Type,Size
0,20,29508.301592,A,203742
1,4,29161.210415,A,205863
2,14,28784.851727,A,200898
3,13,27355.136891,A,219622
4,2,26898.070031,A,202307
5,10,26332.303819,B,126512
6,27,24826.984536,A,204184
7,6,21913.243624,A,202505
8,1,21710.543621,A,151315
9,39,21000.763562,A,184109


### Concat()

In [113]:
df_Average_total_combine = pd.concat([average_sales_week_store, total_sales_week_store],axis=1)
df_Average_total_combine

Unnamed: 0_level_0,Weekly_Sales,Weekly_Sales
Store,Unnamed: 1_level_1,Unnamed: 2_level_1
20,29508.301592,301397800.0
4,29161.210415,299544000.0
14,28784.851727,288999900.0
13,27355.136891,286517700.0
2,26898.070031,275382400.0
10,26332.303819,271617700.0
27,24826.984536,253855900.0
6,21913.243624,223756100.0
1,21710.543621,222402800.0
39,21000.763562,207445500.0


In [114]:
combine_features_store = pd.concat([df_features, df_stores], axis=0, ignore_index=True)
combine_features_store.head(10)

Unnamed: 0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday,Type,Size
0,1,2010-02-05,42.31,2.572,,,,,,211.096358,8.106,False,,
1,1,2010-02-12,38.51,2.548,,,,,,211.24217,8.106,True,,
2,1,2010-02-19,39.93,2.514,,,,,,211.289143,8.106,False,,
3,1,2010-02-26,46.63,2.561,,,,,,211.319643,8.106,False,,
4,1,2010-03-05,46.5,2.625,,,,,,211.350143,8.106,False,,
5,1,2010-03-12,57.79,2.667,,,,,,211.380643,8.106,False,,
6,1,2010-03-19,54.58,2.72,,,,,,211.215635,8.106,False,,
7,1,2010-03-26,51.45,2.732,,,,,,211.018042,8.106,False,,
8,1,2010-04-02,62.27,2.719,,,,,,210.82045,7.808,False,,
9,1,2010-04-09,65.86,2.77,,,,,,210.622857,7.808,False,,


### Join()

In [115]:
sales_week_dept= df_train.groupby('Dept')['Weekly_Sales'].mean().reset_index()
sales_week_dept.rename(columns={'Weekly_Sales': 'Avg_Weekly_Sales'}, inplace=True)
train_average_sales = df_train.merge(Weekly_Sales_per_Dept, on='Dept', how='left')
train_average_sales

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Avg_Weekly_Sales
0,1,1,2010-02-05,24924.50,False,19213.485088
1,1,1,2010-02-12,46039.49,True,19213.485088
2,1,1,2010-02-19,41595.55,False,19213.485088
3,1,1,2010-02-26,19403.54,False,19213.485088
4,1,1,2010-03-05,21827.90,False,19213.485088
...,...,...,...,...,...,...
421565,45,98,2012-09-28,508.37,False,6824.694889
421566,45,98,2012-10-05,628.10,False,6824.694889
421567,45,98,2012-10-12,1061.02,False,6824.694889
421568,45,98,2012-10-19,760.01,False,6824.694889


### filter

In [116]:
high_sales = df_train[df_train['Weekly_Sales'] > 20000]
high_sales.head(10)

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday
0,1,1,2010-02-05,24924.5,False
1,1,1,2010-02-12,46039.49,True
2,1,1,2010-02-19,41595.55,False
4,1,1,2010-03-05,21827.9,False
5,1,1,2010-03-12,21043.39,False
6,1,1,2010-03-19,22136.64,False
7,1,1,2010-03-26,26229.21,False
8,1,1,2010-04-02,57258.43,False
9,1,1,2010-04-09,42960.91,False
34,1,1,2010-10-01,20094.19,False
