In [39]:
import pandas as pd

# Define the folder path where the files are located
folder_path = "C:\\Users\\bbuser\\Downloads\\walmart-recruiting-store-sales-forecasting\\"

# Load the CSV files with 'Date' column parsed where needed
train_df = pd.read_csv(folder_path + "train.csv", parse_dates=["Date"])
features_df = pd.read_csv(folder_path + "features.csv", parse_dates=["Date"])
stores_df = pd.read_csv(folder_path + "stores.csv")

# Print the shapes of all DataFrames
print("=== Shapes of DataFrames ===")
print("Train:", train_df.shape)
print("Features:", features_df.shape)
print("Stores:", stores_df.shape)

# Print first 5 rows of each DataFrame
print("\n=== Train Data (first 5 rows) ===")
print(train_df.head())

print("\n=== Features Data (first 5 rows) ===")
print(features_df.head())

print("\n=== Stores Data (first 5 rows) ===")
print(stores_df.head())


=== Shapes of DataFrames ===
Train: (421570, 5)
Features: (8190, 12)
Stores: (45, 3)

=== Train Data (first 5 rows) ===
   Store  Dept       Date  Weekly_Sales  IsHoliday
0      1     1 2010-02-05      24924.50      False
1      1     1 2010-02-12      46039.49       True
2      1     1 2010-02-19      41595.55      False
3      1     1 2010-02-26      19403.54      False
4      1     1 2010-03-05      21827.90      False

=== Features Data (first 5 rows) ===
   Store       Date  Temperature  Fuel_Price  MarkDown1  MarkDown2  MarkDown3  \
0      1 2010-02-05        42.31       2.572        NaN        NaN        NaN   
1      1 2010-02-12        38.51       2.548        NaN        NaN        NaN   
2      1 2010-02-19        39.93       2.514        NaN        NaN        NaN   
3      1 2010-02-26        46.63       2.561        NaN        NaN        NaN   
4      1 2010-03-05        46.50       2.625        NaN        NaN        NaN   

   MarkDown4  MarkDown5         CPI  Unemployment

In [44]:
# Show shapes of the dataframes
print("Shapes:")
print("Train:", train_df.shape)
print("Features:", features_df.shape)
print("Stores:", stores_df.shape)

# Display first few rows of the train DataFrame
print("\nTrain head:")
print(train_df.head())


Shapes:
Train: (421570, 5)
Features: (8190, 12)
Stores: (45, 3)

Train head:
   Store  Dept       Date  Weekly_Sales  IsHoliday
0      1     1 2010-02-05      24924.50      False
1      1     1 2010-02-12      46039.49       True
2      1     1 2010-02-19      41595.55      False
3      1     1 2010-02-26      19403.54      False
4      1     1 2010-03-05      21827.90      False


In [49]:
# Example: split train into two parts and concatenate back
part1 = train_df.head(1000)
part2 = train_df.tail(1000)
concat = pd.concat([part1, part2])

print("\nShape comparison:")
print("Original train:", train_df.shape)
print("Concatenated part1 + part2:", concat.shape)


Shape comparison:
Original train: (421570, 5)
Concatenated part1 + part2: (2000, 5)


# Explore the Datasets (Basic EDA)

In [4]:
# Check dataset shapes
print("Train:", train_df.shape)
print("Features:", features_df.shape)
print("Stores:", stores_df.shape)

Train: (421570, 5)
Features: (8190, 12)
Stores: (45, 3)


In [5]:
# Check missing values
print("\nMissing values in train:")
print(train_df.isnull().sum())


Missing values in train:
Store           0
Dept            0
Date            0
Weekly_Sales    0
IsHoliday       0
dtype: int64


In [6]:
# Check data types and summary
print(train_df.info())
print(train_df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 421570 entries, 0 to 421569
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Store         421570 non-null  int64  
 1   Dept          421570 non-null  int64  
 2   Date          421570 non-null  object 
 3   Weekly_Sales  421570 non-null  float64
 4   IsHoliday     421570 non-null  bool   
dtypes: bool(1), float64(1), int64(2), object(1)
memory usage: 13.3+ MB
None
               Store           Dept   Weekly_Sales
count  421570.000000  421570.000000  421570.000000
mean       22.200546      44.260317   15981.258123
std        12.785297      30.492054   22711.183519
min         1.000000       1.000000   -4988.940000
25%        11.000000      18.000000    2079.650000
50%        22.000000      37.000000    7612.030000
75%        33.000000      74.000000   20205.852500
max        45.000000      99.000000  693099.360000


In [None]:
# Unique values
print(train_df['Store'].value_counts().head())
print(train_df['Dept'].value_counts().head())

# 2. Aggregate Sales Using groupby()

In [7]:
#Total sales by store

sales_by_store = train_df.groupby('Store')['Weekly_Sales'].sum().sort_values(ascending=False)
print(sales_by_store.head())

Store
20    3.013978e+08
4     2.995440e+08
14    2.889999e+08
13    2.865177e+08
2     2.753824e+08
Name: Weekly_Sales, dtype: float64


In [8]:
#Total sales by department

sales_by_dept = train_df.groupby('Dept')['Weekly_Sales'].sum().sort_values(ascending=False)
print(sales_by_dept.head())

Dept
92    4.839433e+08
95    4.493202e+08
38    3.931181e+08
72    3.057252e+08
90    2.910685e+08
Name: Weekly_Sales, dtype: float64


In [9]:
#Average weekly sales by store

avg_sales_store = train_df.groupby('Store')['Weekly_Sales'].mean()
print(avg_sales_store.head())

Store
1    21710.543621
2    26898.070031
3     6373.033983
4    29161.210415
5     5053.415813
Name: Weekly_Sales, dtype: float64


# Merge Datasets

In [10]:
#combine train_df with features_df and stores_df to enrich your sales data.
# Merge train with features on Store, Date

train_features = pd.merge(train_df, features_df, on=['Store', 'Date'], how='left')


In [11]:
# Merge with stores info on Store
full_df = pd.merge(train_features, stores_df, on='Store', how='left')

In [12]:
# Check result
print(full_df.head())

   Store  Dept        Date  Weekly_Sales  IsHoliday_x  Temperature  \
0      1     1  2010-02-05      24924.50        False        42.31   
1      1     1  2010-02-12      46039.49         True        38.51   
2      1     1  2010-02-19      41595.55        False        39.93   
3      1     1  2010-02-26      19403.54        False        46.63   
4      1     1  2010-03-05      21827.90        False        46.50   

   Fuel_Price  MarkDown1  MarkDown2  MarkDown3  MarkDown4  MarkDown5  \
0       2.572        NaN        NaN        NaN        NaN        NaN   
1       2.548        NaN        NaN        NaN        NaN        NaN   
2       2.514        NaN        NaN        NaN        NaN        NaN   
3       2.561        NaN        NaN        NaN        NaN        NaN   
4       2.625        NaN        NaN        NaN        NaN        NaN   

          CPI  Unemployment  IsHoliday_y Type    Size  
0  211.096358         8.106        False    A  151315  
1  211.242170         8.106       

# Concatenate Data (multiple weeks, simulate)

In [13]:
# Example (simulate by splitting train_df)
df1 = train_df.iloc[:20000]
df2 = train_df.iloc[20000:]

In [14]:
# Concatenate
concat_df = pd.concat([df1, df2], axis=0)
print(concat_df.shape)

(421570, 5)


# GroupBy Analysis on Merged Data

In [15]:
#Sales by Store Type

sales_by_type = full_df.groupby('Type')['Weekly_Sales'].sum()
print(sales_by_type)


Type
A    4.331015e+09
B    2.000701e+09
C    4.055035e+08
Name: Weekly_Sales, dtype: float64


In [16]:
#Department performance by store size

dept_by_size = full_df.groupby(['Dept', 'Size'])['Weekly_Sales'].mean().reset_index()
print(dept_by_size.head())


   Dept   Size  Weekly_Sales
0     1  34875   9774.553077
1     1  37392   7328.621049
2     1  39690   6559.257832
3     1  39910   7104.485198
4     1  41062   7549.109021


In [17]:
#Filter: High-sales departments

high_sales = full_df[full_df['Weekly_Sales'] > 50000]
print(high_sales[['Store', 'Dept', 'Weekly_Sales']].head())

     Store  Dept  Weekly_Sales
8        1     1      57258.43
46       1     1      55931.23
63       1     1      50510.31
106      1     1      54060.10
113      1     1      57592.12
