In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import warnings

# Random seed
np.random.seed(42)


In [3]:
# Generate sample grocery store data

def generate_grocery_data():
    # Date range: 3 months of daily data
    start_date = datetime(2024, 1, 1)
    end_date = datetime(2024, 3, 1)
    dates = pd.date_range(start_date, end_date, freq='D')

    # Store and product info
    stores = ['Store_001', 'Store_002', 'Store_003', 'Store_004', 'Store_005']
    products = {
        'Bananas': {'category': 'Fresh Produce', 'avg_price': 1.99, 'perishable': True},
        'Apples': {'category': 'Fresh Produce', 'avg_price': 3.49, 'perishable': True},
        'Milk': {'category': 'Dairy', 'avg_price': 4.29, 'perishable': True},
        'Bread': {'category': 'Bakery', 'avg_price': 2.99, 'perishable': True},
        'Canned Beans': {'category': 'Pantry', 'avg_price': 1.89, 'perishable': False},
        'Rice': {'category': 'Pantry', 'avg_price': 5.99, 'perishable': False},
        'Chicken Breast': {'category': 'Meat', 'avg_price': 8.99, 'perishable': True},
        'Yogurt': {'category': 'Dairy', 'avg_price': 5.49, 'perishable': True}
    }

    # Generate data
    data = []
    for date in dates:
        for store in stores:
            for product, info in products.items():
                # Add some seasonality and randomness
                base_demand = np.random.poisson(50)

                # Weekend boost
                if date.weekday() >= 5:
                    base_demand *= 1.3

                # Perishables have more variation
                if info['perishable']:
                    base_demand *= np.random.uniform(0.7, 1.4)
                
                # Occasional stockouts (missing data)
                if np.random.random() < 0.05:  # 5% chance of missing data
                    quantity_sold = np.nan
                    revenue = np.nan
                else:
                    quantity_sold = max(0, int(base_demand))
                    price_variation = np.random.uniform(0.9, 1.1)  # ±10% price variation
                    unit_price = info['avg_price'] * price_variation
                    revenue = quantity_sold * unit_price
                
                data.append({
                    'date': date,
                    'store_id': store,
                    'product_name': product,
                    'category': info['category'],
                    'quantity_sold': quantity_sold,
                    'unit_price': info['avg_price'] * np.random.uniform(0.9, 1.1),
                    'revenue': revenue,
                    'is_perishable': info['perishable']
                })
    
    return pd.DataFrame(data)

# Generate the dataset
df = generate_grocery_data()
print(f"Generated dataset with {len(df)} rows")
print(f"Date range: {df['date'].min()} to {df['date'].max()}")


Generated dataset with 2440 rows
Date range: 2024-01-01 00:00:00 to 2024-03-01 00:00:00


In [7]:
# Load and Explore data
print("====== DATASET OVERVIEW =======")
print(f"Shape: " ,{df.shape})
print(f"\nColumn Types: ")
print(df.dtypes)
print(f"\nMemory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

Shape:  {(2440, 8)}

Column Types: 
date             datetime64[ns]
store_id                 object
product_name             object
category                 object
quantity_sold           float64
unit_price              float64
revenue                 float64
is_perishable              bool
dtype: object

Memory usage: 0.53 MB


In [12]:
# Data Exploration methods
print(f"DF head: ", df.head())
print(f"\nDF tail: ", df.tail())
print(f'\nDF describe: ', df.describe())

DF head:          date   store_id  product_name       category  quantity_sold  \
0 2024-01-01  Store_001       Bananas  Fresh Produce           56.0   
1 2024-01-01  Store_001        Apples  Fresh Produce           37.0   
2 2024-01-01  Store_001          Milk          Dairy           39.0   
3 2024-01-01  Store_001         Bread         Bakery           47.0   
4 2024-01-01  Store_001  Canned Beans         Pantry           43.0   

   unit_price     revenue  is_perishable  
0    1.853086  103.773343           True  
1    3.289213  137.715664           True  
2    4.110875  165.032744           True  
3    3.160535  139.295303           True  
4    1.930652   73.898005          False  

DF tail:             date   store_id    product_name category  quantity_sold  \
2435 2024-03-01  Store_005           Bread   Bakery           69.0   
2436 2024-03-01  Store_005    Canned Beans   Pantry           51.0   
2437 2024-03-01  Store_005            Rice   Pantry           54.0   
2438 2024-03-0

In [13]:
print("=========DATASET INFO===========")
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2440 entries, 0 to 2439
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   date           2440 non-null   datetime64[ns]
 1   store_id       2440 non-null   object        
 2   product_name   2440 non-null   object        
 3   category       2440 non-null   object        
 4   quantity_sold  2321 non-null   float64       
 5   unit_price     2440 non-null   float64       
 6   revenue        2321 non-null   float64       
 7   is_perishable  2440 non-null   bool          
dtypes: bool(1), datetime64[ns](1), float64(3), object(3)
memory usage: 135.9+ KB
None
