In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import warnings

# Random seed
np.random.seed(42)


In [3]:
# Generate sample grocery store data

def generate_grocery_data():
    # Date range: 3 months of daily data
    start_date = datetime(2024, 1, 1)
    end_date = datetime(2024, 3, 1)
    dates = pd.date_range(start_date, end_date, freq='D')

    # Store and product info
    stores = ['Store_001', 'Store_002', 'Store_003', 'Store_004', 'Store_005']
    products = {
        'Bananas': {'category': 'Fresh Produce', 'avg_price': 1.99, 'perishable': True},
        'Apples': {'category': 'Fresh Produce', 'avg_price': 3.49, 'perishable': True},
        'Milk': {'category': 'Dairy', 'avg_price': 4.29, 'perishable': True},
        'Bread': {'category': 'Bakery', 'avg_price': 2.99, 'perishable': True},
        'Canned Beans': {'category': 'Pantry', 'avg_price': 1.89, 'perishable': False},
        'Rice': {'category': 'Pantry', 'avg_price': 5.99, 'perishable': False},
        'Chicken Breast': {'category': 'Meat', 'avg_price': 8.99, 'perishable': True},
        'Yogurt': {'category': 'Dairy', 'avg_price': 5.49, 'perishable': True}
    }

    # Generate data
    data = []
    for date in dates:
        for store in stores:
            for product, info in products.items():
                # Add some seasonality and randomness
                base_demand = np.random.poisson(50)

                # Weekend boost
                if date.weekday() >= 5:
                    base_demand *= 1.3

                # Perishables have more variation
                if info['perishable']:
                    base_demand *= np.random.uniform(0.7, 1.4)
                
                # Occasional stockouts (missing data)
                if np.random.random() < 0.05:  # 5% chance of missing data
                    quantity_sold = np.nan
                    revenue = np.nan
                else:
                    quantity_sold = max(0, int(base_demand))
                    price_variation = np.random.uniform(0.9, 1.1)  # ±10% price variation
                    unit_price = info['avg_price'] * price_variation
                    revenue = quantity_sold * unit_price
                
                data.append({
                    'date': date,
                    'store_id': store,
                    'product_name': product,
                    'category': info['category'],
                    'quantity_sold': quantity_sold,
                    'unit_price': info['avg_price'] * np.random.uniform(0.9, 1.1),
                    'revenue': revenue,
                    'is_perishable': info['perishable']
                })
    
    return pd.DataFrame(data)

# Generate the dataset
df = generate_grocery_data()
print(f"Generated dataset with {len(df)} rows")
print(f"Date range: {df['date'].min()} to {df['date'].max()}")


Generated dataset with 2440 rows
Date range: 2024-01-01 00:00:00 to 2024-03-01 00:00:00


In [7]:
# Load and Explore data
print("====== DATASET OVERVIEW =======")
print(f"Shape: " ,{df.shape})
print(f"\nColumn Types: ")
print(df.dtypes)
print(f"\nMemory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

Shape:  {(2440, 8)}

Column Types: 
date             datetime64[ns]
store_id                 object
product_name             object
category                 object
quantity_sold           float64
unit_price              float64
revenue                 float64
is_perishable              bool
dtype: object

Memory usage: 0.53 MB


In [12]:
# Data Exploration methods
print(f"DF head: ", df.head())
print(f"\nDF tail: ", df.tail())
print(f'\nDF describe: ', df.describe())

DF head:          date   store_id  product_name       category  quantity_sold  \
0 2024-01-01  Store_001       Bananas  Fresh Produce           56.0   
1 2024-01-01  Store_001        Apples  Fresh Produce           37.0   
2 2024-01-01  Store_001          Milk          Dairy           39.0   
3 2024-01-01  Store_001         Bread         Bakery           47.0   
4 2024-01-01  Store_001  Canned Beans         Pantry           43.0   

   unit_price     revenue  is_perishable  
0    1.853086  103.773343           True  
1    3.289213  137.715664           True  
2    4.110875  165.032744           True  
3    3.160535  139.295303           True  
4    1.930652   73.898005          False  

DF tail:             date   store_id    product_name category  quantity_sold  \
2435 2024-03-01  Store_005           Bread   Bakery           69.0   
2436 2024-03-01  Store_005    Canned Beans   Pantry           51.0   
2437 2024-03-01  Store_005            Rice   Pantry           54.0   
2438 2024-03-0

In [14]:
print("=========DATASET INFO===========")
print(df.info())

print("\n=== UNIQUE VALUES PER COLUMN ===")
for col in df.columns:
    if df[col].dtype == 'object' or df[col].dtype == 'bool':
        print(f"{col}: {df[col].nunique()} unique values")
        if df[col].nunique() < 10:
            print(f"  Values: {df[col].unique()}")

print("\n=== MISSING VALUES ===")
missing_data = df.isnull().sum()
print(missing_data[missing_data > 0])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2440 entries, 0 to 2439
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   date           2440 non-null   datetime64[ns]
 1   store_id       2440 non-null   object        
 2   product_name   2440 non-null   object        
 3   category       2440 non-null   object        
 4   quantity_sold  2321 non-null   float64       
 5   unit_price     2440 non-null   float64       
 6   revenue        2321 non-null   float64       
 7   is_perishable  2440 non-null   bool          
dtypes: bool(1), datetime64[ns](1), float64(3), object(3)
memory usage: 135.9+ KB
None

=== UNIQUE VALUES PER COLUMN ===
store_id: 5 unique values
  Values: ['Store_001' 'Store_002' 'Store_003' 'Store_004' 'Store_005']
product_name: 8 unique values
  Values: ['Bananas' 'Apples' 'Milk' 'Bread' 'Canned Beans' 'Rice' 'Chicken Breast'
 'Yogurt']
category: 5 unique values
  Values: ['Fr

In [18]:
# Column selection
print("=== SELECTING COLUMNS ===")
# Single column (returns Series)
bananas_sales = df['product_name'] == 'Bananas'
print(f"Bananas rows: {bananas_sales.sum()}")

# Multiple columns
key_columns = df[['date', 'store_id', 'product_name', 'quantity_sold', 'revenue']]
print(f"Key columns shape: {key_columns.shape}")

# Boolean filtering
print("\n=== FILTERING DATA ===")
# High revenue transactions
high_revenue = df[df['revenue'] > 200]
print(f"High revenue transactions (>$200): {len(high_revenue)}")

# Multiple conditions
fresh_produce_weekends = df[
    (df['category'] == 'Fresh Produce') & 
    (df['date'].dt.weekday >= 5)
]
print(f"Fresh produce sales on weekends: {len(fresh_produce_weekends)}")

# Using query method (alternative syntax)
dairy_high_sales = df.query('category == "Dairy" and quantity_sold > 60')
print(f"High-selling dairy products: {len(dairy_high_sales)}")

=== SELECTING COLUMNS ===
Bananas rows: 305
Key columns shape: (2440, 5)

=== FILTERING DATA ===
High revenue transactions (>$200): 1232
Fresh produce sales on weekends: 160
High-selling dairy products: 214


In [19]:
# loc and iloc examples
print("=== LABEL-BASED SELECTION (loc) ===")
# Select specific rows and columns
sample_data = df.loc[0:5, ['date', 'product_name', 'quantity_sold']]
print(sample_data)

print("\n=== POSITION-BASED SELECTION (iloc) ===")
# Select by position
first_10_rows_3_cols = df.iloc[:10, :3]
print(first_10_rows_3_cols)

# Select specific positions
random_sample = df.iloc[[0, 100, 200, 300], [1, 3, 4]]
print("\nRandom sample:")
print(random_sample)

=== LABEL-BASED SELECTION (loc) ===
        date  product_name  quantity_sold
0 2024-01-01       Bananas           56.0
1 2024-01-01        Apples           37.0
2 2024-01-01          Milk           39.0
3 2024-01-01         Bread           47.0
4 2024-01-01  Canned Beans           43.0
5 2024-01-01          Rice           42.0

=== POSITION-BASED SELECTION (iloc) ===
        date   store_id    product_name
0 2024-01-01  Store_001         Bananas
1 2024-01-01  Store_001          Apples
2 2024-01-01  Store_001            Milk
3 2024-01-01  Store_001           Bread
4 2024-01-01  Store_001    Canned Beans
5 2024-01-01  Store_001            Rice
6 2024-01-01  Store_001  Chicken Breast
7 2024-01-01  Store_001          Yogurt
8 2024-01-01  Store_002         Bananas
9 2024-01-01  Store_002          Apples

Random sample:
      store_id       category  quantity_sold
0    Store_001  Fresh Produce           56.0
100  Store_003         Pantry           50.0
200  Store_001  Fresh Produce         

In [20]:
# GroupBy operations - core of data analysis
print("=== BASIC GROUPBY OPERATIONS ===")

# Group by single column
store_totals = df.groupby('store_id')['revenue'].sum()
print("Total revenue by store:")
print(store_totals)

# Group by multiple columns
category_store_stats = df.groupby(['category', 'store_id']).agg({
    'quantity_sold': ['sum', 'mean'],
    'revenue': ['sum', 'mean', 'count']
})
print("\nCategory-Store statistics:")
print(category_store_stats.head(10))

# More complex aggregations
daily_summary = df.groupby('date').agg({
    'quantity_sold': 'sum',
    'revenue': 'sum',
    'store_id': 'nunique',  # Number of unique stores
    'product_name': 'nunique'  # Number of unique products
}).round(2)

print("\nDaily summary (first 10 days):")
print(daily_summary.head(10))

=== BASIC GROUPBY OPERATIONS ===
Total revenue by store:
store_id
Store_001    112841.283108
Store_002    110152.488576
Store_003    112545.986465
Store_004    112689.795020
Store_005    114349.029300
Name: revenue, dtype: float64

Category-Store statistics:
                   quantity_sold                  revenue                  
                             sum       mean           sum        mean count
category store_id                                                          
Bakery   Store_001        3251.0  57.035088   9615.212817  168.687944    57
         Store_002        3291.0  55.779661   9835.992280  166.711734    59
         Store_003        3442.0  58.338983  10226.573082  173.331747    59
         Store_004        3401.0  57.644068  10215.483583  173.143790    59
         Store_005        3259.0  56.189655   9803.825642  169.031477    58
Dairy    Store_001        6651.0  57.834783  32750.755384  284.789177   115
         Store_002        6486.0  56.400000  31525.320971

In [21]:
# Custom aggregation functions
def revenue_per_unit(group):
    """Calculate average revenue per unit sold"""
    total_revenue = group['revenue'].sum()
    total_quantity = group['quantity_sold'].sum()
    return total_revenue / total_quantity if total_quantity > 0 else 0

# Apply custom function
product_efficiency = df.groupby('product_name').apply(revenue_per_unit)
print("=== REVENUE PER UNIT BY PRODUCT ===")
print(product_efficiency.sort_values(ascending=False))

# Transform operations (keeps original DataFrame size)
print("\n=== TRANSFORM OPERATIONS ===")
# Add store rank within each category
df['store_rank_in_category'] = df.groupby('category')['revenue'].rank(ascending=False)

# Add percentage of total category revenue
df['pct_of_category_revenue'] = df.groupby('category')['revenue'].transform(
    lambda x: x / x.sum() * 100
)

print("Sample with new columns:")
print(df[['product_name', 'category', 'revenue', 'store_rank_in_category', 'pct_of_category_revenue']].head())

=== REVENUE PER UNIT BY PRODUCT ===
product_name
Chicken Breast    8.952420
Rice              5.998613
Yogurt            5.527598
Milk              4.299103
Apples            3.491078
Bread             2.985886
Bananas           1.988791
Canned Beans      1.888744
dtype: float64

=== TRANSFORM OPERATIONS ===
Sample with new columns:
   product_name       category     revenue  store_rank_in_category  \
0       Bananas  Fresh Produce  103.773343                   449.0   
1        Apples  Fresh Produce  137.715664                   304.0   
2          Milk          Dairy  165.032744                   538.0   
3         Bread         Bakery  139.295303                   212.0   
4  Canned Beans         Pantry   73.898005                   564.0   

   pct_of_category_revenue  
0                 0.116292  
1                 0.154328  
2                 0.103424  
3                 0.280289  
4                 0.059682  


  product_efficiency = df.groupby('product_name').apply(revenue_per_unit)
