In [6]:
import pandas as pd
import numpy as np
import datetime
import random

# --- Configuration ---
NUM_STORES = 8
START_DATE = datetime.datetime(2022, 1, 1)
END_DATE = datetime.datetime(2022, 6, 30)
AVG_TRANSACTIONS_PER_DAY_PER_STORE = 10

# Product Catalog (simplified)
products_df = pd.DataFrame({
    'ProductID': [f'P{100+i}' for i in range(50)],
    'ProductCategory': random.choices(['Electronics', 'Grocery', 'Apparel', 'Home Goods', 'Books'], k=50),
    'UnitPrice': np.random.uniform(5, 200, 50).round(2),
    'PopularityWeight': np.random.rand(50)
})
products_df['PopularityWeight'] = products_df['PopularityWeight'] / products_df['PopularityWeight'].sum()

# Pre-compute arrays for faster access
product_ids = products_df['ProductID'].values
product_categories = products_df['ProductCategory'].values
unit_prices = products_df['UnitPrice'].values
popularity_weights = products_df['PopularityWeight'].values

# Customer IDs (generate a pool)
customer_ids = [f'Cust_{i:04d}' for i in range(1000)]
customer_ids_array = np.array(customer_ids)

# Store characteristics
store_multipliers = {store_id: random.uniform(0.7, 1.3) for store_id in range(1, NUM_STORES + 1)}

# Pre-calculate all date multipliers
date_range = pd.date_range(START_DATE, END_DATE)
day_multipliers = []
for date in date_range:
    day_of_week = date.weekday()
    multiplier = 1.0
    if day_of_week >= 5:  # Weekend
        multiplier = random.uniform(1.2, 1.6)
    if date.month in [11, 12]:  # Holiday season
        multiplier *= random.uniform(1.1, 1.5)
    day_multipliers.append(multiplier)

# --- Optimized Generation ---
all_transactions = []
current_transaction_id = 10000

# Payment methods array for faster selection
payment_methods = ['Credit Card', 'Cash', 'Mobile Pay', 'Debit Card']

for day_idx, current_date in enumerate(date_range):
    day_multiplier = day_multipliers[day_idx]
    
    for store_id in range(1, NUM_STORES + 1):
        store_base_traffic = AVG_TRANSACTIONS_PER_DAY_PER_STORE * store_multipliers[store_id]
        num_transactions_today = int(np.random.poisson(store_base_traffic * day_multiplier))
        
        if num_transactions_today == 0:
            continue
            
        # Generate all transaction times at once
        transaction_hours = np.random.randint(9, 21, num_transactions_today)
        transaction_minutes = np.random.randint(0, 60, num_transactions_today)
        transaction_seconds = np.random.randint(0, 60, num_transactions_today)
        
        # Generate all customer IDs at once
        transaction_customers = np.random.choice(customer_ids_array, num_transactions_today)
        
        # Generate basket sizes for all transactions
        basket_sizes = np.random.geometric(p=0.4, size=num_transactions_today)
        basket_sizes = np.clip(basket_sizes, 1, 10)
        
        for trans_idx in range(num_transactions_today):
            transaction_time = current_date + datetime.timedelta(
                hours=int(transaction_hours[trans_idx]),
                minutes=int(transaction_minutes[trans_idx]),
                seconds=int(transaction_seconds[trans_idx])
            )
            
            num_items = basket_sizes[trans_idx]
            
            # Vectorized product selection
            selected_product_indices = np.random.choice(
                len(products_df), 
                size=num_items, 
                p=popularity_weights
            )
            
            # Vectorized quantity generation
            quantities = np.random.randint(1, 4, num_items)
            
            # Vectorized promotion application
            promotions_applied = np.random.random(num_items) < 0.1
            
            for item_idx in range(num_items):
                prod_idx = selected_product_indices[item_idx]
                quantity = quantities[item_idx]
                total_price = quantity * unit_prices[prod_idx]
                
                promotion_applied = promotions_applied[item_idx]
                discount_amount = 0.0
                if promotion_applied:
                    discount_amount = round(total_price * random.uniform(0.05, 0.20), 2)
                    total_price -= discount_amount
                
                all_transactions.append({
                    'TransactionID': current_transaction_id,
                    'Timestamp': transaction_time,
                    'StoreID': store_id,
                    'CustomerID': transaction_customers[trans_idx],
                    'ProductID': product_ids[prod_idx],
                    'ProductName': f"Product {product_ids[prod_idx]}",
                    'ProductCategory': product_categories[prod_idx],
                    'Quantity': quantity,
                    'UnitPrice': unit_prices[prod_idx],
                    'TotalPrice': round(total_price, 2),
                    'PaymentMethod': random.choice(payment_methods),
                    'PromotionApplied': promotion_applied,
                    'DiscountAmount': discount_amount
                })
            
            current_transaction_id += 1

df_transactions = pd.DataFrame(all_transactions)
display(df_transactions.head())
print(f"\nGenerated {len(df_transactions)} transaction line items.")

Unnamed: 0,TransactionID,Timestamp,StoreID,CustomerID,ProductID,ProductName,ProductCategory,Quantity,UnitPrice,TotalPrice,PaymentMethod,PromotionApplied,DiscountAmount
0,10000,2022-01-01 18:48:45,1,Cust_0103,P105,Product P105,Electronics,2,126.37,252.74,Cash,False,0.0
1,10001,2022-01-01 09:19:34,1,Cust_0810,P147,Product P147,Grocery,2,41.7,83.4,Mobile Pay,False,0.0
2,10001,2022-01-01 09:19:34,1,Cust_0810,P132,Product P132,Electronics,2,36.18,59.43,Credit Card,True,12.93
3,10001,2022-01-01 09:19:34,1,Cust_0810,P110,Product P110,Books,1,94.88,94.88,Cash,False,0.0
4,10002,2022-01-01 13:00:55,1,Cust_0918,P130,Product P130,Electronics,2,193.46,386.92,Cash,False,0.0



Generated 37934 transaction line items.


In [7]:
# save to CSV
df_transactions.to_csv('../data/sample-2/random-simulation.csv', index=False)