In [32]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
from scipy import stats

# Set seed for repro
np.random.seed(42)
random.seed(42)

In [7]:
# Get random weights to easily change the distribution of randomness in the data

# Generate 10 random numbers
random_numbers = np.random.random(10)

# Normalize so they sum to 1
weights = random_numbers / np.sum(random_numbers)

# Print weights
for i, weight in enumerate(weights):
    print(f"{weight:.4f}")

0.0720
0.1828
0.1407
0.1151
0.0300
0.0300
0.0112
0.1665
0.1156
0.1361


In [27]:
# I want 6000 records
num_records = 6000

# 1. Category with uneven distribution
categories = ['Boot', 'Athletic', 'Sandal', 'Casual', 'Dress']
category_weights = [0.3, 0.35, 0.15, 0.15, 0.05]  # Uneven distribution
categories_data = random.choices(categories, weights=category_weights, k=num_records)

# 2. Order Number - 8-digit random numbers
order_numbers = [f"{random.randint(10000000, 99999999)}" for _ in range(num_records)]

# 3. Order Date - Random dates between Jan 1, 2025 and one month into future
today = datetime(2025, 6, 20)  
start_date = datetime(2025, 1, 1)
days_between = (today - start_date).days
if days_between < 0:  # Handle future date
    days_between = 365  
    start_date = today - timedelta(days=days_between)
    
random_days = [random.randint(0, days_between) for _ in range(num_records)]
order_dates = [(start_date + timedelta(days=day)).strftime('%Y-%m-%d') for day in random_days]

# 4. Total Sales - Skewed toward higher values
# Using a beta distribution scaled to our range
# Alpha < Beta creates right skew (more lower values)
# Alpha > Beta creates left skew (more higher values)
alpha, beta = 2.0, 1.0  # Parameters for left skew (more higher values)
raw_sales = stats.beta.rvs(alpha, beta, size=num_records)
# Scale to desired range of $20 to $150
total_sales = [round(20 + raw_sales[i] * 130, 2) for i in range(num_records)]

# 5. Region with uneven distribution
regions = ['Northeast', 'Midwest', 'South', 'West']
region_weights = [0.2, 0.15, 0.4, 0.25]  
regions_data = random.choices(regions, weights=region_weights, k=num_records)

# 6. Fictional Brands with uneven distribution using the weights from above that equal 1
brands = ['StrideFlex', 'AeroStep', 'TerraTread', 'UrbanSole',
    'AlpineGrip', 'CoastalWalk', 'EcoTrek',
    'LuxeStep', 'HorizonPath', 'WildTrail']
brand_weights = [0.0720, 0.1828, 0.1407, 0.1151, 0.0300, 0.0300, 0.0112, 0.1665, 0.1156, 0.1361]
brands_data = random.choices(brands, weights=brand_weights, k=num_records)

# Create the DataFrame
footwear_sales = pd.DataFrame({
    'Category': categories_data,
    'Order Number': order_numbers,
    'Order Date': order_dates,
    'Order Total': total_sales,
    'Region': regions_data,
    'Brand': brands_data
})

footwear_sales

Unnamed: 0,Category,Order Number,Order Date,Order Total,Region,Brand
0,Athletic,66306216,2025-05-06,114.11,West,LuxeStep
1,Boot,95395644,2025-06-03,136.75,West,AlpineGrip
2,Boot,57779407,2025-04-16,94.14,South,LuxeStep
3,Boot,16538403,2025-03-02,140.01,West,HorizonPath
4,Sandal,48903840,2025-01-08,98.59,West,TerraTread
...,...,...,...,...,...,...
5995,Athletic,25238583,2025-03-26,55.54,South,AeroStep
5996,Casual,56651206,2025-05-25,95.94,Northeast,HorizonPath
5997,Boot,63430642,2025-01-27,115.98,South,WildTrail
5998,Boot,27187635,2025-03-17,104.24,South,StrideFlex


In [28]:
# Add a Return column where 11% of the rows are randomly selected as returns with the Order Total amount.

# Create a Returns column as 0
footwear_sales['Returns'] = 0.0

# Randomly select 11% of rows to be returns
num_returns = int(0.11 * len(footwear_sales))
return_indices = np.random.choice(footwear_sales.index, size=num_returns, replace=False)

# For the selected rows, set the Returns value equal to the Order Total value
footwear_sales.loc[return_indices, 'Returns'] = footwear_sales.loc[return_indices, 'Order Total']

# Validate
returned = (footwear_sales['Returns'] != 0.0).sum()
print(returned)

# Verify the percentage of returns
return_percentage = (footwear_sales['Returns'] > 0).mean() * 100
print(f"\nPercentage of orders with returns: {return_percentage:.2f}%")

660

Percentage of orders with returns: 11.00%


In [29]:
footwear_sales

Unnamed: 0,Category,Order Number,Order Date,Order Total,Region,Brand,Returns
0,Athletic,66306216,2025-05-06,114.11,West,LuxeStep,0.00
1,Boot,95395644,2025-06-03,136.75,West,AlpineGrip,0.00
2,Boot,57779407,2025-04-16,94.14,South,LuxeStep,0.00
3,Boot,16538403,2025-03-02,140.01,West,HorizonPath,0.00
4,Sandal,48903840,2025-01-08,98.59,West,TerraTread,0.00
...,...,...,...,...,...,...,...
5995,Athletic,25238583,2025-03-26,55.54,South,AeroStep,0.00
5996,Casual,56651206,2025-05-25,95.94,Northeast,HorizonPath,0.00
5997,Boot,63430642,2025-01-27,115.98,South,WildTrail,115.98
5998,Boot,27187635,2025-03-17,104.24,South,StrideFlex,0.00


In [30]:
# Add a promotion column to identify orders that fall within a desginated promo period

def is_promotion_week(date_str):
    """
    Determine if a date falls within a promotion week.
    Promotions occur every 3 weeks starting from Jan 1, 2025.
    Each promotion lasts for 7 days.
    """
    # Convert string date to datetime
    order_date = datetime.strptime(date_str, '%Y-%m-%d')
    
    # Define the start date of the promo
    promotion_start = datetime(2025, 1, 1)
    
    # Calculate days since the promo start
    days_since_start = (order_date - promotion_start).days
    
    # If days_since_start is negative, the order is before promotions began
    if days_since_start < 0:
        return False
    
    # Calculate 3-week cycle
    cycle = days_since_start // 21
    
    # Calculate the day within the current 3-week cycle (0-20)
    day_in_cycle = days_since_start % 21
    
    # If the day falls within the first 7 days of the cycle, it's a promotion week
    return day_in_cycle < 7

# Apply the function to create the promotion column
footwear_sales['Promotion'] = footwear_sales['Order Date'].apply(is_promotion_week)

# Convert boolean to discount percentage
footwear_sales['Promotion'] = footwear_sales['Promotion'].apply(lambda x: 0.20 if x else 0.0)

# Count how many orders fall within promotion periods
promotion_count = (footwear_sales['Promotion'] > 0).sum()
total_orders = len(footwear_sales)
promotion_percentage = (promotion_count / total_orders) * 100

print(f"\nOrders with 20% promotion: {promotion_count} ({promotion_percentage:.2f}% of total)")


Orders with 20% promotion: 2026 (33.77% of total)


In [31]:
footwear_sales

Unnamed: 0,Category,Order Number,Order Date,Order Total,Region,Brand,Returns,Promotion
0,Athletic,66306216,2025-05-06,114.11,West,LuxeStep,0.00,0.0
1,Boot,95395644,2025-06-03,136.75,West,AlpineGrip,0.00,0.2
2,Boot,57779407,2025-04-16,94.14,South,LuxeStep,0.00,0.2
3,Boot,16538403,2025-03-02,140.01,West,HorizonPath,0.00,0.0
4,Sandal,48903840,2025-01-08,98.59,West,TerraTread,0.00,0.0
...,...,...,...,...,...,...,...,...
5995,Athletic,25238583,2025-03-26,55.54,South,AeroStep,0.00,0.2
5996,Casual,56651206,2025-05-25,95.94,Northeast,HorizonPath,0.00,0.0
5997,Boot,63430642,2025-01-27,115.98,South,WildTrail,115.98,0.2
5998,Boot,27187635,2025-03-17,104.24,South,StrideFlex,0.00,0.0


In [None]:
# Create a profit column that randomly uses 40-70% of the order total



In [None]:
# Export the DataFrame to a CSV file
csv_filename = 'footwear_sales.csv'
footwear_sales.to_csv(csv_filename, index=False)

print(f"\nDataset successfully exported to {csv_filename}")
print(f"Total records: {len(footwear_sales)}")