#### Data Generation Process

- **ITEM_ID**: Unique identifier for the product.
- **TIME_STEP**: Date of the transaction.
- **ASIN**: Amazon Standard Identification Number.
- **UNITS_SOLD**: Number of units sold.
- **OrderedAmount**: Total amount of orders, calculated as UNITS_SOLD * keepa_price.
- **GlanceViews**: Number of views the product page received.
- **Business_Unit**: Category of the product.
- **DEAL_TYPE**: Type of deal or promotion.
- **SOA**: Sell Out Allowance (extra discount provided for product).
- **EVENT**: Special event (e.g., holiday, promotion).
- **PEAK_EVENT_FLAG**: Indicator for peak events.
- **NET_INVOICE**: Net invoice amount.
- **CountryCd**: Country code.
- **keepa_price**: Price from Keepa (price tracking service).
- **CTN**: Similar to SKU in e-commerce websites.
- **EVENT_TYPE**: Type of event such as lead-up week, brand week, or spring sales.

In [2]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Seed for reproducibility
np.random.seed(42)

# Constants
NUM_RECORDS = 1000
# ITEM_IDS = [f"item_{i}" for i in range(1, 101)]
ASINS = [f"B00{i:03d}" for i in range(100, 200)]
BUSINESS_UNITS = ['Electronics', 'Clothing', 'Home', 'Books', 'Toys']
DEAL_TYPES = ['None', 'Discount', 'Promotion', 'BOGO']
EVENTS = ['Holiday', 'Black Friday', 'Cyber Monday', 'Regular']
COUNTRIES = ['US', 'CA', 'UK', 'DE', 'FR']
EVENT_TYPES = ['Leadup Week', 'Brand Week', 'Spring Sales', 'Regular Week']

# Helper functions
def random_date(start, end):
    """Generate a random datetime between `start` and `end`"""
    return start + timedelta(
        seconds=random.randint(0, int((end - start).total_seconds())),
    )

# Generate the data
data = {
    # 'ITEM_ID': np.random.choice(ITEM_IDS, NUM_RECORDS),
    'TIME_STEP': [random_date(datetime(2020, 1, 1), datetime(2023, 12, 31)).strftime('%Y-%m-%d') for _ in range(NUM_RECORDS)],
    'UNITS_SOLD': np.random.poisson(20, NUM_RECORDS),  # This will be adjusted later for causal structure
    'GlanceViews': np.random.poisson(100, NUM_RECORDS),  # Adjusted later
    'DEAL_TYPE': np.random.choice(DEAL_TYPES, NUM_RECORDS),
    'SOA': np.random.uniform(0, 50, NUM_RECORDS),
    'EVENT': np.random.choice(EVENTS, NUM_RECORDS),
    'PEAK_EVENT_FLAG': np.random.choice([0, 1], NUM_RECORDS),
    'NET_INVOICE': np.random.uniform(10, 200, NUM_RECORDS),
    'keepa_price': np.random.lognormal(mean=3.5,sigma=0.75, size=NUM_RECORDS),
    'EVENT_TYPE': np.random.choice(EVENT_TYPES, NUM_RECORDS)
}

# Create DataFrame
df = pd.DataFrame(data)

# constant values
df['ITEM_ID'] = df['CountryCd'] + '_' + df['CTN']
df['CountryCd'] = "DE" # germany
df['Business_Unit'] = "BrushHeads" # brush heads
df['CTN'] = "sku_1234" # sku_1234
df['ASIN'] = "B001234567" # B001234567



# Introduce causal structure: DEAL_TYPE affects UNITS_SOLD and GlanceViews
deal_effect = {'None': 1, 'Discount': 1.5, 'Promotion': 2, 'BOGO': 2.5}
df['UNITS_SOLD'] = df.apply(lambda x: int(x['UNITS_SOLD'] * deal_effect[x['DEAL_TYPE']]), axis=1)
df['GlanceViews'] = df.apply(lambda x: int(x['GlanceViews'] * deal_effect[x['DEAL_TYPE']]), axis=1)

# Simulate confounding variable: Business_Unit affects both DEAL_TYPE and UNITS_SOLD
# This is a simplistic way to introduce a confounder. In real scenarios, this would be more complex.
business_unit_effect = {'Electronics': 1.2, 'Clothing': 1, 'Home': 0.8, 'Books': 1.1, 'Toys': 1.3}
df['UNITS_SOLD'] = df.apply(lambda x: int(x['UNITS_SOLD'] * business_unit_effect[x['Business_Unit']]), axis=1)

# Calculate OrderedAmount as UNITS_SOLD * keepa_price to make the data more realistic
df['OrderedAmount'] = df['UNITS_SOLD'] * df['keepa_price']

# Simulate missing data for a confounding variable
df.loc[df['Business_Unit'] == 'Electronics', 'SOA'] = np.nan

# Save to CSV
df.to_csv('../data/synthetic_sales_data.csv', index=False)

# Display the first few rows
print(df.head())

KeyError: 'CountryCd'