In [14]:
import numpy as np
import pandas as pd

from pathlib import Path

# Seed NumPy's global RNG so every synthetic table below is reproducible.
np.random.seed(42)

# Directory that receives the generated raw CSV files; created if missing.
DATA_RAW = Path("..") / "data" / "raw"
DATA_RAW.mkdir(parents=True, exist_ok=True)


In [15]:
# Dimensions of the synthetic retail panel.
N_STORES = 10
N_PRODUCTS = 50
N_WEEKS = 52

# Zero-padded identifiers: stores are S01, S02, ...; products are P001, P002, ...
stores = [f"S{store_no:02d}" for store_no in range(1, N_STORES + 1)]
products = [f"P{product_no:03d}" for product_no in range(1, N_PRODUCTS + 1)]
# Week numbers are 1-based.
weeks = [*range(1, N_WEEKS + 1)]

stores[:3], products[:3], weeks[:5]


(['S01', 'S02', 'S03'], ['P001', 'P002', 'P003'], [1, 2, 3, 4, 5])

In [16]:
categories = ["Fresh", "Dairy", "Snacks", "Beverages", "Household"]
# Sampling weight of each category, in the same order as `categories`.
category_weights = [0.22, 0.18, 0.24, 0.18, 0.18]

# Draw order matters for reproducibility: categories first, then cost prices.
product_categories = np.random.choice(categories, size=N_PRODUCTS, p=category_weights)
# Lognormal cost prices: most products cluster together, a few are expensive.
product_costs = np.random.lognormal(mean=3.2, sigma=0.35, size=N_PRODUCTS)

product_df = pd.DataFrame({
    "product_id": products,
    "category": product_categories,
    "cost_price": np.round(product_costs, 2),
})

product_df.head()


Unnamed: 0,product_id,category,cost_price
0,P001,Dairy,31.77
1,P002,Household,26.05
2,P003,Beverages,23.56
3,P004,Snacks,22.08
4,P005,Fresh,14.62


### Base price per product (selling price before discount)

In [17]:
# The business needs at least some margin on every product, so the base
# (pre-discount) selling price is the cost price scaled by a random
# per-product factor drawn uniformly between 1.15 and 1.65.
margin_factor = np.random.uniform(1.15, 1.65, size=N_PRODUCTS)
product_df["base_price"] = (product_df["cost_price"] * margin_factor).round(2)

product_df.describe()


Unnamed: 0,cost_price,base_price
count,50.0,50.0
mean,25.714,35.5044
std,7.798726,10.261722
min,9.81,11.99
25%,20.49,27.735
50%,24.76,36.555
75%,31.425,42.4575
max,42.42,54.89


### Store strength (some stores just sell more)

In [18]:
# Some stores simply sell more than others (location, higher local demand,
# and similar factors); model that as a per-store demand multiplier.
strength_multipliers = np.random.uniform(0.75, 1.35, size=N_STORES)
store_df = pd.DataFrame({"store_id": stores, "store_strength": strength_multipliers})

store_df.sort_values("store_strength", ascending=False).head()


Unnamed: 0,store_id,store_strength
8,S09,1.164563
2,S03,1.156539
6,S07,1.137104
1,S02,1.104536
4,S05,1.057256


### Seasonality curve

In [19]:
# Seasonal demand multiplier per week: a flat baseline of 1.0 plus a Gaussian
# festival bump centred on week 44 (so the multiplier tops out at 1.25).
peak_week, peak_width, peak_height = 44, 4.5, 0.25

seasonality = np.array([
    1.0 + peak_height * np.exp(-0.5 * ((w - peak_week) / peak_width) ** 2)
    for w in weeks
])

pd.DataFrame({"week": weeks, "seasonality": seasonality}).tail()


Unnamed: 0,week,seasonality
47,48,1.16841
48,49,1.134852
49,50,1.102778
50,51,1.074559
51,52,1.051481


In [20]:
# Full panel: one row for every (week, store, product) combination.
panel_index = pd.MultiIndex.from_product(
    iterables=[weeks, stores, products],
    names=["week", "store_id", "product_id"],
)
grid = panel_index.to_frame(index=False)

grid.shape, grid.head()


((26000, 3),
    week store_id product_id
 0     1      S01       P001
 1     1      S01       P002
 2     1      S01       P003
 3     1      S01       P004
 4     1      S01       P005)

In [21]:
# Attach product attributes and the store multiplier to every panel row.
product_cols = ["product_id", "category", "cost_price", "base_price"]
df = (
    grid
    .merge(product_df[product_cols], how="left", on="product_id")
    .merge(store_df, how="left", on="store_id")
)

df.head()


Unnamed: 0,week,store_id,product_id,category,cost_price,base_price,store_strength
0,1,S01,P001,Dairy,31.77,50.71,0.774465
1,1,S01,P002,Household,26.05,36.98,0.774465
2,1,S01,P003,Beverages,23.56,36.61,0.774465
3,1,S01,P004,Snacks,22.08,35.28,0.774465
4,1,S01,P005,Fresh,14.62,19.14,0.774465


## Discount Table
Discounts are not always given.

In [22]:
# Promo probability per category (looked up by category name).
promo_prob = dict(
    Fresh=0.18,
    Dairy=0.22,
    Snacks=0.30,
    Beverages=0.26,
    Household=0.20,
)

# Discount percent when promo is active
def sample_discount(cat, size):
    """Draw random discount percentages for promoted rows of one category.

    Snacks and Beverages draw from {5, 10, 15, 20, 25}. Every other category
    draws from {0, 5, 10, 15, 20}, where 0 is the single most likely outcome
    (p=0.45).

    Args:
        cat: Category name.
        size: Number of values to draw (forwarded to ``np.random.choice``).

    Returns:
        NumPy array of integer discount percentages.
    """
    deep_promo = ([5, 10, 15, 20, 25], [0.18, 0.30, 0.25, 0.18, 0.09])
    light_promo = ([0, 5, 10, 15, 20], [0.45, 0.25, 0.18, 0.09, 0.03])

    levels, weights = deep_promo if cat in ("Snacks", "Beverages") else light_promo
    return np.random.choice(levels, size=size, p=weights)

# Build the raw discount table one category at a time: flag promoted rows,
# sample a discount for each, and keep only rows with a non-zero discount.
key_cols = ["week", "store_id", "product_id"]

disc_rows = []
for cat in categories:
    in_category = df.loc[df["category"] == cat, key_cols + ["category"]]
    # One uniform draw per row decides whether that row is on promotion.
    promo_mask = np.random.rand(len(in_category)) < promo_prob[cat]
    promoted = in_category.loc[promo_mask].copy()
    promoted["discount_percent"] = sample_discount(cat, len(promoted))
    # A sampled discount of 0 means "no discount": drop it from the table.
    promoted = promoted.loc[promoted["discount_percent"] > 0]
    disc_rows.append(promoted[key_cols + ["discount_percent"]])

discount_df = pd.concat(disc_rows, ignore_index=True)

discount_df.head(), discount_df.shape


(   week store_id product_id  discount_percent
 0     1      S02       P033                20
 1     1      S02       P043                10
 2     1      S03       P038                 5
 3     1      S04       P033                15
 4     1      S05       P038                 5,
 (4510, 4))

## Generating the marketing table (raw)

Marketing spend varies by store and season (more spend in peak weeks).

In [23]:
# Each store gets one random baseline budget, scaled by its strength; every
# week then applies an amplified seasonal lift and multiplicative noise.
strength_by_store = store_df.set_index("store_id")["store_strength"]

marketing_rows = []
for s in stores:
    base = np.random.uniform(5000, 25000)  # weekly baseline spend
    strength = strength_by_store.loc[s]
    for w, season in zip(weeks, seasonality):
        # Marketing reacts 1.2x as strongly to the season as demand does.
        seasonal_boost = 1 + (season - 1) * 1.2
        noise = np.random.normal(1.0, 0.15)
        spend = base * strength * seasonal_boost * noise
        # Spend is rounded to 2 decimals and floored at zero.
        marketing_rows.append([w, s, max(0, round(spend, 2))])

marketing_df = pd.DataFrame(marketing_rows, columns=["week", "store_id", "marketing_spend"])
marketing_df.head()


Unnamed: 0,week,store_id,marketing_spend
0,1,S01,8047.96
1,2,S01,6862.98
2,3,S01,9418.4
3,4,S01,6656.55
4,5,S01,7178.85


In [24]:
# Price jitter: every (week, store, product) row gets its own multiplicative
# noise factor (mean 1.0, standard deviation 0.03) around the base price.
price_noise = np.random.normal(loc=1.0, scale=0.03, size=len(df))
df["selling_price"] = (df["base_price"] * price_noise).round(2)

df[["base_price", "selling_price"]].head()


Unnamed: 0,base_price,selling_price
0,50.71,51.95
1,36.98,38.57
2,36.61,34.52
3,35.28,35.43
4,19.14,19.55


## Computing Units Sold

These depend on:

- product base demand
- store strength
- seasonality
- discount increases units
- higher price decreases units
- marketing increases units
- plus random noise

In [25]:
# base demand per product (some products are naturally popular)
base_demand = pd.Series(
    np.random.lognormal(mean=2.6, sigma=0.55, size=N_PRODUCTS),
    index=products
)

df["base_demand"] = df["product_id"].map(base_demand)

# Keep this cell re-runnable: if it already ran once, df still carries the
# merged columns, and merging again would create discount_percent_x/_y and
# marketing_spend_x/_y, making the lookups below raise KeyError.
# On a fresh kernel these columns do not exist yet and this is a no-op.
df = df.drop(columns=["discount_percent", "marketing_spend"], errors="ignore")

# attach discount (missing = 0, because the discount table only lists promoted rows)
# validate="many_to_one" raises if the right-hand keys are duplicated, which
# would otherwise silently multiply panel rows.
df = df.merge(
    discount_df,
    on=["week", "store_id", "product_id"],
    how="left",
    validate="many_to_one",
)
df["discount_percent"] = df["discount_percent"].fillna(0)

# attach marketing (one spend value per week and store)
df = df.merge(marketing_df, on=["week", "store_id"], how="left", validate="many_to_one")

# discounted price
df["discounted_price"] = df["selling_price"] * (1 - df["discount_percent"] / 100)

# price sensitivity (elasticity-ish): higher magnitude = more sensitive
# keep it realistic by category
price_sensitivity = {
    "Fresh": 0.8,
    "Dairy": 1.0,
    "Snacks": 1.4,
    "Beverages": 1.2,
    "Household": 0.9
}
df["price_sens"] = df["category"].map(price_sensitivity)

# seasonality multiplier: weeks are 1-based, the seasonality array is 0-based
df["seasonality"] = seasonality[df["week"].to_numpy() - 1]

# marketing effect: diminishing returns using log, scaled so the largest
# spend in marketing_df maps to 1.0
df["mkt_effect"] = np.log1p(df["marketing_spend"]) / np.log1p(marketing_df["marketing_spend"].max())

# demand formula (multiplicative style, then add noise)
noise = np.random.normal(1.0, 0.25, size=len(df))

# demand increases with discount, decreases with price, increases with marketing + seasonality + store strength
# Note: a discount acts twice here, once through the direct uplift term and
# once through the lower discounted_price in the price term.
df["units_sold"] = (
    df["base_demand"]
    * df["store_strength"]
    * df["seasonality"]
    * (1 + 0.03 * df["discount_percent"])                 # each discount point adds 0.03 to the multiplier (20% off -> 1.6x)
    * (df["discounted_price"] / df["base_price"]) ** (-df["price_sens"])  # price up -> units down
    * (1 + 0.30 * df["mkt_effect"])                       # marketing boosts up to ~30%
    * noise
)

# convert to integer units and keep non-negative
df["units_sold"] = np.clip(np.round(df["units_sold"]), 0, None).astype(int)

df[["week","store_id","product_id","category","selling_price","discount_percent","marketing_spend","units_sold"]].head()


Unnamed: 0,week,store_id,product_id,category,selling_price,discount_percent,marketing_spend,units_sold
0,1,S01,P001,Dairy,51.95,0.0,8047.96,17
1,1,S01,P002,Household,38.57,0.0,8047.96,7
2,1,S01,P003,Beverages,34.52,15.0,8047.96,25
3,1,S01,P004,Snacks,35.43,20.0,8047.96,29
4,1,S01,P005,Fresh,19.55,0.0,8047.96,19


In [26]:
# Raw sales extract: keys, units sold and the listed (pre-discount) selling price.
sales_columns = ["week", "store_id", "product_id", "units_sold", "selling_price"]
sales_df = df.loc[:, sales_columns].copy()

sales_df.head(), sales_df.shape


(   week store_id product_id  units_sold  selling_price
 0     1      S01       P001          17          51.95
 1     1      S01       P002           7          38.57
 2     1      S01       P003          25          34.52
 3     1      S01       P004          29          35.43
 4     1      S01       P005          19          19.55,
 (26000, 5))

In [27]:
def _drop_random_rows(frame, fraction):
    """Return a copy of `frame` with a random `fraction` of rows removed and the index reset."""
    doomed = np.random.choice(frame.index, size=int(fraction * len(frame)), replace=False)
    return frame.drop(doomed).reset_index(drop=True)


# 1) Duplicate some sales rows (like accidental double ingestion)
dup_idx = np.random.choice(sales_df.index, size=int(0.005 * len(sales_df)), replace=False)
sales_df_dirty = pd.concat([sales_df, sales_df.loc[dup_idx]], ignore_index=True)

# 2) Remove some discount rows to simulate missing promo data
discount_df_dirty = _drop_random_rows(discount_df, 0.08)

# 3) Remove some marketing rows to simulate missing spend entries
marketing_df_dirty = _drop_random_rows(marketing_df, 0.05)

# 4) Add a few outliers in units (extreme spikes)
outlier_idx = np.random.choice(sales_df_dirty.index, size=30, replace=False)
spike_factors = np.random.choice([5, 8, 12], size=30)
sales_df_dirty.loc[outlier_idx, "units_sold"] = (
    sales_df_dirty.loc[outlier_idx, "units_sold"] * spike_factors
)

# 5) Add a few wrong product IDs in sales
bad_idx = np.random.choice(sales_df_dirty.index, size=25, replace=False)
sales_df_dirty.loc[bad_idx, "product_id"] = "P999"  # invalid

sales_df_dirty.shape, discount_df_dirty.shape, marketing_df_dirty.shape


((26130, 5), (4150, 4), (494, 3))

In [28]:
# The exported product table carries id, category and cost only; base_price is not written.
product_out = product_df[["product_id", "category", "cost_price"]].copy()

# File name -> frame, in the order the files are written and reported.
raw_outputs = {
    "product.csv": product_out,
    "sales.csv": sales_df_dirty,
    "discount.csv": discount_df_dirty,
    "marketing.csv": marketing_df_dirty,
}

for filename, frame in raw_outputs.items():
    frame.to_csv(DATA_RAW / filename, index=False)

print("Saved:")
for filename in raw_outputs:
    print(DATA_RAW / filename)


Saved:
../data/raw/product.csv
../data/raw/sales.csv
../data/raw/discount.csv
../data/raw/marketing.csv
