# Stage 1: Data Ingestion (Mock Data Generation)

**Description:** This notebook simulates the extraction of raw product data from a legacy ERP system. 
Since real internal sales and margin data are sensitive/unavailable, I generate a synthetic dataset that mimics the schema of a real pharmaceutical inventory system.

**Key Objectives:**
1. Generate realistic product attributes (Vitamins, Pricing, Stock).
2. Introduce controlled "data quality issues" (nulls, invalid negative prices) to demonstrate ETL capabilities later.
3. Save data in **Parquet format** to simulate a modern Data Lakehouse architecture.

In [1]:
import pandas as pd
import numpy as np
import os
from datetime import datetime, timedelta

# --- Configuration ---
# Seed NumPy's legacy global RNG so every np.random.* draw below is reproducible.
np.random.seed(42)
N_PRODUCTS = 2000
OUTPUT_DIR = '../data/raw'
OUTPUT_FILE = 'supplements_raw.parquet'

print(f"Starting Data Ingestion for {N_PRODUCTS} products...")

# --- 1. Data Generation (Simulation) ---
# Simulating categorical data found in typical ERP systems
categories = ['Vitamin C', 'Magnesium', 'Multivitamin', 'Vitamin D3', 'Omega-3', 'Zinc']
forms = ['Capsules', 'Tablets', 'Liquid', 'Powder']
tiers = ['Plus', 'Pro', 'Basic', 'Ultra']

# Draw every categorical attribute exactly once per product (vectorized).
# The category is sampled a single time and reused for both the `category`
# column and the product name, so the two can never disagree.
category_values = np.random.choice(categories, size=N_PRODUCTS).tolist()
tier_values = np.random.choice(tiers, size=N_PRODUCTS).tolist()
form_values = np.random.choice(forms, size=N_PRODUCTS).tolist()

# One reference timestamp for the whole batch: rows differ only by a whole
# number of days (0-29), not by the microseconds between datetime.now() calls.
reference_time = datetime.now()
# .tolist() yields plain Python ints, which timedelta() requires.
age_in_days = np.random.randint(0, 30, size=N_PRODUCTS).tolist()

# Generating the dataset
data = {
    'product_id': range(1001, 1001 + N_PRODUCTS),
    'product_name': [
        f"NutriLife {category} {tier}"
        for category, tier in zip(category_values, tier_values)
    ],
    'category': category_values,
    'form': form_values,

    # Lab results (Chemical composition)
    # Normal distribution simulates realistic variation; clipped at 0 because a
    # negative milligram content is impossible and would be an *uncontrolled*
    # data-quality issue (the controlled ones are injected in a later step).
    'vitamin_c_mg': np.random.normal(loc=250, scale=100, size=N_PRODUCTS).clip(min=0),
    'magnesium_mg': np.random.normal(loc=150, scale=50, size=N_PRODUCTS).clip(min=0),

    # Business Metrics
    'price_eur': np.random.uniform(15, 60, N_PRODUCTS).round(2),
    'stock_level': np.random.randint(0, 500, N_PRODUCTS),
    'last_updated': [reference_time - timedelta(days=days) for days in age_in_days],
}

df = pd.DataFrame(data)

# --- 2. Injecting Data Quality Issues (For ETL Demo) ---
# The frame is deliberately corrupted here so the ETL stage has something to repair.

# Case A: Missing Values (Simulating incomplete lab reports)
# Each row is flagged independently with probability 0.05; flagged rows lose
# their magnesium reading (replaced by NaN).
missing_lab_rows = np.random.choice(
    [True, False],
    size=N_PRODUCTS,
    p=[0.05, 0.95],
)
df['magnesium_mg'] = df['magnesium_mg'].mask(missing_lab_rows)

# Case B: Data Errors (Simulating manual entry errors)
# .loc slices by label and includes both endpoints, so labels 0, 1 and 2
# receive an invalid negative price.
bad_price_labels = slice(0, 2)
df.loc[bad_price_labels, 'price_eur'] = -10.0

# --- 3. Data Storage (Lakehouse Pattern) ---
# Create the landing folder when missing; a no-op if it already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Persist the frame as Parquet through the pyarrow engine.
full_path = os.path.join(OUTPUT_DIR, OUTPUT_FILE)
df.to_parquet(full_path, engine='pyarrow')

# Run summary: one entry per printed line.
run_summary = (
    "Data Ingestion completed successfully.",
    f"File saved to: {full_path}",
    f"Dataset Shape: {df.shape}",
)
for summary_line in run_summary:
    print(summary_line)

# Preview (last expression of the cell, so the notebook renders it)
df.head()

 Generating 2000 synthetic products with strict validation rules...
Success! Generated 2000 rows.
Data saved to: ../data/raw/supplements_raw.parquet

--- Data Quality Check (Sample) ---


Unnamed: 0,category,product_name,price_eur
1236,Multivitamin,NutriLife Men's Multi Power Premium,45.75
1081,Vitamin C,NutriLife Liposomal Vitamin C Basic,19.34
1825,Omega-3,NutriLife Vegan Algae Omega Basic,21.81
575,Vitamin D3,NutriLife Vitamin D3 2000IU Basic,33.19
450,Vitamin C,NutriLife Liposomal Vitamin C Pro,40.45
