# Fake Data Generation with Gemini API

This notebook orchestrates data generation and insertion into the database.

Notes:
- As development happened: some LLM data generators were depreceated over random methods due to exhaustive computation and unnecessary, since it would not change much for our purpose
- Order and order items logic: order is a high level abstraction of each transaction detailed in order items. Given that we are using random methods, it is hard to predict how order items might come across so we are instead generating these first (with the order id) and then overwriting the total. In a real case data flow scenario, this would not make sense but for our use case, is the best possible emplyment.

# 0. Imports and Setup

In [None]:
import os
import json
import time
import re
import logging
import random
from pathlib import Path
from typing import List, Dict, Any, Optional
from google import genai
from google.genai import types
from dotenv import load_dotenv
from collections import defaultdict


# Load environment variables FIRST
load_dotenv()

# Silence OpenTelemetry (Langfuse) errors
logging.getLogger("opentelemetry.sdk._shared_internal").setLevel(logging.CRITICAL)

from langfuse import observe, get_client  # traceability

# Import configurations and templates
from config import *
from db_connector import DatabaseConnector

# Prompts directory
PROMPTS_DIR = Path("prompts")

In [2]:
# Testing environment
print("üîç Checking configuration...\n")

# Check each variable
required = {
    "GEMINI_API_KEY": os.getenv("GEMINI_API_KEY"),
    "LANGFUSE_PUBLIC_KEY": os.getenv("LANGFUSE_PUBLIC_KEY"),
    "LANGFUSE_SECRET_KEY": os.getenv("LANGFUSE_SECRET_KEY"),
    "LANGFUSE_HOST": os.getenv("LANGFUSE_HOST")
}

all_set = True
for name, value in required.items():
    if value:
        display = value[:15] + "..." if len(value) > 15 else value
        print(f"‚úÖ {name}: {display}")
    else:
        print(f"‚ùå {name}: NOT SET")
        all_set = False

print("\n" + "="*50)
if all_set:
    print("üéâ Perfect! Ready to start tracing!")
else:
    print("‚ö†Ô∏è  Please add missing keys to your .env file")

üîç Checking configuration...

‚úÖ GEMINI_API_KEY: AIzaSyBmeV92JOQ...
‚úÖ LANGFUSE_PUBLIC_KEY: pk-lf-f2596628-...
‚úÖ LANGFUSE_SECRET_KEY: sk-lf-9793468d-...
‚úÖ LANGFUSE_HOST: https://cloud.l...

üéâ Perfect! Ready to start tracing!


## Tool 1: SmartJSON Extractor

In [3]:
class SmartJSONExtractor:
    """Robust JSON extraction from LLM responses"""

    def extract(self, text: str) -> Dict[str, Any]:
        """
        Extract JSON from text with multiple fallback strategies

        Args:
            text: Raw text that may contain JSON

        Returns:
            Dict with 'success' (bool), 'data' (parsed JSON), 'error' (str)
        """
        try:
            # Strategy 1: Try direct parsing
            data = json.loads(text.strip())
            return {"success": True, "data": data, "error": None}
        except json.JSONDecodeError:
            pass

        try:
            # Strategy 2: Remove markdown code blocks
            cleaned = self._remove_code_blocks(text)
            data = json.loads(cleaned)
            return {"success": True, "data": data, "error": None}
        except json.JSONDecodeError:
            pass

        try:
            # Strategy 3: Extract first JSON array or object found
            json_match = re.search(r'(\[[\s\S]*\]|\{[\s\S]*\})', text)
            if json_match:
                data = json.loads(json_match.group(1))
                return {"success": True, "data": data, "error": None}
        except (json.JSONDecodeError, AttributeError):
            pass

        return {
            "success": False,
            "data": None,
            "error": "Failed to extract valid JSON from response"
        }

    def _remove_code_blocks(self, text: str) -> str:
        """Remove markdown code block formatting"""
        text = text.strip()
        if text.startswith('```'):
            lines = text.split('\n')
            text = '\n'.join(lines[1:-1]) if len(lines) > 2 else text
            if text.startswith('json'):
                text = text[4:].strip()
        return text

print("‚úì SmartJSONExtractor class loaded")

‚úì SmartJSONExtractor class loaded


## Tool 2: Gemini Data Generator

In [4]:
class GeminiDataGenerator:
    """Generate realistic fake data using Gemini API with structured output"""

    def __init__(self):
        self.client = genai.Client(api_key=os.getenv('GEMINI_API_KEY'))
        self.extractor = SmartJSONExtractor()
        self.generation_config = types.GenerateContentConfig(
            system_instruction=[
            "You're a synthetic data generator for an enterprise jazz vinyl record application, whose clients have very exquisite taste.",
            "Your clients value authenticity, hence try to choose real-world examples when available first.",
        ],
            temperature=GEMINI_TEMPERATURE,
            top_p=0.95,
            top_k=40,
        )
        self.TEMPLATES = TEMPLATES

    def _load_prompt(self, prompt_file: str) -> str:
        """Load prompt from file"""
        prompt_path = PROMPTS_DIR / prompt_file
        with open(prompt_path, 'r') as f:
            return f.read().strip()

    @observe()
    def _build_structured_prompt(
        self,
        instructions: str,
        schema: Dict[str, Any],
        count: int,
        reference_ids: Optional[Dict[str, List[str]]] = None
    ) -> str:
        """
        Build a structured prompt using the CRITICAL format with json.dumps schema

        Args:
            instructions: Natural language instructions for data generation
            schema: Schema template defining the expected structure
            count: Number of records to generate
            reference_ids: Optional dict of reference IDs for foreign keys

        Returns:
            Formatted prompt string
        """
        # Create example schema for a single record
        single_record_schema = schema
        # Full schema is an array of records
        full_schema = {
            "type": "array",
            "items": single_record_schema,
            "minItems": count,
            "maxItems": count
        }

        prompt_parts = [
            "CRITICAL: Output ONLY valid JSON matching this exact schema.",
            "No other text, no markdown, no explanations.\n",
            f"Schema:\n{json.dumps(full_schema, indent=2)}\n",
            f"Instructions:\n{instructions}\n"
        ]

        if reference_ids:
            prompt_parts.append("Reference IDs (use these for foreign key fields):")
            for key, ids in reference_ids.items():
                sample_ids = ids[:10] if len(ids) > 10 else ids
                prompt_parts.append(f"- {key}: {sample_ids}")
            prompt_parts.append("")

        prompt_parts.append(f"Generate exactly {count} records.\n")
        prompt_parts.append("JSON:")

        return "\n".join(prompt_parts)

    @observe()
    def extract_structured_form(
        self,
        instructions: str,
        form_template: Dict[str, Any],
        count: int,
        reference_ids: Optional[Dict[str, List[str]]] = None,
        model_class: Optional[BaseModel] = None
    ) -> List[Dict[str, Any]]:
        """
        Extract data matching a form template with validation

        Args:
            instructions: Natural language instructions for data generation
            form_template: Template defining the expected structure
            count: Number of records to generate
            reference_ids: Optional dict of reference IDs for foreign keys
            model_class: Optional Pydantic model for validation

        Returns:
            List of validated dictionaries
        """
        # Build structured prompt using new format
        full_prompt = self._build_structured_prompt(
            instructions,
            form_template,
            count,
            reference_ids
        )

        # Generate with retry
        return self._generate_with_validation(full_prompt, count, model_class)

    @observe()
    def _generate_with_validation(
        self,
        prompt: str,
        expected_count: int,
        model_class: Optional[BaseModel] = None,
        retry: int = 0
    ) -> List[Dict[str, Any]]:
        """
        Generate content with retry and optional Pydantic validation

        Args:
            prompt: Full prompt to send
            expected_count: Expected number of records
            model_class: Optional Pydantic model for validation
            retry: Current retry attempt

        Returns:
            List of validated dictionaries
        """
        try:
            response = self.client.models.generate_content(
                model=GEMINI_MODEL,
                contents=prompt,
                config=self.generation_config
            )

            # Extract JSON
            result = self.extractor.extract(response.text)

            if not result["success"]:
                raise ValueError(result["error"])

            data = result["data"]

            # Validate with Pydantic 
            if model_class:
                validated_data = []
                for i, item in enumerate(data):
                    try:
                        validated_item = model_class(**item)
                        validated_data.append(validated_item.model_dump())
                    except Exception as e:
                        print(f"Validation warning for record {i+1}: {e}")
                        validated_data.append(item)  
                data = validated_data

            actual_count = len(data)
            print(f"‚úì Generated {actual_count} validated records")
            return data

        except Exception as e:
            if retry < GEMINI_MAX_RETRIES:
                print(f"Error (attempt {retry + 1}/{GEMINI_MAX_RETRIES}): {e}")
                time.sleep(2 ** retry)  # Exponential backoff
                return self._generate_with_validation(prompt, expected_count, model_class, retry + 1)
            else:
                print(f" Failed after {GEMINI_MAX_RETRIES} attempts: {e}")
                return []

# ENTITY TYPE SPECIFIC COMPILING ----------------------------
    @observe()
    def generate_genres(self, count: int) -> List[Dict]:
        """Generate music genres"""
        instructions = self._load_prompt('genre_prompt.txt')
        return self.extract_structured_form(
            instructions,
            self.TEMPLATES['genre'],
            count,
            model_class=Genre
        )

    @observe()
    def generate_labels(self, count: int) -> List[Dict]:
        """Generate record labels"""
        instructions = self._load_prompt('label_prompt.txt')
        return self.extract_structured_form(
            instructions,
            self.TEMPLATES['label'],
            count,
            model_class=Label
        )

    @observe()
    def generate_customers(self, count: int) -> List[Dict]:
        """Generate customers"""
        instructions = self._load_prompt('customer_prompt.txt')
        return self.extract_structured_form(
            instructions,
            self.TEMPLATES['customer'],
            count,
            model_class=Customer
        )

    @observe()
    def generate_albums(self, count: int, genre_ids: List[str], label_ids: List[str]) -> List[Dict]:
        """Generate albums with references to genres and labels"""
        instructions = self._load_prompt('album_prompt.txt')
        return self.extract_structured_form(
            instructions,
            self.TEMPLATES['album'],
            count,
            reference_ids={'genre_ids': genre_ids, 'label_ids': label_ids},
            model_class=Album
        )
    # UNUSED - depreceated over manual input
    @observe()
    def generate_orders(self, count: int, customer_ids: List[str]) -> List[Dict]:
        """Generate orders"""
        instructions = self._load_prompt('order_prompt.txt')
        return self.extract_structured_form(
            instructions,
            self.TEMPLATES['order'],
            count,
            reference_ids={'customer_ids': customer_ids},
            model_class=Order
        )
    # UNUSED- not needed
    @observe()
    def generate_workflows(self, count: int) -> List[Dict]:
        """Generate workflow definitions"""
        instructions = self._load_prompt('workflow_prompt.txt')
        return self.extract_structured_form(
            instructions,
            self.TEMPLATES['workflow'],
            count,
            model_class=Workflow
        )
    # UNUSED over manual random generation
    @observe()
    def generate_order_items(self, order_ids: List[str], album_ids: List[str]) -> List[Dict]:
        """Generate order items for all orders"""
        instructions = self._load_prompt('order_item_prompt.txt')
        
        # Generate 1-5 items per order
        total_items = sum(random.randint(1, 5) for _ in order_ids)
        
        return self.extract_structured_form(
            instructions,
            self.TEMPLATES['order_item'],
            total_items,
            reference_ids={'order_ids': order_ids, 'album_ids': album_ids},
            model_class=OrderItem
        )
    # UNUSED over manual random generation
    @observe()
    def generate_payments(self, count: int, order_ids: List[str]) -> List[Dict]:
        """Generate payment records"""
        instructions = self._load_prompt('payment_prompt.txt')
        return self.extract_structured_form(
            instructions,
            self.TEMPLATES['payment'],
            count,
            reference_ids={'order_ids': order_ids},
            model_class=Payment
        )

    @observe()
    def generate_reviews(self, count: int, customer_ids: List[str], album_ids: List[str]) -> List[Dict]:
        """Generate customer reviews"""
        instructions = self._load_prompt('review_prompt.txt')
        return self.extract_structured_form(
            instructions,
            self.TEMPLATES['review'],
            count,
            reference_ids={'customer_ids': customer_ids, 'album_ids': album_ids},
            model_class=Review
        )

print("‚úì GeminiDataGenerator class loaded")

‚úì GeminiDataGenerator class loaded


## Debugging

In [5]:
def list_prompts():
    """List all available prompt files"""
    prompts_dir = Path("prompts")
    if prompts_dir.exists():
        print("üìÑ Available prompt files:")
        for prompt_file in sorted(prompts_dir.glob("*.txt")):
            print(f"  - {prompt_file.name}")
    else:
        print("‚ö†Ô∏è  Prompts directory not found")

def show_prompt(prompt_name: str):
    """Display content of a specific prompt file"""
    prompt_path = Path("prompts") / prompt_name
    if prompt_path.exists():
        print(f"\n{'='*60}")
        print(f"PROMPT: {prompt_name}")
        print('='*60)
        with open(prompt_path, 'r') as f:
            print(f.read())
        print('='*60 + '\n')
    else:
        print(f" Prompt file not found: {prompt_name}")

def show_all_templates():
    """Display all JSON templates"""
    print("\n" + "="*60)
    print("JSON TEMPLATES (Schemas)")
    print("="*60)
    for name, template in TEMPLATES.items():
        print(f"\n{name.upper()}:")
        print(json.dumps(template, indent=2))
    print("\n" + "="*60)

# Uncomment to view:
list_prompts()
show_all_templates()

print("‚úì Prompt/template inspection utilities loaded")

üìÑ Available prompt files:
  - album_prompt.txt
  - customer_prompt.txt
  - genre_prompt.txt
  - label_prompt.txt
  - order_item_prompt.txt
  - order_prompt.txt
  - payment_prompt.txt
  - review_prompt.txt
  - workflow_prompt.txt

JSON TEMPLATES (Schemas)

GENRE:
{
  "name": null
}

LABEL:
{
  "name": null
}

CUSTOMER:
{
  "email": null,
  "first_name": null,
  "last_name": null,
  "phone": null
}

ALBUM:
{
  "title": null,
  "artist": null,
  "genre_id": null,
  "label_id": null,
  "price": null
}

ORDER:
{
  "order_number": null,
  "customer_id": null,
  "shipping_address": null,
  "order_date": null
}

WORKFLOW:
{
  "name": null,
  "description": null,
  "trigger_type": null,
  "trigger_config": {},
  "workflow_definition": {},
  "enabled": null
}

ORDER_ITEM:
{
  "order_id": null,
  "album_id": null,
  "quantity": null
}

PAYMENT:
{
  "order_id": null,
  "amount": null,
  "payment_method": null,
  "status": null,
  "transaction_id": null
}

REVIEW:
{
  "customer_id": null,
  "alb

# 1. Initialize Generator and Database Connection + Traceability

In [6]:
generator = GeminiDataGenerator()
db = DatabaseConnector()

# Initialize LangFuse client
langfuse_client = get_client()


print("‚úì Generator, DB connector and langfuse initialized")

‚úì Generator, DB connector and langfuse initialized


## 1.1. Inspect Prompt and Schema (Debugging)

In [7]:
# Optional: Inspect how prompts are structured
# This cell shows you the exact prompt and schema sent to Gemini API

def inspect_prompt_for_entity(entity_name: str, template_key: str, count: int = 5):
    """Show the structured prompt for any entity"""
    generator_temp = GeminiDataGenerator()
    
    # Load the prompt
    prompt_text = generator_temp._load_prompt(f'{entity_name}_prompt.txt')
    
    # Get the schema
    schema = TEMPLATES[template_key]
    
    # Build the prompt using the same method
    full_prompt = generator_temp._build_structured_prompt(
        prompt_text,
        schema,
        count
    )
    
    print(f"=== PROMPT FOR {entity_name.upper()} ===\n")
    print(full_prompt)
    print("\n" + "="*60)

# Example: Inspect genre prompt (comment/uncomment to test different entities)
# inspect_prompt_for_entity('genre', 'genre', 5)
# inspect_prompt_for_entity('album', 'album', 3)

print("‚úì Debugging utilities loaded. Uncomment lines above to inspect prompts.")

‚úì Debugging utilities loaded. Uncomment lines above to inspect prompts.


## 1.2 Error Tracking Setup

In [8]:
# Track errors and warnings throughout the process
error_log = []
warning_log = []

def log_error(step: str, error: Exception):
    """Log an error for later review"""
    error_log.append({"step": step, "error": str(error), "type": type(error).__name__})
    print(f" ERROR in {step}: {error}")

def log_warning(step: str, message: str):
    """Log a warning for later review"""
    warning_log.append({"step": step, "message": message})
    print(f"‚ö†Ô∏è  WARNING in {step}: {message}")

def show_logs():
    """Display all errors and warnings"""
    print("\n" + "="*60)
    print("ERROR AND WARNING SUMMARY")
    print("="*60)
    
    if error_log:
        print(f"\n ERRORS ({len(error_log)}):")
        for i, err in enumerate(error_log, 1):
            print(f"\n{i}. {err['step']} ({err['type']})")
            print(f"   {err['error']}")
    else:
        print("\n‚úì No errors!")
    
    if warning_log:
        print(f"\n‚ö†Ô∏è  WARNINGS ({len(warning_log)}):")
        for i, warn in enumerate(warning_log, 1):
            print(f"\n{i}. {warn['step']}")
            print(f"   {warn['message']}")
    else:
        print("\n‚úì No warnings!")
    
    print("\n" + "="*60)

print("‚úì Error tracking initialized")

‚úì Error tracking initialized


# 2. Connect to Database

In [9]:
db.connect()
print("‚úì Connected to database")

Connected to Supabase successfully
‚úì Connected to database


## 2.1. Generate and Insert Genres

In [10]:
print("Generating genres...")
genres_data = generator.generate_genres(DATA_COUNTS['genres'])
print(f"Generated {len(genres_data)} genres")

genre_ids = db.insert_genres(genres_data)
print(f"‚úì Inserted {len(genre_ids)} genres")
print(f"Sample genre IDs: {genre_ids[:5]}")

Generating genres...
‚úì Generated 15 validated records
Generated 15 genres
‚úì Inserted 15 genres
Sample genre IDs: ['1f657a18-645d-4206-a431-c45e967bcb1d', '8396d96c-9644-4d83-8cb8-301f9942adae', 'b0718b04-1517-4669-97d3-13b27e79d549', '70a76012-6212-49ff-a125-603de72b02ed', 'ce7b1985-50d6-4f25-9d77-44b8b78b0eaa']


## 2.2. Generate and Insert Labels

In [11]:
print("Generating labels...")
labels_data = generator.generate_labels(DATA_COUNTS['labels'])
print(f"Generated {len(labels_data)} labels")

label_ids = db.insert_labels(labels_data)
print(f"‚úì Inserted {len(label_ids)} labels")
print(f"Sample label IDs: {label_ids[:5]}")

Generating labels...
‚úì Generated 20 validated records
Generated 20 labels
‚úì Inserted 20 labels
Sample label IDs: ['a6debdad-89fb-428d-ba80-e4d1f83b2478', '682957ea-82cf-494b-bca1-3ec2e2938d22', '469b8cb4-5be0-4c27-993f-02f9dc8e1142', '26e329d9-0a0b-4e06-a53d-539cdc6ab27e', '437d32dc-cc1c-4680-81b9-8e556f48b25d']


## 2.3. Generate and Insert Customers

In [12]:
print("Generating customers...")
customers_data = generator.generate_customers(DATA_COUNTS['customers'])
print(f"Generated {len(customers_data)} customers")

customer_ids = db.insert_customers(customers_data)
print(f"‚úì Inserted {len(customer_ids)} customers")
print(f"Sample customer IDs: {customer_ids[:5]}")

Generating customers...
‚úì Generated 100 validated records
Generated 100 customers
‚úì Inserted 100 customers
Sample customer IDs: ['00c4c355-ed1a-43fb-8ae3-2052e5f95418', 'e9368dcf-6a85-4c63-9967-2af56d151dee', '4babe54a-8b62-49ee-b619-00c093744ad4', '7d75c15f-4bd1-42b3-8bfd-9f0a83479323', 'aeb8127f-d03a-46d7-af63-62c1fef91893']


## 2.4. Generate and Insert Albums

In [13]:
print("Generating albums...")
albums_data = generator.generate_albums(DATA_COUNTS['albums'], genre_ids, label_ids)
print(f"Generated {len(albums_data)} albums")

album_ids = db.insert_albums(albums_data)
print(f"‚úì Inserted {len(album_ids)} albums")
print(f"Sample album IDs: {album_ids[:5]}")

Generating albums...
‚úì Generated 298 validated records
Generated 298 albums
‚úì Inserted 298 albums
Sample album IDs: ['f91028df-0494-4092-9248-5194651be8bd', '6a5c3554-9eac-43af-8123-ab9e00c76b2a', 'cfc1875b-4247-4680-9c2d-dfa51b4e787a', '1305a889-18be-4ca1-a445-ecf38cd0bc71', '66c16aa0-6f0e-429b-8232-6f05814b982d']


## 2.5. Generate and Insert Inventory

In [14]:
print("Generating inventory...")
import random

# Create inventory records for each album with random quantity 1-200
inventory_data = []
for album_id in album_ids:
    inventory_data.append({
        'album_id': album_id,
        'quantity': random.randint(1, 200)
    })

print(f"Generated {len(inventory_data)} inventory records")

inventory_ids = db.insert_inventory(inventory_data)
print(f"‚úì Inserted {len(inventory_ids)} inventory records")
print(f"Sample inventory IDs: {inventory_ids[:5]}")

Generating inventory...
Generated 298 inventory records
‚úì Inserted 298 inventory records
Sample inventory IDs: ['efa93197-ccb7-48ae-8ea3-5b4165fc5eb2', '7d632c03-c47e-499e-a303-758d25c01bb7', '286c49d0-ea8b-4ef2-95ba-dc4177f805d2', '68f96341-a352-4b1e-8088-cede74206c13', 'a09d4f6f-4ca6-4686-9b13-b3dfdab4a78b']


## 2.6. Generate and Insert Orders

In [15]:
print("Generating orders...")
orders_data = generator.generate_orders(DATA_COUNTS['orders'], customer_ids)
print(f"Generated {len(orders_data)} orders")

# Insert orders without totals (will be calculated after order items are created)
# We need to add total=0.0 temporarily for database constraint (since random generations are not the most predictable)
orders_with_temp_total = [dict(order, total=0.0) for order in orders_data]
order_ids = db.insert_orders(orders_with_temp_total)
print(f"‚úì Inserted {len(order_ids)} orders (totals will be calculated after order items)")
print(f"Sample order IDs: {order_ids[:5]}")

Generating orders...
‚úì Generated 150 validated records
Generated 150 orders
‚úì Inserted 150 orders (totals will be calculated after order items)
Sample order IDs: ['e5185ee8-bec2-443e-9201-b728a5da902c', '2ad35ad7-a9f9-4910-9cd5-2d81c5648649', '461f15d8-1a99-45fb-b55b-616c5b6e5cff', '6c58fd7b-d0a8-485b-ba4f-b8bc787dbcaa', 'd6f9d813-0cf3-406a-85f3-15feaae90e21']


## 2.7. Generate and Insert Order Items

In [17]:
# Manual random generation

print("Generating order items...")

order_items_data = []

for order_id in order_ids:
    # Pick random number of albums for this order (1-20)
    num_albums = random.randint(1, 20)
    
    # Randomly select albums for this order (without replacement within the same order)
    selected_albums = random.sample(album_ids, min(num_albums, len(album_ids)))
    
    # Create order items
    for album_id in selected_albums:
        order_items_data.append({
            'order_id': order_id,
            'album_id': album_id,
            'quantity': random.randint(1, 3)  # 1-3 quantity per album
        })

print(f"Generated {len(order_items_data)} order items")

order_item_ids = db.insert_order_items(order_items_data)
print(f"‚úì Inserted {len(order_item_ids)} order items")
print(f"Sample order item IDs: {order_item_ids[:5]}")


Generating order items...
Generated 1601 order items
‚úì Inserted 1601 order items
Sample order item IDs: ['0e62d431-8886-45e5-8d03-5a62caf396b2', 'a24c8656-ebbd-4cf0-868b-563df09d19e7', '5c4c0927-af32-4187-8fe0-702f0c6d65e2', '7fdc3f24-6022-47bb-866c-ff652b35bd37', '163580eb-2952-4bdb-a33f-360b483fcc7a']


## 2.7.1. Calculate and Update Order Totals

In [20]:
# Manual random generation

print("Calculating order totals based on order items...")

# Fetch album prices from database
albums_with_prices = db.get_albums_data()
album_price_map = {album['album_id']: float(album['price']) for album in albums_with_prices}

# Group order items by order_id and calculate totals
from collections import defaultdict
order_totals = defaultdict(float)

for item in order_items_data:
    order_id = item['order_id']
    album_id = item['album_id']
    quantity = item['quantity']
    
    # Get price from album
    unit_price = album_price_map.get(album_id, 0.0)
    item_total = unit_price * quantity
    order_totals[order_id] += item_total

# Prepare batch update data
print(f"Updating {len(order_totals)} orders with calculated totals...")
update_data = [
    {'order_id': order_id, 'total': total}
    for order_id, total in order_totals.items()
]

# Perform batch upsert
for update in update_data:
    db.client.table('orders').update({'total': update['total']}).eq('order_id', update['order_id']).execute()

print(f"‚úì Updated {len(order_totals)} order totals")
print(f"Sample totals: {dict(list(order_totals.items())[:3])}")


Calculating order totals based on order items...
Updating 150 orders with calculated totals...
‚úì Updated 150 order totals
Sample totals: {'e5185ee8-bec2-443e-9201-b728a5da902c': 5137.150000000001, '2ad35ad7-a9f9-4910-9cd5-2d81c5648649': 1340.8200000000002, '461f15d8-1a99-45fb-b55b-616c5b6e5cff': 4259.05}


## 2.8. Generate and Insert Payments

In [22]:
# Manual random generation
print("Generating payments...")

payments_data = []

# Create one payment per order
for order_id in order_ids:
    # Get the order total (we calculated this in section 2.7.1)
    # Fetch it from order_totals dict if available, otherwise query DB
    order_total = 0.0
    
    # Query the order to get its total
    order_result = db.client.table('orders').select('total').eq('order_id', order_id).execute()
    if order_result.data:
        order_total = float(order_result.data[0]['total'])
    
    # Generate payment record
    payment_method = random.choice(['card', 'cash', 'bank_transfer', 'paypal'])
    status = random.choice(['completed'] * 8 + ['pending'] * 1 + ['failed'] * 1)  # 80% completed
    
    payments_data.append({
        'order_id': order_id,
        'amount': order_total,
        'payment_method': payment_method,
        'status': status,
        'transaction_id': f"TXN-{random.randint(100000, 999999)}-{order_id[:8]}"
    })

print(f"Generated {len(payments_data)} payments")

payment_ids = db.insert_payments(payments_data)
print(f"‚úì Inserted {len(payment_ids)} payments")
print(f"Sample payment IDs: {payment_ids[:5]}")


Generating payments...
Generated 150 payments
‚úì Inserted 150 payments
Sample payment IDs: ['3adfaa34-c4e7-4092-b0a6-162ccffc1c94', 'd1413c39-b0d2-43bc-9a8e-8af0b7944a00', 'c754d325-c540-46f8-a10a-ba57c97f60db', '051b85a1-9102-41a8-a376-2cc5dfdb60d4', 'dc8b94e4-e8e7-42fb-a104-3f0035f10ded']


## 2.9. Generate and Insert Reviews

In [23]:
print("Generating reviews...")
reviews_data = generator.generate_reviews(DATA_COUNTS['reviews'], customer_ids, album_ids)
print(f"Generated {len(reviews_data)} reviews")

review_ids = db.insert_reviews(reviews_data)
print(f"‚úì Inserted {len(review_ids)} reviews")
print(f"Sample review IDs: {review_ids[:5]}")

Generating reviews...
‚úì Generated 167 validated records
Generated 167 reviews
‚úì Inserted 167 reviews
Sample review IDs: ['d189fbf5-f99c-4d04-9952-751e22056aff', '294b1d1d-231f-48e0-bedf-2d22bd870f50', '20f931ef-7c3a-4d95-b785-08e9031d21dc', '97631e8b-6d86-4304-8686-e37a7aceb11d', '7b318fd3-9b72-4b12-bcbe-e2183535396b']


## 2.10. Generate and Insert Sales Transactions

In [24]:
print("Generating sales transactions...")

# Sales transactions are audit records based on order_items
# We'll create sale transactions for each order item
sales_data = []

# Fetch order items to create corresponding sales transactions
for order_item in order_items_data:
    # Find the inventory_id for this album
    album_id = order_item['album_id']
    inventory_id = next((inv_id for inv_id, alb_id in zip(inventory_ids, album_ids) if alb_id == album_id), None)
    
    if inventory_id:
        # Fetch album price (we'll need to query this or store it)
        # For now, we'll create the transaction without unit_price
        sales_data.append({
            'inventory_id': inventory_id,
            'order_id': order_item['order_id'],
            'transaction_type': 'sale',
            'quantity_change': -order_item['quantity']  # Negative for sales
        })

print(f"Generated {len(sales_data)} sales transactions")

db.insert_sales(sales_data)
print(f"‚úì Inserted {len(sales_data)} sales transactions")

Generating sales transactions...
Generated 1601 sales transactions
‚úì Inserted 1601 sales transactions


## 2.11. Generate and Insert Workflows (UNUSED)

In [None]:
print("Generating workflows...")
workflows_data = generator.generate_workflows(DATA_COUNTS['workflows'])
print(f"Generated {len(workflows_data)} workflows")

# Debug: Inspect first workflow with complex JSON fields
print("\nüìã Sample generated workflow:")
print(json.dumps(workflows_data[0] if workflows_data else {}, indent=2))

workflow_ids = db.insert_workflows(workflows_data)
print(f"\n‚úì Inserted {len(workflow_ids)} workflows")
print(f"Sample workflow IDs: {workflow_ids[:5]}")

## 2.12. Completion and Cleanup

In [25]:
# Flush all traces to LangFuse
langfuse_client.flush()
print("‚úì LangFuse traces flushed to dashboard")


‚úì LangFuse traces flushed to dashboard


In [26]:
db.close()
print("‚úì Database connection closed")

Database connection closed
‚úì Database connection closed
