In [0]:
!pip install reportlab

In [0]:
%run ./00-init-requirements

# Sales CRM Data Generator - Code Summary
This script generates synthetic sales and CRM data. Key features:
- Creates comprehensive sales datasets with lead, opportunity, and customer data
- Generates realistic US-based sales profiles including:
  1. Lead generation and qualification data
  2. Sales opportunities with stages and probabilities
  3. Customer accounts with demographics and firmographics
  4. Sales performance metrics and KPIs
  5. Territory and quota management
  6. Revenue and pipeline tracking
  7. Sales rep performance data

The data includes US-specific elements like states, zip codes, industries, and company sizes. Key KPIs tracked include conversion rates, average deal size, sales cycle length, quota attainment, and revenue metrics.

## Sales Representatives

In [0]:
def generate_sales_reps(num_reps=50, seed=42):
    """
    Generate synthetic sales representative data for Workday

    Every rep receives an ID, a name, demographics, a role, a territory,
    a manager, a hire date, an annual quota and a quota attainment figure.
    Both numpy's global RNG and the stdlib ``random`` module are re-seeded,
    so two calls with the same arguments on the same day return identical
    frames (hire dates are computed relative to ``datetime.now()``).

    Parameters:
    -----------
    num_reps : int
        Number of sales rep records to generate
    seed : int
        Random seed for reproducibility

    Returns:
    --------
    pandas.DataFrame
        One row per rep with columns RepID, FirstName, LastName, Gender,
        Age, ExperienceYears, Role, Territory, Manager, HireDate
        ('YYYY-MM-DD' string), AnnualQuota, QuotaAttainmentPercent, IsActive
    """
    # Set random seed for reproducibility (both RNG streams are used below)
    np.random.seed(seed)
    random.seed(seed)

    # Rep IDs are 1-based and zero-padded: REP0001, REP0002, ...
    rep_ids = [f'REP{str(i).zfill(4)}' for i in range(1, num_reps + 1)]

    # First names by gender
    male_names = ['James', 'Michael', 'Robert', 'David', 'William', 'Richard', 'Thomas', 'Mark', 'Daniel', 'Paul',
                  'Steven', 'Andrew', 'Joshua', 'Kenneth', 'Kevin', 'Brian', 'George', 'Timothy', 'Ronald', 'Jason']
    female_names = ['Mary', 'Patricia', 'Jennifer', 'Linda', 'Elizabeth', 'Barbara', 'Susan', 'Jessica', 'Sarah', 'Karen',
                    'Lisa', 'Nancy', 'Betty', 'Helen', 'Sandra', 'Donna', 'Carol', 'Ruth', 'Sharon', 'Michelle']

    last_names = ['Smith', 'Johnson', 'Williams', 'Brown', 'Jones', 'Garcia', 'Miller', 'Davis', 'Rodriguez', 'Martinez',
                  'Hernandez', 'Lopez', 'Gonzales', 'Wilson', 'Anderson', 'Thomas', 'Taylor', 'Moore', 'Jackson', 'Martin',
                  'Lee', 'Perez', 'Thompson', 'White', 'Harris', 'Sanchez', 'Clark', 'Ramirez', 'Lewis', 'Robinson']

    # Gender first, then a first name drawn from the matching pool
    genders = np.random.choice(['Male', 'Female'], num_reps, p=[0.52, 0.48])
    first_names = [
        random.choice(male_names if gender == 'Male' else female_names)
        for gender in genders
    ]
    last_names_list = [random.choice(last_names) for _ in range(num_reps)]

    # Ages: normal around 35, truncated to ints and clipped to 23..65
    ages = np.random.normal(35, 8, num_reps).astype(int)
    ages = np.clip(ages, 23, 65)

    # Years of experience (correlated with age): age minus an assumed
    # career start age of 22..26, never less than one year
    experience_years = [max(1, int(age) - random.randint(22, 26)) for age in ages]

    # Sales territories (US regions and states)
    territories = [
        'Northeast - NY/NJ/CT', 'Northeast - MA/RI/VT/NH/ME', 'Southeast - FL/GA/SC/NC',
        'Southeast - AL/MS/TN/KY', 'Midwest - IL/IN/OH/MI', 'Midwest - WI/MN/IA/ND/SD',
        'Southwest - TX/OK/AR/LA', 'West - CA/NV/AZ', 'West - WA/OR/ID/MT',
        'Mountain - CO/UT/WY/NM', 'Mid-Atlantic - PA/MD/DE/DC/WV/VA'
    ]
    assigned_territories = [random.choice(territories) for _ in range(num_reps)]

    # Sales roles and levels, weighted towards individual contributors
    roles = ['Sales Development Rep', 'Account Executive', 'Senior Account Executive',
             'Enterprise Account Executive', 'Sales Manager', 'Senior Sales Manager']
    role_weights = [0.15, 0.35, 0.25, 0.15, 0.08, 0.02]
    assigned_roles = np.random.choice(roles, num_reps, p=role_weights)

    # Annual quotas: (min, max) in dollars per role
    quota_ranges = {
        'Sales Development Rep': (150000, 300000),
        'Account Executive': (500000, 1000000),
        'Senior Account Executive': (800000, 1500000),
        'Enterprise Account Executive': (1200000, 2500000),
        'Sales Manager': (2000000, 4000000),
        'Senior Sales Manager': (3000000, 6000000)
    }
    annual_quotas = [random.randint(*quota_ranges[role]) for role in assigned_roles]

    # Hire dates (within last 10 years, weighted towards more recent)
    current_date = datetime.now()
    hire_dates = []
    for years in experience_years:
        days_ago = int(np.random.exponential(500))  # Exponential distribution favoring recent hires
        # Cap at 10 years AND at the rep's total experience: tenure at this
        # company cannot be longer than the rep's whole career.
        days_ago = min(days_ago, 3650, years * 365)
        hire_date = current_date - timedelta(days=days_ago)
        hire_dates.append(hire_date.strftime('%Y-%m-%d'))

    # Manager hierarchy
    managers = ['Sarah Johnson', 'Mike Chen', 'Lisa Rodriguez', 'David Kim', 'Jennifer Walsh',
                'Robert Martinez', 'Amanda Thompson', 'Kevin O\'Connor', 'Maria Gonzalez', 'Tom Wilson']
    assigned_managers = [random.choice(managers) for _ in range(num_reps)]

    # Quota attainment: normal around 85% (sd 20), clipped to 0..200
    quota_attainment = np.random.normal(85, 20, num_reps)
    quota_attainment = np.clip(quota_attainment, 0, 200)

    # Create the DataFrame
    data = {
        'RepID': rep_ids,
        'FirstName': first_names,
        'LastName': last_names_list,
        'Gender': genders,
        'Age': ages,
        'ExperienceYears': experience_years,
        'Role': assigned_roles,
        'Territory': assigned_territories,
        'Manager': assigned_managers,
        'HireDate': hire_dates,
        'AnnualQuota': annual_quotas,
        'QuotaAttainmentPercent': np.round(quota_attainment, 1),
        'IsActive': [1] * num_reps  # All reps are currently active
    }

    return pd.DataFrame(data)

# Generate sales rep data, convert it to a Spark DataFrame and overwrite the
# Delta table <catalog_name>.<schema_name>.sales_reps with it.
sales_reps = spark.createDataFrame(generate_sales_reps(num_reps=75))
sales_reps.write.format('delta').mode('overwrite').saveAsTable(f'{catalog_name}.{schema_name}.sales_reps')
# The message previously contained a stray "f'" left over from a nested f-string.
print(f"Created {catalog_name}.{schema_name}.sales_reps table")

## Customer Accounts

In [0]:
def generate_customer_accounts(num_accounts=500, seed=42):
    """
    Generate synthetic customer account data for Workday sales

    Each account gets a generated company name, an industry, a size band
    with a matching employee count and annual revenue, a US city/state,
    a ZIP code, a status, creation / last-activity dates and a primary
    contact title. Both numpy's global RNG and the stdlib ``random`` module
    are re-seeded. Dates are computed relative to ``datetime.now()``.

    Parameters:
    -----------
    num_accounts : int
        Number of customer account records to generate
    seed : int
        Random seed for reproducibility

    Returns:
    --------
    pandas.DataFrame
        One row per account with columns AccountID, CompanyName, Industry,
        CompanySize, EmployeeCount, AnnualRevenue, City, State, ZipCode,
        AccountStatus, CreatedDate, LastActivityDate, PrimaryContactTitle.
        LastActivityDate is never earlier than CreatedDate.
    """
    # Set random seed for reproducibility (both RNG streams are used below)
    np.random.seed(seed)
    random.seed(seed)

    # Account IDs are 1-based and zero-padded: ACC00001, ACC00002, ...
    account_ids = [f'ACC{str(i).zfill(5)}' for i in range(1, num_accounts + 1)]

    # Company name components
    company_prefixes = ['Global', 'Advanced', 'Premier', 'Elite', 'Strategic', 'Dynamic', 'Innovative',
                        'Integrated', 'Digital', 'Smart', 'NextGen', 'Future', 'Pro', 'Alpha', 'Beta']

    company_roots = ['Tech', 'Systems', 'Solutions', 'Dynamics', 'Corp', 'Industries', 'Enterprises',
                     'Services', 'Group', 'Holdings', 'Partners', 'Consulting', 'Analytics', 'Data']

    company_suffixes = ['Inc.', 'LLC', 'Corp.', 'Ltd.', 'Co.', 'Group', 'Solutions', 'Systems']

    # Generate company names: "<root> <suffix>", with a prefix 30% of the time
    company_names = []
    for _ in range(num_accounts):
        if random.random() < 0.3:  # 30% chance of having prefix
            name = f"{random.choice(company_prefixes)} {random.choice(company_roots)} {random.choice(company_suffixes)}"
        else:
            name = f"{random.choice(company_roots)} {random.choice(company_suffixes)}"
        company_names.append(name)

    # Industries (weights sum to 1.0)
    industries = [
        'Technology', 'Financial Services', 'Healthcare', 'Manufacturing', 'Retail', 'Education',
        'Government', 'Non-profit', 'Energy', 'Transportation', 'Real Estate', 'Media & Entertainment',
        'Telecommunications', 'Professional Services', 'Construction', 'Hospitality'
    ]
    industry_weights = [0.18, 0.12, 0.10, 0.08, 0.08, 0.06, 0.06, 0.04, 0.05, 0.04, 0.04, 0.03, 0.03, 0.05, 0.03, 0.01]
    assigned_industries = np.random.choice(industries, num_accounts, p=industry_weights)

    # Company sizes
    company_sizes = ['Small (1-100)', 'Medium (101-1000)', 'Large (1001-5000)', 'Enterprise (5000+)']
    size_weights = [0.3, 0.35, 0.25, 0.1]
    assigned_sizes = np.random.choice(company_sizes, num_accounts, p=size_weights)

    # Employee counts per size band. Enterprise starts at 5001 so that the
    # bands do not overlap with 'Large (1001-5000)'.
    employee_ranges = {
        'Small (1-100)': (1, 100),
        'Medium (101-1000)': (101, 1000),
        'Large (1001-5000)': (1001, 5000),
        'Enterprise (5000+)': (5001, 50000),
    }
    employee_counts = [random.randint(*employee_ranges[size]) for size in assigned_sizes]

    # US states and cities
    us_locations = [
        ('New York', 'NY'), ('Los Angeles', 'CA'), ('Chicago', 'IL'), ('Houston', 'TX'),
        ('Phoenix', 'AZ'), ('Philadelphia', 'PA'), ('San Antonio', 'TX'), ('San Diego', 'CA'),
        ('Dallas', 'TX'), ('San Jose', 'CA'), ('Austin', 'TX'), ('Jacksonville', 'FL'),
        ('San Francisco', 'CA'), ('Columbus', 'OH'), ('Charlotte', 'NC'), ('Fort Worth', 'TX'),
        ('Indianapolis', 'IN'), ('Seattle', 'WA'), ('Denver', 'CO'), ('Boston', 'MA'),
        ('El Paso', 'TX'), ('Nashville', 'TN'), ('Detroit', 'MI'), ('Oklahoma City', 'OK'),
        ('Portland', 'OR'), ('Las Vegas', 'NV'), ('Memphis', 'TN'), ('Louisville', 'KY'),
        ('Baltimore', 'MD'), ('Milwaukee', 'WI'), ('Albuquerque', 'NM'), ('Tucson', 'AZ'),
        ('Fresno', 'CA'), ('Sacramento', 'CA'), ('Kansas City', 'MO'), ('Mesa', 'AZ'),
        ('Atlanta', 'GA'), ('Colorado Springs', 'CO'), ('Raleigh', 'NC'), ('Omaha', 'NE')
    ]

    selected_locations = [random.choice(us_locations) for _ in range(num_accounts)]
    cities = [city for city, _ in selected_locations]
    states = [state for _, state in selected_locations]

    # Generate ZIP codes (simplified: any 5-digit number, not tied to the state)
    zip_codes = [f'{random.randint(10000, 99999)}' for _ in range(num_accounts)]

    # Annual revenue (dollars) per size band
    revenue_ranges = {
        'Small (1-100)': (100000, 5000000),
        'Medium (101-1000)': (5000000, 50000000),
        'Large (1001-5000)': (50000000, 500000000),
        'Enterprise (5000+)': (500000000, 10000000000),
    }
    annual_revenues = [random.randint(*revenue_ranges[size]) for size in assigned_sizes]

    # Account status
    account_statuses = ['Prospect', 'Customer', 'Former Customer', 'Partner']
    status_weights = [0.4, 0.45, 0.1, 0.05]
    assigned_statuses = np.random.choice(account_statuses, num_accounts, p=status_weights)

    # All dates are expressed as "days before now"
    current_date = datetime.now()

    # Account creation: between 1 month and 5 years ago. The offsets are kept
    # so the last-activity date can be bounded by them below.
    created_days_ago = [random.randint(30, 1825) for _ in range(num_accounts)]
    create_dates = [
        (current_date - timedelta(days=days)).strftime('%Y-%m-%d')
        for days in created_days_ago
    ]

    # Last activity dates (more recent)
    last_activity_dates = []
    for account_age_days in created_days_ago:
        days_ago = int(np.random.exponential(30))  # Exponential distribution favoring recent activity
        # Cap at 1 year and at the account's age, so an account never shows
        # activity from before it was created.
        days_ago = min(days_ago, 365, account_age_days)
        activity_date = current_date - timedelta(days=days_ago)
        last_activity_dates.append(activity_date.strftime('%Y-%m-%d'))

    # Primary contact information
    contact_titles = [
        'CEO', 'CTO', 'CFO', 'CHRO', 'VP of Engineering', 'VP of Sales', 'VP of Marketing',
        'Director of IT', 'Director of Operations', 'Head of HR', 'Chief Data Officer',
        'VP of Product', 'General Manager', 'President', 'COO'
    ]
    assigned_contact_titles = [random.choice(contact_titles) for _ in range(num_accounts)]

    # Create the DataFrame
    data = {
        'AccountID': account_ids,
        'CompanyName': company_names,
        'Industry': assigned_industries,
        'CompanySize': assigned_sizes,
        'EmployeeCount': employee_counts,
        'AnnualRevenue': annual_revenues,
        'City': cities,
        'State': states,
        'ZipCode': zip_codes,
        'AccountStatus': assigned_statuses,
        'CreatedDate': create_dates,
        'LastActivityDate': last_activity_dates,
        'PrimaryContactTitle': assigned_contact_titles
    }

    return pd.DataFrame(data)

# Generate customer account data, preview it, and overwrite the Delta table
# <catalog_name>.<schema_name>.customer_accounts with it.
customer_accounts = spark.createDataFrame(generate_customer_accounts(num_accounts=750))
display(customer_accounts)
customer_accounts.write.format('delta').mode('overwrite').saveAsTable(f'{catalog_name}.{schema_name}.customer_accounts')
# The message previously contained a stray "f'" left over from a nested f-string.
print(f"Created {catalog_name}.{schema_name}.customer_accounts table")

## Sales Opportunities

In [0]:
def generate_sales_opportunities(sales_reps_df, accounts_df, num_opportunities=1500, seed=42):
    """
    Generate synthetic sales opportunities data

    Each opportunity is attached to one randomly sampled account and one
    randomly sampled rep. Deal value scales with the account's CompanySize
    and gets a multiplier for suite/bundle deals or for HCM / Financial
    Management products. Dates are computed relative to ``datetime.now()``:
    closed deals close on or before today, open deals close in the future,
    and the last activity is never earlier than the creation date.

    Parameters:
    -----------
    sales_reps_df : pandas.DataFrame or Spark DataFrame
        Sales rep data; must contain a 'RepID' column. Objects exposing
        ``toPandas()`` are converted, pandas frames are used as-is.
    accounts_df : pandas.DataFrame or Spark DataFrame
        Account data; must contain 'AccountID', 'CompanyName' and
        'CompanySize' columns. Converted the same way as sales_reps_df.
    num_opportunities : int
        Number of opportunities to generate
    seed : int
        Random seed for reproducibility

    Returns:
    --------
    pandas.DataFrame
        One row per opportunity with columns OpportunityID, OpportunityName,
        AccountID, RepID, ProductCategory, OpportunityValue, SalesStage,
        Probability, ExpectedCloseDate, CreateDate, LastActivityDate,
        DealSource, PrimaryCompetitor, IsActive
    """
    # Set random seed for reproducibility (both RNG streams are used below)
    np.random.seed(seed)
    random.seed(seed)

    # Accept either Spark DataFrames (converted) or pandas DataFrames (as-is)
    reps_pd = sales_reps_df.toPandas() if hasattr(sales_reps_df, 'toPandas') else sales_reps_df
    accounts_pd = accounts_df.toPandas() if hasattr(accounts_df, 'toPandas') else accounts_df

    # Opportunity IDs are 1-based and zero-padded: OPP000001, ...
    opp_ids = [f'OPP{str(i).zfill(6)}' for i in range(1, num_opportunities + 1)]

    # Workday product categories
    product_categories = [
        'Human Capital Management (HCM)',
        'Financial Management',
        'Planning',
        'Analytics',
        'Student',
        'Adaptive Planning',
        'Peakon Employee Voice'
    ]

    # Opportunity names based on products
    opp_name_templates = [
        '{company} - HCM Implementation',
        '{company} - Financial Management Upgrade',
        '{company} - Planning Solution',
        '{company} - Analytics Platform',
        '{company} - Student Information System',
        '{company} - Adaptive Planning Deployment',
        '{company} - Employee Voice Initiative',
        '{company} - Complete Workday Suite',
        '{company} - HCM + Financials Bundle'
    ]

    # Multi-product deals. These are name templates, not product categories,
    # so the value uplift must be keyed off the chosen template.
    suite_templates = {
        '{company} - Complete Workday Suite',
        '{company} - HCM + Financials Bundle',
    }
    core_categories = ('Human Capital Management (HCM)', 'Financial Management')

    # Sales stages and their selection weights (weighted towards earlier
    # stages for active opps)
    sales_stages = [
        'Prospecting',
        'Discovery',
        'Proposal',
        'Negotiation',
        'Closed Won',
        'Closed Lost'
    ]
    stage_weights = [0.25, 0.20, 0.15, 0.10, 0.20, 0.10]
    closed_stages = ('Closed Won', 'Closed Lost')

    # Baseline win probability (percent) per stage
    stage_probabilities = {
        'Prospecting': 10,
        'Discovery': 25,
        'Proposal': 50,
        'Negotiation': 75,
        'Closed Won': 100,
        'Closed Lost': 0
    }

    # Base deal value range (dollars) per company size; any size not listed
    # here is treated as Enterprise.
    value_ranges = {
        'Small (1-100)': (50000, 300000),
        'Medium (101-1000)': (200000, 800000),
        'Large (1001-5000)': (500000, 2000000),
    }
    enterprise_value_range = (1000000, 5000000)

    # Deal source
    deal_sources = ['Inbound Marketing', 'Outbound Prospecting', 'Partner Referral', 'Event/Conference',
                    'Customer Referral', 'Cold Call', 'LinkedIn', 'Website']
    source_weights = [0.25, 0.20, 0.15, 0.10, 0.10, 0.08, 0.07, 0.05]

    # Competition (for competitive deals)
    competitors = ['SAP SuccessFactors', 'Oracle HCM', 'ADP', 'BambooHR', 'Cornerstone OnDemand',
                   'Ultimate Software', 'Paycom', 'None']
    competitor_weights = [0.20, 0.15, 0.12, 0.08, 0.08, 0.07, 0.05, 0.25]

    # One reference "now" for the whole batch keeps all dates consistent
    current_date = datetime.now()

    # Generate opportunities
    opportunities = []

    for opp_id in opp_ids:
        # Select random account and rep
        account = accounts_pd.sample(n=1).iloc[0]
        rep = reps_pd.sample(n=1).iloc[0]

        # Generate opportunity name
        opp_name_template = random.choice(opp_name_templates)
        opp_name = opp_name_template.format(company=account['CompanyName'])

        # Select product category and sales stage
        product_category = random.choice(product_categories)
        sales_stage = np.random.choice(sales_stages, p=stage_weights)
        is_closed = sales_stage in closed_stages

        # Generate opportunity value based on account size
        low, high = value_ranges.get(account['CompanySize'], enterprise_value_range)
        base_value = random.randint(low, high)

        # Uplift: suite/bundle deals are the largest, core products next
        if opp_name_template in suite_templates:
            base_value = int(base_value * random.uniform(1.5, 2.5))
        elif product_category in core_categories:
            base_value = int(base_value * random.uniform(1.2, 1.8))

        opportunity_value = base_value

        # Set probability based on stage; open stages get +/-10 points of noise
        probability = stage_probabilities[sales_stage]
        if not is_closed:
            probability += random.randint(-10, 10)
            probability = max(0, min(100, probability))

        # Create date (opportunity creation): 30..365 days ago
        days_ago_created = random.randint(30, 365)
        create_date = current_date - timedelta(days=days_ago_created)

        if is_closed:
            # Closed deals have a close date in the past: the sales cycle is
            # 30..180 days but never longer than the opportunity's age.
            cycle_days = random.randint(30, min(180, days_ago_created))
            close_date = create_date + timedelta(days=cycle_days)
            last_activity = close_date
        else:
            # Open deals have future close dates
            days_to_close = random.randint(30, 270)
            close_date = current_date + timedelta(days=days_to_close)

            # Most activity is recent; cap at 60 days and at the
            # opportunity's age so activity never predates creation.
            days_since_activity = int(np.random.exponential(7))
            days_since_activity = min(days_since_activity, 60, days_ago_created)
            last_activity = current_date - timedelta(days=days_since_activity)

        deal_source = np.random.choice(deal_sources, p=source_weights)
        primary_competitor = np.random.choice(competitors, p=competitor_weights)

        opportunities.append({
            'OpportunityID': opp_id,
            'OpportunityName': opp_name,
            'AccountID': account['AccountID'],
            'RepID': rep['RepID'],
            'ProductCategory': product_category,
            'OpportunityValue': opportunity_value,
            'SalesStage': sales_stage,
            'Probability': probability,
            'ExpectedCloseDate': close_date.strftime('%Y-%m-%d'),
            'CreateDate': create_date.strftime('%Y-%m-%d'),
            'LastActivityDate': last_activity.strftime('%Y-%m-%d'),
            'DealSource': deal_source,
            'PrimaryCompetitor': primary_competitor,
            'IsActive': 0 if is_closed else 1
        })

    return pd.DataFrame(opportunities)

# Generate sales opportunities from the rep and account DataFrames created
# above, preview them, and overwrite the Delta table
# <catalog_name>.<schema_name>.sales_opportunities.
sales_opportunities = generate_sales_opportunities(sales_reps, customer_accounts, num_opportunities=2000)
sales_opportunities_spark = spark.createDataFrame(sales_opportunities)
display(sales_opportunities_spark)
sales_opportunities_spark.write.format('delta').mode('overwrite').saveAsTable(f'{catalog_name}.{schema_name}.sales_opportunities')
# The message previously contained a stray "f'" left over from a nested f-string.
print(f"Created {catalog_name}.{schema_name}.sales_opportunities table")

## Sales Activities & Interactions

In [0]:
def generate_sales_activities(opportunities_df, num_activities=5000, seed=42):
    """
    Generate synthetic sales activities data

    Each activity is attached to one randomly sampled opportunity and dated
    between that opportunity's CreateDate and LastActivityDate (or on the
    CreateDate when the last activity is not after it).

    Parameters:
    -----------
    opportunities_df : pandas.DataFrame or Spark DataFrame
        Opportunities data; must contain 'OpportunityID', 'CreateDate' and
        'LastActivityDate' columns, the dates as 'YYYY-MM-DD' strings.
        Objects exposing ``toPandas()`` are converted, pandas frames are
        used as-is.
    num_activities : int
        Number of activities to generate
    seed : int
        Random seed for reproducibility

    Returns:
    --------
    pandas.DataFrame
        One row per activity with columns ActivityID, OpportunityID,
        ActivityType, ActivityDate, Duration (minutes), Outcome, Description
    """
    # Set random seed for reproducibility (both RNG streams are used below)
    np.random.seed(seed)
    random.seed(seed)

    # Accept either a Spark DataFrame (converted) or a pandas DataFrame (as-is)
    opps_pd = opportunities_df.toPandas() if hasattr(opportunities_df, 'toPandas') else opportunities_df

    # Activity types (weighted based on typical sales activities)
    activity_types = [
        'Phone Call', 'Email', 'Meeting', 'Demo', 'Proposal Sent',
        'Contract Review', 'Discovery Call', 'Follow-up Call', 'Event'
    ]
    type_weights = [0.25, 0.20, 0.15, 0.10, 0.08, 0.05, 0.07, 0.08, 0.02]

    # Activity outcomes
    activity_outcomes = [
        'Positive', 'Neutral', 'Negative', 'No Response', 'Scheduled Follow-up'
    ]
    outcome_weights = [0.4, 0.25, 0.15, 0.1, 0.1]

    # Duration range in minutes per activity type; types not listed here
    # (Email, Proposal Sent, Contract Review) have no duration.
    duration_ranges = {
        'Demo': (45, 120),
        'Meeting': (30, 90),
        'Phone Call': (15, 60),
        'Discovery Call': (15, 60),
        'Follow-up Call': (15, 60),
        'Event': (120, 480),
    }

    # Notes/Description options per activity type
    activity_descriptions = {
        'Phone Call': ['Initial contact call', 'Follow-up discussion', 'Budget confirmation call', 'Technical requirements call'],
        'Email': ['Sent proposal', 'Follow-up email', 'Meeting recap', 'Resource sharing'],
        'Meeting': ['Discovery meeting', 'Stakeholder meeting', 'Technical review', 'Executive briefing'],
        'Demo': ['Product demonstration', 'Technical demo', 'Pilot demonstration', 'POC presentation'],
        'Proposal Sent': ['Initial proposal', 'Revised proposal', 'Final proposal', 'Contract proposal'],
        'Contract Review': ['Legal review', 'Terms negotiation', 'Contract discussion', 'Final review'],
        'Discovery Call': ['Needs assessment', 'Requirements gathering', 'Pain point discussion', 'Current state review'],
        'Follow-up Call': ['Status check', 'Decision timeline', 'Next steps discussion', 'Objection handling'],
        'Event': ['Trade show meeting', 'Conference presentation', 'User group event', 'Industry event']
    }

    # Generate activities
    activities = []

    # IDs are 1-based like the REP/ACC/OPP/FB identifiers: ACT000001, ...
    for sequence in range(1, num_activities + 1):
        # Select random opportunity
        opp = opps_pd.sample(n=1).iloc[0]

        activity_id = f'ACT{str(sequence).zfill(6)}'
        activity_type = np.random.choice(activity_types, p=type_weights)

        # Activity date (should be between opportunity creation and last activity)
        opp_create = datetime.strptime(opp['CreateDate'], '%Y-%m-%d')
        opp_last_activity = datetime.strptime(opp['LastActivityDate'], '%Y-%m-%d')

        date_range = (opp_last_activity - opp_create).days
        if date_range > 0:
            activity_date = opp_create + timedelta(days=random.randint(0, date_range))
        else:
            # Last activity on or before creation: fall back to the creation date
            activity_date = opp_create

        outcome = np.random.choice(activity_outcomes, p=outcome_weights)

        # Duration (in minutes)
        if activity_type in duration_ranges:
            duration = random.randint(*duration_ranges[activity_type])
        else:  # Email, Proposal, etc.
            duration = 0

        description = random.choice(activity_descriptions.get(activity_type, ['General activity']))

        activities.append({
            'ActivityID': activity_id,
            'OpportunityID': opp['OpportunityID'],
            'ActivityType': activity_type,
            'ActivityDate': activity_date.strftime('%Y-%m-%d'),
            'Duration': duration,
            'Outcome': outcome,
            'Description': description
        })

    return pd.DataFrame(activities)

# Generate sales activities for the opportunities created above, preview
# them, and overwrite the Delta table
# <catalog_name>.<schema_name>.sales_activities.
sales_activities = generate_sales_activities(sales_opportunities_spark, num_activities=7500)
sales_activities_spark = spark.createDataFrame(sales_activities)
display(sales_activities_spark)
sales_activities_spark.write.format('delta').mode('overwrite').saveAsTable(f'{catalog_name}.{schema_name}.sales_activities')
# The message previously contained an unbalanced single quote.
print(f"Created {catalog_name}.{schema_name}.sales_activities table")

## Unstructured Data: Customer Feedback Reports (PDF)

In [0]:
import io
import random
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import LETTER
from reportlab.lib.units import inch
from databricks.sdk import WorkspaceClient

# ---------- FEEDBACK GENERATOR ----------
def generate_customer_feedback(accounts_df, opportunities_df, num_feedback=10, seed=42):
    """
    Generate synthetic customer feedback records

    For each record a random account is sampled, then one of that account's
    opportunities (or any opportunity when the account has none). The
    sentiment is biased by the opportunity's SalesStage: 'Closed Won' leans
    positive, 'Closed Lost' is negative or neutral, open stages are uniform.
    Feedback dates are computed relative to ``datetime.now()``.

    Parameters:
    -----------
    accounts_df : pandas.DataFrame or Spark DataFrame
        Account data; must contain an 'AccountID' column. Objects exposing
        ``toPandas()`` are converted, pandas frames are used as-is.
    opportunities_df : pandas.DataFrame or Spark DataFrame
        Opportunity data; must contain 'OpportunityID', 'AccountID' and
        'SalesStage' columns. Converted the same way as accounts_df.
    num_feedback : int
        Number of feedback records to generate
    seed : int
        Random seed for reproducibility

    Returns:
    --------
    pandas.DataFrame
        One row per feedback with columns FeedbackID, AccountID,
        OpportunityID, FeedbackDate, Sentiment, Score, Content,
        CustomerRole, Source
    """
    np.random.seed(seed)
    random.seed(seed)

    # Accept either Spark DataFrames (converted) or pandas DataFrames (as-is)
    accounts_pd = accounts_df.toPandas() if hasattr(accounts_df, 'toPandas') else accounts_df
    opps_pd = opportunities_df.toPandas() if hasattr(opportunities_df, 'toPandas') else opportunities_df

    feedback_templates = {
        "positive": [
            """Excellent demo! Highlights included real-time reporting, a clean interface, and easy integrations.""",
            """Outstanding experience with the Workday sales team. Clear communication and excellent support.""",
        ],
        "neutral": [
            """Mixed impressions. The system appears capable, but implementation may be complex for our team.""",
            """Professional process, though timelines and pricing need further clarification.""",
        ],
        "negative": [
            """Disappointed with the demo. Performance concerns and unclear customization options.""",
            """Sales process felt rushed and didn’t align with our needs.""",
        ],
    }

    # Sentiment pool per closed stage; 'positive' is listed twice for
    # 'Closed Won' so it is drawn twice as often as 'neutral'.
    stage_sentiments = {
        "Closed Won": ["positive", "positive", "neutral"],
        "Closed Lost": ["negative", "neutral"],
    }
    open_stage_sentiments = ["positive", "neutral", "negative"]

    # Sentiment score is drawn uniformly from the (low, high) range of its label
    score_ranges = {
        "positive": (0.6, 0.9),
        "neutral": (0.3, 0.6),
        "negative": (0.1, 0.4),
    }

    customer_roles = ["IT Director", "CFO", "CTO", "CHRO", "Operations Manager"]
    sources = ["Email", "Survey", "Phone Interview"]

    records, now = [], datetime.now()

    for i in range(num_feedback):
        account = accounts_pd.sample(n=1).iloc[0]

        # Prefer an opportunity of the sampled account; fall back to any
        # opportunity when the account has none.
        related = opps_pd[opps_pd["AccountID"] == account["AccountID"]]
        candidates = related if len(related) > 0 else opps_pd
        opp = candidates.sample(n=1).iloc[0]

        sentiment = random.choice(
            stage_sentiments.get(opp["SalesStage"], open_stage_sentiments)
        )
        content = random.choice(feedback_templates[sentiment])

        # Feedback age: exponential around 45 days, capped at 200 days
        days_ago = min(int(np.random.exponential(45)), 200)
        dt = now - timedelta(days=days_ago)

        score = random.uniform(*score_ranges[sentiment])

        records.append(
            {
                "FeedbackID": f"FB{str(i + 1).zfill(5)}",
                "AccountID": account["AccountID"],
                "OpportunityID": opp["OpportunityID"],
                "FeedbackDate": dt.strftime("%Y-%m-%d"),
                "Sentiment": sentiment,
                "Score": round(score, 3),
                "Content": content,
                "CustomerRole": random.choice(customer_roles),
                "Source": random.choice(sources),
            }
        )

    return pd.DataFrame(records)


# ---------- GENERATE FEEDBACK RECORDS ----------
# Feedback is generated from the account and opportunity DataFrames created
# earlier in this notebook.
customer_feedback = generate_customer_feedback(
    customer_accounts, sales_opportunities_spark, num_feedback=25
)

# ---------- GENERATE & UPLOAD PDFs ----------
# One single-record PDF per feedback row, rendered in memory and uploaded.
# NOTE(review): `w` and `volume_path` are not defined in this cell; presumably
# they come from the `%run ./00-init-requirements` setup -- confirm.
for _, row in customer_feedback.iterrows():
    buffer = io.BytesIO()
    c = canvas.Canvas(buffer, pagesize=LETTER)
    width, height = LETTER

    # Header
    c.setFont("Helvetica-Bold", 14)
    c.drawCentredString(width / 2, height - 0.8 * inch, "Customer Feedback Report")

    # Metadata, one line per field, 15pt apart starting below the header
    c.setFont("Helvetica", 10)
    y = height - 1.3 * inch
    meta_lines = [
        f"Feedback ID: {row['FeedbackID']}",
        f"Account ID: {row['AccountID']}",
        f"Opportunity ID: {row['OpportunityID']}",
        f"Feedback Date: {row['FeedbackDate']}",
        f"Sentiment: {row['Sentiment'].capitalize()}",
        f"Sentiment Score: {row['Score']}",
        f"Customer Role: {row['CustomerRole']}",
        f"Source: {row['Source']}",
        "",
        "Feedback Content:",
    ]

    for line in meta_lines:
        c.drawString(72, y, line)
        y -= 15

    # Content: each sentence starts on a new line and is wrapped at 12 words
    for paragraph in row["Content"].split("\n"):
        sentences = paragraph.strip().split(". ")
        # split(". ") removes the period from every sentence except the
        # last one; restore it so the PDF text keeps its punctuation.
        sentences = [s + "." for s in sentences[:-1]] + sentences[-1:]
        for sentence in sentences:
            words = sentence.split()
            for start in range(0, len(words), 12):
                line = " ".join(words[start:start + 12])
                if y < 72:
                    # Bottom margin reached: continue on a fresh page
                    c.showPage()
                    c.setFont("Helvetica", 10)
                    y = height - 72
                c.drawString(72, y, line)
                y -= 14
        y -= 10  # extra gap between paragraphs

    c.save()
    buffer.seek(0)

    # Upload directly to UC Volume
    filename = f"{row['FeedbackID']}.pdf"
    target_path = f"{volume_path}/customer_feedback/{filename}"
    w.files.upload(target_path, buffer, overwrite=True)

    # print(f"Uploaded: {target_path}")

print(f"✅ Successfully uploaded {len(customer_feedback)} PDFs to {volume_path}/customer_feedback")

## Unstructured Data: Meeting Notes & Call Summaries


In [0]:
import io
import random
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import LETTER
from reportlab.lib.units import inch
from databricks.sdk import WorkspaceClient

# ---------- GENERATOR ----------
def _first_matching_row(frame, column, key):
    """Return the first row of ``frame`` whose ``column`` equals ``key``, or None if no row matches."""
    matches = frame[frame[column] == key]
    return None if matches.empty else matches.iloc[0]


def generate_meeting_notes(
    activities_df,
    opportunities_df,
    accounts_df,
    sales_reps_df,
    seed=42,
    max_notes=25
):
    """
    Generate synthetic meeting notes / call summaries for sales activities

    Only activities whose ActivityType is 'Meeting', 'Demo', 'Phone Call' or
    'Discovery Call' are considered. Each one is joined to its opportunity,
    account and sales rep and rendered through a single text template.

    Parameters:
    -----------
    activities_df, opportunities_df, accounts_df, sales_reps_df :
        Objects exposing ``toPandas()`` (the callers in this notebook pass Spark
        DataFrames). Columns read: ActivityID, OpportunityID, ActivityType,
        ActivityDate, Duration (activities); OpportunityID, AccountID, RepID,
        ProductCategory (opportunities); AccountID, CompanyName, Industry
        (accounts); RepID, FirstName, LastName (reps).
    seed : int
        Random seed for reproducibility
    max_notes : int or None
        Maximum number of qualifying activities to process (default 25, the
        previously hard-coded limit). None processes all of them.

    Returns:
    --------
    pandas.DataFrame
        One row per generated note with columns NoteID, ActivityID,
        OpportunityID, RepID, AccountID, NoteType, CreatedDate, NoteContent,
        WordCount, SentimentScore and KeyTopics. Activities whose opportunity,
        account or rep cannot be found are skipped.
    """
    np.random.seed(seed)
    random.seed(seed)

    activities_pd = activities_df.toPandas()
    opps_pd = opportunities_df.toPandas()
    accounts_pd = accounts_df.toPandas()
    reps_pd = sales_reps_df.toPandas()

    meeting_activities = activities_pd[
        activities_pd['ActivityType'].isin(
            ['Meeting', 'Demo', 'Phone Call', 'Discovery Call']
        )
    ]
    if max_notes is not None:
        meeting_activities = meeting_activities.head(max_notes)

    # One comprehensive template that supports every placeholder used below
    base_template = """\
Company: {company_name}
Meeting Type: {meeting_type}
Date: {meeting_date}  |  Duration: {duration} min  |  Location: {meeting_location}
Databricks Rep: {rep_name} (SE: {se_name})
Attendees: {attendees}
Client Attendees: {client_attendees}

Business Area: {business_area} ({department})
Focus Area: {focus_area}
Current System: {current_system}  |  Team Size: {team_size}  |  Locations: {locations}

Pain Points: {pain_points}
Main Challenge: {main_challenge}
Delay Area: {delay_area}  |  Error Area: {error_area}
Compliance Area: {compliance_area}  |  Resource Area: {resource_area}
Data Sources: {data_sources}  |  Reporting Frequency: {reporting_freq}  |  Users: {user_count}

Requirements:
  1) {requirement_1}
  2) {requirement_2}
  3) {requirement_3}
Integration Focus: {integration_focus}  |  Integration System: {integration_system}
Product Category: {product_category}

Fit Assessment:
  - {fit_point_1}
  - {fit_point_2}
  - {fit_point_3}

Concerns:
  - {concern_1}
  - {concern_2}

Feature Reactions:
  * {feature_1}: {feature_1_reaction}
  * {feature_2}: {feature_2_reaction}
  * {feature_3}: {feature_3_reaction}
Workflow Reaction: {workflow_reaction}

Q&A:
  Q: {question_1}
  A: {answer_1}

  Q: {question_2}
  A: {answer_2}

  Q: {question_3}
  A: {answer_3}

Implementation:
  Timeline: {impl_timeline}
  Phases: {impl_phases}
  Resource Estimate: {resource_estimate}
  Training Needs: {training_needs}

Commercials:
  Budget Range: {budget_range}
  Timeline Target: {timeline}
  Decision Makers: {decision_makers}
  Evaluation Criteria: {evaluation_criteria}
  Proposal Date: {proposal_date}

Discussion Highlights:
  - {discussion_topic_1}: {discussion_detail_1}
  - {discussion_topic_2}: {discussion_detail_2}
  - {discussion_topic_3}: {discussion_detail_3}

Decisions:
  1) {decision_1}
  2) {decision_2}
  3) {decision_3}

Issues & Owners:
  - {issue_1} (Owner: {issue_owner_1})
  - {issue_2} (Owner: {issue_owner_2})
Risk Assessment: {risk_assessment}

Next Steps:
  Actions:
    - {action_1}
    - {action_2}
    - {action_3}
  Action Items:
{action_items}
  Next Meeting: {next_meeting_date}  |  Demo Date: {demo_date}

Attendees List:
{attendees_list}

Additional Notes: {additional_notes}

Call Summary:
  Purpose: {call_purpose}
  Sentiment: {sentiment} (Score: {sentiment_score}/10)
  Opportunity Score: {opp_score}/10
  Outcome: {call_outcome}

Customer Update: {customer_update}
Our Response: {our_response}
Key Points:
  - {point_1}
  - {point_2}
  - {point_3}
  - {point_4}
"""

    # Every activity type currently shares the same template; the mapping is
    # kept so per-type templates can be added later.
    meeting_templates = {
        'Meeting': [base_template],
        'Demo': [base_template],
        'Phone Call': [base_template],
        'Discovery Call': [base_template],
    }

    meeting_notes = []

    for _, activity in meeting_activities.iterrows():
        # Skip activities with broken references instead of failing with an
        # IndexError on an empty lookup.
        opp = _first_matching_row(opps_pd, 'OpportunityID', activity['OpportunityID'])
        if opp is None:
            continue
        account = _first_matching_row(accounts_pd, 'AccountID', opp['AccountID'])
        rep = _first_matching_row(reps_pd, 'RepID', opp['RepID'])
        if account is None or rep is None:
            continue

        activity_type = activity['ActivityType']
        template = random.choice(
            meeting_templates.get(activity_type, meeting_templates['Phone Call'])
        )

        rep_name = f"{rep['FirstName']} {rep['LastName']}"
        company_name = account['CompanyName']
        industry = account.get('Industry', 'Unknown')
        if not isinstance(industry, str):
            # None/NaN industries would otherwise crash on .lower() below.
            industry = 'Unknown'
        raw_duration = activity['Duration']

        attendees = random.choice([
            'Sarah Johnson (IT Director), Mike Chen (HR Director)',
            'David Kim (CFO), Lisa Rodriguez (Operations Manager)',
            'Jennifer Walsh (CTO), Robert Martinez (VP Finance)',
            "Amanda Thompson (CHRO), Kevin O'Connor (IT Manager)"
        ])

        pain_points = random.choice([
            'manual data entry, inconsistent reporting, compliance gaps',
            'siloed systems, delayed reporting, audit trail issues',
            'spreadsheet-based processes, version control problems',
            'lack of real-time visibility, integration challenges'
        ])

        # NOTE: keyword arguments are evaluated top to bottom, so their order
        # fixes the sequence of random draws; reordering them changes the
        # output produced for a given seed.
        # NOTE(review): the template says "Databricks Rep" while some literals
        # below mention Workday - confirm which vendor name is intended.
        note_content = template.format(
            company_name=company_name,
            meeting_type=activity_type,
            meeting_date=activity['ActivityDate'],
            meeting_location=random.choice(['Client HQ', 'Virtual (Zoom)', 'Databricks Office']),
            rep_name=rep_name,
            se_name=random.choice(['Alex Rodriguez', 'Priya Shah', 'Tom Nguyen']),
            attendees=attendees,
            client_attendees='C-suite and department heads',
            business_area=random.choice(['HR processes', 'financial planning', 'reporting', 'compliance']),
            department=random.choice(['HR', 'Finance', 'Operations', 'IT']),
            focus_area=industry.lower(),
            current_system=random.choice(['Excel', 'Legacy HRIS', 'Multiple systems', 'Paper-based process']),
            team_size=random.randint(5, 50),
            locations=random.randint(1, 5),
            pain_points=pain_points,
            main_challenge=random.choice(['scaling operations', 'regulatory compliance', 'process efficiency']),
            delay_area='month-end close',
            error_area='data consolidation',
            compliance_area=f'{industry} regulations',
            resource_area='finance team',
            data_sources='3 different systems',
            reporting_freq='Monthly',
            user_count=random.randint(10, 100),
            requirement_1=random.choice(['Real-time reporting', 'Mobile access', 'API integration']),
            requirement_2=random.choice(['Automated workflows', 'Compliance tracking', 'Self-service']),
            requirement_3=random.choice(['Custom dashboards', 'Role-based access', 'Audit trails']),
            integration_focus=random.choice(['ERP', 'CRM', 'HRIS']),
            integration_system=random.choice(['ERP', 'CRM', 'HRIS']),
            product_category=opp['ProductCategory'],
            fit_point_1='Strong alignment with current needs',
            fit_point_2='Proven track record in ' + industry.lower(),
            fit_point_3='Scalable architecture',
            concern_1='Implementation timeline',
            concern_2='Change management',
            feature_1='Automated reporting', feature_1_reaction='Impressed with real-time capabilities',
            feature_2='Mobile interface',   feature_2_reaction='Liked the user experience',
            feature_3='Analytics dashboard',feature_3_reaction='Requested custom metrics',
            workflow_reaction='Perfect fit for their processes',
            question_1='How long is implementation?', answer_1='Typically 6-9 months for your size',
            question_2='What about data migration?',  answer_2='We handle that with a dedicated team',
            question_3='Training requirements?',       answer_3='2-week program with ongoing support',
            impl_timeline='6-8 months',
            impl_phases='3 phases: Foundation, Rollout, Optimization',
            resource_estimate='2-3 dedicated resources',
            training_needs='Train-the-trainer approach',
            budget_range=f"{random.randint(100, 500)}K - {random.randint(500, 1000)}K",
            timeline=random.choice(['Q2 2025', 'end of year', 'next fiscal year']),
            decision_makers='IT Director, CFO',
            evaluation_criteria='ROI, implementation time, user adoption',
            proposal_date='End of month',
            discussion_topic_1='Technical Integration',
            discussion_detail_1='Reviewed API capabilities and data flow requirements',
            discussion_topic_2='Change Management',
            discussion_detail_2='Discussed training approach and user adoption strategy',
            discussion_topic_3='Timeline & Budget',
            discussion_detail_3='Aligned on project phases and investment levels',
            decision_1='Proceed with technical evaluation',
            decision_2='Include IT security team in next review',
            decision_3='Request detailed implementation plan',
            issue_1='Security review pending', issue_owner_1='Client IT team',
            issue_2='Budget approval needed', issue_owner_2='Client Finance',
            risk_assessment='Low risk - strong stakeholder buy-in',
            next_meeting_date='Next Friday',
            action_1='Send detailed proposal',
            action_2='Schedule technical review',
            action_3='Provide reference contacts',
            action_items='    • Workday: Security documentation\n    • Client: Finalize budget',
            demo_date='Next Tuesday',
            attendees_list='• Sarah J. (IT Director)\n• Mike C. (Finance Manager)',
            additional_notes='Client seems engaged and ready to move forward.',
            call_purpose=random.choice(['Status update', 'Address concerns', 'Next steps discussion']),
            sentiment=random.choice(['Positive', 'Neutral', 'Cautious', 'Enthusiastic']),
            sentiment_score=random.randint(6, 9),
            opp_score=random.randint(5, 9),
            call_outcome='Positive - moving to next stage',
            customer_update='Evaluation committee formed',
            our_response='Provided requested documentation',
            point_1='Discussed current pain points',
            point_2='Reviewed Workday capabilities',
            point_3='Addressed pricing questions',
            point_4='Next steps alignment',
            # The template has no {contact_name} / {urgency_level} placeholder
            # (str.format ignores unused keywords); the two draws are kept so
            # the random stream for a given seed stays unchanged.
            contact_name=random.choice(['John', 'Sarah', 'Mike', 'Jennifer']),
            urgency_level=random.choice(['High', 'Medium', 'Low']),
            # A missing or zero duration is replaced by a random 30-120 minute value.
            duration=(
                random.randint(30, 120)
                if pd.isna(raw_duration) or raw_duration == 0
                else raw_duration
            ),
        )

        note = {
            # Drops the first three characters of the activity ID
            # (presumably an 'ACT' prefix - verify against the activity generator).
            'NoteID': f'NOTE{activity["ActivityID"][3:]}',
            'ActivityID': activity['ActivityID'],
            'OpportunityID': activity['OpportunityID'],
            'RepID': opp['RepID'],
            'AccountID': opp['AccountID'],
            'NoteType': activity_type,
            'CreatedDate': activity['ActivityDate'],
            'NoteContent': note_content,
            'WordCount': len(note_content.split()),
            # Drawn independently of the 1-10 sentiment score embedded in the note text.
            'SentimentScore': random.uniform(0.3, 0.9),
            'KeyTopics': random.choice([
                'budget,timeline,requirements',
                'integration,security,compliance',
                'roi,implementation,training',
                'competitive,pricing,features'
            ])
        }

        meeting_notes.append(note)

    return pd.DataFrame(meeting_notes)


# ---------- Generate meeting notes from your Spark DFs ----------
# Build the meeting-notes DataFrame; arguments are passed by keyword so each
# input frame is matched to its parameter explicitly.
meeting_notes = generate_meeting_notes(
    activities_df=sales_activities_spark,
    opportunities_df=sales_opportunities_spark,
    accounts_df=customer_accounts,
    sales_reps_df=sales_reps,
)

# ---------- PDF Uploads (WorkspaceClient -> UC Volume) ----------
def wrap_and_draw(c, text, x, y, width, font_name="Helvetica", font_size=10, line_height=14,
                  bottom_margin=None, page_top=None):
    """Simple word-wrap printer. Returns new y.

    Splits ``text`` into paragraphs on newlines, greedily packs words into
    lines no wider than ``width`` (measured with ``c.stringWidth``) and draws
    each line at ``x`` with ``c.drawString``, moving down ``line_height`` per
    line plus an extra 0.4 line after every non-empty paragraph. Words are
    obtained with ``str.split()``, so leading indentation and repeated spaces
    inside a paragraph are not preserved.

    Parameters:
    -----------
    c : canvas-like object providing setFont, stringWidth, drawString, showPage
    text : str or None
        Text to draw; None is treated as an empty string.
    x, y : float
        Left edge and starting baseline.
    width : float
        Maximum line width.
    font_name, font_size, line_height :
        Font settings and vertical advance per line.
    bottom_margin : float or None
        A new page is started once y drops below this value.
        Defaults to 0.9 inch.
    page_top : float or None
        y used after a page break. Defaults to one inch below the top of a
        LETTER page.

    Returns:
    --------
    float
        The y position following the last drawn line (already moved to a new
        page if the text ended below ``bottom_margin``).
    """
    if bottom_margin is None:
        bottom_margin = 0.9 * inch
    if page_top is None:
        page_top = LETTER[1] - 1.0 * inch

    def _break_page_if_needed(cursor):
        # showPage() starts a fresh page, so the font is set again afterwards.
        if cursor < bottom_margin:
            c.showPage()
            c.setFont(font_name, font_size)
            return page_top
        return cursor

    c.setFont(font_name, font_size)
    for para in (text or "").split("\n"):
        para = para.rstrip()
        if not para:
            # Blank line: advance, and also honour the bottom margin so a run
            # of blank lines cannot push the next text off the page.
            y = _break_page_if_needed(y - line_height)
            continue

        line = []
        for word in para.split():
            trial = " ".join(line + [word])
            # A word always goes on an empty line, even if it is wider than
            # `width`; flushing an empty line would only waste vertical space.
            if not line or c.stringWidth(trial, font_name, font_size) <= width:
                line.append(word)
            else:
                c.drawString(x, y, " ".join(line))
                y = _break_page_if_needed(y - line_height)
                line = [word]
        if line:
            c.drawString(x, y, " ".join(line))
            y -= line_height
        # Extra spacing between paragraphs.
        y = _break_page_if_needed(y - 0.4 * line_height)
    return y

# Render every meeting note into an in-memory PDF and upload it to the UC Volume.
for _, row in meeting_notes.iterrows():
    buffer = io.BytesIO()
    c = canvas.Canvas(buffer, pagesize=LETTER)

    # Page geometry: one-inch side margins on a LETTER page, cursor starts 0.9 inch from the top.
    width, height = LETTER
    lm = rm = 1.0 * inch
    usable_w = width - lm - rm
    y = height - 0.9 * inch

    # Centred title.
    c.setFont("Helvetica-Bold", 14)
    c.drawCentredString(width / 2, y, "Meeting Notes")
    y -= 0.4 * inch

    # Identifier/metadata lines, ending with an empty spacer line.
    c.setFont("Helvetica", 10)
    rounded_sentiment = round(row['SentimentScore'], 3)
    meta = [
        f"NoteID: {row['NoteID']}",
        f"Type: {row['NoteType']}",
        f"Created: {row['CreatedDate']}",
        f"AccountID: {row['AccountID']}   OpportunityID: {row['OpportunityID']}   RepID: {row['RepID']}",
        f"SentimentScore: {rounded_sentiment}   KeyTopics: {row['KeyTopics']}",
        "",
    ]
    for meta_line in meta:
        c.drawString(lm, y, meta_line)
        y -= 14
        if y < 0.9 * inch:
            # Out of room: start a new page and restore the metadata font.
            c.showPage()
            c.setFont("Helvetica", 10)
            y = height - 1.0 * inch

    # Section label followed by the word-wrapped note text.
    c.setFont("Helvetica-Bold", 11)
    c.drawString(lm, y, "Notes:")
    y -= 16
    y = wrap_and_draw(c, row['NoteContent'], lm, y, usable_w,
                      font_name="Helvetica", font_size=10, line_height=14)

    # Finalise the PDF and rewind the buffer so the upload reads from the start.
    c.save()
    buffer.seek(0)

    # Upload to Volume, named after the note ID.
    filename = f"{row['NoteID']}.pdf"
    target_path = f"{volume_path}/meeting_notes/{filename}"
    w.files.upload(target_path, buffer, overwrite=True)

print(f"✅ Successfully uploaded {len(meeting_notes)} meeting note PDFs to {volume_path}")

## Unstructured Data: Email Communications

In [0]:
import io
import random
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import LETTER
from reportlab.lib.units import inch
from databricks.sdk import WorkspaceClient

# ---------- GENERATOR ----------
def generate_email_communications(opportunities_df, sales_reps_df, accounts_df, num_emails=800, seed=42,
                                  reference_date=None):
    """
    Generate synthetic email communications between sales reps and customers

    Each email is built from a randomly sampled opportunity: the opportunity's
    SalesStage selects the email type, a template of that type is filled with
    rep, account and opportunity details, and metadata (send date, direction,
    response flag) is attached.

    Parameters:
    -----------
    opportunities_df, sales_reps_df, accounts_df :
        Objects exposing ``toPandas()`` (the caller in this notebook passes
        Spark DataFrames). Columns read: OpportunityID, RepID, AccountID,
        SalesStage, ProductCategory (opportunities); RepID, FirstName,
        LastName, Role (reps); AccountID, CompanyName, Industry (accounts).
    num_emails : int
        Number of email records to generate
    seed : int
        Random seed for reproducibility
    reference_date : datetime.datetime or None
        Date that send dates are counted back from. Defaults to the current
        time; pass a fixed value to make SentDate reproducible as well.

    Returns:
    --------
    pandas.DataFrame
        One row per email with columns EmailID, OpportunityID, RepID,
        AccountID, EmailType, EmailDirection, SentDate, EmailContent,
        WordCount, HasAttachment and ResponseReceived.

    Raises:
    -------
    ValueError
        If emails are requested but there are no opportunities to sample from.
    """
    # Set random seed for reproducibility
    np.random.seed(seed)
    random.seed(seed)

    # Convert to pandas
    opps_pd = opportunities_df.toPandas()
    reps_pd = sales_reps_df.toPandas()
    accounts_pd = accounts_df.toPandas()

    if num_emails > 0 and opps_pd.empty:
        raise ValueError("Cannot generate emails: opportunities_df contains no rows")

    # Email templates by type
    email_templates = {
        'initial_outreach': [
            """Subject: Workday Solutions for {company_name} - Let's Connect

Hi {contact_name},

I hope this email finds you well. I'm {rep_name}, {rep_role} at Workday, and I've been following {company_name}'s growth in the {industry} space.

I'd love to share how companies like yours are leveraging Workday's {product_category} solutions to streamline operations and drive efficiency. Would you be open to a brief 15-minute conversation next week?

Best regards,
{rep_name}
{rep_email}
Workday""",

            """Subject: {company_name} + Workday: Driving Digital Transformation

Hello {contact_name},

I noticed {company_name} has been expanding rapidly - congratulations! As you scale, managing {business_challenge} becomes increasingly complex.

At Workday, we've helped similar {industry} companies like {example_company} achieve:
- 40% reduction in administrative overhead
- 25% faster reporting cycles
- Improved employee satisfaction scores

Would you be interested in a quick call to discuss how we could help {company_name}?

Looking forward to connecting,
{rep_name}"""
        ],

        'follow_up': [
            """Subject: Re: Following up on our conversation

Hi {contact_name},

Thank you for taking the time to speak with me yesterday about {company_name}'s {business_challenge} initiatives. I really enjoyed learning about your current processes and future goals.

As promised, I'm attaching:
- ROI calculator specific to your use case
- Case study from {example_company} (similar size/industry)
- Implementation timeline overview

Based on our discussion, I believe our {product_category} solution could deliver significant value. Would you like to schedule a demo for your team next week?

Best,
{rep_name}""",

            """Subject: Quick follow-up from {rep_name}

{contact_name},

Hope you're having a great week! I wanted to circle back on the {product_category} discussion we had last month.

I know timing wasn't quite right then, but I thought you'd be interested to know that we just helped {example_company} achieve a 30% improvement in {business_metric} within 6 months of implementation.

Given {company_name}'s similar challenges, this could be relevant. Would you be open to a brief catch-up call?

Thanks,
{rep_name}"""
        ],

        'demo_invite': [
            """Subject: Workday Demo - {company_name} Custom Walkthrough

Dear {contact_name},

Thank you for expressing interest in Workday's {product_category} capabilities. I'm excited to show you how our platform can address {company_name}'s specific needs.

Demo Details:
- Date: [Next Tuesday]
- Duration: 45 minutes
- Focus: {business_challenge} solutions
- Attendees: Please invite key stakeholders from IT, HR, and Finance

I'll tailor the demo to show:
1. Integration with your existing systems
2. Automated reporting for your {industry} requirements
3. ROI projections based on your current processes

Looking forward to it!

{rep_name}
Workday Solutions"""
        ],

        'proposal_follow_up': [
            """Subject: Workday Proposal for {company_name} - Next Steps

{contact_name},

I hope you and the team had a chance to review the proposal I sent last week. The solution we've designed specifically addresses the key challenges you mentioned:

✓ Streamlined {business_process}
✓ Real-time reporting and analytics  
✓ Compliance with {industry} regulations
✓ Seamless integration with your current tech stack

I'm confident this solution will deliver the ROI we discussed. Do you have any questions about the proposal? I'd be happy to walk through any sections in detail.

What would be the best way to move forward?

Best regards,
{rep_name}"""
        ],

        'objection_handling': [
            """Subject: Re: Budget concerns for Workday implementation

{contact_name},

I completely understand your budget considerations - this is an investment decision that needs to make financial sense.

Let me share a few points that might help:

1. ROI Timeline: Most clients see payback within 18 months
2. Flexible Payment: We offer phased implementation to spread costs
3. Hidden Costs: Consider current manual process costs (estimated $200K annually for companies your size)

Would it help to have our finance team create a detailed cost-benefit analysis specific to {company_name}? We can also explore a pilot program to prove value before full commitment.

Happy to discuss further,
{rep_name}"""
        ]
    }

    # Sample company names for examples
    example_companies = [
        'TechFlow Industries', 'DataCorp Solutions', 'InnovateCorp', 'GlobalTech Systems',
        'NextGen Enterprises', 'SmartOps Inc.', 'VelocityPro', 'OptimalSystems'
    ]

    # Business challenges by industry
    business_challenges = {
        'Technology': ['talent management', 'rapid scaling', 'project tracking', 'performance management'],
        'Healthcare': ['compliance reporting', 'staff scheduling', 'cost management', 'regulatory compliance'],
        'Financial Services': ['risk management', 'regulatory reporting', 'audit trails', 'compliance monitoring'],
        'Manufacturing': ['workforce planning', 'safety compliance', 'operational efficiency', 'cost control'],
        'Retail': ['seasonal staffing', 'inventory planning', 'customer analytics', 'supply chain management']
    }

    emails = []
    # Anchor for SentDate; a caller-supplied value keeps the output independent of the wall clock.
    current_date = datetime.now() if reference_date is None else reference_date

    for i in range(num_emails):
        # Select random opportunity and related data
        # (DataFrame.sample without random_state draws from numpy's global
        # state, which was seeded above)
        opp = opps_pd.sample(n=1).iloc[0]
        rep = reps_pd[reps_pd['RepID'] == opp['RepID']].iloc[0]
        account = accounts_pd[accounts_pd['AccountID'] == opp['AccountID']].iloc[0]

        # Email type based on opportunity stage
        stage = opp['SalesStage']
        if stage == 'Prospecting':
            email_type = random.choice(['initial_outreach', 'follow_up'])
        elif stage == 'Discovery':
            email_type = random.choice(['demo_invite', 'follow_up'])
        elif stage == 'Proposal':
            email_type = 'proposal_follow_up'
        elif stage == 'Negotiation':
            email_type = 'objection_handling'
        else:
            # Any other stage falls back to generic outreach / follow-up
            email_type = random.choice(['initial_outreach', 'follow_up'])

        # Select template
        template = random.choice(email_templates[email_type])

        # Generate email content
        rep_name = f"{rep['FirstName']} {rep['LastName']}"
        rep_email = f"{rep['FirstName'].lower()}.{rep['LastName'].lower()}@workday.com"
        contact_name = random.choice(['John', 'Sarah', 'Mike', 'Jennifer', 'David', 'Lisa', 'Robert', 'Michelle'])
        company_name = account['CompanyName']
        industry = account['Industry']
        if not isinstance(industry, str):
            # None/NaN industries would otherwise crash on .lower() below
            industry = 'Unknown'
        product_category = opp['ProductCategory']

        # Business challenge based on industry
        challenges = business_challenges.get(industry, ['operational efficiency', 'cost management'])
        business_challenge = random.choice(challenges)

        # Fill in template
        # (every template receives the full keyword set; str.format ignores
        # the keywords a given template does not use)
        email_content = template.format(
            company_name=company_name,
            contact_name=contact_name,
            rep_name=rep_name,
            rep_role=rep['Role'],
            rep_email=rep_email,
            industry=industry.lower(),
            product_category=product_category,
            business_challenge=business_challenge,
            example_company=random.choice(example_companies),
            business_metric=random.choice(['efficiency', 'accuracy', 'compliance', 'productivity']),
            business_process=random.choice(['payroll processing', 'reporting', 'onboarding', 'planning'])
        )

        # Email metadata
        days_ago = int(np.random.exponential(30))  # Most emails recent
        days_ago = min(days_ago, 365)
        sent_date = current_date - timedelta(days=days_ago)

        # 80% are always Outbound; the remaining 20% are split evenly between the two directions
        email_direction = random.choice(['Outbound', 'Inbound']) if random.random() < 0.2 else 'Outbound'

        email = {
            'EmailID': f'EMAIL{str(i+1).zfill(6)}',
            'OpportunityID': opp['OpportunityID'],
            'RepID': opp['RepID'],
            'AccountID': opp['AccountID'],
            'EmailType': email_type,
            'EmailDirection': email_direction,
            'SentDate': sent_date.strftime('%Y-%m-%d %H:%M:%S'),
            'EmailContent': email_content,
            'WordCount': len(email_content.split()),
            # Flagged when the text mentions an attachment (substring 'attach')
            'HasAttachment': 1 if 'attach' in email_content.lower() else 0,
            # Only outbound emails can have a response
            'ResponseReceived': random.choice([1, 0]) if email_direction == 'Outbound' else 0
        }

        emails.append(email)

    return pd.DataFrame(emails)

# ---------- Build DataFrame from your Spark DFs ----------
# Generate 1,200 synthetic emails; arguments are passed by keyword so each
# input frame is matched to its parameter explicitly.
email_communications = generate_email_communications(
    opportunities_df=sales_opportunities_spark,
    sales_reps_df=sales_reps,
    accounts_df=customer_accounts,
    num_emails=1200,
)

# ---------- PDF helper ----------
def wrap_and_draw(c, text, x, y, width, font_name="Helvetica", font_size=10, line_height=14,
                  bottom_margin=None, page_top=None):
    """Simple word-wrap printer. Returns new y.

    Splits ``text`` into paragraphs on newlines, greedily packs words into
    lines no wider than ``width`` (measured with ``c.stringWidth``) and draws
    each line at ``x`` with ``c.drawString``, moving down ``line_height`` per
    line plus an extra 0.4 line after every non-empty paragraph. Words are
    obtained with ``str.split()``, so leading indentation and repeated spaces
    inside a paragraph are not preserved.

    Parameters:
    -----------
    c : canvas-like object providing setFont, stringWidth, drawString, showPage
    text : str or None
        Text to draw; None is treated as an empty string.
    x, y : float
        Left edge and starting baseline.
    width : float
        Maximum line width.
    font_name, font_size, line_height :
        Font settings and vertical advance per line.
    bottom_margin : float or None
        A new page is started once y drops below this value.
        Defaults to 0.9 inch.
    page_top : float or None
        y used after a page break. Defaults to one inch below the top of a
        LETTER page.

    Returns:
    --------
    float
        The y position following the last drawn line (already moved to a new
        page if the text ended below ``bottom_margin``).
    """
    if bottom_margin is None:
        bottom_margin = 0.9 * inch
    if page_top is None:
        page_top = LETTER[1] - 1.0 * inch

    def _break_page_if_needed(cursor):
        # showPage() starts a fresh page, so the font is set again afterwards.
        if cursor < bottom_margin:
            c.showPage()
            c.setFont(font_name, font_size)
            return page_top
        return cursor

    c.setFont(font_name, font_size)
    for para in (text or "").split("\n"):
        para = para.rstrip()
        if not para:
            # Blank line: advance, and also honour the bottom margin so a run
            # of blank lines cannot push the next text off the page.
            y = _break_page_if_needed(y - line_height)
            continue

        line = []
        for word in para.split():
            trial = " ".join(line + [word])
            # A word always goes on an empty line, even if it is wider than
            # `width`; flushing an empty line would only waste vertical space.
            if not line or c.stringWidth(trial, font_name, font_size) <= width:
                line.append(word)
            else:
                c.drawString(x, y, " ".join(line))
                y = _break_page_if_needed(y - line_height)
                line = [word]
        if line:
            c.drawString(x, y, " ".join(line))
            y -= line_height
        # Extra spacing between paragraphs.
        y = _break_page_if_needed(y - 0.4 * line_height)
    return y

# ---------- Generate & Upload PDFs ----------
# Render at most `max_pdfs` emails into in-memory PDFs and upload each to the UC Volume.
upload_count = 0
max_pdfs = 25  # None uploads every email
for _, row in email_communications.iterrows():
    if max_pdfs is not None and upload_count >= max_pdfs:
        break

    buffer = io.BytesIO()
    c = canvas.Canvas(buffer, pagesize=LETTER)
    width, height = LETTER

    lm, rm = 1.0 * inch, 1.0 * inch
    usable_w = width - lm - rm
    y = height - 0.9 * inch

    # Header
    c.setFont("Helvetica-Bold", 14)
    c.drawCentredString(width / 2, y, "Email Communication")
    y -= 0.35 * inch

    # The first line of the content is the subject; everything after the first
    # newline is the body (empty when the content has no newline).
    subject_line, _sep, body = row["EmailContent"].partition("\n")
    subject_line = subject_line.strip()

    # Subject line. The returned y must be kept: without it the cursor never
    # moves past the subject and the metadata below is drawn on top of it.
    c.setFont("Helvetica-Bold", 11)
    y = wrap_and_draw(c, subject_line, lm, y, usable_w, font_name="Helvetica-Bold", font_size=11, line_height=14)
    y -= 4

    # Meta
    c.setFont("Helvetica", 10)
    meta = [
        f"EmailID: {row['EmailID']}   Type: {row['EmailType']}   Direction: {row['EmailDirection']}",
        f"Sent: {row['SentDate']}   AccountID: {row['AccountID']}   OpportunityID: {row['OpportunityID']}   RepID: {row['RepID']}",
        ""
    ]
    for line in meta:
        c.drawString(lm, y, line)
        y -= 14
        if y < 0.9 * inch:
            # Out of room: start a new page and restore the metadata font.
            c.showPage()
            c.setFont("Helvetica", 10)
            y = height - 1.0 * inch

    # Body (the subject line was split off above)
    c.setFont("Helvetica-Bold", 11)
    c.drawString(lm, y, "Body:")
    y -= 16
    y = wrap_and_draw(
        c=c,
        text=body,
        x=lm,
        y=y,
        width=usable_w,
        font_name="Helvetica",
        font_size=10,
        line_height=14
    )

    # Finalise the PDF and rewind the buffer so the upload reads from the start.
    c.save()
    buffer.seek(0)

    # Upload
    filename = f"{row['EmailID']}.pdf"
    target_path = f"{volume_path}/email_communications/{filename}"
    w.files.upload(target_path, buffer, overwrite=True)
    upload_count += 1

print(f"✅ Successfully uploaded {upload_count} email PDFs to {volume_path}")