# Data Generation

In [24]:
%pip install faker=="36.1.1"
dbutils.library.restartPython()

Note: you may need to restart the kernel to use updated packages.


InternalError: ContextNotFound: 

In [None]:
import pandas as pd
import pyspark.sql.functions as F
import numpy as np
import random
import uuid

from datetime import datetime
from faker import Faker
from pyspark.sql.types import StringType

import numpy as np
import pandas as pd
import uuid
import random
from faker import Faker
from datetime import datetime

In [None]:
%run ./_resources/00_setup

## Step 1: Generate structured data for clustering

## Introduction to Demographic Clustering
Demographic clustering involves categorizing individuals based on characteristics like age, income, and education level. This process helps in understanding and targeting specific groups more effectively.

## Defining Clusters
We define several demographic clusters based on their characteristics:

- **Young Urban Professional**
- **Suburban Family-Oriented**
- **Retired Rural Dweller**
- **College Student**
- **High-Income Empty Nester**

## LA Tribes
Additionally, we focus on specific tribes in Los Angeles:

- **Young Professional Women - Urban Explorers**
- **Tech-Savvy Professionals - Silicon Beach Innovators**
- **Creative Entrepreneurs - Hollywood Creatives**
- **Eco-Conscious Millennials - Sustainable Lifestyle**
- **Luxury-Oriented Professionals - Upscale Lifestyles**
- **College Students - Campus Life**

## Code Implementation

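Below is the Python code that generates the LA tribe data. The broader demographic clusters listed above are used later in the notebook as the product segments for the social media posts and ad campaigns: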



In [25]:
# Set seeds for every random number generator the data generation relies on.
# NumPy drives the numeric draws and the DataFrame shuffling/sampling, while the
# standard-library `random` module drives the `random.choices`, `random.choice`,
# `random.uniform` and `random.randint` calls used further down. Seeding only
# NumPy would leave those categorical picks different on every run.
np.random.seed(42)
random.seed(42)

We need to use conditional probabilities in our data generation code in order to "force" distinct clusters into the data, so that they can be recovered in later steps.

In [26]:
# Initialize Faker for generating fake dates. Seeding Faker's shared random
# generator first makes the `created_at` timestamps produced below repeatable
# from run to run, matching the fixed NumPy seed set earlier in the notebook.
Faker.seed(42)
fake = Faker()

# Define the tribe sizes and their descriptions.
# Keys are the full tribe display names (the neighbourhoods are part of the
# name); values are the number of synthetic individuals to generate for that
# tribe, 1,280 in total.
tribe_sizes = {
    "Young Professional Women - Urban Explorers (Downtown LA / Arts District)": 250,
    "Tech-Savvy Professionals - Silicon Beach Innovators (Venice Beach / Santa Monica)": 200,
    "Creative Entrepreneurs - Hollywood Creatives (West Hollywood / Beverly Hills)": 150,
    "Eco-Conscious Millennials - Sustainable Lifestyle (Culver City / Echo Park)": 180,
    "Luxury-Oriented Professionals - Upscale Lifestyles (Beverly Hills / Bel Air)": 200,
    "College Students - Campus Life (UCLA / USC)": 300,
}

# Function to generate data for LA-based tribes
# Generation parameters for each LA tribe, keyed by the marker substring that
# identifies the tribe inside its full display name. Insertion order is the
# order in which markers are tested, so the first matching marker wins.
#   income          -> (mean, standard deviation, lower clip, upper clip)
#   age_range       -> (low, high) passed to np.random.randint (high is exclusive)
#   education_probs -> probabilities for ["Bachelor's", "Post Graduate"];
#                      None marks a tribe whose education and relationship
#                      status are fixed rather than sampled.
_TRIBE_PROFILES = {
    "Urban Explorers": {
        "locations": ["Downtown LA", "Arts District", "Silver Lake", "Echo Park"],
        "income": (45000, 12000, 30000, 80000),
        "occupations": ["Freelancer", "Graphic Designer", "Social Media Influencer"],
        "age_range": (25, 35),
        "education_probs": [0.7, 0.3],
        "short_description": "Downtown LA professionals",
    },
    "Silicon Beach Innovators": {
        "locations": ["Venice Beach", "Santa Monica", "Playa Vista"],
        "income": (100000, 25000, 70000, 200000),
        "occupations": ["Software Engineer", "Product Manager", "UX/UI Designer"],
        "age_range": (25, 40),
        "education_probs": [0.8, 0.2],
        "short_description": "Venice Beach tech industry leaders",
    },
    "Hollywood Creatives": {
        "locations": ["West Hollywood", "Beverly Hills", "Sunset Strip"],
        "income": (120000, 30000, 80000, 250000),
        "occupations": ["Filmmaker", "Designer", "Fashion Entrepreneur"],
        "age_range": (25, 45),
        "education_probs": [0.6, 0.4],
        "short_description": "West Hollywood creatives",
    },
    "Sustainable Lifestyle": {
        "locations": ["Culver City", "Echo Park", "Silver Lake"],
        "income": (55000, 15000, 40000, 100000),
        "occupations": ["Sustainability Consultant", "Eco-Entrepreneur", "Health Coach"],
        "age_range": (25, 35),
        "education_probs": [0.7, 0.3],
        "short_description": "Culver City eco-advocates",
    },
    "Luxury-Oriented Professionals": {
        "locations": ["Beverly Hills", "Bel Air", "Westwood"],
        "income": (200000, 50000, 150000, 300000),
        "occupations": ["Real Estate Executive", "Investment Banker", "Entertainment Lawyer"],
        "age_range": (35, 55),
        "education_probs": [0.6, 0.4],
        "short_description": "Beverly Hills luxury professionals",
    },
    "College Students": {
        "locations": ["Westwood (UCLA)", "South LA (USC)", "Downtown LA"],
        "income": (15000, 3000, 10000, 25000),
        "occupations": ["Student", "Intern", "Part-time Worker"],
        "age_range": (18, 24),
        "education_probs": None,
        "short_description": "UCLA/USC students",
    },
}


# Function to generate data for LA-based tribes
def generate_la_tribe_data(tribe_name, size):
    """Generate a synthetic demographic DataFrame for one LA tribe.

    The tribe is recognised by a marker substring of ``tribe_name`` (for
    example ``"Urban Explorers"``). Each tribe has its own locations, income
    distribution, occupations and age range, which is what makes the tribes
    separable later on.

    Args:
        tribe_name: Full tribe display name; stored verbatim in the ``tribe``
            column.
        size: Number of rows (individuals) to generate.

    Returns:
        A pandas DataFrame with ``size`` rows and the columns ``age``,
        ``income`` (rounded to the nearest thousand), ``location``,
        ``education``, ``relationship_status``, ``number_dependants``,
        ``occupation``, ``tribe`` and ``short_description``.

    Raises:
        ValueError: If ``tribe_name`` contains none of the known tribe markers.
    """
    profile = next(
        (params for marker, params in _TRIBE_PROFILES.items() if marker in tribe_name),
        None,
    )
    if profile is None:
        # Fail with a clear message instead of an UnboundLocalError at the return.
        raise ValueError(
            f"Unknown tribe {tribe_name!r}; expected a name containing one of "
            f"{list(_TRIBE_PROFILES)}"
        )

    # The draws below are made in a fixed order (locations, incomes,
    # occupations, ages, education, relationship status) so that a given pair
    # of `random` / NumPy seeds always yields the same rows.
    locations = random.choices(profile["locations"], k=size)

    income_mean, income_std, income_min, income_max = profile["income"]
    incomes = np.random.normal(income_mean, income_std, size).clip(income_min, income_max)

    occupations = random.choices(profile["occupations"], k=size)

    age_low, age_high = profile["age_range"]
    ages = np.random.randint(age_low, age_high, size)

    education_probs = profile["education_probs"]
    if education_probs is None:
        # Fixed attributes: nothing is sampled for this tribe.
        education_levels = ["Some College"] * size
        relationship_statuses = ["Single"] * size
    else:
        education_levels = np.random.choice(
            ["Bachelor's", "Post Graduate"], size, p=education_probs
        )
        relationship_statuses = random.choices(["Single", "Cohabiting"], k=size)

    number_dependants = [0] * size

    return pd.DataFrame({
        "age": ages,
        "income": incomes.round(-3),
        "location": locations,
        "education": education_levels,
        "relationship_status": relationship_statuses,
        "number_dependants": number_dependants,
        "occupation": occupations,
        "tribe": tribe_name,
        "short_description": profile["short_description"],
    })

# Build one frame per tribe, then stack them into a single dataset
tribe_dfs = [generate_la_tribe_data(*name_and_size) for name_and_size in tribe_sizes.items()]
demographic_df = pd.concat(tribe_dfs, ignore_index=True)

# Randomise the row order so the tribes are interleaved, and renumber the index
demographic_df = demographic_df.sample(frac=1)
demographic_df = demographic_df.reset_index(drop=True)

# Give every individual a random UUID, placed as the first column
n_individuals = len(demographic_df)
individual_ids = [str(uuid.uuid4()) for _ in range(n_individuals)]
demographic_df.insert(0, 'uuid', individual_ids)

# Attach a fake creation timestamp to each record (simulating when they joined
# the platform), drawn between the start of 2023 and the end of 2024
signup_window = (datetime(2023, 1, 1), datetime(2024, 12, 31))
demographic_df['created_at'] = [
    fake.date_time_between(*signup_window).strftime('%Y-%m-%d %H:%M:%S')
    for _ in range(n_individuals)
]


In [27]:
# Display the first few records of the generated dataset to sanity-check the
# columns and value ranges produced for each tribe

demographic_df.head()

Unnamed: 0,uuid,age,income,location,education,relationship_status,number_dependants,occupation,tribe,short_description,created_at
0,15af5fab-8049-4d0b-8fbc-68e8f07082ea,36,80000.0,West Hollywood,Post Graduate,Single,0,Filmmaker,Creative Entrepreneurs - Hollywood Creatives (...,West Hollywood creatives,2024-01-26 05:44:08
1,81632240-77dc-40b2-b117-b0891d6e5dad,34,96000.0,Playa Vista,Bachelor's,Cohabiting,0,Software Engineer,Tech-Savvy Professionals - Silicon Beach Innov...,Venice Beach tech industry leaders,2023-10-03 04:58:11
2,dd75e92c-c1c7-445e-8755-68040618e7aa,23,16000.0,Downtown LA,Some College,Single,0,Student,College Students - Campus Life (UCLA / USC),UCLA/USC students,2024-07-04 10:41:27
3,e0c2a91e-d60b-4edb-8681-b9974f078674,36,149000.0,Beverly Hills,Post Graduate,Single,0,Designer,Creative Entrepreneurs - Hollywood Creatives (...,West Hollywood creatives,2024-09-16 03:21:25
4,6affe2c6-e705-4798-b117-e38d82fa8491,29,42000.0,Culver City,Post Graduate,Single,0,Eco-Entrepreneur,Eco-Conscious Millennials - Sustainable Lifest...,Culver City eco-advocates,2024-09-22 00:21:04


## Step 2: Generate social media posts

In [0]:
# Draw 100 individuals at random from the demographic data and renumber them
sampled_df = demographic_df.sample(n=100)
sampled_df = sampled_df.reset_index(drop=True)

In [0]:
# Define segment-specific products and possible emotions
segment_products = {
    "Young Urban Professional": ["smartphone", "laptop", "smartwatch", "wireless earbuds", "fitness tracker"],
    "Suburban Family-Oriented": ["family SUV", "grill", "home security system", "washing machine", "family board game"],
    "Retired Rural Dweller": ["gardening tools", "golf clubs", "heating blanket", "armchair", "bird feeder"],
    "College Student": ["backpack", "coffee maker", "gaming console", "textbooks", "bicycle"],
    "High-Income Empty Nester": ["luxury watch", "high-end camera", "luxury car", "wine fridge", "holiday package"]
}

emotions = ["excited", "angry", "satisfied", "frustrated", "disappointed", "scared", "relaxed", "confused", "amazed", "curious"]

# Number of distinct (segment, product, emotion) prompts to generate
n_combinations = 100

# Generate 100 unique combinations.
# Enumerate every possible combination once and draw without replacement. This
# gives the same uniform distribution as retrying random picks until 100 unique
# ones are found, but is guaranteed to terminate.
all_combinations = [
    (segment, product, emotion)
    for segment, products in segment_products.items()
    for product in products
    for emotion in emotions
]
selected_combinations = random.sample(
    all_combinations, min(n_combinations, len(all_combinations))
)
unique_combinations = set(selected_combinations)

# The demographic data generated above is labelled by `tribe` and has no
# `segment` column, so filtering on demographic_df["segment"] raises a KeyError.
# Authors are therefore drawn from the sampled audience (`sampled_df`); the
# per-segment filter is only applied if a `segment` column is actually present.
has_segment_column = "segment" in demographic_df.columns

combinations = []
for segment, product, emotion in selected_combinations:
    author_pool = sampled_df["uuid"]
    if has_segment_column:
        segment_authors = demographic_df.loc[demographic_df["segment"] == segment, "uuid"]
        if not segment_authors.empty:
            author_pool = segment_authors

    combinations.append({
        "author_id": author_pool.sample(1).values[0],
        "segment": segment,
        "product": product,
        "emotion": emotion
    })

# Convert to DataFrame
combinations_df = pd.DataFrame(combinations)
combinations_sdf = spark.createDataFrame(combinations_df)

In [0]:
# Show how many of the sampled combinations fall into each segment
display(combinations_sdf.groupBy("segment").count())

In [0]:
# Create a temp view so the SQL cell below can pass each row to AI_QUERY
combinations_sdf.createOrReplaceTempView("sampled_audience")

In [0]:
%sql
-- Generate one social media post per row of sampled_audience by prompting the
-- LLM with that row's product, segment and emotion.
-- Each literal joined with || carries its own leading/trailing space, so the
-- interpolated values do not run into the neighbouring words
-- (e.g. "smartphone from the perspective", not "smartphonefrom the perspective").
CREATE OR REPLACE TEMP VIEW sampled_audience_posts AS
SELECT
  author_id,
  AI_QUERY(
    "databricks-meta-llama-3-3-70b-instruct", 
    "Generate a realistic social media post from a consumer who recently purchased a " || product || " from the perspective of a " || segment || " who is " || emotion || " about the product. The post should reflect their genuine experience, including specific details about the product's features, performance, and how it fits into their lifestyle. Maintain a conversational and engaging tone, similar to how people naturally write on social media. Optionally, include a hashtag or emoji for authenticity. Don't explicitly mention the segment or that you are an AI assistant. Remove quotation marks.",
    modelParameters => named_struct('max_tokens', 100)
  ) AS post
FROM sampled_audience

In [0]:
# Read the generated posts from the temp view and bring them into pandas
posts_sdf = spark.sql("select * from sampled_audience_posts")
posts_df = posts_sdf.toPandas()

In [0]:
# Render the generated posts for a quick visual check
display(posts_df)

## Step 3: Generate Ad Campaigns

In [0]:
# Ad copy variation comes from the segment_products dict defined earlier,
# combined with a tone and a call-to-action (CTA)
ad_tones = ["Exciting", "Informative", "Persuasive", "Trustworthy"]
ctas = ["Shop Now", "Hurry - Limited Time Offer", "Discover More", "Upgrade Today", "Claim Your Deal"]

# One campaign per (segment, product, tone), in dict / list order
campaign_specs = [
    (segment_name, product_name, tone_name)
    for segment_name, product_names in segment_products.items()
    for product_name in product_names
    for tone_name in ad_tones
]

# Generate campaigns
campaigns = []
for sequence_number, (segment_name, product_name, tone_name) in enumerate(campaign_specs, start=1):
    campaigns.append((
        f"campaign-{sequence_number:04d}",        # Format as campaign-0001
        segment_name,
        product_name,
        tone_name,
        random.choice(ctas),                      # Random CTA
        round(random.uniform(5.0, 15.0), 2),      # ctr, rounded to two decimals
        random.randint(50000, 500000),            # impressions
    ))

# Next unused campaign number, i.e. where the running counter ends up
campaign_counter = len(campaigns) + 1

# Convert to DataFrame
campaign_columns = ["campaign_id", "segment", "product", "tone", "cta", "ctr", "impressions"]
campaigns_df = pd.DataFrame(campaigns, columns=campaign_columns)

campaigns_sdf = spark.createDataFrame(campaigns_df)

In [0]:
# Create a temp view so the SQL cell below can pass each campaign to AI_QUERY
campaigns_sdf.createOrReplaceTempView("campaigns")

In [0]:
%sql
-- Generate ad copy for every campaign by prompting the LLM with that campaign's
-- product, segment, tone and call-to-action, keeping impressions and ctr.
-- The tone is followed by ", and" so the sentence continues; the previous
-- ". and" put a stray full stop in the middle of the instruction.
CREATE OR REPLACE TEMP VIEW campaigns_performance AS
SELECT
  campaign_id,
  segment,
  AI_QUERY(
    "databricks-meta-llama-3-3-70b-instruct", 
    "Write a unique and persuasive online advertisement for a " || product || ". The ad should be targeted at " || segment || ", highlighting key benefits. The tone should be " || tone || ", and the ad should include a compelling call-to-action that encourages the user to " || cta || ". Ensure creativity, keep it concise, clear, and optimised for digital platforms like Facebook, Instagram or Google Ads. Don't state the segment name. Use an emoji if appropriate. Remove quotation marks. Don't include the CTA button in the response."
  ) AS ad_copy,
  impressions,
  ctr
FROM campaigns

In [0]:
# Read the generated ad copy from the temp view and bring it into pandas
campaigns_performance_view_sdf = spark.sql("select * from campaigns_performance")
campaigns_performance_df = campaigns_performance_view_sdf.toPandas()

In [0]:
# Render the generated campaigns (ad copy, impressions, ctr) for a quick visual check
display(campaigns_performance_df)

### Write social media posts to volume JSON and save demographic + campaign tables

In [0]:
fake = Faker()

# Remove any `id` / `created_at` columns left by a previous run of this cell.
# posts_df is built in an earlier cell, so without this a re-run would fail in
# DataFrame.insert because the `id` column already exists.
posts_df = posts_df.drop(columns=['id', 'created_at'], errors='ignore')

# Generate post id and creation date
posts_df.insert(0, 'id', [str(uuid.uuid4()) for _ in range(len(posts_df))])
posts_df['created_at'] = [
  fake.date_time_between(datetime(2024, 1, 1), datetime(2024, 12, 31)).strftime('%Y-%m-%d %H:%M:%S') for _ in range(len(posts_df))]

In [0]:
# Serialise the posts as a JSON array of records at the volume path taken from
# the setup config
social_media_feed_path = config['vol_social_media_feed']
posts_df.to_json(social_media_feed_path, orient='records')

In [0]:
# Convert the demographic data to Spark, drop the `segment` column, and
# overwrite the `audience_demographic` Delta table with the result
demographic_sdf = spark.createDataFrame(demographic_df).drop("segment")
(
    demographic_sdf.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("audience_demographic")
)

In [0]:
# Convert the campaign results to Spark and overwrite the
# `campaigns_performance` Delta table with them
campaigns_performance_sdf = spark.createDataFrame(campaigns_performance_df)
(
    campaigns_performance_sdf.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("campaigns_performance")
)