# Data Generation

In [24]:
%pip install faker=="36.1.1"
dbutils.library.restartPython()

Note: you may need to restart the kernel to use updated packages.


In [13]:
import pandas as pd
import pyspark.sql.functions as F
import numpy as np
import random
import uuid

from datetime import datetime
from faker import Faker
from pyspark.sql.types import StringType

import numpy as np
import pandas as pd
import uuid
import random
from faker import Faker
from datetime import datetime

In [49]:
%run ./_resources/00_setup

## Step 1: Generate structured data for clustering

## Introduction to Demographic Clustering
Demographic clustering involves categorizing individuals based on characteristics like age, income, and education level. This process helps in understanding and targeting specific groups more effectively.

## Defining Clusters
We define several demographic clusters based on their characteristics:

- **Young Urban Professional**
- **Suburban Family-Oriented**
- **Retired Rural Dweller**
- **College Student**
- **High-Income Empty Nester**

## LA Tribes
Additionally, we focus on specific tribes in Los Angeles:

- **Young Professional Women - Urban Explorers**
- **Tech-Savvy Professionals - Silicon Beach Innovators**
- **Creative Entrepreneurs - Hollywood Creatives**
- **Eco-Conscious Millennials - Sustainable Lifestyle**
- **Luxury-Oriented Professionals - Upscale Lifestyles**
- **College Students - Campus Life**

## Code Implementation

Below is the Python code to generate these demographic clusters and tribes:



In [14]:
# Set seed
np.random.seed(42)

We need to use conditional probabilities in our data gen code in order to "force" the clusters for later.

In [15]:
# Initialize Faker for generating fake dates
fake = Faker()

# Define the tribe sizes and their descriptions
tribe_sizes = {
    "Young Professional Women - Urban Explorers (Downtown LA / Arts District)": 250,
    "Tech-Savvy Professionals - Silicon Beach Innovators (Venice Beach / Santa Monica)": 200,
    "Creative Entrepreneurs - Hollywood Creatives (West Hollywood / Beverly Hills)": 150,
    "Eco-Conscious Millennials - Sustainable Lifestyle (Culver City / Echo Park)": 180,
    "Luxury-Oriented Professionals - Upscale Lifestyles (Beverly Hills / Bel Air)": 200,
    "College Students - Campus Life (UCLA / USC)": 300,
}

# Function to generate data for LA-based tribes
def generate_la_tribe_data(tribe_name, size):
    if "Urban Explorers" in tribe_name:
        locations = random.choices(["Downtown LA", "Arts District", "Silver Lake", "Echo Park"], k=size)
        incomes = np.random.normal(45000, 12000, size).clip(30000, 80000)
        occupations = random.choices(["Freelancer", "Graphic Designer", "Social Media Influencer"], k=size)
        ages = np.random.randint(25, 35, size)
        education_levels = np.random.choice(["Bachelor's", "Post Graduate"], size, p=[0.7, 0.3])
        relationship_statuses = random.choices(["Single", "Cohabiting"], k=size)
        number_dependants = [0] * size
        short_description = "Downtown LA professionals"

    elif "Silicon Beach Innovators" in tribe_name:
        locations = random.choices(["Venice Beach", "Santa Monica", "Playa Vista"], k=size)
        incomes = np.random.normal(100000, 25000, size).clip(70000, 200000)
        occupations = random.choices(["Software Engineer", "Product Manager", "UX/UI Designer"], k=size)
        ages = np.random.randint(25, 40, size)
        education_levels = np.random.choice(["Bachelor's", "Post Graduate"], size, p=[0.8, 0.2])
        relationship_statuses = random.choices(["Single", "Cohabiting"], k=size)
        number_dependants = [0] * size
        short_description = "Venice Beach tech industry leaders"

    elif "Hollywood Creatives" in tribe_name:
        locations = random.choices(["West Hollywood", "Beverly Hills", "Sunset Strip"], k=size)
        incomes = np.random.normal(120000, 30000, size).clip(80000, 250000)
        occupations = random.choices(["Filmmaker", "Designer", "Fashion Entrepreneur"], k=size)
        ages = np.random.randint(25, 45, size)
        education_levels = np.random.choice(["Bachelor's", "Post Graduate"], size, p=[0.6, 0.4])
        relationship_statuses = random.choices(["Single", "Cohabiting"], k=size)
        number_dependants = [0] * size
        short_description = "West Hollywood creatives"

    elif "Sustainable Lifestyle" in tribe_name:
        locations = random.choices(["Culver City", "Echo Park", "Silver Lake"], k=size)
        incomes = np.random.normal(55000, 15000, size).clip(40000, 100000)
        occupations = random.choices(["Sustainability Consultant", "Eco-Entrepreneur", "Health Coach"], k=size)
        ages = np.random.randint(25, 35, size)
        education_levels = np.random.choice(["Bachelor's", "Post Graduate"], size, p=[0.7, 0.3])
        relationship_statuses = random.choices(["Single", "Cohabiting"], k=size)
        number_dependants = [0] * size
        short_description = "Culver City eco-advocates"

    elif "Luxury-Oriented Professionals" in tribe_name:
        locations = random.choices(["Beverly Hills", "Bel Air", "Westwood"], k=size)
        incomes = np.random.normal(200000, 50000, size).clip(150000, 300000)
        occupations = random.choices(["Real Estate Executive", "Investment Banker", "Entertainment Lawyer"], k=size)
        ages = np.random.randint(35, 55, size)
        education_levels = np.random.choice(["Bachelor's", "Post Graduate"], size, p=[0.6, 0.4])
        relationship_statuses = random.choices(["Single", "Cohabiting"], k=size)
        number_dependants = [0] * size
        short_description = "Beverly Hills luxury professionals"

    elif "College Students" in tribe_name:
        locations = random.choices(["Westwood (UCLA)", "South LA (USC)", "Downtown LA"], k=size)
        incomes = np.random.normal(15000, 3000, size).clip(10000, 25000)
        occupations = random.choices(["Student", "Intern", "Part-time Worker"], k=size)
        ages = np.random.randint(18, 24, size)
        education_levels = ["Some College"] * size
        relationship_statuses = ["Single"] * size
        number_dependants = [0] * size
        short_description = "UCLA/USC students"

    return pd.DataFrame({
        "age": ages,
        "income": incomes.round(-3),
        "location": locations,
        "education": education_levels,
        "relationship_status": relationship_statuses,
        "number_dependants": number_dependants,
        "occupation": occupations,
        "tribe": tribe_name,
        "short_description": short_description
    })

# Generate data for all LA tribes
tribe_dfs = [generate_la_tribe_data(tribe, size) for tribe, size in tribe_sizes.items()]
demographic_df = pd.concat(tribe_dfs, ignore_index=True)

# Shuffle the dataset
demographic_df = demographic_df.sample(frac=1).reset_index(drop=True)

# Add UUIDs for each individual
demographic_df.insert(0, 'uuid', [str(uuid.uuid4()) for _ in range(len(demographic_df))])

# Add fake creation dates for each record (simulating when they joined the platform)
demographic_df['created_at'] = [
  fake.date_time_between(datetime(2023, 1, 1), datetime(2024, 12, 31)).strftime('%Y-%m-%d %H:%M:%S') for _ in range(len(demographic_df))]


In [16]:
# Display the first few records of the generated dataset

demographic_df.head()

Unnamed: 0,uuid,age,income,location,education,relationship_status,number_dependants,occupation,tribe,short_description,created_at
0,8382753e-2d4a-4af7-a6aa-29e833a4232e,36,80000.0,West Hollywood,Post Graduate,Single,0,Designer,Creative Entrepreneurs - Hollywood Creatives (...,West Hollywood creatives,2024-12-18 08:09:44
1,be15b253-57ba-4f79-a5ac-e861455b74b0,34,96000.0,Santa Monica,Bachelor's,Single,0,Product Manager,Tech-Savvy Professionals - Silicon Beach Innov...,Venice Beach tech industry leaders,2023-08-13 06:06:37
2,81d8484a-e48d-43ff-b3a2-89c24e5b87d4,23,16000.0,Downtown LA,Some College,Single,0,Student,College Students - Campus Life (UCLA / USC),UCLA/USC students,2024-11-26 00:38:09
3,4eaa67e4-f266-4cca-a5a6-6a61fa3de7a5,36,149000.0,Sunset Strip,Post Graduate,Cohabiting,0,Filmmaker,Creative Entrepreneurs - Hollywood Creatives (...,West Hollywood creatives,2024-07-26 10:21:23
4,b251c932-aa69-4595-ab10-feabb9f0d1ef,29,42000.0,Silver Lake,Post Graduate,Single,0,Health Coach,Eco-Conscious Millennials - Sustainable Lifest...,Culver City eco-advocates,2024-09-16 22:34:13


In [None]:
import pandas as pd
import plotly.express as px

# Set your Mapbox access token
px.set_mapbox_access_token("pk.eyJ1IjoibWFyY2Fwb2xhayIsImEiOiJjbThvajN1bHAwMHo1MmxzYXhjb3p3Zjh0In0.emuUuaDMhcM6R9-XtsvZWA")

# Define location coordinates
location_coords = {
    "Downtown LA": (34.0522, -118.2437),
    "Arts District": (34.0403, -118.2352),
    "Silver Lake": (34.0872, -118.2707),
    "Echo Park": (34.0782, -118.2606),
    "Venice Beach": (33.9850, -118.4695),
    "Santa Monica": (34.0195, -118.4912),
    "Playa Vista": (33.9754, -118.4208),
    "West Hollywood": (34.0900, -118.3617),
    "Beverly Hills": (34.0736, -118.4004),
    "Sunset Strip": (34.0928, -118.3854),
    "Culver City": (34.0211, -118.3965),
    "Bel Air": (34.1000, -118.4614),
    "Westwood": (34.0561, -118.4290),
    "Westwood (UCLA)": (34.0689, -118.4452),
    "South LA (USC)": (34.0224, -118.2851)
}

# Add latitude and longitude to the dataframe
demographic_df['latitude'] = demographic_df['location'].map(lambda x: location_coords.get(x.split('(')[0].strip(), (None, None))[0])
demographic_df['longitude'] = demographic_df['location'].map(lambda x: location_coords.get(x.split('(')[0].strip(), (None, None))[1])

# Check if latitude and longitude columns were added correctly
if demographic_df[['latitude', 'longitude']].isnull().any().any():
    print("Warning: Some locations do not have coordinates in the dictionary.")
    missing_locations = set(demographic_df['location'].map(lambda x: x.split('(')[0].strip())) - set(location_coords.keys())
    print("Missing locations:", missing_locations)

demographic_df = demographic_df.dropna(subset=['latitude', 'longitude'])

fig = px.scatter_mapbox(
    demographic_df,
    lat="latitude",
    lon="longitude",
    color="tribe",
    size="income",
    hover_name="short_description",
    hover_data=["age", "occupation", "education"],
    zoom=10,
    height=600
)

fig.update_layout(
    mapbox_style="mapbox://styles/mapbox/streets-v12",
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
    legend=dict(
        orientation="h",  # Horizontal orientation
        yanchor="bottom",  # Position at the bottom
        y=1.02,  # Slightly above the bottom
        xanchor="right",  # Align to the right
        x=1,  # Position at the right edge
        font=dict(size=8)  # Reduce font size
    )
)

fig.show()




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


*scatter_mapbox* is deprecated! Use *scatter_map* instead. Learn more at: https://plotly.com/python/mapbox-to-maplibre/



## Step 2: Generate social media posts

In [29]:
fake = Faker()

# Updated Tribes based on your focus
tribes = {
    "Young Professional Women - Urban Explorers (Downtown LA / Arts District)": {
        "products": ["travel bag", "stylish sunglasses", "compact camera", "smartphone", "fitness tracker"],
        "emotions": ["excited", "curious", "adventurous", "relaxed"]
    },
    "Tech-Savvy Professionals - Silicon Beach Innovators (Venice Beach / Santa Monica)": {
        "products": ["smartphone", "laptop", "smartwatch", "tablet", "wireless earbuds"],
        "emotions": ["excited", "satisfied", "curious", "innovative"]
    },
    "Creative Entrepreneurs - Hollywood Creatives (West Hollywood / Beverly Hills)": {
        "products": ["designer laptop", "high-end camera", "smartphone", "creative tools", "luxury watch"],
        "emotions": ["inspired", "excited", "content", "creative"]
    },
    "Eco-Conscious Millennials - Sustainable Lifestyle (Culver City / Echo Park)": {
        "products": ["reusable water bottle", "organic skincare", "bamboo toothbrush", "electric bike", "solar-powered charger"],
        "emotions": ["satisfied", "relaxed", "eco-friendly", "content"]
    },
    "Luxury-Oriented Professionals - Upscale Lifestyles (Beverly Hills / Bel Air)": {
        "products": ["luxury watch", "high-end camera", "luxury car", "premium wine", "gourmet food subscription"],
        "emotions": ["satisfied", "relaxed", "luxurious", "content"]
    },
    "College Students - Campus Life (UCLA / USC)": {
        "products": ["backpack", "laptop", "coffee maker", "textbooks", "headphones"],
        "emotions": ["excited", "curious", "satisfied", "motivated"]
    }
}


# Get random sample from demographic data (assuming `demographic_df` is already created)
sampled_df = demographic_df.sample(n=100).reset_index(drop=True)

# Generate combinations for 100 unique posts
combinations = []
unique_combinations = set()

while len(unique_combinations) < 100:
    # Select a tribe randomly from the dictionary
    tribe_name = random.choice(list(tribes.keys()))
    
    # Select a random author_id from the demographic data based on the tribe (segment)
    author_id = demographic_df[demographic_df["tribe"] == tribe_name]["uuid"].sample(1).values[0]
    
    # Choose a product and emotion from the tribe
    product = random.choice(tribes[tribe_name]["products"])
    emotion = random.choice(tribes[tribe_name]["emotions"])
    
    # Create a tuple to check for uniqueness
    combination_tuple = (tribe_name, product, emotion)
    
    # Add only if the combination is unique
    if combination_tuple not in unique_combinations:
        unique_combinations.add(combination_tuple)
        combinations.append({
            "author_id": author_id,
            "tribe": tribe_name,
            "product": product,
            "emotion": emotion
        })

combinations_df = pd.DataFrame(combinations)

In [32]:
combinations_sdf = spark.createDataFrame(combinations_df)

In [39]:
combinations_sdf.groupBy("tribe").count().show()

+--------------------+-----+
|               tribe|count|
+--------------------+-----+
|Luxury-Oriented P...|   14|
|Eco-Conscious Mil...|   19|
|Creative Entrepre...|   16|
|College Students ...|   20|
|Young Professiona...|   18|
|Tech-Savvy Profes...|   13|
+--------------------+-----+



In [35]:
# Creat temp view for AI_QUERY
combinations_sdf.createOrReplaceTempView("sampled_audience")

In [36]:
# Create the SQL query for generating social media posts
sql_query = """
CREATE OR REPLACE TEMP VIEW sampled_audience_posts AS
SELECT
    author_id,
    tribe,
    product,
    emotion,
    AI_QUERY(
        "databricks-meta-llama-3-3-70b-instruct", 
        CONCAT(
            'Generate a realistic social media post from a consumer who recently purchased a ',
            product, 
            ' from the perspective of a ', tribe, 
            ' who is ', emotion, 
            ' about the product. The post should reflect their genuine experience, including specific details about the product\'s features, performance, and how it fits into their lifestyle. Maintain a conversational and engaging tone, similar to how people naturally write on social media. Optionally, include a hashtag or emoji for authenticity. Don\'t explicitly mention the tribe or that you are an AI assistant. Remove quotation marks.'
        ) AS post
FROM sampled_audience
"""

In [41]:
posts_df = spark.sql("SELECT * FROM sampled_audience").toPandas()
display(posts_df)

Unnamed: 0,author_id,tribe,product,emotion
0,80db4db2-d9a3-4b7e-892d-c5dd0277e172,Eco-Conscious Millennials - Sustainable Lifest...,bamboo toothbrush,eco-friendly
1,b8f84fba-26b7-4dcd-8e7c-a321e09370e0,Luxury-Oriented Professionals - Upscale Lifest...,premium wine,relaxed
2,cb1624b8-89f6-4988-a435-cfd40e6384bc,Creative Entrepreneurs - Hollywood Creatives (...,high-end camera,content
3,c1214686-4bd6-4b48-a956-55518d7c9e19,Creative Entrepreneurs - Hollywood Creatives (...,high-end camera,excited
4,e601e234-05a0-4fc5-8d60-08bbce316be5,Eco-Conscious Millennials - Sustainable Lifest...,organic skincare,eco-friendly
...,...,...,...,...
95,fe592152-d9fd-40e3-9388-d5bc47ab357f,Luxury-Oriented Professionals - Upscale Lifest...,premium wine,luxurious
96,4df88077-49df-482b-bdab-22318763d3fa,Tech-Savvy Professionals - Silicon Beach Innov...,wireless earbuds,curious
97,73f9c0f3-8a93-4630-b6c9-9b861aff9586,Young Professional Women - Urban Explorers (Do...,compact camera,curious
98,3c92fd43-bc03-4b6c-9242-716ab6df4a0b,Eco-Conscious Millennials - Sustainable Lifest...,electric bike,satisfied


## Step 3: Generate Ad Campaigns

In [43]:
# Add fake creation dates for the posts
posts_df['created_at'] = [fake.date_time_between(datetime(2023, 1, 1), datetime(2024, 12, 31)).strftime('%Y-%m-%d %H:%M:%S') for _ in range(len(posts_df))]

# Optionally: Save posts to a JSON or database
posts_df.to_json('social_media_posts.json', orient='records')

In [44]:
# Generate Ad Campaigns
campaigns = []
for tribe, details in tribes.items():
    for product in details["products"]:
        campaign = {
            "campaign_id": f"AD-{random.randint(1000,9999)}",
            "segment": tribe,
            "product": product,
            "tone": random.choice(details["emotions"]),
            "cta": random.choice(["Shop now", "Learn more", "Get yours", "Discover"]),
            "impressions": random.randint(10000, 500000),
            "ctr": round(random.uniform(0.5, 5.0), 2)
        }
        campaigns.append(campaign)

campaigns_df = pd.DataFrame(campaigns)

# Add campaign dates
campaigns_df['start_date'] = [fake.date_between(datetime(2023,1,1), datetime(2024,1,1)) for _ in range(len(campaigns_df))]
campaigns_df['end_date'] = [fake.date_between(datetime(2024,1,1), datetime(2024,12,31)) for _ in range(len(campaigns_df))]

# Create Spark DataFrame for campaigns
campaigns_sdf = spark.createDataFrame(campaigns_df)
campaigns_sdf.createOrReplaceTempView("campaigns")

# Generate AI-optimized ad copies
performance_query = """
CREATE OR REPLACE TEMP VIEW campaigns_performance AS
SELECT
    campaign_id,
    segment,
    product,
    AI_QUERY(
        "databricks-meta-llama-3-3-70b-instruct", 
        CONCAT(
            'Create a digital ad for ', product, 
            ' targeting ', segment, 
            ' with ', tone, ' tone. Include: ',
            '- Key product benefits\n',
            '- Lifestyle connection\n',
            '- Emoji if appropriate\n',
            '- CTA: ', cta, '\n',
            '- Max 200 characters\n',
            '- No hashtags\n',
            '- Natural conversational style'
        )
    ) AS optimized_ad_copy,
    impressions,
    ctr
FROM campaigns
"""

spark.sql(performance_query)
campaigns_performance_df = spark.sql("SELECT * FROM campaigns_performance").toPandas()

# Add performance metrics
campaigns_performance_df['estimated_clicks'] = (campaigns_performance_df['impressions'] * 
                                               campaigns_performance_df['ctr']/100).astype(int)

# Display results
print("Generated Social Media Posts:")
display(posts_df)

print("\nAI-Optimized Campaign Performance:")
display(campaigns_performance_df[['campaign_id', 'segment', 'optimized_ad_copy', 
                                'impressions', 'ctr', 'estimated_clicks']])

# Save to JSON
posts_df.to_json('social_media_posts.json', orient='records', indent=2)
campaigns_performance_df.to_json('ai_optimized_campaigns.json', orient='records', indent=2)

print("Data saved to 'social_media_posts.json' and 'ai_optimized_campaigns.json'")

Generated Social Media Posts:


Unnamed: 0,author_id,tribe,product,emotion,created_at
0,80db4db2-d9a3-4b7e-892d-c5dd0277e172,Eco-Conscious Millennials - Sustainable Lifest...,bamboo toothbrush,eco-friendly,2023-07-04 08:19:19
1,b8f84fba-26b7-4dcd-8e7c-a321e09370e0,Luxury-Oriented Professionals - Upscale Lifest...,premium wine,relaxed,2024-11-27 19:22:05
2,cb1624b8-89f6-4988-a435-cfd40e6384bc,Creative Entrepreneurs - Hollywood Creatives (...,high-end camera,content,2024-03-23 12:26:53
3,c1214686-4bd6-4b48-a956-55518d7c9e19,Creative Entrepreneurs - Hollywood Creatives (...,high-end camera,excited,2024-08-04 15:23:46
4,e601e234-05a0-4fc5-8d60-08bbce316be5,Eco-Conscious Millennials - Sustainable Lifest...,organic skincare,eco-friendly,2024-09-18 11:15:29
...,...,...,...,...,...
95,fe592152-d9fd-40e3-9388-d5bc47ab357f,Luxury-Oriented Professionals - Upscale Lifest...,premium wine,luxurious,2023-08-17 23:42:28
96,4df88077-49df-482b-bdab-22318763d3fa,Tech-Savvy Professionals - Silicon Beach Innov...,wireless earbuds,curious,2024-04-28 20:57:36
97,73f9c0f3-8a93-4630-b6c9-9b861aff9586,Young Professional Women - Urban Explorers (Do...,compact camera,curious,2023-07-05 18:17:35
98,3c92fd43-bc03-4b6c-9242-716ab6df4a0b,Eco-Conscious Millennials - Sustainable Lifest...,electric bike,satisfied,2023-04-29 20:27:33



AI-Optimized Campaign Performance:


Unnamed: 0,campaign_id,segment,optimized_ad_copy,impressions,ctr,estimated_clicks
0,AD-1278,Young Professional Women - Urban Explorers (Do...,"""Explore the city in style! 💃 Durable, lightwe...",130989,0.81,1061
1,AD-9023,Young Professional Women - Urban Explorers (Do...,"""Explore the city in style 🕶️! Our sunglasses ...",456835,2.22,10141
2,AD-1928,Young Professional Women - Urban Explorers (Do...,"""Capture city vibes with our compact camera! 📸...",338971,2.74,9287
3,AD-4147,Young Professional Women - Urban Explorers (Do...,"""Explore the city with ease 🗺️. Our smartphone...",256674,1.37,3516
4,AD-1704,Young Professional Women - Urban Explorers (Do...,"""Track your downtown adventures 🏙️ and crush y...",352175,2.66,9367
5,AD-8158,Tech-Savvy Professionals - Silicon Beach Innov...,"""Upgrade your mobile game 📈 with our latest sm...",189330,0.7,1325
6,AD-1416,Tech-Savvy Professionals - Silicon Beach Innov...,"""Upgrade your workflow with our ultra-fast lap...",436097,2.68,11687
7,AD-4578,Tech-Savvy Professionals - Silicon Beach Innov...,"""Upgrade your hustle with our smartwatch 🕒️! T...",136413,2.47,3369
8,AD-7041,Tech-Savvy Professionals - Silicon Beach Innov...,"""Upgrade your workflow 🚀 with our cutting-edge...",150448,4.35,6544
9,AD-2033,Tech-Savvy Professionals - Silicon Beach Innov...,"""Upgrade your workflow with our wireless earbu...",406666,4.68,19031


Data saved to 'social_media_posts.json' and 'ai_optimized_campaigns.json'


### Write social media posts to volume JSON and save demographic + campaign tables