# Data Generation

In [1]:
%pip install faker=="36.1.1"
dbutils.library.restartPython()

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import pyspark.sql.functions as F
import numpy as np
import random
import uuid

from datetime import datetime
from faker import Faker
from pyspark.sql.types import StringType

import numpy as np
import pandas as pd
import uuid
import random
from faker import Faker
from datetime import datetime
from pyspark.sql import SparkSession


In [None]:
%run ./_resources/00_setup

## Step 1: Generate structured data for clustering

## Introduction to Demographic Clustering
Demographic clustering involves categorizing individuals based on characteristics like age, income, and education level. This process helps in understanding and targeting specific groups more effectively.

## Defining Clusters
We define several demographic clusters based on their characteristics:

- **Young Urban Professional**
- **Suburban Family-Oriented**
- **Retired Rural Dweller**
- **College Student**
- **High-Income Empty Nester**

## LA Tribes
Additionally, we focus on specific tribes in Los Angeles:

- **Young Professional Women - Urban Explorers**
- **Tech-Savvy Professionals - Silicon Beach Innovators**
- **Creative Entrepreneurs - Hollywood Creatives**
- **Eco-Conscious Millennials - Sustainable Lifestyle**
- **Luxury-Oriented Professionals - Upscale Lifestyles**
- **College Students - Campus Life**

## Code Implementation

Below is the Python code to generate these demographic clusters and tribes:



In [3]:
# Set seed
np.random.seed(42)

We need to use conditional probabilities in our data gen code in order to "force" the clusters for later.

In [4]:
# Initialize Faker for generating fake dates
fake = Faker()

# Define the tribe sizes and their descriptions
tribe_sizes = {
    "Young Professional Women - Urban Explorers": 250,
    "Tech-Savvy Professionals - Silicon Beach Innovators": 200,
    "Creative Entrepreneurs - Hollywood Creatives": 150,
    "Eco-Conscious Millennials - Sustainable Lifestyle": 180,
    "Luxury-Oriented Professionals - Upscale Lifestyles": 200,
    "College Students - Campus Life (UCLA / USC)": 300,
}

# Define the tribe sizes and their descriptions
tribe_sizes_location = {
    "Young Professional Women - Urban Explorers (Downtown LA / Arts District)": 250,
    "Tech-Savvy Professionals - Silicon Beach Innovators (Venice Beach / Santa Monica)": 200,
    "Creative Entrepreneurs - Hollywood Creatives (West Hollywood / Beverly Hills)": 150,
    "Eco-Conscious Millennials - Sustainable Lifestyle (Culver City / Echo Park)": 180,
    "Luxury-Oriented Professionals - Upscale Lifestyles (Beverly Hills / Bel Air)": 200,
    "College Students - Campus Life (UCLA / USC)": 300,
}
# Function to generate data for LA-based tribes
def generate_la_tribe_data(tribe_name, size):
    if "Urban Explorers" in tribe_name:
        locations = random.choices(["Downtown LA", "Arts District", "Silver Lake", "Echo Park"], k=size)
        incomes = np.random.normal(45000, 12000, size).clip(30000, 80000)
        occupations = random.choices(["Freelancer", "Graphic Designer", "Social Media Influencer"], k=size)
        ages = np.random.randint(25, 35, size)
        education_levels = np.random.choice(["Bachelor's", "Post Graduate"], size, p=[0.7, 0.3])
        relationship_statuses = random.choices(["Single", "Cohabiting"], k=size)
        number_dependants = [0] * size
        short_description = "Downtown LA professionals"

    elif "Silicon Beach Innovators" in tribe_name:
        locations = random.choices(["Venice Beach", "Santa Monica", "Playa Vista"], k=size)
        incomes = np.random.normal(100000, 25000, size).clip(70000, 200000)
        occupations = random.choices(["Software Engineer", "Product Manager", "UX/UI Designer"], k=size)
        ages = np.random.randint(25, 40, size)
        education_levels = np.random.choice(["Bachelor's", "Post Graduate"], size, p=[0.8, 0.2])
        relationship_statuses = random.choices(["Single", "Cohabiting"], k=size)
        number_dependants = [0] * size
        short_description = "Venice Beach tech industry leaders"

    elif "Hollywood Creatives" in tribe_name:
        locations = random.choices(["West Hollywood", "Beverly Hills", "Sunset Strip"], k=size)
        incomes = np.random.normal(120000, 30000, size).clip(80000, 250000)
        occupations = random.choices(["Filmmaker", "Designer", "Fashion Entrepreneur"], k=size)
        ages = np.random.randint(25, 45, size)
        education_levels = np.random.choice(["Bachelor's", "Post Graduate"], size, p=[0.6, 0.4])
        relationship_statuses = random.choices(["Single", "Cohabiting"], k=size)
        number_dependants = [0] * size
        short_description = "West Hollywood creatives"

    elif "Sustainable Lifestyle" in tribe_name:
        locations = random.choices(["Culver City", "Echo Park", "Silver Lake"], k=size)
        incomes = np.random.normal(55000, 15000, size).clip(40000, 100000)
        occupations = random.choices(["Sustainability Consultant", "Eco-Entrepreneur", "Health Coach"], k=size)
        ages = np.random.randint(25, 35, size)
        education_levels = np.random.choice(["Bachelor's", "Post Graduate"], size, p=[0.7, 0.3])
        relationship_statuses = random.choices(["Single", "Cohabiting"], k=size)
        number_dependants = [0] * size
        short_description = "Culver City eco-advocates"

    elif "Luxury-Oriented Professionals" in tribe_name:
        locations = random.choices(["Beverly Hills", "Bel Air", "Westwood"], k=size)
        incomes = np.random.normal(200000, 50000, size).clip(150000, 300000)
        occupations = random.choices(["Real Estate Executive", "Investment Banker", "Entertainment Lawyer"], k=size)
        ages = np.random.randint(35, 55, size)
        education_levels = np.random.choice(["Bachelor's", "Post Graduate"], size, p=[0.6, 0.4])
        relationship_statuses = random.choices(["Single", "Cohabiting"], k=size)
        number_dependants = [0] * size
        short_description = "Beverly Hills luxury professionals"

    elif "College Students" in tribe_name:
        locations = random.choices(["Westwood (UCLA)", "South LA (USC)", "Downtown LA"], k=size)
        incomes = np.random.normal(15000, 3000, size).clip(10000, 25000)
        occupations = random.choices(["Student", "Intern", "Part-time Worker"], k=size)
        ages = np.random.randint(18, 24, size)
        education_levels = ["Some College"] * size
        relationship_statuses = ["Single"] * size
        number_dependants = [0] * size
        short_description = "UCLA/USC students"

    return pd.DataFrame({
        "age": ages,
        "income": incomes.round(-3),
        "location": locations,
        "education": education_levels,
        "relationship_status": relationship_statuses,
        "number_dependants": number_dependants,
        "occupation": occupations,
        "tribe": tribe_name,
        "short_description": short_description
    })

# Generate data for all LA tribes
tribe_dfs = [generate_la_tribe_data(tribe, size) for tribe, size in tribe_sizes.items()]
demographic_df = pd.concat(tribe_dfs, ignore_index=True)

# Shuffle the dataset
demographic_df = demographic_df.sample(frac=1).reset_index(drop=True)

# Add UUIDs for each individual
demographic_df.insert(0, 'uuid', [str(uuid.uuid4()) for _ in range(len(demographic_df))])

# Add fake creation dates for each record (simulating when they joined the platform)
demographic_df['created_at'] = [
  fake.date_time_between(datetime(2023, 1, 1), datetime(2024, 12, 31)).strftime('%Y-%m-%d %H:%M:%S') for _ in range(len(demographic_df))]


In [5]:
# Display the first few records of the generated dataset

display(demographic_df)

Unnamed: 0,uuid,age,income,location,education,relationship_status,number_dependants,occupation,tribe,short_description,created_at
0,df96c50d-dd8e-4839-91d2-27122716ef32,36,80000.0,Sunset Strip,Post Graduate,Single,0,Filmmaker,Creative Entrepreneurs - Hollywood Creatives,West Hollywood creatives,2023-07-06 08:44:02
1,93f838e4-5846-44f9-a594-0da21dcb677c,34,96000.0,Playa Vista,Bachelor's,Single,0,Software Engineer,Tech-Savvy Professionals - Silicon Beach Innov...,Venice Beach tech industry leaders,2024-01-16 06:56:40
2,3e610e5e-edb3-4133-87e6-de1fca5cbbb1,23,16000.0,Downtown LA,Some College,Single,0,Intern,College Students - Campus Life (UCLA / USC),UCLA/USC students,2024-09-14 06:15:13
3,2fdfc1d2-d1b4-422e-90ad-52bfa91106bc,36,149000.0,Sunset Strip,Post Graduate,Single,0,Fashion Entrepreneur,Creative Entrepreneurs - Hollywood Creatives,West Hollywood creatives,2024-04-16 08:48:10
4,d37c808a-5fad-4a3c-9605-bb638c936de6,29,42000.0,Silver Lake,Post Graduate,Cohabiting,0,Health Coach,Eco-Conscious Millennials - Sustainable Lifestyle,Culver City eco-advocates,2024-07-26 21:00:47
...,...,...,...,...,...,...,...,...,...,...,...
1275,b6adaa01-d48b-417f-8e53-ca1c4ffc9137,23,15000.0,Westwood (UCLA),Some College,Single,0,Student,College Students - Campus Life (UCLA / USC),UCLA/USC students,2024-05-22 19:09:53
1276,934cc35a-5675-4d23-adf6-d14782c92936,29,43000.0,Silver Lake,Bachelor's,Cohabiting,0,Health Coach,Eco-Conscious Millennials - Sustainable Lifestyle,Culver City eco-advocates,2024-03-26 21:48:01
1277,163db4c9-7b41-4708-b5d2-c19256d4af2b,19,21000.0,South LA (USC),Some College,Single,0,Part-time Worker,College Students - Campus Life (UCLA / USC),UCLA/USC students,2024-11-17 11:40:01
1278,35da780d-836e-47ec-badf-4ba8200c4c1c,29,30000.0,Arts District,Post Graduate,Single,0,Social Media Influencer,Young Professional Women - Urban Explorers,Downtown LA professionals,2024-06-26 10:17:16


In [6]:
import pandas as pd
import plotly.express as px

# Set your Mapbox access token
px.set_mapbox_access_token("pk.eyJ1IjoibWFyY2Fwb2xhayIsImEiOiJjbThvajN1bHAwMHo1MmxzYXhjb3p3Zjh0In0.emuUuaDMhcM6R9-XtsvZWA")

# Define location coordinates
location_coords = {
    "Downtown LA": (34.0522, -118.2437),
    "Arts District": (34.0403, -118.2352),
    "Silver Lake": (34.0872, -118.2707),
    "Echo Park": (34.0782, -118.2606),
    "Venice Beach": (33.9850, -118.4695),
    "Santa Monica": (34.0195, -118.4912),
    "Playa Vista": (33.9754, -118.4208),
    "West Hollywood": (34.0900, -118.3617),
    "Beverly Hills": (34.0736, -118.4004),
    "Sunset Strip": (34.0928, -118.3854),
    "Culver City": (34.0211, -118.3965),
    "Bel Air": (34.1000, -118.4614),
    "Westwood": (34.0561, -118.4290),
    "Westwood (UCLA)": (34.0689, -118.4452),
    "South LA (USC)": (34.0224, -118.2851)
}

# Add latitude and longitude to the dataframe
demographic_df['latitude'] = demographic_df['location'].map(lambda x: location_coords.get(x.split('(')[0].strip(), (None, None))[0])
demographic_df['longitude'] = demographic_df['location'].map(lambda x: location_coords.get(x.split('(')[0].strip(), (None, None))[1])

# Check if latitude and longitude columns were added correctly
if demographic_df[['latitude', 'longitude']].isnull().any().any():
    print("Warning: Some locations do not have coordinates in the dictionary.")
    missing_locations = set(demographic_df['location'].map(lambda x: x.split('(')[0].strip())) - set(location_coords.keys())
    print("Missing locations:", missing_locations)

demographic_df = demographic_df.dropna(subset=['latitude', 'longitude'])

fig = px.scatter_mapbox(
    demographic_df,
    lat="latitude",
    lon="longitude",
    color="tribe",
    size="income",
    hover_name="short_description",
    hover_data=["age", "occupation", "education"],
    zoom=10,
    height=600
)

fig.update_layout(
    mapbox_style="mapbox://styles/mapbox/streets-v12",
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
    legend=dict(
        orientation="h",  # Horizontal orientation
        yanchor="bottom",  # Position at the bottom
        y=1.02,  # Slightly above the bottom
        xanchor="right",  # Align to the right
        x=1,  # Position at the right edge
        font=dict(size=8)  # Reduce font size
    )
)

fig.show()


Missing locations: {'South LA'}


  fig = px.scatter_mapbox(


## Step 2: Generate social media posts

In [7]:
fake = Faker()

# Updated Tribes based on your focus
tribes = {
    "Young Professional Women - Urban Explorers": {
        "products": ["travel bag", "stylish sunglasses", "compact camera", "smartphone", "fitness tracker"],
        "emotions": ["excited", "curious", "adventurous", "relaxed"]
    },
    "Tech-Savvy Professionals - Silicon Beach Innovators": {
        "products": ["smartphone", "laptop", "smartwatch", "tablet", "wireless earbuds"],
        "emotions": ["excited", "satisfied", "curious", "innovative"]
    },
    "Creative Entrepreneurs - Hollywood Creatives": {
        "products": ["designer laptop", "high-end camera", "smartphone", "creative tools", "luxury watch"],
        "emotions": ["inspired", "excited", "content", "creative"]
    },
    "Eco-Conscious Millennials - Sustainable Lifestyle": {
        "products": ["reusable water bottle", "organic skincare", "bamboo toothbrush", "electric bike", "solar-powered charger"],
        "emotions": ["satisfied", "relaxed", "eco-friendly", "content"]
    },
    "Luxury-Oriented Professionals - Upscale Lifestyles": {
        "products": ["luxury watch", "high-end camera", "luxury car", "premium wine", "gourmet food subscription"],
        "emotions": ["satisfied", "relaxed", "luxurious", "content"]
    },
    "College Students - Campus Life (UCLA / USC)": {
        "products": ["backpack", "laptop", "coffee maker", "textbooks", "headphones"],
        "emotions": ["excited", "curious", "satisfied", "motivated"]
    }
}


# Get random sample from demographic data (assuming `demographic_df` is already created)
sampled_df = demographic_df.sample(n=100).reset_index(drop=True)

# Generate combinations for 100 unique posts
combinations = []
unique_combinations = set()

while len(unique_combinations) < 100:
    # Select a tribe randomly from the dictionary
    tribe_name = random.choice(list(tribes.keys()))
    
    # Select a random author_id from the demographic data based on the tribe (segment)
    author_id = demographic_df[demographic_df["tribe"] == tribe_name]["uuid"].sample(1).values[0]
    
    # Choose a product and emotion from the tribe
    product = random.choice(tribes[tribe_name]["products"])
    emotion = random.choice(tribes[tribe_name]["emotions"])
    
    # Create a tuple to check for uniqueness
    combination_tuple = (tribe_name, product, emotion)
    
    # Add only if the combination is unique
    if combination_tuple not in unique_combinations:
        unique_combinations.add(combination_tuple)
        combinations.append({
            "author_id": author_id,
            "tribe": tribe_name,
            "product": product,
            "emotion": emotion
        })

combinations_df = pd.DataFrame(combinations)

In [8]:
combinations_sdf = spark.createDataFrame(combinations_df)

In [9]:
display(combinations_sdf.groupBy("tribe").count())

Unnamed: 0,tribe,count
0,Eco-Conscious Millennials - Sustainable Lifestyle,15
1,Tech-Savvy Professionals - Silicon Beach Innovators,17
2,College Students - Campus Life (UCLA / USC),20
3,Creative Entrepreneurs - Hollywood Creatives,17
4,Luxury-Oriented Professionals - Upscale Lifestyles,17
5,Young Professional Women - Urban Explorers,14


In [10]:
# Creat temp view for AI_QUERY
combinations_sdf.createOrReplaceTempView("sampled_audience")

In [11]:
# Create the SQL query for generating social media posts
sql_query = """
CREATE OR REPLACE TEMP VIEW sampled_audience_posts AS
SELECT
    author_id,
    tribe,
    product,
    emotion,
    AI_QUERY(
        "databricks-meta-llama-3-3-70b-instruct", 
        CONCAT(
            'Generate a realistic social media post from a consumer who recently purchased a ',
            product, 
            ' from the perspective of a ', tribe, 
            ' who is ', emotion, 
            ' about the product. The post should reflect their genuine experience, including specific details about the product\'s features, performance, and how it fits into their lifestyle. Maintain a conversational and engaging tone, similar to how people naturally write on social media. Optionally, include a hashtag or emoji for authenticity. Don\'t explicitly mention the tribe or that you are an AI assistant. Remove quotation marks.'
        ) AS post
FROM sampled_audience
"""

In [12]:
posts_df = spark.sql("SELECT * FROM sampled_audience").toPandas()
display(posts_df)

Unnamed: 0,author_id,tribe,product,emotion
0,8c60ff26-5f37-44e7-99fb-4d2b270ceae4,Eco-Conscious Millennials - Sustainable Lifestyle,bamboo toothbrush,satisfied
1,483089aa-cda3-4609-8704-71bc91faa6e5,Eco-Conscious Millennials - Sustainable Lifestyle,organic skincare,satisfied
2,eecb7210-aaca-4915-a3b4-8d3e4abfa5f9,Tech-Savvy Professionals - Silicon Beach Innov...,smartphone,excited
3,567eca8b-78d7-4b3a-a704-9839511269e1,College Students - Campus Life (UCLA / USC),textbooks,curious
4,37296c8e-cef3-41ad-8472-8d6f27ee94c2,College Students - Campus Life (UCLA / USC),textbooks,satisfied
...,...,...,...,...
95,de4e6ae7-c64a-44f3-b636-3ae099c66939,Young Professional Women - Urban Explorers,travel bag,excited
96,d06e1276-24f1-4b67-bbcc-71a1873eef6b,Eco-Conscious Millennials - Sustainable Lifestyle,electric bike,satisfied
97,3b61da9c-96d5-45da-b3c4-4ba788698526,Luxury-Oriented Professionals - Upscale Lifest...,gourmet food subscription,luxurious
98,779f4e35-fca3-4da1-9d9b-bec4c89c4ecc,Creative Entrepreneurs - Hollywood Creatives,luxury watch,excited


## Step 3: Generate Ad Campaigns

In [13]:
# Add fake creation dates for the posts
posts_df['created_at'] = [fake.date_time_between(datetime(2023, 1, 1), datetime(2024, 12, 31)).strftime('%Y-%m-%d %H:%M:%S') for _ in range(len(posts_df))]

# Optionally: Save posts to a JSON or database
posts_df.to_json('social_media_posts.json', orient='records')

In [14]:
# Generate Ad Campaigns
campaigns = []
for tribe, details in tribes.items():
    for product in details["products"]:
        campaign = {
            "campaign_id": f"AD-{random.randint(1000,9999)}",
            "segment": tribe,
            "product": product,
            "tone": random.choice(details["emotions"]),
            "cta": random.choice(["Shop now", "Learn more", "Get yours", "Discover"]),
            "impressions": random.randint(10000, 500000),
            "ctr": round(random.uniform(0.5, 5.0), 2)
        }
        campaigns.append(campaign)

campaigns_df = pd.DataFrame(campaigns)

# Add campaign dates
campaigns_df['start_date'] = [fake.date_between(datetime(2023,1,1), datetime(2024,1,1)) for _ in range(len(campaigns_df))]
campaigns_df['end_date'] = [fake.date_between(datetime(2024,1,1), datetime(2024,12,31)) for _ in range(len(campaigns_df))]

# Create Spark DataFrame for campaigns
campaigns_sdf = spark.createDataFrame(campaigns_df)
campaigns_sdf.createOrReplaceTempView("campaigns")

# Generate AI-optimized ad copies
performance_query = """
CREATE OR REPLACE TEMP VIEW campaigns_performance AS
SELECT
    campaign_id,
    segment,
    product,
    AI_QUERY(
        "databricks-meta-llama-3-3-70b-instruct", 
        CONCAT(
            'Create a digital ad for ', product, 
            ' targeting ', segment, 
            ' with ', tone, ' tone. Include: ',
            '- Key product benefits\n',
            '- Lifestyle connection\n',
            '- Emoji if appropriate\n',
            '- CTA: ', cta, '\n',
            '- Max 200 characters\n',
            '- No hashtags\n',
            '- Natural conversational style'
        )
    ) AS optimized_ad_copy,
    impressions,
    ctr
FROM campaigns
"""

spark.sql(performance_query)
campaigns_performance_df = spark.sql("SELECT * FROM campaigns_performance").toPandas()

# Add performance metrics
campaigns_performance_df['estimated_clicks'] = (campaigns_performance_df['impressions'] * 
                                               campaigns_performance_df['ctr']/100).astype(int)

# Display results
print("Generated Social Media Posts:")
display(posts_df)

print("\nAI-Optimized Campaign Performance:")
display(campaigns_performance_df[['campaign_id', 'segment', 'optimized_ad_copy', 
                                'impressions', 'ctr', 'estimated_clicks']])

# Save to JSON
posts_df.to_json('social_media_posts.json', orient='records', indent=2)
campaigns_performance_df.to_json('ai_optimized_campaigns.json', orient='records', indent=2)

print("Data saved to 'social_media_posts.json' and 'ai_optimized_campaigns.json'")

Generated Social Media Posts:


Unnamed: 0,author_id,tribe,product,emotion,created_at
0,8c60ff26-5f37-44e7-99fb-4d2b270ceae4,Eco-Conscious Millennials - Sustainable Lifestyle,bamboo toothbrush,satisfied,2024-03-12 09:54:47
1,483089aa-cda3-4609-8704-71bc91faa6e5,Eco-Conscious Millennials - Sustainable Lifestyle,organic skincare,satisfied,2023-02-13 00:15:58
2,eecb7210-aaca-4915-a3b4-8d3e4abfa5f9,Tech-Savvy Professionals - Silicon Beach Innov...,smartphone,excited,2024-10-21 22:53:47
3,567eca8b-78d7-4b3a-a704-9839511269e1,College Students - Campus Life (UCLA / USC),textbooks,curious,2023-08-30 10:22:49
4,37296c8e-cef3-41ad-8472-8d6f27ee94c2,College Students - Campus Life (UCLA / USC),textbooks,satisfied,2023-06-17 03:43:49
...,...,...,...,...,...
95,de4e6ae7-c64a-44f3-b636-3ae099c66939,Young Professional Women - Urban Explorers,travel bag,excited,2024-04-17 10:08:10
96,d06e1276-24f1-4b67-bbcc-71a1873eef6b,Eco-Conscious Millennials - Sustainable Lifestyle,electric bike,satisfied,2023-05-13 08:26:34
97,3b61da9c-96d5-45da-b3c4-4ba788698526,Luxury-Oriented Professionals - Upscale Lifest...,gourmet food subscription,luxurious,2023-06-04 16:55:49
98,779f4e35-fca3-4da1-9d9b-bec4c89c4ecc,Creative Entrepreneurs - Hollywood Creatives,luxury watch,excited,2024-02-26 22:10:03



AI-Optimized Campaign Performance:


Unnamed: 0,campaign_id,segment,optimized_ad_copy,impressions,ctr,estimated_clicks
0,AD-5124,Young Professional Women - Urban Explorers,"""Ready for your next urban escape? 💼🗺️ Our tra...",38066,2.26,860
1,AD-6213,Young Professional Women - Urban Explorers,"""Explore the city in style! 💃 Our sunglasses o...",171145,0.74,1266
2,AD-2972,Young Professional Women - Urban Explorers,"""Capture city vibes 📸 with our compact camera!...",197423,1.33,2625
3,AD-6868,Young Professional Women - Urban Explorers,"""Unleash your city adventures with our smartph...",167547,0.64,1072
4,AD-2068,Young Professional Women - Urban Explorers,"""Track your urban adventures 🏃‍♀️! Monitor ste...",465213,2.61,12142
5,AD-8596,Tech-Savvy Professionals - Silicon Beach Innov...,"""Upgrade your mobile experience 📈! Our smartph...",345380,1.72,5940
6,AD-9611,Tech-Savvy Professionals - Silicon Beach Innov...,"""Upgrade your workflow 🚀 with our cutting-edge...",420701,1.65,6941
7,AD-4428,Tech-Savvy Professionals - Silicon Beach Innov...,"""Upgrade your hustle with our smartwatch! 🚀 Tr...",150041,2.13,3195
8,AD-1530,Tech-Savvy Professionals - Silicon Beach Innov...,"""Upgrade your workflow 📊 with our latest table...",320915,4.28,13735
9,AD-8982,Tech-Savvy Professionals - Silicon Beach Innov...,"""Upgrade your commute! 💻 Wireless earbuds with...",126590,4.41,5582


Data saved to 'social_media_posts.json' and 'ai_optimized_campaigns.json'


### Write social media posts to volume JSON and save demographic + campaign tables

In [16]:
# Define the path to save social media feed
catalog = "marca_tribes"
schema = "ai_audience_segments"
socials_volume = "social_media_feed"
vol_social_media_feed = f"/mnt/{catalog}/{schema}/{socials_volume}/posts.json"

# Check if the directory exists, and create it if not
dbutils.fs.mkdirs(f"/mnt/{catalog}/{schema}/{socials_volume}")

# Convert the Pandas DataFrame to a Spark DataFrame if necessary
if isinstance(posts_df, pd.DataFrame):
    posts_df = spark.createDataFrame(posts_df)

# Save the social media posts DataFrame to the volume as a JSON file
posts_df.write.json(vol_social_media_feed, mode='overwrite')

In [21]:
# Write demographic data to UC table dropping segment
demographic_sdf = spark.createDataFrame(demographic_df)
demographic_sdf = demographic_sdf.drop("segment")

# Define the catalog and schema
catalog = "marca_tribes"
schema = "ai_audience_segments"
table_name = "audience_demographic"

# Save the DataFrame as a Delta table in Unity Catalog
demographic_sdf.write.format("delta").mode("overwrite").saveAsTable(f"{catalog}.{schema}.{table_name}")

In [None]:
# Write campaigns data to UC table
campaigns_df = spark.createDataFrame(campaigns_df)

# Define the catalog and schema
catalog = "marca_tribes"
schema = "ai_audience_segments"
table_name = "campaigns_performance"

# Save the DataFrame as a Delta table in Unity Catalog
campaigns_df.write.format("delta").mode("overwrite").saveAsTable(f"{catalog}.{schema}.{table_name}")