# Audience Segmentation

In [0]:
# Optional dependency pin, left disabled.
# NOTE(review): presumably a workaround for a threadpoolctl / scikit-learn incompatibility on
# some runtimes — confirm whether it is still needed before re-enabling.
# !pip install threadpoolctl=="3.1.0"

In [0]:
# Restart the notebook's Python process (Databricks utility), e.g. so that freshly installed
# libraries are picked up. This clears Python state, which is why it runs before the imports.
dbutils.library.restartPython()

In [0]:
import matplotlib.pyplot as plt
import mlflow
import pandas as pd
import pyspark.sql.functions as F

from openai import OpenAI
import os
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [0]:
# Unity Catalog namespace (catalog, schema) used to locate the tables read by this notebook
catalog, schema = "jack_sandom", "ai_audience_segments"

## Step 1: Build clustering model (KMeans)

In [0]:
# Load the demographic table and pull it into pandas on the driver,
# since the scikit-learn steps below operate on an in-memory frame.
demographic_table = f"{catalog}.{schema}.audience_demographic"
demographic_df = spark.read.table(demographic_table).toPandas()

display(demographic_df)

In [0]:
# Columns fed to the clustering model, grouped by the preprocessing they need
numerical_features = ["age", "income", "number_dependants"]
categorical_features = ["location", "education", "relationship_status", "occupation"]

# Standardise the numeric columns and one-hot encode the categorical ones;
# categories not seen during fitting are ignored at transform time.
numeric_step = ("num", StandardScaler(), numerical_features)
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Feature matrix used for the elbow analysis
X = preprocessor.fit_transform(demographic_df)

# Fit one KMeans model per candidate k and record its inertia
# (fit() returns the fitted estimator, so inertia_ can be read off directly).
k_range = range(2, 8)
inertias = [
    KMeans(n_clusters=k, random_state=42, n_init="auto").fit(X).inertia_
    for k in k_range
]

# Draw the elbow curve: inertia against the number of clusters
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(k_range, inertias, marker="o")
ax.set_xlabel("Number of Clusters (k)")
ax.set_ylabel("Inertia")
ax.set_title("Elbow Method for Optimal k")

# Write the figure to a local file so it can be attached to an MLflow run as an artifact
plot_filename = "elbow_plot.png"
fig.savefig(plot_filename)

plt.show()

# End the active MLflow run, if there is one
mlflow.end_run()

### Log optimal K based on elbow method

In [0]:
# Number of clusters chosen from the elbow curve
optimal_k = 5

# Single pipeline: preprocessing followed by KMeans, so raw demographic rows can be
# passed straight to fit/predict.
final_kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init="auto")
kmeans_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("kmeans", final_kmeans),
    ]
)

In [0]:
# Final training run using the chosen number of clusters
with mlflow.start_run(run_name="KMeans_clustering_optimal"):
    # Train the full pipeline, then assign every row to a cluster
    kmeans_pipeline.fit(demographic_df)
    labels = kmeans_pipeline.predict(demographic_df)

    # Score the clustering in the preprocessed feature space produced by the pipeline
    transformed_features = kmeans_pipeline.named_steps["preprocessor"].transform(demographic_df)
    silhouette = silhouette_score(transformed_features, labels)

    # Record the chosen k together with the elbow plot
    mlflow.log_param("optimal_n_clusters", optimal_k)
    mlflow.log_artifact(plot_filename)

    # Record the feature lists and the clustering quality metric
    mlflow.log_param("numerical_features", numerical_features)
    mlflow.log_param("categorical_features", categorical_features)
    mlflow.log_metric("silhouette_score", silhouette)

    # Store the fitted pipeline as a model artifact of this run
    mlflow.sklearn.log_model(kmeans_pipeline, "kmeans_model")

    print(f"Silhouette score: {silhouette}")

### Generate segments

In [0]:
# Attach the predicted cluster id to every row, then hand the frame back to Spark
demographic_df = demographic_df.assign(cluster=labels)
demographic_sdf = spark.createDataFrame(demographic_df)

display(demographic_sdf)

In [0]:
def _median_of(column):
  """Approximate median of `column`, aliased as `median_<column>`."""
  return F.expr(f"percentile_approx({column}, 0.5)").alias(f"median_{column}")


def _mode_of(column):
  """Mode of `column`, aliased as `mode_<column>`."""
  return F.expr(f"mode() within group (order by {column})").alias(f"mode_{column}")


# Summarise each cluster with the median of its numeric attributes and the mode of its
# categorical ones, to understand who is in it.
cluster_aggregates = [
  _median_of("age"),
  _median_of("income"),
  _mode_of("location"),
  _mode_of("education"),
  _mode_of("relationship_status"),
  _median_of("number_dependants"),
  _mode_of("occupation"),
]
segment_summary_sdf = (
  demographic_sdf
    .groupby("cluster")
    .agg(*cluster_aggregates)
    .orderBy("cluster")
)

display(segment_summary_sdf)

In [0]:
# Assign a human-readable name to each cluster.
# The conditions reference the summary frame's own `cluster` column by name. Referencing it
# through another DataFrame (demographic_sdf.cluster) only resolves via shared lineage and
# raises as soon as that frame no longer carries a `cluster` column.
# NOTE: the names are tied to the cluster ids 0-4; any other id gets a null segment.
cluster_id = F.col("cluster")
segment_summary_sdf = segment_summary_sdf.withColumn(
  "segment",
  F.when(cluster_id == 0, "Retired Rural Dweller")
   .when(cluster_id == 1, "College Student")
   .when(cluster_id == 2, "High-Income Empty Nester")
   .when(cluster_id == 3, "Suburban Family-Oriented")
   .when(cluster_id == 4, "Young Urban Professional")
)

# Attach the segment name to every individual and drop the raw cluster id
demographic_sdf = (
  demographic_sdf
    .join(segment_summary_sdf.select("cluster", "segment"), "cluster")
    .drop("cluster")
)

In [0]:
# Show the per-individual data
display(demographic_sdf)

## Step 2: Use LLM to generate customer profiles

In [0]:
# Get social media posts data from the Unity Catalog volume.
# The path is built from the configured catalog/schema rather than repeating them as
# literals, so changing the configuration redirects this read as well.
uc_volume_path = f"/Volumes/{catalog}/{schema}/social_media_feed/posts.json"
posts_sdf = spark.read.json(uc_volume_path)

display(posts_sdf)

In [0]:
# Match every individual to the posts they authored; the left join keeps individuals
# that have no posts.
author_matches_person = demographic_sdf.uuid == posts_sdf.author_id
audience_sdf = demographic_sdf.join(posts_sdf, author_matches_person, how="left")

# Collapse all posts of a segment into one text blob, posts separated by a blank line
all_posts_text = F.concat_ws("\n\n", F.collect_list("post")).alias("posts")
segment_posts_sdf = audience_sdf.groupBy("segment").agg(all_posts_text)

# Add the aggregated posts to the per-segment summary
segment_summary_sdf = segment_summary_sdf.join(segment_posts_sdf, "segment")

In [0]:
# Show the per-segment summary, which now includes the aggregated posts
display(segment_summary_sdf)

### Create profiles

In [0]:

def create_prompt(segment, age, income, location, education, dependants, occupation, social_posts):
    """Build the LLM prompt that asks for a customer persona of one segment.

    Args:
        segment: Segment name; the prompt asks the model to use it as the persona name.
        age: Median age of the segment.
        income: Median income of the segment.
        location: Most common location in the segment.
        education: Most common education level in the segment.
        dependants: Median number of dependants in the segment.
        occupation: Most common occupation in the segment.
        social_posts: Text of the segment's social media posts, inserted verbatim.

    Returns:
        The prompt as a single string.
    """
    # The labels say "Median" because that is the statistic the notebook computes, and the
    # instructions only mention inputs that are actually included in the prompt.
    prompt = f"""
        You are an expert marketing analyst tasked with creating a detailed customer persona. Use the provided demographic information and social media posts to describe the customer segment’s characteristics. Use the segment as the persona name.

        ### Demographic Information:
        - Segment: {segment}
        - Median Age: {age}
        - Median Income: {income}
        - Location: {location}
        - Education: {education}
        - Dependants: {dependants}
        - Occupation: {occupation}

        ### Aggregated Social Media Posts:
        "{social_posts}"

        ### Instructions:
        Based on the demographic data and social media content:
        1. Describe the segment’s **lifestyle**, **values**, and **daily habits**.
        2. Highlight their **interests**, **opinions**, and **brand sentiments**.
        3. Identify their **media consumption habits** and **preferred communication channels**.
        4. Outline their **purchase motivations** and **buying behaviors**.
        5. Summarize any **pain points**, **common complaints**, or **product preferences** mentioned.

        ### Output Format:
        - **Persona Name:**  
        - **Overview:**  
        - **Lifestyle & Values:**  
        - **Interests & Social Media Behavior:**  
        - **Buying Motivations & Product Preferences:**  
        - **Challenges & Pain Points:**  
        - **Preferred Communication Channels:**  
        """

    return prompt

In [0]:
# Authenticate against model serving with the API token of the current notebook session
notebook_context = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
DATABRICKS_TOKEN = notebook_context.apiToken().get()

# OpenAI-compatible client pointed at the workspace's serving endpoints.
# NOTE(review): the workspace host is hard-coded while the token comes from the current
# session — confirm the notebook is only ever run in this workspace.
client = OpenAI(
  base_url="https://e2-demo-field-eng.cloud.databricks.com/serving-endpoints",
  api_key=DATABRICKS_TOKEN,
)

def generate_profile(prompt):
  """Send `prompt` to the chat model and return the text of its first reply.

  Uses the module-level `client`; the request consists of a fixed system message
  followed by `prompt` as the user message.
  """
  system_message = {"role": "system", "content": "You are an AI assistant."}
  user_message = {"role": "user", "content": prompt}

  response = client.chat.completions.create(
    messages=[system_message, user_message],
    model="databricks-meta-llama-3-3-70b-instruct",
  )

  first_choice = response.choices[0]
  return first_choice.message.content

In [0]:
# LLM-generated profile text, keyed by segment name
profiles = {}

# One prompt and one model call per row of the segment summary
for segment_row in segment_summary_sdf.collect():
  segment_prompt = create_prompt(
    segment=segment_row.segment,
    age=segment_row.median_age,
    income=segment_row.median_income,
    location=segment_row.mode_location,
    education=segment_row.mode_education,
    dependants=segment_row.median_number_dependants,
    occupation=segment_row.mode_occupation,
    social_posts=segment_row.posts,
  )
  profiles[segment_row.segment] = generate_profile(segment_prompt)

### Save profiles

In [0]:
# Save profiles to the UC volume, one text file per segment.
# The directory is derived from the configured catalog/schema instead of repeating them
# as literals, so it follows any change to that configuration.
profiles_dir = f"/Volumes/{catalog}/{schema}/profiles"

for segment, profile in profiles.items():
  # Explicit UTF-8: model output can contain non-ASCII characters, and the default
  # encoding of open() depends on the environment.
  with open(f"{profiles_dir}/{segment}.txt", "w", encoding="utf-8") as f:
    f.write(profile)