In [8]:
import pandas as pd
import json
import os
import numpy as np
from pathlib import Path
from dotenv import load_dotenv
from openai import OpenAI
import anthropic
import google.generativeai as genai
import time
import re

# Pull API credentials from the local .env file into the environment
load_dotenv()

# Provider clients, configured from environment variables
openai_client = OpenAI()  # Will automatically use OPENAI_API_KEY env var
anthropic_client = anthropic.Anthropic()  # Will automatically use ANTHROPIC_API_KEY env var
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))

# Models under comparison, keyed by the label used in reports.
# Each entry carries the provider's model identifier and the provider tag
# that call_api() dispatches on.
models = dict(
    ChatGPT=dict(name="gpt-4o-mini", provider="openai"),
    Claude=dict(name="claude-3-5-sonnet-20241022", provider="anthropic"),
    Gemini=dict(name="gemini-1.5-pro", provider="google"),
)

# Root folder for everything the experiments write to disk
output_dir = Path("persona_experiments")
output_dir.mkdir(exist_ok=True)

####################################
# Data Loading Functions
####################################

def load_persona_data(file_path="personalization_data.csv", num_personas=5):
    """Load user personas from a CSV file.

    Reads the first ``num_personas`` rows of ``file_path`` and turns each row
    into a persona dictionary with the keys ``user_id``, ``linkedin_headline``,
    ``liked_tweets``, ``events``, ``dwell_times``, ``sku_views`` and
    ``categories``.

    Empty CSV cells (which pandas reports as NaN) are converted to ``""`` for
    the headline, event and category fields so that later string operations
    on those fields do not fail.

    Args:
        file_path: Path of the persona CSV.
        num_personas: Maximum number of rows to load.

    Returns:
        A list of persona dictionaries, or ``[]`` when the file is missing.
    """
    import ast  # local import: only needed to parse list literals below

    def blank_if_missing(value):
        """Map a missing cell (None/NaN) to "", leave other values alone."""
        if isinstance(value, str):
            return value
        return "" if pd.isna(value) else value

    def parse_liked_tweets(raw):
        """Turn the ``liked_tweets`` cell into a list of tweet strings."""
        if not isinstance(raw, str) or not raw.strip():
            return []

        # Preferred path: the cell is a Python list literal such as
        # "['first tweet', 'second tweet']" (any spacing, any quote style).
        try:
            parsed = ast.literal_eval(raw.strip())
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            parsed = None
        if isinstance(parsed, (list, tuple)):
            tweets = (str(item).strip() for item in parsed)
            return [tweet for tweet in tweets if tweet]

        # Fallback for cells that are not valid literals (e.g. unescaped
        # apostrophes inside a tweet): keep the original ad-hoc splitting.
        cleaned = raw.replace("['", "").replace("']", "").replace('"', "")
        return [t.strip() for t in cleaned.split("','")]

    try:
        data_df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"File {file_path} not found")
        return []

    personas = []
    for idx, row in data_df.iloc[:num_personas].iterrows():
        personas.append({
            # Fall back to a 1-based row number when the column is absent
            "user_id": row.get("user_id", idx + 1),
            "linkedin_headline": blank_if_missing(row.get("linkedin_headline", "")),
            "liked_tweets": parse_liked_tweets(row.get("liked_tweets", "")),
            "events": [
                blank_if_missing(row.get(column, ""))
                for column in ("event_1", "event_2", "event_3")
            ],
            "dwell_times": parse_array_string(row.get("dwell_times", "")),
            "sku_views": parse_array_string(row.get("sku_views", "")),
            "categories": [
                blank_if_missing(row.get(column, ""))
                for column in ("cat_gt_1", "cat_gt_2")
            ],
        })

    return personas

def load_product_data(file_path="content_data.csv"):
    """Load product/campaign records from a CSV file.

    Each row becomes a dictionary with the keys ``campaign_id``, ``product``,
    ``theme``, ``value_prop``, ``target_audience``, ``spec_bullets`` and
    ``seo_keywords``.

    Empty text cells (NaN in pandas) become ``""``. The comma-separated
    ``spec_bullets`` and ``seo_keywords`` cells become lists of stripped,
    non-empty strings; blank entries produced by stray or trailing commas are
    dropped because an empty string is a substring of every blurb and would
    otherwise count as a feature mention during scoring.

    Args:
        file_path: Path of the product CSV.

    Returns:
        A list of product dictionaries, or ``[]`` when the file is missing.
    """
    def text_or_blank(value):
        """Map a missing cell (None/NaN) to "", leave other values alone."""
        if isinstance(value, str):
            return value
        return "" if pd.isna(value) else value

    def split_items(cell):
        """Split a comma-separated cell into its non-empty, stripped parts."""
        if not isinstance(cell, str):
            return []
        parts = (part.strip() for part in cell.split(","))
        return [part for part in parts if part]

    try:
        data_df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"File {file_path} not found")
        return []

    products = []
    for idx, row in data_df.iterrows():
        products.append({
            # Fall back to a 1-based row number when the column is absent
            "campaign_id": row.get("campaign_id", idx + 1),
            "product": text_or_blank(row.get("product", "")),
            "theme": text_or_blank(row.get("theme", "")),
            "value_prop": text_or_blank(row.get("value_prop", "")),
            "target_audience": text_or_blank(row.get("target_audience", "")),
            "spec_bullets": split_items(row.get("spec_bullets")),
            "seo_keywords": split_items(row.get("seo_keywords")),
        })

    return products

def parse_array_string(array_str):
    """Parse a string representation of a flat array into a list of values.

    The input is expected to look like ``"[1, 2.5, abc]"``; surrounding
    brackets are optional. Items are split on commas and converted as follows:

    * an item wrapped in matching single or double quotes is returned as a
      string with the quotes removed (it is never converted to a number);
    * an item that is a valid integer literal becomes an ``int`` (parsed
      directly, so large identifiers keep full precision);
    * any other numeric item is parsed as ``float`` and returned as ``int``
      when it has no fractional part, otherwise as ``float``;
    * everything else is returned as a stripped string.

    Blank items (e.g. from a trailing comma) are skipped.

    Args:
        array_str: The text to parse. Non-string input yields ``[]``.

    Returns:
        A list of ``int``, ``float`` and ``str`` values.
    """
    if not isinstance(array_str, str):
        return []

    inner = array_str.strip().strip("[]")

    result = []
    for raw_item in inner.split(","):
        item = raw_item.strip()
        if not item:
            continue

        # Quoted items are explicit strings: drop the quotes, keep the text
        if len(item) >= 2 and item[0] == item[-1] and item[0] in "'\"":
            result.append(item[1:-1])
            continue

        # Parse integers directly; going through float would round values
        # above 2**53.
        try:
            result.append(int(item))
            continue
        except ValueError:
            pass

        try:
            number = float(item)
        except ValueError:
            # Not numeric at all: keep as string
            result.append(item)
            continue

        result.append(int(number) if number.is_integer() else number)

    return result

####################################
# Prompt Generation
####################################

def create_persona_prompt(persona, product):
    """Create a prompt for generating a personalized product blurb.

    Args:
        persona: Dictionary with ``linkedin_headline`` (text, optionally
            ``"job title | industry | ..."``), ``liked_tweets`` (list) and
            ``categories`` (list).
        product: Dictionary with ``product``, ``theme``, ``value_prop``,
            ``target_audience`` and ``spec_bullets`` (list).

    Returns:
        The prompt text. Only the first three liked tweets and the first five
        spec bullets are included. Missing values (``None`` or NaN, as
        produced by empty CSV cells) are rendered as empty text, and blank
        categories are left out of the category list.
    """
    def as_text(value):
        """Return ``value`` as text, mapping None/NaN to an empty string."""
        if isinstance(value, str):
            return value
        # NaN is the only float that is not equal to itself
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    # Extract professional info from the LinkedIn headline. With a "|"
    # separator the first two segments are job title and industry; without
    # one, the whole headline is used as the job title.
    headline = as_text(persona["linkedin_headline"])
    if "|" in headline:
        segments = headline.split("|")
        job_title = segments[0].strip()
        industry = segments[1].strip()
    else:
        job_title = headline
        industry = ""

    # Format liked tweets for better context
    tweet_examples = "\n".join(f"- {as_text(tweet)}" for tweet in persona["liked_tweets"][:3])

    # Format product bullets
    product_bullets = "\n".join(f"- {as_text(bullet)}" for bullet in product["spec_bullets"][:5])

    # Drop blank categories so the list does not contain empty items
    categories = ", ".join(
        category for category in map(as_text, persona["categories"]) if category
    )

    prompt = (
        f"You are a personalization specialist for an e-commerce platform. Write a concise, "
        f"personalized product blurb (50-75 words) for the following user persona and product.\n\n"

        f"USER PERSONA:\n"
        f"Professional: {job_title}\n"
        f"Industry: {industry}\n"
        f"Interests based on social media activity:\n{tweet_examples}\n"
        f"Recently viewed categories: {categories}\n\n"

        f"PRODUCT INFORMATION:\n"
        f"Product Name: {as_text(product['product'])}\n"
        f"Theme: {as_text(product['theme'])}\n"
        f"Value Proposition: {as_text(product['value_prop'])}\n"
        f"Target Audience: {as_text(product['target_audience'])}\n"
        f"Key Features:\n{product_bullets}\n\n"

        f"INSTRUCTIONS:\n"
        f"1. Create a personalized product blurb that speaks directly to this specific persona\n"
        f"2. Use language and framing that will resonate with their professional background and interests\n"
        f"3. Highlight aspects of the product that would be most relevant to them\n"
        f"4. Keep the tone professional but engaging\n"
        f"5. Be concise (50-75 words)\n"
        f"6. Return ONLY the blurb text, nothing else\n"
    )

    return prompt

####################################
# API Calls
####################################

def call_api(prompt, model_info, max_retries=3):
    """Send ``prompt`` to the model described by ``model_info``, with retries.

    Args:
        prompt: The user prompt text.
        model_info: Dictionary with ``provider`` (``"openai"``,
            ``"anthropic"`` or ``"google"``) and ``name`` (the provider's
            model identifier).
        max_retries: Maximum number of attempts. Failed attempts are followed
            by an exponentially growing pause (1s, 2s, 4s, ...).

    Returns:
        The generated text, or the string
        ``"API Error: Could not generate blurb."`` when the provider is not
        supported or every attempt failed. This function does not raise for
        API failures.
    """
    provider = model_info["provider"]
    model_name = model_info["name"]
    failure_message = "API Error: Could not generate blurb."
    temperature = 0.7  # same sampling temperature for every provider

    # A misconfigured provider can never succeed, so fail fast instead of
    # sleeping through the retry/backoff loop.
    if provider not in ("openai", "anthropic", "google"):
        print(f"Error calling {provider} API: Unsupported provider: {provider}")
        return failure_message

    for attempt in range(max_retries):
        try:
            if provider == "openai":
                response = openai_client.chat.completions.create(
                    model=model_name,
                    messages=[{"role": "user", "content": prompt}],
                    temperature=temperature
                )
                return response.choices[0].message.content

            if provider == "anthropic":
                response = anthropic_client.messages.create(
                    model=model_name,
                    max_tokens=1024,
                    temperature=temperature,
                    messages=[{"role": "user", "content": prompt}]
                )
                return response.content[0].text

            # provider == "google"
            model = genai.GenerativeModel(model_name)
            response = model.generate_content(
                prompt,
                # Pass the temperature explicitly so Gemini is sampled like
                # the other two providers in the comparison.
                generation_config={"temperature": temperature}
            )
            return response.text

        except Exception as e:
            # Deliberately broad: each SDK raises its own exception types and
            # any failure here should lead to a retry, not a crash.
            print(f"Error calling {provider} API (attempt {attempt + 1}/{max_retries}): {str(e)}")
            if attempt < max_retries - 1:
                # Exponential backoff
                time.sleep(2 ** attempt)

    # All attempts failed (or max_retries was not positive)
    return failure_message

####################################
# Evaluation Metrics
####################################

def compute_blurb_metrics(blurb, persona, product):
    """Compute keyword-overlap metrics for a personalized blurb.

    All checks are case-insensitive substring tests against the blurb.

    Args:
        blurb: The generated blurb text.
        persona: Dictionary with ``linkedin_headline`` and ``liked_tweets``.
        product: Dictionary with ``product``, ``theme``, ``value_prop`` and
            ``spec_bullets``.

    Returns:
        A dictionary with:

        * ``word_count``: whitespace-separated word count of the blurb;
        * ``persona_elements``: 0-2, job title and industry found in the
          blurb (only evaluated when the headline contains ``"|"``);
        * ``interest_matches``: number of tweet keywords found in the blurb;
        * ``product_elements``: 0-3, product name, theme and value
          proposition found in the blurb;
        * ``feature_mentions``: number of spec bullets found in the blurb;
        * ``personalization_score`` and ``product_relevance_score``: the
          matches above divided by the number of things checked.

        If anything goes wrong, the same keys are returned with value 0 plus
        an ``error`` key holding the message, so callers can always read
        every metric.
    """
    def mentioned(term, text):
        """True when ``term`` is non-blank text that occurs in ``text``.

        Blank terms are rejected explicitly: an empty string is a substring
        of everything and would otherwise always count as a match.
        """
        return isinstance(term, str) and bool(term.strip()) and term.lower() in text

    try:
        # Lower-case the blurb once; every check below reuses it
        blurb_text = blurb.lower()
        word_count = len(blurb.split())

        # Persona terms: job title and industry from "title | industry"
        persona_elements = 0
        headline = persona["linkedin_headline"]
        if isinstance(headline, str) and "|" in headline:
            segments = headline.split("|")
            job_title = segments[0].strip()
            industry = segments[1].strip()
            persona_elements = sum(
                1 for term in (job_title, industry) if mentioned(term, blurb_text)
            )

        # Interests derived from liked tweets
        interest_keywords = extract_keywords_from_tweets(persona["liked_tweets"])
        interest_matches = sum(1 for keyword in interest_keywords if keyword.lower() in blurb_text)

        # Product terms: name, theme, value proposition
        product_elements = sum(
            1 for field in ("product", "theme", "value_prop")
            if mentioned(product[field], blurb_text)
        )

        # Product features; blank bullets are ignored in both the count and
        # the denominator
        bullets = [
            bullet for bullet in product["spec_bullets"]
            if isinstance(bullet, str) and bullet.strip()
        ]
        feature_count = sum(1 for bullet in bullets if bullet.lower() in blurb_text)

        if interest_keywords:
            personalization_score = (persona_elements + interest_matches) / (2 + len(interest_keywords))
        else:
            personalization_score = persona_elements / 2

        if bullets:
            product_relevance_score = (product_elements + feature_count) / (3 + len(bullets))
        else:
            product_relevance_score = product_elements / 3

        return {
            "word_count": word_count,
            "persona_elements": persona_elements,
            "interest_matches": interest_matches,
            "product_elements": product_elements,
            "feature_mentions": feature_count,
            "personalization_score": personalization_score,
            "product_relevance_score": product_relevance_score
        }
    except Exception as e:
        # Best-effort scoring: report the problem and return an all-zero
        # result with the full key set instead of aborting the experiment.
        print(f"Error computing blurb metrics: {str(e)}")
        return {
            "error": str(e),
            "word_count": 0,
            "persona_elements": 0,
            "interest_matches": 0,
            "product_elements": 0,
            "feature_mentions": 0,
            "personalization_score": 0,
            "product_relevance_score": 0
        }

def extract_keywords_from_tweets(tweets):
    """Return up to ten of the most frequent interest words in ``tweets``.

    The tweets are joined, lower-cased and split into word tokens. Tokens of
    three characters or fewer and common stop words are discarded. The
    remaining words are ranked by how often they occur, most frequent first;
    words with equal counts keep their order of first appearance.
    """
    if not tweets:
        return []

    # Common words that carry no interest signal
    ignored = {
        "a", "an", "the", "and", "in", "on", "at", "of", "for", "to",
        "is", "are", "this", "that", "with", "so", "just",
    }

    # Count every qualifying token, remembering first-seen order
    tally = {}
    for token in re.findall(r'\b\w+\b', " ".join(tweets).lower()):
        if len(token) <= 3 or token in ignored:
            continue
        tally[token] = tally.get(token, 0) + 1

    # Stable sort: ties stay in first-seen order
    ranked = sorted(tally, key=tally.get, reverse=True)
    return ranked[:10]  # top 10 keywords

####################################
# Main Experiment Functions
####################################

def run_personalized_blurbs_matrix_with_metrics():
    """Generate, score and save personalized blurbs for all products and personas.

    For every product and every entry in ``models`` a blurb is generated for
    each loaded persona (up to five), cleaned up and scored. Results are
    written below ``output_dir / "personalized_blurbs_matrix"``:

    * ``personalized_blurbs_matrix.csv``: one row per product/model with a
      blurb, headline and score columns per persona plus row averages;
    * ``detailed_metrics.csv``: one row per product/model/persona;
    * ``model_summary.csv`` and ``model_summary.json``: per-model averages.

    Returns:
        ``(matrix_results, detailed_metrics, model_summary)``, three lists of
        dictionaries matching the files above, or ``None`` when the persona
        or product data could not be loaded.
    """
    # Load data
    personas = load_persona_data(num_personas=5)
    products = load_product_data()

    if not personas or not products:
        print("Error: Could not load data files.")
        return None

    # Guarantees that every metric key read below exists, even if the scorer
    # hands back a partial result; one bad blurb must not abort the run.
    metric_defaults = {
        "word_count": 0,
        "persona_elements": 0,
        "interest_matches": 0,
        "product_elements": 0,
        "feature_mentions": 0,
        "personalization_score": 0,
        "product_relevance_score": 0,
    }

    matrix_results = []
    detailed_metrics = []
    num_personas = len(personas)

    # For each product and model, generate blurbs for all personas
    for product in products:
        for model_name, model_info in models.items():
            print(f"Generating blurbs for product: {product['product']} with model: {model_name}")

            # One wide row per product/model combination
            row = {
                "product": product["product"],
                "model": model_name,
            }
            persona_metrics = []

            for persona in personas:
                persona_key = f"persona_{persona['user_id']}"

                prompt = create_persona_prompt(persona, product)
                blurb = call_api(prompt, model_info)

                # Clean up the blurb: drop wrapping quotes, flatten newlines
                if blurb:
                    blurb = blurb.strip().strip('"').strip("'")
                    blurb = re.sub(r'\n+', ' ', blurb)
                else:
                    blurb = "Failed to generate blurb."

                metrics = {**metric_defaults, **compute_blurb_metrics(blurb, persona, product)}

                # Blurb, headline (for reference) and scores for this persona
                row[persona_key] = blurb
                row[f"{persona_key}_headline"] = persona["linkedin_headline"]
                row[f"{persona_key}_personalization_score"] = metrics["personalization_score"]
                row[f"{persona_key}_product_relevance_score"] = metrics["product_relevance_score"]

                # Detailed record for this persona-product-model combination
                persona_metrics.append({
                    "product": product["product"],
                    "model": model_name,
                    "persona_id": persona["user_id"],
                    "linkedin_headline": persona["linkedin_headline"],
                    "blurb": blurb,
                    "word_count": metrics["word_count"],
                    "persona_elements": metrics["persona_elements"],
                    "interest_matches": metrics["interest_matches"],
                    "product_elements": metrics["product_elements"],
                    "feature_mentions": metrics["feature_mentions"],
                    "personalization_score": metrics["personalization_score"],
                    "product_relevance_score": metrics["product_relevance_score"],
                    "combined_score": (metrics["personalization_score"] + metrics["product_relevance_score"]) / 2
                })

            # Row-level averages over the personas
            avg_personalization = sum(m["personalization_score"] for m in persona_metrics) / num_personas
            avg_product_relevance = sum(m["product_relevance_score"] for m in persona_metrics) / num_personas

            row["avg_personalization_score"] = avg_personalization
            row["avg_product_relevance_score"] = avg_product_relevance
            row["combined_score"] = (avg_personalization + avg_product_relevance) / 2

            matrix_results.append(row)
            detailed_metrics.extend(persona_metrics)

    # Create output directory (including the parent, should it be missing)
    task_dir = output_dir / "personalized_blurbs_matrix"
    task_dir.mkdir(parents=True, exist_ok=True)

    # Save the matrix and the detailed records as CSV
    pd.DataFrame(matrix_results).to_csv(task_dir / "personalized_blurbs_matrix.csv", index=False)
    pd.DataFrame(detailed_metrics).to_csv(task_dir / "detailed_metrics.csv", index=False)

    # Compute overall metrics by model
    model_summary = []
    for model_name in models:
        model_results = [r for r in detailed_metrics if r["model"] == model_name]
        if not model_results:
            continue

        model_summary.append({
            "model": model_name,
            "avg_personalization_score": float(np.mean([r["personalization_score"] for r in model_results])),
            "avg_product_relevance_score": float(np.mean([r["product_relevance_score"] for r in model_results])),
            "avg_combined_score": float(np.mean([r["combined_score"] for r in model_results])),
            "avg_word_count": float(np.mean([r["word_count"] for r in model_results])),
            "sample_count": len(model_results)
        })

    # Save model summary
    pd.DataFrame(model_summary).to_csv(task_dir / "model_summary.csv", index=False)

    # Also save as JSON for easier import in other systems
    with open(task_dir / "model_summary.json", "w", encoding="utf-8") as f:
        json.dump(model_summary, f, indent=2)

    print(f"Generated {len(matrix_results)} rows with blurbs for {len(personas)} personas.")
    print(f"Results saved to {task_dir}")

    # Print model summary
    print("\nModel Performance Summary:")
    for model in model_summary:
        print(f"{model['model']}:")
        print(f"  Personalization Score: {model['avg_personalization_score']:.3f}")
        print(f"  Product Relevance Score: {model['avg_product_relevance_score']:.3f}")
        print(f"  Combined Score: {model['avg_combined_score']:.3f}")
        print(f"  Average Word Count: {model['avg_word_count']:.1f}")

    return matrix_results, detailed_metrics, model_summary

####################################
# Main Function
####################################

def main():
    """Run the personalized product blurb matrix experiment with metrics.

    Verifies that both input CSV files exist in the current working
    directory, runs the experiment and prints whether it produced results.
    Returns ``None`` in every case.
    """
    print("Starting Personalized Product Blurb Matrix Experiment with Metrics...")

    # Check if data files exist
    for file_name in ("personalization_data.csv", "content_data.csv"):
        if not Path(file_name).exists():
            print(f"Error: {file_name} not found.")
            return

    # Run the experiment. The runner returns None when it could not load its
    # data, so check before unpacking the three result lists.
    outcome = run_personalized_blurbs_matrix_with_metrics()
    results = outcome[0] if outcome else None

    if results:
        print("\nExperiment complete! Matrix of personalized blurbs with metrics generated successfully.")
    else:
        print("\nError: Experiment failed to generate results.")

if __name__ == "__main__":
    main()

Starting Personalized Product Blurb Matrix Experiment with Metrics...
Generating blurbs for product: AetherGuard AI-Driven Smart Security Hub with model: ChatGPT
Generating blurbs for product: AetherGuard AI-Driven Smart Security Hub with model: Claude
Generating blurbs for product: AetherGuard AI-Driven Smart Security Hub with model: Gemini
Generating blurbs for product: FlexMaster Pro Adjustable Dumbbell Set with model: ChatGPT
Generating blurbs for product: FlexMaster Pro Adjustable Dumbbell Set with model: Claude
Generating blurbs for product: FlexMaster Pro Adjustable Dumbbell Set with model: Gemini
Generating blurbs for product: LuxeGlimmer Solar-Powered LED Statement Necklace with model: ChatGPT
Generating blurbs for product: LuxeGlimmer Solar-Powered LED Statement Necklace with model: Claude
Generating blurbs for product: LuxeGlimmer Solar-Powered LED Statement Necklace with model: Gemini
Generating blurbs for product: TruffleTwist Artisan Black Truffle Infused Olive Oil with m