In [4]:
import pandas as pd
import json
import os
from openai import OpenAI
import anthropic
import requests
import google.generativeai as genai
from pathlib import Path
from dotenv import load_dotenv

# Pull API keys from a local .env file into the process environment.
load_dotenv()

# Build one client per provider; credentials come from the environment.
openai_client = OpenAI()  # reads OPENAI_API_KEY itself
anthropic_client = anthropic.Anthropic()  # reads ANTHROPIC_API_KEY itself
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))
LLAMA_API_KEY = os.environ.get("LLAMA_API_KEY")

# DeepSeek exposes an OpenAI-compatible API, so the OpenAI client is reused
# with a different base URL.
deepseek_client = OpenAI(
    api_key=os.environ.get("DEEPSEEK_API_KEY"),
    base_url="https://api.deepseek.com/v1",
)

# Per-user input data for the experiment.
data_df = pd.read_csv("personalization_data.csv")

# Categories the models may pick from (placeholder — replace with the real
# catalog before running the experiment).
catalog = [
    "Electronics",
    "Fashion",
    "Home & Garden",
    "Sports",
    "Books",
    "Beauty",
    "Toys",
    "Food & Grocery",
    "Health",
    "Automotive",
]

def create_prompt(row):
    """Build the category-recommendation prompt for one user.

    Args:
        row: Mapping-like record (for example a pandas row) providing the
            keys ``"linkedin_headline"`` and ``"liked_tweets"``.
            ``liked_tweets`` may be a JSON-encoded list, a JSON-encoded
            string, a ``;``-delimited string, an actual list/tuple, or a
            missing value (``None``/NaN).

    Returns:
        The prompt string: the user data serialised as JSON (with exactly
        three ``liked_tweets`` entries, padded with ``""`` when fewer are
        available), the instruction text, and the module-level ``catalog``
        joined with ``", "``.
    """
    raw = row["liked_tweets"]

    if isinstance(raw, str):
        try:
            parsed = json.loads(raw)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            parsed = None

        if isinstance(parsed, list):
            liked_tweets = parsed
        elif isinstance(parsed, str):
            # A single JSON-encoded tweet.
            liked_tweets = [parsed]
        else:
            # Not JSON (or JSON that is not a list/string): fall back to a
            # delimiter split. NOTE(review): ';' is a guess at the column
            # format — confirm against the actual data.
            liked_tweets = raw.split(';')
    elif isinstance(raw, (list, tuple)):
        liked_tweets = list(raw)
    else:
        # Missing value (None / NaN) or another non-text type: no tweets.
        liked_tweets = []

    # Always send exactly three entries: truncate, or pad with blanks.
    liked_tweets = (liked_tweets + [""] * 3)[:3]

    user_data = {
        "linkedin_headline": row["linkedin_headline"],
        "liked_tweets": liked_tweets
    }

    prompt = (
        f'json {json.dumps(user_data)}\n'
        f'"From the catalog below, pick the top two categories this user will most likely engage with '
        f'and justify each in ≤25 words. Return JSON: {{ "picks": ["...", "..."], "why": ["...", "..."] }}"\n'
        f'Catalog: {", ".join(catalog)}'
    )
    return prompt

def call_chatgpt(prompt):
    """Ask the OpenAI chat model for a JSON answer to ``prompt``.

    The request is a single user message to ``gpt-4o-mini`` with
    ``response_format`` set to ``json_object``.

    Returns:
        The reply decoded with ``json.loads``, or ``None`` when the request
        or the decoding raises (the error is printed).
    """
    request = {
        "model": "gpt-4o-mini",
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.7,
        "response_format": {"type": "json_object"},
    }
    try:
        completion = openai_client.chat.completions.create(**request)
        reply_text = completion.choices[0].message.content
        return json.loads(reply_text)
    except Exception as e:
        print(f"ChatGPT error: {e}")
        return None

def call_claude(prompt):
    """Ask the Anthropic model for an answer to ``prompt`` and parse its JSON.

    The reply is free text, so the JSON object is taken as everything from
    the first ``{`` to the last ``}``.

    Returns:
        The decoded object, ``None`` when the reply contains no braces, or
        ``None`` when the request or the decoding raises (the error is
        printed).
    """
    try:
        message = anthropic_client.messages.create(
            model="claude-3-5-sonnet-20241022",
            max_tokens=1024,
            temperature=0.7,
            messages=[{"role": "user", "content": prompt}],
        )
        text = message.content[0].text

        # Locate the outermost braces of the embedded JSON object.
        first_brace = text.find('{')
        last_brace = text.rfind('}')
        if first_brace == -1 or last_brace == -1:
            return None
        return json.loads(text[first_brace:last_brace + 1])
    except Exception as e:
        print(f"Claude error: {e}")
        return None

def call_gemini(prompt):
    """Ask the Gemini model for an answer to ``prompt`` and parse its JSON.

    The JSON object is taken as the span from the first ``{`` to the last
    ``}`` of the reply text.

    Returns:
        The decoded object, ``None`` when the reply contains no braces, or
        ``None`` when the request or the decoding raises (the error is
        printed).
    """
    try:
        reply = genai.GenerativeModel('gemini-1.5-pro').generate_content(prompt)
        text = reply.text

        opening = text.find('{')
        closing = text.rfind('}')
        has_object = opening != -1 and closing != -1
        if not has_object:
            return None
        return json.loads(text[opening:closing + 1])
    except Exception as e:
        print(f"Gemini error: {e}")
        return None

def call_llama(prompt):
    """Send ``prompt`` to a Llama chat-completions endpoint and parse its JSON.

    The endpoint and model default to the original hard-coded values but can
    be overridden with the ``LLAMA_API_URL`` and ``LLAMA_MODEL`` environment
    variables. The response is read as an OpenAI-style payload
    (``choices[0].message.content``), and the JSON object is taken as the
    span from the first ``{`` to the last ``}`` of that content.

    Returns:
        The decoded object, or ``None`` when the API key is missing, the
        reply contains no JSON object, or the request/decoding raises (the
        error is printed).
    """
    if not LLAMA_API_KEY:
        # Avoid sending a request with a literal "Bearer None" credential.
        print("Llama error: LLAMA_API_KEY is not set")
        return None

    # NOTE(review): the default host failed DNS resolution in the recorded
    # run of this notebook. Set LLAMA_API_URL to your provider's
    # OpenAI-compatible chat-completions endpoint (and LLAMA_MODEL to a model
    # it serves) — confirm both against the provider's documentation.
    api_url = os.getenv("LLAMA_API_URL", "https://api.meta.ai/v1/chat/completions")
    model_name = os.getenv("LLAMA_MODEL", "llama-3-70b-instruct")

    try:
        headers = {
            "Authorization": f"Bearer {LLAMA_API_KEY}",
            "Content-Type": "application/json"
        }

        payload = {
            "model": model_name,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.7,
            "max_tokens": 1024
        }

        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.post(api_url, headers=headers, json=payload, timeout=60)
        response.raise_for_status()  # Raise exception for HTTP errors

        result = response.json()
        # Tolerate a missing/empty "choices" list or a null message/content.
        choices = result.get("choices") or [{}]
        content = (choices[0].get("message") or {}).get("content") or ""

        # Extract the JSON object embedded in the reply text.
        json_start = content.find('{')
        json_end = content.rfind('}') + 1
        if json_start != -1 and json_end != 0:
            return json.loads(content[json_start:json_end])
        return None
    except Exception as e:
        print(f"Llama error: {e}")
        return None

# Run experiment: every model answers the same prompt for each user.
results = []
models = {
    "ChatGPT": call_chatgpt,
    "Claude": call_claude,
    "Gemini": call_gemini,
    "Llama": call_llama,
}

# Only the first 25 users are processed.
for idx, row in data_df.head(25).iterrows():
    prompt = create_prompt(row)
    user_id = row["user_id"]

    for model_name, model_func in models.items():
        print(f"Processing user {user_id} with {model_name}...")
        response = model_func(prompt)

        # Failed or empty responses are not recorded.
        if not response:
            continue

        results.append({
            "user_id": user_id,
            "model": model_name,
            "response": response,
            "ground_truth_1": row["cat_gt_1"],
            "ground_truth_2": row["cat_gt_2"],
        })

# Save results: one runs/<model>/p4.json file per model.
output_dir = Path("runs")
output_dir.mkdir(exist_ok=True)

for model_name in models:
    model_dir = output_dir / model_name
    model_dir.mkdir(exist_ok=True)

    # Keep only the records produced by this model.
    model_results = [r for r in results if r["model"] == model_name]
    with (model_dir / "p4.json").open("w") as f:
        json.dump(model_results, f, indent=2)

# Compute hit rates
def compute_metrics(results):
    """Compute the top-2 hit rate for each model in the module-level ``models``.

    A result counts as a hit when either ground-truth category appears among
    the first two entries of the response's ``"picks"``. Results whose
    response is not a dict or has no ``"picks"`` key are left out of the
    total. A response whose ``"picks"`` is malformed (neither a string nor a
    list/tuple) is counted in the total as a miss.

    Args:
        results: Iterable of dicts with the keys ``"model"``, ``"response"``,
            ``"ground_truth_1"`` and ``"ground_truth_2"``.

    Returns:
        Dict mapping each model name to
        ``{"hit_rate": float, "total": int, "hits": int}``; ``hit_rate`` is
        ``0`` when the model has no scorable results.
    """
    metrics = {}

    for model_name in models:
        hits = 0
        total = 0

        for result in results:
            if result["model"] != model_name:
                continue

            response = result["response"]
            if not isinstance(response, dict) or "picks" not in response:
                continue

            picks = response["picks"]
            if isinstance(picks, str):
                # A bare string would otherwise be matched by substring.
                picks = [picks]
            elif not isinstance(picks, (list, tuple)):
                picks = []

            # Top-2 metric: extra picks beyond the first two do not count.
            top_two = list(picks[:2])
            if result["ground_truth_1"] in top_two or result["ground_truth_2"] in top_two:
                hits += 1
            total += 1

        hit_rate = hits / total if total > 0 else 0
        metrics[model_name] = {
            "hit_rate": hit_rate,
            "total": total,
            "hits": hits
        }

    return metrics

# Print the per-model hit rates.
metrics = compute_metrics(results)
print("\nMetrics Summary:")
for model, metric in metrics.items():
    print(f"{model}: Hit Rate = {metric['hit_rate']:.2%} ({metric['hits']}/{metric['total']})")

# Build the evaluation spreadsheet: one row per response that carries both
# "picks" and "why"; the plausibility columns are left blank for manual
# scoring.
eval_data = []
for result in results:
    response = result["response"]
    if not (response and "picks" in response and "why" in response):
        continue

    picks = response["picks"]
    reasons = response["why"]
    eval_data.append({
        "user_id": result["user_id"],
        "model": result["model"],
        "pick_1": picks[0] if len(picks) > 0 else "",
        "pick_2": picks[1] if len(picks) > 1 else "",
        "rationale_1": reasons[0] if len(reasons) > 0 else "",
        "rationale_2": reasons[1] if len(reasons) > 1 else "",
        "ground_truth_1": result["ground_truth_1"],
        "ground_truth_2": result["ground_truth_2"],
        "plausibility_1": "",  # To be manually filled
        "plausibility_2": "",  # To be manually filled
    })

eval_df = pd.DataFrame(eval_data)
eval_df.to_csv("experiment_results_for_evaluation.csv", index=False)
print("\nEvaluation template saved to 'experiment_results_for_evaluation.csv'")

Processing user 1 with ChatGPT...
Processing user 1 with Claude...
Processing user 1 with Gemini...
Processing user 1 with Llama...
Llama error: HTTPSConnectionPool(host='api.meta.ai', port=443): Max retries exceeded with url: /v1/chat/completions (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7efeb914e0c0>: Failed to resolve 'api.meta.ai' ([Errno -2] Name or service not known)"))
Processing user 2 with ChatGPT...
Processing user 2 with Claude...
Processing user 2 with Gemini...
Processing user 2 with Llama...
Llama error: HTTPSConnectionPool(host='api.meta.ai', port=443): Max retries exceeded with url: /v1/chat/completions (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7efeb923c5c0>: Failed to resolve 'api.meta.ai' ([Errno -2] Name or service not known)"))
Processing user 3 with ChatGPT...
Processing user 3 with Claude...
Processing user 3 with Gemini...
Processing user 3 with Llama...
Llama error: HTTPSConnectionPool(h