In [23]:
%pip install -U -q pandas dspy python-dotenv

Note: you may need to restart the kernel to use updated packages.


In [24]:
import pandas as pd  
import dspy  
import numpy as np  
import random  
from typing import List, Optional  
  
# Function to create a sample dataset if you don't have one  
def create_sample_dataset(filename="sample_reviews.csv", num_samples=20):  
    """Create a sample dataset of product reviews."""  
      
    # Sample reviews with sentiments and features  
    sample_reviews = [  
        {"review_text": "This phone has amazing battery life and the camera quality is outstanding.",   
         "sentiment": "positive",   
         "features_mentioned": ["battery life", "camera quality"]},  
        {"review_text": "The laptop overheats quickly and the keyboard is uncomfortable to type on.",   
         "sentiment": "negative",   
         "features_mentioned": ["thermal management", "keyboard comfort"]},  
        {"review_text": "Average performance for the price, but the design is sleek.",   
         "sentiment": "neutral",   
         "features_mentioned": ["performance", "design"]},  
        {"review_text": "The headphones have great sound quality but poor noise cancellation.",   
         "sentiment": "mixed",   
         "features_mentioned": ["sound quality", "noise cancellation"]},  
        {"review_text": "The smartwatch's fitness tracking is accurate, but the battery drains too quickly.",   
         "sentiment": "mixed",   
         "features_mentioned": ["fitness tracking", "battery life"]},  
        {"review_text": "This blender makes the smoothest smoothies I've ever had!",   
         "sentiment": "positive",   
         "features_mentioned": ["blending performance"]},  
        {"review_text": "The TV has excellent picture quality and smart features.",   
         "sentiment": "positive",   
         "features_mentioned": ["picture quality", "smart features"]},  
        {"review_text": "This coffee maker is a complete waste of money. It broke after two weeks.",   
         "sentiment": "negative",   
         "features_mentioned": ["durability"]},  
        {"review_text": "The wireless charger works as expected, nothing special.",   
         "sentiment": "neutral",   
         "features_mentioned": ["charging functionality"]},  
        {"review_text": "The gaming mouse has responsive buttons but the software is buggy.",   
         "sentiment": "mixed",   
         "features_mentioned": ["button responsiveness", "software quality"]}  
    ]  
      
    # Generate additional reviews to reach the desired number  
    sentiments = ["positive", "negative", "neutral", "mixed"]  
    features = ["battery life", "performance", "design", "durability", "user interface",   
                "price", "quality", "functionality", "comfort", "reliability"]  
      
    while len(sample_reviews) < num_samples:  
        sentiment = random.choice(sentiments)  
        num_features = random.randint(1, 3)  
        mentioned_features = random.sample(features, num_features)  
          
        if sentiment == "positive":  
            review = f"I love this product! The {' and '.join(mentioned_features)} are excellent."  
        elif sentiment == "negative":  
            review = f"Disappointed with this purchase. The {' and '.join(mentioned_features)} are terrible."  
        elif sentiment == "neutral":  
            review = f"This product is okay. The {' and '.join(mentioned_features)} are average."  
        else:  # mixed  
            good_feature = mentioned_features[0]  
            bad_feature = mentioned_features[1] if len(mentioned_features) > 1 else random.choice(features)  
            while bad_feature == good_feature:  
                bad_feature = random.choice(features)  
            review = f"The {good_feature} is great, but the {bad_feature} could be improved."  
          
        sample_reviews.append({  
            "review_text": review,  
            "sentiment": sentiment,  
            "features_mentioned": mentioned_features  
        })  
      
    # Convert to DataFrame and save to CSV  
    df = pd.DataFrame(sample_reviews)  
    df.to_csv(filename, index=False)  
    print(f"Sample dataset created and saved to {filename}")  
    return df  
  
# Function to load reviews from a CSV file  
def load_reviews(filename="sample_reviews.csv", has_labels=True):  
    """  
    Load reviews from a CSV file and convert to DSPy examples.  
      
    Args:  
        filename: Path to the CSV file  
        has_labels: Whether the CSV contains sentiment and features_mentioned columns  
          
    Returns:  
        List of DSPy examples  
    """  
    try:  
        df = pd.read_csv(filename)  
    except FileNotFoundError:  
        print(f"File {filename} not found. Creating a sample dataset...")  
        df = create_sample_dataset(filename)  
      
    examples = []  
    for _, row in df.iterrows():  
        if has_labels:  
            # Convert string representation of list to actual list if needed  
            features = row['features_mentioned']  
            if isinstance(features, str):  
                # Handle the case where features might be stored as a string representation of a list  
                if features.startswith('[') and features.endswith(']'):  
                    features = eval(features)  # Be careful with eval in production!  
                else:  
                    features = [feat.strip() for feat in features.split(',')]  
              
            example = dspy.Example(  
                review=row['review_text'],  
                sentiment=row['sentiment'],  
                features_mentioned=features  
            ).with_inputs('review')  
        else:  
            example = dspy.Example(  
                review=row['review_text']  
            ).with_inputs('review')  
          
        examples.append(example)  
      
    return examples  
  
# Load the dataset  
examples = load_reviews()  
  
# Split into train/dev sets  
random.seed(42)  # For reproducibility  
random.shuffle(examples)  
split_idx = int(len(examples) * 0.8)  
trainset = examples[:split_idx]  
devset = examples[split_idx:]  
  
print(f"Loaded {len(examples)} examples")  
print(f"Training set: {len(trainset)} examples")  
print(f"Development set: {len(devset)} examples")  
  
# Display a sample example  
print("\nSample example:")  
print(f"Review: {trainset[0].review}")  
print(f"Sentiment: {trainset[0].sentiment}")  
print(f"Features mentioned: {trainset[0].features_mentioned}")

Loaded 20 examples
Training set: 16 examples
Development set: 4 examples

Sample example:
Review: I love this product! The comfort are excellent.
Sentiment: positive
Features mentioned: ['comfort']


In [25]:
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

api_key = os.getenv("OPENAI_API_KEY")
# print(f"API Key: {api_key}")


gpt_4o_mini = "openai/gpt-4o-mini"
gpt_41_nano = "openai/gpt-4.1-nano"

In [26]:
import dspy  
from typing import List, Set  
  
# Configure DSPy with a language model  
lm = dspy.LM(gpt_41_nano)  # You can change this to your preferred model  
dspy.settings.configure(lm=lm)  
  
# Define our ReviewAnalyzer module  
class ReviewAnalyzer(dspy.Module):  
    """A module that analyzes product reviews to extract sentiment and key features."""  
      
    def __init__(self):  
        super().__init__()  
        # Define a predictor that takes a review and outputs sentiment and key features  
        self.analyzer = dspy.ChainOfThought("review -> sentiment, key_features")  
      
    def forward(self, review):  
        # Process the review and return the prediction  
        result = self.analyzer(review=review)  
        return dspy.Prediction(  
            sentiment=result.sentiment,  
            key_features=result.key_features  
        )  
  
# Define evaluation metrics  
def sentiment_accuracy(example, prediction):  
    """  
    Evaluate sentiment analysis accuracy.  
      
    Args:  
        example: The ground truth example  
        prediction: The model's prediction  
          
    Returns:  
        1.0 if the sentiment matches, 0.0 otherwise  
    """  
    # Convert both to lowercase for case-insensitive comparison  
    true_sentiment = example.sentiment.lower()  
    pred_sentiment = prediction.sentiment.lower()  
      
    # Return 1.0 if they match, 0.0 otherwise  
    return 1.0 if true_sentiment == pred_sentiment else 0.0  
  
def feature_extraction_quality(example, prediction):  
    """  
    Evaluate feature extraction quality using F1 score.  
      
    Args:  
        example: The ground truth example  
        prediction: The model's prediction  
          
    Returns:  
        F1 score between 0.0 and 1.0  
    """  
    # Convert ground truth features to a set of lowercase strings  
    true_features = set(f.lower() for f in example.features_mentioned)  
      
    # Parse predicted features (assuming they're comma-separated)  
    if isinstance(prediction.key_features, list):  
        pred_features = set(f.lower() for f in prediction.key_features)  
    else:  
        # Split by comma and strip whitespace  
        pred_features = set(f.lower().strip() for f in prediction.key_features.split(','))  
      
    # Calculate precision and recall  
    if not pred_features:  
        return 0.0  # No features predicted  
      
    if not true_features:  
        return 1.0 if not pred_features else 0.0  # No true features  
      
    # Find common features  
    common_features = true_features.intersection(pred_features)  
      
    # Calculate precision, recall, and F1  
    precision = len(common_features) / len(pred_features)  
    recall = len(common_features) / len(true_features)  
      
    # Calculate F1 score  
    if precision + recall == 0:  
        return 0.0  
    f1 = 2 * (precision * recall) / (precision + recall)  
      
    return f1  
  
# Create a combined metric that averages sentiment and feature metrics  
def combined_metric(example, prediction):  
    """  
    Combined metric that weights sentiment accuracy and feature extraction.  
      
    Args:  
        example: The ground truth example  
        prediction: The model's prediction  
          
    Returns:  
        Weighted average of sentiment accuracy and feature extraction quality  
    """  
    sent_score = sentiment_accuracy(example, prediction)  
    feat_score = feature_extraction_quality(example, prediction)  
      
    # Weight sentiment and features equally (you can adjust these weights)  
    return 0.5 * sent_score + 0.5 * feat_score  
  
# Create an instance of our ReviewAnalyzer  
analyzer = ReviewAnalyzer()  
  
# Set up evaluation  
evaluator = dspy.Evaluate(  
    devset=devset,  
    metric=combined_metric,  
    num_threads=4,  # Adjust based on your system  
    display_progress=True,  
    display_table=True  
)  
  
# Run evaluation and print results  
print("Evaluating baseline model...")  
score = evaluator(analyzer)  
print(f"Overall Score: {score:.2f}%")  
  
# Get detailed results  
detailed_evaluator = dspy.Evaluate(  
    devset=devset,  
    metric=combined_metric,  
    num_threads=4,  
    display_progress=True,  
    display_table=True,  
    return_all_scores=True,  
    return_outputs=True  
)  
  
overall_score, results, individual_scores = detailed_evaluator(analyzer)  
  
# Display some detailed results  
print("\nDetailed Results for First 3 Examples:")  
for i, (example, prediction, score) in enumerate(results[:3]):  
    print(f"\nExample {i+1}:")  
    print(f"Review: {example.review[:100]}...")  
    print(f"True Sentiment: {example.sentiment}")  
    print(f"Predicted Sentiment: {prediction.sentiment}")  
    print(f"True Features: {example.features_mentioned}")  
    print(f"Predicted Features: {prediction.key_features}")  
    print(f"Score: {score:.2f}")

Evaluating baseline model...
Average Metric: 2.50 / 4 (62.5%): 100%|██████████| 4/4 [00:00<00:00, 2430.07it/s]

2025/05/05 19:12:23 INFO dspy.evaluate.evaluate: Average Metric: 2.5 / 4 (62.5%)





Unnamed: 0,review,example_sentiment,features_mentioned,pred_sentiment,key_features,combined_metric
0,This coffee maker is a complete waste of money. It broke after two...,negative,[durability],Negative,,✔️ [0.500]
1,"The wireless charger works as expected, nothing special.",neutral,[charging functionality],Neutral,"Proper functionality, basic performance",✔️ [0.500]
2,This phone has amazing battery life and the camera quality is outs...,positive,"[battery life, camera quality]",Positive,"Battery life, Camera quality",✔️ [1.000]
3,The headphones have great sound quality but poor noise cancellation.,mixed,"[sound quality, noise cancellation]",Mixed/Neutral (leaning positive),"Sound quality, Noise cancellation",✔️ [0.500]


Overall Score: 62.50%
Average Metric: 2.50 / 4 (62.5%): 100%|██████████| 4/4 [00:00<00:00, 1444.57it/s]

2025/05/05 19:12:23 INFO dspy.evaluate.evaluate: Average Metric: 2.5 / 4 (62.5%)





Unnamed: 0,review,example_sentiment,features_mentioned,pred_sentiment,key_features,combined_metric
0,This coffee maker is a complete waste of money. It broke after two...,negative,[durability],Negative,,✔️ [0.500]
1,"The wireless charger works as expected, nothing special.",neutral,[charging functionality],Neutral,"Proper functionality, basic performance",✔️ [0.500]
2,This phone has amazing battery life and the camera quality is outs...,positive,"[battery life, camera quality]",Positive,"Battery life, Camera quality",✔️ [1.000]
3,The headphones have great sound quality but poor noise cancellation.,mixed,"[sound quality, noise cancellation]",Mixed/Neutral (leaning positive),"Sound quality, Noise cancellation",✔️ [0.500]



Detailed Results for First 3 Examples:

Example 1:
Review: This coffee maker is a complete waste of money. It broke after two weeks....
True Sentiment: negative
Predicted Sentiment: Negative
True Features: ['durability']
Predicted Features: None
Score: 0.50

Example 2:
Review: The wireless charger works as expected, nothing special....
True Sentiment: neutral
Predicted Sentiment: Neutral
True Features: ['charging functionality']
Predicted Features: Proper functionality, basic performance
Score: 0.50

Example 3:
Review: This phone has amazing battery life and the camera quality is outstanding....
True Sentiment: positive
Predicted Sentiment: Positive
True Features: ['battery life', 'camera quality']
Predicted Features: Battery life, Camera quality
Score: 1.00


In [27]:
from dspy.teleprompt import MIPROv2  
import time  
  
# Initialize the optimizer with our metric  
print("Initializing MIPROv2 optimizer...")  
optimizer = MIPROv2(  
    metric=combined_metric,  
    auto="light",  # Use "light" for faster optimization, "medium" or "heavy" for better results  
    num_threads=4  # Adjust based on your system  
)  
  
# Start the optimization process  
print("Starting optimization process (this may take a few minutes)...")  
start_time = time.time()  
  
# Compile the program to get an optimized version  
optimized_analyzer = optimizer.compile(  
    analyzer,  
    trainset=trainset,  
    max_bootstrapped_demos=2,  # Number of bootstrapped examples to include  
    max_labeled_demos=2,       # Number of labeled examples to include  
    requires_permission_to_run=False  # Set to True if you want to confirm before running  
)  
  
end_time = time.time()  
print(f"Optimization completed in {end_time - start_time:.2f} seconds")  
  
# Save the optimized program  
optimized_analyzer.save("optimized_review_analyzer.json")  
print("Optimized program saved to 'optimized_review_analyzer.json'")  
  
# Evaluate the optimized program  
print("\nEvaluating optimized model...")  
optimized_score = evaluator(optimized_analyzer)  
print(f"Optimized Score: {optimized_score:.2f}%")  
print(f"Improvement: {optimized_score - score:.2f}%")  
  
# Examine the optimized prompt  
print("\nExamining the optimized prompt:")  
  
# For a ChainOfThought module, we need to access the predict attribute first  
# which contains the actual predictor with the signature  
print("Optimized Instructions:")  
if hasattr(optimized_analyzer.analyzer, "predict") and hasattr(optimized_analyzer.analyzer.predict, "signature"):  
    print(optimized_analyzer.analyzer.predict.signature.instructions)  
else:  
    print("Instructions not found in the expected structure.")  
    # Alternative approach to find instructions  
    for predictor in optimized_analyzer.predictors():  
        if hasattr(predictor, "signature"):  
            print(f"Found instructions in predictor {predictor.__class__.__name__}:")  
            print(predictor.signature.instructions)  
            break  
  
# Check if the program has demos (few-shot examples)  
if hasattr(optimized_analyzer.analyzer, "demos"):  
    print("\nFew-shot Examples:")  
    for i, demo in enumerate(optimized_analyzer.analyzer.demos):  
        print(f"\nExample {i+1}:")  
        print(f"Input: {demo.review}")  
          
        # Safely access attributes with fallbacks  
        sentiment = getattr(demo, 'sentiment', 'N/A')  
          
        # Try different possible attribute names for key features  
        if hasattr(demo, 'key_features'):  
            features = demo.key_features  
        elif hasattr(demo, 'features'):  
            features = demo.features  
        elif hasattr(demo, 'features_mentioned'):  
            features = demo.features_mentioned  
        else:  
            # Print all available attributes to help debug  
            print(f"Available attributes: {dir(demo)}")  
            features = 'N/A'  
              
        print(f"Output: Sentiment: {sentiment}, Key Features: {features}")  
elif hasattr(optimized_analyzer.analyzer, "predict") and hasattr(optimized_analyzer.analyzer.predict, "demos"):  
    print("\nFew-shot Examples:")  
    for i, demo in enumerate(optimized_analyzer.analyzer.predict.demos):  
        print(f"\nExample {i+1}:")  
        print(f"Input: {demo.review}")  
          
        # Same safe attribute access as above  
        sentiment = getattr(demo, 'sentiment', 'N/A')  
          
        if hasattr(demo, 'key_features'):  
            features = demo.key_features  
        elif hasattr(demo, 'features'):  
            features = demo.features  
        elif hasattr(demo, 'features_mentioned'):  
            features = demo.features_mentioned  
        else:  
            print(f"Available attributes: {dir(demo)}")  
            features = 'N/A'  
              
        print(f"Output: Sentiment: {sentiment}, Key Features: {features}")  
else:  
    print("\nNo few-shot examples found.")

2025/05/05 19:12:23 INFO dspy.teleprompt.mipro_optimizer_v2: 
RUNNING WITH THE FOLLOWING LIGHT AUTO RUN SETTINGS:
num_trials: 7
minibatch: False
num_candidates: 5
valset size: 12

2025/05/05 19:12:23 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2025/05/05 19:12:23 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2025/05/05 19:12:23 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=5 sets of demonstrations...


Initializing MIPROv2 optimizer...
Starting optimization process (this may take a few minutes)...
Bootstrapping set 1/5
Bootstrapping set 2/5
Bootstrapping set 3/5


  0%|          | 0/4 [00:00<?, ?it/s]2025/05/05 19:12:23 ERROR dspy.teleprompt.bootstrap: Failed to run or to evaluate example Example({'review': 'I love this product! The comfort are excellent.', 'sentiment': 'positive', 'features_mentioned': ['comfort']}) (input_keys={'review'}) with <function combined_metric at 0x156049300> due to combined_metric() takes 2 positional arguments but 3 were given.
2025/05/05 19:12:23 ERROR dspy.teleprompt.bootstrap: Failed to run or to evaluate example Example({'review': "This blender makes the smoothest smoothies I've ever had!", 'sentiment': 'positive', 'features_mentioned': ['blending performance']}) (input_keys={'review'}) with <function combined_metric at 0x156049300> due to combined_metric() takes 2 positional arguments but 3 were given.
2025/05/05 19:12:23 ERROR dspy.teleprompt.bootstrap: Failed to run or to evaluate example Example({'review': 'The durability is great, but the performance could be improved.', 'sentiment': 'mixed', 'features_ment

Bootstrapped 0 full traces after 3 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 4/5


  0%|          | 0/4 [00:00<?, ?it/s]2025/05/05 19:12:23 ERROR dspy.teleprompt.bootstrap: Failed to run or to evaluate example Example({'review': "This blender makes the smoothest smoothies I've ever had!", 'sentiment': 'positive', 'features_mentioned': ['blending performance']}) (input_keys={'review'}) with <function combined_metric at 0x156049300> due to combined_metric() takes 2 positional arguments but 3 were given.
2025/05/05 19:12:23 ERROR dspy.teleprompt.bootstrap: Failed to run or to evaluate example Example({'review': 'I love this product! The comfort are excellent.', 'sentiment': 'positive', 'features_mentioned': ['comfort']}) (input_keys={'review'}) with <function combined_metric at 0x156049300> due to combined_metric() takes 2 positional arguments but 3 were given.
2025/05/05 19:12:23 ERROR dspy.teleprompt.bootstrap: Failed to run or to evaluate example Example({'review': "The smartwatch's fitness tracking is accurate, but the battery drains too quickly.", 'sentiment': 'mix

Bootstrapped 0 full traces after 3 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 5/5


  0%|          | 0/4 [00:00<?, ?it/s]2025/05/05 19:12:23 ERROR dspy.teleprompt.bootstrap: Failed to run or to evaluate example Example({'review': 'I love this product! The comfort are excellent.', 'sentiment': 'positive', 'features_mentioned': ['comfort']}) (input_keys={'review'}) with <function combined_metric at 0x156049300> due to combined_metric() takes 2 positional arguments but 3 were given.
2025/05/05 19:12:23 ERROR dspy.teleprompt.bootstrap: Failed to run or to evaluate example Example({'review': "The smartwatch's fitness tracking is accurate, but the battery drains too quickly.", 'sentiment': 'mixed', 'features_mentioned': ['fitness tracking', 'battery life']}) (input_keys={'review'}) with <function combined_metric at 0x156049300> due to combined_metric() takes 2 positional arguments but 3 were given.
2025/05/05 19:12:23 ERROR dspy.teleprompt.bootstrap: Failed to run or to evaluate example Example({'review': 'The durability is great, but the performance could be improved.', 's

Bootstrapped 0 full traces after 3 examples for up to 1 rounds, amounting to 4 attempts.


2025/05/05 19:12:23 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.
2025/05/05 19:12:23 INFO dspy.teleprompt.mipro_optimizer_v2: 
Proposing instructions...

2025/05/05 19:12:23 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:

2025/05/05 19:12:23 INFO dspy.teleprompt.mipro_optimizer_v2: 0: Given the fields `review`, produce the fields `sentiment`, `key_features`.

2025/05/05 19:12:23 INFO dspy.teleprompt.mipro_optimizer_v2: 1: Analyze the provided product review to determine the overall sentiment (positive, negative, or neutral) and identify the key features discussed in the review. Use a chain of thought approach to reason through the review content before producing the structured output. Respond with the fields `sentiment` and `key_features`, including your reasoning process, and fo

Average Metric: 9.98 / 12 (83.2%): 100%|██████████| 12/12 [00:00<00:00, 1336.33it/s]

2025/05/05 19:12:24 INFO dspy.evaluate.evaluate: Average Metric: 9.983333333333333 / 12 (83.2%)
2025/05/05 19:12:24 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 83.19

2025/05/05 19:12:24 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 2 / 7 =====



Average Metric: 10.98 / 12 (91.5%): 100%|██████████| 12/12 [00:00<00:00, 2452.93it/s]

2025/05/05 19:12:24 INFO dspy.evaluate.evaluate: Average Metric: 10.983333333333333 / 12 (91.5%)
2025/05/05 19:12:24 INFO dspy.teleprompt.mipro_optimizer_v2: [92mBest full score so far![0m Score: 91.53
2025/05/05 19:12:24 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 91.53 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 1'].
2025/05/05 19:12:24 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [83.19, 91.53]
2025/05/05 19:12:24 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 91.53


2025/05/05 19:12:24 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 3 / 7 =====



Average Metric: 10.48 / 12 (87.4%): 100%|██████████| 12/12 [00:00<00:00, 2413.99it/s]

2025/05/05 19:12:24 INFO dspy.evaluate.evaluate: Average Metric: 10.483333333333333 / 12 (87.4%)





2025/05/05 19:12:24 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 87.36 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 1'].
2025/05/05 19:12:24 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [83.19, 91.53, 87.36]
2025/05/05 19:12:24 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 91.53


2025/05/05 19:12:24 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 4 / 7 =====


Average Metric: 10.48 / 12 (87.4%): 100%|██████████| 12/12 [00:00<00:00, 2268.83it/s]

2025/05/05 19:12:24 INFO dspy.evaluate.evaluate: Average Metric: 10.483333333333333 / 12 (87.4%)





2025/05/05 19:12:24 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 87.36 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 1'].
2025/05/05 19:12:24 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [83.19, 91.53, 87.36, 87.36]
2025/05/05 19:12:24 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 91.53


2025/05/05 19:12:24 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 5 / 7 =====


Average Metric: 10.48 / 12 (87.4%): 100%|██████████| 12/12 [00:00<00:00, 495.36it/s]

2025/05/05 19:12:24 INFO dspy.evaluate.evaluate: Average Metric: 10.483333333333333 / 12 (87.4%)
2025/05/05 19:12:24 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 87.36 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 1'].





2025/05/05 19:12:24 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [83.19, 91.53, 87.36, 87.36, 87.36]
2025/05/05 19:12:24 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 91.53


2025/05/05 19:12:24 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 6 / 7 =====


Average Metric: 10.98 / 12 (91.5%): 100%|██████████| 12/12 [00:00<00:00, 1185.89it/s]

2025/05/05 19:12:24 INFO dspy.evaluate.evaluate: Average Metric: 10.983333333333333 / 12 (91.5%)
2025/05/05 19:12:24 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 91.53 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 3'].
2025/05/05 19:12:24 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [83.19, 91.53, 87.36, 87.36, 87.36, 91.53]
2025/05/05 19:12:24 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 91.53


2025/05/05 19:12:24 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 7 / 7 =====



Average Metric: 10.48 / 12 (87.4%): 100%|██████████| 12/12 [00:00<00:00, 2225.49it/s]

2025/05/05 19:12:24 INFO dspy.evaluate.evaluate: Average Metric: 10.483333333333333 / 12 (87.4%)
2025/05/05 19:12:24 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 87.36 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 1'].
2025/05/05 19:12:24 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [83.19, 91.53, 87.36, 87.36, 87.36, 91.53, 87.36]
2025/05/05 19:12:24 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 91.53


2025/05/05 19:12:24 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 91.53!



Optimization completed in 0.28 seconds
Optimized program saved to 'optimized_review_analyzer.json'

Evaluating optimized model...
Average Metric: 3.33 / 4 (83.3%): 100%|██████████| 4/4 [00:00<00:00, 2349.09it/s]

2025/05/05 19:12:24 INFO dspy.evaluate.evaluate: Average Metric: 3.333333333333333 / 4 (83.3%)





Unnamed: 0,review,example_sentiment,features_mentioned,pred_sentiment,key_features,combined_metric
0,This coffee maker is a complete waste of money. It broke after two...,negative,[durability],negative,"durability, quality",✔️ [0.833]
1,"The wireless charger works as expected, nothing special.",neutral,[charging functionality],neutral,wireless charger,✔️ [0.500]
2,This phone has amazing battery life and the camera quality is outs...,positive,"[battery life, camera quality]",positive,"battery life, camera quality",✔️ [1.000]
3,The headphones have great sound quality but poor noise cancellation.,mixed,"[sound quality, noise cancellation]",mixed,"sound quality, noise cancellation",✔️ [1.000]


Optimized Score: 83.33%
Improvement: 82.33%

Examining the optimized prompt:
Optimized Instructions:
Analyze the provided product review to determine the overall sentiment (positive, negative, or neutral) and identify the key features discussed in the review. Use a chain of thought approach to reason through the review content before producing the structured output. Respond with the fields `sentiment` and `key_features`, including your reasoning process, and format the output clearly.

Few-shot Examples:

Example 1:
Input: The smartwatch's fitness tracking is accurate, but the battery drains too quickly.
Output: Sentiment: mixed, Key Features: ['fitness tracking', 'battery life']

Example 2:
Input: This blender makes the smoothest smoothies I've ever had!
Output: Sentiment: positive, Key Features: ['blending performance']


In [31]:
import openai  
import json  
import os  
  
# Set your OpenAI API key  
openai.api_key = os.environ.get("OPENAI_API_KEY", api_key)  
  
def extract_optimized_prompt(optimized_program):  
    """  
    Extract instructions and few-shot examples from an optimized DSPy program.  
      
    Args:  
        optimized_program: The optimized DSPy program  
          
    Returns:  
        tuple: (instructions, demos)  
    """  
    # For ChainOfThought modules, we need to access the predict attribute  
    if hasattr(optimized_program.analyzer, "predict") and hasattr(optimized_program.analyzer.predict, "signature"):  
        instructions = optimized_program.analyzer.predict.signature.instructions  
    else:  
        # Fallback: try to find instructions in any predictor  
        instructions = "No instructions found"  
        for predictor in optimized_program.predictors():  
            if hasattr(predictor, "signature"):  
                instructions = predictor.signature.instructions  
                break  
      
    # Extract few-shot examples if they exist  
    demos = []  
    if hasattr(optimized_program.analyzer, "demos"):  
        demos = optimized_program.analyzer.demos  
    elif hasattr(optimized_program.analyzer, "predict") and hasattr(optimized_program.analyzer.predict, "demos"):  
        demos = optimized_program.analyzer.predict.demos  
      
    return instructions, demos
  
  
# Function to format the prompt for OpenAI API  
def format_prompt_for_openai(instructions, demos, new_review):  
    """  
    Format the instructions and demos into a prompt for OpenAI API.  
      
    Args:  
        instructions: The optimized instructions  
        demos: List of few-shot examples  
        new_review: The new review to analyze  
          
    Returns:  
        str: Formatted prompt  
    """  
    prompt = f"{instructions}\n\n"  
      
    if demos:  
        prompt += "Examples:\n\n"  
        for demo in demos:  
            prompt += f"Input: {demo.review}\n"  
              
            # Safely access sentiment  
            sentiment = getattr(demo, 'sentiment', 'N/A')  
              
            # Safely access key_features with fallbacks  
            if hasattr(demo, 'key_features'):  
                features = demo.key_features  
            elif hasattr(demo, 'features'):  
                features = demo.features  
            elif hasattr(demo, 'features_mentioned'):  
                features = demo.features_mentioned  
            else:  
                # Use a default value if no attribute is found  
                features = 'N/A'  
                  
            prompt += f"Output: Sentiment: {sentiment}, Key Features: {features}\n\n"  
      
    prompt += f"Input: {new_review}\n"  
    prompt += "Output:"  
      
    return prompt  
  
# Function to call OpenAI API with the optimized prompt  
def analyze_review_with_openai(instructions, demos, new_review, model="gpt-4o-mini"):  
    try:  
        # Format the prompt  
        system_prompt = format_prompt_for_openai(instructions, demos, new_review)  
          
        # Initialize the OpenAI client  
        from openai import OpenAI  
        client = OpenAI()  
          
        # Call OpenAI API with the new format  
        response = client.chat.completions.create(  
            model=model,  
            messages=[  
                {"role": "system", "content": system_prompt}  
            ],  
            temperature=0.0  
        )  
          
        # Extract the response  
        result = response.choices[0].message.content  
        print(f"Raw API response: {result}")  
          
        # Parse the result - improved to handle multiple formats  
        try:  
            # Try to find JSON in the response  
            import re  
            import json  
              
            # Look for JSON pattern  
            json_match = re.search(r'```json\s*(.*?)\s*```', result, re.DOTALL)  
            if json_match:  
                # Extract and parse JSON  
                json_str = json_match.group(1)  
                data = json.loads(json_str)  
                return {  
                    "sentiment": data.get("sentiment", "N/A"),  
                    "key_features": data.get("key_features", "N/A")  
                }  
              
            # Try the original format as fallback  
            parts = result.split(', Key Features: ')  
            if len(parts) >= 2:  
                sentiment = parts[0].replace('Sentiment: ', '')  
                key_features = parts[1]  
                return {  
                    "sentiment": sentiment,  
                    "key_features": key_features  
                }  
                  
            # If we get here, try to extract sentiment and key_features from anywhere in the text  
            sentiment_match = re.search(r'sentiment["\s:]+([^",\n]+)', result.lower())  
            features_match = re.search(r'key_features["\s:]+(\[[^\]]+\])', result.lower())  
              
            sentiment = sentiment_match.group(1).strip() if sentiment_match else "N/A"  
            features = features_match.group(1).strip() if features_match else "N/A"  
              
            return {  
                "sentiment": sentiment,  
                "key_features": features  
            }  
              
        except Exception as e:  
            print(f"Error parsing response: {e}")  
            return {"raw_result": result}  
              
    except Exception as e:  
        print(f"Error calling OpenAI API: {e}")  
        return {"error": str(e)}  
  
# Extract the optimized prompt  
instructions, demos = extract_optimized_prompt(optimized_analyzer)  
  
# Print the extracted prompt components  
print("Extracted Instructions:")  
print(instructions)  
print("\nExtracted Few-Shot Examples:")  
for i, demo in enumerate(demos):  
    print(f"\nExample {i+1}:")  
    print(f"Review: {demo.review}")  
    print(f"Sentiment: {demo.sentiment}")  
      
    # Safely access key_features with fallbacks  
    if hasattr(demo, 'key_features'):  
        features = demo.key_features  
    elif hasattr(demo, 'features'):  
        features = demo.features  
    elif hasattr(demo, 'features_mentioned'):  
        features = demo.features_mentioned  
    else:  
        # Print all available attributes to help debug  
        print(f"Available attributes: {', '.join(demo._store.keys())}")  
        features = 'N/A'  
          
    print(f"Key Features: {features}") 
  
# Test with a new review  
# new_review = "I bought this camera last week and I'm impressed with the image quality, but the battery life is disappointing."  
# new_review = "I recently purchased this organic maple syrup and it's absolutely divine! The rich, complex flavor is so much better than the artificial syrups I used to buy. It has a perfect consistency - not too thick or runny. The glass bottle packaging is elegant and eco-friendly, though a bit pricey compared to regular syrup. Worth every penny for weekend pancakes with the family!"  
# new_review = "These instant ramen noodles were a huge disappointment. The flavor packet contained way too much salt, making the broth nearly inedible. The noodles themselves had a strange texture - they remained hard in some places while becoming mushy in others. The packaging claims 'authentic taste' but this doesn't resemble any real ramen I've had. The only positive is how quickly it cooks, but that doesn't make up for the poor taste and quality."  
new_review = "This dark chocolate sea salt bar is decent, but not exceptional. The chocolate has a good cocoa percentage and melts smoothly on the tongue. The sea salt adds a nice contrast to the bitterness, though the distribution is uneven - some bites have too much salt while others have none. The packaging is stylish and the ingredients are high-quality, but for the premium price point, I expected something more memorable. It satisfies my chocolate cravings, but I'll probably try a different brand next time."  
  
# Analyze the review using OpenAI API with the optimized prompt  
print("\nAnalyzing new review with OpenAI API...")  
result = analyze_review_with_openai(instructions, demos, new_review)  
  
print("\nAnalysis Result:")  
print(f"Sentiment: {result.get('sentiment', 'N/A')}")  
print(f"Key Features: {result.get('key_features', 'N/A')}")  
  
# Compare with DSPy result  
print("\nComparing with DSPy result:")  
dspy_result = optimized_analyzer(review=new_review)  
print(f"DSPy Sentiment: {dspy_result.sentiment}")  
print(f"DSPy Key Features: {dspy_result.key_features}")  
  
# Save the optimized prompt to a JSON file for future use  
prompt_data = {  
    "instructions": instructions,  
    "demos": []  
}

for demo in demos:  
    # Safely access sentiment  
    sentiment = getattr(demo, 'sentiment', 'N/A')  
      
    # Safely access key_features with fallbacks  
    if hasattr(demo, 'key_features'):  
        features = demo.key_features  
    elif hasattr(demo, 'features'):  
        features = demo.features  
    elif hasattr(demo, 'features_mentioned'):  
        features = demo.features_mentioned  
    else:  
        features = 'N/A'  
      
    prompt_data["demos"].append({  
        "review": demo.review,  
        "sentiment": sentiment,  
        "key_features": features  
    })
  
with open("optimized_prompt.json", "w") as f:  
    json.dump(prompt_data, f, indent=2)  
  
print("\nOptimized prompt saved to 'optimized_prompt.json'")

Extracted Instructions:
Analyze the provided product review to determine the overall sentiment (positive, negative, or neutral) and identify the key features discussed in the review. Use a chain of thought approach to reason through the review content before producing the structured output. Respond with the fields `sentiment` and `key_features`, including your reasoning process, and format the output clearly.

Extracted Few-Shot Examples:

Example 1:
Review: The smartwatch's fitness tracking is accurate, but the battery drains too quickly.
Sentiment: mixed
Key Features: ['fitness tracking', 'battery life']

Example 2:
Review: This blender makes the smoothest smoothies I've ever had!
Sentiment: positive
Key Features: ['blending performance']

Analyzing new review with OpenAI API...
Raw API response: To analyze the provided product review, I will break down the content to determine the overall sentiment and identify the key features discussed.

1. **Sentiment Analysis**:
   - The reviewe