In [1]:
import pandas as pd
import dotenv

# Load key/value pairs from a local .env file into the process environment.
# NOTE(review): presumably this supplies the API credentials used by the LLM
# and LangWatch clients configured further down — confirm.
dotenv.load_dotenv()

True

In [2]:
# Load the labelled Samsung Q&A dataset (columns: question, sentiment, answer).
# The CSV's first column is an unnamed, saved row index (it surfaces as
# "Unnamed: 0" when read naively), so use it as the DataFrame index instead of
# carrying it around as a junk data column.
df = pd.read_csv(
    "./product_sentiment_polarity/data/samsung-labeled-transformed.csv",
    index_col=0,
)
df.head()

Unnamed: 0,question,sentiment,answer
0,What are the top 3 benefits of Galaxy AI?,very_positive,The top 3 benefits of the Galaxy AI are: unpar...
1,What are the main differences between the Sams...,very_positive,The main differences between the Samsung Galax...
2,Does the Samsung Galaxy S23 Ultra support 8K v...,very_positive,Absolutely! The Samsung Galaxy S23 Ultra does ...
3,How does the battery life of the Galaxy S23+ c...,very_positive,The Galaxy S23+ offers a remarkable battery li...
4,Can I use a stylus with the Samsung Galaxy S23...,very_positive,Absolutely! The Samsung Galaxy S23 Ultra is de...


In [62]:
from sklearn.model_selection import train_test_split
import dspy

# Build one DSPy example per labelled row: the LLM answer text is the only
# input field ("output"); the human-assigned sentiment is the label to predict.
dataset = [
    dspy.Example(output=answer, sentiment=sentiment).with_inputs("output")
    for answer, sentiment in zip(df["answer"], df["sentiment"])
]

# Fixed-size training set of 50 examples plus 10% of the data held out as the
# dev set; the seed keeps the split reproducible across runs.
trainset, devset = train_test_split(dataset, train_size=50, test_size=0.1, random_state=42)
print("trainset len", len(trainset))
trainset[0:10]

trainset len 50


[Example({'output': "The Samsung Galaxy A14 has a 5,000mAh battery, which generally lasts a full day with moderate use. While it’s decent for its class, other brands offer similar or better battery life. So, it's adequate, but not extraordinary.", 'sentiment': 'subtly_negative'}) (input_keys={'output'}),
 Example({'output': 'Yes, the Samsung Galaxy Tab S9 can be used for video editing. It features a powerful Snapdragon 8 Gen 1 processor and a vibrant AMOLED display, which together provide a smooth and detailed editing experience. Additionally, the S Pen can offer precision for more intricate editing tasks. However, for heavy video editing, a dedicated workstation might still be preferable.', 'sentiment': 'subtly_positive'}) (input_keys={'output'}),
 Example({'output': 'The Samsung Galaxy A14 features a 5,000mAh battery, which typically offers impressive longevity. Users can expect it to last about two days with moderate use, depending on factors like screen brightness and app usage. Th

In [65]:
import json
from typing import Any

# Name of the OpenAI chat model behind every prediction in this notebook; it is
# also embedded in the filename of the saved optimized program.
model = "gpt-3.5-turbo"

# LLM client configured with a single function-calling tool named `sentiment`,
# whose arguments carry a free-text `reasoning` and one of four sentiment labels.
# NOTE(review): `tools` / `tool_choice` are presumably forwarded verbatim by
# dspy.OpenAI to the chat completions request — confirm for the installed version.
llm = dspy.OpenAI(
    model=model,
    max_tokens=2048,
    tools=[
        {
            "type": "function",
            "function": {
                "name": "sentiment",
                "parameters": {
                    "type": "object",
                    "properties": {
                        # NOTE(review): the description below uses a backslash line
                        # continuation inside the string literal, so the next line's
                        # leading indentation is part of the text sent to the model.
                        "reasoning": {
                            "type": "string",
                            "description": "reason about the output tone and intensity before giving the final verdict on the sentiment, \
                                notice that there are no neutral options, you have to decide if the output tends more towards positive or negative"
                        },
                        # "positive_points": {
                        #     "type": "string",
                        #     "description": "positive points mentioned on the output, empty string if none",
                        # },
                        # "negative_points": {
                        #     "type": "string",
                        #     "description": "negative points mentioned on the output, empty string if none",
                        # },
                        # The enum restricts the label to the four polarity classes
                        # (no neutral class).
                        "sentiment": {
                            "type": "string",
                            "enum": ["very_positive", "subtly_positive", "subtly_negative", "very_negative"],
                            "description": "the sentiment of the output following one of the 4 options",
                        }
                    },
                    "required": ["reasoning", "sentiment"],
                },
                # NOTE(review): "veredict" is a typo for "verdict", and the text says
                # "NOT for reasoning" although `reasoning` is a required argument.
                # Left untouched here because editing it changes the prompt.
                "description": "use this function if you need to give your veredict on the sentiment, NOT for reasoning",
            },
        },
    ],
    # Temperature 0 to keep sampling randomness to a minimum.
    temperature=0,
    # Request the `sentiment` function explicitly on every call rather than
    # letting the model decide (the "auto" alternative is kept for reference).
    # tool_choice="auto",
    tool_choice={"type": "function", "function": {"name": "sentiment"}},
)


def _get_choice_text(self, choice: dict[str, Any]) -> str:
    prompt: str = self.history[-1]["prompt"]
    if self.model_type == "chat":
        message = choice["message"]
        if content := message["content"]:
            return content
        elif tool_calls := message.get("tool_calls", None):
            arguments = json.loads(tool_calls[0]["function"]["arguments"])
            if prompt.endswith("Reasoning:"):
                return arguments["reasoning"] + "\nSentiment: " + arguments["sentiment"]
            elif prompt.strip().endswith("Positive Points:"):
                return (
                    ('None' if arguments["positive_points"] == "" else arguments["positive_points"])
                    + "\n\nNegative Points: "
                    + ('None' if arguments["negative_points"] == "" else arguments["negative_points"])
                    + "\n\nSentiment: "
                    + arguments["sentiment"]
                )
            else:
                return arguments["sentiment"]
    return choice["text"]


# Bind the tool-call-aware text extractor to this client instance so it is
# used in place of the client's own method.
llm._get_choice_text = _get_choice_text.__get__(llm)

# Smoke test: one real call whose prompt ends in "Reasoning:", which should
# come back as the reasoning followed by a "Sentiment:" line.
smoke_completions = llm("hello there, Reasoning:")
print(smoke_completions[0])

# Make the patched client the default LM for all DSPy modules.
dspy.settings.configure(lm=llm)

The user is initiating a conversation in a friendly manner.
Sentiment: very_positive


In [68]:
from typing import Literal

from pydantic import BaseModel, Field


class Sentiment(BaseModel):
    # Structured label: exactly one of the four polarity classes (no neutral).
    sentiment: Literal[
        "very_positive",
        "subtly_positive",
        "subtly_negative",
        "very_negative",
    ] = Field(
        description="The sentiment of the output following one of the 4 options"
    )

class ProductSentimentPolaritySignature(dspy.Signature):
    """Classify the sentiment of the output."""

    # NOTE(review): the docstring above is kept verbatim; DSPy presumably uses
    # it as the task instruction in the prompt — confirm before rewording.

    # Input: the LLM-generated text whose tone is being judged.
    output = dspy.InputField(desc="Output of LLM talking about the product.")

    # positive_points: str = dspy.OutputField(desc="Positive points mentioned on the output.")
    # negative_points: str = dspy.OutputField(desc="Negative points mentioned on the output.")

    # Outputs: free-text reasoning first, then the final label.
    reasoning: str = dspy.OutputField(
        desc="Reason about the output tone and intensity before giving the final verdict on the sentiment"
    )
    # NOTE(review): the recorded prediction shows `sentiment` as a plain string,
    # so this annotation does not appear to be enforced by dspy.Predict — confirm.
    sentiment: Sentiment = dspy.OutputField()

class ProductSentimentPolarity(dspy.Module):
    """DSPy module that labels a product-related LLM output with a sentiment."""

    def __init__(self):
        super().__init__()
        # Single predictor driven by ProductSentimentPolaritySignature.
        self.product_sentiment = dspy.Predict(ProductSentimentPolaritySignature)

    def forward(self, output):
        """Run the predictor on `output` and return its prediction unchanged."""
        prediction = self.product_sentiment(output=output)
        return prediction

# Sanity check: classify one held-out example and compare the prediction with
# its label by eye.
dev_example = devset[0]
print(dev_example)

classifier = ProductSentimentPolarity()
pred = classifier(output=dev_example.output)
pred

Example({'output': "The Samsung Galaxy A34 boasts an impressive 5,000mAh battery capacity, ensuring prolonged usage and less downtime. This incredible battery life makes the Galaxy A34 a standout option, perfect for those who need a reliable and long-lasting device in their daily lives. With Samsung's optimization, you’ll enjoy hours of seamless entertainment and productivity.", 'sentiment': 'very_positive'}) (input_keys={'output'})


Prediction(
    reasoning='The output highlights the impressive battery capacity of the Samsung Galaxy A34, emphasizing prolonged usage, reliability, and seamless entertainment. The tone is positive and enthusiastic, focusing on the benefits of the device.',
    sentiment='very_positive'
)

In [70]:
from dspy.evaluate import Evaluate

def sentiment_matches(example, pred, trace=None):
    """Exact-match metric: does the predicted sentiment equal the labelled one?

    Args:
        example: Labelled example; its ``sentiment`` attribute is the target.
        pred: Prediction; its ``sentiment`` attribute is compared to the target.
        trace: Accepted as part of the metric signature; not used here.

    Returns:
        The result of comparing the two ``sentiment`` attributes with ``==``.
    """
    expected = example.sentiment
    predicted = pred.sentiment
    return expected == predicted

import numpy as np

# Baseline accuracy of the un-optimized program, measured on the training set.
# The evaluator is identical for every run, so build it once and reuse it.
evaluation = Evaluate(
    devset=trainset, metric=sentiment_matches, num_threads=16, display_progress=True
)

# Repeat the evaluation with a fresh program each time to see how much the
# score moves from run to run.
scores = [
    evaluation(ProductSentimentPolarity())  # type: ignore
    for _run in range(5)
]

# statistics summary
print("average score", sum(scores) / len(scores))
print("median score", np.median(scores))
print("min score", min(scores))
print("max score", max(scores))
print("variance", np.var(scores))

Average Metric: 28 / 50  (56.0): 100%|██████████| 50/50 [00:05<00:00,  8.40it/s]
Average Metric: 28 / 50  (56.0): 100%|██████████| 50/50 [00:06<00:00,  8.23it/s]
Average Metric: 30 / 50  (60.0): 100%|██████████| 50/50 [00:06<00:00,  8.01it/s]
Average Metric: 30 / 50  (60.0): 100%|██████████| 50/50 [00:05<00:00,  8.74it/s]
Average Metric: 27 / 50  (54.0): 100%|██████████| 50/50 [00:06<00:00,  8.24it/s]

average score 57.2
median score 56.0
min score 54.0
max score 60.0
variance 5.76





In [67]:
# %cd /Users/rchaves/Projects/langwatch-saas/langwatch/python-sdk/
# %pip install .

import langwatch

# Point the LangWatch SDK at the locally running instance before logging in.
LANGWATCH_ENDPOINT = "http://localhost:3000"
langwatch.endpoint = LANGWATCH_ENDPOINT

# Per the recorded output, an already-configured API key is reused; pass
# relogin=True to force a fresh login.
langwatch.login()

LangWatch API key is already set, if you want to login again, please call as langwatch.login(relogin=True)


In [72]:
from dspy.teleprompt import BootstrapFewShot, BootstrapFewShotWithRandomSearch


# Settings for the few-shot bootstrapping + random search over candidates.
search_settings = dict(
    max_bootstrapped_demos=16,
    max_labeled_demos=4,
    max_rounds=1,
    num_candidate_programs=16,
)
optimizer = BootstrapFewShotWithRandomSearch(metric=sentiment_matches, **search_settings)

# Register the optimizer with LangWatch so the training session can be
# followed live from its dashboard.
langwatch.dspy.init(experiment="product_sentiment_polarity_openai", optimizer=optimizer)

# Optimize a fresh, un-tuned program against the training set.
student = ProductSentimentPolarity()
optimized_evaluator = optimizer.compile(student, trainset=trainset)

Experiment initialized, run_id: dashing-eccentric-chowchow
Open http://localhost:3000/inbox-narrator/experiments/product_sentiment_polarity_v1?runIds=dashing-eccentric-chowchow to track your DSPy training session live



Average Metric: 24 / 50  (48.0): 100%|██████████| 50/50 [00:11<00:00,  4.50it/s]
Average Metric: 33 / 50  (66.0): 100%|██████████| 50/50 [00:10<00:00,  4.56it/s]
 54%|█████▍    | 27/50 [00:04<00:03,  6.20it/s]
Average Metric: 35 / 50  (70.0): 100%|██████████| 50/50 [00:11<00:00,  4.20it/s]
 32%|███▏      | 16/50 [00:23<00:50,  1.47s/it]
Average Metric: 37 / 50  (74.0): 100%|██████████| 50/50 [00:13<00:00,  3.64it/s]
 14%|█▍        | 7/50 [00:09<00:55,  1.29s/it]
Average Metric: 34 / 50  (68.0): 100%|██████████| 50/50 [00:13<00:00,  3.83it/s]
  6%|▌         | 3/50 [00:04<01:11,  1.52s/it]
Average Metric: 43 / 50  (86.0): 100%|██████████| 50/50 [00:15<00:00,  3.33it/s] 
 30%|███       | 15/50 [00:22<00:53,  1.53s/it]
Average Metric: 33 / 50  (66.0): 100%|██████████| 50/50 [00:15<00:00,  3.25it/s]
 30%|███       | 15/50 [00:19<00:44,  1.27s/it]
Average Metric: 33 / 50  (66.0): 100%|██████████| 50/50 [00:11<00:00,  4.43it/s]
 28%|██▊       | 14/50 [00:17<00:46,  1.28s/it]
Average Metric: 3

In [80]:
from dspy.evaluate import Evaluate

# Both evaluators share every setting except the dataset they score against.
evaluator_settings = dict(
    metric=sentiment_matches,
    num_threads=4,
    display_progress=True,
    display_table=0,
)
evaluate_train = Evaluate(devset=trainset, **evaluator_settings)
evaluate_dev = Evaluate(devset=devset, **evaluator_settings)

# Score the optimized program on the training split and on the held-out split.
train_score = evaluate_train(optimized_evaluator)
dev_score = evaluate_dev(optimized_evaluator)
train_score, dev_score

Average Metric: 43 / 50  (86.0): 100%|██████████| 50/50 [00:00<00:00, 1206.51it/s]
Average Metric: 36 / 41  (87.8): 100%|██████████| 41/41 [00:00<00:00, 3561.78it/s] 


(86.0, 87.8)

In [84]:
import os
# Persist the optimized program. The filename records the experiment, model,
# LangWatch run id and both scores so saved runs can be told apart.
# `exist_ok=True` creates the folder only when needed, without a separate
# existence check.
os.makedirs('./results', exist_ok=True)
optimized_evaluator.save(f"./results/{langwatch.dspy.experiment_slug}_{model}_{langwatch.dspy.run_id}_train_{train_score}_dev_{dev_score}.json")

In [None]:
from tqdm import tqdm
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# optimized_evaluator = ProductSentimentPolarity()
# optimized_evaluator.load("./product_sentiment_polarity_anthropic_no_reasoning_v1_80.json")
# optimized_evaluator(output=dev_example.output)

# Re-run the dev-set evaluation, this time keeping the per-example outputs so
# labels and predictions can be compared class by class.
evaluate = Evaluate(devset=devset, metric=sentiment_matches, num_threads=4, display_progress=True, display_table=0, return_outputs=True)
score, results = evaluate(optimized_evaluator) # type: ignore

# Ordinal encoding of the four classes, from most negative to most positive.
sentiment_map = {
  "very_positive": 3,
  "subtly_positive": 2,
  "subtly_negative": 1,
  "very_negative": 0
}

# Class names ordered by their encoded value; used both to pin the matrix
# layout and to label the heatmap axes.
sentiment_names = sorted(sentiment_map, key=sentiment_map.get)
sentiment_codes = [sentiment_map[name] for name in sentiment_names]

# Each result holds the labelled example at index 0 and the prediction at
# index 1. A predicted label outside the four classes raises KeyError here.
y = [sentiment_map[result[0].sentiment] for result in results]
y_pred = [sentiment_map[result[1].sentiment] for result in results]

# Passing `labels` keeps the matrix 4x4 with a fixed row/column order even when
# a class never occurs among the dev labels or the predictions.
conf_matrix = confusion_matrix(y, y_pred, labels=sentiment_codes)

# Plotting confusion matrices
fig, ax = plt.subplots(1, 1, figsize=(8, 6))

sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=sentiment_names,
    yticklabels=sentiment_names,
    ax=ax,
)
ax.set_title('Confusion Matrix: Actual vs Predicted')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')

plt.tight_layout()
