# BIDV Prompt Optimization

## a. Import packages

In [350]:
import datetime
import functools
import os
import pathlib
import random
import re

# %pip install click
import click

import dspy
import pandas as pd
import py3langid as langid
from dspy.datasets.dataset import Dataset
from dspy.evaluate.auto_evaluation import SemanticRecallPrecision, f1_score, SemanticF1
from dspy import Evaluate

from dspy.teleprompt import MIPROv2
from pydantic import BaseModel, Field

In [351]:
from dotenv import load_dotenv
load_dotenv()  # This loads the variables from .env

True

## b. Prepare Dataset and Metrics

In [352]:
faq_data = pd.read_csv('/Users/nami-macos/Documents/GitHub/BIDV-DSPy-Optimization/Testset FAQ_multiturn_FAQ_Chain - FAQ Chain 1706.csv')
key_columns = faq_data[['question', 'context_extracted', 'chat_history_extracted', 'bank_response']]
# Display the first few rows of these columns
print("\nSample data:")
display(key_columns.head())


Sample data:


Unnamed: 0,question,context_extracted,chat_history_extracted,bank_response
0,Chức năng của thẻ nội địa phi vật lí và thẻ nội địa vật lí đều như...,"{\n ""question"": ""BIDV đang triển khai những sản phẩm thẻ nào?"",...","Customer: thẻ atm có những tính năng gì thế em\nBIDV: Dạ, thẻ ATM ...","Dạ, thẻ ghi nợ nội địa BIDV Smart có hai loại: thẻ vật lý và thẻ p..."
1,thẻ atm có những tính năng gì thế em,"{\n ""question"": ""BIDV đang triển khai những sản phẩm thẻ nào?"",...",,"Dạ, thẻ ATM BIDV Smart có những tính năng sau:\n- Rút tiền và than..."
2,"em đăng ký làm thẻ atm từ hqua ,kb bgio mới có thẻ ạ","{\n ""question"": ""Dịch vụ chọn số thẻ đẹp thẻ ghi nợ nội địa...",,"Dạ, thời gian phát hành thẻ ATM của BIDV là tối đa 6 ngày làm việc..."
3,mình muốn làm nhiều thẻ luôn đc không hay chỉ ddc làm 1 cái thôi,"{\n ""question"": ""Các loại thẻ tín dụng quốc tế đang phát hành"",...","Customer: cần cbi giấy tờ gì để làm thẻ thế\nBIDV: Dạ, để mở thẻ t...","Dạ, mỗi cá nhân có thể được phát hành 01 thẻ chính cho mỗi mã sản ..."
4,Số tiền thanh toán tối thiểu của thẻ tín dụng,"{\n ""Description"": ""Gồm LÃI SUẤT và các loại phí khi sử dụng TH...",,Chủ thẻ chi tiêu trước – trả tiền sau theo thời hạn thông báo của ...


In [353]:
class FAQDataset(Dataset):
    def __init__(self, data_path, selected_output_field=None):
        super().__init__(input_keys=["question", "context_extracted", "chat_history_extracted"])
        
        if data_path.endswith(".csv"):
            df = pd.read_csv(data_path)
        elif data_path.endswith(".xlsx"):
            df = pd.read_excel(data_path)
        else:
            raise ValueError("Unsupported file format.")
            
        if selected_output_field is not None:
            cols_to_keep = ["question", "context_extracted", "chat_history_extracted", selected_output_field]
            df = df[cols_to_keep]
            
        dataset = []
        for _, row in df.iterrows():
            row_dict = row.to_dict()
            
            # Clean and sanitize context_extracted
            if "context_extracted" in row_dict and isinstance(row_dict["context_extracted"], str):
                # Strip any invalid JSON prefixes/suffixes if needed
                row_dict["context_extracted"] = row_dict["context_extracted"].replace('\\n', '\n').replace('\\"', '"')

            example = dspy.Example()
            
            # Add fields to the example            
            if "context_extracted" in row_dict:
                example.context_extracted = row_dict["context_extracted"]
            
            if "chat_history_extracted" in row_dict:
                example.chat_history_extracted = row_dict["chat_history_extracted"]

            # Add default empty string for chat_history if missing or None
            if "chat_history_extracted" not in row_dict or row_dict["chat_history_extracted"] is None:
                row_dict["chat_history_extracted"] = ""
            
            if "question" in row_dict:
                example.question = row_dict["question"]

            # Add the output field if present
            if selected_output_field and selected_output_field in row_dict:
                example[selected_output_field] = row_dict[selected_output_field]
            
            # Use list form of with_inputs instead of keyword arguments
            example = example.with_inputs("context_extracted", "chat_history_extracted", "question")
                
            dataset.append(example)
            
        length = len(dataset)
        rng = random.Random(42)
        rng.shuffle(dataset)
        
        # Updated split ratio: 80% train, 20% dev
        self._train = dataset[: int(0.2 * length)]
        self._dev = dataset[int(0.2 * length) :]
        
    @property
    def train(self):
        return self._train
        
    @property
    def dev(self):
        return self._dev
    

# Updated FAQScore class 
class FAQScore:
    def __init__(self, selected_output_field: str, semantic: bool = False):
        super().__init__()

        self.semantic = semantic
        if semantic:
            self.evaluate = dspy.ChainOfThought(SemanticRecallPrecision)

        self.selected_output_field = selected_output_field

    def __call__(self, example: dspy.Example, pred: dspy.Prediction, trace=None):
        try:
            # Get values and ensure they're strings
            target = example.toDict()[self.selected_output_field]
            prediction = pred.toDict()[self.selected_output_field]

            # Convert to strings if needed
            if not isinstance(target, str):
                target = str(target)
            if not isinstance(prediction, str):
                prediction = str(prediction)

            target = target.strip().lower()
            prediction = prediction.strip().lower()

            if self.semantic:
                # Convert all inputs to strings explicitly to avoid type errors
                context = str(example.context_extracted) if hasattr(example, 'context_extracted') else ""
                history = str(example.chat_history_extracted) if hasattr(example, 'chat_history_extracted') else ""
                question = str(example.question) if hasattr(example, 'question') else ""
                
                # Build the question string with explicit string concatenation
                question_text = "\n## Context:\n" + context + "\n## Chat History:\n" + history + "\n## Question:\n" + question + "\n## Answer:\n" + self.selected_output_field + ": "
                
                scores = self.evaluate(
                    question=question_text,
                    ground_truth=target,       # Changed from bank_response to ground_truth
                    system_response=prediction, 
                )                
                return f1_score(scores.precision, scores.recall)
            else:
                return float(target == prediction)
        except Exception as e:
            print(f"Error in FAQScore: {str(e)}")
            return 0.0

In [None]:
# Get the API key from environment variables
api_key=os.environ.get("OPENAI_API_KEY")

lm = dspy.LM(
    model="gpt-4o-mini", 
    api_key=api_key,
    max_tokens=12000, 
    cache=False, 
    num_retries=3,
    # temperature=0.0000001,
    request_timeout=120  # Add longer timeout
    )
dspy.configure(lm=lm)

## c. Define signature 

In [355]:
# Define the signature for FAQ prompt optimization
class FAQProgSig(dspy.Signature):
    """# <Role>
    An expert Vietnamese knowledge assistant from BIDV bank, named BIDV, tasked with answering customer inquiries about BIDV and its services. **ALWAYS** follow the 2 steps below:

    1. **Internal Reasoning** (in `<think>` tags):
    - Break down the conversation into atomic steps without skipping stages.
    - Assess the current workflow state of the process.
    - **Verify technological context**: Identify specific technological requirements like NFC or biometric needs related to the inquiry and ensure alignment with BIDV's offered services and apps, including verifying the app version and service status of the BIDV SmartBanking app, before proceeding.
    - **Confirm loan type**: Ensure that the specific type of loan being inquired about is confirmed, as BIDV offers various loan types with different conditions.
    - **Handle multi-part inquiries**: For inquiries involving multiple components (e.g., recovering both username and password), ensure each element is identified and addressed thoroughly.
    - **Emphasize Verification**: Prioritize the verification of every piece of information against BIDV's official documents to avoid assumptions and inaccuracies.
    **Determine card sales eligibility**  
    Set `can_suggest_card = true` only if **all** of the following conditions are met:
    - Inquiry mentions card-related topics: phí thường niên, hoàn tiền, mua sắm online, tính năng thẻ, rút tiền ATM, mở thẻ, thẻ tín dụng quốc tế, thẻ ghi nợ, thanh toán dư nợ thẻ tín dụng, loại thẻ, tham khảo, thông tin thẻ, tiện ích thẻ, tính năng thẻ, etc.
    - Tone is neutral, polite, or curious
    - Customer has **not** expressed disinterest or refused to provide info
    - Conversation is **not** about complaints or service errors

    2. **Final Answer** (after `</think>`):
    - Follow <Response rules> to provide only the final response in Vietnamese without repeating reasoning.
    - **Mandatory:** Confirm all responses are supported by provided documents to avoid unverified claims.
    - Always maintain a polite and respectful tone and make sure to give a preface ("Dạ ...") before answering, repeat the scope of info you are giving before answering, and follow up after responding.
    - End with a **Follow-up Reflective Prompt**:
    - If `can_suggest_card == true`:
        > *"Dạ, hiện BIDV đang có nhiều sản phẩm thẻ với các tính năng hoàn tiền, tích điểm mà còn được miễn phí thường niên rất hấp dẫn và thủ tục mở thẻ vô cùng đơn giản.
        Anh/chị có muốn tham khảo thêm thông tin không ạ?"*
    - If `can_suggest_card == false`:
        > Default to encourage sharing of any additional related intent. Gently suggest common missed areas if relevant (e.g., card type, app usage, service fees). based on 
        relevant chunk. Ex: *"Anh/chị có cần thêm thông tin về loại thẻ, ứng dụng hay biểu phí không ạ?"*

    # <Workflow>

    1. **Understand and respond to the customer inquiry**:
    - Carefully read the inquiry to identify the **main topic** and **customer intent**.
    - If the inquiry includes specific terms (e.g. card type, app feature, loan type, phí thường niên),  
    ➤ Proceed to generate an **expansive answer immediately** using all available information.
    - For multi-part inquiries, break them down and answer each part fully and systematically.
    - If the customer's wording is **partially vague**, try to provide general but helpful information on possible scenarios.  
    ➤ Follow this with a **polite clarifying question** to guide them to specify more details if needed.
    - If the intent or product/service is **too unclear** to provide any meaningful response:
    ➤ Ask a **short, polite clarifying question** to identify the missing info (e.g., “Anh/chị đang hỏi về loại thẻ nào ạ?”).
    - For **complex topics** (e.g. loan types or SmartBanking errors), provide available guidance first,  
    ➤ Then ask for missing inputs only if required to complete the answer.
    - **Clarification loop cap**: If 2 clarifications are ignored, politely direct the customer to **hotline 1900 9247** or the **nearest branch**.

    2. **Verify and support with specific details**:
    - When the customer specifies a card type, feature, service, loan type, or tech detail (e.g. SmartBanking, NFC),  
    ➤ Use that info to answer fully using verified documents.
    - If some details are **missing or unclear**, still provide a helpful general answer first,  
    ➤ Then ask a polite clarifying question if needed to refine the response.
    - Always check BIDVs available tools (e.g. SmartBanking capabilities, latest app version) to ensure accuracy for tech-related inquiries.
    - For issues involving technology:
    ➤ Offer troubleshooting steps (e.g. internet connection, app version, device restart).
    - **Document verification is mandatory**: Only share facts from official BIDV documents or provided context.
    ➤ If not available, politely direct the customer to hotline 1900 9247 or a nearby branch.

    3. **Explain automatic vs. manual processes clearly**:
    - If the topic involves systems like auto-renewals, automatic debits, or statement generation:
    ➤ State what is done automatically and what requires customer action.
    ➤ Use clear “nếu/thì” language to help customers understand exceptions and edge cases.

    4. Review the provided documents:
    - **Mandatory:** Thoroughly review <Additional banking knowledge> to address the inquiry using existing data before suggesting external contact.
    - Focus on features, terms, or services directly related to the inquiry and ensure that information is drawn from the most relevant sources to prevent false assumptions or misguidance.
    - Refer to the documents to understand the banking perspective.
    - If after review, no relevant document is found:
    ➤ Inform the customer politely and suggest the hotline or branch as fallback.
    ➤ Never speculate; clearly mark unavailable data.

    5. Provide an exact answer:
    - Do not make assumptions or fabricate information not present in the documents.
    - Respond clearly and concisely to all aspects of the inquiry in Vietnamese.
    - Use the Expansive-Answer Mode as described in Response Rules>

    6. **Proactively Uncover Sub-Intents**:
    - After each response, include a polite follow-up prompt to check whether the customer has additional, related needs or if their inquiry covers multiple concerns.
    - Your follow-up must be **context-aware**, ensures the customer feels fully supported and prevents missed follow-up service needs. Tailor the question to the topic just discussed.

    #### When to include a Card Sales Invitation in the Follow-up

    Include a follow-up that **invites the customer to explore card products** **only if most the following are true**:

    - The inquiry is related to **card benefits**, **fees**, **usage**, or **features** — e.g., phrases such as:
    - *“phí thường niên”*, *“hoàn tiền”*, *“mua sắm online”*, *“tính năng thẻ”*, *“rút tiền ATM”*
    - The customer has not expressed disinterest or refused to share information
    - The customers tone is polite, curious, or neutral
    - The conversation is not about complaints or error handling

    If these conditions are met, the bot should include a soft, friendly card suggestion.
    Ex: > *"Dạ BIDV đang có một số sản phẩm thẻ hoàn tiền và miễn phí thường niên rất hấp dẫn. Em có thể giới thiệu thêm nếu anh/chị quan tâm ạ?"*
        + If the user asked about card opening eligibility, ask if what card they want to open.
        + If they asked about loan eligibility, ask if they also need help with required documents or processing time.
    ---

    # <Response Rules>
    ## 1. Response:
    - Language: always Vietnamese.
    - Naming: Address the customer as "Anh/Chị" (non-gendered) and refer to yourself as "Em."
    - Style:
    + Use a conversational dialogue style with numeric and short bullet-point responses (up to 6 points).
    + List key content numerically first.
    + Follow markdown conventions.
    + **Emphasize key details using **bold text** or numbering.**
    + Ensure all instructional responses adhere to a numbered list format for clarity.
    + Adapt response structure to be conversational, using numeric bullet points where possible.
    - <Expansive-Answer Mode>:
    + Preserve full detail.
    + Merge overlapping chunks without loss.
    + Exceed token limits if needed.
    + Focus on completeness over brevity.  

    ## 2. Content:
    - Use only information from the provided documents; do not create information.
    - Highlight BIDV's offerings if asked about other banks, and advise checking the other bank’s resources.
    - Do not direct customers to contact BIDV for more information unless all document data is exhausted and no answer is available. If so, provide a standardized message directing to the BIDV helpline or website.

    # <Documents Rules>
    - Response full original detail on fees to ensure thoroughness and to cover all possible cases. Should also include VAT where applicable.
    - Check carefully for discontinued card types; always inform the customer if applicable.

    # <Additional banking knowledge>
    - An ATM card is a domestic debit card (thẻ ghi nợ nội địa), not an international debit card.
    - Visa Cashback card details can serve as reference when other cash-back cards lack specifics.
    - **Lock card** = temporary pause; **Cancel card** = permanent closure with final balance settlement.
    - **Statement date (ngày sao kê)** = the day each month when the credit-card balance is recorded and a bill is generated.
    - **Payment due date (ngày đến hạn)** = the last day to pay at least the minimum amount; usually 15 days after the statement date.
    - **Grace period** applies only when the full statement balance is paid by the due date; cash advances never enjoy a grace period.
    - **Minimum payment** = fixed percentage of the statement balance (or a floor amount) that must be paid to keep the account current.
    - **Cash advance** = any cash withdrawal from a credit line (ATM or POS); attracts interest from the transaction date and a separate fee.
    - **Purchase transaction** = retail or e-commerce payment; interest-free if settled in full within the grace period.
    - **Foreign-currency transaction** = any card charge cleared in non-VND; incurs a conversion fee (phí chuyển đổi ngoại tệ) plus network FX rate.
    - **Internal transfer (nội bộ)** = BIDV→BIDV; **Interbank transfer (liên ngân hàng)** = BIDV→other banks; fees and limits differ.
    - **Contactless (NFC/payWave/payPass)** requires (i) card with “◔” logo, (ii) NFC-enabled POS, and (iii) user phone-wallet if no plastic.
    - **Priority / Premier / KHCC** = customer segment granted higher limits and selected fee waivers, but still pays some service charges.
    - **SmartBanking** = BIDVs retail mobile app; functions include transfers, bill pay, card lock/unlock, and e-statement download.
    - **e-Statement** = PDF billing file sent by email or viewed in SmartBanking; printing at a branch is a separate paid service.
    - **Auto-debit** = automatic deduction of credit-card minimum/full payment from a linked BIDV account on due date.
    - **Card replacement** (làm lại thẻ) differs from **re-issuance/renewal** (phát hành lại khi hết hạn); fees and card numbers may change.
    - **Card lock (khóa thẻ)** = reversible security block; **Card cancel/close (hủy thẻ)** = permanent, requires balance settlement and destroys credit line.
    - **Annual fee** = recurring charge for holding a card; waiver rules depend on annual spending thresholds or promotional bundles.
    - **VAT on bank fees** in Vietnam is 10 %; unless a fee is explicitly stated “đã gồm VAT”, assume VAT is added separately.
    - **“Miễn phí” vs. “0 đồng”** in documents both mean the customer pays nothing, even though the bank still reports VAT = 0.
    - **Discontinued product** = card or service no longer issued; existing holders keep benefits until normal expiry unless informed otherwise.
    - **Complaint (khiếu nại)** = customer claim of error or unauthorised charge; triggers a regulated investigation SLA and may require dispute form.


### Initialization
As <Role>, follow <Response Rules> and <Documents Rules> STRICTLY and engage the customer according to <Workflow>. Use conditional flags like `can_suggest_card` for consistent logic and better sales behavior. Implement clear step-by-step guidelines for processes involving conditional actions to ensure logical consistency. Ensure that all responses are verified against provided documentation and use fallback recommendations to official BIDV resources when needed. 
    """

    context_extracted: str = dspy.InputField()
    chat_history_extracted: str = dspy.InputField()
    question: str = dspy.InputField()

    
    bank_response: str = dspy.OutputField()


## d. Teacher and Student Model

In [356]:
class OptimizationConfig:
    def __init__(self, 
                 student_model="gpt-4o-mini", 
                 teacher_model="gpt-4o", 
                 cache=True,
                 cache_in_memory=False,
                 num_retries=3,
                 max_tokens=12000,
                 max_bootstrapped_demos=0,
                 max_labeled_demos=0,
                 output_path="faq_optimized_prompt.json"):
        self.student_model = student_model
        self.teacher_model = teacher_model
        self.cache = cache
        self.cache_in_memory = cache_in_memory
        self.num_retries = num_retries
        self.max_tokens = max_tokens
        self.max_bootstrapped_demos = max_bootstrapped_demos
        self.max_labeled_demos = max_labeled_demos
        self.output_path = output_path

In [357]:
class FAQPromptOptimizer:
    def __init__(self, config=None):
        """Initialize the optimizer with configuration."""
        self._config = config or OptimizationConfig()
        self._module = dspy.ChainOfThought(FAQProgSig)
        self._optimized_save_path = self._config.output_path
        self._setup_models()
        
    def _setup_models(self):
        """Set up student (inference) and teacher (optimization) models."""
        self._student_lm = dspy.LM(
            model=self._config.student_model,
            api_key=os.environ.get("OPENAI_API_KEY"),
            cache=self._config.cache,
            cache_in_memory=self._config.cache_in_memory,
            num_retries=self._config.num_retries,
            max_tokens=self._config.max_tokens,
            temperature=0.0000001,
            request_timeout=120,
        )

        self._teacher_lm = dspy.LM(
            model=self._config.teacher_model,
            api_key=os.environ.get("OPENAI_API_KEY"),
            cache=self._config.cache,
            cache_in_memory=self._config.cache_in_memory,
            num_retries=self._config.num_retries + 2,  # Higher retry count for teacher
            max_tokens=self._config.max_tokens,
            temperature=0.0000001,
            request_timeout=180,  # Longer timeout for teacher
        )
    
    def optimize(self, dataset, metric):
        """Run the MIPROv2 optimization process."""
        # Configure the default LM to be the student model
        dspy.configure(lm=self._student_lm)
        
        # Set up optimizer with both models
        optimizer = MIPROv2(
            metric=metric,
            max_bootstrapped_demos=self._config.max_bootstrapped_demos,
            max_labeled_demos=self._config.max_labeled_demos,
            auto='medium',
            prompt_model=self._teacher_lm,  # Uses teacher model for prompt optimization
            task_model=self._student_lm,    # Uses student model for inference
            teacher_settings=dict(lm=self._teacher_lm),
        )
        
        print("Starting optimization...")
        optimized_program = optimizer.compile(
            self._module,
            trainset=dataset.train,
            valset=dataset.dev,
            requires_permission_to_run=False,
        )
        
        # Save the optimized prompt
        optimized_program.save(self._optimized_save_path)
        print(f"Optimization complete! Saved to {self._optimized_save_path}")
        
        return optimized_program

## Evaluate the baseline model performance


In [387]:
from dspy import Evaluate

def evaluate_model(module, dataset, metric, sample_size=None, display_progress=True, display_table=3):
    """
    Evaluate a model using dspy.Evaluate for better performance and reporting.
    
    Args:
        module: The DSPy program to evaluate
        dataset: The dataset containing examples
        metric: The scoring metric to use
        sample_size: Optional number of examples to evaluate (None for all)
        display_progress: Whether to show a progress bar
        display_table: How many example results to display in detail (False for none)
    
    Returns:
        float: Average evaluation score
    """
    # Take a subset of examples if needed
    eval_examples = dataset.dev if sample_size is None else dataset.dev[:sample_size]
    
    print(f"Evaluating model on {len(eval_examples)} examples...")
    
    # Create the evaluator
    evaluator = Evaluate(
        devset=eval_examples,
        metric=metric,
        display_progress=display_progress,
        display_table=display_table,
        return_all_scores=True,
        return_outputs=True,     
        num_threads=4
    )
    
    # Run evaluation
    evaluation_result = evaluator(module)
    
    # Handle tuple return types from Evaluate
    if hasattr(evaluation_result, 'score'):
        avg_score = evaluation_result.score
    elif isinstance(evaluation_result, tuple) and len(evaluation_result) >= 1:
        # First element of tuple is the score
        avg_score = evaluation_result[0]
    else:
        # Fall back to calculating average manually if needed
        scores = [metric(ex, pred) for ex, pred in zip(eval_examples, evaluation_result.outputs)] if hasattr(evaluation_result, 'outputs') else []
        avg_score = sum(scores) / len(scores) if scores else 0.0
    print(f"\nModel average score: {avg_score:.4f} on {len(eval_examples)} samples")
    
    return avg_score

# Use the updated evaluation function for the baseline model
base_module = dspy.ChainOfThought(FAQProgSig)

prompt_of_field_to_optimize = "bank_response"
semantic = True
dataset = FAQDataset(
    data_path="/Users/nami-macos/Documents/GitHub/BIDV-DSPy-Optimization/Testset FAQ_multiturn_FAQ_Chain - FAQ Chain 1706.csv",
    selected_output_field=prompt_of_field_to_optimize,
)
metric = FAQScore(
    selected_output_field=prompt_of_field_to_optimize, 
    semantic=semantic
)

# Evaluate the baseline model
baseline_score = evaluate_model(
    base_module, 
    dataset, 
    metric, 
    sample_size=None,  # Use all examples
    display_progress=True,
    display_table=3     # Show 3 examples in detail
)

Evaluating model on 152 examples...
Average Metric: 110.68 / 152 (72.8%): 100%|██████████| 152/152 [00:00<00:00, 196.48it/s]

2025/06/18 00:31:05 INFO dspy.evaluate.evaluate: Average Metric: 110.68070468585765 / 152 (72.8%)





Unnamed: 0,context_extracted,chat_history_extracted,question,example_bank_response,reasoning,pred_bank_response,FAQScore
0,"{ ""title"": ""Giảm 20% mua sắm tại Aeon Mall cùng thẻ BIDV JCB"", ""co...","Customer: còn ctkm nào cho thẻ JCB không BIDV: Dạ, hiện tại BIDV đ...",AEON Hải Phòng không đc áp dụng à,Chương trình ưu đãi giảm 20% khi mua sắm tại Aeon Mall áp dụng cho...,<think> - The customer is inquiring about the applicability of the...,"Dạ, chương trình khuyến mãi giảm 20% khi mua sắm tại Aeon Mall áp ...",✔️ [1.000]
1,"{ ""Description"": ""Gồm các loại phí khi sử dụng thẻ ghi nợ quốc tế ...",Customer: Sao tài khoản mình bị trừ phí thường niên thẻ tháng 12. ...,Phí rút tiền thẻ Mastercard tại nước ngoài,- Phí rút tiền mặt tại ATM/POS nước ngoài cho thẻ ghi nợ quốc tế M...,<think> - The inquiry is about the withdrawal fee for Mastercard a...,"Dạ, về phí rút tiền mặt tại ATM nước ngoài cho thẻ Mastercard của ...",✔️ [0.667]
2,"{ ""Description"": ""Gồm LÃI SUẤT và các loại phí khi sử dụng THẺ TI...","Customer: tôi mới mở thẻ JCB, cần chi tiêu bao nhiêu thì được miễn...","tôi mới mở thẻ JCB, cần chi tiêu bao nhiêu thì được miễn phí thườn...",Doanh số chi tiêu để miễn PTN năm đầu Thẻ JCB Ultimate Từ 10.000.0...,<think> - The customer is inquiring about the annual fee waiver co...,"Dạ, hiện tại em không có thông tin cụ thể về điều kiện miễn phí th...",



Model average score: 72.8200 on 152 samples


In [388]:
eval_examples = dataset.dev

# Create the evaluator
evaluator = Evaluate(
    devset=eval_examples,
    metric=metric,
    display_progress=True,
    display_table=True,
    return_all_scores=True,
    return_outputs=True,     
    num_threads=4
)
# Run evaluation of the baseline model
evaluation_result = evaluator(base_module)


Average Metric: 110.68 / 152 (72.8%): 100%|██████████| 152/152 [00:00<00:00, 2813.09it/s]

2025/06/18 00:31:09 INFO dspy.evaluate.evaluate: Average Metric: 110.68070468585765 / 152 (72.8%)





Unnamed: 0,context_extracted,chat_history_extracted,question,example_bank_response,reasoning,pred_bank_response,FAQScore
0,"{ ""title"": ""Giảm 20% mua sắm tại Aeon Mall cùng thẻ BIDV JCB"", ""co...","Customer: còn ctkm nào cho thẻ JCB không BIDV: Dạ, hiện tại BIDV đ...",AEON Hải Phòng không đc áp dụng à,Chương trình ưu đãi giảm 20% khi mua sắm tại Aeon Mall áp dụng cho...,<think> - The customer is inquiring about the applicability of the...,"Dạ, chương trình khuyến mãi giảm 20% khi mua sắm tại Aeon Mall áp ...",✔️ [1.000]
1,"{ ""Description"": ""Gồm các loại phí khi sử dụng thẻ ghi nợ quốc tế ...",Customer: Sao tài khoản mình bị trừ phí thường niên thẻ tháng 12. ...,Phí rút tiền thẻ Mastercard tại nước ngoài,- Phí rút tiền mặt tại ATM/POS nước ngoài cho thẻ ghi nợ quốc tế M...,<think> - The inquiry is about the withdrawal fee for Mastercard a...,"Dạ, về phí rút tiền mặt tại ATM nước ngoài cho thẻ Mastercard của ...",✔️ [0.667]
2,"{ ""Description"": ""Gồm LÃI SUẤT và các loại phí khi sử dụng THẺ TI...","Customer: tôi mới mở thẻ JCB, cần chi tiêu bao nhiêu thì được miễn...","tôi mới mở thẻ JCB, cần chi tiêu bao nhiêu thì được miễn phí thườn...",Doanh số chi tiêu để miễn PTN năm đầu Thẻ JCB Ultimate Từ 10.000.0...,<think> - The customer is inquiring about the annual fee waiver co...,"Dạ, hiện tại em không có thông tin cụ thể về điều kiện miễn phí th...",
3,"{ ""title"": ""Tặng Voucher 100.000 VND khi mua sắm tại Lotte Departm...","Customer: còn ctkm nào cho thẻ JCB không BIDV: Dạ, hiện tại BIDV đ...",vậy là hết hạn rồi à,"Dạ, em xin lỗi vì hiện tại em không có thông tin cập nhật về chươn...",<think> - The customer is inquiring about the expiration of a prom...,"Dạ, chương trình ưu đãi tặng voucher mua sắm trị giá 100.000 VND k...",✔️ [0.500]
4,"{ ""question"": ""Thanh toán thẻ qua Mã phản hồi nhanh (Quick Respons...",Customer: thẻ nội địa với thẻ quốc tế thẻ nào phí thường niên thấp...,BIDV có dịch vụ thanh toán bằng QR code qua thẻ tín dụng không?,"Dạ, BIDV hiện chưa hỗ trợ dịch vụ thanh toán bằng mã QR code qua t...","<think> - Đầu tiên, tôi cần xác định xem BIDV có cung cấp dịch vụ ...","Dạ, hiện tại BIDV không hỗ trợ dịch vụ thanh toán bằng QR code cho...",✔️ [1.000]
...,...,...,...,...,...,...,...
147,"{ ""question"": ""Đơn vị chấp nhận thẻ (viết tắt là ĐVCNT), đơn vị ch...",Customer: Sao tài khoản mình bị trừ phí thường niên thẻ tháng 12. ...,alo,"Xin chào, em có thể giúp gì cho Anh/Chị ạ?","<think> - The customer has simply said ""alo,"" which does not provi...","Dạ, chào Anh/Chị! Em có thể hỗ trợ gì cho Anh/Chị hôm nay ạ? Nếu c...",✔️ [0.750]
148,"{ ""Description"": ""Gồm các loại phí khi sử dụng dịch vụ ngân hàng đ...",Customer: Sao tài khoản mình bị trừ phí thường niên thẻ tháng 12. ...,Cho mình hỏi đăng ký dịch vụ OTT của ngân hàng BIDV hàng tháng có ...,Dịch vụ OTT (nhắn tin qua Smartbanking) của BIDV là miễn phí hàng ...,"<think> - Đầu tiên, tôi cần xác định thông tin về dịch vụ OTT (nhắ...","Dạ, về dịch vụ OTT (nhắn tin qua SmartBanking) của ngân hàng BIDV,...",✔️ [1.000]
149,"{ ""question"": ""Khách hàng mục tiêu, Điều kiện mở Thẻ BIDV JC...",,Công dụng của thẻ jcb well-being,"Dạ, thẻ BIDV JCB Well-being có các công dụng và ưu đãi như sau: Kh...",<think> - The inquiry is about the functions and benefits of the B...,"Dạ, thẻ BIDV JCB Well-being có nhiều công dụng và lợi ích hấp dẫn ...",✔️ [0.900]
150,"{ ""Description"": ""Gồm các loại phí khi sử dụng thẻ ghi nợ quốc tế ...",[{'content': 'Tôi dùng thẻ JCB của MB bank rút tiền trên cây bidv ...,dạ cho mình hỏi mình muốn làm thẻ gnqt BIDV để đăng kí paypal thì ...,Thẻ ghi nợ quốc tế có phí phát hành là 50.000 đồng đối với thẻ Rea...,<think> Current state: + Discussed card name: Thẻ ghi nợ quốc tế (...,"Dạ, để mở thẻ ghi nợ quốc tế của BIDV, anh/chị sẽ không phải trả p...",✔️ [1.000]


In [None]:
# rows = evaluation_result[1]  # List of rows
# df = pd.DataFrame(rows)      # Convert list to DataFrame
# df.to_excel("evaluation_result.xlsx", index=False)


## Auto Optimization

In [384]:
# Set up the dataset and metric
prompt_of_field_to_optimize = "bank_response"
semantic = True

dataset = FAQDataset(
    data_path="/Users/nami-macos/Documents/GitHub/BIDV-DSPy-Optimization/Testset FAQ_multiturn_FAQ_Chain - FAQ Chain 1706.csv",
    selected_output_field=prompt_of_field_to_optimize,
)
metric = FAQScore(
    selected_output_field=prompt_of_field_to_optimize, 
    semantic=semantic
)

# Create and run the optimizer
config = OptimizationConfig(
    student_model="gpt-4o-mini",
    teacher_model="gpt-4o",
    max_bootstrapped_demos=0,
    max_labeled_demos=0,
    output_path="faq_optimized_prompt.json"
)

optimizer = FAQPromptOptimizer(config)
optimized_program = optimizer.optimize(dataset, metric)

2025/06/17 23:50:40 INFO dspy.teleprompt.mipro_optimizer_v2: 
RUNNING WITH THE FOLLOWING MEDIUM AUTO RUN SETTINGS:
num_trials: 18
minibatch: True
num_fewshot_candidates: 12
num_instruct_candidates: 12
valset size: 152

2025/06/17 23:50:40 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2025/06/17 23:50:40 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used for informing instruction proposal.

2025/06/17 23:50:40 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=12 sets of demonstrations...


Starting optimization...
Bootstrapping set 1/12
Bootstrapping set 2/12


  3%|▎         | 1/37 [00:00<00:01, 30.22it/s]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 3/12


  8%|▊         | 3/37 [00:52<09:57, 17.58s/it]


Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Bootstrapping set 4/12


  8%|▊         | 3/37 [00:41<07:45, 13.69s/it]


Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Bootstrapping set 5/12


  8%|▊         | 3/37 [00:42<08:01, 14.16s/it]


Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Bootstrapping set 6/12


  5%|▌         | 2/37 [00:15<04:29,  7.69s/it]


Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 7/12


  5%|▌         | 2/37 [00:11<03:23,  5.82s/it]


Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 8/12


  3%|▎         | 1/37 [00:18<10:48, 18.02s/it]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 9/12


  8%|▊         | 3/37 [00:31<06:01, 10.62s/it]


Bootstrapped 2 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Bootstrapping set 10/12


  5%|▌         | 2/37 [00:15<04:23,  7.54s/it]


Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 11/12


 16%|█▌        | 6/37 [01:03<05:27, 10.57s/it]


Bootstrapped 3 full traces after 6 examples for up to 1 rounds, amounting to 6 attempts.
Bootstrapping set 12/12


  3%|▎         | 1/37 [00:00<00:00, 163.43it/s]
2025/06/17 23:55:32 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==
2025/06/17 23:55:32 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.


2025/06/17 23:56:21 INFO dspy.teleprompt.mipro_optimizer_v2: 
Proposing N=12 instructions...

2025/06/18 00:05:00 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:

2025/06/18 00:05:00 INFO dspy.teleprompt.mipro_optimizer_v2: 0: # <Role>
    An expert Vietnamese knowledge assistant from BIDV bank, named BIDV, tasked with answering customer inquiries about BIDV and its services. **ALWAYS** follow the 2 steps below:

    1. **Internal Reasoning** (in `<think>` tags):
    - Break down the conversation into atomic steps without skipping stages.
    - Assess the current workflow state of the process.
    - **Verify technological context**: Identify specific technological requirements like NFC or biometric needs related to the inquiry and ensure alignment with BIDV's offered services and apps, including verifying the app version and service status of the BIDV SmartBanking app, before proceeding.
    - **Confirm loan type**: Ensure that the specific type of loan 

Average Metric: 110.68 / 152 (72.8%): 100%|██████████| 152/152 [00:00<00:00, 810.48it/s]

2025/06/18 00:05:00 INFO dspy.evaluate.evaluate: Average Metric: 110.68070468585765 / 152 (72.8%)
2025/06/18 00:05:00 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 72.82

2025/06/18 00:05:00 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 2 / 23 - Minibatch ==



Average Metric: 28.07 / 35 (80.2%): 100%|██████████| 35/35 [00:46<00:00,  1.32s/it]

2025/06/18 00:05:47 INFO dspy.evaluate.evaluate: Average Metric: 28.074330093334613 / 35 (80.2%)
2025/06/18 00:05:47 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 80.21 on minibatch of size 35 with parameters ['Predictor 0: Instruction 11'].
2025/06/18 00:05:47 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.21]
2025/06/18 00:05:47 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [72.82]
2025/06/18 00:05:47 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 72.82


2025/06/18 00:05:47 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 3 / 23 - Minibatch ==



Average Metric: 23.50 / 35 (67.1%): 100%|██████████| 35/35 [00:00<00:00, 2584.61it/s]

2025/06/18 00:05:47 INFO dspy.evaluate.evaluate: Average Metric: 23.501592277454346 / 35 (67.1%)
2025/06/18 00:05:47 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 67.15 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0'].
2025/06/18 00:05:47 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.21, 67.15]
2025/06/18 00:05:47 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [72.82]
2025/06/18 00:05:47 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 72.82


2025/06/18 00:05:47 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 4 / 23 - Minibatch ==



Average Metric: 23.84 / 35 (68.1%): 100%|██████████| 35/35 [00:49<00:00,  1.41s/it]

2025/06/18 00:06:36 INFO dspy.evaluate.evaluate: Average Metric: 23.840337413264244 / 35 (68.1%)
2025/06/18 00:06:36 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 68.12 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2'].
2025/06/18 00:06:36 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.21, 67.15, 68.12]
2025/06/18 00:06:36 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [72.82]
2025/06/18 00:06:36 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 72.82


2025/06/18 00:06:36 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 5 / 23 - Minibatch ==



Average Metric: 26.21 / 35 (74.9%): 100%|██████████| 35/35 [00:00<00:00, 3354.68it/s]

2025/06/18 00:06:36 INFO dspy.evaluate.evaluate: Average Metric: 26.20533633033633 / 35 (74.9%)
2025/06/18 00:06:36 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 74.87 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0'].
2025/06/18 00:06:36 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.21, 67.15, 68.12, 74.87]
2025/06/18 00:06:36 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [72.82]
2025/06/18 00:06:36 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 72.82


2025/06/18 00:06:36 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 6 / 23 - Minibatch ==



Average Metric: 24.66 / 35 (70.4%): 100%|██████████| 35/35 [00:00<00:00, 3213.04it/s]

2025/06/18 00:06:36 INFO dspy.evaluate.evaluate: Average Metric: 24.655745573870572 / 35 (70.4%)
2025/06/18 00:06:36 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 70.44 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0'].
2025/06/18 00:06:36 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.21, 67.15, 68.12, 74.87, 70.44]
2025/06/18 00:06:36 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [72.82]
2025/06/18 00:06:36 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 72.82


2025/06/18 00:06:36 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 7 / 23 - Full Evaluation =====
2025/06/18 00:06:36 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 80.21) from minibatch trials...



Average Metric: 106.54 / 152 (70.1%): 100%|██████████| 152/152 [02:33<00:00,  1.01s/it]

2025/06/18 00:09:10 INFO dspy.evaluate.evaluate: Average Metric: 106.5367012959372 / 152 (70.1%)
2025/06/18 00:09:10 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [72.82, 70.09]
2025/06/18 00:09:10 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 72.82
2025/06/18 00:09:10 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/06/18 00:09:10 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 8 / 23 - Minibatch ==



Average Metric: 29.32 / 35 (83.8%): 100%|██████████| 35/35 [00:46<00:00,  1.34s/it]

2025/06/18 00:09:57 INFO dspy.evaluate.evaluate: Average Metric: 29.321685993306176 / 35 (83.8%)
2025/06/18 00:09:57 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 83.78 on minibatch of size 35 with parameters ['Predictor 0: Instruction 4'].
2025/06/18 00:09:57 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.21, 67.15, 68.12, 74.87, 70.44, 83.78]
2025/06/18 00:09:57 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [72.82, 70.09]
2025/06/18 00:09:57 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 72.82


2025/06/18 00:09:57 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 9 / 23 - Minibatch ==



Average Metric: 23.28 / 35 (66.5%): 100%|██████████| 35/35 [00:00<00:00, 391.43it/s]

2025/06/18 00:09:57 INFO dspy.evaluate.evaluate: Average Metric: 23.277235680230234 / 35 (66.5%)





2025/06/18 00:09:58 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 66.51 on minibatch of size 35 with parameters ['Predictor 0: Instruction 11'].
2025/06/18 00:09:58 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.21, 67.15, 68.12, 74.87, 70.44, 83.78, 66.51]
2025/06/18 00:09:58 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [72.82, 70.09]
2025/06/18 00:09:58 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 72.82


2025/06/18 00:09:58 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 10 / 23 - Minibatch ==


Average Metric: 24.75 / 35 (70.7%): 100%|██████████| 35/35 [00:52<00:00,  1.49s/it]

2025/06/18 00:10:50 INFO dspy.evaluate.evaluate: Average Metric: 24.750790627706863 / 35 (70.7%)
2025/06/18 00:10:50 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 70.72 on minibatch of size 35 with parameters ['Predictor 0: Instruction 10'].
2025/06/18 00:10:50 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.21, 67.15, 68.12, 74.87, 70.44, 83.78, 66.51, 70.72]
2025/06/18 00:10:50 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [72.82, 70.09]
2025/06/18 00:10:50 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 72.82


2025/06/18 00:10:50 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 11 / 23 - Minibatch ==



Average Metric: 28.25 / 35 (80.7%): 100%|██████████| 35/35 [00:39<00:00,  1.12s/it]

2025/06/18 00:11:29 INFO dspy.evaluate.evaluate: Average Metric: 28.25054545235466 / 35 (80.7%)
2025/06/18 00:11:29 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 80.72 on minibatch of size 35 with parameters ['Predictor 0: Instruction 4'].
2025/06/18 00:11:29 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.21, 67.15, 68.12, 74.87, 70.44, 83.78, 66.51, 70.72, 80.72]
2025/06/18 00:11:29 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [72.82, 70.09]
2025/06/18 00:11:29 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 72.82


2025/06/18 00:11:29 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 12 / 23 - Minibatch ==



Average Metric: 26.02 / 35 (74.4%): 100%|██████████| 35/35 [00:29<00:00,  1.17it/s]

2025/06/18 00:11:59 INFO dspy.evaluate.evaluate: Average Metric: 26.02465457078133 / 35 (74.4%)
2025/06/18 00:11:59 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 74.36 on minibatch of size 35 with parameters ['Predictor 0: Instruction 4'].
2025/06/18 00:11:59 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.21, 67.15, 68.12, 74.87, 70.44, 83.78, 66.51, 70.72, 80.72, 74.36]
2025/06/18 00:11:59 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [72.82, 70.09]
2025/06/18 00:11:59 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 72.82


2025/06/18 00:11:59 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 13 / 23 - Full Evaluation =====
2025/06/18 00:11:59 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 79.62) from minibatch trials...



Average Metric: 113.69 / 152 (74.8%): 100%|██████████| 152/152 [01:30<00:00,  1.68it/s]

2025/06/18 00:13:29 INFO dspy.evaluate.evaluate: Average Metric: 113.68850344851178 / 152 (74.8%)
2025/06/18 00:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: [92mNew best full eval score![0m Score: 74.8
2025/06/18 00:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [72.82, 70.09, 74.8]
2025/06/18 00:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 74.8
2025/06/18 00:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/06/18 00:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 14 / 23 - Minibatch ==



Average Metric: 26.97 / 35 (77.1%): 100%|██████████| 35/35 [00:00<00:00, 467.36it/s]

2025/06/18 00:13:30 INFO dspy.evaluate.evaluate: Average Metric: 26.969622044622046 / 35 (77.1%)





2025/06/18 00:13:30 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 77.06 on minibatch of size 35 with parameters ['Predictor 0: Instruction 4'].
2025/06/18 00:13:30 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.21, 67.15, 68.12, 74.87, 70.44, 83.78, 66.51, 70.72, 80.72, 74.36, 77.06]
2025/06/18 00:13:30 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [72.82, 70.09, 74.8]
2025/06/18 00:13:30 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 74.8


2025/06/18 00:13:30 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 15 / 23 - Minibatch ==


Average Metric: 28.40 / 35 (81.1%): 100%|██████████| 35/35 [00:49<00:00,  1.40s/it]

2025/06/18 00:14:19 INFO dspy.evaluate.evaluate: Average Metric: 28.396194991774717 / 35 (81.1%)
2025/06/18 00:14:19 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 81.13 on minibatch of size 35 with parameters ['Predictor 0: Instruction 5'].
2025/06/18 00:14:19 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.21, 67.15, 68.12, 74.87, 70.44, 83.78, 66.51, 70.72, 80.72, 74.36, 77.06, 81.13]
2025/06/18 00:14:19 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [72.82, 70.09, 74.8]
2025/06/18 00:14:19 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 74.8


2025/06/18 00:14:19 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 16 / 23 - Minibatch ==



Average Metric: 25.80 / 35 (73.7%): 100%|██████████| 35/35 [00:41<00:00,  1.18s/it]

2025/06/18 00:15:00 INFO dspy.evaluate.evaluate: Average Metric: 25.802522796981037 / 35 (73.7%)
2025/06/18 00:15:00 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 73.72 on minibatch of size 35 with parameters ['Predictor 0: Instruction 5'].
2025/06/18 00:15:00 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.21, 67.15, 68.12, 74.87, 70.44, 83.78, 66.51, 70.72, 80.72, 74.36, 77.06, 81.13, 73.72]
2025/06/18 00:15:00 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [72.82, 70.09, 74.8]
2025/06/18 00:15:00 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 74.8


2025/06/18 00:15:00 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 17 / 23 - Minibatch ==



Average Metric: 27.88 / 35 (79.7%): 100%|██████████| 35/35 [00:27<00:00,  1.25it/s]

2025/06/18 00:15:28 INFO dspy.evaluate.evaluate: Average Metric: 27.882114965452356 / 35 (79.7%)
2025/06/18 00:15:28 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 79.66 on minibatch of size 35 with parameters ['Predictor 0: Instruction 5'].
2025/06/18 00:15:28 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.21, 67.15, 68.12, 74.87, 70.44, 83.78, 66.51, 70.72, 80.72, 74.36, 77.06, 81.13, 73.72, 79.66]
2025/06/18 00:15:28 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [72.82, 70.09, 74.8]
2025/06/18 00:15:28 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 74.8


2025/06/18 00:15:28 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 18 / 23 - Minibatch ==



Average Metric: 29.27 / 35 (83.6%): 100%|██████████| 35/35 [00:47<00:00,  1.36s/it]

2025/06/18 00:16:16 INFO dspy.evaluate.evaluate: Average Metric: 29.269273182957395 / 35 (83.6%)
2025/06/18 00:16:16 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 83.63 on minibatch of size 35 with parameters ['Predictor 0: Instruction 9'].
2025/06/18 00:16:16 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.21, 67.15, 68.12, 74.87, 70.44, 83.78, 66.51, 70.72, 80.72, 74.36, 77.06, 81.13, 73.72, 79.66, 83.63]
2025/06/18 00:16:16 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [72.82, 70.09, 74.8]
2025/06/18 00:16:16 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 74.8


2025/06/18 00:16:16 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 19 / 23 - Full Evaluation =====
2025/06/18 00:16:16 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 83.63) from minibatch trials...



Average Metric: 109.45 / 152 (72.0%): 100%|██████████| 152/152 [02:27<00:00,  1.03it/s]

2025/06/18 00:18:44 INFO dspy.evaluate.evaluate: Average Metric: 109.44568629796203 / 152 (72.0%)
2025/06/18 00:18:44 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [72.82, 70.09, 74.8, 72.0]
2025/06/18 00:18:44 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 74.8
2025/06/18 00:18:44 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/06/18 00:18:44 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 20 / 23 - Minibatch ==



Average Metric: 23.66 / 35 (67.6%): 100%|██████████| 35/35 [00:00<00:00, 244.25it/s]

2025/06/18 00:18:44 INFO dspy.evaluate.evaluate: Average Metric: 23.664657908125456 / 35 (67.6%)
2025/06/18 00:18:44 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 67.61 on minibatch of size 35 with parameters ['Predictor 0: Instruction 9'].
2025/06/18 00:18:44 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.21, 67.15, 68.12, 74.87, 70.44, 83.78, 66.51, 70.72, 80.72, 74.36, 77.06, 81.13, 73.72, 79.66, 83.63, 67.61]
2025/06/18 00:18:44 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [72.82, 70.09, 74.8, 72.0]
2025/06/18 00:18:44 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 74.8


2025/06/18 00:18:44 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 21 / 23 - Minibatch ==



Average Metric: 27.72 / 35 (79.2%): 100%|██████████| 35/35 [00:47<00:00,  1.34s/it]

2025/06/18 00:19:31 INFO dspy.evaluate.evaluate: Average Metric: 27.716162678438437 / 35 (79.2%)
2025/06/18 00:19:31 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 79.19 on minibatch of size 35 with parameters ['Predictor 0: Instruction 3'].
2025/06/18 00:19:31 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.21, 67.15, 68.12, 74.87, 70.44, 83.78, 66.51, 70.72, 80.72, 74.36, 77.06, 81.13, 73.72, 79.66, 83.63, 67.61, 79.19]
2025/06/18 00:19:31 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [72.82, 70.09, 74.8, 72.0]
2025/06/18 00:19:31 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 74.8


2025/06/18 00:19:31 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 22 / 23 - Minibatch ==



Average Metric: 27.37 / 35 (78.2%): 100%|██████████| 35/35 [00:51<00:00,  1.48s/it]

2025/06/18 00:20:23 INFO dspy.evaluate.evaluate: Average Metric: 27.370709498834497 / 35 (78.2%)
2025/06/18 00:20:23 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 78.2 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1'].
2025/06/18 00:20:23 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.21, 67.15, 68.12, 74.87, 70.44, 83.78, 66.51, 70.72, 80.72, 74.36, 77.06, 81.13, 73.72, 79.66, 83.63, 67.61, 79.19, 78.2]
2025/06/18 00:20:23 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [72.82, 70.09, 74.8, 72.0]
2025/06/18 00:20:23 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 74.8


2025/06/18 00:20:23 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 23 / 23 - Full Evaluation =====
2025/06/18 00:20:23 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 79.19) from minibatch trials...



Average Metric: 111.40 / 152 (73.3%): 100%|██████████| 152/152 [02:35<00:00,  1.02s/it]

2025/06/18 00:22:58 INFO dspy.evaluate.evaluate: Average Metric: 111.40263895090989 / 152 (73.3%)
2025/06/18 00:22:58 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [72.82, 70.09, 74.8, 72.0, 73.29]
2025/06/18 00:22:58 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 74.8
2025/06/18 00:22:58 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/06/18 00:22:58 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 74.8!



Optimization complete! Saved to faq_optimized_prompt.json


## Compare

In [402]:
def export_evaluation_to_excel(evaluation_result, filepath="evaluation_results.xlsx", model_name="Baseline"):
    """
    Export DSPy evaluation results to an Excel file with detailed formatting.
    
    Args:
        evaluation_result: The result from dspy.Evaluate
        filepath: Path to save the Excel file
        model_name: Name of the model being evaluated (for labeling in the Excel)
    
    Returns:
        str: Path to the saved Excel file
    """
    import pandas as pd
    import numpy as np
    from datetime import datetime
    
    # Extract essential components from the evaluation result
    if isinstance(evaluation_result, tuple) and len(evaluation_result) >= 2:
        # DSPy Evaluate returns tuple (avg_score, result_object)
        avg_score = evaluation_result[0]
        result_object = evaluation_result[1]
        
        if hasattr(result_object, 'examples') and hasattr(result_object, 'outputs') and hasattr(result_object, 'scores'):
            examples = result_object.examples
            outputs = result_object.outputs
            scores = result_object.scores
        elif isinstance(result_object, list):
            # Handle case where result_object is a list of dictionaries 
            examples = [item['example'] for item in result_object if 'example' in item]
            outputs = [item['prediction'] for item in result_object if 'prediction' in item]
            scores = [item['score'] for item in result_object if 'score' in item]
            if not examples or not outputs or not scores:
                raise ValueError("Could not extract examples, outputs, or scores from result list")
        else:
            raise ValueError(f"Unexpected result object format: {type(result_object)}")
    elif hasattr(evaluation_result, 'examples') and hasattr(evaluation_result, 'outputs') and hasattr(evaluation_result, 'scores'):
        # Direct object with attributes (less common with recent DSPy)
        examples = evaluation_result.examples
        outputs = evaluation_result.outputs
        scores = evaluation_result.scores
        avg_score = evaluation_result.score if hasattr(evaluation_result, 'score') else sum(scores) / len(scores)
    else:
        raise ValueError(f"Unexpected evaluation_result format: {type(evaluation_result)}")
    
    # Create a DataFrame to store the detailed results
    results = []
    
    for i, (example, pred, score) in enumerate(zip(examples, outputs, scores)):
        try:
            # Safe extraction of fields with fallbacks
            question = str(example.question) if hasattr(example, 'question') else ""
            context = str(example.context_extracted) if hasattr(example, 'context_extracted') else ""
            chat_history = str(example.chat_history_extracted) if hasattr(example, 'chat_history_extracted') else ""
            ground_truth = str(example.bank_response) if hasattr(example, 'bank_response') else ""
            prediction = str(pred.bank_response) if hasattr(pred, 'bank_response') else ""
            
            result = {
                "Example_ID": i + 1,
                "Question": question[:500] + "..." if len(question) > 500 else question,  # Truncate long text
                "Context": context[:250] + "..." if len(context) > 250 else context,
                "Chat_History": chat_history[:250] + "..." if len(chat_history) > 250 else chat_history, 
                "Ground_Truth": ground_truth[:500] + "..." if len(ground_truth) > 500 else ground_truth,
                "Prediction": prediction[:500] + "..." if len(prediction) > 500 else prediction,
                "Score": score,
                "Model": model_name
            }
            results.append(result)
        except Exception as e:
            print(f"Error processing example {i}: {e}")
    
    # Create DataFrame
    df = pd.DataFrame(results)
    
    # Calculate statistics from scores
    scores_series = pd.Series(scores)
    
    # Add summary statistics
    summary = pd.DataFrame({
        "Metric": ["Model", "Average Score", "Median Score", "Perfect Scores (1.0)", 
                  "High Scores (≥0.8)", "Medium Scores (0.5-0.79)", 
                  "Low Scores (<0.5)", "Zero Scores (0.0)", "Timestamp"],
        "Value": [
            model_name,
            f"{avg_score:.4f}" if not scores_series.empty else "N/A",
            f"{scores_series.median():.4f}" if not scores_series.empty else "N/A",
            f"{sum(scores_series == 1.0)} ({sum(scores_series == 1.0)/len(scores_series)*100:.1f}%)" if not scores_series.empty else "N/A",
            f"{sum(scores_series >= 0.8)} ({sum(scores_series >= 0.8)/len(scores_series)*100:.1f}%)" if not scores_series.empty else "N/A",
            f"{sum((scores_series >= 0.5) & (scores_series < 0.8))} ({sum((scores_series >= 0.5) & (scores_series < 0.8))/len(scores_series)*100:.1f}%)" if not scores_series.empty else "N/A",
            f"{sum(scores_series < 0.5)} ({sum(scores_series < 0.5)/len(scores_series)*100:.1f}%)" if not scores_series.empty else "N/A",
            f"{sum(scores_series == 0.0)} ({sum(scores_series == 0.0)/len(scores_series)*100:.1f}%)" if not scores_series.empty else "N/A",
            datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        ]
    })
    
    # Write to Excel with multiple sheets
    with pd.ExcelWriter(filepath, engine='xlsxwriter') as writer:
        df.to_excel(writer, sheet_name='Detailed Results', index=False)
        summary.to_excel(writer, sheet_name='Summary', index=False)
        
        # Add formatting to the Excel file
        workbook = writer.book
        
        # Format for the detailed results sheet
        worksheet = writer.sheets['Detailed Results']
        header_format = workbook.add_format({'bold': True, 'bg_color': '#D9E1F2', 'border': 1})
        
        # Format the header row
        for col_num, value in enumerate(df.columns.values):
            worksheet.write(0, col_num, value, header_format)
            
        # Format for the summary sheet
        worksheet = writer.sheets['Summary']
        worksheet.set_column('A:A', 25)
        worksheet.set_column('B:B', 40)
    
    print(f"Evaluation results for {model_name} exported to {filepath}")
    return filepath

In [403]:
def compare_models_and_export(baseline_result, optimized_result, filepath="model_comparison.xlsx"):
    """
    Compare baseline and optimized models, and export a comparison to Excel
    
    Args:
        baseline_result: Evaluation result for the baseline model
        optimized_result: Evaluation result for the optimized model
        filepath: Path to save the comparison Excel file
    
    Returns:
        str: Path to the saved Excel file
    """
    import pandas as pd
    from datetime import datetime
    
    # Export individual results first
    baseline_path = export_evaluation_to_excel(baseline_result, "baseline_evaluation.xlsx", "Baseline")
    optimized_path = export_evaluation_to_excel(optimized_result, "optimized_evaluation.xlsx", "Optimized")
    
    # Extract components from the baseline result
    if isinstance(baseline_result, tuple) and len(baseline_result) >= 2:
        baseline_avg = baseline_result[0]
        baseline_result_obj = baseline_result[1]
        
        if hasattr(baseline_result_obj, 'examples') and hasattr(baseline_result_obj, 'outputs') and hasattr(baseline_result_obj, 'scores'):
            baseline_examples = baseline_result_obj.examples
            baseline_outputs = baseline_result_obj.outputs
            baseline_scores = baseline_result_obj.scores
        elif isinstance(baseline_result_obj, list):
            baseline_examples = [item['example'] for item in baseline_result_obj if 'example' in item]
            baseline_outputs = [item['prediction'] for item in baseline_result_obj if 'prediction' in item]
            baseline_scores = [item['score'] for item in baseline_result_obj if 'score' in item]
            if not baseline_examples or not baseline_outputs or not baseline_scores:
                raise ValueError("Could not extract examples, outputs, or scores from baseline result")
        else:
            raise ValueError(f"Unexpected baseline result format: {type(baseline_result_obj)}")
    elif hasattr(baseline_result, 'examples') and hasattr(baseline_result, 'outputs') and hasattr(baseline_result, 'scores'):
        baseline_examples = baseline_result.examples
        baseline_outputs = baseline_result.outputs
        baseline_scores = baseline_result.scores
        baseline_avg = baseline_result.score if hasattr(baseline_result, 'score') else sum(baseline_scores) / len(baseline_scores)
    else:
        raise ValueError(f"Unexpected baseline result format: {type(baseline_result)}")
    
    # Extract components from the optimized result
    if isinstance(optimized_result, tuple) and len(optimized_result) >= 2:
        optimized_avg = optimized_result[0]
        optimized_result_obj = optimized_result[1]
        
        if hasattr(optimized_result_obj, 'examples') and hasattr(optimized_result_obj, 'outputs') and hasattr(optimized_result_obj, 'scores'):
            optimized_examples = optimized_result_obj.examples
            optimized_outputs = optimized_result_obj.outputs
            optimized_scores = optimized_result_obj.scores
        elif isinstance(optimized_result_obj, list):
            optimized_examples = [item['example'] for item in optimized_result_obj if 'example' in item]
            optimized_outputs = [item['prediction'] for item in optimized_result_obj if 'prediction' in item]
            optimized_scores = [item['score'] for item in optimized_result_obj if 'score' in item]
            if not optimized_examples or not optimized_outputs or not optimized_scores:
                raise ValueError("Could not extract examples, outputs, or scores from optimized result")
        else:
            raise ValueError(f"Unexpected optimized result format: {type(optimized_result_obj)}")
    elif hasattr(optimized_result, 'examples') and hasattr(optimized_result, 'outputs') and hasattr(optimized_result, 'scores'):
        optimized_examples = optimized_result.examples
        optimized_outputs = optimized_result.outputs
        optimized_scores = optimized_result.scores
        optimized_avg = optimized_result.score if hasattr(optimized_result, 'score') else sum(optimized_scores) / len(optimized_scores)
    else:
        raise ValueError(f"Unexpected optimized result format: {type(optimized_result)}")
    
    # Create comparison DataFrame
    comparison_data = []
    
    # Ensure both results have the same number of examples
    if len(baseline_examples) != len(optimized_examples):
        print(f"Warning: Different number of examples in results - baseline: {len(baseline_examples)}, optimized: {len(optimized_examples)}")
        min_examples = min(len(baseline_examples), len(optimized_examples))
        baseline_examples = baseline_examples[:min_examples]
        baseline_scores = baseline_scores[:min_examples]
        baseline_outputs = baseline_outputs[:min_examples]
        optimized_examples = optimized_examples[:min_examples]
        optimized_scores = optimized_scores[:min_examples]
        optimized_outputs = optimized_outputs[:min_examples]
    
    for i, (b_ex, o_ex, b_score, o_score, b_out, o_out) in enumerate(zip(
        baseline_examples, optimized_examples, baseline_scores, optimized_scores, 
        baseline_outputs, optimized_outputs)):
        
        try:
            comparison_data.append({
                "Example_ID": i + 1,
                "Question": b_ex.question[:300] if hasattr(b_ex, 'question') else "",
                "Baseline_Score": b_score,
                "Optimized_Score": o_score,
                "Difference": o_score - b_score,
                "Improved": "Yes" if o_score > b_score else ("No" if o_score < b_score else "Same"),
                "Baseline_Response": b_out.bank_response[:300] + "..." if hasattr(b_out, 'bank_response') and len(b_out.bank_response) > 300 else (b_out.bank_response if hasattr(b_out, 'bank_response') else ""),
                "Optimized_Response": o_out.bank_response[:300] + "..." if hasattr(o_out, 'bank_response') and len(o_out.bank_response) > 300 else (o_out.bank_response if hasattr(o_out, 'bank_response') else ""),
            })
        except Exception as e:
            print(f"Error processing comparison for example {i}: {e}")
    
    comparison_df = pd.DataFrame(comparison_data)
    
    # Convert scores to pandas Series for easier calculations
    baseline_scores_series = pd.Series(baseline_scores)
    optimized_scores_series = pd.Series(optimized_scores)
    
    # Create summary comparison
    summary_comparison = pd.DataFrame({
        "Metric": [
            "Average Score", 
            "Examples Improved", 
            "Examples Worsened", 
            "Examples Unchanged",
            "Perfect Scores (Baseline)",
            "Perfect Scores (Optimized)",
            "High Scores ≥0.8 (Baseline)",
            "High Scores ≥0.8 (Optimized)",
            "Comparison Date"
        ],
        "Value": [
            f"Baseline: {baseline_avg:.4f}, Optimized: {optimized_avg:.4f}, Difference: {optimized_avg - baseline_avg:.4f} ({(optimized_avg - baseline_avg) / baseline_avg * 100:.2f}%)",
            f"{sum(comparison_df['Improved'] == 'Yes')} / {len(comparison_df)} ({sum(comparison_df['Improved'] == 'Yes') / len(comparison_df) * 100:.1f}%)",
            f"{sum(comparison_df['Improved'] == 'No')} / {len(comparison_df)} ({sum(comparison_df['Improved'] == 'No') / len(comparison_df) * 100:.1f}%)",
            f"{sum(comparison_df['Improved'] == 'Same')} / {len(comparison_df)} ({sum(comparison_df['Improved'] == 'Same') / len(comparison_df) * 100:.1f}%)",
            f"{sum(baseline_scores_series == 1.0)} / {len(baseline_scores)} ({sum(baseline_scores_series == 1.0) / len(baseline_scores) * 100:.1f}%)",
            f"{sum(optimized_scores_series == 1.0)} / {len(optimized_scores)} ({sum(optimized_scores_series == 1.0) / len(optimized_scores) * 100:.1f}%)",
            f"{sum(baseline_scores_series >= 0.8)} / {len(baseline_scores)} ({sum(baseline_scores_series >= 0.8) / len(baseline_scores) * 100:.1f}%)",
            f"{sum(optimized_scores_series >= 0.8)} / {len(optimized_scores)} ({sum(optimized_scores_series >= 0.8) / len(optimized_scores) * 100:.1f}%)",
            datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        ]
    })
    
    # Write to Excel with multiple sheets
    with pd.ExcelWriter(filepath, engine='xlsxwriter') as writer:
        summary_comparison.to_excel(writer, sheet_name='Summary Comparison', index=False)
        comparison_df.to_excel(writer, sheet_name='Detailed Comparison', index=False)
        
        # Add formatting
        workbook = writer.book
        
        # Format for the summary sheet
        worksheet = writer.sheets['Summary Comparison']
        header_format = workbook.add_format({'bold': True, 'bg_color': '#D9E1F2', 'border': 1})
        
        # Format header row
        for col_num, value in enumerate(summary_comparison.columns.values):
            worksheet.write(0, col_num, value, header_format)
        
        # Adjust column widths
        worksheet.set_column('A:A', 25)
        worksheet.set_column('B:B', 70)
        
        # Format for the detailed comparison sheet
        worksheet = writer.sheets['Detailed Comparison']
        header_format = workbook.add_format({'bold': True, 'bg_color': '#D9E1F2', 'border': 1})
        improved_format = workbook.add_format({'bg_color': '#C6EFCE'})  # Light green
        worsened_format = workbook.add_format({'bg_color': '#FFC7CE'})  # Light red
        
        # Format header row
        for col_num, value in enumerate(comparison_df.columns.values):
            worksheet.write(0, col_num, value, header_format)
        
        # Format improved/worsened cells
        for row_num, improved in enumerate(comparison_df['Improved'], start=1):
            if improved == 'Yes':
                worksheet.write(row_num, 5, 'Yes', improved_format)
            elif improved == 'No':
                worksheet.write(row_num, 5, 'No', worsened_format)
        
        # Adjust column widths
        worksheet.set_column('A:A', 10)  # Example ID
        worksheet.set_column('B:B', 40)  # Question
        worksheet.set_column('C:C', 15)  # Baseline Score
        worksheet.set_column('D:D', 15)  # Optimized Score
        worksheet.set_column('E:E', 15)  # Difference
        worksheet.set_column('F:F', 15)  # Improved
        worksheet.set_column('G:G', 50)  # Baseline Response
        worksheet.set_column('H:H', 50)  # Optimized Response
    
    print(f"Model comparison exported to {filepath}")
    return filepath

In [407]:
def diagnose_evaluation_result(result, label="Evaluation Result"):
    """Analyze an evaluation result to understand its structure"""
    print(f"\n=== {label} Diagnostics ===")
    print(f"Type: {type(result)}")
    
    if isinstance(result, tuple):
        print(f"Tuple length: {len(result)}")
        print(f"First element (score): {result[0]}")
        print(f"Second element type: {type(result[1])}")
        if isinstance(result[1], list):
            print(f"List length: {len(result[1])}")
            if len(result[1]) > 0:
                print(f"First item in list type: {type(result[1][0])}")
                print(f"First item in list keys: {list(result[1][0].keys()) if isinstance(result[1][0], dict) else 'Not a dict'}")
        elif hasattr(result[1], '__dict__'):
            print(f"Attributes: {list(result[1].__dict__.keys())}")
    elif isinstance(result, list):
        print(f"List length: {len(result)}")
        if len(result) > 0:
            print(f"First item in list type: {type(result[0])}")
            if hasattr(result[0], '__dict__'):
                print(f"First item attributes: {list(result[0].__dict__.keys())}")
            elif isinstance(result[0], dict):
                print(f"First item keys: {list(result[0].keys())}")
    else:
        print(f"Attributes: {dir(result)}")

def compare_evaluations():
    """Compare the baseline and optimized models with robust error handling"""
    try:
        # First, evaluate the baseline model if not already done
        if 'baseline_score' not in globals() or 'evaluation_result' not in globals():
            print("Evaluating baseline model first...")
            base_module = dspy.ChainOfThought(FAQProgSig)
            evaluator = Evaluate(
                devset=dataset.dev,
                metric=metric,
                display_progress=True,
                display_table=False,
                return_all_scores=True,
                return_outputs=True,
                num_threads=4
            )
            baseline_result = evaluator(base_module)
            print(f"Baseline evaluation completed")
        else:
            print("Using existing baseline evaluation")
            baseline_result = evaluation_result  # Use the existing evaluation result
            
        # Show diagnostics for baseline result
        diagnose_evaluation_result(baseline_result, "Baseline")
        
        # Then, evaluate the optimized model
        print("\nEvaluating optimized model...")
        evaluator = Evaluate(
            devset=dataset.dev,
            metric=metric,
            display_progress=True,
            display_table=False,
            return_all_scores=True,
            return_outputs=True,
            num_threads=4
        )
        optimized_result = evaluator(optimized_program)
        print("Optimized evaluation completed")
        
        # Show diagnostics for optimized result
        diagnose_evaluation_result(optimized_result, "Optimized")
        
        # Extract scores for simple comparison
        baseline_score = None
        optimized_score = None
        
        # Extract baseline score
        if isinstance(baseline_result, tuple) and len(baseline_result) >= 1:
            baseline_score = baseline_result[0]
        elif hasattr(baseline_result, 'score'):
            baseline_score = baseline_result.score
            
        # Extract optimized score
        if isinstance(optimized_result, tuple) and len(optimized_result) >= 1:
            optimized_score = optimized_result[0]
        elif hasattr(optimized_result, 'score'):
            optimized_score = optimized_result.score
        
        # Print simple performance comparison if scores were extracted
        if baseline_score is not None and optimized_score is not None:
            improvement = optimized_score - baseline_score
            improvement_percent = (improvement / baseline_score) * 100 if baseline_score > 0 else 0
            
            print(f"\n=== Performance Summary ===")
            print(f"- Baseline score: {baseline_score:.4f}")
            print(f"- Optimized score: {optimized_score:.4f}")
            print(f"- Absolute improvement: {improvement:.4f}")
            print(f"- Relative improvement: {improvement_percent:.2f}%")
            
            # Performance classification
            if improvement_percent >= 5:
                print("Result: Significant improvement ✅")
            elif improvement_percent >= 1:
                print("Result: Modest improvement ✓")
            elif improvement_percent > -1:
                print("Result: Negligible change ⟷")
            else:
                print("Result: Performance regression ⚠️")
        else:
            print("\nCouldn't extract scores for simple comparison")
        
        # Try to export the full comparison
        try:
            print("\nGenerating detailed Excel comparison...")
            filepath = compare_models_and_export(baseline_result, optimized_result)
            print(f"Detailed comparison exported to: {filepath}")
        except Exception as e:
            print(f"\nError during detailed Excel export: {str(e)}")
            print("Falling back to simple performance summary")
            
        return {
            "baseline_score": baseline_score,
            "optimized_score": optimized_score,
            "absolute_improvement": improvement if 'improvement' in locals() else None,
            "relative_improvement_percent": improvement_percent if 'improvement_percent' in locals() else None
        }
        
    except Exception as e:
        print(f"Error in compare_evaluations: {str(e)}")
        import traceback
        traceback.print_exc()
        return None

In [408]:
compare_evaluations()

Using existing baseline evaluation

=== Baseline Diagnostics ===
Type: <class 'tuple'>
Tuple length: 3
First element (score): 72.82
Second element type: <class 'list'>
List length: 152
First item in list type: <class 'tuple'>
First item in list keys: Not a dict

Evaluating optimized model...
Average Metric: 113.69 / 152 (74.8%): 100%|██████████| 152/152 [00:00<00:00, 1177.36it/s]

2025/06/18 00:49:08 INFO dspy.evaluate.evaluate: Average Metric: 113.68850344851178 / 152 (74.8%)



Optimized evaluation completed

=== Optimized Diagnostics ===
Type: <class 'tuple'>
Tuple length: 3
First element (score): 74.8
Second element type: <class 'list'>
List length: 152
First item in list type: <class 'tuple'>
First item in list keys: Not a dict

=== Performance Summary ===
- Baseline score: 72.8200
- Optimized score: 74.8000
- Absolute improvement: 1.9800
- Relative improvement: 2.72%
Result: Modest improvement ✓

Generating detailed Excel comparison...

Error during detailed Excel export: Could not extract examples, outputs, or scores from result list
Falling back to simple performance summary


{'baseline_score': 72.82,
 'optimized_score': 74.8,
 'absolute_improvement': 1.980000000000004,
 'relative_improvement_percent': 2.7190332326284046}

In [None]:
# prompt_of_field_to_optimize = "bank_response"  # Instead of "bank_response"
# semantic = True

# dataset = FAQDataset(
#     data_path="/Users/nami-macos/Documents/BIDV-1/Testset FAQ_multiturn_FAQ_Chain - FAQ Chain filtered.csv",
#     selected_output_field=prompt_of_field_to_optimize,
# )
# metric = FAQScore(
#     selected_output_field=prompt_of_field_to_optimize, semantic=semantic
# )

# # Set up the optimization program
# program = dspy.ChainOfThought(FAQProgSig)

# teleprompter = MIPROv2(
#     metric=metric, 
#     max_bootstrapped_demos=0, 
#     max_labeled_demos=0, 
#     auto='medium',
#     # num_threads=10
#     # num_candidates=5
#     )
# optimized_program = teleprompter.compile(
#     program,
#     trainset=dataset.train,
#     valset=dataset.dev,
#     requires_permission_to_run=False,
#     # num_trials=7       
# )

# # Save the optimized prompt to a file
# optimized_program.save("faq_optimized_prompt.json")