In [57]:
from PromptOps.std_templates import ShotTemplateFormatter
from PromptOps.icqa_templates import ICQATemplateFormatter
from PromptOps.cot_templates import COTTemplateFormatter
from PromptOps.test import PromptCompletion, Test
from PromptOps.test_suite import TestSuite
from PromptOps.perturb import Perturbation

## Define LLM

In [58]:
# Initialize the PromptCompletion model for Llama.
# `completion` is the model handle later passed to TestSuite.run_all() by
# process_test_robust() and process_test().
# NOTE(review): temperature=0.5 presumably makes responses vary between runs,
# so recorded scores may not reproduce exactly — confirm.
llama_url = "http://127.0.0.1:8000/v1/chat/completions"  # Replace with your Llama model URL
completion = PromptCompletion(
    model_provider="llama",
    model="llama-13b",
    system_content="You are an assistant that answer the question",
    temperature=0.5,
    top_p=0.9,
    max_tokens=150,
    llama_url=llama_url
)

In [59]:
# Module-level Perturbation instance; process_test_robust() calls its
# process_questions() method to build the perturbed question set.
perturbation = Perturbation()

## Function

In [60]:
import pandas as pd
# shot_type='zero', 'one', 'few'
def process_test_robust(file_path, shot_type, template):
    """
    Run robustness tests for every question in a CSV file and score each question.

    The input rows are expanded into perturbed variants through the module-level
    ``perturbation`` object, formatted into prompts with the formatter selected
    by ``template``, and executed one original question at a time against the
    module-level ``completion`` model.

    Side effects: writes ``merged_perturbation_results.csv`` and
    ``robust_new_formatted_one_shot.csv`` to the current working directory and
    prints progress for every question index.

    Args:
        file_path (str): Path to the input CSV file. It must contain the
            "Question" and "Expected_answer" columns.
        shot_type (str): Shot type forwarded to the formatter
            (e.g. 'zero', 'one', 'few').
        template (str): Prompt template family: 'std', 'icqa' or 'cot'.

    Returns:
        tuple: ``(index_scores, detailed_results)``. ``index_scores`` maps each
        Original_Question_Index to its pass percentage (0-100).
        ``detailed_results`` is a list with one dict per index holding the
        keys "Original_Question_Index", "tests" (always an empty list),
        "score", "summary" and "results".

    Raises:
        ValueError: If ``template`` is not one of the supported names.
    """
    # Validate before doing any I/O; previously an unknown template only failed
    # later with an UnboundLocalError, after the intermediate CSV was written.
    supported_templates = ('std', 'icqa', 'cot')
    if template not in supported_templates:
        raise ValueError(
            f"Unknown template {template!r}; expected one of {supported_templates}"
        )

    # Load the CSV file
    df = pd.read_csv(file_path)

    # Process questions with perturbations
    result_df = perturbation.process_questions(df, question_column="Question", expected_answer_column="Expected_answer")
    columns_to_merge = df.drop(columns=["Question", "Expected_answer"])

    # Re-attach the remaining input columns to every perturbed row, matching
    # Original_Question_Index against the input frame's row index.
    merged_result_df = result_df.merge(
        columns_to_merge,
        left_on="Original_Question_Index",
        right_index=True,
        how="left"
    )

    # Rearrange column order
    column_order = ["Original_Question_Index"] + \
                   list(columns_to_merge.columns) + \
                   ["Original_Question", "Perturbation", "Perturbed_Question", "Expected_Answer"]
    merged_result_df = merged_result_df[column_order]

    # Save intermediate results; the formatters below read this file from disk
    output_file_path = "merged_perturbation_results.csv"
    merged_result_df.to_csv(output_file_path, index=False)

    # Format data for testing
    if template == 'std':
        robust_formatter = ShotTemplateFormatter(output_file_path)
    elif template == 'icqa':
        robust_formatter = ICQATemplateFormatter(output_file_path)
    else:  # 'cot' (guaranteed by the validation above)
        robust_formatter = COTTemplateFormatter(output_file_path)

    robust_formatted_data = robust_formatter.format_all_rows(shot_type=shot_type, perturb_type='robust')
    robust_formatter.save_formatted_data_to_csv(robust_formatted_data, 'robust_new_formatted_one_shot.csv')
    robust_formatted_data = pd.DataFrame(robust_formatted_data)

    # Initialize TestSuite
    test_suite = TestSuite()
    index_scores = {}

    detailed_results = []  # Store detailed results for each test

    for original_index in robust_formatted_data['Original_Question_Index'].unique():
        print(f"Running [Original_Question_Index {original_index}] test...")

        subset = robust_formatted_data[robust_formatted_data['Original_Question_Index'] == original_index]
        index_result = {"Original_Question_Index": original_index, "tests": []}

        for _, row in subset.iterrows():
            test = Test(
                name=f"Test robust #{row.name + 1}",
                prompt=row["original_prompt"],
                expected_result=row["expected_result"],
                description="A test with robust perturbation",
                perturb_method=row["perturb_type"],
                test_type=row["perturb_type"],
                perturb_text=row["perturb_prompt"],
            )
            test_suite.add_test(test)

        test_suite.run_all(completion)
        results, summary = test_suite.summarize()
        total_tests = summary['total_tests']
        failures = summary['failures']
        # Pass percentage for this question; guard against an empty summary
        score = (total_tests - failures) / total_tests * 100 if total_tests else 0.0

        index_scores[original_index] = score
        index_result["score"] = score
        index_result["summary"] = summary
        index_result["results"] = results
        detailed_results.append(index_result)

        print(f"Summary: {summary}")
        print(f"Robust index {original_index}, score={score:.2f}%")
        # Reset the suite so the next question index is scored on its own tests
        test_suite.clear()

    return index_scores, detailed_results


In [61]:
def process_test(file_path, shot_type, template, perturbation_types):
    """
    Run the non-robust perturbation tests for a CSV file in a single test suite.

    For every perturbation type the input file is formatted with the formatter
    selected by ``template`` and saved as ``formatted_<type>_output.csv``. Each
    saved file is then read back, one Test is created per row, and all tests
    are run together against the module-level ``completion`` model.

    Side effects: writes one ``formatted_<type>_output.csv`` per perturbation
    type and exports the results to ``test_results.csv``, ``test_results.xlsx``
    and ``test_results.json`` (overwriting existing files).

    Args:
        file_path (str): Path to the input CSV file handed to the formatter.
        shot_type (str): Shot type forwarded to the formatter
            (e.g. 'zero', 'one', 'few').
        template (str): Prompt template family: 'std', 'icqa' or 'cot'.
        perturbation_types (iterable of str): Perturbation names to test,
            e.g. ['taxonomy', 'vocab'].

    Returns:
        tuple: ``(results, summary)`` exactly as returned by
        ``TestSuite.summarize()``.

    Raises:
        ValueError: If ``template`` is not one of the supported names.
    """
    # Fail fast with a clear message instead of an UnboundLocalError further down
    supported_templates = ('std', 'icqa', 'cot')
    if template not in supported_templates:
        raise ValueError(
            f"Unknown template {template!r}; expected one of {supported_templates}"
        )

    if template == 'std':
        formatter = ShotTemplateFormatter(file_path)
    elif template == 'icqa':
        formatter = ICQATemplateFormatter(file_path)
    else:  # 'cot' (guaranteed by the validation above)
        formatter = COTTemplateFormatter(file_path)

    # Format the input once per perturbation type and remember where it was saved
    csv_files = []
    for perturb_type in perturbation_types:
        formatted_data = formatter.format_all_rows(shot_type=shot_type, perturb_type=perturb_type)
        output_file = f"formatted_{perturb_type}_output.csv"

        formatter.save_formatted_data_to_csv(formatted_data, output_file)
        csv_files.append((perturb_type, output_file))

    test_suite = TestSuite()
    # Use a distinct name so the `file_path` argument is not shadowed
    for perturb_type, formatted_path in csv_files:
        # Load the formatted CSV into a DataFrame
        formatted_df = pd.read_csv(formatted_path)

        # Create one test per formatted row
        for index, row in formatted_df.iterrows():
            test = Test(
                name=f"Test {perturb_type} #{index + 1}",
                prompt=row["original_prompt"],  # Original prompt
                expected_result=row["expected_result"],  # Expected result
                description=f"A test with {perturb_type} perturbation",
                perturb_method=perturb_type,  # Perturbation type
                test_type=perturb_type,
                perturb_text=row["perturb_prompt"],  # Perturbed prompt
            )
            test_suite.add_test(test)

    test_suite.run_all(completion)
    results, summary = test_suite.summarize()

    test_suite.export_results("test_results.csv", file_format="csv", overwrite=True)
    test_suite.export_results("test_results.xlsx", file_format="xlsx", overwrite=True)
    test_suite.export_results("test_results.json", file_format="json", overwrite=True)

    test_suite.clear()
    return results, summary


In [62]:
def process_score(index_scores=None, summary=None, threshold=0.7):
    """
    Combine robust per-question scores and a test-suite summary into one report.

    Each entry of ``index_scores`` counts as a single test that fails when its
    percentage score is below ``threshold * 100``. The counts from ``summary``
    are then added on top.

    Args:
        index_scores (dict | None): Maps a question index to a percentage score
            (0-100). ``None`` is treated as an empty dict.
        summary (dict | None): Dict with optional 'total_tests' and 'failures'
            counts (missing keys count as 0). ``None`` is treated as zero
            tests and zero failures.
        threshold (float): Pass threshold as a fraction (0-1); a score passes
            when it is at least ``threshold * 100``. Defaults to 0.7.

    Returns:
        str: JSON text (indent=4) with the keys "overall_total_tests",
        "overall_failures", "overall_failure_rate", "overall_pass" and
        "overall_pass_rate". Both rates are percentages and are 0 when there
        are no tests.
    """
    import json

    if summary is None:
        summary = {'total_tests': 0, 'failures': 0}
    if index_scores is None:
        index_scores = {}

    # Scores are percentages, so compare against the threshold on the same scale
    cutoff = threshold * 100
    robust_failures = sum(1 for score in index_scores.values() if score < cutoff)

    # Each robust index counts as one test, plus everything from the summary
    overall_total_tests = len(index_scores) + summary.get('total_tests', 0)
    overall_failures = robust_failures + summary.get('failures', 0)
    overall_pass = overall_total_tests - overall_failures

    # Avoid dividing by zero when nothing was tested
    if overall_total_tests > 0:
        overall_failure_rate = (overall_failures / overall_total_tests) * 100
        overall_pass_rate = (overall_pass / overall_total_tests) * 100
    else:
        overall_failure_rate = 0
        overall_pass_rate = 0

    result = {
        "overall_total_tests": overall_total_tests,
        "overall_failures": overall_failures,
        "overall_failure_rate": overall_failure_rate,
        "overall_pass": overall_pass,
        "overall_pass_rate": overall_pass_rate
    }

    return json.dumps(result, indent=4)  # Convert dictionary to JSON format


In [63]:
def calculate_performance_score(detailed_scores=None, results=None):
    """
    Build the dashboard performance report as a JSON string.

    Args:
        detailed_scores (list[dict] | None): Per-question robust records; each
            may carry a 'score' value and a 'results' list whose items may
            carry 'score_original'. ``None`` is treated as an empty list.
        results (list[dict] | None): Result records that may carry
            'score_original', 'test_type' and 'name'. ``None`` is treated as
            an empty list.

    Returns:
        str: JSON text (indent=4) with "overall_performance_score" (mean of
        every 'score_original' found in both inputs, 0 if there are none)
        followed by one mean per perturbation type. The "robust" entry is
        always present and averages the 'score' values of ``detailed_scores``.
    """
    import json

    detailed_scores = [] if detailed_scores is None else detailed_scores
    results = [] if results is None else results

    # Pool score_original from the nested robust results, then from the flat results
    pooled = [
        entry.get('score_original', 0)
        for group in detailed_scores
        for entry in group.get('results', [])
    ]
    pooled.extend(entry.get('score_original', 0) for entry in results)

    overall = sum(pooled) / len(pooled) if pooled else 0

    # Bucket the flat results by test type, falling back to the text before
    # '#' in the test name when no test_type is recorded
    by_type = {}
    for entry in results:
        label = entry.get('test_type') or entry.get('name', '').split("#")[0].strip().lower()
        by_type.setdefault(label, []).append(entry.get('score_original', 0))

    # The robust bucket collects the per-question 'score' values
    by_type.setdefault('robust', []).extend(
        group.get('score', 0) for group in detailed_scores
    )

    payload = {"overall_performance_score": overall}
    for label, values in by_type.items():
        # An empty bucket averages to 0 instead of dividing by zero
        payload[label] = sum(values) / len(values) if values else 0

    return json.dumps(payload, indent=4)


## Run

### Robust
- `index_scores` (used for the summary score) -> calculate the failure rate / total tests
- `robust_results` (the `detailed_results` return value, used for the dashboard score) -> calculate the performance dashboard

In [64]:
# Run the robustness tests on the one-shot example CSV with the ICQA template.
# index_scores: pass percentage per Original_Question_Index.
# robust_results: detailed per-index records (score, summary, results).
index_scores, robust_results = process_test_robust('one_shot_example.csv', shot_type='one', template='icqa')


Running [Original_Question_Index 0] test...
Summary: {'total_tests': 5, 'failures': 0, 'fail_rate': 0.0}
Robust index 0, score=100.00%
Running [Original_Question_Index 1] test...
Summary: {'total_tests': 8, 'failures': 0, 'fail_rate': 0.0}
Robust index 1, score=100.00%
Running [Original_Question_Index 2] test...
Summary: {'total_tests': 5, 'failures': 0, 'fail_rate': 0.0}
Robust index 2, score=100.00%


In [65]:
# Result
print("Total test:", index_scores)
print("Detailed score:", robust_results)
#{0: 100.0, 1: 100.0, 2: 100.0}
# 0 = '1st Prompt'
# 1 = '2nd Prompt'
# 2 = '3rd Prompt'

Total test: {0: 100.0, 1: 100.0, 2: 100.0}
Detailed score: [{'Original_Question_Index': 0, 'tests': [], 'score': 100.0, 'summary': {'total_tests': 5, 'failures': 0, 'fail_rate': 0.0}, 'results': [{'name': 'Test robust #1', 'description': 'A test with robust perturbation', 'test_type': 'robust', 'prompt': 'Instruction: Answer the following questions based on the context and provide answer in boolean (Yes or No)\nContext: Cucumber plants need insects to pollinate them. Seedless cucumber does not require pollination.\nQ: Is growing seedless cucumber good for a gardener with entomophobia?\nA: Yes\n\nQ: Can cucumbers grow without insects?\nA:', 'expected_result': 'Yes', 'perturb_text': 'Instruction: Answer the following questions based on the context and provide answer in boolean (Yes or No)\nContext: Cucumber plants need insects to pollinate them. Seedless cucumber does not require pollination.\nQ: Is growing seedless cucumber good for a gardener with entomophobia?\nA: Yes\n\nQ: aCn cucumb

### Other perturbation

In [66]:
# Run the 'taxonomy' and 'vocab' perturbation tests on the same CSV with the ICQA template.
# other_results / summary are the two values returned by TestSuite.summarize().
other_results, summary = process_test('one_shot_example.csv', shot_type='one', template='icqa', perturbation_types = ['taxonomy', 'vocab'])

- `summary` (used for the summary score) -> calculate the failure rate / total tests
- `other_results` (used for the dashboard score) -> calculate the performance dashboard

In [67]:
# Show the raw result records and the suite summary for the other perturbations.
# NOTE(review): this prints the full result list, so the output is large.
print(other_results)
print("Summary:", summary)

[{'name': 'Test taxonomy #1', 'description': 'A test with taxonomy perturbation', 'test_type': 'taxonomy', 'prompt': 'Instruction: Answer the following questions based on the context and provide answer in boolean (Yes or No)\nContext: Cucumber plants need insects to pollinate them. Seedless cucumber does not require pollination.\nQ: Is growing seedless cucumber good for a gardener with entomophobia?\nA: Yes\n\nQ: Can cucumbers grow without insects?\nA:', 'expected_result': 'Yes', 'perturb_text': 'Instruction: Answer the following questions based on the context and provide answer in boolean (Yes or No)\nContext: Cucumber plants need insects to pollinate them. Seedless cucumber does not require pollination.\nQ: Is growing seedless cucumber good for a gardener with entomophobia?\nA: Yes\n\nQ: None\nA:', 'pass_condition': 'increase', 'capability': None, 'response_original': 'No', 'response_perturb': 'No', 'score_original': 0.7000372409820557, 'score_perturb': 0.7000372409820557, 'fail': Fa

## Summary score of Robustness and other perturbation

In [68]:
# Fold the robust per-index scores and the other-perturbation summary into one JSON report.
# NOTE(review): this rebinds `summary` from the dict returned by process_test() to a
# JSON string, so re-running this cell without first re-running the process_test()
# cell hands a str to process_score(), whose summary.get(...) calls then fail.
summary = process_score(index_scores, summary)

# # Display results
# print(f"Overall Total Tests: {overall_total_tests}")
# print(f"Overall Failures: {overall_failures}")
# print(f"Overall Failure Rate: {overall_failure_rate:.2f}%")

In [69]:
# Print the JSON report string produced by process_score().
print(summary)

{
    "overall_total_tests": 9,
    "overall_failures": 1,
    "overall_failure_rate": 11.11111111111111,
    "overall_pass": 8,
    "overall_pass_rate": 88.88888888888889
}


## Dashboard Score

Show the performance scores: the overall score and the average per perturbation type.


In [70]:
# Build the dashboard report (JSON string) from the robust and other-perturbation results.
performance = calculate_performance_score(robust_results, other_results)


In [71]:
# Inspect the raw robust records that were fed into calculate_performance_score().
print(robust_results)

[{'Original_Question_Index': 0, 'tests': [], 'score': 100.0, 'summary': {'total_tests': 5, 'failures': 0, 'fail_rate': 0.0}, 'results': [{'name': 'Test robust #1', 'description': 'A test with robust perturbation', 'test_type': 'robust', 'prompt': 'Instruction: Answer the following questions based on the context and provide answer in boolean (Yes or No)\nContext: Cucumber plants need insects to pollinate them. Seedless cucumber does not require pollination.\nQ: Is growing seedless cucumber good for a gardener with entomophobia?\nA: Yes\n\nQ: Can cucumbers grow without insects?\nA:', 'expected_result': 'Yes', 'perturb_text': 'Instruction: Answer the following questions based on the context and provide answer in boolean (Yes or No)\nContext: Cucumber plants need insects to pollinate them. Seedless cucumber does not require pollination.\nQ: Is growing seedless cucumber good for a gardener with entomophobia?\nA: Yes\n\nQ: aCn cucumbers grow without insects?\nA:', 'pass_condition': 'increase

In [72]:
# Inspect the raw other-perturbation records that were fed into calculate_performance_score().
print(other_results)

[{'name': 'Test taxonomy #1', 'description': 'A test with taxonomy perturbation', 'test_type': 'taxonomy', 'prompt': 'Instruction: Answer the following questions based on the context and provide answer in boolean (Yes or No)\nContext: Cucumber plants need insects to pollinate them. Seedless cucumber does not require pollination.\nQ: Is growing seedless cucumber good for a gardener with entomophobia?\nA: Yes\n\nQ: Can cucumbers grow without insects?\nA:', 'expected_result': 'Yes', 'perturb_text': 'Instruction: Answer the following questions based on the context and provide answer in boolean (Yes or No)\nContext: Cucumber plants need insects to pollinate them. Seedless cucumber does not require pollination.\nQ: Is growing seedless cucumber good for a gardener with entomophobia?\nA: Yes\n\nQ: None\nA:', 'pass_condition': 'increase', 'capability': None, 'response_original': 'No', 'response_perturb': 'No', 'score_original': 0.7000372409820557, 'score_perturb': 0.7000372409820557, 'fail': Fa

In [73]:
print(performance)
# overall_performance_score is the mean of score_original over every robust and
# other-perturbation result. Each perturbation entry is the mean score_original
# for that type, except "robust", which averages the per-index percentage
# scores and is therefore on a 0-100 scale.

{
    "overall_performance_score": 0.7875263790289561,
    "taxonomy": 0.8000248273213705,
    "vocab": 0.8000248273213705,
    "robust": 100.0
}
