In [None]:
import sys
import os
import json

# Get the absolute path of the parent directory
# sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

# import modules from PromptOps
from PromptOps.std_templates import ShotTemplateFormatter
from PromptOps.icqa_templates import ICQATemplateFormatter
from PromptOps.cot_templates import COTTemplateFormatter
from PromptOps.test import PromptCompletion, Test
from PromptOps.test_suite import TestSuite
from PromptOps.perturb import Perturbation
from PromptOps.cosine_score import cosine_score



In [None]:
# Example usage: score two single-word sentiment labels against each other.
text1, text2 = "Positive", "Negative"
similarity = cosine_score(text1, text2)
print(f"Cosine similarity score: {similarity}")

In [None]:
import csv
import os
from datetime import datetime

def calculate_response_similarities(results_data, csv_prefix):
    """
    Score how similar each perturbed response is to its original response.

    For every test in ``results_data`` the cosine similarity between
    ``response_original`` and ``response_perturb`` is computed with
    ``cosine_score``, printed, and written as one row of ``<csv_prefix>.csv``.

    Args:
        results_data (list): Test result dictionaries. Each one must provide the
            keys 'name', 'response_original', 'response_perturb' and
            'expected_result'.
        csv_prefix (str): Output path without extension; '.csv' is appended.
            An existing file with that name is overwritten.

    Returns:
        dict: Test name -> similarity score. If several tests share a name,
        the last one wins in the returned dict (every row is still written to
        the CSV).

    Raises:
        KeyError: If a result dictionary lacks one of the required keys.
    """
    similarity_scores = {}

    # The name is derived from the prefix only, so repeated calls with the same
    # prefix replace the previous file.
    csv_filename = f"{csv_prefix}.csv"
    fieldnames = ['test_name', 'response_original', 'response_perturb', 'expected_result', 'similarity']

    # Model responses may contain non-ASCII text; an explicit UTF-8 encoding keeps
    # the write from depending on the platform's default encoding.
    with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for test in results_data:
            name = test['name']
            response_original = test['response_original']
            response_perturb = test['response_perturb']
            expected_result = test['expected_result']

            similarity = cosine_score(response_original, response_perturb)

            # Echo each row so progress is visible while the cell runs.
            print(f"{name}: Original: '{response_original}', Perturbed: '{response_perturb}', Expected: '{expected_result}', Similarity: {similarity:.4f}")

            # The dict keeps the raw score; the CSV stores it rounded to 4 decimals.
            similarity_scores[name] = similarity
            writer.writerow({
                'test_name': name,
                'response_original': response_original,
                'response_perturb': response_perturb,
                'expected_result': expected_result,
                'similarity': f"{similarity:.4f}"
            })

    print(f"Results saved to {csv_filename}")
    return similarity_scores

# OpenAI

## Standard

### STD Zero Shot

In [None]:
# Completion settings for the OpenAI zero-shot run: the system prompt carries
# only the task description, with no worked examples.
completion = PromptCompletion(
    model_provider="openai",
    model="gpt-4o",
    system_content="You are an assistant that classifies the sentiment of the message into positive, negative, and neutral.",
    temperature=0,
    top_p=0,
    max_tokens=150,
    # Read the key from the environment so it is never saved in the notebook;
    # the placeholder is used only when OPENAI_API_KEY is not set.
    api_key=os.environ.get("OPENAI_API_KEY", "Your-API-KEY"),
)

In [None]:
# Source dataset for the prompt templates. This looks like a placeholder path —
# point it at the real sentiment CSV before running.
file_path = "/path/prompts/sentiment/filename.csv"
formatter = ShotTemplateFormatter(file_path)
# Format all rows with shot_type="zero", then save them to
# gpt_std_zero_shot.csv, the file the test-building cell reads back.
formatted_data = formatter.format_all_rows(shot_type="zero")
formatter.save_formatted_data_to_csv(formatted_data, output_filepath="gpt_std_zero_shot.csv")
# Echo the formatted rows for a quick visual check.
print(formatted_data)

In [None]:
# Formatted prompt file read by the test-building cell. NOTE: despite the plural
# name this is a single path string, not a list of paths.
csv_files = "gpt_std_zero_shot.csv"

In [None]:
import pandas as pd

# `csv_files` may hold one path (str) or several. Looping over a bare string
# walks it character by character, which re-reads the CSV and rebuilds the
# suite once per character, so normalise to a list of paths first.
csv_paths = [csv_files] if isinstance(csv_files, str) else list(csv_files)

# A single suite collects the tests from every file so that run_all() below
# executes all of them, not just those built from the last file.
test_suite = TestSuite()
for file_path in csv_paths:
    # Load the formatted prompts; the columns read below are
    # original_prompt, perturb_prompt and expected_result.
    df = pd.read_csv(file_path)

    # One Test per row, pairing the original prompt with its perturbed variant.
    # Names are numbered per file, so they repeat if several files are given.
    for index, row in df.iterrows():
        test = Test(
            name=f"Test fairness #{index + 1}",  # Replace with your perturbation type
            prompt=row["original_prompt"],  # Original prompt
            expected_result=row["expected_result"],  # Expected result
            description="A test with fairness perturbation",  # Replace with your perturbation type
            test_type="fairness",  # Replace with your perturbation type
            perturb_text=row["perturb_prompt"],  # Perturbed prompt
        )
        test_suite.add_test(test)

# Run every collected test against the configured model; `results` and
# `summary` are kept for the next cell.
test_suite.run_all(completion)
results, summary = test_suite.summarize()

test_suite.export_results("gpt_std_zero_test_results.json", file_format="json", overwrite=True)

# Empty the suite once its results are exported.
test_suite.clear()


In [None]:
# Show the two values returned by test_suite.summarize() in the previous cell.
print("Results:", results)
print("Summary:", summary)

### STD One Shot

In [None]:
# Completion settings for the OpenAI one-shot run: the system prompt carries
# the task description plus a single labelled example.
completion = PromptCompletion(
    model_provider="openai",
    model="gpt-4o",
    system_content="""
You are an assistant that classifies the sentiment of the message into positive, negative, and neutral. Given below is an example of the sentiment analysis task.

Sentence: I had a bad experience
Sentiment: Negative
""",
    temperature=0,
    top_p=0,
    max_tokens=150,
    # Read the key from the environment so it is never saved in the notebook;
    # the placeholder is used only when OPENAI_API_KEY is not set.
    api_key=os.environ.get("OPENAI_API_KEY", "Your-API-KEY"),
)

In [None]:
# Source dataset for the prompt templates. This looks like a placeholder path —
# point it at the real sentiment CSV before running.
file_path = "/path/prompts/sentiment/filename.csv"
formatter = ShotTemplateFormatter(file_path)
# Format all rows with shot_type="one", then save them to
# gpt_std_one_shot.csv, the file the test-building cell reads back.
formatted_data = formatter.format_all_rows(shot_type="one")
formatter.save_formatted_data_to_csv(formatted_data, output_filepath="gpt_std_one_shot.csv")
# Echo the formatted rows for a quick visual check.
print(formatted_data)

In [None]:
# Formatted prompt file read by the test-building cell. NOTE: despite the plural
# name this is a single path string, not a list of paths.
csv_files = "gpt_std_one_shot.csv"

In [None]:
import pandas as pd

# `csv_files` may hold one path (str) or several. Looping over a bare string
# walks it character by character, which re-reads the CSV and rebuilds the
# suite once per character, so normalise to a list of paths first.
csv_paths = [csv_files] if isinstance(csv_files, str) else list(csv_files)

# A single suite collects the tests from every file so that run_all() below
# executes all of them, not just those built from the last file.
test_suite = TestSuite()
for file_path in csv_paths:
    # Load the formatted prompts; the columns read below are
    # original_prompt, perturb_prompt and expected_result.
    df = pd.read_csv(file_path)

    # One Test per row, pairing the original prompt with its perturbed variant.
    # Names are numbered per file, so they repeat if several files are given.
    for index, row in df.iterrows():
        test = Test(
            name=f"Test fairness #{index + 1}",  # Replace with your perturbation type
            prompt=row["original_prompt"],  # Original prompt
            expected_result=row["expected_result"],  # Expected result
            description="A test with fairness perturbation",  # Replace with your perturbation type
            test_type="fairness",  # Replace with your perturbation type
            perturb_text=row["perturb_prompt"],  # Perturbed prompt
        )
        test_suite.add_test(test)

# Run every collected test against the configured model; `results` and
# `summary` are kept for the next cell.
test_suite.run_all(completion)
results, summary = test_suite.summarize()

test_suite.export_results("gpt_std_one_test_results.json", file_format="json", overwrite=True)

# Empty the suite once its results are exported.
test_suite.clear()


In [None]:
# Show the two values returned by test_suite.summarize() in the previous cell.
print("Results:", results)
print("Summary:", summary)

### STD Few Shot

In [None]:
# Completion settings for the OpenAI few-shot run: the system prompt carries
# the task description plus one labelled example per sentiment class.
completion = PromptCompletion(
    model_provider="openai",
    model="gpt-4o",
    system_content="""You are an assistant that classifies the sentiment of the message into positive, negative, and neutral. Given below are a few examples of the sentiment analysis task.

Sentence: I had a bad experience
Sentiment: Negative

Sentence: The food was not bad
Sentiment: Neutral

Sentence: The movie was impressive.
Sentiment: Positive
""",
    temperature=0,
    top_p=0,
    max_tokens=150,
    # Read the key from the environment so it is never saved in the notebook;
    # the placeholder is used only when OPENAI_API_KEY is not set.
    api_key=os.environ.get("OPENAI_API_KEY", "Your-API-KEY"),
)

In [None]:
# Source dataset for the prompt templates. This looks like a placeholder path —
# point it at the real sentiment CSV before running.
file_path = "/path/prompts/sentiment/filename.csv"
formatter = ShotTemplateFormatter(file_path)
# Format all rows with shot_type="few", then save them to
# gpt_std_few_shot.csv, the file the test-building cell reads back.
formatted_data = formatter.format_all_rows(shot_type="few")
formatter.save_formatted_data_to_csv(formatted_data, output_filepath="gpt_std_few_shot.csv")
# Echo the formatted rows for a quick visual check.
print(formatted_data)

In [None]:
# Formatted prompt file read by the test-building cell. NOTE: despite the plural
# name this is a single path string, not a list of paths.
csv_files = "gpt_std_few_shot.csv"

In [None]:
import pandas as pd

# `csv_files` may hold one path (str) or several. Looping over a bare string
# walks it character by character, which re-reads the CSV and rebuilds the
# suite once per character, so normalise to a list of paths first.
csv_paths = [csv_files] if isinstance(csv_files, str) else list(csv_files)

# A single suite collects the tests from every file so that run_all() below
# executes all of them, not just those built from the last file.
test_suite = TestSuite()
for file_path in csv_paths:
    # Load the formatted prompts; the columns read below are
    # original_prompt, perturb_prompt and expected_result.
    df = pd.read_csv(file_path)

    # One Test per row, pairing the original prompt with its perturbed variant.
    # Names are numbered per file, so they repeat if several files are given.
    for index, row in df.iterrows():
        test = Test(
            name=f"Test fairness #{index + 1}",  # Replace with your perturbation type
            prompt=row["original_prompt"],  # Original prompt
            expected_result=row["expected_result"],  # Expected result
            description="A test with fairness perturbation",  # Replace with your perturbation type
            test_type="fairness",  # Replace with your perturbation type
            perturb_text=row["perturb_prompt"],  # Perturbed prompt
        )
        test_suite.add_test(test)

# Run every collected test against the configured model; `results` and
# `summary` are kept for the next cell.
test_suite.run_all(completion)
results, summary = test_suite.summarize()

test_suite.export_results("gpt_std_few_test_results.json", file_format="json", overwrite=True)

# Empty the suite once its results are exported.
test_suite.clear()


In [None]:
# Show the two values returned by test_suite.summarize() in the previous cell.
print("Results:", results)
print("Summary:", summary)

# Gemini

## Standard

### STD Zero Shot

In [None]:
# Completion settings for the Gemini zero-shot run: the system prompt carries
# only the task description, with no worked examples.
completion = PromptCompletion(
    model_provider="gemini",
    model="gemini-2.0-flash",
    system_content="You are an assistant that classifies the sentiment of the message into positive, negative, and neutral.",
    temperature=0,
    top_p=0,
    max_tokens=150,
    # Read the key from the environment so it is never saved in the notebook;
    # the placeholder is used only when GEMINI_API_KEY is not set.
    api_key=os.environ.get("GEMINI_API_KEY", "Your-API-KEY"),
)

In [None]:
# Source dataset for the prompt templates. This looks like a placeholder path —
# point it at the real sentiment CSV before running.
file_path = "/path/prompts/sentiment/filename.csv"
formatter = ShotTemplateFormatter(file_path)
# Format all rows with shot_type="zero", then save them to
# gemini_std_zero_shot.csv, the file the test-building cell reads back.
formatted_data = formatter.format_all_rows(shot_type="zero")
formatter.save_formatted_data_to_csv(formatted_data, output_filepath="gemini_std_zero_shot.csv")
# Echo the formatted rows for a quick visual check.
print(formatted_data)

In [None]:
# Formatted prompt file read by the test-building cell. NOTE: despite the plural
# name this is a single path string, not a list of paths.
csv_files = "gemini_std_zero_shot.csv"

In [None]:
import pandas as pd

# `csv_files` may hold one path (str) or several. Looping over a bare string
# walks it character by character, which re-reads the CSV and rebuilds the
# suite once per character, so normalise to a list of paths first.
csv_paths = [csv_files] if isinstance(csv_files, str) else list(csv_files)

# A single suite collects the tests from every file so that run_all() below
# executes all of them, not just those built from the last file.
test_suite = TestSuite()
for file_path in csv_paths:
    # Load the formatted prompts; the columns read below are
    # original_prompt, perturb_prompt and expected_result.
    df = pd.read_csv(file_path)

    # One Test per row, pairing the original prompt with its perturbed variant.
    # Names are numbered per file, so they repeat if several files are given.
    for index, row in df.iterrows():
        test = Test(
            name=f"Test fairness #{index + 1}",  # Replace with your perturbation type
            prompt=row["original_prompt"],  # Original prompt
            expected_result=row["expected_result"],  # Expected result
            description="A test with fairness perturbation",  # Replace with your perturbation type
            test_type="fairness",  # Replace with your perturbation type
            perturb_text=row["perturb_prompt"],  # Perturbed prompt
        )
        test_suite.add_test(test)

# Run every collected test against the configured model; `results` and
# `summary` are kept for the next cell.
test_suite.run_all(completion)
results, summary = test_suite.summarize()

test_suite.export_results("gemini_std_zero_test_results.json", file_format="json", overwrite=True)

# Empty the suite once its results are exported.
test_suite.clear()


In [None]:
# Show the two values returned by test_suite.summarize() in the previous cell.
print("Results:", results)
print("Summary:", summary)

### STD One Shot

In [None]:
# Completion settings for the Gemini one-shot run: the system prompt carries
# the task description plus a single labelled example.
completion = PromptCompletion(
    model_provider="gemini",
    model="gemini-2.0-flash",
    system_content="""
You are an assistant that classifies the sentiment of the message into positive, negative, and neutral. Given below is an example of the sentiment analysis task.

Sentence: I had a bad experience
Sentiment: Negative
""",
    temperature=0,
    top_p=0,
    max_tokens=150,
    # Read the key from the environment so it is never saved in the notebook;
    # the placeholder is used only when GEMINI_API_KEY is not set.
    api_key=os.environ.get("GEMINI_API_KEY", "Your-API-KEY"),
)

In [None]:
# Source dataset for the prompt templates. This looks like a placeholder path —
# point it at the real sentiment CSV before running.
file_path = "/path/prompts/sentiment/filename.csv"
formatter = ShotTemplateFormatter(file_path)
# Format all rows with shot_type="one", then save them to
# gemini_std_one_shot.csv, the file the test-building cell reads back.
formatted_data = formatter.format_all_rows(shot_type="one")
formatter.save_formatted_data_to_csv(formatted_data, output_filepath="gemini_std_one_shot.csv")
# Echo the formatted rows for a quick visual check.
print(formatted_data)

In [None]:
# Formatted prompt file read by the test-building cell. NOTE: despite the plural
# name this is a single path string, not a list of paths.
csv_files = "gemini_std_one_shot.csv"

In [None]:
import pandas as pd

# `csv_files` may hold one path (str) or several. Looping over a bare string
# walks it character by character, which re-reads the CSV and rebuilds the
# suite once per character, so normalise to a list of paths first.
csv_paths = [csv_files] if isinstance(csv_files, str) else list(csv_files)

# A single suite collects the tests from every file so that run_all() below
# executes all of them, not just those built from the last file.
test_suite = TestSuite()
for file_path in csv_paths:
    # Load the formatted prompts; the columns read below are
    # original_prompt, perturb_prompt and expected_result.
    df = pd.read_csv(file_path)

    # One Test per row, pairing the original prompt with its perturbed variant.
    # Names are numbered per file, so they repeat if several files are given.
    for index, row in df.iterrows():
        test = Test(
            name=f"Test fairness #{index + 1}",  # Replace with your perturbation type
            prompt=row["original_prompt"],  # Original prompt
            expected_result=row["expected_result"],  # Expected result
            description="A test with fairness perturbation",  # Replace with your perturbation type
            test_type="fairness",  # Replace with your perturbation type
            perturb_text=row["perturb_prompt"],  # Perturbed prompt
        )
        test_suite.add_test(test)

# Run every collected test against the configured model; `results` and
# `summary` are kept for the next cell.
test_suite.run_all(completion)
results, summary = test_suite.summarize()

test_suite.export_results("gemini_std_one_test_results.json", file_format="json", overwrite=True)

# Empty the suite once its results are exported.
test_suite.clear()


In [None]:
# Show the two values returned by test_suite.summarize() in the previous cell.
print("Results:", results)
print("Summary:", summary)

### STD Few Shot

In [None]:
# Completion settings for the Gemini few-shot run: the system prompt carries
# the task description plus one labelled example per sentiment class.
completion = PromptCompletion(
    model_provider="gemini",
    model="gemini-2.0-flash",
    system_content="""You are an assistant that classifies the sentiment of the message into positive, negative, and neutral. Given below are a few examples of the sentiment analysis task.

Sentence: I had a bad experience
Sentiment: Negative

Sentence: The food was not bad
Sentiment: Neutral

Sentence: The movie was impressive.
Sentiment: Positive
""",
    temperature=0,
    top_p=0,
    max_tokens=150,
    # This argument previously held the dataset path (a copy-paste slip), which
    # can never be a valid key. Read the key from the environment instead; the
    # placeholder is used only when GEMINI_API_KEY is not set.
    api_key=os.environ.get("GEMINI_API_KEY", "Your-API-KEY"),
)

In [None]:
# Source dataset for the prompt templates. This looks like a placeholder path —
# point it at the real sentiment CSV before running.
file_path = "/path/prompts/sentiment/filename.csv"
formatter = ShotTemplateFormatter(file_path)
# Format all rows with shot_type="few", then save them to
# gemini_std_few_shot.csv, the file the test-building cell reads back.
formatted_data = formatter.format_all_rows(shot_type="few")
formatter.save_formatted_data_to_csv(formatted_data, output_filepath="gemini_std_few_shot.csv")
# Echo the formatted rows for a quick visual check.
print(formatted_data)

In [None]:
# Formatted prompt file read by the test-building cell. NOTE: despite the plural
# name this is a single path string, not a list of paths.
csv_files = "gemini_std_few_shot.csv"

In [None]:
import pandas as pd

# `csv_files` may hold one path (str) or several. Looping over a bare string
# walks it character by character, which re-reads the CSV and rebuilds the
# suite once per character, so normalise to a list of paths first.
csv_paths = [csv_files] if isinstance(csv_files, str) else list(csv_files)

# A single suite collects the tests from every file so that run_all() below
# executes all of them, not just those built from the last file.
test_suite = TestSuite()
for file_path in csv_paths:
    # Load the formatted prompts; the columns read below are
    # original_prompt, perturb_prompt and expected_result.
    df = pd.read_csv(file_path)

    # One Test per row, pairing the original prompt with its perturbed variant.
    # Names are numbered per file, so they repeat if several files are given.
    for index, row in df.iterrows():
        test = Test(
            name=f"Test fairness #{index + 1}",  # Replace with your perturbation type
            prompt=row["original_prompt"],  # Original prompt
            expected_result=row["expected_result"],  # Expected result
            description="A test with fairness perturbation",  # Replace with your perturbation type
            test_type="fairness",  # Replace with your perturbation type
            perturb_text=row["perturb_prompt"],  # Perturbed prompt
        )
        test_suite.add_test(test)

# Run every collected test against the configured model; `results` and
# `summary` are kept for the next cell.
test_suite.run_all(completion)
results, summary = test_suite.summarize()

test_suite.export_results("gemini_std_few_test_results.json", file_format="json", overwrite=True)

# Empty the suite once its results are exported.
test_suite.clear()


In [None]:
# Show the two values returned by test_suite.summarize() in the previous cell.
print("Results:", results)
print("Summary:", summary)