In [None]:
# Packages
%pip install instructor
%pip install openai
%pip install pydantic
%pip install requests
%pip install s3fs
%pip install pandas
%pip install pyarrow


In [35]:
# Imports
import os
import logging
from typing import Any

import instructor
import pandas as pd
import s3fs
from openai import AzureOpenAI
from pydantic import BaseModel
from typing import Callable

In [36]:
# Constants
# Azure OpenAI resource that serves the chat-completions deployment.
DEFAULT_AZURE_ENDPOINT = "https://general-experimentation.openai.azure.com/"
# Read the key from the environment so a real credential never has to be typed
# into (and saved with) the notebook. The original placeholder is kept as the
# fallback, so behavior is unchanged when the variable is not set.
DEFAULT_API_KEY = os.environ.get("AZURE_OPENAI_API_KEY", "my_api_key")
DEFAULT_API_VERSION = "2024-04-01-preview"
# Passed as `model` to the chat-completions call.
DEFAULT_MODEL_NAME = "gpt-4o"
# Relative paths are resolved against the current working directory by make_absolute().
DEFAULT_OUTPUT_FILE = "dataset.parquet"
# Total number of rows gen_dataset() tries to generate (split between both labels).
DEFAULT_BATCH_SIZE = 24

In [43]:
# Logging
# Root logger: INFO and above, each record prefixed with timestamp and level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

In [54]:
# Types
# A premise/hypothesis pair with a boolean label and a free-text rationale.
# NOTE(review): not referenced by any other visible cell in this notebook.
class NLIExample(BaseModel):
  premise: str
  hypothesis: str
  label: bool
  label_rationale: str  # explanation accompanying `label`

# One generated dataset row. Used as the `response_model` in
# generate_dataset_row(), and each row becomes one DataFrame record in
# gen_dataset(). The first three fields mirror the 'input', 'output' and
# 'ground_truth' columns described in the system prompt.
class DatasetRow(BaseModel):
  input: str
  output: str
  ground_truth: str
  label: bool  # the binary label the prompts ask the model to produce
  label_rationale: str  # explanation accompanying `label`

In [55]:
def initialize_client(
    api_key: str, azure_endpoint: str, api_version: str
) -> Any:
  """Build an Azure OpenAI client and wrap it with `instructor`.

  Args:
    api_key: Key passed to `AzureOpenAI`.
    azure_endpoint: Endpoint URL passed to `AzureOpenAI`.
    api_version: API version string passed to `AzureOpenAI`.

  Returns:
    Whatever `instructor.from_openai` returns for the constructed client.
  """
  logging.info("Initializing Azure OpenAI client.")
  connection_settings = {
      "api_key": api_key,
      "api_version": api_version,
      "azure_endpoint": azure_endpoint,
  }
  return instructor.from_openai(AzureOpenAI(**connection_settings))


def generate_system_prompt() -> str:
  """Return the fixed system prompt sent with every row-generation request."""
  # Plain literal: the text contains no placeholders, so no f-string is needed.
  system_prompt = """
    We are a company developing evaluation tools for AI applications, such as RAG, multi-agent systems, and generally applications that use large language models.
    We are using an NLI model to evaluate the performance of the AI application.
    Our platform is called AutoEval, and it works as follows:
    1. A developer uploads a dataset of trace logs from the AI application. The dataset must contain the columns 'input', 'output', and 'ground_truth'.
       input and output are the input and output of the AI application, respectively, and ground_truth can be either the expected output or
       the context retrieved from the RAG application (or a tool call) that is used to ground the response.
    2. A developer then specifies their evaluation criteria, which is effectively a prompt, and an LLM is used to label the dataset based on the criteria (either 1 or 0).
    3. The labeled dataset is then used to train the NLI model.
    4. The NLI model is then used to evaluate the AI application.

    In order to properly test out AutoEval, we would like to generate datasets that are representative of the types of data that we would expect to see in the real world.
    Limit the content to 512 BPE tokens, as that is the maximum length that the NLI model can handle.
    """
  return system_prompt

def generate_dataset_row(
    client,
    generate_dataset_row_prompt: Callable[[bool], str],
    model_name: str,
    is_positive: bool,
    seed: int,
):
  """Request one `DatasetRow` from the chat-completions endpoint.

  Args:
    client: Object exposing `chat.completions.create` (as returned by
      `initialize_client`).
    generate_dataset_row_prompt: Callable that builds the user prompt from the
      desired polarity.
    model_name: Value passed as `model`.
    is_positive: True for an entailment row, False for a contradiction row.
    seed: Value passed as `seed` to the completion call.

  Returns:
    The object returned by `client.chat.completions.create` (requested with
    `response_model=DatasetRow`).

  Raises:
    Exception: Any error from the completion call is logged and re-raised.
  """
  example_kind = 'entailment' if is_positive else 'contradiction'
  logging.info(f"Generating {example_kind} example row with seed {seed}.")
  user_prompt = generate_dataset_row_prompt(is_positive)

  try:
    chat_messages = [
        {"role": "system", "content": generate_system_prompt()},
        {"role": "user", "content": user_prompt},
    ]
    return client.chat.completions.create(
        model=model_name,
        response_model=DatasetRow,
        max_retries=3,
        seed=seed,
        messages=chat_messages,
    )
  except Exception as err:
    logging.error(f"Failed to generate dataset row: {err}")
    raise


def write_df_to_parquet(df: pd.DataFrame, file_path: str):
  """Save `df` as a Parquet file at `file_path` (without the index) and log the path."""
  df.to_parquet(path=file_path, index=False)
  logging.info(f"Saved examples to {file_path}")


def upload_to_s3(file_path: str, s3_path: str):
  """Upload a local Parquet file to S3.

  The local file is read into a DataFrame and re-written to `s3_path` with the
  pyarrow engine.

  Args:
    file_path: Path of the local Parquet file to read.
    s3_path: Destination path opened through `s3fs` for binary writing.

  Raises:
    Exception: Errors from reading the local file or writing to S3 propagate.
  """
  # Read the local file BEFORE opening the remote handle. If the read fails
  # (missing or corrupt file), no write handle has been opened, so the
  # destination object is never touched.
  df = pd.read_parquet(file_path)
  s3 = s3fs.S3FileSystem(anon=False)
  with s3.open(s3_path, 'wb') as f:
    df.to_parquet(f, engine='pyarrow')
  logging.info(f"Uploaded dataset to {s3_path}")

In [56]:
def make_absolute(path):
    """Return `path` as-is when absolute, otherwise joined onto the current working directory."""
    if os.path.isabs(path):
        return path
    # Relative path: anchor it at the directory the kernel is running in.
    return os.path.join(os.getcwd(), path)

def _row_to_dict(row):
    """Convert a generated row to a plain dict, preferring pydantic v2's `model_dump`."""
    # `.dict()` is deprecated in pydantic v2; fall back to it only on v1 models.
    if hasattr(row, "model_dump"):
        return row.model_dump()
    return row.dict()


def gen_dataset(
    generate_dataset_row_prompt: Callable[[bool], str],
    api_key: str = DEFAULT_API_KEY,
    azure_endpoint: str = DEFAULT_AZURE_ENDPOINT,
    api_version: str = DEFAULT_API_VERSION,
    model_name: str = DEFAULT_MODEL_NAME,
    output_file: str = DEFAULT_OUTPUT_FILE,
    batch_size: int = DEFAULT_BATCH_SIZE) -> pd.DataFrame:
    """Generate a labelled dataset, checkpointing it to Parquet as it grows.

    Rows are generated alternately as entailment (positive) and contradiction
    (negative) examples: `batch_size // 2` positives and the remainder
    negatives. After every loop iteration all rows collected so far are
    written to `output_file`, so an interrupted run keeps its progress.

    A failed generation is logged and skipped; it does not stop the run and
    does not prevent the other polarity in the same iteration from being
    generated. The result may therefore contain fewer than `batch_size` rows,
    in which case a warning is logged.

    Args:
        generate_dataset_row_prompt: Builds the user prompt for a polarity.
        api_key: Azure OpenAI API key.
        azure_endpoint: Azure OpenAI endpoint URL.
        api_version: Azure OpenAI API version.
        model_name: Value passed as `model` for each request.
        output_file: Parquet path; relative paths are resolved against the
            current working directory.
        batch_size: Total number of rows to attempt.

    Returns:
        DataFrame of the successfully generated rows (empty if none).

    Raises:
        Exception: Errors from writing the Parquet checkpoint propagate.
    """
    df: pd.DataFrame = pd.DataFrame()

    client = initialize_client(
        api_key, azure_endpoint, api_version
    )

    # Loop-invariant: resolve the checkpoint path once.
    file_path = make_absolute(output_file)

    examples = []
    num_entailment = batch_size // 2
    num_contradiction = batch_size - num_entailment
    seed = 0

    logging.info(
        f"Generating {batch_size} examples interleaved between entailment and contradiction."
    )

    for i in range(max(num_entailment, num_contradiction)):
        for is_positive, quota in ((True, num_entailment), (False, num_contradiction)):
            if i >= quota:
                continue
            try:
                example = generate_dataset_row(
                    client, generate_dataset_row_prompt, model_name,
                    is_positive=is_positive, seed=seed
                )
            except Exception as e:
                # Each polarity is handled independently so one failure does
                # not also drop its sibling example for this iteration.
                logging.error(
                    "An error occurred during example generation. Skipping this example and continuing. Error: %s",
                    e
                )
            else:
                examples.append(_row_to_dict(example))
            finally:
                # Advance on every attempt so a seed that failed is not reused.
                seed += 1

        # Checkpoint everything collected so far after each iteration.
        df = pd.DataFrame(examples)
        write_df_to_parquet(df, file_path)

    if len(examples) < batch_size:
        logging.warning(
            "Generated %d of %d requested examples; some generations failed (see errors above).",
            len(examples), batch_size
        )

    return df

## SuperCard Dining Concierge

A conversational assistant offered by a credit card company to provide restaurant recommendations to customers.

In [52]:
def generate_supercard_dataset_row_prompt(is_positive: bool) -> str:
  """Build the user prompt for one SuperCard Dining Assistant dataset row.

  Args:
    is_positive: True to request an entailment row, False to request a
      contradiction row.

  Returns:
    The prompt text: scenario description, few-shot examples for both labels,
    and a final instruction naming the requested label.
  """
  interaction_type = "entailment" if is_positive else "contradiction"
  # The first positive example previously listed 'Mike Toronto' in ground_truth
  # while the output said 'Miku Toronto'; the names must match for the example
  # to actually be grounded.
  return f"""
    A credit card company SuperCard wants to encourage its customers to dine out more often. 
    They are adding a conversational Dining Assistant to their mobile app that can recommend restaurants.

    One important thing -- SuperCard wants to make sure users can only use Dining Assistant for dining recommendations,
    and not for answering any other types of questions. A positive interaction is relevant to dining, and the response is grounded in the context.

    Some example positive (NLI entailment) interactions are:
    
    input: 'I'm looking for a sushi place in downtown Toronto for a group of four tomorrow evening. Any recommendations?'
    ground_truth: 'Miku Toronto, JaBistro, Yasu Toronto'
    output: 'For sushi in downtown Toronto, Miku Toronto offers a sophisticated dining experience, JaBistro is known for its sashimi, and Yasu Toronto provides an intimate omakase option.'
    
    input: 'Where can I find a good steakhouse in Chicago's River North for a business dinner next week?'
    ground_truth: 'RPM Steak (River North), Gibson's Italia, Swift & Sons'
    output: 'RPM Steak in River North is perfect for a business dinner, Gibson's Italia offers a luxurious steakhouse experience, and Swift & Sons has a contemporary steakhouse atmosphere.'
    
    Negative (NLI contradiction) interactions are:

    input: 'Can you show me my recent bank transactions?'
    ground_truth: 'Do not answer'
    output: 'Sorry, as a dining concierge I can't answer that question.'

    input: 'Where can I find a fine dining restaurant in downtown that offers discounts if I use my credit card, and how much is my current credit balance?'
    ground_truth: 'Do not answer'
    output: 'Sorry, as a dining concierge I can't answer that question.'

    Generate a dataset row with a binary label for {interaction_type}.
    """

# Small smoke run: one entailment row and one contradiction row.
df = gen_dataset(
    generate_dataset_row_prompt=generate_supercard_dataset_row_prompt,
    batch_size=2,
)

In [57]:
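# Upload the final dataset to the specified S3 path.
# NOTE: `file_path` and `args` are not defined in any visible cell of this notebook;
# replace them with an explicit local path and S3 path before enabling this call.
# upload_to_s3(file_path, args.s3_path)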

2024-10-27 00:26:11,465 - INFO - Initializing Azure OpenAI client.
2024-10-27 00:26:11,476 - INFO - Generating 2 examples interleaved between entailment and contradiction.
2024-10-27 00:26:11,476 - INFO - Generating entailment example row with seed 0.
2024-10-27 00:26:13,220 - INFO - HTTP Request: POST https://general-experimentation.openai.azure.com//openai/deployments/gpt-4o/chat/completions?api-version=2024-04-01-preview "HTTP/1.1 200 OK"
2024-10-27 00:26:13,222 - INFO - Generating contradiction example row with seed 1.
2024-10-27 00:26:14,760 - INFO - HTTP Request: POST https://general-experimentation.openai.azure.com//openai/deployments/gpt-4o/chat/completions?api-version=2024-04-01-preview "HTTP/1.1 200 OK"
2024-10-27 00:26:15,579 - INFO - HTTP Request: POST https://general-experimentation.openai.azure.com//openai/deployments/gpt-4o/chat/completions?api-version=2024-04-01-preview "HTTP/1.1 200 OK"
2024-10-27 00:26:15,593 - INFO - Saved examples to /Users/saqadri/lm/lmai2/datasets

## Data Warehouse Search

A multi-agent retrieval system that connects to multiple data sources and provides a unified search interface.

In [65]:
def generate_warehouse_dataset_row_prompt(is_positive: bool) -> str:
  """Build the user prompt for one MediaCo Warehouse search dataset row.

  Args:
    is_positive: True to request an entailment row, False to request a
      contradiction row.

  Returns:
    The prompt text: system description, agent list, few-shot examples for
    both labels, and a final instruction naming the requested label.
  """
  if is_positive:
    requested_label = "entailment"
  else:
    requested_label = "contradiction"

  prompt_text = f"""
    A media company MediaCo owns many media properties, including a book publisher, music label, film studio, TV network and more.
    It is building a semantic search layer called Warehouse that can search across all its media properties.
    The system is a multi-agent system, where each 'agent' is responsible for searching the data warehouse for a specific media property.
    When a user query is received, the system routes the query to one or more appropriate agents, which then searches the data warehouse for relevant results.
    A response is generated by combining the results from the agents.

    Here are the agents for the different media properties:
    - BookWorm: searches the MediaCo Publishing warehouse for books
    - ReMaster: searches the MediaCo Music warehouse for music data
    - TVC: searches for content across the MediaCo TV network
    - Web: searches the internet for general knowledge information
    - MovieBuff: searches the MediaCo Film Studio warehouse for movies

    Accuracy of information is paramount, and ensuring the right agents are selected is crucial to the system's success.

    Some example positive (NLI entailment) interactions are:

    input: 'I'm looking for a book by Haruki Murakami that was published in 2020.'
    ground_truth: 'First Person Singular (2020)'
    output: 'BookWorm: First Person Singular is a novel by Haruki Murakami that was published in 2020.'

    input: 'What was the song that stayed on the Top100 the longest?'
    ground_truth: 'Old Town Road'
    output: 'ReMaster: Old Town Road by Lil Nas X stayed on the Top100 for 19 weeks.'

    Negative (NLI contradiction) interactions are:

    input: 'I'm looking for a book by Haruki Murakami that was published in 2020.'
    ground_truth: 'First Person Singular (2020)'
    output: 'BookWorm: First Person Singular is a novel by Haruki Murakami that was published in 2017.'

    input: 'Who stars in the final season of Game of Thrones?'
    ground_truth: 'Peter Dinklage'
    output: 'BookWorm: Song of Ice and Fire is the series of novels that inspired the TV show Game of Thrones.'
    
    input: 'What TV shows are playing on TVC tonight?'
    ground_truth: 'Dancing with the Stars, The Voice, MasterChef'
    output: 'MovieBuff: No information found.
    Web: TVC is a television network that airs reality TV shows.'
    
    input: 'What TV shows are playing on TVC tonight?'
    ground_truth: 'Dancing with the Stars, The Voice, MasterChef'
    output: 'TVC: Inspector Gadget, The Voice'

    Generate a dataset row with a binary label for {requested_label}.
    """
  return prompt_text

In [None]:
# Full run for the Data Warehouse Search scenario.
df = gen_dataset(
    generate_dataset_row_prompt=generate_warehouse_dataset_row_prompt,
    output_file="/Users/saqadri/lm/lmai2/datasets/data-warehouse-search-dataset.parquet",
    batch_size=256,
)

## Strike Air Customer Support

A conversational agent that answers customer questions for an airline.

In [79]:
def generate_airline_cs_dataset_row_prompt(is_positive: bool) -> str:
  """Build the user prompt for one StrikeAir customer-support dataset row.

  Args:
    is_positive: True to request an entailment row, False to request a
      contradiction row.

  Returns:
    The prompt text: scenario description, few-shot examples for both labels,
    and a final instruction naming the requested label.
  """
  if is_positive:
    requested_label = "entailment"
  else:
    requested_label = "contradiction"

  prompt_text = f"""
    An airline StrikeAir is developing a conversational system to assist customers with general inquiries.
    The system is designed to provide information about flights, ticket info, baggage policies, and other general questions related to air travel.

    Accuracy and time-sensitive information are paramount in the system.
    Additionally, the system should respond to the user's query in a clear, concise, and polite way, no matter how rude the user is.

    Some example positive (NLI entailment) interactions are:

    input: 'What gate is my flight departing from?'
    ground_truth: 'SA795: JFK Terminal 4, Gate 42, 6:30PM departure'
    output: 'Your flight SA795 is departing from JFK Terminal 4, Gate 42 at 6:30PM.'

    input: 'How many free checked bags am I allowed on long-haul flights?'
    ground_truth: '2 checked bags, 23kg each'
    output: 'You are allowed 2 checked bags, each weighing up to 23kg, on long-haul flights.'

    input: 'This is the worst airline I've ever flown with. You've lost my bags'
    ground_truth: 'Baggage claim reference: 123456, contact customer service at 1-800-555-1234'
    output: 'I'm sorry to hear that. Your baggage claim reference is 123456. Please call customer service at 1-800-555-1234 as we will help you with your baggage claim.'

    Negative (NLI contradiction) interactions are:

    input: 'What is the phone number for customer service?'
    ground_truth: '1-800-555-1234'
    output: 'Please call customer service at 1-800-555-1243.'

    input: 'What gate is my flight departing from?'
    ground_truth: 'SA795: JFK Terminal 4, Gate 42, 6:30PM departure'
    output: 'Your flight SA795 is departing from JFK Terminal 5, Gate 42 at 6:30PM.'

    input: 'How many free checked bags am I allowed on long-haul flights?'
    ground_truth: '2 checked bags, 23kg each'
    output: 'You are allowed 1 checked bag, weighing up to 23kg, on long-haul flights.'

    input: 'This is the worst airline I've ever flown with. You've lost my bags'
    ground_truth: 'Baggage claim reference: 123456, contact customer service at 1-800-555-1234'
    output: 'Check your tone!? Your baggage claim reference is 123456. Call customer service at 1-800-555-1234...'


    Generate a dataset row with a binary label for {requested_label}.
    """
  return prompt_text

In [None]:
# Full run for the StrikeAir customer-support scenario.
df = gen_dataset(
    generate_dataset_row_prompt=generate_airline_cs_dataset_row_prompt,
    output_file="/Users/saqadri/lm/lmai2/datasets/strike-air-customer-support-dataset.parquet",
    batch_size=256,
)

## Financial Wealth Advisor

An AI assistant for financial advisors, helping them more efficiently provide financial advice to their clients.

In [100]:
def generate_financial_advisor_dataset_row_prompt(is_positive: bool) -> str:
  """Build the user prompt for one WMB financial-advisor dataset row.

  Args:
    is_positive: True to request an entailment row, False to request a
      contradiction row.

  Returns:
    The prompt text: scenario description, one few-shot example per label,
    and a final instruction naming the requested label.
  """
  if is_positive:
    requested_label = "entailment"
  else:
    requested_label = "contradiction"

  prompt_text = f"""
    A wealth management firm WMB is developing an AI application to help financial advisors get quick answers to common questions that clients ask.
    Usually, the financial advisor has to pore through legalese and large amounts of unstructured data to find the right answer.
    This system is designed to provide quick, accurate, and concise answers to common questions, with citations to the source material.

    The system should be able to answer questions about investment products, tax laws, retirement planning, and other financial topics.

    Accuracy is the most critical aspect of the system. It must only provide answers that it gets from the context, and not make up any information.

    Some example positive (NLI entailment) interactions are:

    input: 'How do I set up a backdoor Roth IRA?'
    ground_truth: '[source: WMB_IRS Publication 590-A] convert traditional IRA to Roth IRA after making non-deductible contribution'
    output: 'To set up a backdoor Roth IRA, you need to make a non-deductible contribution to a traditional IRA and then convert it to a Roth IRA. [see WMB_IRS Publication 590-A].'

    Negative (NLI contradiction) interactions are:

    input: 'How do I set up a backdoor Roth IRA?'
    ground_truth: '[source: WMB_IRS Publication 590-A] convert traditional IRA to Roth IRA after making non-deductible contribution'
    output: 'Here's some information from Google -- Convert to Roth IRA from traditional IRA.'

    Generate a dataset row with a binary label for {requested_label}. Please try to think of examples that aren't just related to 401k or IRA.
    """
  return prompt_text

In [None]:
# Full run for the WMB financial-advisor scenario.
df = gen_dataset(
    generate_dataset_row_prompt=generate_financial_advisor_dataset_row_prompt,
    output_file="/Users/saqadri/lm/lmai2/datasets/financial-advisor-dataset.parquet",
    batch_size=256,
)

In [104]:
from IPython.display import display, HTML

# Preview the first 15 rows in a scrollable, non-wrapping table.
# Cell text is model-generated, so it is HTML-escaped (escape=True) rather than
# injected raw: stray '<', '>' or '&' characters can neither break the table
# markup nor be interpreted as HTML by the notebook front end.
preview_html = df.head(n=15).to_html(escape=True)
display(HTML(preview_html.replace(
    '<table border="1" class="dataframe">',
    '<table border="1" class="dataframe" style="display:block; max-height:500px; overflow:auto; white-space:nowrap;">'
)))

Unnamed: 0,input,output,ground_truth,label,label_rationale
0,What is the tax treatment of municipal bond interest?,The interest earned on municipal bonds is generally exempt from federal income tax and may also be exempt from state and local taxes if you reside in the state issuing the bond. Refer to WMB_Tax Guide 2023 for details.,"[source: WMB_Tax Guide 2023] municipal bond interest is exempt from federal income tax, and may be exempt from state and local taxes for residents of the issuing state",True,The output correctly states that the interest earned on municipal bonds is generally exempt from federal income tax. It also mentions the possibility of state and local tax exemptions if the bondholder resides in the issuing state. This information matches the context given in the ground truth.
1,What are the tax implications of selling a long-term stock investment?,There are no tax implications for selling a long-term investment.,[source: WMB_Tax Guide 2022] Long-term capital gains are taxed at different rates depending on your income level.,False,"The output incorrectly states that there are no tax implications for selling a long-term investment, which contradicts the information given in the ground_truth that long-term capital gains are taxed at different rates based on income. Thus, the label for this interaction should be 0 (contradiction)."
2,What are the tax implications for withdrawing from a 529 plan?,"Withdrawals from a 529 plan used for qualified education expenses are tax-free. For non-qualified withdrawals, earnings may be subject to income tax and a 10% penalty.","[source: WMB_529 Guide] Withdrawals for qualified education expenses are tax-free, while non-qualified withdrawals may incur income taxes and a 10% penalty on earnings.",True,"The output aligns with the information provided in the context specified by the ground_truth. Both state that withdrawals for qualified education expenses are tax-free and that non-qualified withdrawals may incur income taxes and a 10% penalty, which confirms entailment."
3,What is the capital gains tax rate for long-term investments?,"For long-term investments, the capital gains tax rate is typically 20% or higher, depending on your income bracket and specific circumstances. Always consult with a tax advisor. [Source: generic tax advice website]","[source: WMB_Tax Code 2023, Section 1234] The capital gains tax rate for long-term investments varies: 0% for individuals in the 10-15% income brackets, 15% for those in the 25-35% brackets, and 20% for individuals in the 39.6% bracket.",False,The output contradicts the provided ground truth by not specifying the detailed tax rates for different income brackets as mentioned in the source.
4,What are the advantages of investing in an index fund?,"Index funds offer diversification, low fees, and consistent returns that match the market index over time. [source: WMB_Investment_Guide Chapter 4]","[source: WMB_Investment_Guide Chapter 4] Index funds provide diversification, reduce expense ratios, and generally yield returns in line with their underlying index.",True,"The output accurately reflects the advantages of investing in an index fund based on the source material provided in the ground truth. Both mention diversification, low fees, and returns aligning with the market index."
5,What are the tax benefits of a Health Savings Account (HSA)?,Health Savings Accounts allow you to save money on a pre-tax basis to pay for qualified medical expenses. This can be deducted from federal income taxes.,"[source: WMB_IRS Publication 969] HSAs offer triple tax advantages: contributions are tax-deductible, earnings are tax-free, and withdrawals for qualified medical expenses are tax-free.",False,"The output misses significant details by only mentioning that the contributions can be pre-taxed but fails to mention the triple tax advantage which includes tax-free earnings and tax-free withdrawals for qualified medical expenses. This creates a misleading or incomplete understanding, leading to contradiction."
6,What are the tax implications of selling a rental property?,The tax implications of selling a rental property include capital gains tax if sold for a profit and recapture of depreciation expenses. [source: WMB_Tax Guide 2022],[source: WMB_Tax Guide 2022] Selling a rental property may result in capital gains tax if sold for a profit and recapture of depreciation expenses.,True,"The output accurately reflects the information in the ground_truth and cites the same source, which fulfills the criteria for entailment."
7,What are the tax implications of gifting stocks to family members?,"When you gift stocks to family members, you have to pay a gift tax, which is typically 15%.",[source: WMB_Tax Code Section 2503] Gift tax implications vary depending on the fair market value of the stock and the annual gift exclusion limit. There are also potential capital gains taxes for the recipient.,False,"The output states that there is a 15% gift tax, which is not mentioned in the ground truth. Additionally, the ground truth mentions other factors like fair market value and capital gains tax implications, which the output does not address."
8,What are the potential tax benefits of investing in municipal bonds?,"Investing in municipal bonds may provide income that is exempt from federal and, in some cases, state and local income taxes. [see WMB_Financial Benefits Report].","[source: WMB_Financial Benefits Report] municipal bonds income often exempt from federal, sometimes state/local taxes",True,"The output accurately reflects the information in the ground_truth, providing details about the potential tax benefits of investing in municipal bonds, as specified in the context."
9,What is the annual limit for 401(k) contributions?,The annual limit for 401(k) contributions is set by your employer and varies by company. Please check with your HR department for specifics.,"[source: IRS_401k_Limits] For 2023, the annual contribution limit for a 401(k) is $22,500 for individuals under 50, and an additional catch-up contribution of $7,500 for those over 50.",False,"The output does not match the ground truth source material. The provided answer incorrectly states that the limit is set by the employer, whereas the ground_truth indicates specific contribution limits defined by the IRS for 2023."
