<a href="https://colab.research.google.com/github/lavanyashukla/tick-tock/blob/main/LWM_1_2_Tick_Tock_%E2%80%93_Bug_or_not.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# I. Setup

In [1]:
!pip install openai -qq
from openai import OpenAI
from google.colab import userdata
from collections import Counter
import pandas as pd
import random
import csv
import json
import os

In [2]:
!git clone https://github.com/lavanyashukla/tick-tock.git

fatal: destination path 'tick-tock' already exists and is not an empty directory.


In [3]:
# load the data
input_file = 'tick-tock/data/support_tickets_clean.csv'

# Open the CSV file for reading
with open(input_file, mode='r', encoding='utf-8') as csvfile:
    # Use csv.DictReader to read the CSV file into a dictionary
    reader = csv.DictReader(csvfile)

    # Initialize a list to store each row (as a dictionary)
    data = []

    # Iterate over the rows in the CSV file
    for row in reader:
        # Each row is a dictionary
        data.append(row)

# Now 'data' is a list of dictionaries, where each dictionary represents a row from the CSV
print(data[0])
print(len(data))

{'description': '10 out of 10\nComments: GrandMaster Training Data Platform\nChannel: In-app', 'raw_subject': 'NPS Response', 'subject': 'NPS Response', 'priority': 'low', 'problem_id': '', 'tags': "['nps_score', 'pendo_nps', 'personal', 'question']", 'id': '36652', 'question': 'question', 'customer_type': "['personal']", 'customer_type_2': '', 'type': 'nps_score'}
26590


In [4]:
# Extracting the 'question' values where available
questions = [item.get('question', 'No question provided') for item in data]

# Counting occurrences of each unique question (or the placeholder 'No question provided')
question_counts = pd.Series(questions).value_counts().reset_index()
question_counts.columns = ['Question', 'Count']

# Display the counts in a table
print(question_counts.to_string(index=False))

            Question  Count
            question  21173
                none   2389
            type_bug   1536
type_feature_request   1492


In [5]:
# Initialize an empty set to store unique questions
unique_questions = set()

# Iterate over each dictionary in the data list
for item in data:
    # Check if 'question' key exists in the dictionary
    if 'question' in item:
        # Add the question to the set (sets automatically handle uniqueness)
        unique_questions.add(item['question'])

# Convert the set back to a list to get a list of unique questions
desired_tags = list(unique_questions)

print(desired_tags)

['type_feature_request', 'type_bug', 'none', 'question']


Steps:

- Navigate to the "Secrets" pane from the left navigation bar
- Add the OPENAI_API_KEY name and value
- Turn on the toggle of "Notebook access"

In [6]:
client = OpenAI(
    # This is the default and can be omitted
    api_key=userdata.get('OPENAI_API_KEY')
)

desired_tags = ["question", "none", "type_bug", "type_feature_request"]
filtered_data = [row for row in data if row.get('question') in desired_tags]
# filtered_data = data

# Make a filtered dataset

In [7]:
# Let's look at the data
show_elem = 4
print(json.dumps(data[show_elem], indent=4))

{
    "description": "10 out of 10\nComments: No message provided.\nChannel: Email",
    "raw_subject": "NPS Response",
    "subject": "NPS Response",
    "priority": "low",
    "problem_id": "",
    "tags": "['nps_score', 'pendo_nps', 'question']",
    "id": "36643",
    "question": "question",
    "customer_type": "",
    "customer_type_2": "",
    "type": "nps_score"
}


In [8]:
count, score_priority, score_tags = 0, 0, 0
acc = {}

# Prompt Engineering

In [9]:
def get_completion(prompt, model="gpt-3.5-turbo", temperature=0):
    messages = [{"role": "user", "content": prompt}]
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
    )
    return response.choices[0].message.content


evalset_start_all, evalset_end_all = 500, 520

In [10]:
print(acc)

{}


# II. Basic Prompt + Evaluation

In [11]:
def basic_prompt(evalset_start=400, evalset_end=500):
  count, score_tags = 0, 0
  for element in filtered_data[evalset_start:evalset_end]:
      # print(element)
      ticket_text = element['description']

      # Prompt – Classify Class
      prompt = f"""
      Classify the text delimited by triple backticks into one of the following classes.
      Classes: {desired_tags}
      Text: ```{ticket_text}```
      Class: """

      response = get_completion(prompt)

      print("Prediction: "+response)

      if(response == element['question']):
          score_tags += 1
          print("Correct. Actual: "+element['question'])
      else:
          print("Incorrect. Actual: "+element['question'])
      count += 1
      print()

  print("__________________")
  print(f"Priority Accuracy: {score_tags/count}")
  return score_tags/count
acc['basic_prompt'] = basic_prompt(evalset_start_all, evalset_end_all)

Prediction: none
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: type_bug
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: type_feature_request
Incorrect. Actual: question

Prediction: type_bug
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: type_feature_request
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

__________________
Priority Accuracy: 0.

# Add Examples

In [12]:
# add examples to the prompt. pick examples randomly unless start is set.
# supports adding both random examples and a pre-defined set of examples
# evaluate on examples filtered_data[evalset_start:evalset_end]
def add_examples(num_examples=5, start=0, evalset_start=400, evalset_end=500, model="gpt-3.5-turbo", temperature=0
                 ):
  count, score_tags = 0, 0
  for element in filtered_data[evalset_start:evalset_end]:
      if start==0:
        # Ensure num_examples does not exceed the length of filtered_data
        num_samples = min(num_examples, len(filtered_data))

        # Randomly pick num_samples elements from filtered_data
        random_elements = random.sample(filtered_data, num_samples)
      else:
        random_elements = filtered_data[start:start+num_examples]

      examples = "".join([
          f"Text: ```{element['description']}```\nClass: {element.get('question', 'No question')}\n"
          for element in random_elements
      ])

      # print(element)
      ticket_text = element['description']

      # Prompt – Classify Class
      prompt = f"""
      Classify the text delimited by triple backticks into one of the following classes.
      Classes: {desired_tags}

      {examples}

      Text: ```{ticket_text}```
      Class: """
      response = get_completion(prompt, model=model, temperature=temperature)

      # print("Prompt: "+prompt)
      print("Prediction: "+response)

      if(response == element['question']):
          score_tags += 1
          print("Correct. Actual: "+element['question'])
      else:
          print("Incorrect. Actual: "+element['question'])
      count += 1
      print()

  print("__________________")
  print(f"Priority Accuracy: {score_tags/count}")
  return score_tags/count
acc['add_5_examples'] = add_examples(5, 1000, evalset_start_all, evalset_end_all)

Prediction: none
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: question
Correct. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: type_feature_request
Incorrect. Actual: question

Prediction: question
Correct. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: question
Correct. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: question
Correct. Actual: question

Prediction: none
Incorrect. Actual: question

__________________
Priority Accuracy: 0.35


In [13]:
acc['add_5_random_examples'] = add_examples(5, 0, evalset_start_all, evalset_end_all)

Prediction: question
Correct. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: type_bug
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: type_bug
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: question
Correct. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: question
Correct. Actual: question

Prediction: none
Incorrect. Actual: question

__________________
Priority Accuracy: 0.25


In [14]:
acc['add_5_examples_gpt4'] = add_examples(5, 1000, evalset_start_all, evalset_end_all, model="gpt-4")

Prediction: none
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: type_bug
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: type_bug
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: question
Correct. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: question
Correct. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: question
Correct. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: question
Correct. Actual: question

Prediction: none
Incorrect. Actual: question

__________________
Priority Accuracy: 0.3


In [15]:
acc['add_20_random_examples'] = add_examples(20, 0, evalset_start_all, evalset_end_all)

Prediction: question
Correct. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: type_feature_request
Incorrect. Actual: question

Prediction: type_bug
Incorrect. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: type_bug
Incorrect. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: type_feature_request
Incorrect. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

____________

In [17]:
acc['add_20_examples'] = add_examples(20,1000, evalset_start_all, evalset_end_all)

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: none
Incorrect. Actual: question

__________________
Priority Accuracy: 0.9


# Change Temperature

In [18]:
acc['add_20_examples_change_temp'] = add_examples(20, 1000, evalset_start_all, evalset_end_all, temperature=0.7)

Prediction: none
Incorrect. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

__________________
Priority Accuracy: 0.95

# Change model to GPT-4

In [19]:
acc['add_20_examples_gpt4'] = add_examples(20, 1000, evalset_start_all, evalset_end_all, model="gpt-4")

Prediction: none
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: type_bug
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: type_bug
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: question
Correct. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: question
Correct. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: question
Correct. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: question
Correct. Actual: question

Prediction: none
Incorrect. Actual: question

__________________
Priority Accuracy: 0.3


# Describe Classes

In [20]:
# add examples to the prompt. pick examples randomly unless start is set.
# supports adding both random examples and a pre-defined set of examples
# evaluate on examples filtered_data[evalset_start:evalset_end]
def describe_classes(num_examples=5, start=0, evalset_start=2500, evalset_end=2510, model="gpt-3.5-turbo", temperature=0):
  count, score_tags = 0, 0
  for element in filtered_data[evalset_start:evalset_end]:
      if start==0:
        # Ensure num_examples does not exceed the length of filtered_data
        num_samples = min(num_examples, len(filtered_data))

        # Randomly pick num_samples elements from filtered_data
        random_elements = random.sample(filtered_data, num_samples)
      else:
        random_elements = filtered_data[start:start+num_examples]

      examples = "".join([
          f"Text: ```{element['description']}```\nClass: {element.get('question', 'No question')}\n"
          for element in random_elements
      ])

      # print(element)
      ticket_text = element['description']

      # Prompt – Classify Class
      prompt = f"""
      Given the following description for each class:
      type_feature_request: A request for a feature by a user of Weights & Biases.
      type_bug: A bug report by a user of Weights & Biases.
      question: If the request is not related to Weights & Biases; or doesn't fit the above 2 categories.

      Classify the text delimited by triple backticks into one of the following classes.
      Classes: {desired_tags}

      {examples}

      Text: ```{ticket_text}```
      Class: """
      response = get_completion(prompt)

      # print("Prompt: "+prompt)
      print("Prediction: "+response)

      if(response == element['question']):
          score_tags += 1
          print("Correct. Actual: "+element['question'])
      else:
          print("Incorrect. Actual: "+element['question'])
      count += 1
      print()

  print("__________________")
  print(f"Priority Accuracy: {score_tags/count}")
  print()
  return score_tags/count
acc['add_5_examples_describe_classes'] = describe_classes(5,1000, evalset_start_all, evalset_end_all)
acc['add_20_examples_describe_classes'] = describe_classes(20,1000, evalset_start_all, evalset_end_all)

Prediction: none
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: question
Correct. Actual: question

Prediction: type_bug
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: type_feature_request
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: question
Correct. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: question
Correct. Actual: question

Prediction: none
Incorrect. Actual: question

Prediction: question
Correct. Actual: question

Prediction: none
Incorrect. Actual: question

__________________
Priority Accuracy: 0.4

P

# Specify Steps

In [21]:
# add examples to the prompt. pick examples randomly unless start is set.
# supports adding both random examples and a pre-defined set of examples
# evaluate on examples filtered_data[evalset_start:evalset_end]
def specify_steps(num_examples=5, start=0, evalset_start=2500, evalset_end=2510, examples=1, model="gpt-3.5-turbo", temperature=0):
  count, score_tags = 0, 0
  for element in filtered_data[evalset_start:evalset_end]:
      if start==0:
        # Ensure num_examples does not exceed the length of filtered_data
        num_samples = min(num_examples, len(filtered_data))

        # Randomly pick num_samples elements from filtered_data
        random_elements = random.sample(filtered_data, num_samples)
      else:
        random_elements = filtered_data[start:start+num_examples]

      examples = "".join([
          f"Text: ```{element['description']}```\nClass: {element.get('question', 'No question')}\n"
          for element in random_elements
      ])

      # print(element)
      ticket_text = element['description']

      # Prompt – Classify Class
      if examples:
        prompt = f"""
        Given the text delimited by triple backticks, perform the following actions:
        1 - Summarize what the user wants in 1-2 lines.
        2 - Recommend a next action based on the user' request.
        3 - Determine if the request is related to the product or company 'Weights & Biases'
        4 - Classify the into one of the following classes. Classes: {desired_tags}
        5 - Output a json object that contains the following keys: summary, recommended_action, is_wb, class

        Here are some examples help you with the classification step.
        {examples}

        And here's the text ```{ticket_text}```"""
      else:
        prompt = f"""
        Given the text delimited by triple backticks, perform the following actions:
        1 - Summarize what the user wants in 1-2 lines.
        2 - Recommend a next action based on the user' request.
        3 - Determine if the request is related to the product or company 'Weights & Biases'
        4 - Classify the into one of the following classes. Classes: {desired_tags}
        5 - Output a json object that contains the following keys: summary, recommended_action, is_wb, class

        Here's the text ```{ticket_text}```"""

      response_json = get_completion(prompt)
      # print("Prompt: "+prompt)
      print("Prediction: ")
      response = json.loads(response_json)
      print("Summary: "+response['summary'])
      print("Recommended Action: "+response['recommended_action'])
      print("Is W&B Related: "+str(response['is_wb']))
      print("Prediction: "+response['class'])

      if(response['class'] == element['question']):
          score_tags += 1
          print("Correct. Actual: "+element['question'])
      else:
          print("Incorrect. Actual: "+element['question'])
      count += 1
      print()

  print("__________________")
  print(f"Priority Accuracy: {score_tags/count}")
  return score_tags/count

acc['specify_steps'] = specify_steps(5,1000, evalset_start_all, evalset_end_all)

Prediction: 
Summary: User rated something 10 out of 10 with no message provided.
Recommended Action: Ask the user for more details or feedback on what they rated.
Is W&B Related: False
Prediction: question
Correct. Actual: question

Prediction: 
Summary: User rated something 10 out of 10 with no message provided.
Recommended Action: Ask the user for more details or feedback on what they rated.
Is W&B Related: False
Prediction: question
Correct. Actual: question

Prediction: 
Summary: The user is promoting an event by The Wall Street Journal about the future of transportation.
Recommended Action: Consider registering for the event to learn about e-bikes and map apps.
Is W&B Related: False
Prediction: question
Correct. Actual: question

Prediction: 
Summary: User is experiencing loading issues with their local environment and has a question.
Recommended Action: Check stdout or system logs for error messages and consider restarting the server with the environment variable LOCAL_RESTORE=t

In [22]:
# acc['specify_steps_no_examples'] = specify_steps(5,1000, evalset_start_all, evalset_end_all, examples=0)

# Explain LLM Reasoning Behind Steps

In [23]:
# add examples to the prompt. pick examples randomly unless start is set.
# supports adding both random examples and a pre-defined set of examples
# evaluate on examples filtered_data[evalset_start:evalset_end]
def specify_steps_explain_reasoning(num_examples=5, start=0, evalset_start=2500, evalset_end=2510, examples=1, model="gpt-3.5-turbo", temperature=0):
  count, score_tags = 0, 0
  for element in filtered_data[evalset_start:evalset_end]:
      if start==0:
        # Ensure num_examples does not exceed the length of filtered_data
        num_samples = min(num_examples, len(filtered_data))

        # Randomly pick num_samples elements from filtered_data
        random_elements = random.sample(filtered_data, num_samples)
      else:
        random_elements = filtered_data[start:start+num_examples]

      examples = "".join([
          f"Text: ```{element['description']}```\nClass: {element.get('question', 'No question')}\n"
          for element in random_elements
      ])

      # print(element)
      ticket_text = element['description']

      # Prompt – Classify Class
      if examples:
        prompt = f"""
        Given the text delimited by triple backticks, perform the following actions:
        1 - summary: Summarize what the user wants in 1-2 lines.
        2 - summary_reasoning: Explain your reasoning for the summary.
        3 - recommended_action: Recommend a next action based on the user' request.
        4 - recommended_action_reasoning: Explain your reasoning for the recommended next action.
        5 - is_wb: Determine if the request is related to the product or company 'Weights & Biases'.
        6 - is_wb_reasoning: Explain your reasoning for detemining if the request is W&B related.
        7 - class: Classify the into one of the following classes. Classes: {desired_tags}
        8 - Output a json object that contains the following keys: summary, summary_reasoning, recommended_action, recommended_action_reasoning, is_wb, is_wb_reasoning, class

        Here are some examples help you with the classification step.
        {examples}

        And here's the text ```{ticket_text}```.

        Make sure the output is only a json object.
        """
      else:
        prompt = f"""
        Given the text delimited by triple backticks, perform the following actions:
        1 - summary: Summarize what the user wants in 1-2 lines.
        2 - summary_reasoning: Explain your reasoning for the summary.
        3 - recommended_action: Recommend a next action based on the user' request.
        4 - recommended_action_reasoning: Explain your reasoning for the recommended next action.
        5 - is_wb: Determine if the request is related to the product or company 'Weights & Biases'.
        6 - is_wb_reasoning: Explain your reasoning for detemining if the request is W&B related.
        7 - class: Classify the into one of the following classes. Classes: {desired_tags}
        8 - Output a json object that contains the following keys: summary, summary_reasoning, recommended_action, recommended_action_reasoning, is_wb, is_wb_reasoning, class

        Here's the text ```{ticket_text}```"""

      response_json = get_completion(prompt)
      # print("Prompt: "+prompt)
      print("Prediction: ")
      response = json.loads(response_json)
      print("Summary: "+response['summary'])
      print("Summary Reasoning: "+response['summary_reasoning'])
      print("Recommended Action: "+response['recommended_action'])
      print("Recommended Reasoning: "+response['recommended_action_reasoning'])
      print("Is W&B Related: "+str(response['is_wb']))
      print("Is W&B Related Reasoning: "+response['is_wb_reasoning'])
      print("Prediction: "+response['class'])

      if(response['class'] == element['question']):
          score_tags += 1
          print("Correct. Actual: "+element['question'])
      else:
          print("Incorrect. Actual: "+element['question'])
      count += 1
      print()

  print("__________________")
  print(f"Priority Accuracy: {score_tags/count}")
  return score_tags/count

acc['specify_steps_explain_reasoning'] = specify_steps_explain_reasoning(5,1000, evalset_start_all, evalset_end_all)

Prediction: 
Summary: User is requesting feedback on a rating of 10 out of 10 with no message provided.
Summary Reasoning: The user is seeking feedback or clarification on a specific rating without any accompanying message.
Recommended Action: Provide feedback or ask for clarification on the rating given.
Recommended Reasoning: Engaging with the user to understand the reason behind the rating can help improve user experience or address any concerns.
Is W&B Related: False
Is W&B Related Reasoning: The request does not mention or relate to Weights & Biases.
Prediction: question
Correct. Actual: question

Prediction: 
Summary: User is requesting assistance with a rating of 10 out of 10 and no message provided.
Summary Reasoning: The user is seeking help or clarification regarding a specific rating without any additional context.
Recommended Action: Request more information from the user to understand the issue or provide general assistance.
Recommended Reasoning: Since the user did not pr

In [24]:
acc['specify_steps_explain_reasoning_gpt4'] = specify_steps_explain_reasoning(5,1000, evalset_start_all, evalset_end_all, model="gpt-4")

Prediction: 
Summary: User is requesting assistance with a rating of 10 out of 10 and no message provided in the comments.
Summary Reasoning: The user is seeking help or clarification regarding a specific rating without any additional comments.
Recommended Action: Reach out to the user to gather more information or provide assistance based on the rating provided.
Recommended Reasoning: Since the user did not provide any message along with the rating, it is important to follow up to understand the context or address any potential issues.
Is W&B Related: False
Is W&B Related Reasoning: The request does not seem to be related to Weights & Biases.
Prediction: question
Correct. Actual: question

Prediction: 
Summary: User is requesting assistance with a rating of 10 out of 10 in-app with no message provided.
Summary Reasoning: The user is seeking help or clarification regarding a specific rating given in-app.
Recommended Action: Reach out to the user to gather more information about the rat

# Prompt Chaining

In [25]:
# add examples to the prompt. pick examples randomly unless start is set.
# supports adding both random examples and a pre-defined set of examples
# evaluate on examples filtered_data[evalset_start:evalset_end]
def prompt_chaining(num_examples=5, start=0, evalset_start=2500, evalset_end=2510, examples=1, model="gpt-3.5-turbo", temperature=0):
  count, score_tags = 0, 0
  for element in filtered_data[evalset_start:evalset_end]:
      if start==0:
        # Ensure num_examples does not exceed the length of filtered_data
        num_samples = min(num_examples, len(filtered_data))

        # Randomly pick num_samples elements from filtered_data
        random_elements = random.sample(filtered_data, num_samples)
      else:
        random_elements = filtered_data[start:start+num_examples]

      examples = "".join([
          f"Text: ```{element['description']}```\nClass: {element.get('question', 'No question')}\n"
          for element in random_elements
      ])

      # print(element)
      ticket_text = element['description']

      # Prompt – Classify Class
      prompt = f"""
      Given the text delimited by triple backticks, perform the following actions:
      1 - summary: Summarize what the user wants in 1-2 lines.
      2 - summary_reasoning: Explain your reasoning for the summary.
      3 - recommended_action: Recommend a next action based on the user' request.
      4 - recommended_action_reasoning: Explain your reasoning for the recommended next action.
      5 - is_wb: Determine if the request is related to the product or company 'Weights & Biases'.
      6 - is_wb_reasoning: Explain your reasoning for detemining if the request is W&B related.
      7 - Output a json object that contains the following keys: summary, summary_reasoning, recommended_action, recommended_action_reasoning, is_wb, is_wb_reasoning, class

      And here's the text ```{ticket_text}```.

      Make sure the output is only a json object.
      """

      response_json = get_completion(prompt)
      # print("Prompt: "+prompt)
      print("Prediction: ")
      response = json.loads(response_json)
      print("Summary: "+response['summary'])
      print("Summary Reasoning: "+response['summary_reasoning'])
      print("Recommended Action: "+response['recommended_action'])
      print("Recommended Reasoning: "+response['recommended_action_reasoning'])
      print("Is W&B Related: "+str(response['is_wb']))
      print("Is W&B Related Reasoning: "+response['is_wb_reasoning'])


      prompt_2 = f"""
        Given the following info about a user request:
        1 - text delimited by triple backticks: ```{ticket_text}```
        2 - summary of what the user wants: {response['summary']}
        3 - recommended next action based on the user' request: {response['recommended_action']}
        4 - whether the request is related to the product or company 'Weights & Biases': {response['is_wb']}

        Classify the text into one of the following classes. Classes: {desired_tags}

        Here are some examples help you with the classification step.
        {examples}

        Only print the name of the class"""
      response_class = get_completion(prompt_2)
      print("Prediction: "+response_class)

      if(response_class == element['question']):
          score_tags += 1
          print("Correct. Actual: "+element['question'])
      else:
          print("Incorrect. Actual: "+element['question'])
      count += 1
      print()

  print("__________________")
  print(f"Priority Accuracy: {score_tags/count}")
  return score_tags/count

# acc['prompt_chaining'] = prompt_chaining(5,1000, evalset_start_all, evalset_end_all)

In [31]:
df = pd.DataFrame(list(acc.items()), columns=['Prompt', 'Accuracy']).sort_values(by='Accuracy', ascending=True)
def highlight_max_accuracy(data, color='mediumpurple'):
    highlight = pd.DataFrame('', index=data.index, columns=data.columns)
    max_accuracy = data['Accuracy'] == data['Accuracy'].max()
    highlight[max_accuracy] = f'background-color: {color}'
    return highlight

# Applying the styling
df = df.style.apply(highlight_max_accuracy, axis=None)
df

Unnamed: 0,Prompt,Accuracy
0,basic_prompt,0.0
2,add_5_random_examples,0.25
3,add_5_examples_gpt4,0.3
7,add_20_examples_gpt4,0.3
1,add_5_examples,0.35
8,add_5_examples_describe_classes,0.4
4,add_20_random_examples,0.7
11,specify_steps_explain_reasoning,0.7
12,specify_steps_explain_reasoning_gpt4,0.85
5,add_20_examples,0.9


# Next Steps
- [ ] Improve performance, make eval set more expansive
- [ ] Improve prompt chaining
- [ ] Improve summarization + suggest next steps using docs + support tickets (try RAG)
- [ ] How to evaluate the summary + next steps?
- [ ] RLHF for summary + next steps

Prompt  Accuracy
               basic_prompt      0.00
             add_5_examples      0.35
      add_5_random_examples      0.10
        add_5_examples_gpt4      0.30
     add_20_random_examples      0.60
            add_20_examples      0.90
add_20_examples_change_temp      0.80
       add_20_examples_gpt4      0.30
           describe_classes      0.40