# Run Metrics.ipynb

Evaluate conversations.

In [None]:
import pickle
import re

import nltk
import numpy as np
import pandas as pd
import textstat

import Evaluation_Functions
from Evaluation_Functions import get_length, get_nltk_sentence_length, strip_gpt4_meta

# Read the pickled conversations from disk into loaded_data.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so only
# open a Conversation_Lists.pkl that comes from a trusted source.
with open('Conversation_Lists.pkl', 'rb') as pickle_file:
    loaded_data = pickle.load(pickle_file)


In [None]:
# Flatten loaded_data (model_task_method -> conversations) into three parallel lists:
#   model_task_methods[k]      - the key conversation k was stored under
#   conversation_numbers[k]    - 1-based position of conversation k within its key
#   flat_conversations_list[k] - the conversation itself
model_task_methods = []
conversation_numbers = []
flat_conversations_list = []
for mtm_key, mtm_conversations in loaded_data.items():
    conversation_count = len(mtm_conversations)
    model_task_methods.extend([mtm_key] * conversation_count)
    conversation_numbers.extend(range(1, conversation_count + 1))
    flat_conversations_list.extend(mtm_conversations)


In [None]:
# Conversation elements by whether they are user or system
# user_or_system_lists = []
# for conversation in flat_conversations_list:
#     # Even indices are user, odd are system
#     us_list = []
#     for index in range(len(conversation)):
#         if index % 2 == 0:
#             us_list.append('user')
#         else:
#             us_list.append('system')
#     user_or_system_lists.append(us_list)


In [None]:
# Build a parallel list of conversations with GPT-4 chat metadata removed.
# GPT-4 runs are recognised by the "gpt4" prefix of their model_task_method key
# (e.g. 'gpt4_gsm8k_direct_prompting_responses'). The previous test looked for the
# substring 'gpt-4', which keys of that form never contain, so no conversation
# was ever passed to strip_gpt4_meta.
flat_conversations_no_metadata = []
for mtm, conversation in zip(model_task_methods, flat_conversations_list):
    if mtm.split('_')[0] == "gpt4":
        flat_conversations_no_metadata.append(strip_gpt4_meta(conversation))
    else:
        # Non-GPT-4 conversations are reused as-is (same object, not a copy)
        flat_conversations_no_metadata.append(conversation)


In [None]:
# One string per conversation: all turns of the metadata-free conversation,
# joined with newlines
conversation_strings = list(map("\n".join, flat_conversations_no_metadata))


In [None]:
# Input side only: the even-indexed turns (0, 2, 4, ...) of each metadata-free
# conversation, joined with newlines into a single string
conversation_strings_input = []
for turns in flat_conversations_no_metadata:
    input_turns = turns[0::2]
    conversation_strings_input.append("\n".join(input_turns))


In [None]:
# Output side only: the odd-indexed turns (1, 3, 5, ...) of each metadata-free
# conversation, joined with newlines into a single string
conversation_strings_output = []
for turns in flat_conversations_no_metadata:
    output_turns = turns[1::2]
    conversation_strings_output.append("\n".join(output_turns))


In [None]:
# Prompt only: the first turn of each metadata-free conversation, taken as-is
# (nothing is concatenated here)
conversation_strings_prompt = []
for turns in flat_conversations_no_metadata:
    conversation_strings_prompt.append(turns[0])


In [None]:
# Model name for every conversation, derived from the part of its
# model_task_method key before the first underscore: "gpt4" maps to gpt-4-0613,
# and any other prefix is treated as text-davinci-003
model_list = [
    "gpt-4-0613" if mtm.split('_')[0] == "gpt4" else "text-davinci-003"
    for mtm in model_task_methods
]


## Length

In [None]:
# Length of the entire conversation in tokens.
# Each conversation's own string is measured with its own model. The previous code
# passed the whole conversation_strings list to get_length on every iteration, so
# the per-conversation string was never used.
conversation_lengths = [
    get_length(conversation_string, model)
    for conversation_string, model in zip(conversation_strings, model_list)
]


In [None]:
# Length of the input (even-indexed) turns of each conversation.
# Each conversation's own input string is measured with its own model. The previous
# code passed the whole conversation_strings_input list on every iteration.
input_lengths = [
    get_length(input_string, model)
    for input_string, model in zip(conversation_strings_input, model_list)
]


In [None]:
# Length of the output (odd-indexed) turns of each conversation.
# Each conversation's own output string is measured with its own model. The previous
# code passed the whole conversation_strings_output list on every iteration.
output_lengths = [
    get_length(output_string, model)
    for output_string, model in zip(conversation_strings_output, model_list)
]


## Cost

text-davinci-003:

2 cents per 1000 tokens

GPT-4:

Input: 3 cents per 1000 tokens

Output: 6 cents per 1000 tokens

As of November 11, 2023

In [None]:
# Estimated API cost of each conversation, in US dollars.
# Prices are quoted per 1000 tokens, so token counts are divided by 1000 before the
# price is applied. The previous code multiplied raw token counts by the per-1000
# price, which overstated every cost by a factor of 1000.
TOKENS_PER_PRICE_UNIT = 1000
TD3_USD_PER_1K_TOKENS = 0.02          # text-davinci-003: one rate for all tokens
GPT4_INPUT_USD_PER_1K_TOKENS = 0.03   # GPT-4 input tokens
GPT4_OUTPUT_USD_PER_1K_TOKENS = 0.06  # GPT-4 output tokens

conversation_costs = []
for model, total_length, input_length, output_length in zip(
    model_list, conversation_lengths, input_lengths, output_lengths
):
    if model == "gpt-4-0613":
        # GPT-4 bills input and output tokens at different rates
        price_weighted_tokens = (
            input_length * GPT4_INPUT_USD_PER_1K_TOKENS
            + output_length * GPT4_OUTPUT_USD_PER_1K_TOKENS
        )
    else:
        price_weighted_tokens = total_length * TD3_USD_PER_1K_TOKENS
    conversation_costs.append(price_weighted_tokens / TOKENS_PER_PRICE_UNIT)


## Provided GSM8K Question/Answer Matching

In [None]:
# Load the GSM8K test questions and their provided answers
import json

# Each non-empty line of test.jsonl is parsed as one JSON object with 'question'
# and 'answer' keys. The file is opened as UTF-8 explicitly so the result does not
# depend on the platform's default encoding.
with open('GSM8k/test.jsonl', 'r', encoding='utf-8') as f:
    test = f.readlines()

# Parse every line once (instead of once per field) and skip blank lines, which
# json.loads would reject
gsm8k_records = [json.loads(line) for line in test if line.strip()]
questions = [record['question'] for record in gsm8k_records]
answers = [record['answer'] for record in gsm8k_records]


In [None]:
# Match every conversation to the GSM8K question it was asked.
# For a conversation whose model_task_method contains 'gsm8k', the first question
# whose text appears in the conversation's first turn is recorded, together with
# its provided answer. Every conversation contributes exactly one entry to each
# list, so both stay aligned with flat_conversations_list:
#   - non-GSM8K conversations get 'NA'
#   - GSM8K conversations with no matching question also get 'NA' (previously they
#     appended nothing, shifting every later entry)
#   - only the first matching question is used (previously every match appended)
conversation_gsm8k_question_index = []
gsm8k_answers = []
for mtm, conversation in zip(model_task_methods, flat_conversations_list):
    matched_index = 'NA'
    matched_answer = 'NA'
    if 'gsm8k' in mtm:
        first_turn = conversation[0]
        for question_index, question in enumerate(questions):
            if question in first_turn:
                matched_index = question_index
                matched_answer = answers[question_index]
                break
    conversation_gsm8k_question_index.append(matched_index)
    gsm8k_answers.append(matched_answer)


## Length Relative to Baseline

Length of the entire interaction in tokens relative to the length of the task/question + a baseline answer. For GSM8K, the baselines are the provided (OpenAI) solution and the answer achieved via direct prompting. For the Creative Writing task, the baseline is the answer achieved via direct prompting. How much is prompt engineering stretching the interaction out? The ratio of engineered answer length to baseline lengths can be informative.

In [None]:
# Ratio of each conversation's length to the length of the matched GSM8K question
# plus its provided answer (joined by a newline), measured with the conversation's
# own model. Conversations without a matched question get 'NA'.
gsm8k_length_vs_provided = []
for idx, conversation_length in enumerate(conversation_lengths):
    matched_question = conversation_gsm8k_question_index[idx]
    if matched_question == "NA":
        gsm8k_length_vs_provided.append('NA')
        continue
    provided_text = questions[matched_question] + "\n" + answers[matched_question]
    provided_length = get_length(provided_text, model_list[idx])
    gsm8k_length_vs_provided.append(conversation_length / provided_length)


In [None]:
# Collect the direct prompting conversations for each model/task pair. They serve
# as the baseline that every other conversation is compared against.
td3_gsm8k_direct_prompting_conversations = []
gpt4_gsm8k_direct_prompting_conversations = []
td3_cw_direct_prompting_conversations = []
gpt4_cw_direct_prompting_conversations = []

# (substring looked for in the model_task_method, list that collects the matches)
direct_prompting_buckets = (
    ('td3_gsm8k_direct_prompting_responses', td3_gsm8k_direct_prompting_conversations),
    ('gpt4_gsm8k_direct_prompting_responses', gpt4_gsm8k_direct_prompting_conversations),
    ('td3_cw_direct_prompting_responses', td3_cw_direct_prompting_conversations),
    ('gpt4_cw_direct_prompting_responses', gpt4_cw_direct_prompting_conversations),
)
for mtm, conversation in zip(model_task_methods, flat_conversations_list):
    # Every bucket is checked independently, mirroring four separate `if` tests
    for key_fragment, bucket in direct_prompting_buckets:
        if key_fragment in mtm:
            bucket.append(conversation)


In [None]:
# Distinct model_task_methods in order of first appearance
seen_model_task_methods = set()
model_task_methods_unique = []
for model_task_method in model_task_methods:
    if model_task_method in seen_model_task_methods:
        continue
    seen_model_task_methods.add(model_task_method)
    model_task_methods_unique.append(model_task_method)


In [None]:
# Build one long list of baseline conversations: for every distinct
# model_task_method (in order of appearance), append the direct prompting
# conversations of the matching model/task pair.
# NOTE(review): indexing this list in parallel with flat_conversations_list
# presumably relies on every model_task_method holding as many conversations, in the
# same order, as its direct prompting counterpart - confirm against the data.
baseline_by_model_task = (
    ('td3_gsm8k', td3_gsm8k_direct_prompting_conversations),
    ('gpt4_gsm8k', gpt4_gsm8k_direct_prompting_conversations),
    ('td3_cw', td3_cw_direct_prompting_conversations),
    ('gpt4_cw', gpt4_cw_direct_prompting_conversations),
)
direct_prompting_conversations = []
for mtm in model_task_methods_unique:
    for model_task, baseline_conversations in baseline_by_model_task:
        if model_task in mtm:
            direct_prompting_conversations.extend(baseline_conversations)
            # Only the first matching model/task pair is used
            break


In [None]:
# Compare each conversation's length with its direct prompting baseline.
#   direct_prompting_lengths[k]   - length of the baseline conversation for k
#   length_vs_direct_prompting[k] - conversation length divided by that baseline
# The baseline length is computed once per conversation and reused for both lists
# (it was previously computed twice with identical arguments).
# NOTE(review): get_length receives the baseline conversation itself here, whereas
# conversation_lengths was computed from newline-joined, metadata-free strings -
# confirm that get_length handles both forms consistently.
direct_prompting_lengths = []
length_vs_direct_prompting = []
for idx, conversation_length in enumerate(conversation_lengths):
    baseline_length = get_length(direct_prompting_conversations[idx], model_list[idx])
    direct_prompting_lengths.append(baseline_length)
    length_vs_direct_prompting.append(conversation_length / baseline_length)


## Number of Reasoning Steps

Proxies for the number of reasoning steps in the response: line breaks, sentences (NLTK sentence tokenizer), occurrences of "step i", and numbered markers such as "1. ", "2. ", "3. ".

In [None]:
# Step-count proxies for the output side of every conversation.
# "step <number>" is matched against the lowercased text, i.e. case-insensitively.
step_i_pattern = re.compile(r"step \d+")
# One or more digits followed by a period. No trailing space is required, so the
# integer part of a decimal such as "3.14" is counted as well.
number_dot_pattern = re.compile(r"\d+\.")

num_linebreaks = []
num_sentences = []
num_step_i = []
num_1_dot_etc = []
for output_string in conversation_strings_output:
    num_linebreaks.append(output_string.count('\n'))
    num_sentences.append(len(nltk.sent_tokenize(output_string)))
    num_step_i.append(len(step_i_pattern.findall(output_string.lower())))
    num_1_dot_etc.append(len(number_dot_pattern.findall(output_string)))


## Sentence Length (Response)

For creative writing

NLTK word and sentence tokenizers

In [None]:
# Sentence length of the response, computed for creative writing ("cw")
# conversations only; every other conversation gets 'NA'.
# The task is read from the conversation's own entry in model_task_methods. The
# previous code indexed model_task_method, a single leftover key string, so the
# test looked at one character of that string instead of the conversation's key.
sentence_length = []
for mtm, output_string in zip(model_task_methods, conversation_strings_output):
    if "cw" in mtm:
        sentence_length.append(get_nltk_sentence_length(output_string))
    else:
        sentence_length.append('NA')


## Flesch Reading Ease

In [None]:
# Flesch Reading Ease of the response, computed for creative writing ("cw")
# conversations only; every other conversation gets 'NA'.
# The task is read from the conversation's own entry in model_task_methods. The
# previous code indexed model_task_method, a single leftover key string, so the
# test looked at one character of that string instead of the conversation's key.
fres = []
for mtm, output_string in zip(model_task_methods, conversation_strings_output):
    if "cw" in mtm:
        fres.append(textstat.flesch_reading_ease(output_string))
    else:
        fres.append('NA')


## Differences in Scores

Differences in the scores above between responses and prompts and, for GSM8K, between responses and the provided answer.


In [None]:
# Compute all measures for prompts (the input side of every conversation)

# "step <number>" is matched against the lowercased text, i.e. case-insensitively.
step_i_pattern_prompts = re.compile(r"step \d+")
# One or more digits followed by a period. No trailing space is required, so the
# integer part of a decimal such as "3.14" is counted as well.
number_dot_pattern_prompts = re.compile(r"\d+\.")

num_linebreaks_prompts = []
num_sentences_prompts = []
num_step_i_prompts = []
num_1_dot_etc_prompts = []
for input_string in conversation_strings_input:
    num_linebreaks_prompts.append(input_string.count('\n'))
    num_sentences_prompts.append(len(nltk.sent_tokenize(input_string)))
    num_step_i_prompts.append(len(step_i_pattern_prompts.findall(input_string.lower())))
    num_1_dot_etc_prompts.append(len(number_dot_pattern_prompts.findall(input_string)))

# Sentence length and Flesch Reading Ease are computed for creative writing ("cw")
# conversations only; every other conversation gets 'NA'.
# The task is read from the conversation's own entry in model_task_methods. The
# previous code indexed model_task_method, a single leftover key string, so the
# test looked at one character of that string instead of the conversation's key.
sentence_length_prompts = []
fres_prompts = []
for mtm, input_string in zip(model_task_methods, conversation_strings_input):
    if "cw" in mtm:
        sentence_length_prompts.append(get_nltk_sentence_length(input_string))
        fres_prompts.append(textstat.flesch_reading_ease(input_string))
    else:
        sentence_length_prompts.append('NA')
        fres_prompts.append('NA')


In [None]:
# Step-count proxies for the provided GSM8K answer of every conversation.
# Conversations without a provided answer ('NA') get 'NA' in all four lists.

# "step <number>" is matched against the lowercased text, i.e. case-insensitively.
step_i_pattern_provided = re.compile(r"step \d+")
# One or more digits followed by a period. No trailing space is required, so the
# integer part of a decimal such as "3.14" is counted as well.
number_dot_pattern_provided = re.compile(r"\d+\.")

num_linebreaks_provided = []
num_sentences_provided = []
num_step_i_provided = []
num_1_dot_etc_provided = []
for gsm8k_answer in gsm8k_answers:
    if gsm8k_answer == "NA":
        num_linebreaks_provided.append('NA')
        num_sentences_provided.append('NA')
        num_step_i_provided.append('NA')
        num_1_dot_etc_provided.append('NA')
        continue
    num_linebreaks_provided.append(gsm8k_answer.count('\n'))
    num_sentences_provided.append(len(nltk.sent_tokenize(gsm8k_answer)))
    num_step_i_provided.append(len(step_i_pattern_provided.findall(gsm8k_answer.lower())))
    num_1_dot_etc_provided.append(len(number_dot_pattern_provided.findall(gsm8k_answer)))


# Output All Metrics

Put all lists in a dataframe and export to Excel

In [None]:
# Gather every per-conversation list under its output column name. The lists are
# parallel: entry k of each one describes conversation k.
metric_columns = {
    # Identification
    'model_task_method': model_task_methods,
    'conversation_number': conversation_numbers,
    # Length and cost
    'conversation_length': conversation_lengths,
    'input_length': input_lengths,
    'output_length': output_lengths,
    'conversation_cost': conversation_costs,
    # GSM8K matching and length relative to baselines
    'gsm8k_question_index': conversation_gsm8k_question_index,
    'gsm8k_answer': gsm8k_answers,
    'gsm8k_length_vs_provided': gsm8k_length_vs_provided,
    'length_vs_direct_prompting': length_vs_direct_prompting,
    # Measures on the responses
    'num_linebreaks': num_linebreaks,
    'num_sentences': num_sentences,
    'num_step_i': num_step_i,
    'num_1_dot_etc': num_1_dot_etc,
    'sentence_length': sentence_length,
    'fres': fres,
    # Measures on the prompts
    'num_linebreaks_prompts': num_linebreaks_prompts,
    'num_sentences_prompts': num_sentences_prompts,
    'num_step_i_prompts': num_step_i_prompts,
    'num_1_dot_etc_prompts': num_1_dot_etc_prompts,
    'sentence_length_prompts': sentence_length_prompts,
    'fres_prompts': fres_prompts,
    # Measures on the provided GSM8K answers
    'num_linebreaks_provided': num_linebreaks_provided,
    'num_sentences_provided': num_sentences_provided,
    'num_step_i_provided': num_step_i_provided,
    'num_1_dot_etc_provided': num_1_dot_etc_provided,
}

# One row per conversation, one column per metric
automatic_metrics = pd.DataFrame(metric_columns)

# Write the table to an Excel workbook, without the dataframe index
automatic_metrics.to_excel('Automatic_Metrics.xlsx', index=False)
