prompt_testing.py

# Run n completions per prompt template and test case, tagged for tracing.
import hashlib

from dotenv import load_dotenv
from langchain.chat_models import ChatOpenAI
from langchain.callbacks.manager import tracing_v2_enabled
from langchain.evaluation import load_evaluator
from langchain.prompts import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
# from langchain.evaluation.criteria import CriteriaEvalChain

load_dotenv()

# create the llm object used to generate completions
chat = ChatOpenAI(model="gpt-4", temperature=1)  # for testing
# llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)  # for evals
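# Assumes a .env file (or the environment) provides OPENAI_API_KEY; the
# tracing_v2_enabled context manager used below additionally expects LangSmith
# credentials (e.g. LANGCHAIN_API_KEY) to be set.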

# # Count the frequency of each word in a string.
# def word_frequency(string):
#     frequency = {}
#     for word in string.split():
#         if word in frequency:
#             frequency[word] += 1
#         else:
#             frequency[word] = 1
#     return frequency

# # Count the difference in word frequency between the reference and the prediction.
# def word_frequency_difference(reference, prediction):
#     # count the frequency of each word in reference and prediction
#     reference_frequency = word_frequency(reference)
#     prediction_frequency = word_frequency(prediction)
#     # calculate the per-word difference between the two
#     difference = {}
#     for word in reference_frequency:
#         if word in prediction_frequency:
#             difference[word] = abs(reference_frequency[word] - prediction_frequency[word])
#         else:
#             difference[word] = reference_frequency[word]
#     for word in prediction_frequency:
#         if word not in reference_frequency:
#             difference[word] = prediction_frequency[word]
#     # sum up the differences into a single score
#     total_difference = sum(difference.values())
#     return total_difference

# compile the prompt
def compile_prompt(prompt_template, test_case):
    # create a prompt template for the System role
    system_message_prompt_template = SystemMessagePromptTemplate.from_template(
        prompt_template)
    # create a string template for the Human role
    human_template = "{input_text}"
    # create a prompt template for the Human role
    human_message_prompt_template = HumanMessagePromptTemplate.from_template(human_template)
    # create a chat prompt template out of the message prompt templates
    chat_prompt_template = ChatPromptTemplate.from_messages(
        [system_message_prompt_template, human_message_prompt_template])
    # generate the final prompt by passing in only the case's input text
    # (the reference stays out of the prompt and is used for scoring later)
    final_prompt_template = chat_prompt_template.format_prompt(input_text=test_case["input_text"])
    final_prompt = final_prompt_template.to_messages()
    return final_prompt
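
# Illustrative example (hypothetical values, not from this repo): compiling a
# template yields a [SystemMessage, HumanMessage] list ready to pass to `chat`:
#   case = {"input_text": "Fix the typos in this sentence.", "reference": "..."}
#   messages = compile_prompt("You are a careful copy editor.", case)
#   # -> [SystemMessage(content="You are a careful copy editor."),
#   #     HumanMessage(content="Fix the typos in this sentence.")]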

def hash_id(string):
    # hash to get a unique id that will be the same if passed the same string
    digest = hashlib.md5(string.encode()).hexdigest()
    return digest[:8]
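
# hash_id is deterministic, so the same template or case text always maps to
# the same id across runs, e.g. hash_id("hello") == "5d41402a"
# (the first 8 hex characters of md5("hello")).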

def gen_ex(prompt_templates, test_cases, test_runs=10, project_name="unassigned"):
    # check that prompt_templates is a list of strings and test_cases is a list of dicts
    if not isinstance(prompt_templates, list) or not isinstance(test_cases, list):
        raise TypeError("prompt_templates and test_cases must be lists")
    if not all(isinstance(item, str) for item in prompt_templates):
        raise TypeError("prompt_templates must be a list of strings")
    if not all(isinstance(item, dict) for item in test_cases):
        raise TypeError("test_cases must be a list of dicts")
    # counter to keep track of the number of runs completed so far
    counter = 0
    # total runs = number of test cases x number of test runs x number of prompt templates
    total = len(test_cases) * test_runs * len(prompt_templates)
    # load the evaluators
    distance_evaluator = load_evaluator("embedding_distance")
    # correctness_evaluator = load_evaluator("labeled_criteria", criteria="correctness")
    # list to store the results
    results = []
    for template in prompt_templates:
        # hash to get a unique id that will be the same if passed the same template
        pid = f"pid_{hash_id(template)}"
        # loop through the test cases
        for case in test_cases:
            # get a unique id for the case
            cid = f"cid_{hash_id(case['input_text'])}"
            # compile the prompt
            final_prompt = compile_prompt(template, case)
            # run n completions with the prompt and tags
            for _ in range(test_runs):
                with tracing_v2_enabled(project_name=project_name, tags=[pid, cid]):
                    counter += 1
                    print(f"Running {pid} with {cid} ({counter}/{total})")
                    result = chat(final_prompt)
                    content = result.content
                    embedding_distance = distance_evaluator.evaluate_strings(
                        prediction=content, reference=case["reference"])["score"]
                    # similarity is set up here because the reference must be passed in dynamically
                    # criteria = {"style-similarity": f"How similar is this writing style to the reference text out of 100?\nReference: {case['reference']}"}
                    # similarity_evaluator = CriteriaEvalChain.from_llm(llm=llm, criteria=criteria)
                    # similarity_score = similarity_evaluator.evaluate_strings(prediction=content, input=final_prompt)
                    # print(similarity_score)
                    # print(f"Similarity: {similarity_score['score']:.2f}")
                    # correctness = correctness_evaluator.evaluate_strings(
                    #     input=case["input_text"],
                    #     prediction=content,
                    #     reference=case["reference"],
                    # )
                    # print(f"Correctness: {correctness}")
                    # calculate the word frequency difference
                    # frequency_difference = word_frequency_difference(case["reference"], content)
                    data = {
                        "prompt": template,
                        "case": case["input_text"],
                        "result": content,
                        "pid": pid,
                        "cid": cid,
                        "run": counter,
                        "project": project_name,
                        "reference": case["reference"],
                        "embedding_distance": embedding_distance,
                        # "similarity_score": similarity_score["score"],
                        # "frequency_difference": frequency_difference,
                        # "correctness": correctness["score"],
                    }
                    results.append(data)
    return results
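
# Minimal usage sketch (illustrative; the templates and test case below are
# hypothetical, not from this repo). Each completion is traced to the named
# LangSmith project, tagged with its prompt and case ids, and scored by
# embedding distance to the reference text.
if __name__ == "__main__":
    example_templates = [
        "You are a concise technical editor. Rewrite the user's text.",
        "You are a playful copywriter. Rewrite the user's text.",
    ]
    example_cases = [
        {
            "input_text": "Our app make it easy to track you're expenses.",
            "reference": "Our app makes it easy to track your expenses.",
        },
    ]
    rows = gen_ex(example_templates, example_cases, test_runs=2,
                  project_name="prompt-testing-demo")
    for row in rows:
        print(row["pid"], row["cid"], f"distance={row['embedding_distance']:.3f}")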