In [1]:
import openai
import snowflake.connector
from tqdm import tqdm
import os

# Read the OpenAI API key from a local file so it is not hardcoded in the notebook.
api_key_file = 'apikey.txt'
if os.path.isfile(api_key_file):
    with open(api_key_file) as f:
        # readline() keeps the trailing newline; strip it (and any stray
        # surrounding whitespace) so it does not become part of the key.
        openai.api_key = f.readline().strip()
else:
    # Best-effort: report the missing file and continue without setting a key here.
    print(f"Error: {api_key_file} not found.")

# Module-level alias of whatever key the openai module currently holds.
OPENAI_API_KEY = openai.api_key

# Log in to the Snowflake database. The 'externalbrowser' authenticator
# completes the login through a browser window (identity provider).
snowflake_settings = {
    'user': 'vishal.kumar@scale.com',
    'account': 'pxa65918',
    'authenticator': 'externalbrowser',
    'warehouse': 'COMPUTE_WH',
    'database': 'SCALE_CRAWLER',
    'role': 'GENERAL_RO',
}
con = snowflake.connector.connect(**snowflake_settings)

# Cursor used by the following cells to execute queries.
cs = con.cursor()

Initiating login request with your identity provider. A browser window should have opened for you to complete the login. If you can't see it, check existing browser windows, or your OS settings. Press CTRL+C to abort and try again...


In [2]:
# Get the prompt and response data from the Snowflake db.
# Plain triple-quoted string: the query has no placeholders, so no f-prefix is needed.
# NOTE(review): `limit 10` without an ORDER BY does not pin which rows come back — confirm
# whether a stable sample is required.
sql = '''
select
  ta.task,
  ta.attempted_by,
  t.params:templateVariables:Domain::string as Domain,
  t.params:templateVariables:Subtopic::string as Subtopic,
  t.params:templateVariables:Category::string as Category,
  t.params:templateVariables:Subcategory::string as Subcategory,
  t.params:templateVariables:Subcategory_Description::string as Subcategory_Description,
  ta.response :responses [0] :output :: string as Prompt,
  ta.response :rewrite :: string as Response
from
  scale_prod.public.taskattempts ta
  join scale_prod.public.tasks t on ta.task = t._id
where
  ta.project = '65c6b841652f25eabae60e72'
limit
  10
'''
cs.execute(sql)
# Load the full result set into a DataFrame. Later cells read its columns by
# upper-case names (e.g. 'DOMAIN', 'PROMPT').
idf = cs.fetch_pandas_all()

In [3]:
def evaluator_gpt(prompt, model="gpt-4-turbo-preview", temperature=0.1):
    """Send ``prompt`` to the OpenAI chat completion API and return the reply text.

    Args:
        prompt: Text sent as the content of a single user-role message.
        model: Name of the chat model to call (default "gpt-4-turbo-preview").
        temperature: Sampling temperature forwarded to the API (default 0.1).

    Returns:
        The ``content`` of the message in the first returned choice.
    """
    completion = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
    )
    # Only the first choice is used.
    return completion.choices[0].message.content

In [5]:
# Add three new (initially empty) columns to the dataframe: BINARY_EVAL, QUAL_EVAL and IMPROVED_PROMPT.
idf['BINARY_EVAL'] = ''
idf['QUAL_EVAL'] = ''
idf['IMPROVED_PROMPT'] = ''
# Binary (Yes/No) evaluation: iterate over every row of idf; tqdm shows progress.
for i in tqdm(idf.index):
    # Get the domain, category, subcategory, subcategory description and prompt of the current row.
    domain = idf.at[i, 'DOMAIN']
    category = idf.at[i, 'CATEGORY']
    subcategory = idf.at[i, 'SUBCATEGORY']
    subcategory_description = idf.at[i, 'SUBCATEGORY_DESCRIPTION']
    prompt = idf.at[i, 'PROMPT']
    # NOTE(review): the "Subcategory" instruction below says "belongs to this category" —
    # looks like a copy-paste slip, left unchanged so the prompt sent to the model stays the same.
    eval_query = "You are given a prompt(question) and you need to check whether it follows various criteria. Here are the criterion you need to check for\n\nDomain\n Check whether the question belongs to this domain or not. \nCategory\n Check whether the question belongs to this category or not\nSubcategory\n Check whether the question belongs to this category or not based on a subcategory description which will be provided to you. \n\nHere is the question and the criteria that needs your assessment\n\n" + prompt + "\nDomain:" + domain + "\nCategory:" + category + "\nSubcategory:" + subcategory + " (" + subcategory_description + ")\nRespond in following format:\nDomain: Yes/No\nCategory: Yes/No\nSubcategory: Yes/No\n\nDont say anything else, just stick to the format above"
    response = evaluator_gpt(eval_query)
    # Store the response in the BINARY_EVAL column. A single .at[] call writes directly
    # into idf; chained idf[col][i] = ... assigns through an intermediate Series, which
    # raises SettingWithCopyWarning and does not update idf under pandas Copy-on-Write.
    idf.at[i, 'BINARY_EVAL'] = response

100%|██████████| 10/10 [00:16<00:00,  1.68s/it]


In [6]:
# Qualitative evaluation: rate creativity, depth and number of instructions for every prompt.
for i in tqdm(idf.index):
    # Get the prompt of the current row.
    prompt = idf.at[i, 'PROMPT']
    eval_query = "You are now tasked with providing additional insights about the question. \nCreativity: Is the question creative. A creative question is something that is not commonly found on the internet. It is original in its ask and requires multiple levels of thought to answer. \nDepth: Does the question go into the depth of a topic, or is it superficial and simple. \nNumber of instructions: How many instructions are in the question. \n\nHere is the question that needs your assessment:" + prompt + "\nRespond in following format:\nCreativity: Very-Low/Low/Medium/High\nDepth: Very-Low/Low/Medium/High\nNumber of instructions: 1/2/3/4/5/6/7/8/9/10 etc\n\nDont say anything else, just stick to the format above"
    response = evaluator_gpt(eval_query)
    # Store the response in the QUAL_EVAL column. A single .at[] call writes directly
    # into idf; chained idf[col][i] = ... assigns through an intermediate Series, which
    # raises SettingWithCopyWarning and does not update idf under pandas Copy-on-Write.
    idf.at[i, 'QUAL_EVAL'] = response

100%|██████████| 10/10 [00:12<00:00,  1.21s/it]


In [7]:
# Generate a more creative version of every prompt.
for i in tqdm(idf.index):
    # Get the prompt of the current row.
    prompt = idf.at[i, 'PROMPT']
    eval_query = "You are now tasked with providing a slightly more creative and more domain depth version of the question. Add 2 more instructions.\nRespond with only the new question:\n" + prompt + "\n\nDont say anything else."
    response = evaluator_gpt(eval_query)
    # Store the response in the IMPROVED_PROMPT column. A single .at[] call writes directly
    # into idf; chained idf[col][i] = ... assigns through an intermediate Series, which
    # raises SettingWithCopyWarning and does not update idf under pandas Copy-on-Write.
    idf.at[i, 'IMPROVED_PROMPT'] = response

100%|██████████| 10/10 [00:47<00:00,  4.70s/it]


In [8]:
# Write the evaluated dataframe to a CSV file, leaving out the row index.
idf.to_csv(path_or_buf='Amsel_Eval.csv', index=False)