In [None]:
import openai
from tqdm import tqdm
import os
import pandas as pd
import snowflake.connector


# Load the OpenAI API key from the first line of a local text file.
# NOTE(review): this is an absolute, user-specific path; consider reading the
# location (or the key itself) from an environment variable instead.
api_key_file = '/Users/vishalkumar/Documents/apikey.txt'
if os.path.isfile(api_key_file):
    with open(api_key_file) as f:
        # readline() keeps the trailing newline; strip it so the newline does
        # not become part of the key.
        openai.api_key = f.readline().strip()
else:
    # Best-effort: report the missing file and carry on with whatever key
    # openai already has configured.
    print(f"Error: {api_key_file} not found.")

OPENAI_API_KEY = openai.api_key

#login to snowflake db
# Connection settings are gathered in one mapping and unpacked into connect();
# the session uses the 'externalbrowser' authenticator.
_snowflake_kwargs = {
    'user': 'vishal.kumar@scale.com',
    'account': 'pxa65918',
    'authenticator': 'externalbrowser',
    'warehouse': 'COMPUTE_WH',
    'database': 'SCALE_CRAWLER',
    'role': 'GENERAL_RO',
}
con = snowflake.connector.connect(**_snowflake_kwargs)

# Cursor used by the query cell to run SQL.
cs = con.cursor()


def evaluator_gpt(prompt, model="gpt-4o", max_tokens=512):
    """Send ``prompt`` as a single user message and return the chat completion.

    Args:
        prompt: Text placed in the ``content`` of the one user message.
        model: Model name passed to the chat completions endpoint.
        max_tokens: Upper bound on generated tokens for the reply.

    Returns:
        The object returned by ``openai.chat.completions.create``; callers in
        this notebook read the reply text from ``.choices[0].message.content``.
    """
    # Uses the module-level ``openai`` import (and the API key configured on
    # it) rather than re-importing inside the function.
    return openai.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_tokens,
    )


def ismath(exp: str) -> bool:
    """Report whether ``exp`` can be evaluated by ``pandas.eval``.

    Args:
        exp: Candidate expression text.

    Returns:
        True if ``pd.eval(exp)`` completes without raising, False otherwise.
        The evaluated value itself is discarded.
    """
    try:
        pd.eval(exp)
    except Exception:
        # Any failure (syntax error, unknown name, empty string, ...) means
        # the text is not usable as an expression.
        return False
    return True

In [None]:
#Get data from snowflake
# Fetches up to 40 task attempts for one project, joined to their tasks, into
# the DataFrame `idf`. Later cells read the selected columns by upper-case
# name (RESPONSE, PROMPT, TASK).
# The query is a plain string rather than an f-string: it has no placeholders,
# and an f-prefix would turn any brace later added to the SQL into Python
# interpolation.
sql = '''
select
  ta._ID,
  ta.task,
  ta.attempted_by,
  ta.response :responses [5].context.candidates [0].message.content :: string as Prompt,
  ta.response :responses [5].output :: string as Response,
  t.metadata :image_url :: string as image_url
from
  scale_prod.public.taskattempts ta
  join scale_prod.public.tasks t on t._ID = ta.task
where
  ta.project = '667dbe0f5e08440a357aa3c2'
  and ta.attempted_at_review_level = -1
limit 40
'''
cs.execute(sql)
idf = cs.fetch_pandas_all()
print(idf.columns)

In [None]:
# QA Check: Math calculation accuracy
# Step 1: Iterate over the rows of idf.
# Step 2: For each row, ask evaluator_gpt to extract the math calculations from the response, one per line.
# Step 3: Evaluate each extracted line with pandas eval.
# Step 4: Collect the computed results.
# Step 5: Ask evaluator_gpt whether each computed result matches the one stated in the response.
# Step 6: Record the per-calculation verdicts and the fraction judged correct for the row.

# Columns this check adds to idf, in the order the per-row tuples are built.
MATH_RESULT_COLUMNS = ('Math_Response', 'Result', 'Compare_Response', 'Correct_Percentage')

# Values recorded for a row with nothing to check.
_BLANK_MATH_ROW = ('', '', '', '')


def _is_correct_verdict(verdict):
    """Return True when the comparison reply amounts to 'Correct'.

    Surrounding whitespace, quotes, a trailing period and letter case are
    ignored, so replies such as "Correct." or "'correct'" still count.
    'Incorrect' never matches because the whole normalized reply is compared.
    """
    return verdict.strip().strip("'\".").lower() == 'correct'


# The extraction instructions are the same for every row, so build them once.
math_prompt = ("From the provided text, only extract the mathematical calculations such that I can pass them "
               "directly to pandas eval function. A mathematical calculation is defined as one with mathematical operators. "
               "Do not convert text/string to math operators. If no math calculation is present, just say 'NA'. If math calculation "
               "is present, put each calculation in a new line.\n")

math_rows = []  # one (Math_Response, Result, Compare_Response, Correct_Percentage) tuple per idf row

for index, row in tqdm(idf.iterrows(), total=idf.shape[0]):
    response = row['RESPONSE']

    # A missing or empty response has nothing to extract; skip the API calls.
    if not isinstance(response, str) or not response.strip():
        math_rows.append(_BLANK_MATH_ROW)
        continue

    extracted = evaluator_gpt(math_prompt + response).choices[0].message.content or ''
    math_response = [line.strip() for line in extracted.split('\n') if line.strip()]

    # Empty output is treated like an explicit 'NA' (no calculations present).
    if not math_response or math_response[0].strip("'\".").upper() == 'NA':
        math_rows.append(_BLANK_MATH_ROW)
        continue

    correct_count = 0
    result_output = []
    compare_output = []

    for expression in math_response:
        # Thousands separators are removed BEFORE evaluation (required to use
        # pandas eval); doing it first keeps expressions such as '1,250 * 2'
        # from being rejected and silently skipped.
        expression = expression.replace(',', '')
        # NOTE(review): pd.eval runs on model-generated text; confirm that is
        # acceptable for the environment this notebook runs in.
        try:
            result = pd.eval(expression)
            # Specifically left at 3 to allow rounding when a percentage is being calculated.
            # Results will vary if this is changed. Recommend leaving at 3.
            result = round(result, 3)
            result = "{:,}".format(result)  # required to compare outputs correctly
        except Exception:
            # Not evaluable, or not a number that can be rounded/formatted:
            # leave this line out of the comparison.
            continue

        compare_prompt = (f"Given the math calculation: {expression}, the result is {result}. Evaluate whether this is the same result appearing in the text provided. "
                          f"if yes, respond 'Correct' else respond 'Incorrect'\nWhile comparing the result provided earlier to the result in the text, make sure to "
                          f"convert any percentages to decimals, or vice versa in order to match the format of the two. Also, allow decimal rounding imprecision. "
                          f"Text to evaluate:\n {response}")
        compare_response = evaluator_gpt(compare_prompt).choices[0].message.content or ''

        if _is_correct_verdict(compare_response):
            correct_count += 1
        result_output.append(result)
        compare_output.append(compare_response)

    total_math_calculations = len(compare_output)
    # Stored as a fraction in [0, 1]; blank when no line could be evaluated.
    correct_percentage = correct_count / total_math_calculations if total_math_calculations else ''

    math_rows.append((math_response, result_output, compare_output, correct_percentage))

# Write each column once, as object dtype, so cells can hold lists. Setting a
# list into a not-yet-existing column cell by cell with .at can fail.
for position, column in enumerate(MATH_RESULT_COLUMNS):
    idf[column] = pd.Series([values[position] for values in math_rows], index=idf.index, dtype=object)


In [None]:
# QA Check: Prompt is answerable without image
# For each row, the prompt is sent to the model as text only; the model is then
# asked to judge whether that answer is complete or needed more information.
for index, row in tqdm(idf.iterrows(), total=idf.shape[0]):
    question = row['PROMPT']

    # A missing or empty prompt cannot be assessed; record a blank and move on.
    if not isinstance(question, str) or not question.strip():
        idf.at[index, 'Image_Response'] = ''
        continue

    # NOTE(review): this previously sent row['RESPONSE'] to the model. Given
    # the purpose of this check, the text-only answer to the PROMPT is what
    # should be judged, so the prompt is sent instead - confirm this intent.
    response = evaluator_gpt(question).choices[0].message.content
    # Every segment that contains a placeholder must be an f-string; the first
    # segment previously was not, so the literal text "{question}" was sent.
    image_prompt = (f"Giving you a question and its response. Are all the questions answered correctly? or more information is asked from the user. Here is the question:\n{question}"
                    f"\n Here is the answer.\n{response}\n. Assess whether the answer is complete or more information was needed to provide a complete and correct answer. Dont make assumptions about the question."
                    f"Respond 'Complete' if the answer is complete and 'Incomplete' if the answer is incomplete.\n")
    image_response = evaluator_gpt(image_prompt).choices[0].message.content
    print(image_response)
    print(row['TASK'])
    idf.at[index, 'Image_Response'] = image_response


In [None]:
# QA Check: Auto SOTA Comparison
