In [36]:
import openai
from tqdm import tqdm
import os
import pandas as pd
import snowflake.connector

# Load the OpenAI API key from the first line of a local text file so the key
# itself never appears in the notebook.
# NOTE(review): this is an absolute, user-specific path; anyone else running
# the notebook has to change it.
api_key_file = '/Users/vishalkumar/Documents/apikey.txt'
if os.path.isfile(api_key_file):
    with open(api_key_file) as f:
        # readline() keeps the trailing newline; strip it so the newline (or any
        # stray surrounding whitespace) does not become part of the key.
        openai.api_key = f.readline().strip()
else:
    print(f"Error: {api_key_file} not found.")

# Alias of whatever key the openai module currently holds.
OPENAI_API_KEY = openai.api_key

# Open the Snowflake session. With the 'externalbrowser' authenticator the
# connector hands sign-in over to the identity provider in a browser window.
snowflake_params = {
    'user': 'vishal.kumar@scale.com',
    'account': 'pxa65918',
    'authenticator': 'externalbrowser',
    'warehouse': 'COMPUTE_WH',
    'database': 'SCALE_CRAWLER',
    'role': 'GENERAL_RO',
}
con = snowflake.connector.connect(**snowflake_params)

# Cursor used by the query cell further down.
cs = con.cursor()

Initiating login request with your identity provider. A browser window should have opened for you to complete the login. If you can't see it, check existing browser windows, or your OS settings. Press CTRL+C to abort and try again...


In [37]:
def evaluator_gpt(prompt, model="gpt-4-0125-preview", max_tokens=512):
    """Send a single-turn text prompt to an OpenAI chat model.

    Args:
        prompt: Text sent as the content of the one and only user message.
        model: Name of the chat model to query. Defaults to the model this
            notebook was originally written against.
        max_tokens: Upper bound on the number of generated tokens.

    Returns:
        The chat-completion response exactly as returned by the client.
        Callers in this notebook read the reply text via
        ``.choices[0].message.content``.
    """
    # Local import keeps the function usable when the cell is run on its own.
    import openai

    return openai.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        max_tokens=max_tokens,
    )

def vision_gpt(image_url, prompt, model="gpt-4-vision-preview", max_tokens=1024, detail="high"):
    """Send a text prompt together with one image to an OpenAI vision chat model.

    Args:
        image_url: URL of the image, passed through as the ``image_url.url`` field.
        prompt: Text part of the user message.
        model: Name of the vision-capable chat model. Defaults to the model
            this notebook was originally written against.
        max_tokens: Upper bound on the number of generated tokens.
        detail: Value forwarded as the ``image_url.detail`` field of the request.

    Returns:
        The chat-completion response exactly as returned by the client.
    """
    # Local import keeps the function usable when the cell is run on its own.
    import openai

    # A single user message carrying two content parts: the text, then the image.
    user_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": prompt},
            {
                "type": "image_url",
                "image_url": {
                    "url": image_url,
                    "detail": detail,
                },
            },
        ],
    }
    return openai.chat.completions.create(
        model=model,
        messages=[user_message],
        max_tokens=max_tokens,
    )


In [39]:
# Get data from Snowflake: one row per (task, prompter attempt, reviewer attempt).
#
# - The `alltasks` CTE selects tasks of project 65e23dd219c580a16e44e374 that
#   have a human node with status 'pending' at review level 12 or 10.
# - For each of those tasks the main query joins:
#     * the prompter attempt (attempted_at_review_level = -1, review_status = 'fixed'),
#     * the reviewer attempt (attempted_at_review_level = 0, review_outcome = 'fixed'),
#     * the e-mail of each attempt's user, and the task metadata (image, PR type).
# - Prompt/response text is read from responses[2] / responses[3] of each attempt.
#
# NOTE(review): the prompter join filters on `review_status` while the reviewer
#   join filters on `review_outcome` — confirm this asymmetry is intentional.
# NOTE(review): `t_reviewer` is joined but none of its columns are selected.
# NOTE(review): the string carries an f-prefix but contains no placeholders.
sql = f'''
with alltasks AS (
    select
        task
    FROM
        SCALE_PROD.PUBLIC.PIPELINEV3HUMANNODES
    WHERE
        review_level IN (12,10)
        and project = '65e23dd219c580a16e44e374'
        and status = 'pending'
)
select DISTINCT
    v.task,
    t_prompter.metadata:image_url AS IMAGE,
    t_prompter.metadata:image_type AS IMAGE_TYPE,
    t_prompter.metadata:pr_type::string AS PR_TYPE,
    u_prompter.email AS prompter,
    ta_prompter.attempted_by AS prompter_attempt_id,
    ta_prompter.response:responses[2]:output::STRING AS PROMPTER_PROMPT,
    ta_prompter.response:responses[3]:output::STRING AS PROMPTER_RESPONSE,
    u_reviewer.email AS reviewer,
    ta_reviewer.attempted_by AS REVIEWER_ATTEMPT_ID,
    ta_reviewer.response:responses[2]:output::STRING AS REVIEWER_PROMPT,
    ta_reviewer.response:responses[3]:output::STRING AS REVIEWER_RESPONSE,
    ta_reviewer.response:responses[5]:context:response:annotations:"pr-type-validation":response[0][0]::text as pr_type_validation,
FROM
    SCALE_PROD.PUBLIC.PIPELINEV3HUMANNODES v
    JOIN (SELECT task FROM alltasks) at ON at.task = v.task
    JOIN SCALE_PROD.PUBLIC.TASKATTEMPTS ta_prompter ON ta_prompter.task = v.task AND ta_prompter.attempted_at_review_level = -1 AND ta_prompter.review_status = 'fixed'
    JOIN SCALE_PROD.PUBLIC.USERS u_prompter ON u_prompter._id = ta_prompter.attempted_by
    JOIN SCALE_PROD.PUBLIC.TASKS t_prompter ON t_prompter._id = v.task
    JOIN SCALE_PROD.PUBLIC.TASKATTEMPTS ta_reviewer ON ta_reviewer.task = v.task AND ta_reviewer.attempted_at_review_level = 0 AND ta_reviewer.review_outcome = 'fixed'
    JOIN SCALE_PROD.PUBLIC.USERS u_reviewer ON u_reviewer._id = ta_reviewer.attempted_by
    JOIN SCALE_PROD.PUBLIC.TASKS t_reviewer ON t_reviewer._id = v.task
WHERE
    v.project IN ('65e23dd219c580a16e44e374')
'''
# Run the query and load the whole result set into `idf`, which later cells
# iterate row by row and export to CSV.
# NOTE(review): later cells read columns by upper-case name (e.g.
#   'PR_TYPE_VALIDATION') although some aliases above are lower-case —
#   presumably Snowflake upper-cases unquoted aliases; verify.
cs.execute(sql)
idf = cs.fetch_pandas_all()

In [41]:
# Instruction prefixes for the text evaluator. Each asks for a bare 'Yes'/'No'
# and ends with a newline so the text under review can be appended directly.
# NOTE(review): the wording is kept verbatim, including its typos, because
# these strings are sent to the model as-is.

# 'Yes' => the text contains spelling/grammar errors.
prompt_grammar = (
    "Please check if the given text has any spelling or grammatical errors. "
    "ONLY say 'Yes' if there are errors, otherwise ONLY say 'No'. "
    "Here is the text:\n"
)

# 'No' => the answer does not cover the major parts of the question.
prompt_instruction_following = (
    "I am give you a question and its answer. "
    "Check that all the major questions are answered. "
    "ONLY say 'Yes' if the answer covers the major parts of the question, "
    "otherwise ONLY say 'No' . "
    "Here is the question and the answer:\n"
)

# 'Yes' => the text contains major factual errors.
prompt_factuality = (
    "Please check if the given text has any major factual errors. "
    "ONLY say 'Yes' if there are factual errors, otherwise ONLY say 'No'. "
    "Here is the text:\n"
)

# 'Yes' => the question can be answered from its text alone.
prompt_answerable = (
    "I am giving you a question. "
    "You need to tell me if you can answer it or not. "
    "ONLY say 'Yes' if you can answer, ONLY say 'No' if more information is needed. "
    "Here is the question:\n"
)

# 'Yes' => the response expresses uncertainty; 'No' => it is fully certain.
prompt_pr = (
    "Please check if the given text mentions that the there is some uncertainty "
    "in being able to answer the question. "
    "If it does, then ONLY say 'Yes', "
    "otherwise if the response is completely CERTAIN then ONLY say 'No'. "
    "Here is the text:\n"
)

In [42]:
# Add a POSSIBLE_ERRORS column to idf. Every failed check appends "<label>, "
# to the row's cell, so a row with no findings keeps the empty string.
idf['POSSIBLE_ERRORS'] = ""


def _verdict(prompt):
    """Query the text evaluator and return its reply normalised for comparison.

    The prompts ask for a bare 'Yes'/'No', but replies such as "Yes.", "yes"
    or " No\n" would be silently ignored by an exact string match. Surrounding
    whitespace, quotes and trailing punctuation are removed and the result is
    lower-cased, so callers compare against "yes" / "no".
    """
    content = evaluator_gpt(prompt).choices[0].message.content or ""
    return content.strip().strip(".!'\"").strip().lower()


def _effective_pr_type(row):
    """Return the reviewer-validated PR type, falling back to the task's PR_TYPE.

    A missing validation can arrive as an empty string or as a NULL
    (None/NaN) in the DataFrame; all of these trigger the fallback.
    """
    validated = row['PR_TYPE_VALIDATION']
    if pd.isna(validated) or validated == "":
        return row['PR_TYPE']
    return validated


def _possible_errors(row):
    """Yield the label of each check that `row` fails, in evaluation order.

    Written as a generator so that labels found before a failing API call are
    still recorded by the caller.
    """
    question = row['REVIEWER_PROMPT']
    answer = row['REVIEWER_RESPONSE']
    question_and_answer = "Question:" + question + "\nAnswer:" + answer
    pr_type = _effective_pr_type(row)

    # The reviewer left the placeholder text instead of writing a prompt.
    if question == "[Please edit by clicking the pencil icon]":
        yield "Empty Prompt"
    # The question should require the image; answerable from text alone is a flag.
    if _verdict(prompt_answerable + question) == "yes":
        yield "Answerable without image"
    # Completeness is only checked for CERTAIN responses.
    if pr_type == "CERTAIN":
        if _verdict(prompt_instruction_following + question_and_answer) == "no":
            yield "All questions not answered"
    if _verdict(prompt_factuality + answer) == "yes":
        yield "Factual error"
    if _verdict(prompt_grammar + answer) == "yes":
        yield "Grammar error"
    # An UNCERTAIN response that expresses no uncertainty is a likely mislabel.
    if pr_type == "UNCERTAIN":
        if _verdict(prompt_pr + question_and_answer) == "no":
            yield "Potential PR mismatch"


# Evaluate every row; tqdm shows progress because each row triggers several API calls.
for index, row in tqdm(idf.iterrows(), total=len(idf)):
    try:
        for label in _possible_errors(row):
            idf.at[index, 'POSSIBLE_ERRORS'] += label + ", "
    except Exception as e:
        # Best effort: report the failing row (API error, missing text, ...)
        # and carry on with the next one.
        print(f"Error occurred at index {index}: {str(e)}")


100%|██████████| 924/924 [26:38<00:00,  1.73s/it]


In [43]:
# Write the annotated rows to a CSV in the working directory; the DataFrame
# index is left out of the file.
output_csv = 'NG_DI_AutoEval_Output.csv'
idf.to_csv(output_csv, index=False)