In [1]:
import openai
import snowflake.connector
from tqdm import tqdm
import os

# Load the OpenAI API key from the first line of a local text file.
api_key_file = 'apikey.txt'
if os.path.isfile(api_key_file):
    with open(api_key_file) as f:
        # readline() keeps the line terminator; strip it (and any stray
        # surrounding whitespace) so the key is stored exactly as issued.
        openai.api_key = f.readline().strip()
else:
    # Best-effort: report the missing file and carry on with whatever key
    # (if any) the openai module already holds.
    print(f"Error: {api_key_file} not found.")

# Keep a module-level alias of the key that is now configured.
OPENAI_API_KEY = openai.api_key

# Open the Snowflake session. The 'externalbrowser' authenticator hands the
# login off to the identity provider in a browser window.
con = snowflake.connector.connect(**{
    'user': 'vishal.kumar@scale.com',
    'account': 'pxa65918',
    'authenticator': 'externalbrowser',
    'warehouse': 'COMPUTE_WH',
    'database': 'SCALE_CRAWLER',
    'role': 'GENERAL_RO',
})

# Cursor that the following cells use to execute SQL.
cs = con.cursor()

Initiating login request with your identity provider. A browser window should have opened for you to complete the login. If you can't see it, check existing browser windows, or your OS settings. Press CTRL+C to abort and try again...


In [2]:
# Fetch the evaluation data for a single batch from Snowflake into `idf`.
#
# The query joins scale_prod.public.tasks (t) with
# scale_prod.public_w_deleted.taskattempts (ta) on t._id = ta.task, so a task
# appears once per matching attempt. Rows are restricted to one batch id and
# to attempted_at_review_level = 4, and ordered by task id.
#
# Selected values:
#   * template variables (Domain, Subtopic, Category, Guidance);
#   * the prompt and its first reference text/url from consensus response [3];
#   * AI21_RESPONSE / GPT4_RESPONSE: the candidate from consensus response [4]
#     whose displayOrder entry equals 0 / 1 respectively, otherwise 'NA';
#   * the preferred model and the per-model annotation answers taken from the
#     attempt's responses[6], plus model feedback from responses[8];
#   * NUMBER_OF_INSTRUCTIONS: the text found between the markers
#     '**Number of Instructions:**' and '**Main Topic:**' in the instructions.
#
# NOTE(review): the literal is an f-string but contains no placeholders, so it
# is sent to Snowflake exactly as written.
# NOTE(review): later cells reference the result columns in upper case
# (e.g. TASK_ID, PROMPT) — presumably Snowflake upper-cases the unquoted
# aliases; confirm.
sql = f'''
select 
    t._id as task_id,
    t.params:templateVariables:Domain::string as DOMAIN,
    t.params:templateVariables:Subtopic::string as SUB_TOPIC,
    t.params:templateVariables:Category::string as CATEGORY,
    t.params:templateVariables:Guidance::string as GUIDANCE,
    ta._ID as EVALUATION,
    ta.attempted_by as EVALUATOR,
    t.response:consensusMeta:consensusResponse:responses[3]:output::string as PROMPT,
    t.response:consensusMeta:consensusResponse:responses[3]:context:referenceTexts[0]:content::string as REF_TEXT,
    t.response:consensusMeta:consensusResponse:responses[3]:context:referenceTexts[0]:url::string as REF_URL,
    case
    when (t.response:consensusMeta:consensusResponse:responses[4]:context:displayOrder[0]) = 0 then t.response:consensusMeta:consensusResponse:responses[4]:context:candidates[0]:message:content
    when (t.response:consensusMeta:consensusResponse:responses[4]:context:displayOrder[1]) = 0 then t.response:consensusMeta:consensusResponse:responses[4]:context:candidates[1]:message:content    
    else 'NA'
    end as AI21_RESPONSE,
    case
    when (t.response:consensusMeta:consensusResponse:responses[4]:context:displayOrder[0]) = 1 then t.response:consensusMeta:consensusResponse:responses[4]:context:candidates[0]:message:content
    when (t.response:consensusMeta:consensusResponse:responses[4]:context:displayOrder[1]) = 1 then t.response:consensusMeta:consensusResponse:responses[4]:context:candidates[1]:message:content    
    else 'NA'
    end as GPT4_RESPONSE,
    ta.response:responses[6].context.selectedId::string as PREFERRED_MODEL,
    ta.response:responses[6].context.annotations.Airdale.annotations.response_factuality.response[0]::string as AI21_RESPONSE_FACTUALITY,
    ta.response:responses[6].context.annotations.Airdale.annotations.instruction_following.response[0]::string as AI21_INSTRUCTION_FOLLOWING,
    ta.response:responses[6].context.annotations.Airdale.annotations.Style.response[0]::string as AI21_STYLE,
    ta.response:responses[6].context.annotations.Airdale.annotations.Overall.response[0]::string as AI21_OVERALL,
    ta.response:responses[6].context.annotations.OpenAI.annotations.response_factuality.response[0]::string as OpenAI_RESPONSE_FACTUALITY,
    ta.response:responses[6].context.annotations.OpenAI.annotations.instruction_following.response[0]::string as OpenAI_INSTRUCTION_FOLLOWING,
    ta.response:responses[6].context.annotations.OpenAI.annotations.Style.response[0]::string as OpenAI_STYLE,
    ta.response:responses[6].context.annotations.OpenAI.annotations.Overall.response[0]::string as OpenAI_OVERALL,
    ta.response:responses[8].context.response.annotations.model_feedback.response[0]::string as EVALUATION_MODEL_FEEDBACK,
    SUBSTRING(
        t.params:before[3].params:instructions::string, 
        CHARINDEX('**Number of Instructions:**', t.params:before[3].params:instructions::string) + LEN('**Number of Instructions:**'), 
        CHARINDEX('**Main Topic:**', t.params:before[3].params:instructions::string) - CHARINDEX('**Number of Instructions:**', t.params:before[3].params:instructions::string) - LEN('**Number of Instructions:**')
    ) as NUMBER_OF_INSTRUCTIONS
from scale_prod.public.tasks t
join scale_prod.public_w_deleted.taskattempts ta on t._id = ta.task
where t.batch='65e347e8b47aa6d0d22eb2a3'
and attempted_at_review_level = 4
order by t._id
'''
# Run the query on the open cursor and load the full result set as a pandas
# DataFrame.
cs.execute(sql)
idf = cs.fetch_pandas_all()

In [3]:
# Set three columns to empty strings -- NUMBER_OF_INSTRUCTIONS,
# LENGTH_INSTRUCTIONS and FORMAT_TYPE (a column that already exists is
# overwritten in place) -- then keep a single row per TASK_ID.
idf = (
    idf
    .assign(
        NUMBER_OF_INSTRUCTIONS='',
        LENGTH_INSTRUCTIONS='',
        FORMAT_TYPE='',
    )
    .drop_duplicates(subset=['TASK_ID'])
)

def evaluator_gpt(prompt, model="gpt-4-turbo-preview", temperature=0.1):
    """Send one user message to the OpenAI chat endpoint and return the reply text.

    Args:
        prompt: Text sent as the only message, with role ``"user"``.
        model: Name of the chat model to query. Defaults to the model this
            notebook originally hard-coded.
        temperature: Sampling temperature passed to the API. Defaults to the
            originally hard-coded 0.1.

    Returns:
        The ``content`` of the message in the first returned choice.

    Note:
        Errors raised by the ``openai`` client are not caught here; callers
        decide how to handle them.
        NOTE(review): ``openai.ChatCompletion`` is the pre-1.0 client
        interface -- confirm the installed ``openai`` version provides it.
    """
    completion = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
    )
    # Only the first choice is used.
    return completion.choices[0].message.content

In [7]:
from tqdm import tqdm  
# Reset the column, then ask the model to count the instructions in each
# PROMPT. A row whose request fails is reported and keeps the value None.
idf['NUMBER_OF_INSTRUCTIONS'] = None
for idx in tqdm(idf.index, total=len(idf)):
    try:
        prompt = idf.at[idx, 'PROMPT']
        count_query = f"You are tasked with counting the number of instructions in a certain provided question. Respond with the numeric value of number of instructions only. Here is the question \n{prompt}"
        # The raw model reply is stored unchanged.
        idf.at[idx, 'NUMBER_OF_INSTRUCTIONS'] = evaluator_gpt(count_query)
    except Exception as err:
        print(f"Error occurred for index {idx}: {str(err)}")

100%|██████████| 2416/2416 [19:16<00:00,  2.09it/s] 


In [8]:
from tqdm import tqdm  
# Reset the column, then ask the model which output format each PROMPT
# requests. A row whose request fails is reported and keeps the value None.
idf['FORMAT_TYPE'] = None
for idx in tqdm(idf.index, total=len(idf)):
    try:
        prompt = idf.at[idx, 'PROMPT']
        format_query = f"You are now tasked with finding the format type requested in a question. Format type examples are: bullet list, numbered list, table markdown, comma separated list, general markdown etc. You need to respond with only the format and nothing else. Here is the question \n{prompt}"
        # The raw model reply is stored unchanged.
        idf.at[idx, 'FORMAT_TYPE'] = evaluator_gpt(format_query)
    except Exception as err:
        print(f"Error occurred for index {idx}: {str(err)}")

100%|██████████| 2416/2416 [21:42<00:00,  1.85it/s] 


In [9]:
from tqdm import tqdm  
# Reset the column, then ask the model for the length requirement stated in
# each PROMPT. A row whose request fails is reported and keeps the value None.
idf['LENGTH_INSTRUCTIONS'] = None
for idx in tqdm(idf.index, total=len(idf)):
    try:
        prompt = idf.at[idx, 'PROMPT']
        length_query = f"You are now tasked with finding the length instructions in a question. Length instruction is something that specifies how long the answer needs to be. \nFor word length specification respond: <num of words>,words\nFor paragraph specification respond: <num of paragraphs>, paragraphs\nIf no length instruction exists, respond with 'None'. You need to respond with only with the numeric length and length instruction type as shown above. Here is the question:\n{prompt}"
        # The raw model reply is stored unchanged.
        idf.at[idx, 'LENGTH_INSTRUCTIONS'] = evaluator_gpt(length_query)
    except Exception as err:
        print(f"Error occurred for index {idx}: {str(err)}")

100%|██████████| 2416/2416 [22:46<00:00,  1.77it/s] 


In [11]:
# Write the annotated frame to 'Airedale.csv', leaving out the index column.
idf.to_csv(path_or_buf='Airedale.csv', index=False)