In [1]:
import os

import openai
import pandas as pd
import snowflake.connector
from tqdm import tqdm

# Read the OpenAI API key from the first line of a local text file.
api_key_file = 'apikey.txt'
if os.path.isfile(api_key_file):
    with open(api_key_file) as f:
        # readline() keeps the line terminator; strip it (and any stray
        # surrounding whitespace) so it does not become part of the key.
        openai.api_key = f.readline().strip()
else:
    print(f"Error: {api_key_file} not found.")

# Module-level alias of whatever key the openai module currently holds.
OPENAI_API_KEY = openai.api_key

# Open a Snowflake session. With the 'externalbrowser' authenticator the login
# is completed through the identity provider in a browser window.
snowflake_settings = {
    'user': 'vishal.kumar@scale.com',
    'account': 'pxa65918',
    'authenticator': 'externalbrowser',
    'warehouse': 'COMPUTE_WH',
    'database': 'SCALE_CRAWLER',
    'role': 'GENERAL_RO',
}
con = snowflake.connector.connect(**snowflake_settings)

# Cursor used to run queries on this connection.
cs = con.cursor()

Initiating login request with your identity provider. A browser window should have opened for you to complete the login. If you can't see it, check existing browser windows, or your OS settings. Press CTRL+C to abort and try again...


In [39]:
# Fetch the work logs: one row per task (its most recent attempt), carrying the
# task metadata, the prompt, the reference text and the rewritten response.
sql = '''
select
  ta.task as TaskID,
  t.params:templateVariables:category::string as tasktype,
  t.params:templateVariables:archetype::string as req_1,
  t.params:templateVariables:archetypeb::string as req_2,
  t.params:templateVariables:archetypec::string as req_3,
  ta.response:responses[4]:context:response:annotations:structured_output_type:response[0][0]::string as SO_type,
  ta.response:prompt::string as Prompt,
  ta.response:responses[3]:context:referenceTexts[0]:content::string as ReferenceText,
  ta.response:rewrite::string as Response
from
  SCALE_PROD.PUBLIC.TASKS t
  join SCALE_PROD.PUBLIC.PIPELINEV3HUMANNODES pp on pp.task = t._id
  join SCALE_PROD.PUBLIC.taskattempts ta on ta.task = t._id
where
  t.project = '65a6b29f5abfb1b5efd9303a'
  and SO_type IS NOT NULL
  and pp.status = 'pending'
  and pp.review_level IN (10,12)  

-- filters the latest taskattempt of the selected task
QUALIFY row_number() OVER (PARTITION BY TA.TASK ORDER BY TA.ATTEMPTED_AT DESC) = 1
'''
cs.execute(sql)
df = cs.fetch_pandas_all()

# Keep only rows that have a structured-output type other than the
# 'no_structured_output' marker.
keep_rows = df['SO_TYPE'].notna() & (df['SO_TYPE'] != 'no_structured_output')
df = df[keep_rows]

In [40]:
def evaluator_gpt(prompt, model="gpt-4-turbo-preview", temperature=0.1):
    """Send a single-turn user prompt to the chat completion API and return the reply text.

    Args:
        prompt: Text sent as the only message, with role "user".
        model: Name of the chat model to query. Defaults to the model this
            notebook was originally written against.
        temperature: Sampling temperature passed to the API. Defaults to 0.1.

    Returns:
        The ``content`` of the first choice's message.
    """
    completion = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
    )
    return completion.choices[0].message.content

In [41]:
# Ask the model whether each response is well-formed for its structured-output type.
# Initialise the very column the loop writes to (lowercase name), so that no
# differently-cased, permanently empty column is left behind in df.
df['evaluation_result'] = None
for i, row in tqdm(df.iterrows(), total=df.shape[0]):
    so_type = row['SO_TYPE']
    eval_set = row['RESPONSE']
    if not isinstance(eval_set, str):
        # No response text to judge (e.g. NULL in the query result): leave the
        # result empty rather than failing on string concatenation.
        continue
    # The prompt text is kept verbatim so results stay comparable across runs.
    prompt = "Read this " + so_type + " and check whether it is in correct format. Only check the format if it is correct mardown or json or html or csv or table or xml. Do not check accuracy of content. Here it is:\n\n\n" + eval_set + "\n\n\n Only respond with Correct if format is correct, else say incorrect. Dont say anything else"
    response = evaluator_gpt(prompt)
    df.loc[i, 'evaluation_result'] = response
    

100%|██████████| 336/336 [03:27<00:00,  1.62it/s]


In [42]:
# Turn the model's text verdicts into 1/0 scores and summarise them per SO_TYPE.
def _verdict_to_score(verdict):
    """Map a verdict string to 1 (correct) or 0 (incorrect); None if unrecognised.

    Matching ignores case, surrounding whitespace and trailing '.'/'!'.
    Non-string values (missing results, or numeric scores left by an earlier
    run of this cell) are returned unchanged so the cell can be re-run safely.
    """
    if not isinstance(verdict, str):
        return verdict
    label = verdict.strip().rstrip('.!').lower()
    return {'correct': 1, 'incorrect': 0}.get(label)


# errors='coerce' turns anything that is still not numeric into NaN instead of raising.
scores = pd.to_numeric(df['evaluation_result'].map(_verdict_to_score), errors='coerce')

# Verdicts that were present but could not be interpreted are excluded from the
# statistics below; report them so the exclusion is not silent.
unparsed = int((scores.isna() & df['evaluation_result'].notna()).sum())
if unparsed:
    print(f"Warning: {unparsed} evaluation result(s) were neither Correct nor Incorrect and are ignored.")

# odf keeps a separate copy of the frame with the numeric scores.
odf = df.copy()
odf['evaluation_result'] = scores
df['evaluation_result'] = scores

# Per SO_TYPE: number of scored rows and mean score.
grouped = df.groupby('SO_TYPE')['evaluation_result'].agg(['count', 'mean'])

# Turn the SO_TYPE index back into a regular column for printing.
grouped_reset = grouped.reset_index()

# PERCENTAGE_CORRECT is the mean of the 0/1 scores, i.e. a fraction between 0 and 1.
grouped_reset.columns = ['SO_TYPE', 'COUNT', 'PERCENTAGE_CORRECT']

print(grouped_reset)

             SO_TYPE  COUNT  PERCENTAGE_CORRECT
0                csv     37            0.540541
1               html      5            0.000000
2               json     44            0.568182
3  python_dictionary      7            0.714286
4              table    236            0.902542
5                xml      7            0.285714


In [43]:
# Write the evaluated frame to a CSV file, leaving out the row index.
output_csv = 'structured_output_evaluation.csv'
df.to_csv(output_csv, index=False)