In [2]:
import openai
from tqdm import tqdm
import os
import pandas as pd
import snowflake.connector
from jarowinkler import jarowinkler_similarity


# Load the OpenAI API key from the first line of a local text file.
# readline() returns the line including its trailing newline, so the value is
# stripped before use; otherwise the newline would become part of the key.
api_key_file = '/Users/vishalkumar/Documents/apikey.txt'
if os.path.isfile(api_key_file):
    with open(api_key_file) as f:
        openai.api_key = f.readline().strip()
else:
    # Best effort: report the missing file and carry on; openai.api_key is left as it was.
    print(f"Error: {api_key_file} not found.")

# Module-level alias of whatever key ended up on the openai module.
OPENAI_API_KEY = openai.api_key

#Open the Snowflake session (the 'externalbrowser' authenticator completes the login in a browser window)
con = snowflake.connector.connect(
    account='pxa65918',
    user='vishal.kumar@scale.com',
    authenticator='externalbrowser',
    role='GENERAL_RO',
    warehouse='COMPUTE_WH',
    database='SCALE_CRAWLER',
)

#Cursor used by the later cells to run queries on this connection
cs = con.cursor()

Initiating login request with your identity provider. A browser window should have opened for you to complete the login. If you can't see it, check existing browser windows, or your OS settings. Press CTRL+C to abort and try again...


In [3]:
#Get data from snowflake
# The query flattens t.response:responses for every task of one project completed
# after 2024-03-26, aggregates the per-response fields into arrays (one row per
# task), then spreads array positions 0..4 into "turn 1..5" columns.
#
# Each array_agg is ordered by the FLATTEN index (r.index). Without
# WITHIN GROUP (ORDER BY ...) the element order of array_agg is unspecified, so
# position N could not be relied on to mean "turn N+1".
# NOTE(review): positions count only the values array_agg keeps; if NULLs are
# omitted from the aggregate, [0] is the first non-NULL value - confirm this is
# the intended turn alignment.
#
# Plain string (no f-prefix): the query has no placeholders to interpolate.
sql = '''
with base as (
Select
t._id,

--r.value:type::string as step,
array_agg(r.value:context:candidates[0]:message:content::string) within group (order by r.index) as seal_1,
array_agg(r.value:context:candidates[1]:message:content::string) within group (order by r.index) as seal_2,
array_agg(r.value:context:candidates[2]:message:content::string) within group (order by r.index) as seal_3,
array_agg(r.value:context:selectedId::string) within group (order by r.index) as selected,

array_agg(r.value:context:rankedCandidates[0][0]:sourceId::string) within group (order by r.index) as rank_1,
array_agg(r.value:context:rankedCandidates[1][0]:sourceId::string) within group (order by r.index) as rank_2,
array_agg(r.value:context:rankedCandidates[2][0]:sourceId::string) within group (order by r.index) as rank_3,

array_agg(case when r.value:type::string = 'ModelResponseEditor' then r.value:output::string else null end) within group (order by r.index) as edited
--r.value:index as index

from scale_prod.public.tasks t
,LATERAL FLATTEN(input => t.response:responses, mode => 'array') r
where t.project = '6578a7010281a578c59d167a'
AND cast(t.completed_at as Date) > '2024-03-26'
group by 1
)
select
_id,
-- Turn 1
seal_1[0]::string as M_Resp_T1_Seal1,
seal_2[0]::string as M_Resp_T1_Seal2,
seal_3[0]::string as M_Resp_T1_Seal3,

selected[0]::string as T1_Selected_Response,

rank_1[0]::string as M_T1_Rank1,
rank_2[0]::string as M_T1_Rank2,
rank_3[0]::string as M_T1_Rank3,

edited[0]::string as Rewritten_1,

-- Turn 2
seal_1[1]::string as M_Resp_T2_Seal1,
seal_2[1]::string as M_Resp_T2_Seal2,
seal_3[1]::string as M_Resp_T2_Seal3,

selected[1]::string as T2_Selected_Response,

rank_1[1]::string as M_T2_Rank1,
rank_2[1]::string as M_T2_Rank2,
rank_3[1]::string as M_T2_Rank3,

edited[1]::string as Rewritten_2,

----- Turn 3
seal_1[2]::string as M_Resp_T3_Seal1,
seal_2[2]::string as M_Resp_T3_Seal2,
seal_3[2]::string as M_Resp_T3_Seal3,

selected[2]::string as T3_Selected_Response,

rank_1[2]::string as M_T3_Rank1,
rank_2[2]::string as M_T3_Rank2,
rank_3[2]::string as M_T3_Rank3,

edited[2]::string as Rewritten_3,

-- Turn 4
seal_1[3]::string as M_Resp_T4_Seal1,
seal_2[3]::string as M_Resp_T4_Seal2,
seal_3[3]::string as M_Resp_T4_Seal3,

selected[3]::string as T4_Selected_Response,

rank_1[3]::string as M_T4_Rank1,
rank_2[3]::string as M_T4_Rank2,
rank_3[3]::string as M_T4_Rank3,

edited[3]::string as Rewritten_4,

-- Turn 5
seal_1[4]::string as M_Resp_T5_Seal1,
seal_2[4]::string as M_Resp_T5_Seal2,
seal_3[4]::string as M_Resp_T5_Seal3,

selected[4]::string as T5_Selected_Response,

rank_1[4]::string as M_T5_Rank1,
rank_2[4]::string as M_T5_Rank2,
rank_3[4]::string as M_T5_Rank3,

edited[4]::string as Rewritten_5

from base
'''
# Run the query on the shared cursor and pull the full result set into a DataFrame.
cs.execute(sql)
idf = cs.fetch_pandas_all()

In [4]:
#Inspect the column labels that came back from the query
query_columns = idf.columns
print(query_columns)

Index(['_ID', 'M_RESP_T1_SEAL1', 'M_RESP_T1_SEAL2', 'M_RESP_T1_SEAL3',
       'T1_SELECTED_RESPONSE', 'M_T1_RANK1', 'M_T1_RANK2', 'M_T1_RANK3',
       'REWRITTEN_1', 'M_RESP_T2_SEAL1', 'M_RESP_T2_SEAL2', 'M_RESP_T2_SEAL3',
       'T2_SELECTED_RESPONSE', 'M_T2_RANK1', 'M_T2_RANK2', 'M_T2_RANK3',
       'REWRITTEN_2', 'M_RESP_T3_SEAL1', 'M_RESP_T3_SEAL2', 'M_RESP_T3_SEAL3',
       'T3_SELECTED_RESPONSE', 'M_T3_RANK1', 'M_T3_RANK2', 'M_T3_RANK3',
       'REWRITTEN_3', 'M_RESP_T4_SEAL1', 'M_RESP_T4_SEAL2', 'M_RESP_T4_SEAL3',
       'T4_SELECTED_RESPONSE', 'M_T4_RANK1', 'M_T4_RANK2', 'M_T4_RANK3',
       'REWRITTEN_4', 'M_RESP_T5_SEAL1', 'M_RESP_T5_SEAL2', 'M_RESP_T5_SEAL3',
       'T5_SELECTED_RESPONSE', 'M_T5_RANK1', 'M_T5_RANK2', 'M_T5_RANK3',
       'REWRITTEN_5'],
      dtype='object')


In [5]:
#For each of the 5 turns add a T<n>_MODEL_RESPONSE column, empty by default, and fill it
#with the text of whichever candidate (seal1 / seal2 / seal3) T<n>_SELECTED_RESPONSE names.
#Rows whose selection matches none of the three seals keep the empty string.
for turn in range(1, 6):
    selected_col = f'T{turn}_SELECTED_RESPONSE'
    target_col = f'T{turn}_MODEL_RESPONSE'
    idf[target_col] = ''
    for seal in range(1, 4):
        picked_this_seal = idf[selected_col] == f'seal{seal}'
        #the right-hand Series is aligned on the index, so only the masked rows are written
        idf.loc[picked_this_seal, target_col] = idf[f'M_RESP_T{turn}_SEAL{seal}']
#print top 5 rows
print(idf.head())

                        _ID  \
0  65f9f63e4b27c36125bae726   
1  65f9f64115f19cd5aed12149   
2  65f9f63e6b2dbba1dca4b7b8   
3  65f9f645e3e48061293e7786   
4  6601ebfac7482b0a62225df2   

                                     M_RESP_T1_SEAL1  \
0  \nDropshipping is a popular e-commerce busines...   
1  - Ellen Pompeo\n- Meredith Grey\n- Alexis Floy...   
2  - The novel "To Kill a Mockingbird" is set in ...   
3  \nSubject: Article on the Importance of Econom...   
4  * The US economy entered 2024 with various ind...   

                                     M_RESP_T1_SEAL2  \
0  \nDropshipping is a popular e-commerce busines...   
1  \n*   Grey's Anatomy\n*   Ellen Pompeo\n*   Me...   
2  - The novel "To Kill a Mockingbird" is set in ...   
3  \nSubject: Economics Education: Benefits and R...   
4  \n[Economic indicators for 2024]\n\n* The US e...   

                                     M_RESP_T1_SEAL3 T1_SELECTED_RESPONSE  \
0  \nDropshipping is a fantastic way to start an ...          

In [6]:

#Take an independent (deep) copy of idf under the name tdf
tdf = idf.copy(deep=True)
#Show the column labels of the copy
print(tdf.columns)

Index(['_ID', 'M_RESP_T1_SEAL1', 'M_RESP_T1_SEAL2', 'M_RESP_T1_SEAL3',
       'T1_SELECTED_RESPONSE', 'M_T1_RANK1', 'M_T1_RANK2', 'M_T1_RANK3',
       'REWRITTEN_1', 'M_RESP_T2_SEAL1', 'M_RESP_T2_SEAL2', 'M_RESP_T2_SEAL3',
       'T2_SELECTED_RESPONSE', 'M_T2_RANK1', 'M_T2_RANK2', 'M_T2_RANK3',
       'REWRITTEN_2', 'M_RESP_T3_SEAL1', 'M_RESP_T3_SEAL2', 'M_RESP_T3_SEAL3',
       'T3_SELECTED_RESPONSE', 'M_T3_RANK1', 'M_T3_RANK2', 'M_T3_RANK3',
       'REWRITTEN_3', 'M_RESP_T4_SEAL1', 'M_RESP_T4_SEAL2', 'M_RESP_T4_SEAL3',
       'T4_SELECTED_RESPONSE', 'M_T4_RANK1', 'M_T4_RANK2', 'M_T4_RANK3',
       'REWRITTEN_4', 'M_RESP_T5_SEAL1', 'M_RESP_T5_SEAL2', 'M_RESP_T5_SEAL3',
       'T5_SELECTED_RESPONSE', 'M_T5_RANK1', 'M_T5_RANK2', 'M_T5_RANK3',
       'REWRITTEN_5', 'T1_MODEL_RESPONSE', 'T2_MODEL_RESPONSE',
       'T3_MODEL_RESPONSE', 'T4_MODEL_RESPONSE', 'T5_MODEL_RESPONSE'],
      dtype='object')


In [7]:
#Drop the per-seal responses, the selected-response ids and the rank columns for all five turns,
#leaving _ID, the REWRITTEN_<n> columns and the T<n>_MODEL_RESPONSE columns
tdf = tdf.drop(
    columns=[
        'M_RESP_T1_SEAL1', 'M_RESP_T1_SEAL2', 'M_RESP_T1_SEAL3',
        'T1_SELECTED_RESPONSE', 'M_T1_RANK1', 'M_T1_RANK2', 'M_T1_RANK3',
        'M_RESP_T2_SEAL1', 'M_RESP_T2_SEAL2', 'M_RESP_T2_SEAL3',
        'T2_SELECTED_RESPONSE', 'M_T2_RANK1', 'M_T2_RANK2', 'M_T2_RANK3',
        'M_RESP_T3_SEAL1', 'M_RESP_T3_SEAL2', 'M_RESP_T3_SEAL3',
        'T3_SELECTED_RESPONSE', 'M_T3_RANK1', 'M_T3_RANK2', 'M_T3_RANK3',
        'M_RESP_T4_SEAL1', 'M_RESP_T4_SEAL2', 'M_RESP_T4_SEAL3',
        'T4_SELECTED_RESPONSE', 'M_T4_RANK1', 'M_T4_RANK2', 'M_T4_RANK3',
        'M_RESP_T5_SEAL1', 'M_RESP_T5_SEAL2', 'M_RESP_T5_SEAL3',
        'T5_SELECTED_RESPONSE', 'M_T5_RANK1', 'M_T5_RANK2', 'M_T5_RANK3',
    ]
)
print(tdf.columns)

Index(['_ID', 'REWRITTEN_1', 'REWRITTEN_2', 'REWRITTEN_3', 'REWRITTEN_4',
       'REWRITTEN_5', 'T1_MODEL_RESPONSE', 'T2_MODEL_RESPONSE',
       'T3_MODEL_RESPONSE', 'T4_MODEL_RESPONSE', 'T5_MODEL_RESPONSE'],
      dtype='object')


In [8]:
# Jaro-Winkler similarity between each turn's rewritten text (REWRITTEN_<n>) and the
# selected model response (T<n>_MODEL_RESPONSE), stored as SIMILARITY_SCORE_<n>.
#
# Each score column is built in one pass and assigned whole. The earlier form wrote
# cells into tdf with .at while iterrows() was iterating over tdf, which the pandas
# documentation warns against, and it added the columns only as a side effect of
# visiting the first row (so an empty frame got no score columns at all).
for turn in range(1, 6):
    rewritten_texts = tdf[f'REWRITTEN_{turn}']
    model_texts = tdf[f'T{turn}_MODEL_RESPONSE']
    tdf[f'SIMILARITY_SCORE_{turn}'] = [
        jarowinkler_similarity(rewritten, model_response)
        for rewritten, model_response in zip(rewritten_texts, model_texts)
    ]
print(tdf.head())

                        _ID  \
0  65f9f63e4b27c36125bae726   
1  65f9f64115f19cd5aed12149   
2  65f9f63e6b2dbba1dca4b7b8   
3  65f9f645e3e48061293e7786   
4  6601ebfac7482b0a62225df2   

                                         REWRITTEN_1  \
0  Dropshipping is a popular e-commerce business ...   
1  *   Ellen Pompeo\n*   Meredith Grey\n*   Simon...   
2  **Character names**\n\n*   Jean Louise ("Scout...   
3  Subject: Economics Education: Benefits and Rea...   
4  Hey Jan,\n\nHere is the requested economic dat...   

                                         REWRITTEN_2  \
0  \[Opening shot of a computer screen with Shopi...   
1  1.  Pompeo\n2.  praised\n3.  plenty\n4.  premi...   
2  1.  Jean Louise Finch, also known as Scout, is...   
3  Subject: Three Main Reasons for Early Economic...   
4  Hey classmates!\n\nI'm excited to share with y...   

                                         REWRITTEN_3 REWRITTEN_4 REWRITTEN_5  \
0  *   **High profit margins:** Dropshipping busi...       

In [9]:
#Write tdf to 'tdf.csv' in the working directory, without the index column
tdf.to_csv(path_or_buf='tdf.csv', index=False)