In [None]:
import os
import signal

import pandas as pd
import requests
import snowflake.connector
from tqdm import tqdm


In [None]:
# Projects to evaluate: one row per project (id + display name).
# The frame is built in a single call instead of growing it with repeated
# pd.concat, which is quadratic and warns when concatenating onto an empty frame.
# NOTE(review): several names end with a trailing space. They are kept verbatim
# because project_name is used downstream for printed output and CSV file names;
# confirm whether the trailing spaces are intentional.
subdf = pd.DataFrame(
    [
        {'project_id': '65414d0e3d0179ddbc88a8dd', 'project_name': 'Biology Single Turn'},
        {'project_id': '654155310dc2cd37dec06f4a', 'project_name': 'Statistics and Probability Single Turn'},
        {'project_id': '65414db2dc8e65e4dfe4963c', 'project_name': 'Computer Science Single Turn'},
        {'project_id': '653885efb58819f5f03bd655', 'project_name': 'Chemistry Single Turn'},
        {'project_id': '6552611390f5ee167ef420e8', 'project_name': 'Physics Single Turn '},
        {'project_id': '65527492168482aa0093a631', 'project_name': 'Adv Physics Single Turn '},
        {'project_id': '65414aaab70a41e7c12761fa', 'project_name': 'Economics Single Turn '},
        {'project_id': '6541511b9af2337ed1642cc0', 'project_name': 'Finance Single Turn '},
        {'project_id': '65414bb77d1db381f7c670e0', 'project_name': 'Nursing Single Turn'},
        {'project_id': '654155cec180a1a9fafc6f84', 'project_name': 'Chemical Engineering Single Turn'},
    ],
    columns=['project_id', 'project_name'],
)

In [None]:
# Open a Snowflake session and keep one cursor (`cs`) for the queries below.
# `authenticator='externalbrowser'` is passed straight to the connector.
snowflake_settings = {
    'user': 'vishal.kumar@scale.com',
    'account': 'pxa65918',
    'authenticator': 'externalbrowser',
    'warehouse': 'COMPUTE_WH',
    'database': 'SCALE_PROD',
    'role': 'GENERAL_RO',
}
con = snowflake.connector.connect(**snowflake_settings)
cs = con.cursor()

In [None]:
def handler(signum, frame):
    """Signal callback that aborts the running code by raising ``TimeoutError``.

    Both arguments are supplied by the ``signal`` module when the handler is
    invoked; neither is used here.
    """
    raise TimeoutError()


# Route SIGALRM to ``handler`` so an expired ``signal.alarm`` surfaces as a
# TimeoutError in the code that is running at that moment.
signal.signal(signal.SIGALRM, handler)


def get_data_sample(cs, project_id):
    """Fetch a sample of up to 120 task attempts for one project.

    Runs a query against ``taskattempts`` joined to ``public.projects``,
    filtered to ``review_level = 12``, under an 8-minute ``signal.alarm``.

    Args:
        cs: Cursor object exposing ``execute(sql, params)`` and
            ``fetch_pandas_all()``.
        project_id: Project identifier matched against ``ta.project``.

    Returns:
        The DataFrame returned by ``cs.fetch_pandas_all()`` (columns aliased
        PROJECT_NAME, TASK_ID, RESPONSE_FINAL in the query) on success.
        The string ``"Timeout Exception"`` if a ``TimeoutError`` is raised
        while the query runs, or ``"Unknown Error"`` on any other exception.
        Callers must check the return type before using it as a DataFrame.
    """
    # project_id is bound as a parameter (the connector's default pyformat
    # style) rather than being interpolated into the SQL text.
    sql = '''
        SELECT
            pp.name as PROJECT_NAME,
            ta.task as TASK_ID,
            ta.RESPONSE:responses[3]:context:message as RESPONSE_FINAL
        FROM taskattempts ta
        JOIN public.projects pp ON pp._id = ta.project
        WHERE ta.project = %s
        and review_level=12
        limit 120
    '''
    try:
        # Set an alarm for 8 minutes
        signal.alarm(8 * 60)
        cs.execute(sql, (project_id,))
        return cs.fetch_pandas_all()
    except TimeoutError:
        print("get_data_sample: query timed out for project", project_id)
        return "Timeout Exception"
    except Exception as e:
        # Report the cause instead of discarding it; the sentinel return value
        # is kept so existing callers see the same result as before.
        print("get_data_sample: query failed for project", project_id, "-", repr(e))
        return "Unknown Error"
    finally:
        # Always cancel the alarm. Previously it stayed armed on the failure
        # path and could fire later inside unrelated code.
        signal.alarm(0)


def get_response(rdf, project_id, project_name, request_timeout=300):
    """Send every sampled response to the assessment endpoint and collect results.

    For each row of ``rdf`` the ``RESPONSE_FINAL`` text is POSTed to the
    deployment URL below; the ``output`` and ``requestId`` fields of the JSON
    reply are stored alongside the task id.

    Args:
        rdf: DataFrame with ``TASK_ID`` and ``RESPONSE_FINAL`` columns.
        project_id: Project identifier copied into every result row.
        project_name: Project name copied into every result row; also used as
            the stem of the CSV file that is written.
        request_timeout: Seconds to wait for each HTTP request before
            ``requests`` raises a timeout error.

    Returns:
        DataFrame with columns project_id, project_name, task_id, response,
        assessment, reqid (empty, with those columns, when ``rdf`` has no rows).

    Raises:
        requests.RequestException: On timeout, connection failure, or a
            non-2xx status. Rows collected before the failure are still
            written to ``<project_name>.csv``.
    """
    # Column names match the keys stored per row. The earlier declared schema
    # used 'project', which was never filled and came out as an all-NaN column.
    columns = ['project_id', 'project_name', 'task_id', 'response', 'assessment', 'reqid']

    # NOTE(review): the key can be supplied via SPELLBOOK_API_KEY. The literal
    # fallback only preserves current behaviour; rotate it and remove it.
    api_key = os.environ.get("SPELLBOOK_API_KEY", "clfv6zfnd011k1arec1lucb9l")
    headers = {"Authorization": "Basic " + api_key}
    url = "https://dashboard.scale.com/spellbook/api/v2/deploy/2m03uo1"

    records = []
    try:
        # Iterate through each row of the dataframe with a progress bar
        for _, row in tqdm(rdf.iterrows(), total=rdf.shape[0]):
            res = row['RESPONSE_FINAL']
            response = requests.post(
                url,
                json={"input": {"input": res}},
                headers=headers,
                timeout=request_timeout,
            )
            # Fail on HTTP errors here rather than with a KeyError on the body.
            response.raise_for_status()
            payload = response.json()
            records.append({
                'project_id': project_id,
                'project_name': project_name,
                'task_id': row['TASK_ID'],
                'response': res,
                'assessment': payload['output'],
                'reqid': payload['requestId'],
            })
    finally:
        # Persist whatever was collected, including partial results when the
        # loop is interrupted, instead of rewriting the file after every row.
        if records:
            pd.DataFrame(records, columns=columns).to_csv(project_name + '.csv', index=False)

    return pd.DataFrame(records, columns=columns)



def data_prep(rdf):
    """Clean the sampled responses in place and drop repeated tasks.

    In ``RESPONSE_FINAL`` every newline, every ``&#x20;`` entity and every
    occurrence of the text ``None`` is replaced by a single space; then rows
    with a repeated ``TASK_ID`` are dropped (first occurrence kept).

    Args:
        rdf: DataFrame with ``RESPONSE_FINAL`` and ``TASK_ID`` columns. It is
            modified in place.

    Returns:
        The same ``rdf`` object. If cleaning fails (for example because ``rdf``
        is not a DataFrame) the error is printed and ``rdf`` is returned as
        received.
    """
    try:
        cleaned = rdf['RESPONSE_FINAL']
        # regex=False makes these literal replacements on every pandas version;
        # the default for `regex` differs between pandas 1.x and 2.x.
        # NOTE(review): 'None' is replaced wherever it occurs, including inside
        # longer words. Confirm that this is intended.
        for unwanted in ('\n', '&#x20;', 'None'):
            cleaned = cleaned.str.replace(unwanted, ' ', regex=False)
        rdf['RESPONSE_FINAL'] = cleaned

        # Remove rows with duplicate task_id
        rdf.drop_duplicates(subset=['TASK_ID'], inplace=True)
    except Exception as e:
        # Best-effort cleaning is kept, but the cause is now reported.
        print("error: data_prep could not clean the sample:", repr(e))
    return rdf


def score_calc(tdf, project_name):
    """Compute and print the share of rows assessed as 'No Error'.

    Args:
        tdf: DataFrame with an ``assessment`` column.
        project_name: Label used in the printed message.

    Returns:
        Fraction (0.0 to 1.0) of rows whose ``assessment`` equals 'No Error'
        exactly, or ``float('nan')`` when ``tdf`` has no rows (previously this
        raised ZeroDivisionError).
    """
    total_rows = len(tdf.index)
    if total_rows == 0:
        print(project_name, " has no assessed rows; factual accuracy is undefined")
        return float('nan')

    # Count the rows where the assessment is exactly 'No Error'
    count = int((tdf['assessment'] == 'No Error').sum())
    accuracy = count / total_rows
    print(project_name," factual accuracy is ", accuracy * 100, "%")
    return accuracy



In [None]:
# Score every project in subdf and collect one accuracy row per project.
# Scores are accumulated in a list; previously SubjectScores was overwritten on
# each iteration, so only the last project reached the CSV.
score_records = []
for index, row in subdf.iterrows():
    project_id = row['project_id']
    project_name = row['project_name']

    print("Getting sample response data for project ", project_name)
    rdf = get_data_sample(cs, project_id)
    # get_data_sample returns an error string instead of a DataFrame on failure.
    if not isinstance(rdf, pd.DataFrame):
        print("Skipping project ", project_name, " - could not fetch sample: ", rdf)
        continue

    rdf = data_prep(rdf)
    if rdf.empty:
        print("Skipping project ", project_name, " - sample is empty")
        continue

    tdf = get_response(rdf, project_id, project_name)
    accuracy = score_calc(tdf, project_name)
    score_records.append({'project_name': project_name, 'accuracy': accuracy})

# Save SubjectScores as csv (defined even when no project produced a score)
SubjectScores = pd.DataFrame(score_records, columns=['project_name', 'accuracy'])
SubjectScores.to_csv('SubjectScores.csv', index=False)