## Automatic Evaluation via GPT-4

## 1. Environment Setup

### 1.1. Imports

In [2]:
## Importing DS modules
import pandas as pd

## Importing OpenAI modules
import openai

## Importing other modules
import os
import copy

### 1.2. Global Variables

In [3]:
## Folder that holds the per-model evaluation CSV files (read from and written to below)
TEST_SAVE_PATH = (
    "/Users/kaanaydin/Library/CloudStorage/"
    "OneDrive-SharedLibraries-UniversitätSt.Gallen/"
    "STUD-NLP Group Project - General/"
    "02 Group project/"
    "evaluation"
)

In [4]:
## Set your OpenAI API key
## The key is taken from the OPENAI_API_KEY environment variable so that the secret never
## has to be typed into (and saved with) the notebook; if the variable is not set, the
## key stays empty exactly as before.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

## 2. OpenAI setup

In [5]:
## Function to evaluate answer given question & retrieve score and explanation from GPT-4

def evaluate(question, answer, *, model="gpt-4", temperature=0.2):
    """Ask an OpenAI chat model to grade an answer to a question.

    Args:
        question: The question that was posed; inserted verbatim into the prompt.
        answer: The answer to be graded; inserted verbatim into the prompt.
        model: Name of the chat model used as judge. Defaults to "gpt-4".
        temperature: Sampling temperature passed to the API. Defaults to 0.2.

    Returns:
        The text content of the first choice of the model's reply. The prompt asks
        for the format "<score>-|-<explanation>", but the reply is returned as-is
        and is not validated here.
    """

    ## Develop prompt based on answer & question. Prompt inspired by Vicuna team's original prompt
    ## (adjacent literals are concatenated; the resulting text is unchanged)
    prompt = (
        "You are a helpful and precise assistant for checking the quality of the answer.\n"
        f" Question:'{question}' \n"
        f" Response: '{answer}' \n"
        " We would like to request your feedback on the performance of the AI assistants in response to the user question displayed above.\n"
        " Please rate the helpfulness, relevance, level of details of the responses. Each answer should receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\n"
        " Please first output the score on a scale from 1 to 10 followed by '-|-' as separator and a comprehensive explanation of your evaluation, avoiding any potential bias"
    )

    ## Structure message in chat format (a single user turn)
    message = [{"role": "user", "content": prompt}]

    ## Generate the response from the OpenAI chat model
    response = openai.ChatCompletion.create(
        model=model,
        temperature=temperature,
        messages=message,
    )

    ## Retrieve the reply text from the response; kept in its own name so that the
    ## `answer` argument is not overwritten
    reply = response['choices'][0]['message']['content']

    return reply

## 3. Evaluation

In [33]:
## Name of the model to be evaluated; it is used as the prefix of the
## '<name>_evaluation.csv' file that is read and written below.
## Exactly one of the assignments below should be active.

selected_model_name = "gpt-3.5-davinci"
#selected_model_name = "gelectra-finetuned"

In [34]:
## Load the CSV file of the selected model; the cells below read its Frage and ModelAntwort columns
input_csv = os.path.join(TEST_SAVE_PATH, f'{selected_model_name}_evaluation.csv')
test = pd.read_csv(input_csv)

In [39]:
## Accumulators filled by the evaluation loop: one score and one explanation per processed row
scores, explanations = [], []

In [40]:
## Iterating over all rows from the testset

for row in test.itertuples():

    ## Retrieving question and answer from the testset as strings
    question = str(row.Frage)
    answer = str(row.ModelAntwort)

    ## Querying model for evaluation
    evaluation = evaluate(question, answer)

    ## Splitting on the FIRST '-|-' only: everything before it is the score, everything
    ## after it is the explanation (which therefore stays complete even if it happens
    ## to contain the separator again)
    score, separator, explanation = evaluation.partition("-|-")

    ## If the reply does not contain the separator at all, no score can be extracted.
    ## Keep the raw reply as explanation and leave the score empty for manual review
    ## instead of aborting the whole run with an IndexError.
    if not separator:
        score, explanation = "", evaluation

    ## Appending the score and explanation to the list (always together, so both
    ## lists keep the same length)
    scores.append(score)
    explanations.append(explanation)

In [39]:
## IN CASE THE CODE ABOVE BREAKS DUE TO OPENAI OVERLOADED WITH REQUESTS

## Number of rows for which both a score and an explanation were already stored
breakingpoint = min(len(scores), len(explanations))

## Make sure the list of scores and explanations have the same length
scores = scores[:breakingpoint]
explanations = explanations[:breakingpoint]

## Create new df dropping all rows that have already been processed
test2 = test.iloc[breakingpoint:]


## Iterating over all rows from the remaining testset
for row in test2.itertuples():

    ## Retrieving question and answer from the testset as strings
    question = str(row.Frage)
    answer = str(row.ModelAntwort)

    ## Querying model for evaluation
    evaluation = evaluate(question, answer)

    ## Splitting on the FIRST '-|-' only: everything before it is the score, everything
    ## after it is the explanation (which therefore stays complete even if it happens
    ## to contain the separator again)
    score, separator, explanation = evaluation.partition("-|-")

    ## If the reply does not contain the separator at all, no score can be extracted.
    ## Keep the raw reply as explanation and leave the score empty for manual review
    ## instead of aborting the whole run with an IndexError.
    if not separator:
        score, explanation = "", evaluation

    ## Appending the score and explanation to the list (always together, so both
    ## lists keep the same length)
    scores.append(score)
    explanations.append(explanation)

In [41]:
## Building the result table: assign() returns a new DataFrame, so the original testset
## stays untouched while the scores and explanations are added as two extra columns
evaluation = test.assign(
    AutoEvalScore=scores,
    AutoEvalExplan=explanations,
)

In [44]:
## Writing the testset including the new columns to disk (without the index column).
## NOTE: the target file name is built the same way as the one the testset was read from.
output_csv = os.path.join(TEST_SAVE_PATH, f'{selected_model_name}_evaluation.csv')
evaluation.to_csv(output_csv, index=False)