# This notebook is for evaluating the model's accuracy on datasets

In [212]:
import json
import os

import numpy as np
from openai import OpenAI

In [213]:
# Load the datasets that later cells index by position (e.g. datasets[1]).
# JSON text is UTF-8, so decode explicitly instead of relying on the
# platform's default encoding (which is not UTF-8 on every system).
with open('datasets.json', 'r', encoding='utf-8') as f:
    datasets = json.load(f)

In [214]:
from copy import deepcopy

def get_inputs(dataset, n_train, n_test):
    """Build few-shot chat prompts for randomly selected test examples.

    ``n_test`` distinct test indices are drawn first. For each of them a chat
    message list is assembled from:

    1. a system instruction,
    2. ``n_train // 2`` negative and ``n_train // 2`` positive labelled
       examples (never the test example itself) in shuffled order, each
       emitted as an ``Input: "..."`` message followed by a ``Label: ...``
       message,
    3. the unlabelled test input.

    All sampling uses the global ``np.random`` state; call
    ``np.random.seed(...)`` beforehand for reproducible prompts.

    Args:
        dataset: Mapping with keys ``'length'`` (number of examples),
            ``'data'`` (examples, indexable by position) and ``'targets'``
            (labels aligned with ``'data'``, compared against True/False).
        n_train: Number of in-context examples per prompt. The context is
            label-balanced, so an odd value yields ``n_train - 1`` examples.
        n_test: Number of prompts (one test example per prompt).

    Returns:
        Tuple ``(inputs, targets, sentences)``: the message lists, the label
        of each test example, and the raw test examples, in matching order.

    Raises:
        ValueError: If ``n_test`` exceeds the dataset size, or if there are
            fewer than ``n_train // 2`` examples of either label available
            once the test example is excluded.
    """
    n_examples = dataset['length']
    labels = dataset['targets']
    half = n_train // 2

    if n_test > n_examples:
        raise ValueError(
            'n_test={} exceeds the number of examples in the dataset ({})'.format(n_test, n_examples)
        )

    test_examples = np.random.choice(n_examples, size=n_test, replace=False)

    # The label pools do not depend on the test example, so compute them once
    # instead of rescanning the dataset for every prompt. Equality (rather
    # than truthiness) is intentional: a label that is neither True nor False
    # ends up in neither pool.
    neg_pool = [i for i in range(n_examples) if labels[i] == False]  # noqa: E712
    pos_pool = [i for i in range(n_examples) if labels[i] == True]  # noqa: E712

    inputs = []
    targets = []
    sentences = []
    for j in test_examples:
        # Never show the test example as one of its own context examples.
        neg_candidates = [i for i in neg_pool if i != j]
        pos_candidates = [i for i in pos_pool if i != j]
        for kind, candidates in (('negative', neg_candidates), ('positive', pos_candidates)):
            if half > len(candidates):
                raise ValueError(
                    'n_train={} needs {} {} context examples but only {} are available'.format(
                        n_train, half, kind, len(candidates)
                    )
                )

        neg_examples = np.random.choice(neg_candidates, size=half, replace=False)
        pos_examples = np.random.choice(pos_candidates, size=half, replace=False)
        # Sampling everything without replacement shuffles the two classes together.
        context_examples = np.random.choice(
            np.concatenate([neg_examples, pos_examples]), size=2 * half, replace=False
        )

        message = [
            {'role': 'system', 'content': 'Please classify below input as in the provided examples.'}
        ]
        for i in context_examples:
            message.append({'role': 'user', 'content': 'Input: "{}"'.format(dataset['data'][i])})
            message.append({'role': 'user', 'content': 'Label: {}'.format(labels[i])})
        message.append({'role': 'user', 'content': 'Input: "{}"'.format(dataset['data'][j])})

        # Optional follow-up probes, kept for reference (disabled by default):
        # Multiple choice message
        # message.append({'role': 'system', 'content': 'What feature did you use to make this classification decision? Please briefly explain.\nA) Grammaticality\nB) Capitalization\nC) Sentence sentiment\nD) Subject of the sentence or meaning of words\nE) Other'})
        # COT Message
        # message.append({'role': 'system', 'content': "Why did you make this classification decision? Before coming to a conclusion, try to consider several alternatives and think step-by-step."})

        inputs.append(message)
        targets.append(labels[j])
        sentences.append(dataset['data'][j])

    return inputs, targets, sentences

In [215]:
# Sanity check: with a fixed seed, display one prompt built from 4 context
# examples for the dataset at index 1.
np.random.seed(0)
get_inputs(dataset=datasets[1], n_train=4, n_test=1)

([[{'role': 'system',
    'content': 'Please classify below input as in the provided examples.'},
   {'role': 'user',
    'content': 'Input: "The gentle cow looked up as the farmer approached the barn."'},
   {'role': 'user', 'content': 'Label: True'},
   {'role': 'user', 'content': 'Input: "The cake baked for over an hour."'},
   {'role': 'user', 'content': 'Label: False'},
   {'role': 'user',
    'content': 'Input: "She\'s allergic to bees so avoids flowers when possible."'},
   {'role': 'user', 'content': 'Label: True'},
   {'role': 'user',
    'content': 'Input: "Beach chairs flapped lazily in fresh, salty ocean breezes."'},
   {'role': 'user', 'content': 'Label: False'},
   {'role': 'user', 'content': 'Input: "She loves to read poetry books."'}]],
 [False],
 ['She loves to read poetry books.'])

In [209]:
# id is the index of the dataset
# NOTE: this shadows the builtin `id`, but later cells read this exact name,
# so it is kept as-is.
id = 1

# n_train is number of in-context learning examples
n_train = 50

# n_test is total number of test prompts (and thus test sentences) to use (1 test sentence per prompt)
n_test = 10

# OpenAI Api Key: read from the environment so the secret never lives in the notebook
API_KEY = os.environ.get('OPENAI_API_KEY')
if not API_KEY:
    raise RuntimeError('Set the OPENAI_API_KEY environment variable before running this cell.')

# Fixed seed so the same test sentences and context examples are drawn on every run
np.random.seed(0)

dataset = datasets[id]

client = OpenAI(api_key=API_KEY)

# Cap n_test at the dataset size; test sentences are sampled without replacement
inputs, targets, sentences = get_inputs(dataset, n_train, min(n_test, dataset['length']))

outputs = []

for messages in inputs:
    response = client.chat.completions.create(
        model="gpt-4-1106-preview",
        messages=messages,
        temperature=0,
    )
    # Fall back to '' if the reply carries no text content, so that the
    # string handling in the scoring cell does not fail on None.
    reply = response.choices[0].message.content or ''
    print(reply)
    outputs.append(reply)

Label: False
Label: True
Label: False
Label: False
Label: False
Label: False
Label: False
Label: False
Label: False
Label: False


In [210]:
def _parse_label(output):
    """Reduce a model reply such as 'Label: True' to its bare, lower-cased label text."""
    text = (output or '').strip()
    # The 'Label:' prefix is optional: the model may answer with just the label.
    if text.lower().startswith('label:'):
        text = text[len('label:'):]
    # Drop surrounding whitespace, quotes and a trailing period before comparing.
    return text.strip().strip('"\'.').lower()


# Fraction of test prompts whose predicted label matches the target.
# Compared case-insensitively after normalisation, so a correct answer is not
# counted as wrong merely because of its formatting.
correct = sum(_parse_label(output) == str(target).lower() for output, target in zip(outputs, targets))
acc = correct/len(outputs)
print(acc)

1.0


In [170]:
# Attach the measured accuracy and the number of in-context examples used
# to the evaluated dataset's entry.
datasets[id].update({'accuracy': acc, 'n_train': n_train})

In [171]:
# save dataset with new info
# Serialize first: opening the file in 'w' mode truncates it immediately, so a
# serialization error raised afterwards would wipe out the existing datasets.
serialized = json.dumps(datasets)

with open('datasets.json', 'w', encoding='utf-8') as f:
    f.write(serialized)

In [211]:
# One line per field, each listing that field's value for every dataset in order.
for field in ('rule', 'accuracy', 'n_train', 'length'):
    print([entry[field] for entry in datasets])
# Number of positive (True) targets in each dataset.
print([sum(entry['targets']) for entry in datasets])

['String is all lowercase letters', 'Sentence contains an animal', 'The sentence expresses a positive sentiment.', 'Sentence is grammatical', 'Sentence contains a number', 'String is a palindrome']
[0.92, 0.91, 1.0, 0.95, 0.97, 0.95]
[50, 50, 10, 50, 10, 2]
[130, 179, 100, 100, 100, 100]
[65, 41, 50, 50, 50, 50]
